PCA is an effective dimensionality-reduction technique: given a dataset with many features, it produces a lower-dimensional representation that still explains most of the variance, which makes a large dataset manageable for analysis and visualization.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_openml
import seaborn as sns
# Load MNIST: 70,000 samples of 784 features (28x28 pixel images).
mnist = fetch_openml('mnist_784')
X = mnist.data / 255.0  # Normalize the data to [0, 1]
# NOTE(review): fetch_openml returns targets as strings, hence the int cast.
y = mnist.target.astype(np.int64)

# Project the 784-dimensional pixel space onto its first 30 principal components.
pca = PCA(n_components=30)
X_pca = pca.fit_transform(X)
# Plot the cumulative explained-variance ratio to see how many components
# are needed to capture most of the dataset's variance.
plt.figure(figsize=(8, 4))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.title('Cumulative Explained Variance')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.show()
# Convert the PCA results to a DataFrame for easier plotting
import pandas as pd
import seaborn as sns  # NOTE(review): duplicate of the top-of-file import; kept for cell independence

# Keep only the first 5 components, labeled PC1..PC5.
pca_df = pd.DataFrame(X_pca[:, :5], columns=[f'PC{i+1}' for i in range(5)])
pca_df['Target'] = y  # Adding the target variable for coloring
# Use seaborn's pairplot to create the scatter plot matrix of the first
# five principal components, colored by digit label.
sns.pairplot(pca_df, hue='Target', vars=[f'PC{i+1}' for i in range(5)], palette='viridis')
# y=1.02 lifts the title slightly above the grid so it does not overlap the axes.
plt.suptitle('Pairwise Scatter Plots of the First 5 PCA Components', y=1.02)
plt.show()