πŸ“ Dimensionality Reduction

Reduce features while preserving information

What is Dimensionality Reduction?

Dimensionality reduction transforms high-dimensional data into a smaller number of dimensions while retaining most of the important information. It is essential for visualization, faster computation, and mitigating the curse of dimensionality.

Why Reduce Dimensions?

  • Visualization: Plot 100D data in 2D/3D
  • Speed: Faster training with fewer features
  • Storage: Less disk space and memory
  • Noise reduction: Remove irrelevant features
  • Curse of dimensionality: Many algorithms struggle in high dimensions (see the sketch below)
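
The curse of dimensionality is easy to demonstrate: as the number of dimensions grows, pairwise distances between random points become nearly identical, so "near" and "far" lose meaning. A minimal illustrative sketch (not part of the original examples):

import numpy as np

rng = np.random.default_rng(42)

# As dimensionality grows, the nearest and farthest points end up
# almost equally far away -- distances stop being informative.
for d in [2, 10, 100, 1000]:
    X_rand = rng.random((500, d))                            # 500 random points in d dimensions
    dists = np.linalg.norm(X_rand - X_rand[0], axis=1)[1:]   # distances from the first point
    print(f"d={d:4d}  min/max distance ratio: {dists.min() / dists.max():.3f}")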

πŸ“Š Principal Component Analysis (PCA)

PCA finds orthogonal directions (principal components) that maximize variance in the data.

from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np

# Load data
iris = load_iris()
X, y = iris.data, iris.target

# Scale features (PCA sensitive to scale!)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA(n_components=2)  # Reduce to 2 dimensions
X_pca = pca.fit_transform(X_scaled)

print(f"Original shape: {X.shape}")      # (150, 4)
print(f"Reduced shape: {X_pca.shape}")   # (150, 2)

# Variance explained
print(f"\nExplained variance ratio: {pca.explained_variance_ratio_}")
print(f"Total variance explained: {pca.explained_variance_ratio_.sum():.2%}")

# Visualize
plt.figure(figsize=(10, 6))
for i, target_name in enumerate(iris.target_names):
    plt.scatter(X_pca[y == i, 0], X_pca[y == i, 1], 
                label=target_name, alpha=0.7)
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%})')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%})')
plt.title('PCA of Iris Dataset')
plt.legend()
plt.grid(True)
plt.show()
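
A quick sanity check of the "maximize variance" claim: the principal components are the leading eigenvectors of the data's covariance matrix. This sketch (reusing X_scaled and pca from above) compares sklearn's components with NumPy's eigendecomposition:

# The principal components equal the top eigenvectors of the covariance matrix
cov = np.cov(X_scaled.T)                          # 4x4 covariance matrix
eigenvalues, eigenvectors = np.linalg.eigh(cov)   # eigenvalues in ascending order
top2 = eigenvectors[:, ::-1][:, :2].T             # two largest eigenvectors, as rows

# Directions match sklearn's components up to sign
print(np.allclose(np.abs(top2), np.abs(pca.components_), atol=1e-6))  # True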

Choosing Number of Components

# Method 1: Explained variance threshold
pca = PCA(n_components=0.95)  # Keep 95% of variance
X_pca = pca.fit_transform(X_scaled)
print(f"Components needed for 95% variance: {pca.n_components_}")

# Method 2: Cumulative explained variance curve
pca_full = PCA()
pca_full.fit(X_scaled)

plt.figure(figsize=(10, 6))
plt.plot(range(1, len(pca_full.explained_variance_ratio_) + 1),
         np.cumsum(pca_full.explained_variance_ratio_), 'bo-')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Cumulative Explained Variance')
plt.axhline(y=0.95, color='r', linestyle='--', label='95% threshold')
plt.legend()
plt.grid(True)
plt.show()

# Method 3: Scree plot (look for the elbow)
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(pca_full.explained_variance_ratio_) + 1),
         pca_full.explained_variance_ratio_, 'bo-')
plt.xlabel('Component')
plt.ylabel('Explained Variance')
plt.title('Scree Plot: Variance per Component')
plt.grid(True)
plt.show()

Inverse Transform

# Reconstruct original data from reduced dimensions
X_reconstructed = pca.inverse_transform(X_pca)

# Calculate reconstruction error
reconstruction_error = np.mean((X_scaled - X_reconstructed) ** 2)
print(f"Reconstruction error: {reconstruction_error:.4f}")

# Visualize original vs reconstructed
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
axes[0].scatter(X_scaled[:, 0], X_scaled[:, 1], alpha=0.7)
axes[0].set_title('Original Data')
axes[1].scatter(X_reconstructed[:, 0], X_reconstructed[:, 1], alpha=0.7)
axes[1].set_title('Reconstructed Data')
plt.show()
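
Reconstruction error falls as more components are kept, which gives another way to choose n_components. A small sketch of that trade-off, reusing X_scaled from above:

# Reconstruction MSE for every possible number of components (iris has 4 features)
for k in range(1, X_scaled.shape[1] + 1):
    pca_k = PCA(n_components=k).fit(X_scaled)
    X_rec = pca_k.inverse_transform(pca_k.transform(X_scaled))
    print(f"{k} components -> reconstruction MSE: {np.mean((X_scaled - X_rec) ** 2):.4f}")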

πŸŒ€ t-SNE

t-SNE (t-Distributed Stochastic Neighbor Embedding) preserves local structure, making it excellent for visualization.

from sklearn.manifold import TSNE

# Apply t-SNE
tsne = TSNE(
    n_components=2,      # Usually 2 or 3 for visualization
    perplexity=30,       # Balance local vs global structure (5-50)
    learning_rate=200,   # 10-1000, higher = faster but less stable
    n_iter=1000,         # More iterations, better convergence (renamed max_iter in newer scikit-learn)
    random_state=42
)

X_tsne = tsne.fit_transform(X_scaled)

# Visualize
plt.figure(figsize=(10, 6))
for i, target_name in enumerate(iris.target_names):
    plt.scatter(X_tsne[y == i, 0], X_tsne[y == i, 1],
                label=target_name, alpha=0.7)
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')
plt.title('t-SNE of Iris Dataset')
plt.legend()
plt.grid(True)
plt.show()

# Important notes:
# - t-SNE is non-linear (unlike PCA)
# - No inverse_transform (can't go back to original space)
# - Slow for large datasets (use PCA first to reduce to ~50D)
# - Perplexity affects clustering (try 5-50)
# - Different runs give different results unless random_state is fixed

t-SNE for Large Datasets

# For large datasets: PCA first, then t-SNE
from sklearn.datasets import fetch_openml

# Load MNIST (70,000 images, 784 features)
mnist = fetch_openml('mnist_784', version=1, parser='auto')
X_mnist, y_mnist = mnist.data[:10000], mnist.target[:10000]  # Use subset

# Step 1: PCA to reduce to 50 dimensions
pca = PCA(n_components=50)
X_pca = pca.fit_transform(X_mnist)
print(f"PCA variance explained: {pca.explained_variance_ratio_.sum():.2%}")

# Step 2: t-SNE on reduced data
tsne = TSNE(n_components=2, random_state=42, verbose=1)
X_tsne = tsne.fit_transform(X_pca)

# Visualize
plt.figure(figsize=(12, 10))
scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], 
                     c=y_mnist.astype(int), cmap='tab10', alpha=0.5)
plt.colorbar(scatter, label='Digit')
plt.title('t-SNE of MNIST Digits')
plt.show()

πŸ—ΊοΈ UMAP

UMAP (Uniform Manifold Approximation and Projection) is faster than t-SNE and preserves global structure better.

# Install: pip install umap-learn
import umap

# Apply UMAP
reducer = umap.UMAP(
    n_components=2,      # Output dimensions
    n_neighbors=15,      # Local neighborhood size (2-100)
    min_dist=0.1,        # Min distance in embedding (0.0-0.99)
    metric='euclidean',  # Distance metric
    random_state=42
)

X_umap = reducer.fit_transform(X_scaled)

# Visualize
plt.figure(figsize=(10, 6))
for i, target_name in enumerate(iris.target_names):
    plt.scatter(X_umap[y == i, 0], X_umap[y == i, 1],
                label=target_name, alpha=0.7)
plt.xlabel('UMAP 1')
plt.ylabel('UMAP 2')
plt.title('UMAP of Iris Dataset')
plt.legend()
plt.grid(True)
plt.show()

# UMAP advantages:
# - Much faster than t-SNE
# - Preserves global structure better
# - Can transform new data (unlike t-SNE)
# - Works well for large datasets
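
Because a fitted UMAP model keeps its mapping, it can project data it has never seen, which t-SNE cannot do. A minimal sketch using a hypothetical train/test split of the scaled iris data:

from sklearn.model_selection import train_test_split

# Fit the embedding on training data only, then project unseen points into it
X_train, X_test = train_test_split(X_scaled, test_size=0.3, random_state=42)

umap_model = umap.UMAP(n_components=2, random_state=42)
X_train_umap = umap_model.fit_transform(X_train)
X_test_umap = umap_model.transform(X_test)   # new data; t-SNE has no equivalent

print(X_train_umap.shape, X_test_umap.shape)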

UMAP vs t-SNE Comparison

import time

# Compare speed (UMAP's advantage shows mainly on larger datasets; iris is tiny)
start = time.time()
X_tsne = TSNE(random_state=42).fit_transform(X_scaled)
tsne_time = time.time() - start

start = time.time()
X_umap = umap.UMAP(random_state=42).fit_transform(X_scaled)
umap_time = time.time() - start

print(f"t-SNE time: {tsne_time:.2f}s")
print(f"UMAP time: {umap_time:.2f}s")
print(f"UMAP is {tsne_time/umap_time:.1f}x faster")

# Side-by-side comparison
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
for i, name in enumerate(iris.target_names):
    axes[0].scatter(X_tsne[y == i, 0], X_tsne[y == i, 1], 
                   label=name, alpha=0.7)
    axes[1].scatter(X_umap[y == i, 0], X_umap[y == i, 1],
                   label=name, alpha=0.7)
axes[0].set_title('t-SNE')
axes[1].set_title('UMAP')
for ax in axes:
    ax.legend()
    ax.grid(True)
plt.show()

πŸ“‰ Linear Discriminant Analysis (LDA)

LDA is a supervised method: it finds the directions that maximize separation between classes relative to the scatter within each class.

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# LDA requires labels (supervised)
lda = LinearDiscriminantAnalysis(n_components=2)
X_lda = lda.fit_transform(X_scaled, y)

print(f"Original shape: {X.shape}")
print(f"LDA shape: {X_lda.shape}")
print(f"Explained variance ratio: {lda.explained_variance_ratio_}")

# Visualize
plt.figure(figsize=(10, 6))
for i, target_name in enumerate(iris.target_names):
    plt.scatter(X_lda[y == i, 0], X_lda[y == i, 1],
                label=target_name, alpha=0.7)
plt.xlabel('LD1')
plt.ylabel('LD2')
plt.title('LDA of Iris Dataset')
plt.legend()
plt.grid(True)
plt.show()

# LDA vs PCA:
# - LDA: Supervised (uses labels), maximizes class separation
# - PCA: Unsupervised, maximizes variance
# - LDA: Max n_components = n_classes - 1
# - PCA: Max n_components = n_features
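
The n_components limit is enforced at fit time. A small sketch, assuming the iris data above (which has 3 classes):

# Iris has 3 classes, so LDA yields at most 2 discriminant axes
try:
    LinearDiscriminantAnalysis(n_components=3).fit(X_scaled, y)
except ValueError as err:
    print(err)   # n_components cannot be larger than min(n_features, n_classes - 1)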

πŸ”§ Other Techniques

Truncated SVD (for Sparse Data)

from sklearn.decomposition import TruncatedSVD

# Good for sparse matrices (e.g., TF-IDF); n_components must be less than n_features
svd = TruncatedSVD(n_components=2, random_state=42)  # iris only has 4 features
X_svd = svd.fit_transform(X)

print(f"Explained variance: {svd.explained_variance_ratio_.sum():.2%}")

Kernel PCA (Non-linear)

from sklearn.decomposition import KernelPCA

# Non-linear dimensionality reduction via the kernel trick
kpca = KernelPCA(n_components=2, kernel='rbf')  # default gamma = 1/n_features; tune for your data
X_kpca = kpca.fit_transform(X_scaled)

plt.scatter(X_kpca[:, 0], X_kpca[:, 1], c=y, cmap='viridis')
plt.title('Kernel PCA')
plt.show()
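
Kernel PCA's advantage shows on data that is not linearly separable. A sketch on make_moons (a separate toy dataset, not the iris example), where an RBF kernel with gamma=15 can pull the two arcs apart while linear PCA cannot:

from sklearn.datasets import make_moons

# Two interleaving half-circles -- a classic non-linear toy dataset
X_moons, y_moons = make_moons(n_samples=300, noise=0.05, random_state=42)

kpca_moons = KernelPCA(n_components=2, kernel='rbf', gamma=15)
X_kpca_moons = kpca_moons.fit_transform(X_moons)

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
axes[0].scatter(X_moons[:, 0], X_moons[:, 1], c=y_moons, cmap='viridis')
axes[0].set_title('Original moons')
axes[1].scatter(X_kpca_moons[:, 0], X_kpca_moons[:, 1], c=y_moons, cmap='viridis')
axes[1].set_title('Kernel PCA (RBF, gamma=15)')
plt.show()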

Autoencoder (Deep Learning)

from tensorflow import keras
from tensorflow.keras import layers

# Neural network for dimensionality reduction
encoding_dim = 2

encoder = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(encoding_dim, activation='relu')
])

decoder = keras.Sequential([
    layers.Dense(32, activation='relu', input_shape=(encoding_dim,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(X.shape[1], activation='linear')
])

autoencoder = keras.Sequential([encoder, decoder])
autoencoder.compile(optimizer='adam', loss='mse')

autoencoder.fit(X_scaled, X_scaled, epochs=50, batch_size=32, verbose=0)

# Get encoded representation
X_encoded = encoder.predict(X_scaled)
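
The 2-D bottleneck can be plotted like the other embeddings (a short usage sketch reusing the iris labels):

# Visualize the learned 2-D encoding, coloured by class
plt.figure(figsize=(10, 6))
for i, target_name in enumerate(iris.target_names):
    plt.scatter(X_encoded[y == i, 0], X_encoded[y == i, 1],
                label=target_name, alpha=0.7)
plt.title('Autoencoder Encoding of Iris')
plt.legend()
plt.grid(True)
plt.show()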

πŸ“Š Comparison Table

| Method      | Linear | Supervised | Speed  | Best For                         |
|-------------|--------|------------|--------|----------------------------------|
| PCA         | Yes    | No         | Fast   | General purpose, preprocessing   |
| t-SNE       | No     | No         | Slow   | Visualization, local structure   |
| UMAP        | No     | No         | Medium | Visualization, large datasets    |
| LDA         | Yes    | Yes        | Fast   | Classification tasks             |
| SVD         | Yes    | No         | Fast   | Sparse data, text                |
| Kernel PCA  | No     | No         | Medium | Non-linear patterns              |
| Autoencoder | No     | No         | Slow   | Complex patterns, deep learning  |

πŸ’‘ Best Practices

  • Scale features before PCA (it is sensitive to feature scale)
  • Choose n_components from the explained-variance curve or a variance threshold such as 95%
  • For large datasets, reduce to ~50 dimensions with PCA before running t-SNE
  • Set random_state for reproducible t-SNE and UMAP embeddings

⚠️ Common Pitfalls

  • Skipping standardization, so large-scale features dominate the principal components
  • Expecting t-SNE to transform new data (it has no transform or inverse_transform; UMAP can project new points)
  • Requesting more LDA components than n_classes - 1
  • Requesting more Truncated SVD components than the data has features

🎯 Key Takeaways

  • PCA is the fast, linear, general-purpose choice; LDA is its supervised counterpart for class separation
  • t-SNE and UMAP are non-linear and best for visualization; UMAP is faster and preserves global structure better
  • Explained variance and reconstruction error quantify how much information the reduction keeps