🎯 Overfitting & Regularization

Prevent your model from memorizing training data

What is Overfitting?

Overfitting occurs when a model learns the training data too well, including noise and outliers, resulting in poor generalization to new data.

The Bias-Variance Tradeoff:

  • Underfitting (High Bias): Model is too simple; performs poorly on both training and test data
  • Good Fit: Balanced complexity; generalizes well to new data
  • Overfitting (High Variance): Model is too complex; excellent on training data, poor on test data
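
A quick way to see the tradeoff is to fit the same model family at several complexities and compare training and test accuracy. Here is a minimal sketch using a decision tree on a synthetic dataset (the dataset and depth values are chosen purely for illustration):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Synthetic data: 20 features, only 5 of them informative
X_demo, y_demo = make_classification(n_samples=500, n_features=20,
                                     n_informative=5, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.3,
                                          random_state=0)

# max_depth=1 underfits; max_depth=None is unconstrained and tends to overfit
for depth in [1, 3, None]:
    tree = DecisionTreeClassifier(max_depth=depth, random_state=0)
    tree.fit(X_tr, y_tr)
    print(f"max_depth={depth}: train={tree.score(X_tr, y_tr):.2f}, "
          f"test={tree.score(X_te, y_te):.2f}")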

🔍 Detecting Overfitting

from sklearn.model_selection import train_test_split, learning_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
import numpy as np
import matplotlib.pyplot as plt

# Load data
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=42
)

# Train model
model = RandomForestClassifier(max_depth=10, random_state=42)
model.fit(X_train, y_train)

# Check for overfitting
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Training score: {train_score:.3f}")
print(f"Test score: {test_score:.3f}")
print(f"Gap: {train_score - test_score:.3f}")

# Signs of overfitting:
# - Training score >> Test score (large gap)
# - Training score near 1.0, test score much lower
# - Model performs worse on new data

Learning Curves

# Plot learning curves to diagnose overfitting
def plot_learning_curve(estimator, X, y):
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=5, n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 10),
        scoring='accuracy'
    )
    
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    
    plt.figure(figsize=(10, 6))
    plt.plot(train_sizes, train_mean, label='Training score', color='blue')
    plt.fill_between(train_sizes, train_mean - train_std, 
                     train_mean + train_std, alpha=0.1, color='blue')
    plt.plot(train_sizes, test_mean, label='Validation score', color='red')
    plt.fill_between(train_sizes, test_mean - test_std,
                     test_mean + test_std, alpha=0.1, color='red')
    plt.xlabel('Training Size')
    plt.ylabel('Score')
    plt.title('Learning Curves')
    plt.legend()
    plt.grid(True)
    plt.show()

plot_learning_curve(model, iris.data, iris.target)

# Interpretation:
# - Large gap between curves: Overfitting
# - Both curves low: Underfitting
# - Converging curves: Good fit

🛡️ L1 Regularization (Lasso)

L1 regularization adds a penalty equal to the sum of the absolute values of the coefficients. It produces sparse models: some weights are driven exactly to 0.

from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.preprocessing import StandardScaler

# Scale features (important for regularization!)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Lasso Regression (L1)
# Note: Lasso is a regressor; the iris class labels are used here only to
# illustrate how the L1 penalty drives coefficients to zero
lasso = Lasso(alpha=0.1)  # alpha controls regularization strength
lasso.fit(X_train_scaled, y_train)

# Check coefficients
print("Coefficients:", lasso.coef_)
print("Non-zero features:", np.sum(lasso.coef_ != 0))

# L1 for classification
logreg_l1 = LogisticRegression(penalty='l1', C=1.0, solver='liblinear')
logreg_l1.fit(X_train_scaled, y_train)

# C is inverse of alpha: smaller C = stronger regularization
# C = 0.01 (strong), C = 1 (medium), C = 100 (weak)
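
To see the effect of C directly, one option is to sweep a few values on the scaled iris split from above and compare scores (a small sketch; the C values are arbitrary):

# Smaller C = stronger L1 penalty = fewer non-zero coefficients
for C in [0.01, 1.0, 100.0]:
    clf = LogisticRegression(penalty='l1', C=C, solver='liblinear')
    clf.fit(X_train_scaled, y_train)
    print(f"C={C}: Train={clf.score(X_train_scaled, y_train):.3f}, "
          f"Test={clf.score(X_test_scaled, y_test):.3f}, "
          f"Non-zero coefs={np.sum(clf.coef_ != 0)}")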

Feature Selection with L1

# L1 automatically does feature selection
from sklearn.datasets import make_classification

# Create dataset with irrelevant features
X, y = make_classification(n_samples=1000, n_features=20, 
                           n_informative=5, n_redundant=5,
                           random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Strong L1 regularization
logreg_l1 = LogisticRegression(penalty='l1', C=0.1, solver='liblinear', max_iter=1000)
logreg_l1.fit(X_train, y_train)

# Count selected features
n_selected = np.sum(logreg_l1.coef_ != 0)
print(f"Selected {n_selected} out of 20 features")

# Without regularization
logreg_none = LogisticRegression(penalty=None, max_iter=1000)
logreg_none.fit(X_train, y_train)

print(f"\nWith L1: {logreg_l1.score(X_test, y_test):.3f}")
print(f"Without: {logreg_none.score(X_test, y_test):.3f}")

🎯 L2 Regularization (Ridge)

L2 regularization adds a penalty equal to the sum of the squared coefficients. It shrinks weights toward zero but rarely makes them exactly zero.

from sklearn.linear_model import Ridge

# Re-scale the current split (X_train/X_test were reassigned above)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Ridge Regression (L2)
ridge = Ridge(alpha=1.0)  # alpha controls regularization strength
ridge.fit(X_train_scaled, y_train)

print("Coefficients:", ridge.coef_)
print("All coefficients non-zero:", np.all(ridge.coef_ != 0))

# L2 for classification
logreg_l2 = LogisticRegression(penalty='l2', C=1.0, solver='lbfgs')
logreg_l2.fit(X_train_scaled, y_train)

# Compare different alphas
for alpha in [0.01, 0.1, 1.0, 10.0]:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train_scaled, y_train)
    train_score = ridge.score(X_train_scaled, y_train)
    test_score = ridge.score(X_test_scaled, y_test)
    print(f"Alpha={alpha}: Train={train_score:.3f}, Test={test_score:.3f}")

⚖️ Elastic Net (L1 + L2)

Elastic Net combines the L1 and L2 penalties, giving both feature selection and more stable handling of correlated features.

from sklearn.linear_model import ElasticNet

# Elastic Net combines L1 and L2
elastic = ElasticNet(
    alpha=1.0,        # Overall regularization strength
    l1_ratio=0.5      # Balance: 0=pure L2, 1=pure L1, 0.5=equal mix
)
elastic.fit(X_train_scaled, y_train)

# Try different l1_ratio values
for l1_ratio in [0.0, 0.25, 0.5, 0.75, 1.0]:
    elastic = ElasticNet(alpha=0.1, l1_ratio=l1_ratio)
    elastic.fit(X_train_scaled, y_train)
    score = elastic.score(X_test_scaled, y_test)
    n_nonzero = np.sum(elastic.coef_ != 0)
    print(f"L1 ratio={l1_ratio}: Score={score:.3f}, "
          f"Non-zero features={n_nonzero}")

📊 L1 vs L2 vs Elastic Net

Feature              | L1 (Lasso)                     | L2 (Ridge)                  | Elastic Net
Penalty              | Σ|w|                           | Σw²                         | α₁Σ|w| + α₂Σw²
Feature Selection    | Yes (sets weights to 0)        | No (shrinks weights)        | Yes (partial)
Sparse Model         | Yes                            | No                          | Yes
Correlated Features  | Picks one arbitrarily          | Keeps all, shrinks equally  | Groups correlated features
Best For             | Feature selection, sparse data | Most cases, collinearity    | Many correlated features
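
As a quick, hands-on comparison, the sketch below fits all three on the scaled data from above with the same alpha and counts non-zero coefficients (the alpha value is arbitrary):

# Same alpha for all three penalties; compare sparsity of the learned weights
for name, reg in [("Lasso (L1)", Lasso(alpha=0.1)),
                  ("Ridge (L2)", Ridge(alpha=0.1)),
                  ("Elastic Net", ElasticNet(alpha=0.1, l1_ratio=0.5))]:
    reg.fit(X_train_scaled, y_train)
    n_nonzero = np.sum(reg.coef_ != 0)
    print(f"{name}: non-zero coefficients = {n_nonzero} / {X_train_scaled.shape[1]}")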

🌳 Regularization in Tree Models

from sklearn.ensemble import RandomForestClassifier

# Random Forest regularization parameters
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,              # Limit tree depth
    min_samples_split=10,      # Min samples to split node
    min_samples_leaf=5,        # Min samples in leaf
    max_features='sqrt',       # Random feature subset
    max_leaf_nodes=50,         # Limit leaf nodes
    min_impurity_decrease=0.01 # Min improvement to split
)

rf.fit(X_train, y_train)

# Compare different max_depth
for depth in [3, 5, 10, None]:
    rf = RandomForestClassifier(max_depth=depth, random_state=42)
    rf.fit(X_train, y_train)
    train_score = rf.score(X_train, y_train)
    test_score = rf.score(X_test, y_test)
    print(f"Depth={depth}: Train={train_score:.3f}, Test={test_score:.3f}")

Gradient Boosting Regularization

from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,      # Lower = more regularization
    max_depth=3,            # Shallow trees
    min_samples_split=10,
    min_samples_leaf=5,
    subsample=0.8,          # Row sampling (< 1.0)
    max_features='sqrt'     # Column sampling
)

gb.fit(X_train, y_train)

# Learning rate effect
for lr in [0.01, 0.1, 0.5, 1.0]:
    gb = GradientBoostingClassifier(learning_rate=lr, n_estimators=100)
    gb.fit(X_train, y_train)
    print(f"LR={lr}: Test={gb.score(X_test, y_test):.3f}")

🧠 Dropout (Neural Networks)

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Neural network with dropout
model = keras.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dropout(0.5),  # Drop 50% of neurons during training
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),  # Drop 30%
    layers.Dense(2, activation='softmax')  # 2 classes in this dataset
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Dropout is only active during training
# Automatically disabled during inference

history = model.fit(X_train, y_train, epochs=50, 
                    validation_split=0.2, verbose=0)

# Compare with/without dropout
# Without dropout: More overfitting (larger train-val gap)
# With dropout: Better generalization
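
To confirm that dropout only fires during training, one can call the trained model directly with the training flag (a small sketch; exact output values will vary between calls):

# With training=False dropout is a no-op; with training=True it is applied
x = X_train[:1].astype("float32")
print(model(x, training=False))  # deterministic predictions
print(model(x, training=True))   # stochastic: changes on every call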

📉 Early Stopping

# Stop training when validation performance stops improving

# For neural networks
early_stop = keras.callbacks.EarlyStopping(
    monitor='val_loss',      # Metric to monitor
    patience=10,             # Epochs to wait before stopping
    restore_best_weights=True # Restore best model
)

history = model.fit(
    X_train, y_train,
    epochs=1000,             # Large number
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=0
)

print(f"Stopped at epoch {len(history.history['loss'])}")

# For Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(
    n_estimators=1000,
    learning_rate=0.1,
    validation_fraction=0.2,
    n_iter_no_change=10,     # Early stopping patience
    random_state=42
)

gb.fit(X_train, y_train)
print(f"Stopped at {gb.n_estimators_} trees")

🔄 Data Augmentation

# Increase training data artificially (images)
# Note: ImageDataGenerator is deprecated in recent TF releases; Keras
# preprocessing layers (e.g. RandomFlip, RandomRotation) are the newer option
from tensorflow.keras.preprocessing.image import ImageDataGenerator

datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    zoom_range=0.2
)

# For time series: Add noise, shift, scale
def augment_time_series(X, noise_level=0.01):
    noise = np.random.normal(0, noise_level, X.shape)
    return X + noise

# For text: Synonym replacement, back-translation
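
As a minimal illustration of synonym replacement, here is a sketch using a small hand-built synonym table (hypothetical; real pipelines typically rely on WordNet, embeddings, or back-translation):

import random

# Hypothetical synonym table; in practice use WordNet or a thesaurus
SYNONYMS = {"good": ["great", "fine"], "bad": ["poor", "awful"]}

def augment_text(sentence, p=0.3):
    """Randomly replace known words with a synonym with probability p."""
    words = sentence.split()
    return " ".join(
        random.choice(SYNONYMS[w]) if w in SYNONYMS and random.random() < p else w
        for w in words
    )

print(augment_text("the movie was good but the ending was bad"))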

📦 More Regularization Techniques

1. More Training Data

Best solution if possible! More data = less overfitting.

2. Simplify Model

# Fewer features
from sklearn.feature_selection import SelectKBest, f_classif

selector = SelectKBest(f_classif, k=10)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Simpler model architecture
# Instead of: 1000 → 500 → 250 → 100 → 10
# Use: 100 → 50 → 10

3. Ensemble Methods

# Bagging reduces variance (overfitting)
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bagging = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=0.8,    # Bootstrap 80% of data
    random_state=42
)
bagging.fit(X_train, y_train)

4. Batch Normalization

# For neural networks
model = keras.Sequential([
    layers.Dense(128, input_shape=(X_train.shape[1],)),
    layers.BatchNormalization(),  # Regularization effect
    layers.Activation('relu'),
    layers.Dense(64),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dense(10, activation='softmax')
])

5. Weight Decay

# L2 regularization for neural networks
model = keras.Sequential([
    layers.Dense(128, activation='relu',
                 kernel_regularizer=keras.regularizers.l2(0.01),
                 input_shape=(X_train.shape[1],)),
    layers.Dense(10, activation='softmax',
                 kernel_regularizer=keras.regularizers.l2(0.01))
])

# L1 regularization
keras.regularizers.l1(0.01)

# L1 + L2
keras.regularizers.l1_l2(l1=0.01, l2=0.01)

🎯 Choosing Regularization Strength

from sklearn.linear_model import RidgeCV, LassoCV

# Cross-validated Ridge (automatically finds best alpha)
ridge_cv = RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0, 100.0], cv=5)
ridge_cv.fit(X_train_scaled, y_train)
print(f"Best alpha: {ridge_cv.alpha_}")

# Cross-validated Lasso
lasso_cv = LassoCV(alphas=[0.01, 0.1, 1.0, 10.0], cv=5)
lasso_cv.fit(X_train_scaled, y_train)
print(f"Best alpha: {lasso_cv.alpha_}")

# Manual search with validation curve
from sklearn.model_selection import validation_curve

alphas = np.logspace(-3, 3, 20)
train_scores, val_scores = validation_curve(
    Ridge(), X_train_scaled, y_train,
    param_name='alpha', param_range=alphas,
    cv=5, scoring='neg_mean_squared_error'
)

plt.figure(figsize=(10, 6))
plt.semilogx(alphas, -train_scores.mean(axis=1), label='Training')
plt.semilogx(alphas, -val_scores.mean(axis=1), label='Validation')
plt.xlabel('Alpha')
plt.ylabel('MSE')
plt.legend()
plt.title('Validation Curve')
plt.show()
