⚖️ Handling Imbalanced Data

Deal with skewed class distributions

What is Imbalanced Data?

Imbalanced data occurs when the classes in a dataset are not represented equally. It is common in fraud detection (e.g., 99% legitimate transactions, 1% fraud), medical diagnosis, anomaly detection, and many other real-world problems.

Why It's a Problem:

  • Accuracy is misleading: a model that always predicts the majority class can score 99% accuracy (see the sketch below)
  • Model bias: the model learns to ignore the minority class
  • Poor predictions: it fails on exactly the class you care about most
  • Evaluation challenges: you need better metrics than accuracy
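
To see the accuracy trap concretely, here is a minimal sketch (self-contained, using synthetic labels independent of the dataset built below): a classifier that always predicts the majority class scores roughly 95% accuracy while catching zero minority cases.

import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, recall_score

# ~5% minority class; the features are irrelevant for this demonstration
rng = np.random.default_rng(42)
X_demo = rng.normal(size=(1000, 5))
y_demo = (rng.random(1000) < 0.05).astype(int)

# Always predict the majority class
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_demo, y_demo)
y_demo_pred = dummy.predict(X_demo)

print(f"Accuracy: {accuracy_score(y_demo, y_demo_pred):.3f}")        # ~0.95, looks great
print(f"Minority recall: {recall_score(y_demo, y_demo_pred):.3f}")   # 0.0, useless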

📊 Understanding the Problem

import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Create imbalanced dataset
X, y = make_classification(
    n_samples=10000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    weights=[0.95, 0.05],  # 95% class 0, 5% class 1
    random_state=42
)

print(f"Class distribution:")
print(pd.Series(y).value_counts())
print(f"\nImbalance ratio: {pd.Series(y).value_counts()[0] / pd.Series(y).value_counts()[1]:.1f}:1")

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train baseline model
rf_baseline = RandomForestClassifier(random_state=42)
rf_baseline.fit(X_train, y_train)
y_pred = rf_baseline.predict(X_test)

# Evaluate
print("\nBaseline Model (No handling):")
print(f"Accuracy: {rf_baseline.score(X_test, y_test):.3f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

📈 Resampling Techniques

1. Random Over-Sampling

from imblearn.over_sampling import RandomOverSampler

# Randomly duplicate minority class samples
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

print(f"Original training set:")
print(pd.Series(y_train).value_counts())
print(f"\nAfter over-sampling:")
print(pd.Series(y_train_ros).value_counts())

# Train model
rf_ros = RandomForestClassifier(random_state=42)
rf_ros.fit(X_train_ros, y_train_ros)
y_pred_ros = rf_ros.predict(X_test)

print("\nWith Random Over-Sampling:")
print(classification_report(y_test, y_pred_ros))

2. Random Under-Sampling

from imblearn.under_sampling import RandomUnderSampler

# Randomly remove majority class samples
rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)

print(f"After under-sampling:")
print(pd.Series(y_train_rus).value_counts())

# Train model
rf_rus = RandomForestClassifier(random_state=42)
rf_rus.fit(X_train_rus, y_train_rus)
y_pred_rus = rf_rus.predict(X_test)

print("\nWith Random Under-Sampling:")
print(classification_report(y_test, y_pred_rus))

3. SMOTE (Synthetic Minority Over-sampling Technique)

from imblearn.over_sampling import SMOTE

# Create synthetic samples by interpolating between minority samples
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print(f"After SMOTE:")
print(pd.Series(y_train_smote).value_counts())

# Train model
rf_smote = RandomForestClassifier(random_state=42)
rf_smote.fit(X_train_smote, y_train_smote)
y_pred_smote = rf_smote.predict(X_test)

print("\nWith SMOTE:")
print(classification_report(y_test, y_pred_smote))

# SMOTE variants
from imblearn.over_sampling import SMOTENC, SMOTEN, ADASYN

# SMOTENC: For datasets with categorical features
# SMOTEN: For datasets with only categorical features
# ADASYN: Adaptive Synthetic Sampling (focuses on hard-to-learn samples)
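
These variants share SMOTE's fit_resample API. A quick sketch with ADASYN, reusing X_train and y_train from above (note the resulting counts are only approximately balanced, since ADASYN concentrates on hard-to-learn regions):

adasyn = ADASYN(random_state=42)
X_train_ada, y_train_ada = adasyn.fit_resample(X_train, y_train)
print("After ADASYN:")
print(pd.Series(y_train_ada).value_counts())

# SMOTENC additionally needs the indices of the categorical columns
# (hypothetical indices, for illustration only):
# smotenc = SMOTENC(categorical_features=[0, 3], random_state=42)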

4. Combined Sampling (SMOTE + Tomek Links)

from imblearn.combine import SMOTETomek

# Over-sample with SMOTE, then clean up with Tomek links
smote_tomek = SMOTETomek(random_state=42)
X_train_st, y_train_st = smote_tomek.fit_resample(X_train, y_train)

print(f"After SMOTE + Tomek:")
print(pd.Series(y_train_st).value_counts())

rf_st = RandomForestClassifier(random_state=42)
rf_st.fit(X_train_st, y_train_st)
y_pred_st = rf_st.predict(X_test)

print("\nWith SMOTE + Tomek:")
print(classification_report(y_test, y_pred_st))

⚖️ Class Weights

# Penalize misclassification of minority class more
from sklearn.utils.class_weight import compute_class_weight

# Compute balanced class weights
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weight_dict = dict(enumerate(class_weights))
print(f"Class weights: {class_weight_dict}")

# Use in model
rf_weighted = RandomForestClassifier(
    class_weight='balanced',  # or pass class_weight_dict
    random_state=42
)
rf_weighted.fit(X_train, y_train)
y_pred_weighted = rf_weighted.predict(X_test)

print("\nWith Class Weights:")
print(classification_report(y_test, y_pred_weighted))

# Works with many sklearn models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

logreg_weighted = LogisticRegression(class_weight='balanced', max_iter=1000)
svm_weighted = SVC(class_weight='balanced')
dt_weighted = DecisionTreeClassifier(class_weight='balanced')
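
For intuition: 'balanced' simply sets each class weight to n_samples / (n_classes * class_count), so here the 5% minority class receives a weight of about 10 while the majority class gets about 0.53. A quick sketch verifying this against compute_class_weight:

# Manual computation of 'balanced' weights; should match class_weight_dict
manual_weights = len(y_train) / (2 * np.bincount(y_train))
print(f"Manual weights: {manual_weights}")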

🎯 Threshold Adjustment

# Adjust decision threshold instead of 0.5
from sklearn.metrics import precision_recall_curve, roc_curve

# Get probability predictions
y_proba = rf_baseline.predict_proba(X_test)[:, 1]

# Find the threshold that maximizes F1
# (in practice, tune this on a validation set, not the test set)
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)

# F1 score for each threshold; the final precision/recall point has no
# corresponding threshold, so it is excluded from the search
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
optimal_idx = np.argmax(f1_scores[:-1])
optimal_threshold = thresholds[optimal_idx]

print(f"Optimal threshold: {optimal_threshold:.3f}")
print(f"Best F1 score: {f1_scores[optimal_idx]:.3f}")

# Apply optimal threshold
y_pred_optimal = (y_proba >= optimal_threshold).astype(int)

print("\nWith Optimal Threshold:")
print(classification_report(y_test, y_pred_optimal))

# Plot threshold vs metrics
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.plot(thresholds, precision[:-1], label='Precision')
plt.plot(thresholds, recall[:-1], label='Recall')
plt.plot(thresholds, f1_scores[:-1], label='F1 Score')
plt.axvline(optimal_threshold, color='r', linestyle='--', label='Optimal')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('Metrics vs Threshold')
plt.legend()
plt.grid(True)
plt.show()
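
If you are on scikit-learn >= 1.5, TunedThresholdClassifierCV can automate this search with internal cross-validation, so the threshold is never tuned on the test set. A sketch, assuming that version is available:

from sklearn.model_selection import TunedThresholdClassifierCV

# Tunes the decision threshold via cross-validation on the training set
tuned_rf = TunedThresholdClassifierCV(
    RandomForestClassifier(random_state=42),
    scoring='f1'  # optimize the same metric as the manual search above
)
tuned_rf.fit(X_train, y_train)
print(f"CV-tuned threshold: {tuned_rf.best_threshold_:.3f}")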

📊 Ensemble Methods

1. Balanced Random Forest

from imblearn.ensemble import BalancedRandomForestClassifier

# Each tree trained on balanced bootstrap sample
brf = BalancedRandomForestClassifier(
    n_estimators=100,
    sampling_strategy='auto',  # Balance classes in each bootstrap
    replacement=True,
    random_state=42
)

brf.fit(X_train, y_train)
y_pred_brf = brf.predict(X_test)

print("Balanced Random Forest:")
print(classification_report(y_test, y_pred_brf))

2. Easy Ensemble

from imblearn.ensemble import EasyEnsembleClassifier

# Ensemble of AdaBoost learners, each trained on a balanced subset
# created by randomly under-sampling the majority class
eec = EasyEnsembleClassifier(
    n_estimators=10,
    random_state=42
)

eec.fit(X_train, y_train)
y_pred_eec = eec.predict(X_test)

print("\nEasy Ensemble:")
print(classification_report(y_test, y_pred_eec))

3. Balanced Bagging

from imblearn.ensemble import BalancedBaggingClassifier

bbc = BalancedBaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=10,
    random_state=42
)

bbc.fit(X_train, y_train)
y_pred_bbc = bbc.predict(X_test)

print("\nBalanced Bagging:")
print(classification_report(y_test, y_pred_bbc))

📏 Better Evaluation Metrics

from sklearn.metrics import (
    balanced_accuracy_score,
    matthews_corrcoef,
    roc_auc_score,
    average_precision_score,
    f1_score
)

# Don't use accuracy!
print("Evaluation Metrics for Imbalanced Data:")
print(f"Accuracy: {rf_baseline.score(X_test, y_test):.3f} ❌ Misleading!")
print(f"Balanced Accuracy: {balanced_accuracy_score(y_test, y_pred):.3f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.3f}")
print(f"ROC-AUC: {roc_auc_score(y_test, y_proba):.3f}")
print(f"PR-AUC: {average_precision_score(y_test, y_proba):.3f}")
print(f"MCC: {matthews_corrcoef(y_test, y_pred):.3f}")

# Confusion matrix breakdown (confusion_matrix was imported above)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f"\nDetailed Metrics:")
print(f"True Negatives: {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives: {tp}")
print(f"Sensitivity (Recall): {tp/(tp+fn):.3f}")
print(f"Specificity: {tn/(tn+fp):.3f}")
print(f"Precision: {tp/(tp+fp):.3f}")

📊 Comparison Table

| Technique | Pros | Cons | When to Use |
|---|---|---|---|
| Random Over-Sampling | Simple, fast | Risk of overfitting to duplicated samples | Quick baseline |
| Random Under-Sampling | Fast, shrinks the data | Discards potentially useful information | Very large datasets |
| SMOTE | Synthetic samples, less overfitting than plain duplication | Can generate noisy samples near class boundaries | Good default for tabular data |
| Class Weights | No data modification, fast | May not be enough under extreme imbalance | Tree and linear models |
| Threshold Tuning | No retraining needed | Needs well-calibrated probabilities (see the sketch below) | After model training |
| Ensemble Methods | Often the best performance | More complex, slower | Production systems |
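
Since threshold tuning leans on trustworthy probabilities, calibrating the model first can help. A sketch with CalibratedClassifierCV (the estimator keyword assumes scikit-learn >= 1.2; older versions used base_estimator):

from sklearn.calibration import CalibratedClassifierCV

# Wrap the model so its predicted probabilities are calibrated via CV
calibrated_rf = CalibratedClassifierCV(
    estimator=RandomForestClassifier(random_state=42),
    method='isotonic',  # 'sigmoid' can be safer on small datasets
    cv=3
)
calibrated_rf.fit(X_train, y_train)
y_proba_cal = calibrated_rf.predict_proba(X_test)[:, 1]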

💡 Best Practices

  • Split first, resample second: only the training set is resampled, so the test set keeps the real-world distribution
  • Use stratify= in train_test_split and StratifiedKFold in cross-validation so every fold preserves the class ratio
  • Wrap samplers in an imblearn pipeline so resampling happens inside each CV fold (see the workflow below)
  • Report F1, PR-AUC, or MCC instead of accuracy

⚠️ Common Mistakes

  • Resampling before the train/test split, which leaks duplicated or synthetic samples into the test set
  • Reporting accuracy alone on an imbalanced test set
  • Applying plain SMOTE to categorical features (use SMOTENC or SMOTEN instead)
  • Tuning the decision threshold on the test set rather than a validation set

🔍 Complete Workflow

# Use imblearn's Pipeline: unlike sklearn's, it supports samplers and
# applies them only during fit, so resampling never touches validation folds
from imblearn.pipeline import Pipeline as ImbPipeline

# Create imbalanced data handling pipeline
pipeline = ImbPipeline([
    ('sampling', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42))
])

# Cross-validation with stratification
from sklearn.model_selection import cross_val_score, StratifiedKFold

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X_train, y_train, 
                         cv=cv, scoring='f1')

print(f"Cross-validation F1 scores: {scores}")
print(f"Mean F1: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

# Final evaluation
pipeline.fit(X_train, y_train)
y_pred_final = pipeline.predict(X_test)
y_proba_final = pipeline.predict_proba(X_test)[:, 1]

print("\nFinal Model Performance:")
print(classification_report(y_test, y_pred_final))
print(f"ROC-AUC: {roc_auc_score(y_test, y_proba_final):.3f}")
print(f"PR-AUC: {average_precision_score(y_test, y_proba_final):.3f}")

🎯 Key Takeaways