What is Imbalanced Data?
Imbalanced data occurs when the classes in a dataset are not represented equally. It is common in fraud detection (e.g., 99% legitimate transactions, 1% fraud), medical diagnosis, anomaly detection, and many other real-world problems.
Why It's a Problem:
- Accuracy is misleading: a model that always predicts the majority class can still score 99% accuracy (see the sketch after this list)
- Model bias: Learns to ignore minority class
- Poor predictions: Fails on the class you care about most
- Evaluation challenges: Need better metrics than accuracy
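To see the accuracy trap concretely, here is a minimal sketch using scikit-learn's DummyClassifier on a made-up dataset with a 1% minority class (variable names like X_demo and y_demo are illustrative, not from the sections below):
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, recall_score
X_demo = np.random.RandomState(0).normal(size=(1000, 3))  # features are irrelevant here
y_demo = np.zeros(1000, dtype=int)
y_demo[:10] = 1  # 1% minority class
always_majority = DummyClassifier(strategy='most_frequent')
always_majority.fit(X_demo, y_demo)
y_hat = always_majority.predict(X_demo)
print(f"Accuracy: {accuracy_score(y_demo, y_hat):.3f}")        # ~0.99 -- looks great
print(f"Minority recall: {recall_score(y_demo, y_hat):.3f}")   # 0.000 -- useless model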
📊 Understanding the Problem
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
# Create imbalanced dataset
X, y = make_classification(
    n_samples=10000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    weights=[0.95, 0.05],  # 95% class 0, 5% class 1
    random_state=42
)
print(f"Class distribution:")
print(pd.Series(y).value_counts())
print(f"\nImbalance ratio: {pd.Series(y).value_counts()[0] / pd.Series(y).value_counts()[1]:.1f}:1")
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Train baseline model
rf_baseline = RandomForestClassifier(random_state=42)
rf_baseline.fit(X_train, y_train)
y_pred = rf_baseline.predict(X_test)
# Evaluate
print("\nBaseline Model (No handling):")
print(f"Accuracy: {rf_baseline.score(X_test, y_test):.3f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
📈 Resampling Techniques
1. Random Over-Sampling
from imblearn.over_sampling import RandomOverSampler
# Randomly duplicate minority class samples
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)
print(f"Original training set:")
print(pd.Series(y_train).value_counts())
print(f"\nAfter over-sampling:")
print(pd.Series(y_train_ros).value_counts())
# Train model
rf_ros = RandomForestClassifier(random_state=42)
rf_ros.fit(X_train_ros, y_train_ros)
y_pred_ros = rf_ros.predict(X_test)
print("\nWith Random Over-Sampling:")
print(classification_report(y_test, y_pred_ros))
2. Random Under-Sampling
from imblearn.under_sampling import RandomUnderSampler
# Randomly remove majority class samples
rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)
print(f"After under-sampling:")
print(pd.Series(y_train_rus).value_counts())
# Train model
rf_rus = RandomForestClassifier(random_state=42)
rf_rus.fit(X_train_rus, y_train_rus)
y_pred_rus = rf_rus.predict(X_test)
print("\nWith Random Under-Sampling:")
print(classification_report(y_test, y_pred_rus))
3. SMOTE (Synthetic Minority Over-sampling)
from imblearn.over_sampling import SMOTE
# Create synthetic samples by interpolating between minority samples
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print(f"After SMOTE:")
print(pd.Series(y_train_smote).value_counts())
# Train model
rf_smote = RandomForestClassifier(random_state=42)
rf_smote.fit(X_train_smote, y_train_smote)
y_pred_smote = rf_smote.predict(X_test)
print("\nWith SMOTE:")
print(classification_report(y_test, y_pred_smote))
# SMOTE variants
from imblearn.over_sampling import SMOTENC, SMOTEN, ADASYN
# SMOTENC: For datasets with categorical features
# SMOTEN: For datasets with only categorical features
# ADASYN: Adaptive Synthetic Sampling (focuses on hard-to-learn samples)
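A quick sketch of how these variants are called. ADASYN runs directly on the numeric X_train from above; the SMOTENC lines are left commented out because they assume a hypothetical dataset with categorical columns (the indices [0, 3] are only an illustration):
adasyn = ADASYN(random_state=42)  # generates more synthetic points near hard-to-learn minority samples
X_train_ada, y_train_ada = adasyn.fit_resample(X_train, y_train)
print("After ADASYN:")
print(pd.Series(y_train_ada).value_counts())
# SMOTENC needs to know which columns are categorical so it does not interpolate them:
# smotenc = SMOTENC(categorical_features=[0, 3], random_state=42)
# X_res, y_res = smotenc.fit_resample(X_mixed, y_mixed)  # X_mixed/y_mixed are hypothetical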
4. Combined Sampling (SMOTE + Tomek Links)
from imblearn.combine import SMOTETomek
# Over-sample with SMOTE, then remove Tomek links (overlapping majority/minority pairs) to clean the class boundary
smote_tomek = SMOTETomek(random_state=42)
X_train_st, y_train_st = smote_tomek.fit_resample(X_train, y_train)
print(f"After SMOTE + Tomek:")
print(pd.Series(y_train_st).value_counts())
rf_st = RandomForestClassifier(random_state=42)
rf_st.fit(X_train_st, y_train_st)
y_pred_st = rf_st.predict(X_test)
print("\nWith SMOTE + Tomek:")
print(classification_report(y_test, y_pred_st))
⚖️ Class Weights
# Penalize misclassification of minority class more
from sklearn.utils.class_weight import compute_class_weight
# Compute balanced class weights
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weight_dict = dict(enumerate(class_weights))
print(f"Class weights: {class_weight_dict}")
# Use in model
rf_weighted = RandomForestClassifier(
    class_weight='balanced',  # or pass class_weight_dict
    random_state=42
)
rf_weighted.fit(X_train, y_train)
y_pred_weighted = rf_weighted.predict(X_test)
print("\nWith Class Weights:")
print(classification_report(y_test, y_pred_weighted))
# Works with many sklearn models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
logreg_weighted = LogisticRegression(class_weight='balanced', max_iter=1000)
svm_weighted = SVC(class_weight='balanced')
dt_weighted = DecisionTreeClassifier(class_weight='balanced')
🎯 Threshold Adjustment
# Adjust decision threshold instead of 0.5
from sklearn.metrics import precision_recall_curve, roc_curve
# Get probability predictions
y_proba = rf_baseline.predict_proba(X_test)[:, 1]
# Find optimal threshold
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
# F1 score for each threshold
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
# precision/recall have one more element than thresholds, so exclude the last point
optimal_idx = np.argmax(f1_scores[:-1])
optimal_threshold = thresholds[optimal_idx]
print(f"Optimal threshold: {optimal_threshold:.3f}")
print(f"Best F1 score: {f1_scores[optimal_idx]:.3f}")
# Apply optimal threshold
y_pred_optimal = (y_proba >= optimal_threshold).astype(int)
print("\nWith Optimal Threshold:")
print(classification_report(y_test, y_pred_optimal))
# Plot threshold vs metrics
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
plt.plot(thresholds, precision[:-1], label='Precision')
plt.plot(thresholds, recall[:-1], label='Recall')
plt.plot(thresholds, f1_scores[:-1], label='F1 Score')
plt.axvline(optimal_threshold, color='r', linestyle='--', label='Optimal')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('Metrics vs Threshold')
plt.legend()
plt.grid(True)
plt.show()
📊 Ensemble Methods
1. Balanced Random Forest
from imblearn.ensemble import BalancedRandomForestClassifier
# Each tree trained on balanced bootstrap sample
brf = BalancedRandomForestClassifier(
    n_estimators=100,
    sampling_strategy='auto',  # balance classes in each bootstrap sample
    replacement=True,
    random_state=42
)
brf.fit(X_train, y_train)
y_pred_brf = brf.predict(X_test)
print("Balanced Random Forest:")
print(classification_report(y_test, y_pred_brf))
2. Easy Ensemble
from imblearn.ensemble import EasyEnsembleClassifier
# Ensemble of AdaBoost learners, each trained on a balanced, randomly under-sampled subset
eec = EasyEnsembleClassifier(
    n_estimators=10,
    random_state=42
)
eec.fit(X_train, y_train)
y_pred_eec = eec.predict(X_test)
print("\nEasy Ensemble:")
print(classification_report(y_test, y_pred_eec))
3. Balanced Bagging
from imblearn.ensemble import BalancedBaggingClassifier
bbc = BalancedBaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=10,
    random_state=42
)
bbc.fit(X_train, y_train)
y_pred_bbc = bbc.predict(X_test)
print("\nBalanced Bagging:")
print(classification_report(y_test, y_pred_bbc))
📏 Better Evaluation Metrics
from sklearn.metrics import (
    balanced_accuracy_score,
    matthews_corrcoef,
    roc_auc_score,
    average_precision_score,
    f1_score
)
# Don't use accuracy!
print("Evaluation Metrics for Imbalanced Data:")
print(f"Accuracy: {rf_baseline.score(X_test, y_test):.3f} ❌ Misleading!")
print(f"Balanced Accuracy: {balanced_accuracy_score(y_test, y_pred):.3f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.3f}")
print(f"ROC-AUC: {roc_auc_score(y_test, y_proba):.3f}")
print(f"PR-AUC: {average_precision_score(y_test, y_proba):.3f}")
print(f"MCC: {matthews_corrcoef(y_test, y_pred):.3f}")
# Confusion matrix breakdown
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f"\nDetailed Metrics:")
print(f"True Negatives: {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives: {tp}")
print(f"Sensitivity (Recall): {tp/(tp+fn):.3f}")
print(f"Specificity: {tn/(tn+fp):.3f}")
print(f"Precision: {tp/(tp+fp):.3f}")
📊 Comparison Table
| Technique | Pros | Cons | When to Use |
|---|---|---|---|
| Random Over-Sampling | Simple, fast | Overfitting risk | Quick baseline |
| Random Under-Sampling | Fast, reduces data | Loses information | Very large datasets |
| SMOTE | Synthetic samples, less overfitting than duplication | Can create noisy samples near class boundaries | Most cases (good default) |
| Class Weights | No data modification, fast | May not be enough for severe imbalance | Tree models, linear models |
| Threshold Tuning | No retraining needed | Needs reliable probability estimates | After model training |
| Ensemble Methods | Often best performance | More complex, slower | Production systems |
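To ground the table, here is a short comparison on the dataset above, reusing the predictions computed in the previous sections (it assumes those cells have been run; add or remove entries to match what you trained):
from sklearn.metrics import f1_score, recall_score
results = {
    'Baseline': y_pred,
    'Random Over-Sampling': y_pred_ros,
    'Random Under-Sampling': y_pred_rus,
    'SMOTE': y_pred_smote,
    'SMOTE + Tomek': y_pred_st,
    'Class Weights': y_pred_weighted,
    'Threshold Tuning': y_pred_optimal,
    'Balanced RF': y_pred_brf,
}
for name, preds in results.items():
    print(f"{name:22s} F1: {f1_score(y_test, preds):.3f}  Recall: {recall_score(y_test, preds):.3f}")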
💡 Best Practices
- Stratify train/test split: Maintain class ratio in both sets
- Don't use accuracy: Use F1, ROC-AUC, PR-AUC, or MCC
- Try SMOTE first: Often works well as default
- Combine techniques: SMOTE + class weights can work better
- Tune threshold: Find optimal decision boundary
- Use cross-validation: Stratified K-fold for reliable estimates
- Consider business cost: false positives and false negatives may carry different costs (see the cost-based sketch after this list)
- Collect more minority data: Best solution if possible
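As a sketch of the business-cost bullet above: pick the threshold that minimizes expected cost under assumed, purely illustrative costs of 1 per false positive and 10 per false negative, reusing y_proba from the threshold section. In practice, tune this on a validation set rather than the test set.
cost_fp, cost_fn = 1, 10  # hypothetical costs -- adjust to your domain
candidate_thresholds = np.linspace(0.01, 0.99, 99)
expected_costs = []
for t in candidate_thresholds:
    preds = (y_proba >= t).astype(int)
    fp = int(((preds == 1) & (y_test == 0)).sum())
    fn = int(((preds == 0) & (y_test == 1)).sum())
    expected_costs.append(cost_fp * fp + cost_fn * fn)
best_t = candidate_thresholds[int(np.argmin(expected_costs))]
print(f"Cost-minimizing threshold: {best_t:.2f} (total cost: {min(expected_costs)})")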
⚠️ Common Mistakes
- Using accuracy: 99% accuracy is useless if minority class ignored
- Not stratifying split: Can end up with no minority samples in test
- Resampling before split: Causes data leakage
# WRONG: resampling the entire dataset before splitting leaks synthetic copies into the test set
X_resampled, y_resampled = SMOTE().fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled)
# CORRECT: split first, then resample only the training data
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
X_train_resampled, y_train_resampled = SMOTE().fit_resample(X_train, y_train)
- Over-sampling test set: Only resample training data!
- Ignoring data collection: Getting more minority samples often best
- Not trying multiple approaches: Different techniques work for different data
🔍 Complete Workflow
# Use imblearn's Pipeline: it applies the sampler only during fit, so cross-validation
# resamples each training fold without leaking into the validation fold
from imblearn.pipeline import Pipeline as ImbPipeline
# Create imbalanced data handling pipeline
pipeline = ImbPipeline([
    ('sampling', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42))
])
# Cross-validation with stratification
from sklearn.model_selection import cross_val_score, StratifiedKFold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X_train, y_train,
                         cv=cv, scoring='f1')
print(f"Cross-validation F1 scores: {scores}")
print(f"Mean F1: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
# Final evaluation
pipeline.fit(X_train, y_train)
y_pred_final = pipeline.predict(X_test)
y_proba_final = pipeline.predict_proba(X_test)[:, 1]
print("\nFinal Model Performance:")
print(classification_report(y_test, y_pred_final))
print(f"ROC-AUC: {roc_auc_score(y_test, y_proba_final):.3f}")
print(f"PR-AUC: {average_precision_score(y_test, y_proba_final):.3f}")
🎯 Key Takeaways
- Imbalanced data is common in real-world problems
- Accuracy is misleading - use F1, ROC-AUC, or PR-AUC
- SMOTE creates synthetic minority samples (best default)
- Class weights penalize minority misclassification more
- Threshold tuning adjusts decision boundary
- Always stratify train/test split and cross-validation
- Resample training data only to avoid leakage