What is Imbalanced Data?
Imbalanced data occurs when the classes in a dataset are not represented equally. It is common in fraud detection (e.g., 99% legitimate transactions, 1% fraud), medical diagnosis, anomaly detection, and many other real-world problems.
Why It's a Problem:
- Accuracy is misleading: a model that always predicts the majority class can still score 99% accuracy (see the sketch after this list)
- Model bias: Learns to ignore minority class
- Poor predictions: Fails on the class you care about most
- Evaluation challenges: Need better metrics than accuracy
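To see the accuracy trap concretely, here is a minimal sketch using scikit-learn's DummyClassifier on a made-up dataset with a 1% minority class (variable names like X_demo and y_demo are illustrative, not from the sections below):
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, recall_score
X_demo = np.random.RandomState(0).normal(size=(1000, 3))  # features are irrelevant here
y_demo = np.zeros(1000, dtype=int)
y_demo[:10] = 1  # 1% minority class
always_majority = DummyClassifier(strategy='most_frequent')
always_majority.fit(X_demo, y_demo)
y_hat = always_majority.predict(X_demo)
print(f"Accuracy: {accuracy_score(y_demo, y_hat):.3f}")        # ~0.99 -- looks great
print(f"Minority recall: {recall_score(y_demo, y_hat):.3f}")   # 0.000 -- useless model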
📊 Understanding the Problem
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
# Create imbalanced dataset
X, y = make_classification(
    n_samples=10000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    weights=[0.95, 0.05],  # 95% class 0, 5% class 1
    random_state=42
)
print(f"Class distribution:")
print(pd.Series(y).value_counts())
print(f"\nImbalance ratio: {pd.Series(y).value_counts()[0] / pd.Series(y).value_counts()[1]:.1f}:1")
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Train baseline model
rf_baseline = RandomForestClassifier(random_state=42)
rf_baseline.fit(X_train, y_train)
y_pred = rf_baseline.predict(X_test)
# Evaluate
print("\nBaseline Model (No handling):")
print(f"Accuracy: {rf_baseline.score(X_test, y_test):.3f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
📈 Resampling Techniques
1. Random Over-Sampling
from imblearn.over_sampling import RandomOverSampler
# Randomly duplicate minority class samples
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)
print(f"Original training set:")
print(pd.Series(y_train).value_counts())
print(f"\nAfter over-sampling:")
print(pd.Series(y_train_ros).value_counts())
# Train model
rf_ros = RandomForestClassifier(random_state=42)
rf_ros.fit(X_train_ros, y_train_ros)
y_pred_ros = rf_ros.predict(X_test)
print("\nWith Random Over-Sampling:")
print(classification_report(y_test, y_pred_ros))
2. Random Under-Sampling
from imblearn.under_sampling import RandomUnderSampler
# Randomly remove majority class samples
rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)
print(f"After under-sampling:")
print(pd.Series(y_train_rus).value_counts())
# Train model
rf_rus = RandomForestClassifier(random_state=42)
rf_rus.fit(X_train_rus, y_train_rus)
y_pred_rus = rf_rus.predict(X_test)
print("\nWith Random Under-Sampling:")
print(classification_report(y_test, y_pred_rus))
3. SMOTE (Synthetic Minority Over-sampling)
from imblearn.over_sampling import SMOTE
# Create synthetic samples by interpolating between minority samples
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print(f"After SMOTE:")
print(pd.Series(y_train_smote).value_counts())
# Train model
rf_smote = RandomForestClassifier(random_state=42)
rf_smote.fit(X_train_smote, y_train_smote)
y_pred_smote = rf_smote.predict(X_test)
print("\nWith SMOTE:")
print(classification_report(y_test, y_pred_smote))
# SMOTE variants
from imblearn.over_sampling import SMOTENC, SMOTEN, ADASYN
# SMOTENC: For datasets with categorical features
# SMOTEN: For datasets with only categorical features
# ADASYN: Adaptive Synthetic Sampling (focuses on hard-to-learn samples)
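A quick sketch of how these variants are called. ADASYN runs directly on the numeric X_train from above; the SMOTENC lines are left commented out because they assume a hypothetical dataset with categorical columns (the indices [0, 3] are only an illustration):
adasyn = ADASYN(random_state=42)  # generates more synthetic points near hard-to-learn minority samples
X_train_ada, y_train_ada = adasyn.fit_resample(X_train, y_train)
print("After ADASYN:")
print(pd.Series(y_train_ada).value_counts())
# SMOTENC needs to know which columns are categorical so it does not interpolate them:
# smotenc = SMOTENC(categorical_features=[0, 3], random_state=42)
# X_res, y_res = smotenc.fit_resample(X_mixed, y_mixed)  # X_mixed/y_mixed are hypothetical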
4. Combined Sampling (SMOTE + Tomek Links)
from imblearn.combine import SMOTETomek
# Over-sample with SMOTE, then remove Tomek links (overlapping majority/minority pairs) to clean the class boundary
smote_tomek = SMOTETomek(random_state=42)
X_train_st, y_train_st = smote_tomek.fit_resample(X_train, y_train)
print(f"After SMOTE + Tomek:")
print(pd.Series(y_train_st).value_counts())
rf_st = RandomForestClassifier(random_state=42)
rf_st.fit(X_train_st, y_train_st)
y_pred_st = rf_st.predict(X_test)
print("\nWith SMOTE + Tomek:")
print(classification_report(y_test, y_pred_st))
⚖️ Class Weights
# Penalize misclassification of minority class more
from sklearn.utils.class_weight import compute_class_weight
# Compute balanced class weights
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weight_dict = dict(enumerate(class_weights))
print(f"Class weights: {class_weight_dict}")
# Use in model
rf_weighted = RandomForestClassifier(
    class_weight='balanced',  # or pass class_weight_dict
    random_state=42
)
rf_weighted.fit(X_train, y_train)
y_pred_weighted = rf_weighted.predict(X_test)
print("\nWith Class Weights:")
print(classification_report(y_test, y_pred_weighted))
# Works with many sklearn models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
logreg_weighted = LogisticRegression(class_weight='balanced', max_iter=1000)
svm_weighted = SVC(class_weight='balanced')
dt_weighted = DecisionTreeClassifier(class_weight='balanced')
🎯 Threshold Adjustment
# Adjust decision threshold instead of 0.5
from sklearn.metrics import precision_recall_curve, roc_curve
# Get probability predictions
y_proba = rf_baseline.predict_proba(X_test)[:, 1]
# Find optimal threshold
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
# F1 score for each threshold
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
# precision/recall have one more element than thresholds, so exclude the last point
optimal_idx = np.argmax(f1_scores[:-1])
optimal_threshold = thresholds[optimal_idx]
print(f"Optimal threshold: {optimal_threshold:.3f}")
print(f"Best F1 score: {f1_scores[optimal_idx]:.3f}")
# Apply optimal threshold
y_pred_optimal = (y_proba >= optimal_threshold).astype(int)
print("\nWith Optimal Threshold:")
print(classification_report(y_test, y_pred_optimal))
# Plot threshold vs metrics
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
plt.plot(thresholds, precision[:-1], label='Precision')
plt.plot(thresholds, recall[:-1], label='Recall')
plt.plot(thresholds, f1_scores[:-1], label='F1 Score')
plt.axvline(optimal_threshold, color='r', linestyle='--', label='Optimal')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('Metrics vs Threshold')
plt.legend()
plt.grid(True)
plt.show()
📊 Ensemble Methods
1. Balanced Random Forest
from imblearn.ensemble import BalancedRandomForestClassifier
# Each tree trained on balanced bootstrap sample
brf = BalancedRandomForestClassifier(
    n_estimators=100,
    sampling_strategy='auto',  # balance classes in each bootstrap sample
    replacement=True,
    random_state=42
)
brf.fit(X_train, y_train)
y_pred_brf = brf.predict(X_test)
print("Balanced Random Forest:")
print(classification_report(y_test, y_pred_brf))
2. Easy Ensemble
from imblearn.ensemble import EasyEnsembleClassifier
# Ensemble of AdaBoost learners, each trained on a balanced, randomly under-sampled subset
eec = EasyEnsembleClassifier(
    n_estimators=10,
    random_state=42
)
eec.fit(X_train, y_train)
y_pred_eec = eec.predict(X_test)
print("\nEasy Ensemble:")
print(classification_report(y_test, y_pred_eec))
3. Balanced Bagging
from imblearn.ensemble import BalancedBaggingClassifier
bbc = BalancedBaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=10,
    random_state=42
)
bbc.fit(X_train, y_train)
y_pred_bbc = bbc.predict(X_test)
print("\nBalanced Bagging:")
print(classification_report(y_test, y_pred_bbc))
📏 Better Evaluation Metrics
from sklearn.metrics import (
    balanced_accuracy_score,
    matthews_corrcoef,
    roc_auc_score,
    average_precision_score,
    f1_score
)
# Don't use accuracy!
print("Evaluation Metrics for Imbalanced Data:")
print(f"Accuracy: {rf_baseline.score(X_test, y_test):.3f} ❌ Misleading!")
print(f"Balanced Accuracy: {balanced_accuracy_score(y_test, y_pred):.3f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.3f}")
print(f"ROC-AUC: {roc_auc_score(y_test, y_proba):.3f}")
print(f"PR-AUC: {average_precision_score(y_test, y_proba):.3f}")
print(f"MCC: {matthews_corrcoef(y_test, y_pred):.3f}")
# Confusion matrix breakdown
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f"\nDetailed Metrics:")
print(f"True Negatives: {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives: {tp}")
print(f"Sensitivity (Recall): {tp/(tp+fn):.3f}")
print(f"Specificity: {tn/(tn+fp):.3f}")
print(f"Precision: {tp/(tp+fp):.3f}")
📊 Comparison Table
| Technique | Pros | Cons | When to Use |
|---|---|---|---|
| Random Over-Sampling | Simple, fast | Overfitting risk | Quick baseline |
| Random Under-Sampling | Fast, reduces data | Loses information | Very large datasets |
| SMOTE | Synthetic samples, less overfitting than duplication | Can create noisy samples near class boundaries | Most cases (good default) |
| Class Weights | No data modification, fast | May not be enough for severe imbalance | Tree models, linear models |
| Threshold Tuning | No retraining needed | Needs reliable probability estimates | After model training |
| Ensemble Methods | Often best performance | More complex, slower | Production systems |
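To ground the table, here is a short comparison on the dataset above, reusing the predictions computed in the previous sections (it assumes those cells have been run; add or remove entries to match what you trained):
from sklearn.metrics import f1_score, recall_score
results = {
    'Baseline': y_pred,
    'Random Over-Sampling': y_pred_ros,
    'Random Under-Sampling': y_pred_rus,
    'SMOTE': y_pred_smote,
    'SMOTE + Tomek': y_pred_st,
    'Class Weights': y_pred_weighted,
    'Threshold Tuning': y_pred_optimal,
    'Balanced RF': y_pred_brf,
}
for name, preds in results.items():
    print(f"{name:22s} F1: {f1_score(y_test, preds):.3f}  Recall: {recall_score(y_test, preds):.3f}")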
💡 Best Practices
- Stratify train/test split: Maintain class ratio in both sets
- Don't use accuracy: Use F1, ROC-AUC, PR-AUC, or MCC
- Try SMOTE first: Often works well as default
- Combine techniques: SMOTE + class weights can work better
- Tune threshold: Find optimal decision boundary
- Use cross-validation: Stratified K-fold for reliable estimates
- Consider business cost: false positives and false negatives may carry different costs (see the cost-based sketch after this list)
- Collect more minority data: Best solution if possible
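As a sketch of the business-cost bullet above: pick the threshold that minimizes expected cost under assumed, purely illustrative costs of 1 per false positive and 10 per false negative, reusing y_proba from the threshold section. In practice, tune this on a validation set rather than the test set.
cost_fp, cost_fn = 1, 10  # hypothetical costs -- adjust to your domain
candidate_thresholds = np.linspace(0.01, 0.99, 99)
expected_costs = []
for t in candidate_thresholds:
    preds = (y_proba >= t).astype(int)
    fp = int(((preds == 1) & (y_test == 0)).sum())
    fn = int(((preds == 0) & (y_test == 1)).sum())
    expected_costs.append(cost_fp * fp + cost_fn * fn)
best_t = candidate_thresholds[int(np.argmin(expected_costs))]
print(f"Cost-minimizing threshold: {best_t:.2f} (total cost: {min(expected_costs)})")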
⚠️ Common Mistakes
- Using accuracy: 99% accuracy is useless if minority class ignored
- Not stratifying split: Can end up with no minority samples in test
- Resampling before split: Causes data leakage
# WRONG: resampling the entire dataset before splitting leaks synthetic copies into the test set
X_resampled, y_resampled = SMOTE().fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled)
# CORRECT: split first, then resample only the training data
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
X_train_resampled, y_train_resampled = SMOTE().fit_resample(X_train, y_train)
- Over-sampling test set: Only resample training data!
- Ignoring data collection: Getting more minority samples often best
- Not trying multiple approaches: Different techniques work for different data
🔍 Complete Workflow
# Use imblearn's Pipeline: it applies the sampler only during fit, so cross-validation
# resamples each training fold without leaking into the validation fold
from imblearn.pipeline import Pipeline as ImbPipeline
# Create imbalanced data handling pipeline
pipeline = ImbPipeline([
    ('sampling', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42))
])
# Cross-validation with stratification
from sklearn.model_selection import cross_val_score, StratifiedKFold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X_train, y_train,
                         cv=cv, scoring='f1')
print(f"Cross-validation F1 scores: {scores}")
print(f"Mean F1: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
# Final evaluation
pipeline.fit(X_train, y_train)
y_pred_final = pipeline.predict(X_test)
y_proba_final = pipeline.predict_proba(X_test)[:, 1]
print("\nFinal Model Performance:")
print(classification_report(y_test, y_pred_final))
print(f"ROC-AUC: {roc_auc_score(y_test, y_proba_final):.3f}")
print(f"PR-AUC: {average_precision_score(y_test, y_proba_final):.3f}")
🎯 Key Takeaways
- Imbalanced data is common in real-world problems
- Accuracy is misleading - use F1, ROC-AUC, or PR-AUC
- SMOTE creates synthetic minority samples (best default)
- Class weights penalize minority misclassification more
- Threshold tuning adjusts decision boundary
- Always stratify train/test split and cross-validation
- Resample training data only to avoid leakage