What are Ensemble Methods?
Ensemble methods combine multiple models to create a stronger predictor. The idea: "wisdom of crowds" - many models together perform better than any single model.
Why Ensemble Methods Work:
- Reduce variance: averaging many high-variance models gives a more stable prediction (see the sketch below)
- Reduce bias: boosting combines many weak learners into a single strong learner
- Improve accuracy: ensembles regularly top ML competition leaderboards
- Robustness: less sensitive to noise and outliers than any single model
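To make the variance-reduction point concrete, here is a minimal NumPy sketch (purely illustrative, not part of the examples below): each simulated "model" predicts the true value plus independent noise, and averaging 50 of them shrinks the variance roughly 50-fold.
import numpy as np

rng = np.random.default_rng(42)
true_value = 0.0

# Each "model" is simulated as the true value plus independent noise (std = 1)
single_preds = rng.normal(true_value, 1.0, size=10_000)
ensemble_preds = rng.normal(true_value, 1.0, size=(10_000, 50)).mean(axis=1)

print(f"Variance of a single model:     {single_preds.var():.3f}")   # ~1.00
print(f"Variance of a 50-model average: {ensemble_preds.var():.3f}")  # ~1/50 = 0.02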
🎒 Bagging (Bootstrap Aggregating)
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np
# Generate data
X, y = make_classification(n_samples=1000, n_features=20,
                           n_informative=15, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Single decision tree (baseline)
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)
tree_score = tree.score(X_test, y_test)
print(f"Single Decision Tree: {tree_score:.3f}")
# Bagging: Train multiple trees on bootstrap samples
bagging = BaggingClassifier(
    estimator=DecisionTreeClassifier(),  # 'estimator' was 'base_estimator' before scikit-learn 1.2
    n_estimators=50,      # Number of base models
    max_samples=0.8,      # 80% of rows per bootstrap sample
    max_features=0.8,     # 80% of features per model
    bootstrap=True,       # Sample with replacement
    random_state=42,
    n_jobs=-1
)
bagging.fit(X_train, y_train)
bagging_score = bagging.score(X_test, y_test)
print(f"Bagging (50 trees): {bagging_score:.3f}")
print(f"Improvement: {bagging_score - tree_score:.3f}")
How Bagging Works:
- Create N bootstrap samples (random sampling with replacement)
- Train a model on each sample
- Average predictions (regression) or take a majority vote (classification) - see the from-scratch sketch below
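A rough from-scratch sketch of those three steps (simplified: no feature subsampling, binary 0/1 labels only, reusing X_train/X_test from above):
n_models = 25
rng = np.random.default_rng(42)
all_preds = []

for _ in range(n_models):
    # 1. Bootstrap sample: draw rows with replacement
    idx = rng.integers(0, len(X_train), size=len(X_train))
    # 2. Train a model on each sample
    model = DecisionTreeClassifier(random_state=0).fit(X_train[idx], y_train[idx])
    all_preds.append(model.predict(X_test))

# 3. Majority vote across the 25 trees (binary labels)
y_vote = (np.mean(all_preds, axis=0) > 0.5).astype(int)
print(f"Manual bagging accuracy: {(y_vote == y_test).mean():.3f}")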
🌲 Random Forest
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
# Random Forest = Bagging + Random Feature Selection
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',  # sqrt(n_features) considered per split
    bootstrap=True,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)
rf_score = rf.score(X_test, y_test)
print(f"Random Forest: {rf_score:.3f}")
# Feature importance
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
print("\nTop 5 Features:")
for i in range(5):
    print(f"{i+1}. Feature {indices[i]}: {importances[indices[i]]:.4f}")
# Out-of-bag score (no need for separate validation)
rf_oob = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=42)
rf_oob.fit(X_train, y_train)
print(f"\nOOB Score: {rf_oob.oob_score_:.3f}")
🚀 Boosting - AdaBoost
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost: Sequential learning, focus on misclassified samples
adaboost = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),  # Weak learners
    n_estimators=50,
    learning_rate=1.0,
    random_state=42
)
adaboost.fit(X_train, y_train)
ada_score = adaboost.score(X_test, y_test)
print(f"AdaBoost: {ada_score:.3f}")
# Estimator weights (higher = more important)
print(f"Estimator weights: {adaboost.estimator_weights_[:5]}")
How AdaBoost Works:
- Train a weak learner on the data
- Increase the weight of misclassified samples
- Train the next learner focusing on those hard examples
- Combine all learners with a weighted vote (see the simplified sketch below)
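A simplified from-scratch version of this loop (discrete AdaBoost with labels relabeled to ±1; a sketch only, not sklearn's exact implementation):
y_tr = np.where(y_train == 1, 1, -1)   # relabel to {-1, +1}
y_te = np.where(y_test == 1, 1, -1)

weights = np.full(len(X_train), 1 / len(X_train))
stumps, alphas = [], []

for _ in range(50):
    stump = DecisionTreeClassifier(max_depth=1).fit(X_train, y_tr, sample_weight=weights)
    pred = stump.predict(X_train)

    # Weighted error -> learner weight alpha
    err = weights[pred != y_tr].sum()
    alpha = 0.5 * np.log((1 - err) / (err + 1e-10))

    # Up-weight misclassified samples, then renormalize
    weights *= np.exp(-alpha * y_tr * pred)
    weights /= weights.sum()

    stumps.append(stump)
    alphas.append(alpha)

# Weighted vote of all weak learners
agg = sum(a * s.predict(X_test) for a, s in zip(alphas, stumps))
print(f"Manual AdaBoost accuracy: {(np.sign(agg) == y_te).mean():.3f}")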
🎯 Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier
# Gradient Boosting: Fit new models to residual errors
gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    subsample=0.8,
    random_state=42
)
gb.fit(X_train, y_train)
gb_score = gb.score(X_test, y_test)
print(f"Gradient Boosting: {gb_score:.3f}")
# Feature importance
gb_importances = gb.feature_importances_
print(f"\nTop feature importance: {gb_importances.max():.4f}")
⚡ XGBoost
# pip install xgboost
import xgboost as xgb
# XGBoost: Optimized gradient boosting
xgb_clf = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss'
    # note: the old use_label_encoder argument is deprecated/removed in recent XGBoost releases
)
xgb_clf.fit(X_train, y_train)
xgb_score = xgb_clf.score(X_test, y_test)
print(f"XGBoost: {xgb_score:.3f}")
# Early stopping on a held-out evaluation set
# (the test split is reused here for brevity; in practice hold out a separate validation set)
xgb_early = xgb.XGBClassifier(n_estimators=1000, learning_rate=0.1,
                              early_stopping_rounds=10,  # constructor argument requires XGBoost >= 1.6
                              random_state=42)
xgb_early.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
print(f"Best iteration: {xgb_early.best_iteration}")
print(f"Best score: {xgb_early.best_score:.3f}")
💡 LightGBM
# pip install lightgbm
import lightgbm as lgb
# LightGBM: Fast gradient boosting
lgb_clf = lgb.LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    num_leaves=31,
    random_state=42
)
lgb_clf.fit(X_train, y_train)
lgb_score = lgb_clf.score(X_test, y_test)
print(f"LightGBM: {lgb_score:.3f}")
🐱 CatBoost
# pip install catboost
from catboost import CatBoostClassifier
# CatBoost: Handles categorical features automatically
catboost_clf = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=5,
    random_state=42,
    verbose=False
)
catboost_clf.fit(X_train, y_train)
catboost_score = catboost_clf.score(X_test, y_test)
print(f"CatBoost: {catboost_score:.3f}")
🎭 Stacking
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
# Level 0: Base models
estimators = [
    ('rf', RandomForestClassifier(n_estimators=50, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=50, random_state=42)),
    ('svc', SVC(probability=True, random_state=42))
]
# Level 1: Meta-model
stacking = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=5
)
stacking.fit(X_train, y_train)
stacking_score = stacking.score(X_test, y_test)
print(f"Stacking: {stacking_score:.3f}")
How Stacking Works:
- Train multiple base models
- Use their out-of-fold predictions as features
- Train a meta-model on these predictions (see the sketch below)
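A rough sketch of what StackingClassifier does internally, using out-of-fold probabilities from two base models as meta-features (simplified; the real class also handles passthrough features and other details):
from sklearn.model_selection import cross_val_predict

base_models = [
    RandomForestClassifier(n_estimators=50, random_state=42),
    GradientBoostingClassifier(n_estimators=50, random_state=42),
]

# Out-of-fold class-1 probabilities become the meta-model's training features
meta_X = np.column_stack([
    cross_val_predict(m, X_train, y_train, cv=5, method='predict_proba')[:, 1]
    for m in base_models
])
meta_model = LogisticRegression().fit(meta_X, y_train)

# At prediction time, base models are refit on the full training set
test_X = np.column_stack([
    m.fit(X_train, y_train).predict_proba(X_test)[:, 1] for m in base_models
])
print(f"Manual stacking accuracy: {meta_model.score(test_X, y_test):.3f}")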
🗳️ Voting
from sklearn.ensemble import VotingClassifier
# Hard voting: Majority vote
voting_hard = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=50, random_state=42)),
        ('gb', GradientBoostingClassifier(n_estimators=50, random_state=42)),
        ('svc', SVC(random_state=42))
    ],
    voting='hard'
)
voting_hard.fit(X_train, y_train)
print(f"Hard Voting: {voting_hard.score(X_test, y_test):.3f}")
# Soft voting: Average probabilities
voting_soft = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=50, random_state=42)),
        ('gb', GradientBoostingClassifier(n_estimators=50, random_state=42)),
        ('svc', SVC(probability=True, random_state=42))  # probability=True is required for soft voting
    ],
    voting='soft'
)
voting_soft.fit(X_train, y_train)
print(f"Soft Voting: {voting_soft.score(X_test, y_test):.3f}")
📊 Comparison
import matplotlib.pyplot as plt
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'AdaBoost': AdaBoostClassifier(n_estimators=50, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBClassifier(n_estimators=100, random_state=42),
    'Voting': voting_soft
}
results = {}
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5)
    results[name] = scores.mean()
    print(f"{name}: {scores.mean():.3f} (+/- {scores.std():.3f})")
# Plot
plt.figure(figsize=(12, 6))
plt.bar(list(results.keys()), list(results.values()))
plt.ylabel('Accuracy')
plt.title('Ensemble Methods Comparison')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
💡 Best Practices
- Start with Random Forest: Great default choice
- Try XGBoost/LightGBM: Often best performance
- Tune hyperparameters: Use GridSearchCV (see the sketch after this list)
- Use early stopping: Prevent overfitting in boosting
- Lower learning rate: 0.01-0.1 for better generalization
- More trees with lower LR: Better than fewer trees with high LR
- Cross-validate: Ensemble methods can still overfit
- Diversity matters: Use different model types in ensemble
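As a concrete example of the hyperparameter-tuning bullet, here is a small GridSearchCV sketch over a few Random Forest settings (the grid values are illustrative, not recommendations):
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [None, 10, 20],
    'max_features': ['sqrt', 0.5],
}
grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
print(f"Best params: {grid.best_params_}")
print(f"Best CV accuracy: {grid.best_score_:.3f}")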
🎯 Key Takeaways
- Bagging reduces variance through averaging
- Boosting reduces bias through sequential learning, each model correcting its predecessor's errors
- Random Forest = bagging + random features
- XGBoost/LightGBM/CatBoost are optimized gradient boosting
- Stacking uses predictions as features for meta-model
- Voting combines models through majority vote or averaging
- Ensemble methods often win Kaggle competitions