What are Ensemble Methods?
Ensemble methods combine multiple models to create a stronger predictor. The idea: "wisdom of crowds" - many models together perform better than any single model.
Why Ensemble Methods Work:
- Reduce variance: averaging many high-variance models gives a more stable prediction (see the sketch below)
- Reduce bias: boosting combines many weak learners into a single strong learner
- Improve accuracy: ensembles regularly top ML competition leaderboards
- Robustness: less sensitive to noise and outliers than any single model
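To make the variance-reduction point concrete, here is a minimal NumPy sketch (purely illustrative, not part of the examples below): each simulated "model" predicts the true value plus independent noise, and averaging 50 of them shrinks the variance roughly 50-fold.
import numpy as np

rng = np.random.default_rng(42)
true_value = 0.0

# Each "model" is simulated as the true value plus independent noise (std = 1)
single_preds = rng.normal(true_value, 1.0, size=10_000)
ensemble_preds = rng.normal(true_value, 1.0, size=(10_000, 50)).mean(axis=1)

print(f"Variance of a single model:     {single_preds.var():.3f}")   # ~1.00
print(f"Variance of a 50-model average: {ensemble_preds.var():.3f}")  # ~1/50 = 0.02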
🎒 Bagging (Bootstrap Aggregating)
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np
# Generate data
X, y = make_classification(n_samples=1000, n_features=20,
                           n_informative=15, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Single decision tree (baseline)
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)
tree_score = tree.score(X_test, y_test)
print(f"Single Decision Tree: {tree_score:.3f}")
# Bagging: Train multiple trees on bootstrap samples
bagging = BaggingClassifier(
    estimator=DecisionTreeClassifier(),  # 'estimator' was 'base_estimator' before scikit-learn 1.2
    n_estimators=50,      # Number of base models
    max_samples=0.8,      # 80% of rows per bootstrap sample
    max_features=0.8,     # 80% of features per model
    bootstrap=True,       # Sample with replacement
    random_state=42,
    n_jobs=-1
)
bagging.fit(X_train, y_train)
bagging_score = bagging.score(X_test, y_test)
print(f"Bagging (50 trees): {bagging_score:.3f}")
print(f"Improvement: {bagging_score - tree_score:.3f}")
How Bagging Works:
- Create N bootstrap samples (random sampling with replacement)
- Train a model on each sample
- Average predictions (regression) or take a majority vote (classification) - see the from-scratch sketch below
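A rough from-scratch sketch of those three steps (simplified: no feature subsampling, binary 0/1 labels only, reusing X_train/X_test from above):
n_models = 25
rng = np.random.default_rng(42)
all_preds = []

for _ in range(n_models):
    # 1. Bootstrap sample: draw rows with replacement
    idx = rng.integers(0, len(X_train), size=len(X_train))
    # 2. Train a model on each sample
    model = DecisionTreeClassifier(random_state=0).fit(X_train[idx], y_train[idx])
    all_preds.append(model.predict(X_test))

# 3. Majority vote across the 25 trees (binary labels)
y_vote = (np.mean(all_preds, axis=0) > 0.5).astype(int)
print(f"Manual bagging accuracy: {(y_vote == y_test).mean():.3f}")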
🌲 Random Forest
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
# Random Forest = Bagging + Random Feature Selection
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',  # sqrt(n_features) considered per split
    bootstrap=True,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)
rf_score = rf.score(X_test, y_test)
print(f"Random Forest: {rf_score:.3f}")
# Feature importance
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
print("\nTop 5 Features:")
for i in range(5):
    print(f"{i+1}. Feature {indices[i]}: {importances[indices[i]]:.4f}")
# Out-of-bag score (no need for separate validation)
rf_oob = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=42)
rf_oob.fit(X_train, y_train)
print(f"\nOOB Score: {rf_oob.oob_score_:.3f}")
🚀 Boosting - AdaBoost
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost: Sequential learning, focus on misclassified samples
adaboost = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),  # Weak learners
    n_estimators=50,
    learning_rate=1.0,
    random_state=42
)
adaboost.fit(X_train, y_train)
ada_score = adaboost.score(X_test, y_test)
print(f"AdaBoost: {ada_score:.3f}")
# Estimator weights (higher = more important)
print(f"Estimator weights: {adaboost.estimator_weights_[:5]}")
How AdaBoost Works:
- Train a weak learner on the data
- Increase the weight of misclassified samples
- Train the next learner focusing on those hard examples
- Combine all learners with a weighted vote (see the simplified sketch below)
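A simplified from-scratch version of this loop (discrete AdaBoost with labels relabeled to ±1; a sketch only, not sklearn's exact implementation):
y_tr = np.where(y_train == 1, 1, -1)   # relabel to {-1, +1}
y_te = np.where(y_test == 1, 1, -1)

weights = np.full(len(X_train), 1 / len(X_train))
stumps, alphas = [], []

for _ in range(50):
    stump = DecisionTreeClassifier(max_depth=1).fit(X_train, y_tr, sample_weight=weights)
    pred = stump.predict(X_train)

    # Weighted error -> learner weight alpha
    err = weights[pred != y_tr].sum()
    alpha = 0.5 * np.log((1 - err) / (err + 1e-10))

    # Up-weight misclassified samples, then renormalize
    weights *= np.exp(-alpha * y_tr * pred)
    weights /= weights.sum()

    stumps.append(stump)
    alphas.append(alpha)

# Weighted vote of all weak learners
agg = sum(a * s.predict(X_test) for a, s in zip(alphas, stumps))
print(f"Manual AdaBoost accuracy: {(np.sign(agg) == y_te).mean():.3f}")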
🎯 Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier
# Gradient Boosting: Fit new models to residual errors
gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    subsample=0.8,
    random_state=42
)
gb.fit(X_train, y_train)
gb_score = gb.score(X_test, y_test)
print(f"Gradient Boosting: {gb_score:.3f}")
# Feature importance
gb_importances = gb.feature_importances_
print(f"\nTop feature importance: {gb_importances.max():.4f}")
⚡ XGBoost
# pip install xgboost
import xgboost as xgb
# XGBoost: Optimized gradient boosting
xgb_clf = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss'
    # note: the old use_label_encoder argument is deprecated/removed in recent XGBoost releases
)
xgb_clf.fit(X_train, y_train)
xgb_score = xgb_clf.score(X_test, y_test)
print(f"XGBoost: {xgb_score:.3f}")
# Early stopping on a held-out evaluation set
# (the test split is reused here for brevity; in practice hold out a separate validation set)
xgb_early = xgb.XGBClassifier(n_estimators=1000, learning_rate=0.1,
                              early_stopping_rounds=10,  # constructor argument requires XGBoost >= 1.6
                              random_state=42)
xgb_early.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
print(f"Best iteration: {xgb_early.best_iteration}")
print(f"Best score: {xgb_early.best_score:.3f}")
💡 LightGBM
# pip install lightgbm
import lightgbm as lgb
# LightGBM: Fast gradient boosting
lgb_clf = lgb.LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    num_leaves=31,
    random_state=42
)
lgb_clf.fit(X_train, y_train)
lgb_score = lgb_clf.score(X_test, y_test)
print(f"LightGBM: {lgb_score:.3f}")
🐱 CatBoost
# pip install catboost
from catboost import CatBoostClassifier
# CatBoost: Handles categorical features automatically
catboost_clf = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=5,
    random_state=42,
    verbose=False
)
catboost_clf.fit(X_train, y_train)
catboost_score = catboost_clf.score(X_test, y_test)
print(f"CatBoost: {catboost_score:.3f}")
🎭 Stacking
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
# Level 0: Base models
estimators = [
    ('rf', RandomForestClassifier(n_estimators=50, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=50, random_state=42)),
    ('svc', SVC(probability=True, random_state=42))
]
# Level 1: Meta-model
stacking = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=5
)
stacking.fit(X_train, y_train)
stacking_score = stacking.score(X_test, y_test)
print(f"Stacking: {stacking_score:.3f}")
How Stacking Works:
- Train multiple base models
- Use their out-of-fold predictions as features
- Train a meta-model on these predictions (see the sketch below)
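A rough sketch of what StackingClassifier does internally, using out-of-fold probabilities from two base models as meta-features (simplified; the real class also handles passthrough features and other details):
from sklearn.model_selection import cross_val_predict

base_models = [
    RandomForestClassifier(n_estimators=50, random_state=42),
    GradientBoostingClassifier(n_estimators=50, random_state=42),
]

# Out-of-fold class-1 probabilities become the meta-model's training features
meta_X = np.column_stack([
    cross_val_predict(m, X_train, y_train, cv=5, method='predict_proba')[:, 1]
    for m in base_models
])
meta_model = LogisticRegression().fit(meta_X, y_train)

# At prediction time, base models are refit on the full training set
test_X = np.column_stack([
    m.fit(X_train, y_train).predict_proba(X_test)[:, 1] for m in base_models
])
print(f"Manual stacking accuracy: {meta_model.score(test_X, y_test):.3f}")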
🗳️ Voting
from sklearn.ensemble import VotingClassifier
# Hard voting: Majority vote
voting_hard = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=50, random_state=42)),
        ('gb', GradientBoostingClassifier(n_estimators=50, random_state=42)),
        ('svc', SVC(random_state=42))
    ],
    voting='hard'
)
voting_hard.fit(X_train, y_train)
print(f"Hard Voting: {voting_hard.score(X_test, y_test):.3f}")
# Soft voting: Average probabilities
voting_soft = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=50, random_state=42)),
        ('gb', GradientBoostingClassifier(n_estimators=50, random_state=42)),
        ('svc', SVC(probability=True, random_state=42))  # probability=True is required for soft voting
    ],
    voting='soft'
)
voting_soft.fit(X_train, y_train)
print(f"Soft Voting: {voting_soft.score(X_test, y_test):.3f}")
📊 Comparison
import matplotlib.pyplot as plt
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'AdaBoost': AdaBoostClassifier(n_estimators=50, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBClassifier(n_estimators=100, random_state=42),
    'Voting': voting_soft
}
results = {}
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5)
    results[name] = scores.mean()
    print(f"{name}: {scores.mean():.3f} (+/- {scores.std():.3f})")
# Plot
plt.figure(figsize=(12, 6))
plt.bar(list(results.keys()), list(results.values()))
plt.ylabel('Accuracy')
plt.title('Ensemble Methods Comparison')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
💡 Best Practices
- Start with Random Forest: Great default choice
- Try XGBoost/LightGBM: Often best performance
- Tune hyperparameters: Use GridSearchCV (see the sketch after this list)
- Use early stopping: Prevent overfitting in boosting
- Lower learning rate: 0.01-0.1 for better generalization
- More trees with lower LR: Better than fewer trees with high LR
- Cross-validate: Ensemble methods can still overfit
- Diversity matters: Use different model types in ensemble
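As a concrete example of the hyperparameter-tuning bullet, here is a small GridSearchCV sketch over a few Random Forest settings (the grid values are illustrative, not recommendations):
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [None, 10, 20],
    'max_features': ['sqrt', 0.5],
}
grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
print(f"Best params: {grid.best_params_}")
print(f"Best CV accuracy: {grid.best_score_:.3f}")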
🎯 Key Takeaways
- Bagging reduces variance through averaging
- Boosting reduces bias through sequential learning, each model correcting its predecessor's errors
- Random Forest = bagging + random features
- XGBoost/LightGBM/CatBoost are optimized gradient boosting
- Stacking uses predictions as features for meta-model
- Voting combines models through majority vote or averaging
- Ensemble methods often win Kaggle competitions