What is AutoML?
AutoML automates the end-to-end process of applying machine learning: it selects models, tunes hyperparameters, and engineers features automatically.
AutoML Automates:
- Data preprocessing: Cleaning, encoding, scaling
- Feature engineering: Create and select features
- Model selection: Try many algorithms
- Hyperparameter tuning: Find optimal settings
- Ensemble creation: Combine multiple models
📦 Auto-sklearn
# pip install auto-sklearn
import autosklearn.classification
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Generate data
X, y = make_classification(n_samples=1000, n_features=20,
                           n_informative=15, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Auto-sklearn automatically:
# - Preprocesses data
# - Selects best algorithm
# - Tunes hyperparameters
# - Creates ensemble
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,  # 2 minutes total budget
    per_run_time_limit=30,        # 30 seconds per model
    n_jobs=-1
)
# Fit (this searches for best pipeline)
automl.fit(X_train, y_train)
# Predict
y_pred = automl.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.3f}")
# Show models tried
print(f"\nModels evaluated: {len(automl.cv_results_['mean_test_score'])}")
# Show best model
print("\nBest model:")
print(automl.show_models())
# Get statistics
print("\nSearch statistics:")
print(automl.sprint_statistics())
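A useful variant (a sketch based on auto-sklearn's resampling options; after a cross-validated search, refit() retrains the ensemble on the full training set before predicting):
# Evaluate candidates with 5-fold CV instead of a holdout split
automl_cv = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,
    per_run_time_limit=30,
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 5}
)
automl_cv.fit(X_train, y_train)
automl_cv.refit(X_train, y_train)  # retrain on all training data
print(f"CV-searched accuracy: {accuracy_score(y_test, automl_cv.predict(X_test)):.3f}")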
🔥 H2O AutoML
# pip install h2o
import h2o
from h2o.automl import H2OAutoML
# Initialize H2O
h2o.init()
# Convert to H2O frame
train = h2o.H2OFrame(X_train)
test = h2o.H2OFrame(X_test)
train['target'] = h2o.H2OFrame(y_train.reshape(-1, 1))
test['target'] = h2o.H2OFrame(y_test.reshape(-1, 1))
# For classification, H2O needs the target encoded as a factor (categorical)
train['target'] = train['target'].asfactor()
test['target'] = test['target'].asfactor()
# Run AutoML
aml = H2OAutoML(
    max_runtime_secs=120,  # 2 minutes
    max_models=20,
    seed=42
)
aml.train(
    y='target',
    training_frame=train
)
# Leaderboard
lb = aml.leaderboard
print("Leaderboard:")
print(lb.head())
# Best model
best_model = aml.leader
print(f"\nBest model: {best_model.model_id}")
# Predict
predictions = best_model.predict(test)
print(predictions.head())
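# Persist the leader before shutting the cluster down (h2o.save_model /
# h2o.load_model are H2O's standard persistence calls; the path here is
# just an example)
model_path = h2o.save_model(model=best_model, path='./h2o_models', force=True)
# loaded = h2o.load_model(model_path)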
# Shutdown
h2o.cluster().shutdown()
🎯 TPOT
# pip install tpot
from tpot import TPOTClassifier
# TPOT uses genetic programming to optimize pipelines
tpot = TPOTClassifier(
    generations=5,       # Number of iterations
    population_size=20,  # Number of individuals per generation
    cv=5,                # Cross-validation folds
    random_state=42,
    verbosity=2,
    n_jobs=-1
)
# Fit
tpot.fit(X_train, y_train)
# Evaluate
score = tpot.score(X_test, y_test)
print(f"Test accuracy: {score:.3f}")
# Export best pipeline as Python code
tpot.export('best_pipeline.py')
# You can now use the exported code directly!
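# The winning pipeline also stays available in-memory as tpot.fitted_pipeline_
# (a standard TPOT attribute), so you can inspect it without the export step:
print(tpot.fitted_pipeline_)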
Exported Pipeline Example:
# best_pipeline.py (auto-generated by TPOT)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Create pipeline
exported_pipeline = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(
        max_depth=10,
        min_samples_split=5,
        n_estimators=100,
        random_state=42
    )
)
exported_pipeline.fit(X_train, y_train)
results = exported_pipeline.predict(X_test)
🔮 PyCaret
# pip install pycaret
from pycaret.classification import *
import pandas as pd
# Create DataFrame
df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
df['target'] = y
# Setup (preprocessing + train/test split)
clf = setup(
    data=df,
    target='target',
    session_id=42,
    verbose=False  # note: the old silent=True flag was removed in PyCaret 3.x
)
# Compare models
best_models = compare_models(n_select=3)
# Tune best model
tuned_model = tune_model(best_models[0])
# Ensemble the tuned model (renamed so the variable doesn't shadow the
# ensemble_model function)
bagged_model = ensemble_model(tuned_model)
# Evaluate
evaluate_model(bagged_model)
# Predict
predictions = predict_model(bagged_model, data=df)
# Save model
save_model(bagged_model, 'final_model')
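# Reload the saved model later and score new data; load_model / predict_model
# are PyCaret's standard persistence helpers ('final_model' matches the name above)
loaded = load_model('final_model')
new_predictions = predict_model(loaded, data=df)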
🧠 AutoKeras (Neural Architecture Search)
# pip install autokeras
import autokeras as ak
# Automatically find best neural network architecture
clf = ak.StructuredDataClassifier(
    max_trials=10,  # Number of architectures to try
    overwrite=True
)
# Fit
clf.fit(X_train, y_train, epochs=10)
# Predict
y_pred = clf.predict(X_test)
# Export best model
best_model = clf.export_model()
best_model.summary()
# Save
best_model.save('autokeras_model.h5')
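# Reload later: AutoKeras models can contain custom layers, so pass
# ak.CUSTOM_OBJECTS; if saving to .h5 ever fails for an architecture,
# the TensorFlow SavedModel format (save_format='tf') is a common fallback
from tensorflow.keras.models import load_model
loaded = load_model('autokeras_model.h5', custom_objects=ak.CUSTOM_OBJECTS)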
📊 AutoML Tools Comparison
| Tool | Approach | Speed | Best For |
|---|---|---|---|
| Auto-sklearn | Bayesian optimization | Medium | Sklearn integration |
| H2O AutoML | Random grid search + stacked ensembles | Fast | Large datasets, production |
| TPOT | Genetic programming | Slow | Best pipeline, exportable |
| PyCaret | Low-code framework | Fast | Quick prototyping |
| AutoKeras | Neural architecture search | Very slow | Deep learning tasks |
🔧 Custom AutoML Pipeline
import time

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

class SimpleAutoML:
    def __init__(self, time_budget=60):
        self.time_budget = time_budget  # seconds; search stops once this elapses
        self.best_model = None
        self.best_score = 0

    def fit(self, X, y):
        start = time.time()
        # Define candidate models
        models = {
            'rf': RandomForestClassifier(),
            'lr': LogisticRegression(max_iter=1000),
            'svc': SVC()
        }
        # Define hyperparameter grids
        param_grids = {
            'rf': {
                'model__n_estimators': [50, 100],
                'model__max_depth': [5, 10, None]
            },
            'lr': {
                'model__C': [0.1, 1.0, 10.0]
            },
            'svc': {
                'model__C': [0.1, 1.0],
                'model__kernel': ['rbf', 'linear']
            }
        }
        # Try each model until the time budget is exhausted
        for name, model in models.items():
            if time.time() - start > self.time_budget:
                print("Time budget exhausted, stopping search.")
                break
            print(f"Trying {name}...")
            pipeline = Pipeline([
                ('scaler', StandardScaler()),
                ('model', model)
            ])
            grid_search = GridSearchCV(
                pipeline,
                param_grids[name],
                cv=5,
                n_jobs=-1
            )
            grid_search.fit(X, y)
            if grid_search.best_score_ > self.best_score:
                self.best_score = grid_search.best_score_
                self.best_model = grid_search.best_estimator_
                print(f"New best: {name} with score {self.best_score:.3f}")
        return self

    def predict(self, X):
        return self.best_model.predict(X)

    def score(self, X, y):
        return self.best_model.score(X, y)
# Usage
automl = SimpleAutoML()
automl.fit(X_train, y_train)
print(f"\nFinal score: {automl.score(X_test, y_test):.3f}")
⚖️ Pros and Cons
Pros:
- ✅ Fast development: Skip manual tuning
- ✅ Good baselines: Often beats manual approaches
- ✅ Accessible: Non-experts can build models
- ✅ Comprehensive search: Tries many options
- ✅ Best practices: Follows ML guidelines
Cons:
- ❌ Computational cost: Requires significant resources
- ❌ Black box: Less control over process
- ❌ Overfitting risk: May overfit to validation set
- ❌ Domain knowledge: Can't replace expertise
- ❌ Limited creativity: Won't find novel approaches
💡 Best Practices
- Start with AutoML: Get quick baseline
- Allocate enough time: More time = better results
- Validate results: Check if model makes sense
- Use for prototyping: Then refine manually
- Monitor resource usage: Can be expensive
- Understand limitations: Not magic solution
- Combine with expertise: Domain knowledge still valuable
- Export pipelines: Use TPOT to learn best practices
🎯 When to Use AutoML
Good Use Cases:
- 📊 Prototyping: Quick proof of concept
- 🎯 Baseline models: Starting point for optimization
- ⏰ Time constraints: Need results fast
- 🆕 New problem: Unsure which algorithm to try
- 📚 Learning: See what works well
Less Suitable:
- 🔬 Research: Need novel approaches
- ⚡ Real-time: Latency-critical applications
- 🎛️ Custom objectives: Non-standard metrics
- 💰 Production critical: Need full control
🎯 Key Takeaways
- AutoML automates model selection and tuning
- Auto-sklearn best for sklearn integration
- H2O AutoML fast and production-ready
- TPOT exports pipelines as Python code
- PyCaret low-code, easy prototyping
- AutoKeras for neural architecture search
- Use for baselines then refine manually