What is AutoML?
AutoML automates the end-to-end process of applying machine learning: it selects models, tunes hyperparameters, and engineers features automatically.
AutoML Automates:
- Data preprocessing: Cleaning, encoding, scaling
- Feature engineering: Create and select features
- Model selection: Try many algorithms
- Hyperparameter tuning: Find optimal settings
- Ensemble creation: Combine multiple models
📦 Auto-sklearn
# pip install auto-sklearn
import autosklearn.classification
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Generate data
X, y = make_classification(n_samples=1000, n_features=20,
                           n_informative=15, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Auto-sklearn automatically:
# - Preprocesses data
# - Selects best algorithm
# - Tunes hyperparameters
# - Creates ensemble
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,  # 2 minutes total budget
    per_run_time_limit=30,        # 30 seconds per model
    n_jobs=-1
)
# Fit (this searches for best pipeline)
automl.fit(X_train, y_train)
# Predict
y_pred = automl.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.3f}")
# Show models tried
print(f"\nModels evaluated: {len(automl.cv_results_['mean_test_score'])}")
# Show best model
print("\nBest model:")
print(automl.show_models())
# Get statistics
print("\nSearch statistics:")
print(automl.sprint_statistics())
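A useful variant (a sketch based on auto-sklearn's resampling options; after a cross-validated search, refit() retrains the ensemble on the full training set before predicting):
# Evaluate candidates with 5-fold CV instead of a holdout split
automl_cv = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,
    per_run_time_limit=30,
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 5}
)
automl_cv.fit(X_train, y_train)
automl_cv.refit(X_train, y_train)  # retrain on all training data
print(f"CV-searched accuracy: {accuracy_score(y_test, automl_cv.predict(X_test)):.3f}")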
🔥 H2O AutoML
# pip install h2o
import h2o
from h2o.automl import H2OAutoML
# Initialize H2O
h2o.init()
# Convert to H2O frame
train = h2o.H2OFrame(X_train)
test = h2o.H2OFrame(X_test)
train['target'] = h2o.H2OFrame(y_train.reshape(-1, 1))
test['target'] = h2o.H2OFrame(y_test.reshape(-1, 1))
# For classification, H2O needs the target encoded as a factor (categorical)
train['target'] = train['target'].asfactor()
test['target'] = test['target'].asfactor()
# Run AutoML
aml = H2OAutoML(
    max_runtime_secs=120,  # 2 minutes
    max_models=20,
    seed=42
)
aml.train(
    y='target',
    training_frame=train
)
# Leaderboard
lb = aml.leaderboard
print("Leaderboard:")
print(lb.head())
# Best model
best_model = aml.leader
print(f"\nBest model: {best_model.model_id}")
# Predict
predictions = best_model.predict(test)
print(predictions.head())
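# Persist the leader before shutting the cluster down (h2o.save_model /
# h2o.load_model are H2O's standard persistence calls; the path here is
# just an example)
model_path = h2o.save_model(model=best_model, path='./h2o_models', force=True)
# loaded = h2o.load_model(model_path)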
# Shutdown
h2o.cluster().shutdown()
🎯 TPOT
# pip install tpot
from tpot import TPOTClassifier
# TPOT uses genetic programming to optimize pipelines
tpot = TPOTClassifier(
    generations=5,       # Number of iterations
    population_size=20,  # Number of individuals per generation
    cv=5,                # Cross-validation folds
    random_state=42,
    verbosity=2,
    n_jobs=-1
)
# Fit
tpot.fit(X_train, y_train)
# Evaluate
score = tpot.score(X_test, y_test)
print(f"Test accuracy: {score:.3f}")
# Export best pipeline as Python code
tpot.export('best_pipeline.py')
# You can now use the exported code directly!
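# The winning pipeline also stays available in-memory as tpot.fitted_pipeline_
# (a standard TPOT attribute), so you can inspect it without the export step:
print(tpot.fitted_pipeline_)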
Exported Pipeline Example:
# best_pipeline.py (auto-generated by TPOT)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Create pipeline
exported_pipeline = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(
        max_depth=10,
        min_samples_split=5,
        n_estimators=100,
        random_state=42
    )
)
exported_pipeline.fit(X_train, y_train)
results = exported_pipeline.predict(X_test)
🔮 PyCaret
# pip install pycaret
from pycaret.classification import *
import pandas as pd
# Create DataFrame
df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
df['target'] = y
# Setup (preprocessing + train/test split)
clf = setup(
    data=df,
    target='target',
    session_id=42,
    verbose=False  # note: the old silent=True flag was removed in PyCaret 3.x
)
# Compare models
best_models = compare_models(n_select=3)
# Tune best model
tuned_model = tune_model(best_models[0])
# Ensemble the tuned model (renamed so the variable doesn't shadow the
# ensemble_model function)
bagged_model = ensemble_model(tuned_model)
# Evaluate
evaluate_model(bagged_model)
# Predict
predictions = predict_model(bagged_model, data=df)
# Save model
save_model(bagged_model, 'final_model')
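# Reload the saved model later and score new data; load_model / predict_model
# are PyCaret's standard persistence helpers ('final_model' matches the name above)
loaded = load_model('final_model')
new_predictions = predict_model(loaded, data=df)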
🧠 AutoKeras (Neural Architecture Search)
# pip install autokeras
import autokeras as ak
# Automatically find best neural network architecture
clf = ak.StructuredDataClassifier(
    max_trials=10,  # Number of architectures to try
    overwrite=True
)
# Fit
clf.fit(X_train, y_train, epochs=10)
# Predict
y_pred = clf.predict(X_test)
# Export best model
best_model = clf.export_model()
best_model.summary()
# Save
best_model.save('autokeras_model.h5')
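# Reload later: AutoKeras models can contain custom layers, so pass
# ak.CUSTOM_OBJECTS; if saving to .h5 ever fails for an architecture,
# the TensorFlow SavedModel format (save_format='tf') is a common fallback
from tensorflow.keras.models import load_model
loaded = load_model('autokeras_model.h5', custom_objects=ak.CUSTOM_OBJECTS)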
📊 AutoML Tools Comparison
| Tool | Approach | Speed | Best For |
|---|---|---|---|
| Auto-sklearn | Bayesian optimization | Medium | Sklearn integration |
| H2O AutoML | Random grid search + stacked ensembles | Fast | Large datasets, production |
| TPOT | Genetic programming | Slow | Best pipeline, exportable |
| PyCaret | Low-code framework | Fast | Quick prototyping |
| AutoKeras | Neural architecture search | Very slow | Deep learning tasks |
🔧 Custom AutoML Pipeline
import time

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

class SimpleAutoML:
    def __init__(self, time_budget=60):
        self.time_budget = time_budget  # seconds; search stops once this elapses
        self.best_model = None
        self.best_score = 0

    def fit(self, X, y):
        start = time.time()
        # Define candidate models
        models = {
            'rf': RandomForestClassifier(),
            'lr': LogisticRegression(max_iter=1000),
            'svc': SVC()
        }
        # Define hyperparameter grids
        param_grids = {
            'rf': {
                'model__n_estimators': [50, 100],
                'model__max_depth': [5, 10, None]
            },
            'lr': {
                'model__C': [0.1, 1.0, 10.0]
            },
            'svc': {
                'model__C': [0.1, 1.0],
                'model__kernel': ['rbf', 'linear']
            }
        }
        # Try each model until the time budget is exhausted
        for name, model in models.items():
            if time.time() - start > self.time_budget:
                print("Time budget exhausted, stopping search.")
                break
            print(f"Trying {name}...")
            pipeline = Pipeline([
                ('scaler', StandardScaler()),
                ('model', model)
            ])
            grid_search = GridSearchCV(
                pipeline,
                param_grids[name],
                cv=5,
                n_jobs=-1
            )
            grid_search.fit(X, y)
            if grid_search.best_score_ > self.best_score:
                self.best_score = grid_search.best_score_
                self.best_model = grid_search.best_estimator_
                print(f"New best: {name} with score {self.best_score:.3f}")
        return self

    def predict(self, X):
        return self.best_model.predict(X)

    def score(self, X, y):
        return self.best_model.score(X, y)
# Usage
automl = SimpleAutoML()
automl.fit(X_train, y_train)
print(f"\nFinal score: {automl.score(X_test, y_test):.3f}")
⚖️ Pros and Cons
Pros:
- ✅ Fast development: Skip manual tuning
- ✅ Good baselines: Often beats manual approaches
- ✅ Accessible: Non-experts can build models
- ✅ Comprehensive search: Tries many options
- ✅ Best practices: Follows ML guidelines
Cons:
- ❌ Computational cost: Requires significant resources
- ❌ Black box: Less control over process
- ❌ Overfitting risk: May overfit to validation set
- ❌ Domain knowledge: Can't replace expertise
- ❌ Limited creativity: Won't find novel approaches
💡 Best Practices
- Start with AutoML: Get quick baseline
- Allocate enough time: More time = better results
- Validate results: Check if model makes sense
- Use for prototyping: Then refine manually
- Monitor resource usage: Can be expensive
- Understand limitations: Not magic solution
- Combine with expertise: Domain knowledge still valuable
- Export pipelines: Use TPOT to learn best practices
🎯 When to Use AutoML
Good Use Cases:
- 📊 Prototyping: Quick proof of concept
- 🎯 Baseline models: Starting point for optimization
- ⏰ Time constraints: Need results fast
- 🆕 New problem: Unsure which algorithm to try
- 📚 Learning: See what works well
Less Suitable:
- 🔬 Research: Need novel approaches
- ⚡ Real-time: Latency-critical applications
- 🎛️ Custom objectives: Non-standard metrics
- 💰 Production critical: Need full control
🎯 Key Takeaways
- AutoML automates model selection and tuning
- Auto-sklearn best for sklearn integration
- H2O AutoML fast and production-ready
- TPOT exports pipelines as Python code
- PyCaret low-code, easy prototyping
- AutoKeras for neural architecture search
- Use for baselines then refine manually