What You'll Build
In this lesson, you'll train a complete machine learning model end to end. We'll predict California house prices from neighborhood features such as median income, house age, average number of rooms, and location. This is a classic regression problem that teaches all the fundamentals.
What You'll Learn:
- Load data: Import and explore a dataset
- Prepare data: Clean and preprocess
- Train model: Fit a model to data
- Evaluate: Measure performance
- Predict: Make predictions on new data
- Improve: Tune for better results
📊 Step 1: Load and Explore Data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
# Load the California Housing dataset
from sklearn.datasets import fetch_california_housing
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['Price'] = data.target
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
# Explore data
print("\nDataset statistics:")
print(df.describe())
# Check for missing values
print("\nMissing values:")
print(df.isnull().sum())
# Visualize target variable
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.hist(df['Price'], bins=50, edgecolor='black')
plt.xlabel('House Price ($100,000s)')
plt.ylabel('Frequency')
plt.title('Distribution of House Prices')
plt.subplot(1, 2, 2)
plt.scatter(df['MedInc'], df['Price'], alpha=0.3)
plt.xlabel('Median Income')
plt.ylabel('House Price')
plt.title('Price vs Income')
plt.tight_layout()
plt.show()
print("\n✓ Data loaded and explored!")
🔧 Step 2: Prepare Data
# Separate features (X) and target (y)
X = df.drop('Price', axis=1)
y = df['Price']
print("Features shape:", X.shape)
print("Target shape:", y.shape)
# Split data into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
print(f"\nTraining set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
# Scale features (linear models and many other algorithms benefit; tree-based models don't need it, but it does no harm)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("\n✓ Data prepared and split!")
# List the features used for prediction
feature_names = X.columns
print("\nFeatures we'll use for prediction:")
for i, feature in enumerate(feature_names, 1):
print(f" {i}. {feature}")
Why Split Data?
- Training set: Model learns patterns from this
- Test set: Evaluate how well model generalizes
- Never evaluate on the training data - it gives an overly optimistic picture of performance and hides overfitting
- An 80-20 split is common, but 70-30 or 90-10 also work; a three-way train/validation/test split (sketched below) helps once you start tuning hyperparameters
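If you also plan to tune hyperparameters, a separate validation set is often carved out as well. Here is a minimal sketch of a three-way split, using fresh variable names so the X_train/X_test created above stay untouched (the roughly 60/20/20 ratio is just an example):
# Three-way split: 20% test, then 25% of the remainder for validation (0.25 * 0.8 = 0.2)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)
X_tr, X_val, y_tr, y_val = train_test_split(X_tr, y_tr, test_size=0.25, random_state=42)
print(f"Train: {len(X_tr)}, Validation: {len(X_val)}, Test: {len(X_te)}")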
🤖 Step 3: Train the Model
# Try two different models
# Model 1: Linear Regression (simple, interpretable)
print("Training Linear Regression...")
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)
# Model 2: Random Forest (more powerful, handles non-linearity)
print("Training Random Forest...")
rf_model = RandomForestRegressor(
n_estimators=100, # Number of trees
max_depth=10, # Maximum tree depth
random_state=42,
n_jobs=-1 # Use all CPU cores
)
rf_model.fit(X_train_scaled, y_train)
print("\n✓ Models trained!")
# Display learned parameters (for Linear Regression)
print("\nLinear Regression Coefficients:")
for feature, coef in zip(feature_names, lr_model.coef_):
print(f" {feature:15s}: {coef:+.4f}")
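The Random Forest has no coefficients, but it exposes feature_importances_, which gives a rough sense of which features drive its predictions:
# Rank features by the Random Forest's impurity-based importances
print("\nRandom Forest Feature Importances:")
for feature, importance in sorted(
        zip(feature_names, rf_model.feature_importances_),
        key=lambda pair: pair[1], reverse=True):
    print(f"  {feature:15s}: {importance:.4f}")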
📊 Step 4: Evaluate Performance
# Make predictions on test set
lr_predictions = lr_model.predict(X_test_scaled)
rf_predictions = rf_model.predict(X_test_scaled)
# Calculate metrics
def evaluate_model(name, y_true, y_pred):
"""Calculate and display model performance metrics"""
# Mean Squared Error (lower is better)
mse = mean_squared_error(y_true, y_pred)
# Root Mean Squared Error (in same units as target)
rmse = np.sqrt(mse)
# R² Score (0 to 1, higher is better)
r2 = r2_score(y_true, y_pred)
# Mean Absolute Error
mae = np.mean(np.abs(y_true - y_pred))
print(f"\n{name} Performance:")
print(f" RMSE: ${rmse:.2f} (×$100k)")
print(f" MAE: ${mae:.2f} (×$100k)")
print(f" R²: {r2:.4f} ({r2*100:.2f}% variance explained)")
return rmse, mae, r2
# Evaluate both models
lr_rmse, lr_mae, lr_r2 = evaluate_model("Linear Regression", y_test, lr_predictions)
rf_rmse, rf_mae, rf_r2 = evaluate_model("Random Forest", y_test, rf_predictions)
# Compare models
print("\n" + "="*50)
print("MODEL COMPARISON:")
if rf_r2 > lr_r2:
improvement = (rf_r2 - lr_r2) / lr_r2 * 100
print(f"✓ Random Forest performs {improvement:.1f}% better!")
best_model = rf_model
best_predictions = rf_predictions
else:
print("✓ Linear Regression is sufficient for this dataset")
best_model = lr_model
best_predictions = lr_predictions
# Visualize predictions vs actual
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.scatter(y_test, lr_predictions, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title(f'Linear Regression (R² = {lr_r2:.3f})')
plt.subplot(1, 2, 2)
plt.scatter(y_test, rf_predictions, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title(f'Random Forest (R² = {rf_r2:.3f})')
plt.tight_layout()
plt.show()
Understanding Metrics:
- RMSE: Root Mean Squared Error - typical prediction error in the target's units; penalizes large errors heavily (lower is better)
- MAE: Mean Absolute Error - the average size of the errors, often easier to interpret
- R² Score: Proportion of the target's variance explained by the model (higher is better)
- Perfect score: R² = 1.0, RMSE = 0 (a tiny worked example of all three follows below)
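To make the definitions concrete, here is a tiny worked example on made-up numbers (purely illustrative, not from the housing data):
# Three actual vs. predicted prices (in $100k); every prediction is off by 0.5
y_true_toy = np.array([2.0, 3.0, 4.0])
y_pred_toy = np.array([2.5, 2.5, 4.5])
rmse_toy = np.sqrt(mean_squared_error(y_true_toy, y_pred_toy))  # sqrt(0.25) = 0.5
mae_toy = np.mean(np.abs(y_true_toy - y_pred_toy))              # 0.5
r2_toy = r2_score(y_true_toy, y_pred_toy)                       # 1 - 0.75/2.0 = 0.625
print(f"Toy RMSE: {rmse_toy:.3f}, MAE: {mae_toy:.3f}, R²: {r2_toy:.3f}")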
🔮 Step 5: Make Predictions
# Create new house data for prediction
new_houses = pd.DataFrame({
'MedInc': [3.5, 6.0, 2.0], # Median income
'HouseAge': [25, 10, 40], # House age
'AveRooms': [5.0, 7.0, 4.0], # Average rooms
'AveBedrms': [1.2, 1.5, 1.0], # Average bedrooms
'Population': [1500, 800, 2000], # Population
'AveOccup': [3.0, 2.5, 4.0], # Average occupancy
'Latitude': [37.5, 34.0, 38.0], # Latitude
'Longitude': [-122, -118, -121] # Longitude
})
print("New houses to predict:")
print(new_houses)
# Scale the new data (using same scaler from training)
new_houses_scaled = scaler.transform(new_houses)
# Make predictions
predictions = best_model.predict(new_houses_scaled)
# Display results
print("\nPredicted Prices:")
for i, (price, income) in enumerate(zip(predictions, new_houses['MedInc'])):
print(f" House {i+1} (Income: ${income:.1f}×$10k): ${price:.2f}×$100k = ${price*100:.0f}k")
🎯 Step 6: Improve the Model
# Try different hyperparameters
from sklearn.model_selection import GridSearchCV
print("Tuning Random Forest hyperparameters...")
# Define parameter grid to search
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15, None],
'min_samples_split': [2, 5, 10]
}
# Grid search with cross-validation
grid_search = GridSearchCV(
RandomForestRegressor(random_state=42),
param_grid,
cv=5, # 5-fold cross-validation
scoring='r2', # Optimize for R² score
n_jobs=-1,
verbose=1
)
grid_search.fit(X_train_scaled, y_train)
# Best parameters
print("\nBest parameters found:")
print(grid_search.best_params_)
# Evaluate tuned model
tuned_model = grid_search.best_estimator_
tuned_predictions = tuned_model.predict(X_test_scaled)
tuned_r2 = r2_score(y_test, tuned_predictions)
print(f"\nImproved R² Score: {tuned_r2:.4f}")
print(f"Original R² Score: {rf_r2:.4f}")
print(f"Improvement: {(tuned_r2 - rf_r2)*100:+.2f} percentage points of R²")
Ways to Improve Models:
- Get more data - More examples = better learning
- Feature engineering - Create new features from existing ones (see the sketch after this list)
- Try different algorithms - Some work better for specific problems
- Tune hyperparameters - Optimize model settings
- Ensemble methods - Combine multiple models
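As a concrete example of feature engineering, the California Housing columns can be combined into ratios that often carry more signal than the raw averages. A minimal sketch - the new column names are just illustrative choices:
# Derive ratio features from existing columns
X_fe = X.copy()
X_fe['RoomsPerPerson'] = X_fe['AveRooms'] / X_fe['AveOccup']
X_fe['BedroomRatio'] = X_fe['AveBedrms'] / X_fe['AveRooms']
# Split, scale, and retrain exactly as before, now with the extra columns
X_tr_fe, X_te_fe, y_tr_fe, y_te_fe = train_test_split(X_fe, y, test_size=0.2, random_state=42)
scaler_fe = StandardScaler()
rf_fe = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf_fe.fit(scaler_fe.fit_transform(X_tr_fe), y_tr_fe)
fe_r2 = r2_score(y_te_fe, rf_fe.predict(scaler_fe.transform(X_te_fe)))
print(f"R² with engineered features: {fe_r2:.4f}")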
💾 Step 7: Save the Model
import joblib
# Save model and scaler
joblib.dump(best_model, 'house_price_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
print("✓ Model saved to house_price_model.pkl")
print("✓ Scaler saved to scaler.pkl")
# Later, load and use the model
# loaded_model = joblib.load('house_price_model.pkl')
# loaded_scaler = joblib.load('scaler.pkl')
# predictions = loaded_model.predict(loaded_scaler.transform(new_data))
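An alternative worth considering: bundle the scaler and the model into a single scikit-learn Pipeline, so there is only one artifact to save and no way to forget the scaling step at prediction time (the filename below is just an example):
from sklearn.pipeline import make_pipeline
# One object that scales and predicts; fit it on the unscaled training data
pipeline = make_pipeline(StandardScaler(), RandomForestRegressor(n_estimators=100, random_state=42))
pipeline.fit(X_train, y_train)
joblib.dump(pipeline, 'house_price_pipeline.pkl')
# loaded_pipeline = joblib.load('house_price_pipeline.pkl')
# predictions = loaded_pipeline.predict(new_houses)  # scaling happens inside the pipeline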
📈 Complete Training Pipeline
# Complete end-to-end pipeline function
def train_ml_model(X, y, test_size=0.2):
"""
Complete ML training pipeline
Args:
X: Features
y: Target
test_size: Proportion of data for testing
Returns:
model, scaler, metrics
"""
# 1. Split data
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=test_size, random_state=42
)
# 2. Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# 3. Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)
# 4. Evaluate
predictions = model.predict(X_test_scaled)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
r2 = r2_score(y_test, predictions)
metrics = {
'rmse': rmse,
'r2': r2,
'test_size': len(y_test),
'train_size': len(y_train)
}
return model, scaler, metrics
# Use the pipeline
model, scaler, metrics = train_ml_model(X, y)
print("\nPipeline Results:")
print(f" Training samples: {metrics['train_size']}")
print(f" Test samples: {metrics['test_size']}")
print(f" RMSE: {metrics['rmse']:.4f}")
print(f" R² Score: {metrics['r2']:.4f}")
print("\n✓ Training complete!")
🎯 Key Takeaways
- Load & Explore: Understand your data first
- Prepare Data: Split into train/test, scale features
- Train Model: Use appropriate algorithm
- Evaluate: Test on unseen data
- Iterate: Try different approaches
- Deploy: Save model for production use
- Monitor: Track performance over time
🚀 Next Steps
- Try different datasets - Kaggle has thousands
- Experiment with algorithms - Try SVM, XGBoost, neural networks (a quick comparison sketch follows below)
- Learn feature engineering - Create better features
- Study deep learning - For more complex problems
- Build a portfolio - Showcase your projects
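For the algorithm experiments, cross_val_score makes it easy to compare several regressors on the same training data before committing to one. A minimal sketch using models that ship with scikit-learn (gradient boosting stands in for XGBoost here):
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge
# 5-fold cross-validated R² for a few candidate regressors
candidates = {
    'Ridge': Ridge(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
}
for name, estimator in candidates.items():
    scores = cross_val_score(estimator, X_train_scaled, y_train, cv=5, scoring='r2')
    print(f"  {name:18s}: R² = {scores.mean():.4f} ± {scores.std():.4f}")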