Why Evaluation Matters
Choosing the right metric is crucial! Accuracy alone can be misleading, especially with imbalanced datasets. Different metrics answer different questions about your model's performance.
Key Question: What's more important - catching all positives (recall) or being sure when you predict positive (precision)?
📊 Classification Metrics
Confusion Matrix
Foundation of all classification metrics:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
y_true = [0, 1, 0, 1, 0, 1, 1, 0]
y_pred = [0, 1, 0, 0, 0, 1, 1, 1]
cm = confusion_matrix(y_true, y_pred)
print(cm)
# [[3 1]   <- [TN FP]
#  [1 3]]  <- [FN TP]
# Visualize
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Negative', 'Positive'])
disp.plot()
plt.show()
Accuracy
from sklearn.metrics import accuracy_score
# Accuracy = (TP + TN) / (TP + TN + FP + FN)
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:.3f}") # 0.750
# Good for balanced datasets
# Misleading for imbalanced data!
Precision
from sklearn.metrics import precision_score
# Precision = TP / (TP + FP)
# "Of all positive predictions, how many were correct?"
precision = precision_score(y_true, y_pred)
print(f"Precision: {precision:.3f}") # 0.750
# Use when False Positives are costly
# Example: Spam detection (don't want to flag real emails as spam)
Recall (Sensitivity)
from sklearn.metrics import recall_score
# Recall = TP / (TP + FN)
# "Of all actual positives, how many did we catch?"
recall = recall_score(y_true, y_pred)
print(f"Recall: {recall:.3f}") # 0.750
# Use when False Negatives are costly
# Example: Cancer detection (must catch all cases)
F1 Score
from sklearn.metrics import f1_score
# F1 = 2 * (Precision * Recall) / (Precision + Recall)
# Harmonic mean of precision and recall
f1 = f1_score(y_true, y_pred)
print(f"F1 Score: {f1:.3f}") # 0.750
# Balanced metric - good when you need both precision and recall
# Use when data is imbalanced
Classification Report
from sklearn.metrics import classification_report
# Get all metrics at once
print(classification_report(y_true, y_pred, target_names=['Class 0', 'Class 1']))
#               precision    recall  f1-score   support
#      Class 0       0.75      0.75      0.75         4
#      Class 1       0.75      0.75      0.75         4
#     accuracy                           0.75         8
#    macro avg       0.75      0.75      0.75         8
# weighted avg       0.75      0.75      0.75         8
📈 ROC Curve & AUC
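The snippets in this section assume you already have a fitted binary classifier named model and a held-out split (X_test, y_test). A minimal setup sketch, using synthetic data and logistic regression purely for illustration:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Synthetic, mildly imbalanced binary data (illustrative only)
X, y = make_classification(n_samples=1000, n_features=20, weights=[0.7, 0.3], random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)
# Any classifier with predict_proba works here
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)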
from sklearn.metrics import roc_curve, roc_auc_score, RocCurveDisplay
# Get predicted probabilities
y_proba = model.predict_proba(X_test)[:, 1]
# Calculate ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
# Calculate AUC
auc = roc_auc_score(y_test, y_proba)
print(f"AUC: {auc:.3f}")
# Plot ROC curve
display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=auc)
display.plot()
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
plt.title('ROC Curve')
plt.show()
# AUC interpretation (common rule of thumb):
# 1.0 = Perfect classifier
# 0.9-1.0 = Excellent
# 0.8-0.9 = Good
# 0.7-0.8 = Fair
# 0.5 = Random guessing
# <0.5 = Worse than random
Precision-Recall Curve
from sklearn.metrics import precision_recall_curve, average_precision_score, PrecisionRecallDisplay
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
ap = average_precision_score(y_test, y_proba)
display = PrecisionRecallDisplay(precision=precision, recall=recall, average_precision=ap)
display.plot()
plt.title('Precision-Recall Curve')
plt.show()
# Better than ROC for imbalanced datasets
# Shows trade-off between precision and recall
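A practical use of this curve is choosing a decision threshold other than the default 0.5. The sketch below (the 0.80 precision target is just an illustrative choice) picks the lowest threshold that still meets the target:
import numpy as np
# precision_recall_curve returns one more precision/recall value than thresholds,
# so drop the last entry to align them with the threshold array
target_precision = 0.80
candidates = np.where(precision[:-1] >= target_precision)[0]
if len(candidates) > 0:
    threshold = thresholds[candidates[0]]
    print(f"Threshold {threshold:.3f} -> precision {precision[candidates[0]]:.3f}, recall {recall[candidates[0]]:.3f}")
    y_pred_custom = (y_proba >= threshold).astype(int)  # labels at the chosen threshold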
📉 Regression Metrics
Mean Absolute Error (MAE)
from sklearn.metrics import mean_absolute_error
# MAE = (1/n) * Σ|y_true - y_pred|
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae:.3f}")
# Easy to interpret - average error in original units
# Less sensitive to outliers than MSE
Mean Squared Error (MSE) & RMSE
from sklearn.metrics import mean_squared_error
import numpy as np
# MSE = (1/n) * Σ(y_true - y_pred)²
mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse:.3f}")
# RMSE = sqrt(MSE) - same units as target
rmse = np.sqrt(mse)
print(f"RMSE: {rmse:.3f}")
# Penalizes large errors more heavily
# Most commonly used for regression
R² Score (Coefficient of Determination)
from sklearn.metrics import r2_score
# R² = 1 - (SS_res / SS_tot)
r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2:.3f}")
# Rough interpretation (thresholds vary by domain):
# 1.0 = Perfect predictions
# 0.8-1.0 = Very good
# 0.6-0.8 = Good
# 0.4-0.6 = Moderate
# <0.4 = Poor
# <0 = Worse than mean baseline
# Indicates proportion of variance explained by model
Mean Absolute Percentage Error (MAPE)
from sklearn.metrics import mean_absolute_percentage_error
# MAPE = (1/n) * Σ|y_true - y_pred| / |y_true|
# Note: sklearn's mean_absolute_percentage_error returns a fraction (0.25 means 25%)
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"MAPE: {mape:.1%}")
# Relative (percentage) error - easy to communicate
# Asymmetric - over-predictions are penalized more than under-predictions,
#   since errors on the high side are unbounded while under-predictions cap at 100%
# Undefined (or unstable) when y_true is zero or near zero
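To sanity-check all four regression metrics end to end, here is a minimal sketch using synthetic data and a plain linear regression (both chosen only for illustration):
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
import numpy as np
X, y = make_regression(n_samples=500, n_features=10, noise=15.0, random_state=0)
y = y - y.min() + 1.0  # shift targets to be strictly positive so MAPE is well-defined
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
print(f"MAE:  {mean_absolute_error(y_test, y_pred):.3f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.3f}")
print(f"R²:   {r2_score(y_test, y_pred):.3f}")
print(f"MAPE: {mean_absolute_percentage_error(y_test, y_pred):.1%}")  # fraction, displayed as percent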
🎯 Multiclass Classification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# Multiclass predictions
y_true = [0, 1, 2, 0, 1, 2, 1, 2]
y_pred = [0, 2, 2, 0, 1, 1, 1, 2]
# Overall accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:.3f}")
# Per-class metrics
precision, recall, f1, support = precision_recall_fscore_support(y_true, y_pred)
for i in range(3):
    print(f"Class {i}:")
    print(f"  Precision: {precision[i]:.3f}")
    print(f"  Recall:    {recall[i]:.3f}")
    print(f"  F1:        {f1[i]:.3f}")
    print(f"  Support:   {support[i]}")
# Macro average (treat all classes equally)
from sklearn.metrics import f1_score
f1_macro = f1_score(y_true, y_pred, average='macro')
# Weighted average (weight by class frequency)
f1_weighted = f1_score(y_true, y_pred, average='weighted')
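One related fact worth noting: 'micro' averaging pools every individual decision across classes, and for single-label multiclass problems micro-F1 is identical to plain accuracy.
# Micro average (pools all samples; equals accuracy for single-label multiclass)
f1_micro = f1_score(y_true, y_pred, average='micro')
print(f"Micro F1: {f1_micro:.3f}, Macro F1: {f1_macro:.3f}, Weighted F1: {f1_weighted:.3f}")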
⚖️ Handling Imbalanced Data
# Example: 95% negative, 5% positive
# Accuracy can be misleading!
# Baseline: Always predict negative
# Accuracy = 95% (but useless!)
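# A quick demo of that failure mode (tiny synthetic labels, illustrative only):
import numpy as np
from sklearn.metrics import accuracy_score, recall_score
y_imb = np.array([0] * 95 + [1] * 5)        # 95% negative, 5% positive
y_always_neg = np.zeros_like(y_imb)         # baseline: always predict negative
print(accuracy_score(y_imb, y_always_neg))  # 0.95 - looks great
print(recall_score(y_imb, y_always_neg))    # 0.0 - catches zero positives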
# Better metrics for imbalanced data:
from sklearn.metrics import balanced_accuracy_score, matthews_corrcoef
# Balanced Accuracy (average of recall for each class)
balanced_acc = balanced_accuracy_score(y_test, y_pred)
print(f"Balanced Accuracy: {balanced_acc:.3f}")
# Matthews Correlation Coefficient (-1 to 1)
mcc = matthews_corrcoef(y_test, y_pred)
print(f"MCC: {mcc:.3f}")
# F1 Score (harmonic mean of precision and recall)
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.3f}")
# ROC-AUC (area under the ROC curve - needs predicted probabilities or scores, not hard labels)
auc = roc_auc_score(y_test, y_proba)
print(f"AUC: {auc:.3f}")
📊 Custom Metrics
from sklearn.metrics import make_scorer
# Define custom metric
def custom_metric(y_true, y_pred):
    # Example: F1-like score with a heavier penalty for false negatives
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    # Custom formula: false negatives count double, false positives count half
    score = tp / (tp + 2*fn + 0.5*fp)
    return score
# Create scorer
custom_scorer = make_scorer(custom_metric, greater_is_better=True)
# Use in cross-validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv=5, scoring=custom_scorer)
print(f"Custom metric: {scores.mean():.3f} ± {scores.std():.3f}")
🎯 Choosing the Right Metric
Classification:
- Accuracy: Balanced datasets where all errors are equally costly
- Precision: False positives are costly (spam filtering - don't flag legitimate email)
- Recall: False negatives are costly (fraud detection, cancer screening)
- F1 Score: Imbalanced data where you need a balance of precision and recall
- ROC-AUC: Threshold-independent model comparison
- PR-AUC: Highly imbalanced data
Regression:
- MAE: Easy to interpret, more robust to outliers than MSE/RMSE
- RMSE: Penalizes large errors, the most common default
- R²: Proportion of variance explained, useful for model comparison
- MAPE: Relative (percentage) error, scale-independent
Most of these map directly to scikit-learn's built-in scoring strings, as sketched below.
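A sketch of evaluating several metrics in one pass with cross-validation, assuming an estimator clf and data X, y (these names are placeholders, not defined above):
from sklearn.model_selection import cross_validate
# Built-in classification scoring strings; regression analogues include
# 'neg_mean_absolute_error', 'neg_root_mean_squared_error', 'r2', 'neg_mean_absolute_percentage_error'
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'average_precision']
results = cross_validate(clf, X, y, cv=5, scoring=scoring)
for name in scoring:
    print(f"{name}: {results[f'test_{name}'].mean():.3f}")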
📈 Visualizing Performance
import matplotlib.pyplot as plt
import seaborn as sns
# 1. Confusion Matrix Heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.title('Confusion Matrix')
plt.show()
# 2. Feature Importance (for models that expose feature_importances_, e.g. tree ensembles)
importances = model.feature_importances_
plt.barh(range(len(importances)), importances)
plt.title('Feature Importance')
plt.show()
# 3. Residuals Plot (Regression)
residuals = y_test - y_pred
plt.scatter(y_pred, residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()
# 4. Actual vs Predicted
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual vs Predicted')
plt.show()
🎯 Key Takeaways
- No single best metric - depends on your problem
- Accuracy misleads with imbalanced data
- Precision vs Recall: Trade-off based on cost of errors
- F1 Score: Balances precision and recall
- ROC-AUC: Threshold-independent, good for comparison
- Always visualize: Confusion matrix, ROC curve, residuals