Why Evaluation Matters
Choosing the right metric is crucial! Accuracy alone can be misleading, especially with imbalanced datasets. Different metrics answer different questions about your model's performance.
Key Question: What's more important - catching all positives (recall) or being sure when you predict positive (precision)?
📊 Classification Metrics
Confusion Matrix
Foundation of all classification metrics:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
y_true = [0, 1, 0, 1, 0, 1, 1, 0]
y_pred = [0, 1, 0, 0, 0, 1, 1, 1]
cm = confusion_matrix(y_true, y_pred)
print(cm)
# [[3 1]   <- [TN FP]
#  [1 3]]  <- [FN TP]
# Visualize
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Negative', 'Positive'])
disp.plot()
plt.show()
Accuracy
from sklearn.metrics import accuracy_score
# Accuracy = (TP + TN) / (TP + TN + FP + FN)
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:.3f}") # 0.750
# Good for balanced datasets
# Misleading for imbalanced data!
Precision
from sklearn.metrics import precision_score
# Precision = TP / (TP + FP)
# "Of all positive predictions, how many were correct?"
precision = precision_score(y_true, y_pred)
print(f"Precision: {precision:.3f}") # 0.750
# Use when False Positives are costly
# Example: Spam detection (don't want to flag real emails as spam)
Recall (Sensitivity)
from sklearn.metrics import recall_score
# Recall = TP / (TP + FN)
# "Of all actual positives, how many did we catch?"
recall = recall_score(y_true, y_pred)
print(f"Recall: {recall:.3f}") # 0.750
# Use when False Negatives are costly
# Example: Cancer detection (must catch all cases)
F1 Score
from sklearn.metrics import f1_score
# F1 = 2 * (Precision * Recall) / (Precision + Recall)
# Harmonic mean of precision and recall
f1 = f1_score(y_true, y_pred)
print(f"F1 Score: {f1:.3f}") # 0.750
# Balanced metric - good when you need both precision and recall
# Use when data is imbalanced
Classification Report
from sklearn.metrics import classification_report
# Get all metrics at once
print(classification_report(y_true, y_pred, target_names=['Class 0', 'Class 1']))
#               precision    recall  f1-score   support
#      Class 0       0.75      0.75      0.75         4
#      Class 1       0.75      0.75      0.75         4
#     accuracy                           0.75         8
#    macro avg       0.75      0.75      0.75         8
# weighted avg       0.75      0.75      0.75         8
📈 ROC Curve & AUC
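The snippets in this section assume you already have a fitted binary classifier named model and a held-out split (X_test, y_test). A minimal setup sketch, using synthetic data and logistic regression purely for illustration:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Synthetic, mildly imbalanced binary data (illustrative only)
X, y = make_classification(n_samples=1000, n_features=20, weights=[0.7, 0.3], random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)
# Any classifier with predict_proba works here
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)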
from sklearn.metrics import roc_curve, roc_auc_score, RocCurveDisplay
# Get predicted probabilities
y_proba = model.predict_proba(X_test)[:, 1]
# Calculate ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
# Calculate AUC
auc = roc_auc_score(y_test, y_proba)
print(f"AUC: {auc:.3f}")
# Plot ROC curve
display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=auc)
display.plot()
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
plt.title('ROC Curve')
plt.show()
# AUC interpretation (common rule of thumb):
# 1.0 = Perfect classifier
# 0.9-1.0 = Excellent
# 0.8-0.9 = Good
# 0.7-0.8 = Fair
# 0.5 = Random guessing
# <0.5 = Worse than random
Precision-Recall Curve
from sklearn.metrics import precision_recall_curve, average_precision_score, PrecisionRecallDisplay
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
ap = average_precision_score(y_test, y_proba)
display = PrecisionRecallDisplay(precision=precision, recall=recall, average_precision=ap)
display.plot()
plt.title('Precision-Recall Curve')
plt.show()
# Better than ROC for imbalanced datasets
# Shows trade-off between precision and recall
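A practical use of this curve is choosing a decision threshold other than the default 0.5. The sketch below (the 0.80 precision target is just an illustrative choice) picks the lowest threshold that still meets the target:
import numpy as np
# precision_recall_curve returns one more precision/recall value than thresholds,
# so drop the last entry to align them with the threshold array
target_precision = 0.80
candidates = np.where(precision[:-1] >= target_precision)[0]
if len(candidates) > 0:
    threshold = thresholds[candidates[0]]
    print(f"Threshold {threshold:.3f} -> precision {precision[candidates[0]]:.3f}, recall {recall[candidates[0]]:.3f}")
    y_pred_custom = (y_proba >= threshold).astype(int)  # labels at the chosen threshold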
📉 Regression Metrics
Mean Absolute Error (MAE)
from sklearn.metrics import mean_absolute_error
# MAE = (1/n) * Σ|y_true - y_pred|
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae:.3f}")
# Easy to interpret - average error in original units
# Less sensitive to outliers than MSE
Mean Squared Error (MSE) & RMSE
from sklearn.metrics import mean_squared_error
import numpy as np
# MSE = (1/n) * Σ(y_true - y_pred)²
mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse:.3f}")
# RMSE = sqrt(MSE) - same units as target
rmse = np.sqrt(mse)
print(f"RMSE: {rmse:.3f}")
# Penalizes large errors more heavily
# Most commonly used for regression
R² Score (Coefficient of Determination)
from sklearn.metrics import r2_score
# R² = 1 - (SS_res / SS_tot)
r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2:.3f}")
# Rough interpretation (thresholds vary by domain):
# 1.0 = Perfect predictions
# 0.8-1.0 = Very good
# 0.6-0.8 = Good
# 0.4-0.6 = Moderate
# <0.4 = Poor
# <0 = Worse than mean baseline
# Indicates proportion of variance explained by model
Mean Absolute Percentage Error (MAPE)
from sklearn.metrics import mean_absolute_percentage_error
# MAPE = (1/n) * Σ|y_true - y_pred| / |y_true|
# Note: sklearn's mean_absolute_percentage_error returns a fraction (0.25 means 25%)
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"MAPE: {mape:.1%}")
# Relative (percentage) error - easy to communicate
# Asymmetric - over-predictions are penalized more than under-predictions,
#   since errors on the high side are unbounded while under-predictions cap at 100%
# Undefined (or unstable) when y_true is zero or near zero
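To sanity-check all four regression metrics end to end, here is a minimal sketch using synthetic data and a plain linear regression (both chosen only for illustration):
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
import numpy as np
X, y = make_regression(n_samples=500, n_features=10, noise=15.0, random_state=0)
y = y - y.min() + 1.0  # shift targets to be strictly positive so MAPE is well-defined
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
print(f"MAE:  {mean_absolute_error(y_test, y_pred):.3f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.3f}")
print(f"R²:   {r2_score(y_test, y_pred):.3f}")
print(f"MAPE: {mean_absolute_percentage_error(y_test, y_pred):.1%}")  # fraction, displayed as percent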
🎯 Multiclass Classification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# Multiclass predictions
y_true = [0, 1, 2, 0, 1, 2, 1, 2]
y_pred = [0, 2, 2, 0, 1, 1, 1, 2]
# Overall accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:.3f}")
# Per-class metrics
precision, recall, f1, support = precision_recall_fscore_support(y_true, y_pred)
for i in range(3):
    print(f"Class {i}:")
    print(f"  Precision: {precision[i]:.3f}")
    print(f"  Recall:    {recall[i]:.3f}")
    print(f"  F1:        {f1[i]:.3f}")
    print(f"  Support:   {support[i]}")
# Macro average (treat all classes equally)
from sklearn.metrics import f1_score
f1_macro = f1_score(y_true, y_pred, average='macro')
# Weighted average (weight by class frequency)
f1_weighted = f1_score(y_true, y_pred, average='weighted')
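One related fact worth noting: 'micro' averaging pools every individual decision across classes, and for single-label multiclass problems micro-F1 is identical to plain accuracy.
# Micro average (pools all samples; equals accuracy for single-label multiclass)
f1_micro = f1_score(y_true, y_pred, average='micro')
print(f"Micro F1: {f1_micro:.3f}, Macro F1: {f1_macro:.3f}, Weighted F1: {f1_weighted:.3f}")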
⚖️ Handling Imbalanced Data
# Example: 95% negative, 5% positive
# Accuracy can be misleading!
# Baseline: Always predict negative
# Accuracy = 95% (but useless!)
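# A quick demo of that failure mode (tiny synthetic labels, illustrative only):
import numpy as np
from sklearn.metrics import accuracy_score, recall_score
y_imb = np.array([0] * 95 + [1] * 5)        # 95% negative, 5% positive
y_always_neg = np.zeros_like(y_imb)         # baseline: always predict negative
print(accuracy_score(y_imb, y_always_neg))  # 0.95 - looks great
print(recall_score(y_imb, y_always_neg))    # 0.0 - catches zero positives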
# Better metrics for imbalanced data:
from sklearn.metrics import balanced_accuracy_score, matthews_corrcoef
# Balanced Accuracy (average of recall for each class)
balanced_acc = balanced_accuracy_score(y_test, y_pred)
print(f"Balanced Accuracy: {balanced_acc:.3f}")
# Matthews Correlation Coefficient (-1 to 1)
mcc = matthews_corrcoef(y_test, y_pred)
print(f"MCC: {mcc:.3f}")
# F1 Score (harmonic mean of precision and recall)
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.3f}")
# ROC-AUC (area under the ROC curve - needs predicted probabilities or scores, not hard labels)
auc = roc_auc_score(y_test, y_proba)
print(f"AUC: {auc:.3f}")
📊 Custom Metrics
from sklearn.metrics import make_scorer
# Define custom metric
def custom_metric(y_true, y_pred):
    # Example: F1-like score with a heavier penalty for false negatives
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    # Custom formula: false negatives count double, false positives count half
    score = tp / (tp + 2*fn + 0.5*fp)
    return score
# Create scorer
custom_scorer = make_scorer(custom_metric, greater_is_better=True)
# Use in cross-validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv=5, scoring=custom_scorer)
print(f"Custom metric: {scores.mean():.3f} ± {scores.std():.3f}")
🎯 Choosing the Right Metric
Classification:
- Accuracy: Balanced datasets where all errors are equally costly
- Precision: False positives are costly (spam filtering - don't flag legitimate email)
- Recall: False negatives are costly (fraud detection, cancer screening)
- F1 Score: Imbalanced data where you need a balance of precision and recall
- ROC-AUC: Threshold-independent model comparison
- PR-AUC: Highly imbalanced data
Regression:
- MAE: Easy to interpret, more robust to outliers than MSE/RMSE
- RMSE: Penalizes large errors, the most common default
- R²: Proportion of variance explained, useful for model comparison
- MAPE: Relative (percentage) error, scale-independent
Most of these map directly to scikit-learn's built-in scoring strings, as sketched below.
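A sketch of evaluating several metrics in one pass with cross-validation, assuming an estimator clf and data X, y (these names are placeholders, not defined above):
from sklearn.model_selection import cross_validate
# Built-in classification scoring strings; regression analogues include
# 'neg_mean_absolute_error', 'neg_root_mean_squared_error', 'r2', 'neg_mean_absolute_percentage_error'
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'average_precision']
results = cross_validate(clf, X, y, cv=5, scoring=scoring)
for name in scoring:
    print(f"{name}: {results[f'test_{name}'].mean():.3f}")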
📈 Visualizing Performance
import matplotlib.pyplot as plt
import seaborn as sns
# 1. Confusion Matrix Heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.title('Confusion Matrix')
plt.show()
# 2. Feature Importance (for models that expose feature_importances_, e.g. tree ensembles)
importances = model.feature_importances_
plt.barh(range(len(importances)), importances)
plt.title('Feature Importance')
plt.show()
# 3. Residuals Plot (Regression)
residuals = y_test - y_pred
plt.scatter(y_pred, residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()
# 4. Actual vs Predicted
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual vs Predicted')
plt.show()
🎯 Key Takeaways
- No single best metric - depends on your problem
- Accuracy misleads with imbalanced data
- Precision vs Recall: Trade-off based on cost of errors
- F1 Score: Balances precision and recall
- ROC-AUC: Threshold-independent, good for comparison
- Always visualize: Confusion matrix, ROC curve, residuals