🔍 Model Interpretability

Understanding black-box models

Why Model Interpretability?

Model interpretability is about explaining how a model arrives at its predictions. It is critical for building trust, debugging, regulatory compliance, and improving models.

Benefits:

  • Trust: Users trust what they understand
  • Debugging: Find and fix model errors
  • Compliance: GDPR "right to explanation"
  • Fairness: Detect and mitigate bias
  • Improvement: Guide feature engineering

📊 Feature Importance

import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Train model
X, y = make_classification(n_samples=1000, n_features=10, 
                          n_informative=5, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)

# Get impurity-based feature importances computed from the fitted trees
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

# Plot
plt.figure(figsize=(10, 6))
plt.bar(range(len(importances)), importances[indices])
plt.xlabel('Feature Index')
plt.ylabel('Importance')
plt.title('Feature Importances')
plt.xticks(range(len(importances)), indices)
plt.show()

print("Feature ranking:")
for i, idx in enumerate(indices):
    print(f"{i+1}. Feature {idx}: {importances[idx]:.4f}")

🎯 Permutation Importance

from sklearn.inspection import permutation_importance

# Often more reliable than impurity-based feature_importances_
# Measures how much the score drops when a feature's values are randomly shuffled
perm_importance = permutation_importance(
    model, X, y, 
    n_repeats=10,
    random_state=42
)

# Plot
sorted_idx = perm_importance.importances_mean.argsort()[::-1]

plt.figure(figsize=(10, 6))
plt.barh(range(len(sorted_idx)), 
         perm_importance.importances_mean[sorted_idx])
plt.yticks(range(len(sorted_idx)), sorted_idx)
plt.xlabel('Permutation Importance')
plt.ylabel('Feature')
plt.title('Permutation Feature Importance')
plt.show()

print("\nPermutation Importance:")
for idx in sorted_idx:
    print(f"Feature {idx}: {perm_importance.importances_mean[idx]:.4f} "
          f"+/- {perm_importance.importances_std[idx]:.4f}")

📈 Partial Dependence Plots

from sklearn.inspection import PartialDependenceDisplay

# Shows how the average prediction changes as a feature (or pair of features) varies
features = [0, 1, (0, 1)]  # two individual features plus their interaction
fig, ax = plt.subplots(figsize=(15, 5))
PartialDependenceDisplay.from_estimator(
    model, X, features, ax=ax
)
plt.tight_layout()
plt.show()

💎 SHAP Values

# pip install shap
import shap

# Create explainer (TreeExplainer is exact and fast for tree ensembles)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

# Note: for binary classification, older shap versions return a list of two
# arrays (one per class, as indexed below); newer versions may return a
# single 3-D array, in which case use shap_values[:, :, 1] instead.

# Summary plot: mean |SHAP value| per feature, for the positive class
shap.summary_plot(shap_values[1], X, plot_type="bar")

# Beeswarm summary: per-sample SHAP values, colored by feature value
shap.summary_plot(shap_values[1], X)

# Individual prediction
idx = 0
shap.force_plot(
    explainer.expected_value[1],
    shap_values[1][idx],
    X[idx],
    matplotlib=True
)

# Waterfall plot for single prediction
shap.waterfall_plot(
    shap.Explanation(
        values=shap_values[1][idx],
        base_values=explainer.expected_value[1],
        data=X[idx]
    )
)

🔬 LIME

# pip install lime
from lime import lime_tabular

# Create explainer (the training data is used to sample perturbations around each instance)
explainer = lime_tabular.LimeTabularExplainer(
    X,
    mode='classification',
    feature_names=[f'feature_{i}' for i in range(X.shape[1])]
)

# Explain single prediction
idx = 0
exp = explainer.explain_instance(
    X[idx],
    model.predict_proba,
    num_features=10
)

# Show explanation (renders inline in a Jupyter notebook)
exp.show_in_notebook()

# Or as list
print("\nLIME Explanation:")
for feature, weight in exp.as_list():
    print(f"{feature}: {weight:.4f}")

🌳 Decision Tree Visualization

from sklearn.tree import DecisionTreeClassifier, plot_tree

# Train simple tree
tree_model = DecisionTreeClassifier(max_depth=3, random_state=42)
tree_model.fit(X, y)

# Visualize
plt.figure(figsize=(20, 10))
plot_tree(
    tree_model,
    feature_names=[f'F{i}' for i in range(X.shape[1])],
    class_names=['Class 0', 'Class 1'],
    filled=True,
    rounded=True,
    fontsize=10
)
plt.show()

# Export as text
from sklearn.tree import export_text
tree_rules = export_text(tree_model, 
                        feature_names=[f'F{i}' for i in range(X.shape[1])])
print(tree_rules)

📐 Linear Model Coefficients

from sklearn.linear_model import LogisticRegression

# Train linear model (standardize features if you want to compare coefficient magnitudes)
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X, y)

# Get coefficients
coefficients = lr_model.coef_[0]

# Plot
plt.figure(figsize=(10, 6))
plt.bar(range(len(coefficients)), coefficients)
plt.xlabel('Feature Index')
plt.ylabel('Coefficient')
plt.title('Logistic Regression Coefficients')
plt.axhline(y=0, color='r', linestyle='--')
plt.show()

print("Coefficients:")
for i, coef in enumerate(coefficients):
    print(f"Feature {i}: {coef:.4f}")

🔥 Integrated Gradients (Neural Networks)

import tensorflow as tf

def integrated_gradients(model, input_data, baseline=None, steps=50):
    """
    Approximate integrated gradients for a single input example.

    Assumes `input_data` is a 1-D tensor of features and that `model`
    produces a single (scalar) output per example; for multi-class models,
    select the target class's output before taking gradients.
    """
    if baseline is None:
        baseline = tf.zeros_like(input_data)

    # Interpolation coefficients from 0 (baseline) to 1 (input)
    alphas = tf.linspace(0.0, 1.0, steps + 1)

    # Interpolate between baseline and input: shape (steps + 1, n_features)
    interpolated = baseline + alphas[:, tf.newaxis] * (input_data - baseline)

    # Gradients of the output with respect to each interpolated point
    with tf.GradientTape() as tape:
        tape.watch(interpolated)
        predictions = model(interpolated)

    gradients = tape.gradient(predictions, interpolated)

    # Riemann approximation of the path integral: average the gradients...
    avg_gradients = tf.reduce_mean(gradients, axis=0)

    # ...then scale by the difference between input and baseline
    integrated_grads = (input_data - baseline) * avg_gradients

    return integrated_grads

# Example usage
# ig = integrated_gradients(neural_network_model, input_sample)
# plt.bar(range(len(ig)), ig)
# plt.show()
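
To make the helper above runnable end to end, here is a hedged sketch that trains a tiny Keras model on the same synthetic X, y from earlier (keras_model and sample are illustrative names, and the architecture is arbitrary, not part of the original example):

# Small illustrative network; settings chosen only to exercise the helper
keras_model = tf.keras.Sequential([
    tf.keras.Input(shape=(X.shape[1],)),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
keras_model.compile(optimizer='adam', loss='binary_crossentropy')
keras_model.fit(X.astype('float32'), y, epochs=5, verbose=0)

# Attribute one prediction back to the input features
sample = tf.constant(X[0], dtype=tf.float32)
ig = integrated_gradients(keras_model, sample)

plt.bar(range(len(ig)), ig.numpy())
plt.xlabel('Feature Index')
plt.ylabel('Attribution')
plt.title('Integrated Gradients for One Sample')
plt.show()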

📊 Interpretation Methods Comparison

Method                 | Scope  | Speed  | Best For
-----------------------|--------|--------|------------------------------
Feature Importance     | Global | Fast   | Tree models, quick overview
Permutation Importance | Global | Medium | Any model, reliable
Partial Dependence     | Global | Medium | Feature relationships
SHAP                   | Both   | Slow   | Detailed, theoretically sound
LIME                   | Local  | Medium | Individual predictions
Coefficients           | Global | Fast   | Linear models only

🎭 Model-Agnostic vs Model-Specific

Model-Agnostic (works with any model):

  • Permutation importance
  • Partial dependence plots
  • SHAP via the KernelExplainer (the TreeExplainer used above is tree-specific)
  • LIME
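
To illustrate model-agnosticism, here is a hedged sketch applying the same permutation_importance call from earlier to a support vector classifier, which exposes no feature_importances_ of its own (SVC, svm_model, and perm_svm are illustrative names, not from the original examples):

from sklearn.svm import SVC

# Fit a different model family on the same data
svm_model = SVC(random_state=42)
svm_model.fit(X, y)

# The same call works because permutation importance only needs
# predictions and a score, not access to model internals
perm_svm = permutation_importance(svm_model, X, y, n_repeats=10, random_state=42)
print("SVC feature ranking:", perm_svm.importances_mean.argsort()[::-1])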

Model-Specific:

  • Impurity-based feature importances (tree ensembles)
  • Coefficients (linear models)
  • Tree structure and exported rules (decision trees)
  • SHAP TreeExplainer (tree models)
  • Integrated gradients (differentiable/neural models)

💡 Best Practices

  • Use more than one method; agreement across methods builds confidence
  • Prefer held-out data for permutation importance and other score-based methods (see the sketch below)
  • Start with global explanations, then drill into individual predictions with SHAP or LIME
  • Watch out for correlated features: importance can be split or shifted between them
  • Treat explanations as hypotheses to verify, not as ground truth

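A minimal sketch of the held-out evaluation point, reusing the data and estimator class from earlier (X_train, X_test, model_ho, and perm_test are illustrative names):

from sklearn.model_selection import train_test_split

# Shuffling features on unseen data gives a less optimistic picture of importance
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
model_ho = RandomForestClassifier(n_estimators=100, random_state=42)
model_ho.fit(X_train, y_train)

perm_test = permutation_importance(
    model_ho, X_test, y_test, n_repeats=10, random_state=42
)
print("Held-out permutation importance:", perm_test.importances_mean.round(4))
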
⚖️ Fairness Analysis

# Check whether model behavior differs across protected groups
def analyze_fairness(model, X, y, protected_attr_idx):
    """
    Compare accuracy and positive prediction rate across the values of a
    protected attribute (assumed to be a discrete/categorical column of X).
    """
    protected_values = np.unique(X[:, protected_attr_idx])
    
    print("Fairness Analysis:")
    for value in protected_values:
        mask = X[:, protected_attr_idx] == value
        X_group = X[mask]
        y_group = y[mask]
        
        if len(y_group) == 0:
            continue
        
        accuracy = model.score(X_group, y_group)
        predictions = model.predict(X_group)
        positive_rate = (predictions == 1).mean()
        
        print(f"\nGroup {value}:")
        print(f"  Sample size: {len(y_group)}")
        print(f"  Accuracy: {accuracy:.3f}")
        print(f"  Positive rate: {positive_rate:.3f}")

# Usage
# analyze_fairness(model, X_test, y_test, protected_attr_idx=0)
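
The synthetic data above has no real protected attribute, so the following is a purely hypothetical demonstration: one feature is binarized to stand in for a two-group membership column (X_demo and demo_model are made-up names for this sketch):

# Hypothetical group column: binarize feature 0 to mimic a two-group attribute
X_demo = X.copy()
X_demo[:, 0] = (X_demo[:, 0] > 0).astype(int)

demo_model = RandomForestClassifier(n_estimators=100, random_state=42)
demo_model.fit(X_demo, y)

analyze_fairness(demo_model, X_demo, y, protected_attr_idx=0)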

🎯 Key Takeaways

  • Interpretability supports trust, debugging, compliance, fairness, and model improvement
  • Impurity-based importances are fast but can mislead; permutation importance is a more reliable global check
  • Partial dependence shows global feature effects; SHAP and LIME explain individual predictions
  • Simple models (shallow trees, linear models) can be read directly from their structure or coefficients
  • Audit both predictions and explanations across groups when fairness matters