🔍 Model Interpretability

Understanding black-box models

Why Model Interpretability?

Model interpretability is about explaining how a model arrives at its predictions. It is critical for building trust, debugging, regulatory compliance, and improving models.

Benefits:

  • Trust: Users trust what they understand
  • Debugging: Find and fix model errors
  • Compliance: GDPR "right to explanation"
  • Fairness: Detect and mitigate bias
  • Improvement: Guide feature engineering

📊 Feature Importance

import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Train model
X, y = make_classification(n_samples=1000, n_features=10, 
                          n_informative=5, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)

# Get impurity-based feature importances computed from the fitted trees
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

# Plot
plt.figure(figsize=(10, 6))
plt.bar(range(len(importances)), importances[indices])
plt.xlabel('Feature Index')
plt.ylabel('Importance')
plt.title('Feature Importances')
plt.xticks(range(len(importances)), indices)
plt.show()

print("Feature ranking:")
for i, idx in enumerate(indices):
    print(f"{i+1}. Feature {idx}: {importances[idx]:.4f}")

🎯 Permutation Importance

from sklearn.inspection import permutation_importance

# Often more reliable than impurity-based feature_importances_
# Measures how much the score drops when a feature's values are randomly shuffled
perm_importance = permutation_importance(
    model, X, y, 
    n_repeats=10,
    random_state=42
)

# Plot
sorted_idx = perm_importance.importances_mean.argsort()[::-1]

plt.figure(figsize=(10, 6))
plt.barh(range(len(sorted_idx)), 
         perm_importance.importances_mean[sorted_idx])
plt.yticks(range(len(sorted_idx)), sorted_idx)
plt.xlabel('Permutation Importance')
plt.ylabel('Feature')
plt.title('Permutation Feature Importance')
plt.show()

print("\nPermutation Importance:")
for idx in sorted_idx:
    print(f"Feature {idx}: {perm_importance.importances_mean[idx]:.4f} "
          f"+/- {perm_importance.importances_std[idx]:.4f}")

📈 Partial Dependence Plots

from sklearn.inspection import PartialDependenceDisplay

# Shows how the average prediction changes as a feature (or pair of features) varies
features = [0, 1, (0, 1)]  # two individual features plus their interaction
fig, ax = plt.subplots(figsize=(15, 5))
PartialDependenceDisplay.from_estimator(
    model, X, features, ax=ax
)
plt.tight_layout()
plt.show()

💎 SHAP Values

# pip install shap
import shap

# Create explainer (TreeExplainer is exact and fast for tree ensembles)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

# Note: for binary classification, older shap versions return a list of two
# arrays (one per class, as indexed below); newer versions may return a
# single 3-D array, in which case use shap_values[:, :, 1] instead.

# Summary plot: mean |SHAP value| per feature, for the positive class
shap.summary_plot(shap_values[1], X, plot_type="bar")

# Beeswarm summary: per-sample SHAP values, colored by feature value
shap.summary_plot(shap_values[1], X)

# Individual prediction
idx = 0
shap.force_plot(
    explainer.expected_value[1],
    shap_values[1][idx],
    X[idx],
    matplotlib=True
)

# Waterfall plot for single prediction
shap.waterfall_plot(
    shap.Explanation(
        values=shap_values[1][idx],
        base_values=explainer.expected_value[1],
        data=X[idx]
    )
)

🔬 LIME

# pip install lime
from lime import lime_tabular

# Create explainer (the training data is used to sample perturbations around each instance)
explainer = lime_tabular.LimeTabularExplainer(
    X,
    mode='classification',
    feature_names=[f'feature_{i}' for i in range(X.shape[1])]
)

# Explain single prediction
idx = 0
exp = explainer.explain_instance(
    X[idx],
    model.predict_proba,
    num_features=10
)

# Show explanation (renders inline in a Jupyter notebook)
exp.show_in_notebook()

# Or as list
print("\nLIME Explanation:")
for feature, weight in exp.as_list():
    print(f"{feature}: {weight:.4f}")

🌳 Decision Tree Visualization

from sklearn.tree import DecisionTreeClassifier, plot_tree

# Train simple tree
tree_model = DecisionTreeClassifier(max_depth=3, random_state=42)
tree_model.fit(X, y)

# Visualize
plt.figure(figsize=(20, 10))
plot_tree(
    tree_model,
    feature_names=[f'F{i}' for i in range(X.shape[1])],
    class_names=['Class 0', 'Class 1'],
    filled=True,
    rounded=True,
    fontsize=10
)
plt.show()

# Export as text
from sklearn.tree import export_text
tree_rules = export_text(tree_model, 
                        feature_names=[f'F{i}' for i in range(X.shape[1])])
print(tree_rules)

📐 Linear Model Coefficients

from sklearn.linear_model import LogisticRegression

# Train linear model (standardize features if you want to compare coefficient magnitudes)
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X, y)

# Get coefficients
coefficients = lr_model.coef_[0]

# Plot
plt.figure(figsize=(10, 6))
plt.bar(range(len(coefficients)), coefficients)
plt.xlabel('Feature Index')
plt.ylabel('Coefficient')
plt.title('Logistic Regression Coefficients')
plt.axhline(y=0, color='r', linestyle='--')
plt.show()

print("Coefficients:")
for i, coef in enumerate(coefficients):
    print(f"Feature {i}: {coef:.4f}")

🔥 Integrated Gradients (Neural Networks)

import tensorflow as tf

def integrated_gradients(model, input_data, baseline=None, steps=50):
    """
    Approximate integrated gradients for a single input example.

    Assumes `input_data` is a 1-D tensor of features and that `model`
    produces a single (scalar) output per example; for multi-class models,
    select the target class's output before taking gradients.
    """
    if baseline is None:
        baseline = tf.zeros_like(input_data)

    # Interpolation coefficients from 0 (baseline) to 1 (input)
    alphas = tf.linspace(0.0, 1.0, steps + 1)

    # Interpolate between baseline and input: shape (steps + 1, n_features)
    interpolated = baseline + alphas[:, tf.newaxis] * (input_data - baseline)

    # Gradients of the output with respect to each interpolated point
    with tf.GradientTape() as tape:
        tape.watch(interpolated)
        predictions = model(interpolated)

    gradients = tape.gradient(predictions, interpolated)

    # Riemann approximation of the path integral: average the gradients...
    avg_gradients = tf.reduce_mean(gradients, axis=0)

    # ...then scale by the difference between input and baseline
    integrated_grads = (input_data - baseline) * avg_gradients

    return integrated_grads

# Example usage
# ig = integrated_gradients(neural_network_model, input_sample)
# plt.bar(range(len(ig)), ig)
# plt.show()
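
To make the helper above runnable end to end, here is a hedged sketch that trains a tiny Keras model on the same synthetic X, y from earlier (keras_model and sample are illustrative names, and the architecture is arbitrary, not part of the original example):

# Small illustrative network; settings chosen only to exercise the helper
keras_model = tf.keras.Sequential([
    tf.keras.Input(shape=(X.shape[1],)),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
keras_model.compile(optimizer='adam', loss='binary_crossentropy')
keras_model.fit(X.astype('float32'), y, epochs=5, verbose=0)

# Attribute one prediction back to the input features
sample = tf.constant(X[0], dtype=tf.float32)
ig = integrated_gradients(keras_model, sample)

plt.bar(range(len(ig)), ig.numpy())
plt.xlabel('Feature Index')
plt.ylabel('Attribution')
plt.title('Integrated Gradients for One Sample')
plt.show()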

📊 Interpretation Methods Comparison

Method                 | Scope  | Speed  | Best For
-----------------------|--------|--------|------------------------------
Feature Importance     | Global | Fast   | Tree models, quick overview
Permutation Importance | Global | Medium | Any model, reliable
Partial Dependence     | Global | Medium | Feature relationships
SHAP                   | Both   | Slow   | Detailed, theoretically sound
LIME                   | Local  | Medium | Individual predictions
Coefficients           | Global | Fast   | Linear models only

🎭 Model-Agnostic vs Model-Specific

Model-Agnostic (works with any model):

  • Permutation importance
  • Partial dependence plots
  • SHAP via the KernelExplainer (the TreeExplainer used above is tree-specific)
  • LIME
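
To illustrate model-agnosticism, here is a hedged sketch applying the same permutation_importance call from earlier to a support vector classifier, which exposes no feature_importances_ of its own (SVC, svm_model, and perm_svm are illustrative names, not from the original examples):

from sklearn.svm import SVC

# Fit a different model family on the same data
svm_model = SVC(random_state=42)
svm_model.fit(X, y)

# The same call works because permutation importance only needs
# predictions and a score, not access to model internals
perm_svm = permutation_importance(svm_model, X, y, n_repeats=10, random_state=42)
print("SVC feature ranking:", perm_svm.importances_mean.argsort()[::-1])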

Model-Specific:

  • Impurity-based feature importances (tree ensembles)
  • Coefficients (linear models)
  • Tree structure and exported rules (decision trees)
  • SHAP TreeExplainer (tree models)
  • Integrated gradients (differentiable/neural models)

💡 Best Practices

  • Use more than one method; agreement across methods builds confidence
  • Prefer held-out data for permutation importance and other score-based methods (see the sketch below)
  • Start with global explanations, then drill into individual predictions with SHAP or LIME
  • Watch out for correlated features: importance can be split or shifted between them
  • Treat explanations as hypotheses to verify, not as ground truth

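A minimal sketch of the held-out evaluation point, reusing the data and estimator class from earlier (X_train, X_test, model_ho, and perm_test are illustrative names):

from sklearn.model_selection import train_test_split

# Shuffling features on unseen data gives a less optimistic picture of importance
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
model_ho = RandomForestClassifier(n_estimators=100, random_state=42)
model_ho.fit(X_train, y_train)

perm_test = permutation_importance(
    model_ho, X_test, y_test, n_repeats=10, random_state=42
)
print("Held-out permutation importance:", perm_test.importances_mean.round(4))
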
⚖️ Fairness Analysis

# Check whether model behavior differs across protected groups
def analyze_fairness(model, X, y, protected_attr_idx):
    """
    Compare accuracy and positive prediction rate across the values of a
    protected attribute (assumed to be a discrete/categorical column of X).
    """
    protected_values = np.unique(X[:, protected_attr_idx])
    
    print("Fairness Analysis:")
    for value in protected_values:
        mask = X[:, protected_attr_idx] == value
        X_group = X[mask]
        y_group = y[mask]
        
        if len(y_group) == 0:
            continue
        
        accuracy = model.score(X_group, y_group)
        predictions = model.predict(X_group)
        positive_rate = (predictions == 1).mean()
        
        print(f"\nGroup {value}:")
        print(f"  Sample size: {len(y_group)}")
        print(f"  Accuracy: {accuracy:.3f}")
        print(f"  Positive rate: {positive_rate:.3f}")

# Usage
# analyze_fairness(model, X_test, y_test, protected_attr_idx=0)
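
The synthetic data above has no real protected attribute, so the following is a purely hypothetical demonstration: one feature is binarized to stand in for a two-group membership column (X_demo and demo_model are made-up names for this sketch):

# Hypothetical group column: binarize feature 0 to mimic a two-group attribute
X_demo = X.copy()
X_demo[:, 0] = (X_demo[:, 0] > 0).astype(int)

demo_model = RandomForestClassifier(n_estimators=100, random_state=42)
demo_model.fit(X_demo, y)

analyze_fairness(demo_model, X_demo, y, protected_attr_idx=0)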

🎯 Key Takeaways

  • Interpretability supports trust, debugging, compliance, fairness, and model improvement
  • Impurity-based importances are fast but can mislead; permutation importance is a more reliable global check
  • Partial dependence shows global feature effects; SHAP and LIME explain individual predictions
  • Simple models (shallow trees, linear models) can be read directly from their structure or coefficients
  • Audit both predictions and explanations across groups when fairness matters