Why Model Interpretability?
Model interpretability is the ability to explain how a model arrives at its predictions. It is critical for building trust, debugging, meeting regulatory requirements, and improving models.
Benefits:
- Trust: Users trust what they understand
- Debugging: Find and fix model errors
- Compliance: GDPR "right to explanation"
- Fairness: Detect and mitigate bias
- Improvement: Guide feature engineering
📊 Feature Importance
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
# Train model
X, y = make_classification(n_samples=1000, n_features=10,
                           n_informative=5, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)
# Get feature importances
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
# Plot
plt.figure(figsize=(10, 6))
plt.bar(range(len(importances)), importances[indices])
plt.xlabel('Feature Index')
plt.ylabel('Importance')
plt.title('Feature Importances')
plt.xticks(range(len(importances)), indices)
plt.show()
print("Feature ranking:")
for i, idx in enumerate(indices):
    print(f"{i+1}. Feature {idx}: {importances[idx]:.4f}")
🎯 Permutation Importance
from sklearn.inspection import permutation_importance
# Less biased than impurity-based feature_importances_, which can favor
# high-cardinality features; ideally computed on held-out data
# Measures the drop in score when a feature's values are randomly shuffled
perm_importance = permutation_importance(
    model, X, y,
    n_repeats=10,
    random_state=42
)
# Plot
sorted_idx = perm_importance.importances_mean.argsort()[::-1]
plt.figure(figsize=(10, 6))
plt.barh(range(len(sorted_idx)),
         perm_importance.importances_mean[sorted_idx])
plt.yticks(range(len(sorted_idx)), sorted_idx)
plt.xlabel('Permutation Importance')
plt.ylabel('Feature')
plt.title('Permutation Feature Importance')
plt.show()
print("\nPermutation Importance:")
for idx in sorted_idx:
    print(f"Feature {idx}: {perm_importance.importances_mean[idx]:.4f} "
          f"+/- {perm_importance.importances_std[idx]:.4f}")
📈 Partial Dependence Plots
from sklearn.inspection import PartialDependenceDisplay
# Shows how the predicted outcome changes, on average, as a feature varies
features = [0, 1, (0, 1)] # Individual and interaction
fig, ax = plt.subplots(figsize=(15, 5))
PartialDependenceDisplay.from_estimator(
    model, X, features, ax=ax
)
plt.tight_layout()
plt.show()
💎 SHAP Values
# pip install shap
import shap
# Create explainer (TreeExplainer computes exact SHAP values for tree ensembles)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)
# Note: for binary classifiers, older SHAP versions return a list with one array
# per class (shap_values[1] = positive class); newer versions may return a single
# 3-D array instead, in which case use shap_values[:, :, 1]
# Summary plot (mean |SHAP value| per feature)
shap.summary_plot(shap_values[1], X, plot_type="bar")
# Detailed summary (per-sample SHAP values, colored by feature value)
shap.summary_plot(shap_values[1], X)
# Explain an individual prediction
idx = 0
shap.force_plot(
    explainer.expected_value[1],
    shap_values[1][idx],
    X[idx],
    matplotlib=True
)
# Waterfall plot for a single prediction
shap.waterfall_plot(
    shap.Explanation(
        values=shap_values[1][idx],
        base_values=explainer.expected_value[1],
        data=X[idx]
    )
)
🔬 LIME
# pip install lime
from lime import lime_tabular
# Create explainer
explainer = lime_tabular.LimeTabularExplainer(
    X,
    mode='classification',
    feature_names=[f'feature_{i}' for i in range(X.shape[1])]
)
# Explain single prediction
idx = 0
exp = explainer.explain_instance(
    X[idx],
    model.predict_proba,
    num_features=10
)
# Show explanation
exp.show_in_notebook()
# Or as list
print("\nLIME Explanation:")
for feature, weight in exp.as_list():
    print(f"{feature}: {weight:.4f}")
🌳 Decision Tree Visualization
from sklearn.tree import DecisionTreeClassifier, plot_tree
# Train simple tree
tree_model = DecisionTreeClassifier(max_depth=3, random_state=42)
tree_model.fit(X, y)
# Visualize
plt.figure(figsize=(20, 10))
plot_tree(
    tree_model,
    feature_names=[f'F{i}' for i in range(X.shape[1])],
    class_names=['Class 0', 'Class 1'],
    filled=True,
    rounded=True,
    fontsize=10
)
plt.show()
# Export as text
from sklearn.tree import export_text
tree_rules = export_text(tree_model,
                         feature_names=[f'F{i}' for i in range(X.shape[1])])
print(tree_rules)
📐 Linear Model Coefficients
from sklearn.linear_model import LogisticRegression
# Train linear model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X, y)
# Get coefficients
coefficients = lr_model.coef_[0]
# Plot
plt.figure(figsize=(10, 6))
plt.bar(range(len(coefficients)), coefficients)
plt.xlabel('Feature Index')
plt.ylabel('Coefficient')
plt.title('Logistic Regression Coefficients')
plt.axhline(y=0, color='r', linestyle='--')
plt.show()
print("Coefficients:")
for i, coef in enumerate(coefficients):
    print(f"Feature {i}: {coef:.4f}")
🔥 Integrated Gradients (Neural Networks)
import tensorflow as tf
def integrated_gradients(model, input_data, baseline=None, steps=50):
    """Approximate integrated gradients for a single input example."""
    # Cast to float32 so the arithmetic matches tf.linspace's dtype
    input_data = tf.cast(tf.convert_to_tensor(input_data), tf.float32)
    if baseline is None:
        baseline = tf.zeros_like(input_data)
    # Interpolation coefficients from the baseline (0) to the input (1)
    alphas = tf.linspace(0.0, 1.0, steps + 1)
    # Interpolate between baseline and input: shape (steps + 1, n_features)
    interpolated = baseline + alphas[:, tf.newaxis] * (input_data - baseline)
    # Gradients of the prediction w.r.t. each interpolated point
    # (for multi-class outputs, slice the target class column from `predictions`)
    with tf.GradientTape() as tape:
        tape.watch(interpolated)
        predictions = model(interpolated)
    gradients = tape.gradient(predictions, interpolated)
    # Average gradients along the path (Riemann approximation of the integral)
    avg_gradients = tf.reduce_mean(gradients, axis=0)
    # Scale by (input - baseline) to get per-feature attributions
    integrated_grads = (input_data - baseline) * avg_gradients
    return integrated_grads
# Example usage
# ig = integrated_gradients(neural_network_model, input_sample)
# plt.bar(range(len(ig)), ig)
# plt.show()
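As a runnable illustration, the sketch below trains a small Keras network on the same X and y from earlier and attributes one prediction with the function above; the architecture and training settings are arbitrary choices for demonstration.
# Arbitrary demo network (not prescribed by the method)
nn_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
nn_model.compile(optimizer='adam', loss='binary_crossentropy')
nn_model.fit(X, y, epochs=5, verbose=0)
# Attribute the first sample's prediction to its input features
ig = integrated_gradients(nn_model, X[0])
plt.bar(range(len(ig)), ig.numpy())
plt.xlabel('Feature Index')
plt.ylabel('Attribution')
plt.title('Integrated Gradients (single sample)')
plt.show()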
📊 Interpretation Methods Comparison
| Method | Scope | Speed | Best For |
|---|---|---|---|
| Feature Importance | Global | Fast | Tree models, quick overview |
| Permutation Importance | Global | Medium | Any model, reliable |
| Partial Dependence | Global | Medium | Feature relationships |
| SHAP | Both | Slow | Detailed, theoretically sound |
| LIME | Local | Medium | Individual predictions |
| Coefficients | Global | Fast | Linear models only |
🎭 Model-Agnostic vs Model-Specific
Model-Agnostic (works with any model; see the sketch after these lists):
- LIME
- SHAP
- Permutation Importance
- Partial Dependence Plots
Model-Specific:
- Tree feature importances (Trees only)
- Coefficients (Linear models only)
- Attention weights (attention-based models such as Transformers)
- Gradient-based methods (differentiable models, e.g., neural networks)
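To make the distinction concrete, here is a minimal sketch (assuming the same X, y, and fitted RandomForest `model` from above): permutation importance accepts any fitted estimator, while impurity-based importances only exist on tree models.
from sklearn.svm import SVC
svc = SVC(random_state=42).fit(X, y)
# Model-agnostic: permutation importance works for any fitted estimator
svc_perm = permutation_importance(svc, X, y, n_repeats=5, random_state=42)
print("SVC permutation importances:", svc_perm.importances_mean.round(3))
# Model-specific: impurity-based importances exist only on tree models
print(hasattr(svc, 'feature_importances_'))    # False
print(hasattr(model, 'feature_importances_'))  # True (RandomForest)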
💡 Best Practices
- Use multiple methods: each highlights different aspects (see the sketch after this list)
- Global + Local: Understand both overall and individual
- Validate insights: Do explanations make domain sense?
- Consider audience: Technical vs non-technical
- Visualize clearly: Good charts communicate better
- Document decisions: Why certain features matter
- Check for bias: Are protected features used unfairly?
- Iterative process: Use insights to improve model
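For the first point, a minimal sketch (reusing the `indices` and `sorted_idx` rankings computed earlier) that puts the impurity-based and permutation-based rankings side by side so disagreements stand out:
# Compare impurity-based and permutation-based rankings for each feature
print("Feature | Impurity rank | Permutation rank")
for f in range(X.shape[1]):
    impurity_rank = list(indices).index(f) + 1
    permutation_rank = list(sorted_idx).index(f) + 1
    print(f"{f:7d} | {impurity_rank:13d} | {permutation_rank:16d}")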
⚖️ Fairness Analysis
# Check if model is biased against protected groups
def analyze_fairness(model, X, y, protected_attr_idx):
    """
    Compare accuracy and positive prediction rate across the groups
    defined by a categorical protected attribute column.
    """
    protected_values = np.unique(X[:, protected_attr_idx])
    print("Fairness Analysis:")
    for value in protected_values:
        mask = X[:, protected_attr_idx] == value
        X_group = X[mask]
        y_group = y[mask]
        if len(y_group) == 0:
            continue
        accuracy = model.score(X_group, y_group)
        predictions = model.predict(X_group)
        positive_rate = (predictions == 1).mean()
        print(f"\nGroup {value}:")
        print(f"  Sample size: {len(y_group)}")
        print(f"  Accuracy: {accuracy:.3f}")
        print(f"  Positive rate: {positive_rate:.3f}")
# Usage
# analyze_fairness(model, X_test, y_test, protected_attr_idx=0)
🎯 Key Takeaways
- Feature importance shows most influential features
- SHAP provides detailed, theoretically sound explanations
- LIME explains individual predictions locally
- Partial dependence shows feature-prediction relationships
- Use multiple methods for complete understanding
- Check for bias and fairness issues
- Interpretability builds trust and enables debugging