What are Association Rules?
Association rules discover relationships between items in large transactional datasets. They power the famous "people who bought X also bought Y" recommendations.
Applications:
- Retail: Product recommendations, store layout
- E-commerce: "Customers also bought" suggestions
- Healthcare: Disease co-occurrence patterns
- Web mining: Page visit patterns
📊 Key Concepts
# Transaction example
transactions = [
    ['Milk', 'Bread', 'Butter'],
    ['Beer', 'Diapers'],
    ['Milk', 'Bread', 'Beer'],
    ['Milk', 'Bread', 'Butter', 'Beer'],
    ['Bread', 'Butter']
]
# Rule: {Milk, Bread} → {Butter}
# If customer buys Milk and Bread, they likely buy Butter
Metrics:
- Support: how often an itemset appears in the data
  - support({Milk, Bread}) = transactions containing both / total transactions
- Confidence: how often the rule holds when its antecedent is present
  - confidence({Milk} → {Bread}) = support({Milk, Bread}) / support({Milk})
- Lift: how much more likely the consequent is given the antecedent, relative to its baseline frequency
  - lift({Milk} → {Bread}) = confidence({Milk} → {Bread}) / support({Bread})
  - Lift > 1: positive correlation
  - Lift = 1: independent
  - Lift < 1: negative correlation
A worked computation on the five transactions above follows this list.
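To make these formulas concrete, here is a minimal sketch that computes all three metrics by hand for the rule {Milk} → {Bread} on the five transactions above (plain Python; the support helper is defined here for illustration, it is not a library function):

def support(itemset, transactions):
    # Fraction of transactions that contain every item in the itemset
    return sum(set(itemset) <= set(t) for t in transactions) / len(transactions)

sup_milk_bread = support(['Milk', 'Bread'], transactions)      # 3/5 = 0.6
confidence = sup_milk_bread / support(['Milk'], transactions)  # 0.6 / 0.6 = 1.0
lift = confidence / support(['Bread'], transactions)           # 1.0 / 0.8 = 1.25

print(f"support = {sup_milk_bread:.2f}, confidence = {confidence:.2f}, lift = {lift:.2f}")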
🔍 Apriori Algorithm
# pip install mlxtend
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
import pandas as pd
# Sample transactions
transactions = [
    ['Milk', 'Bread', 'Butter'],
    ['Beer', 'Diapers', 'Eggs'],
    ['Milk', 'Bread', 'Beer', 'Cola'],
    ['Milk', 'Bread', 'Butter', 'Beer'],
    ['Bread', 'Butter', 'Cola'],
    ['Milk', 'Bread', 'Butter', 'Cola'],
    ['Beer', 'Diapers'],
    ['Milk', 'Bread', 'Butter', 'Eggs'],
    ['Bread', 'Butter', 'Cola', 'Eggs'],
    ['Milk', 'Beer', 'Cola']
]
# Transform to one-hot encoded DataFrame
te = TransactionEncoder()
te_array = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_array, columns=te.columns_)
print("Transaction DataFrame:")
print(df.head())
# Find frequent itemsets
frequent_itemsets = apriori(df, min_support=0.3, use_colnames=True)
print(f"\nFrequent Itemsets (support >= 0.3):")
print(frequent_itemsets)
# Generate association rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)
print(f"\nAssociation Rules (confidence >= 0.6):")
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])
# Sort by lift
rules_sorted = rules.sort_values('lift', ascending=False)
print("\nTop rules by lift:")
print(rules_sorted.head())
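If you only need small itemsets (for example, pairs to power a "customers also bought" widget), apriori's max_len parameter caps the itemset size and shrinks the search space:

# Cap itemsets at two items: only singletons and pairs are mined
pairs_only = apriori(df, min_support=0.3, use_colnames=True, max_len=2)
print(pairs_only)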
📈 Interpreting Results
# Filter rules by metrics
high_confidence = rules[rules['confidence'] >= 0.7]
print(f"High confidence rules (≥0.7): {len(high_confidence)}")
high_lift = rules[rules['lift'] >= 1.2]
print(f"High lift rules (≥1.2): {len(high_lift)}")
# Combined criteria
strong_rules = rules[
    (rules['confidence'] >= 0.6) &
    (rules['lift'] >= 1.2) &
    (rules['support'] >= 0.2)
]
print("\nStrong Rules:")
for idx, rule in strong_rules.iterrows():
    antecedents = ', '.join(list(rule['antecedents']))
    consequents = ', '.join(list(rule['consequents']))
    print(f"{antecedents} → {consequents}")
    print(f"  Support: {rule['support']:.3f}")
    print(f"  Confidence: {rule['confidence']:.3f}")
    print(f"  Lift: {rule['lift']:.3f}\n")
🛒 Market Basket Analysis Example
import numpy as np
# Larger dataset
np.random.seed(42)
products = ['Milk', 'Bread', 'Butter', 'Beer', 'Diapers',
            'Eggs', 'Cola', 'Chips', 'Cheese', 'Coffee']
# Generate 100 random transactions
transactions = []
for _ in range(100):
    n_items = np.random.randint(2, 6)
    transaction = list(np.random.choice(products, n_items, replace=False))
    transactions.append(transaction)
# Inject some deliberate patterns
for _ in range(20):
    transactions.append(['Milk', 'Bread', 'Butter'])  # Common combo
for _ in range(15):
    transactions.append(['Beer', 'Diapers'])  # The famous example
for _ in range(10):
    transactions.append(['Coffee', 'Milk'])
# Encode and analyze
te = TransactionEncoder()
te_array = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_array, columns=te.columns_)
# Frequent itemsets
frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
# Association rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)
# Top recommendations
print("Top 10 Product Recommendations:")
top_rules = rules.sort_values(['lift', 'confidence'], ascending=False).head(10)
for idx, rule in top_rules.iterrows():
    ant = ', '.join(list(rule['antecedents']))
    cons = ', '.join(list(rule['consequents']))
    print(f"\nIf customer buys: {ant}")
    print(f"Recommend: {cons}")
    print(f"Lift: {rule['lift']:.2f}x more likely")
    print(f"Confidence: {rule['confidence']:.1%}")
📊 Visualizing Rules
import matplotlib.pyplot as plt
import seaborn as sns
# Scatter plot: Support vs Confidence (color by Lift)
plt.figure(figsize=(12, 6))
scatter = plt.scatter(
    rules['support'],
    rules['confidence'],
    c=rules['lift'],
    s=rules['lift'] * 50,
    cmap='viridis',
    alpha=0.6
)
plt.colorbar(scatter, label='Lift')
plt.xlabel('Support')
plt.ylabel('Confidence')
plt.title('Association Rules: Support vs Confidence (sized/colored by Lift)')
plt.grid(True, alpha=0.3)
plt.show()
# Heatmap of top rules
top_10 = rules.nlargest(10, 'lift')
# Label each column with the actual rule text
rule_names = [
    f"{', '.join(list(r['antecedents']))} → {', '.join(list(r['consequents']))}"
    for _, r in top_10.iterrows()
]
metrics_df = top_10[['support', 'confidence', 'lift']].T
metrics_df.columns = rule_names
plt.figure(figsize=(14, 5))
sns.heatmap(metrics_df, annot=True, fmt='.2f', cmap='YlOrRd')
plt.title('Top 10 Association Rules Metrics')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
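Rules can also be drawn as a directed graph, which is often easier to scan than a table for 1→1 rules. A minimal sketch assuming networkx is installed (an extra dependency not used elsewhere in this tutorial); edge width encodes lift:

import networkx as nx

G = nx.DiGraph()
# One edge per single-antecedent, single-consequent rule
for _, r in rules.iterrows():
    if len(r['antecedents']) == 1 and len(r['consequents']) == 1:
        G.add_edge(next(iter(r['antecedents'])),
                   next(iter(r['consequents'])),
                   lift=r['lift'])

pos = nx.spring_layout(G, seed=42)
widths = [G[u][v]['lift'] for u, v in G.edges()]
plt.figure(figsize=(10, 7))
nx.draw_networkx(G, pos, node_color='lightblue', node_size=1500, width=widths)
plt.title('Association Rules as a Directed Graph (edge width = lift)')
plt.axis('off')
plt.show()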
🎯 FP-Growth Algorithm
# FP-Growth: Faster than Apriori for large datasets
from mlxtend.frequent_patterns import fpgrowth
# Use FP-Growth instead of Apriori
frequent_itemsets_fp = fpgrowth(df, min_support=0.1, use_colnames=True)
print(f"Apriori found: {len(frequent_itemsets)} itemsets")
print(f"FP-Growth found: {len(frequent_itemsets_fp)} itemsets")
# Generate rules (same as before)
rules_fp = association_rules(frequent_itemsets_fp, metric="confidence", min_threshold=0.6)
print(f"\nRules generated: {len(rules_fp)}")
# FP-Growth advantages:
# - Only 2 database scans (Apriori: multiple)
# - More efficient for large datasets
# - Same results as Apriori
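To measure the speed difference on your own data, a quick timing sketch (on toy data like ours the gap is negligible; it grows with dataset size):

import time

start = time.perf_counter()
apriori(df, min_support=0.1, use_colnames=True)
print(f"Apriori:   {time.perf_counter() - start:.4f}s")

start = time.perf_counter()
fpgrowth(df, min_support=0.1, use_colnames=True)
print(f"FP-Growth: {time.perf_counter() - start:.4f}s")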
🔍 Advanced Filtering
# Find rules for specific items
def find_rules_for_item(rules, item):
    # Rules where the item appears in the antecedent
    as_antecedent = rules[
        rules['antecedents'].apply(lambda x: item in x)
    ]
    # Rules where the item appears in the consequent
    as_consequent = rules[
        rules['consequents'].apply(lambda x: item in x)
    ]
    return as_antecedent, as_consequent
# Example: What to recommend if customer buys Milk?
ant, cons = find_rules_for_item(rules, 'Milk')
print("If customer buys Milk, recommend:")
for _, rule in ant.sort_values('lift', ascending=False).head(5).iterrows():
    consequents = ', '.join(list(rule['consequents']))
    print(f"  {consequents} (lift: {rule['lift']:.2f})")
# Filter by length
def filter_by_length(rules, antecedent_len=None, consequent_len=None):
    filtered = rules.copy()
    if antecedent_len is not None:
        filtered = filtered[
            filtered['antecedents'].apply(len) == antecedent_len
        ]
    if consequent_len is not None:
        filtered = filtered[
            filtered['consequents'].apply(len) == consequent_len
        ]
    return filtered
# Get rules with single item → single item
simple_rules = filter_by_length(rules, antecedent_len=1, consequent_len=1)
print(f"\nSimple 1→1 rules: {len(simple_rules)}")
💡 Best Practices
- Set appropriate thresholds:
  - Support: 0.01-0.05 for large datasets, 0.1-0.3 for small ones
  - Confidence: usually 0.5-0.8
  - Lift: > 1.0 for positive associations
- Start conservative: begin with high thresholds, then lower them if needed
- Use FP-Growth: faster on large datasets
- Filter by lift: it guards against spurious correlations
- Consider business context: not every statistically strong rule is useful
- Remove trivial rules: "everyone buys milk" rules add nothing (see the filtering sketch after this list)
- Focus on actionable insights: can you actually change the store layout or the recommendations?
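One way to drop trivial rules programmatically: if the consequent is in most baskets anyway, the rule tells you nothing actionable. A minimal sketch using the 'consequent support' column that mlxtend's association_rules returns (the 0.5 cap is an arbitrary choice; tune it to your data):

# Keep only rules whose consequent is not already near-universal
MAX_CONSEQUENT_SUPPORT = 0.5  # arbitrary cap, tune per dataset

non_trivial = rules[rules['consequent support'] < MAX_CONSEQUENT_SUPPORT]
print(f"Rules before: {len(rules)}, after removing trivial consequents: {len(non_trivial)}")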
⚠️ Common Mistakes
- Low support threshold: too many rules, most of them meaningless
- Ignoring lift: high confidence does not imply correlation
  # Example: Milk has 80% support
  # Rule: Bread → Milk, confidence = 0.8, lift = 1.0
  # Milk is just popular, not related to Bread!
  # Always check lift > 1.0
- Not removing redundant rules: {A} → {B} vs {A, C} → {B} (see the pruning sketch after this list)
- Overfitting to the data: mined patterns may not generalize
- Ignoring temporal patterns: seasonality matters
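A rule {A, C} → {B} is redundant when the simpler rule {A} → {B} already reaches at least the same confidence. A minimal pruning sketch (quadratic in the number of rules, fine for small rule sets; the helper logic is ours, not an mlxtend API):

def prune_redundant(rules):
    # Drop a rule when a strict subset of its antecedent already predicts
    # the same consequent with equal or higher confidence
    keep = []
    for _, r in rules.iterrows():
        dominated = (
            rules['consequents'].apply(lambda c: c == r['consequents'])
            & rules['antecedents'].apply(lambda a: a < r['antecedents'])  # strict subset
            & (rules['confidence'] >= r['confidence'])
        ).any()
        keep.append(not dominated)
    return rules[keep]

pruned = prune_redundant(rules)
print(f"Rules before pruning: {len(rules)}, after: {len(pruned)}")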
🎯 Real-World Example
# Complete workflow
def market_basket_analysis(transactions, min_support=0.05,
                           min_confidence=0.5, min_lift=1.0):
    # Encode transactions as a one-hot DataFrame
    te = TransactionEncoder()
    te_array = te.fit(transactions).transform(transactions)
    df = pd.DataFrame(te_array, columns=te.columns_)
    # Mine frequent itemsets
    frequent_itemsets = fpgrowth(df, min_support=min_support, use_colnames=True)
    if len(frequent_itemsets) == 0:
        print("No frequent itemsets found. Lower min_support.")
        return None
    # Generate rules
    rules = association_rules(
        frequent_itemsets,
        metric="confidence",
        min_threshold=min_confidence
    )
    # Filter by lift, then sort strongest first
    rules = rules[rules['lift'] >= min_lift]
    rules = rules.sort_values(['lift', 'confidence'], ascending=False)
    return rules
# Run analysis
rules = market_basket_analysis(transactions, min_support=0.1)
if rules is not None:
    print(f"Found {len(rules)} strong association rules")
    print("\nTop 5 recommendations:")
    for idx, rule in rules.head(5).iterrows():
        ant = ', '.join(list(rule['antecedents']))
        cons = ', '.join(list(rule['consequents']))
        print(f"{ant} → {cons} (lift: {rule['lift']:.2f})")
🎯 Key Takeaways
- Association rules find patterns in transaction data
- Support = frequency, Confidence = reliability, Lift = correlation
- Apriori is the classic algorithm; FP-Growth is the faster alternative
- Lift > 1.0 indicates positive association
- Use mlxtend library for easy implementation
- Start with high thresholds, then lower them if needed
- Filter by business value, not just metrics