What are Recommender Systems?
Recommender systems predict user preferences and suggest relevant items. They power Netflix, Amazon, Spotify, and many other platforms.
Types:
- Content-based: Recommend items similar to those a user already liked, based on item features
- Collaborative filtering: Use the behavior of similar users (or similar items)
- Hybrid: Combine multiple approaches (a blending sketch appears after the content-based section below)
- Deep learning: Neural networks that capture complex, non-linear patterns
📊 Sample Data
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# User-item ratings matrix
ratings_dict = {
    'user_id': [1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5],
    'movie_id': [1, 2, 3, 1, 3, 2, 3, 4, 1, 4, 2, 5],
    'rating': [5, 4, 3, 4, 5, 5, 4, 4, 3, 5, 4, 5]
}
ratings = pd.DataFrame(ratings_dict)
print("Ratings DataFrame:")
print(ratings)
# Pivot to user-item matrix
user_item = ratings.pivot(index='user_id',
                          columns='movie_id',
                          values='rating').fillna(0)
print("\nUser-Item Matrix:")
print(user_item)
🤝 Collaborative Filtering - User-Based
# Calculate user similarity (note: fillna(0) means cosine similarity
# treats missing ratings as actual zeros, a simplification on real data)
user_similarity = cosine_similarity(user_item)
user_sim_df = pd.DataFrame(user_similarity,
                           index=user_item.index,
                           columns=user_item.index)
print("User Similarity Matrix:")
print(user_sim_df)
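As a sanity check, any entry of this matrix can be reproduced by hand: cosine similarity is the dot product of two users' rating vectors divided by the product of their norms. A minimal verification for users 1 and 2, using only NumPy:

# Reproduce one entry of the similarity matrix manually
u1 = user_item.loc[1].values
u2 = user_item.loc[2].values
manual = np.dot(u1, u2) / (np.linalg.norm(u1) * np.linalg.norm(u2))
print(f"Manual cosine(user 1, user 2): {manual:.4f}")
print(f"Matrix entry: {user_sim_df.loc[1, 2]:.4f}")  # should match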
# Recommend for user 1
def recommend_user_based(user_id, user_item, user_sim_df, n=3):
    # Get similar users (drop the target user's own row rather than
    # relying on a positional slice, which can break on ties)
    similar_users = user_sim_df[user_id].drop(user_id).sort_values(ascending=False)
    # Get items the user hasn't rated
    user_ratings = user_item.loc[user_id]
    unrated_items = user_ratings[user_ratings == 0].index
    # Score each unrated item as a similarity-weighted average rating
    scores = {}
    for item in unrated_items:
        score = 0
        sim_sum = 0
        for similar_user, similarity in similar_users.items():
            rating = user_item.loc[similar_user, item]
            if rating > 0:
                score += similarity * rating
                sim_sum += similarity
        if sim_sum > 0:
            scores[item] = score / sim_sum
    # Top N recommendations
    recommendations = sorted(scores.items(),
                             key=lambda x: x[1],
                             reverse=True)[:n]
    return recommendations
recs = recommend_user_based(1, user_item, user_sim_df)
print(f"\nRecommendations for User 1: {recs}")
🎬 Collaborative Filtering - Item-Based
# Calculate item similarity
item_similarity = cosine_similarity(user_item.T)
item_sim_df = pd.DataFrame(item_similarity,
                           index=user_item.columns,
                           columns=user_item.columns)
print("Item Similarity Matrix:")
print(item_sim_df)
# Recommend based on items user liked
def recommend_item_based(user_id, user_item, item_sim_df, n=3):
    user_ratings = user_item.loc[user_id]
    rated_items = user_ratings[user_ratings > 0]
    # Score unrated items by similarity to the items the user rated
    scores = {}
    for item in user_item.columns:
        if user_ratings[item] == 0:
            score = 0
            sim_sum = 0
            for rated_item, rating in rated_items.items():
                similarity = item_sim_df.loc[item, rated_item]
                score += similarity * rating
                sim_sum += similarity
            if sim_sum > 0:
                scores[item] = score / sim_sum
    recommendations = sorted(scores.items(),
                             key=lambda x: x[1],
                             reverse=True)[:n]
    return recommendations
recs = recommend_item_based(1, user_item, item_sim_df)
print(f"\nItem-based recommendations for User 1: {recs}")
🔢 Matrix Factorization - SVD
from scipy.sparse.linalg import svds
# Perform SVD
U, sigma, Vt = svds(user_item.values, k=2) # k latent factors
# Reconstruct matrix
sigma = np.diag(sigma)
predicted_ratings = np.dot(np.dot(U, sigma), Vt)
predicted_df = pd.DataFrame(predicted_ratings,
                            index=user_item.index,
                            columns=user_item.columns)
print("Predicted Ratings:")
print(predicted_df)
# Recommend for user
def recommend_svd(user_id, user_item, predicted_df, n=3):
    user_ratings = user_item.loc[user_id]
    predictions = predicted_df.loc[user_id]
    # Get unrated items and rank them by predicted rating
    unrated = user_ratings[user_ratings == 0].index
    recommendations = predictions[unrated].sort_values(ascending=False)[:n]
    return list(zip(recommendations.index, recommendations.values))
recs = recommend_svd(1, user_item, predicted_df)
print(f"\nSVD recommendations for User 1: {recs}")
⚡ Surprise Library
# pip install scikit-surprise
from surprise import Dataset, Reader, SVD, KNNBasic
from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy
# Load data
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['user_id', 'movie_id', 'rating']], reader)
# Split train/test
trainset, testset = train_test_split(data, test_size=0.25)
# SVD Algorithm
algo = SVD(n_factors=10, n_epochs=20, lr_all=0.005, reg_all=0.02)
algo.fit(trainset)
# Predict
predictions = algo.test(testset)
rmse_score = accuracy.rmse(predictions, verbose=False)  # verbose=False avoids double printing
print(f"RMSE: {rmse_score:.3f}")
# Predict single rating
user_id = 1
movie_id = 5
pred = algo.predict(user_id, movie_id)
print(f"\nPredicted rating for User {user_id}, Movie {movie_id}: {pred.est:.2f}")
# Get top N recommendations
from collections import defaultdict
def get_top_n(predictions, n=3):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n
top_n = get_top_n(predictions, n=3)
print(f"\nTop 3 for each user: {dict(top_n)}")
📝 Content-Based Filtering
from sklearn.feature_extraction.text import TfidfVectorizer
# Movie features
movies = pd.DataFrame({
    'movie_id': [1, 2, 3, 4, 5],
    'title': ['Action Hero', 'Romance Story', 'Action Adventure',
              'Romantic Comedy', 'Sci-Fi Action'],
    'genres': ['action thriller', 'romance drama', 'action adventure',
               'romance comedy', 'scifi action']
})
# TF-IDF on genres
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['genres'])
# Calculate similarity
content_similarity = cosine_similarity(tfidf_matrix)
content_sim_df = pd.DataFrame(content_similarity,
                              index=movies['movie_id'],
                              columns=movies['movie_id'])
print("Content Similarity:")
print(content_sim_df)
# Recommend similar movies
def recommend_content_based(movie_id, similarity_df, movies, n=3):
    # [1:] skips the first entry, which is the movie itself (similarity 1.0)
    similar_scores = similarity_df[movie_id].sort_values(ascending=False)[1:n+1]
    recommended_movies = movies[movies['movie_id'].isin(similar_scores.index)]
    return recommended_movies[['movie_id', 'title']].values.tolist()
recs = recommend_content_based(1, content_sim_df, movies)
print(f"\nSimilar to Movie 1: {recs}")
🧠 Neural Collaborative Filtering
from tensorflow import keras
from tensorflow.keras import layers
# Prepare data (IDs here are contiguous and 1-based, so subtracting 1
# yields valid 0-based embedding indices; real data needs an explicit mapping)
n_users = ratings['user_id'].nunique()
n_movies = ratings['movie_id'].nunique()
user_ids = ratings['user_id'].values - 1  # 0-indexed
movie_ids = ratings['movie_id'].values - 1
ratings_values = ratings['rating'].values
# Build NCF model
embedding_size = 8
# User embedding
user_input = layers.Input(shape=(1,))
user_embedding = layers.Embedding(n_users, embedding_size)(user_input)
user_vec = layers.Flatten()(user_embedding)
# Movie embedding
movie_input = layers.Input(shape=(1,))
movie_embedding = layers.Embedding(n_movies, embedding_size)(movie_input)
movie_vec = layers.Flatten()(movie_embedding)
# Concatenate and predict
concat = layers.concatenate([user_vec, movie_vec])
dense1 = layers.Dense(64, activation='relu')(concat)
dense2 = layers.Dense(32, activation='relu')(dense1)
output = layers.Dense(1)(dense2)
model = keras.Model(inputs=[user_input, movie_input], outputs=output)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
# Train
history = model.fit(
    [user_ids, movie_ids],
    ratings_values,
    epochs=50,
    batch_size=4,
    validation_split=0.2,
    verbose=0
)
# Predict
test_user = np.array([0]) # User 1
test_movie = np.array([4]) # Movie 5
prediction = model.predict([test_user, test_movie])
print(f"NCF Prediction: {prediction[0][0]:.2f}")
📊 Evaluation Metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error
# RMSE
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# MAE
def mae(y_true, y_pred):
    return mean_absolute_error(y_true, y_pred)

# Precision@K: fraction of the top-k recommendations that are relevant
def precision_at_k(recommendations, relevant_items, k):
    recommended_items = [item for item, score in recommendations[:k]]
    hits = len(set(recommended_items) & set(relevant_items))
    return hits / k

# Recall@K: fraction of the relevant items that appear in the top k
def recall_at_k(recommendations, relevant_items, k):
    recommended_items = [item for item, score in recommendations[:k]]
    hits = len(set(recommended_items) & set(relevant_items))
    return hits / len(relevant_items) if relevant_items else 0
# Example
recs = [(1, 4.5), (3, 4.2), (5, 4.0), (2, 3.8)]
relevant = [1, 3, 7]
print(f"Precision@3: {precision_at_k(recs, relevant, 3):.2f}")
print(f"Recall@3: {recall_at_k(recs, relevant, 3):.2f}")
💡 Best Practices
- Start with collaborative filtering: Often most effective
- Handle cold start: Use content-based or popularity fallbacks for new users/items (see the sketch after this list)
- Use implicit feedback: Clicks, views, not just ratings
- Consider diversity: Don't just recommend similar items
- Update regularly: User preferences change over time
- A/B test: Measure real user engagement
- Explain recommendations: Increase trust and engagement
- Handle popularity bias: Don't only recommend popular items
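For the cold-start point above, the simplest fallback is popularity: recommend the most-rated, best-rated items to users with no history. A minimal sketch on the toy ratings from earlier:

# Cold-start fallback: rank items by rating count, then mean rating
popularity = (ratings.groupby('movie_id')['rating']
                     .agg(['count', 'mean'])
                     .sort_values(['count', 'mean'], ascending=False))
print("Fallback picks for a brand-new user:")
print(popularity.head(3))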
⚠️ Common Challenges
- Cold start problem: New users/items have no history
- Solution: Use content-based, ask for preferences, use popular items
- Sparsity: Most users rate very few items (see the density check after this list)
- Solution: Matrix factorization, hybrid approaches
- Scalability: Millions of users and items
- Solution: Approximate methods, caching, batch processing
- Filter bubble: Only show similar content
- Solution: Add diversity, exploration vs exploitation
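To make the sparsity point concrete, here is how empty even the toy matrix is; production matrices are usually well over 99% empty:

# Fraction of user-item cells with no rating
n_cells = user_item.shape[0] * user_item.shape[1]
sparsity = 1 - (user_item.values > 0).sum() / n_cells
print(f"Matrix sparsity: {sparsity:.0%}")  # 12 of 25 cells rated -> 52% empty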
🎯 Key Takeaways
- Collaborative filtering uses user behavior patterns
- Content-based recommends similar items by features
- Matrix factorization (SVD) reduces dimensionality
- The Surprise library provides easy implementations of classic algorithms
- Neural networks capture complex patterns
- Hybrid approaches combine multiple methods
- Evaluate with Precision@K, Recall@K, RMSE