Project Overview
Difficulty: Intermediate
Goal: Build a Netflix-style movie recommendation system
Techniques: Collaborative Filtering, Content-Based Filtering
Time Required: 3-4 hours
What You'll Learn
- Collaborative filtering algorithms
- Content-based recommendation
- Matrix factorization
- Similarity metrics (cosine, Euclidean)
- Handling sparse data
Types of Recommendation Systems
Content-Based
Recommends items similar to ones the user already likes, based on item features (e.g., genre)
Collaborative Filtering
Recommends items based on the rating patterns of users with similar tastes
Hybrid
Combines both approaches
Step 1: Load Movie Dataset
# movie_recommender.py
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
# Sample movie data (in practice, use MovieLens dataset)
# One tuple per movie: (movie_id, title, genre, year).
movies = pd.DataFrame(
    [
        (1, 'The Matrix', 'Sci-Fi Action', 1999),
        (2, 'Inception', 'Sci-Fi Thriller', 2010),
        (3, 'Interstellar', 'Sci-Fi Drama', 2014),
        (4, 'The Godfather', 'Crime Drama', 1972),
        (5, 'Pulp Fiction', 'Crime Thriller', 1994),
        (6, 'The Dark Knight', 'Action Thriller', 2008),
    ],
    columns=['movie_id', 'title', 'genre', 'year'],
)

# User ratings, one tuple per rating: (user_id, movie_id, rating).
ratings = pd.DataFrame(
    [
        (1, 1, 5), (1, 2, 4), (1, 3, 5),
        (2, 1, 4), (2, 4, 5), (2, 5, 4),
        (3, 2, 5), (3, 3, 5), (3, 6, 4),
        (4, 1, 3), (4, 4, 4), (4, 6, 5),
    ],
    columns=['user_id', 'movie_id', 'rating'],
)

# Show both tables so the reader can see the raw inputs.
print("Movies:", movies, sep="\n")
print("\nRatings:", ratings, sep="\n")
Step 2: Content-Based Filtering
# Content-based recommendation
def content_based_recommendations(movie_title, movies, n=3):
    """Recommend the movies whose genres are most similar to a given movie.

    Genre strings are vectorized with TF-IDF and compared by cosine
    similarity.

    Args:
        movie_title: Title to look up in ``movies['title']`` (exact match;
            the first matching row is used).
        movies: DataFrame with at least 'title' and 'genre' columns. Its
            index may be arbitrary; rows are addressed by position.
        n: Maximum number of recommendations to return.

    Returns:
        DataFrame with 'title', 'genre' and 'similarity' columns, most
        similar first, keeping the row labels of ``movies``.

    Raises:
        IndexError: If no movie has the given title.
    """
    # Locate the movie by *position*: the similarity scores are positional,
    # so using the DataFrame's index label would break on non-default indexes.
    matches = np.flatnonzero((movies['title'] == movie_title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Movie title not found: {movie_title!r}")
    position = int(matches[0])

    # Create TF-IDF matrix from genres
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(movies['genre'])

    # Only the query movie's row of the similarity matrix is needed.
    scores = cosine_similarity(tfidf_matrix[position], tfidf_matrix).ravel()

    # Rank by descending similarity and drop the query movie explicitly.
    # Skipping "the first entry" is wrong when another movie has an identical
    # genre and ties with the query at similarity 1.0.
    ranked = [int(i) for i in np.argsort(-scores, kind='stable') if i != position]
    ranked = ranked[:max(n, 0)]

    # Copy before adding a column so the caller's frame is never touched and
    # pandas does not warn about assigning to a slice.
    recommendations = movies.iloc[ranked][['title', 'genre']].copy()
    recommendations['similarity'] = scores[ranked]
    return recommendations
# Try the content-based recommender on a single title
print("\nContent-Based Recommendations for 'The Matrix':")
print(content_based_recommendations(movie_title='The Matrix', movies=movies))
# Example output (illustrative -- the exact similarity values depend on how
# TF-IDF tokenizes and weights the genre strings, so they may differ):
# title genre similarity
# 1 Inception Sci-Fi Thriller 0.577350
# 2 Interstellar Sci-Fi Drama 0.408248
# 5 The Dark Knight Action Thriller 0.333333
Step 3: User-Based Collaborative Filtering
# Create user-movie rating matrix
def create_rating_matrix(ratings, movies):
    """Pivot the long ratings table into a user x movie matrix.

    Rows are user ids, columns are movie ids, and each cell holds the user's
    rating for that movie (duplicates are averaged by ``pivot_table``).
    Missing user/movie pairs are filled with 0. ``movies`` is accepted for
    interface symmetry but is not used.
    """
    pivoted = pd.pivot_table(
        ratings,
        values='rating',
        index='user_id',
        columns='movie_id',
    )
    # 0 marks "not rated"; the later steps rely on this convention.
    return pivoted.fillna(0)
# Build the user x movie matrix and display it
rating_matrix = create_rating_matrix(ratings, movies)
print("\nRating Matrix:", rating_matrix, sep="\n")
# Calculate user similarity
def calculate_user_similarity(rating_matrix):
    """Return the user-by-user cosine similarity of the rating rows.

    The result is a square DataFrame labelled on both axes with the row
    labels of ``rating_matrix``.
    """
    user_ids = rating_matrix.index
    pairwise = cosine_similarity(rating_matrix)
    return pd.DataFrame(pairwise, index=user_ids, columns=user_ids)
# Compute the user-user similarity table and display it
user_sim = calculate_user_similarity(rating_matrix)
print("\nUser Similarity:", user_sim, sep="\n")
# Recommend based on similar users
def collaborative_user_recommendations(user_id, rating_matrix, user_similarity, n=3):
    """Recommend unrated movies using similarity-weighted neighbour ratings.

    For every movie the target user has not rated (rating 0 in
    ``rating_matrix``), the predicted score is the average of the other
    users' ratings for that movie, weighted by their similarity to the
    target user. Users who have not rated the movie are ignored.

    Args:
        user_id: Label of the target user in ``rating_matrix`` /
            ``user_similarity``.
        rating_matrix: User x movie DataFrame where 0 means "not rated".
        user_similarity: Square user x user similarity DataFrame.
        n: Maximum number of recommendations.

    Returns:
        List of ``(movie_id, predicted_score)`` tuples, best first.

    Raises:
        KeyError: If ``user_id`` is not present in the inputs.
    """
    # Remove the target user by label. Dropping "the first entry after
    # sorting" is wrong when a neighbour ties with the user at similarity 1.0.
    neighbour_weights = user_similarity[user_id].drop(user_id)

    # Movies the target user has not rated yet
    user_ratings = rating_matrix.loc[user_id]
    unwatched_movies = user_ratings[user_ratings == 0].index

    recommendations = {}
    for movie in unwatched_movies:
        neighbour_ratings = rating_matrix.loc[neighbour_weights.index, movie]
        rated = neighbour_ratings > 0  # only neighbours who rated this movie
        similarity_sum = neighbour_weights[rated].sum()
        # Skip movies with no usable neighbour (avoids division by zero).
        if similarity_sum > 0:
            weighted_sum = (neighbour_weights[rated] * neighbour_ratings[rated]).sum()
            recommendations[movie] = weighted_sum / similarity_sum

    # Sort and return top N
    return sorted(recommendations.items(),
                  key=lambda item: item[1], reverse=True)[:n]
# Show user-based collaborative recommendations for user 1
print("\nCollaborative Recommendations for User 1:")
recs = collaborative_user_recommendations(1, rating_matrix, user_sim)
for movie_id, score in recs:
    # Translate the movie id back into a human-readable title
    movie_title = movies.loc[movies['movie_id'] == movie_id, 'title'].iloc[0]
    print(f"{movie_title}: {score:.2f}")
Step 4: Item-Based Collaborative Filtering
# Item-based collaborative filtering
def calculate_item_similarity(rating_matrix):
    """Return the movie-by-movie cosine similarity of the rating columns.

    Each movie is represented by its column of user ratings; the result is a
    square DataFrame labelled on both axes with the column labels of
    ``rating_matrix``.
    """
    movie_ids = rating_matrix.columns
    pairwise = cosine_similarity(rating_matrix.transpose())
    return pd.DataFrame(pairwise, index=movie_ids, columns=movie_ids)
# Compute the movie-movie similarity table and display it
item_sim = calculate_item_similarity(rating_matrix)
print("\nMovie Similarity Matrix:", item_sim, sep="\n")
def collaborative_item_recommendations(user_id, movie_id, rating_matrix,
                                       item_similarity, movies, n=3):
    """Recommend movies similar to one the user liked.

    Args:
        user_id: Label of the user in ``rating_matrix``.
        movie_id: Reference movie whose neighbours are recommended.
        rating_matrix: User x movie DataFrame where 0 means "not rated".
        item_similarity: Square movie x movie similarity DataFrame.
        movies: DataFrame with 'movie_id' and 'title' columns.
        n: Maximum number of recommendations.

    Returns:
        List of ``(title, similarity)`` tuples for movies the user has not
        rated, most similar first. The reference movie itself is never
        returned.

    Raises:
        KeyError: If ``user_id`` or ``movie_id`` is missing from the inputs.
        IndexError: If a recommended movie id has no row in ``movies``.
    """
    # Drop the reference movie by label rather than by sorted position: a
    # movie that ties with it at similarity 1.0 could otherwise be discarded
    # in its place.
    similar_movies = (
        item_similarity[movie_id]
        .drop(movie_id)
        .sort_values(ascending=False)
    )

    # Filter out movies user has already rated
    user_ratings = rating_matrix.loc[user_id]
    unwatched = user_ratings[user_ratings == 0].index

    # Get top similar unwatched movies
    top_matches = similar_movies[similar_movies.index.isin(unwatched)].iloc[:n]

    # Use a distinct loop name so the ``movie_id`` parameter is not shadowed.
    result = []
    for candidate_id, similarity in top_matches.items():
        title = movies[movies['movie_id'] == candidate_id]['title'].values[0]
        result.append((title, similarity))
    return result
# Show item-based recommendations for user 1, using movie 1 as the reference
print("\nItem-Based Recommendations (similar to movie 1):")
recs = collaborative_item_recommendations(
    user_id=1,
    movie_id=1,
    rating_matrix=rating_matrix,
    item_similarity=item_sim,
    movies=movies,
)
for title, similarity in recs:
    print(f"{title}: {similarity:.2f}")
Step 5: Matrix Factorization (Advanced)
# Using Singular Value Decomposition (SVD)
from scipy.sparse.linalg import svds
def matrix_factorization_recommendations(rating_matrix, n_factors=2):
    """Predict ratings with a truncated SVD of the rating matrix.

    The matrix is approximated by its ``n_factors`` largest singular
    triplets; the reconstruction is returned as the predicted ratings.

    Args:
        rating_matrix: User x movie DataFrame (0 means "not rated").
        n_factors: Number of latent factors (singular values) to keep.
            ``svds`` requires this to be smaller than both dimensions of
            the matrix.

    Returns:
        DataFrame of predicted ratings with the same index and columns as
        ``rating_matrix``.

    Raises:
        ValueError: Propagated from ``svds`` when ``n_factors`` is out of
            range for the matrix shape.
    """
    # svds only accepts floating-point input. A rating matrix built from
    # integer ratings with no missing cells would otherwise be rejected.
    ratings_array = rating_matrix.to_numpy(dtype=float)

    # Perform SVD
    U, sigma, Vt = svds(ratings_array, k=n_factors)

    # Scaling U's columns by sigma equals U @ diag(sigma) without building
    # the diagonal matrix.
    predicted_ratings = (U * sigma) @ Vt

    return pd.DataFrame(
        predicted_ratings,
        index=rating_matrix.index,
        columns=rating_matrix.columns,
    )
# Factorize the rating matrix and display the reconstructed ratings
predicted_ratings = matrix_factorization_recommendations(rating_matrix)
print("\nPredicted Ratings:", predicted_ratings, sep="\n")
def get_svd_recommendations(user_id, predicted_ratings, rating_matrix, movies, n=3):
    """Return the top-``n`` unrated movies for a user by predicted rating.

    A movie counts as unrated when the user's entry in ``rating_matrix`` is
    0. The result is a list of ``(title, predicted_rating)`` tuples, highest
    prediction first.
    """
    actual = rating_matrix.loc[user_id]
    unwatched = actual[actual == 0].index

    # Rank only the movies the user has not rated yet.
    predictions_for_user = predicted_ratings.loc[user_id]
    best = predictions_for_user[unwatched].sort_values(ascending=False).iloc[:n]

    return [
        (movies.loc[movies['movie_id'] == movie_id, 'title'].iloc[0], predicted)
        for movie_id, predicted in best.items()
    ]
# Show SVD-based recommendations for user 1
print("\nSVD Recommendations for User 1:")
recs = get_svd_recommendations(
    user_id=1,
    predicted_ratings=predicted_ratings,
    rating_matrix=rating_matrix,
    movies=movies,
)
for title, rating in recs:
    print(f"{title}: {rating:.2f}")
Step 6: Complete Recommendation System
# complete_recommender.py - Hybrid approach
class MovieRecommender:
    """Hybrid recommender combining collaborative and content-based filtering.

    On construction the ratings are pivoted into a user x movie matrix
    (0 means "not rated") and user-user / movie-movie cosine similarities
    are computed from it.
    """

    def __init__(self, movies, ratings):
        """Store the data and precompute the rating and similarity matrices.

        Args:
            movies: DataFrame with 'movie_id', 'title' and 'genre' columns.
            ratings: DataFrame with 'user_id', 'movie_id' and 'rating' columns.
        """
        self.movies = movies
        self.ratings = ratings
        self.rating_matrix = self._create_rating_matrix()
        self.user_similarity = self._calculate_user_similarity()
        self.item_similarity = self._calculate_item_similarity()

    def _create_rating_matrix(self):
        """Pivot ratings into a user x movie matrix, filling gaps with 0."""
        return self.ratings.pivot_table(
            index='user_id',
            columns='movie_id',
            values='rating'
        ).fillna(0)

    def _calculate_user_similarity(self):
        """Return the user x user cosine similarity as a DataFrame."""
        sim = cosine_similarity(self.rating_matrix)
        return pd.DataFrame(sim,
                            index=self.rating_matrix.index,
                            columns=self.rating_matrix.index)

    def _calculate_item_similarity(self):
        """Return the movie x movie cosine similarity as a DataFrame."""
        sim = cosine_similarity(self.rating_matrix.T)
        return pd.DataFrame(sim,
                            index=self.rating_matrix.columns,
                            columns=self.rating_matrix.columns)

    def _title_of(self, movie_id):
        """Return the title for ``movie_id``, or None if it is not in ``movies``."""
        titles = self.movies.loc[self.movies['movie_id'] == movie_id, 'title']
        return titles.iloc[0] if len(titles) else None

    def recommend_for_user(self, user_id, n=5):
        """Get hybrid recommendations as a list of movie titles.

        Collaborative-filtering picks come first (best predicted score
        first), followed by content-based neighbours of the movies the user
        rated 4 or higher. Duplicates and movies the user has already rated
        are removed; at most ``n`` titles are returned. A user with no
        ratings gets an empty list.
        """
        # Get collaborative filtering recommendations ({movie_id: score})
        collab_recs = self._collaborative_recommendations(user_id, n)
        candidates = [self._title_of(movie_id) for movie_id in collab_recs]

        user_rows = self.ratings[self.ratings['user_id'] == user_id]
        watched_titles = {self._title_of(movie_id)
                          for movie_id in user_rows['movie_id']}

        # Get content-based recommendations for user's favorite movies
        liked_ids = user_rows.loc[user_rows['rating'] >= 4, 'movie_id']
        for movie_id in liked_ids:
            title = self._title_of(movie_id)
            if title is None:
                continue  # rated movie has no metadata row; nothing to compare
            recs = content_based_recommendations(title, self.movies, n=2)
            candidates.extend(recs['title'].tolist())

        # Combine recommendations: dict.fromkeys de-duplicates while keeping
        # a deterministic order (a set would shuffle the results).
        ordered = dict.fromkeys(
            title for title in candidates
            if title is not None and title not in watched_titles
        )
        return list(ordered)[:n]

    def _collaborative_recommendations(self, user_id, n):
        """Predict scores for unrated movies from similar users' ratings.

        Each unrated movie is scored by the similarity-weighted average of
        the ratings given by the other users who rated it.

        Returns:
            Dict mapping movie id to predicted score, holding at most ``n``
            entries in descending score order. Empty when ``user_id`` has
            no row in the rating matrix.
        """
        if user_id not in self.rating_matrix.index:
            return {}

        # Exclude the target user by label, not by sorted position.
        neighbour_weights = self.user_similarity[user_id].drop(user_id)
        user_ratings = self.rating_matrix.loc[user_id]
        unwatched = user_ratings[user_ratings == 0].index

        scores = {}
        for movie_id in unwatched:
            neighbour_ratings = self.rating_matrix.loc[neighbour_weights.index, movie_id]
            rated = neighbour_ratings > 0  # neighbours who rated this movie
            similarity_sum = neighbour_weights[rated].sum()
            # Skip movies with no usable neighbour (avoids division by zero).
            if similarity_sum > 0:
                weighted_sum = (neighbour_weights[rated] * neighbour_ratings[rated]).sum()
                scores[movie_id] = weighted_sum / similarity_sum

        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        return dict(ranked[:n])
# Build the hybrid recommender from the sample data
recommender = MovieRecommender(movies=movies, ratings=ratings)
# Ask it for user 1's top three recommendations and list them
print("\nHybrid Recommendations for User 1:")
recommendations = recommender.recommend_for_user(user_id=1, n=3)
for rec in recommendations:
    print(f"- {rec}")
Real-World Applications
Where This Technology Is Used
- Netflix: Movie and TV show recommendations
- Amazon: Product recommendations
- Spotify: Music and podcast suggestions
- YouTube: Video recommendations
- LinkedIn: Job and connection suggestions
Enhancement Ideas
- 🎯 Add real MovieLens dataset (100K+ ratings)
- 🎯 Implement deep learning recommendation (Neural CF)
- 🎯 Add time-based features (trending items)
- 🎯 Include demographic information
- 🎯 Add A/B testing framework
- 🎯 Build web interface with Flask/Django
- 🎯 Add real-time updates