🎯 Project Overview
Use K-Means clustering to segment customers based on their purchasing behavior. This unsupervised learning project helps businesses identify distinct customer groups for targeted marketing strategies.
What You'll Learn:
- Unsupervised learning with K-Means
- Finding the optimal number of clusters (elbow method and silhouette analysis)
- Feature scaling and PCA
- Customer profile interpretation
- Business insights from clustering
🛠️ Technologies
- Python 3.8+
- Pandas
- NumPy
- Scikit-learn
- Matplotlib
- Seaborn
📦 Step 1: Import Libraries and Generate Data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import warnings
warnings.filterwarnings('ignore')
# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# Generate synthetic customer data
np.random.seed(42)
n_customers = 500
# Customer features
data = {
    'CustomerID': range(1, n_customers + 1),
    'Age': np.random.randint(18, 70, n_customers),
    'Annual_Income': np.random.randint(15, 150, n_customers) * 1000,
    'Spending_Score': np.random.randint(1, 100, n_customers),
    'Purchase_Frequency': np.random.randint(1, 50, n_customers),
    'Average_Transaction': np.random.randint(20, 500, n_customers),
    'Tenure_Months': np.random.randint(1, 60, n_customers)
}
df = pd.DataFrame(data)
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nStatistical Summary:")
print(df.describe())
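One caveat about this dataset: each feature is drawn independently, so the synthetic customers have no planted cluster structure, and K-Means will carve the space into segments of convenience rather than recover "true" groups. If you would rather practice on data with clearly recoverable clusters, scikit-learn's make_blobs is one option (a sketch; the parameter values here are illustrative):
from sklearn.datasets import make_blobs
# Generate 500 six-dimensional points around 4 planted centers
X_blobs, y_blobs = make_blobs(n_samples=500, n_features=6, centers=4,
                              cluster_std=1.5, random_state=42)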
📊 Step 2: Exploratory Data Analysis
# Check for missing values
print("Missing values:")
print(df.isnull().sum())
# Distributions
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()
features = ['Age', 'Annual_Income', 'Spending_Score',
            'Purchase_Frequency', 'Average_Transaction', 'Tenure_Months']
for idx, feature in enumerate(features):
    axes[idx].hist(df[feature], bins=30, edgecolor='black')
    axes[idx].set_title(f'{feature} Distribution')
    axes[idx].set_xlabel(feature)
    axes[idx].set_ylabel('Frequency')
plt.tight_layout()
plt.show()
# Correlation matrix
plt.figure(figsize=(10, 8))
correlation = df[features].corr()
sns.heatmap(correlation, annot=True, fmt='.2f', cmap='coolwarm',
            square=True, linewidths=0.5)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()
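Because the features were generated independently, every off-diagonal correlation should hover near zero, and the heatmap should confirm it. A quick programmatic check:
# With independently generated features, expect |r| near 0 off the diagonal
off_diag = correlation.where(~np.eye(len(features), dtype=bool))
print(f"Max absolute off-diagonal correlation: {off_diag.abs().max().max():.3f}")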
# Pairplot for key features
key_features = ['Annual_Income', 'Spending_Score', 'Purchase_Frequency']
sns.pairplot(df[key_features])
plt.suptitle('Pairplot of Key Features', y=1.02)
plt.show()
🔧 Step 3: Feature Preparation
# Select features for clustering (exclude CustomerID)
feature_columns = ['Age', 'Annual_Income', 'Spending_Score',
                   'Purchase_Frequency', 'Average_Transaction', 'Tenure_Months']
X = df[feature_columns].values
print("Feature matrix shape:", X.shape)
# Scale features (important for K-Means!)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print("\nOriginal data (first 3 samples):")
print(X[:3])
print("\nScaled data (first 3 samples):")
print(X_scaled[:3])
# Check scaling
print("\nScaled features - Mean:", X_scaled.mean(axis=0))
print("Scaled features - Std:", X_scaled.std(axis=0))
📈 Step 4: Find Optimal Number of Clusters
# Elbow Method
inertias = []
silhouette_scores = []
K_range = range(2, 11)
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))
# Plot Elbow Curve
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
# Inertia (Within-cluster sum of squares)
axes[0].plot(K_range, inertias, 'bo-', linewidth=2, markersize=8)
axes[0].set_xlabel('Number of Clusters (k)')
axes[0].set_ylabel('Inertia (WCSS)')
axes[0].set_title('Elbow Method - Inertia')
axes[0].grid(True)
# Silhouette Score
axes[1].plot(K_range, silhouette_scores, 'ro-', linewidth=2, markersize=8)
axes[1].set_xlabel('Number of Clusters (k)')
axes[1].set_ylabel('Silhouette Score')
axes[1].set_title('Silhouette Score by K')
axes[1].grid(True)
plt.tight_layout()
plt.show()
print("Inertia values:", inertias)
print("Silhouette scores:", silhouette_scores)
# Based on the elbow curve, cross-checked against the silhouette scores, choose k=4
optimal_k = 4
print(f"\nChosen number of clusters: {optimal_k}")
🎯 Step 5: Train K-Means Model
# Train K-Means with optimal k
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_scaled)
# Add cluster labels to dataframe
df['Cluster'] = clusters
print(f"Cluster assignments (first 10): {clusters[:10]}")
print(f"\nCluster distribution:")
print(df['Cluster'].value_counts().sort_index())
# Cluster centers (in original scale)
cluster_centers_scaled = kmeans.cluster_centers_
cluster_centers = scaler.inverse_transform(cluster_centers_scaled)
# Create DataFrame for cluster centers
centers_df = pd.DataFrame(cluster_centers, columns=feature_columns)
centers_df['Cluster'] = range(optimal_k)
print("\nCluster Centers (Original Scale):")
print(centers_df)
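K-Means depends on its random initialization, so it is worth confirming that the segmentation is stable across seeds before profiling it. A minimal sketch using the adjusted Rand index (ARI near 1.0 means two runs produce essentially the same partition):
from sklearn.metrics import adjusted_rand_score
# Refit with different seeds and compare each labeling to the original;
# ARI close to 1.0 indicates a stable partition.
for seed in [0, 1, 2]:
    alt_labels = KMeans(n_clusters=optimal_k, random_state=seed, n_init=10).fit_predict(X_scaled)
    print(f"seed={seed}: ARI vs. original = {adjusted_rand_score(clusters, alt_labels):.3f}")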
🎨 Step 6: Visualize Clusters
# 2D Visualization using PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
plt.figure(figsize=(12, 8))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters,
                      cmap='viridis', s=50, alpha=0.6, edgecolors='k')
plt.colorbar(scatter, label='Cluster')
# Plot cluster centers
centers_pca = pca.transform(cluster_centers_scaled)
plt.scatter(centers_pca[:, 0], centers_pca[:, 1], c='red',
            s=300, alpha=0.8, edgecolors='black', linewidths=2,
            marker='X', label='Centroids')
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
plt.title('Customer Segments (PCA Visualization)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
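Keep in mind that the 2-D scatter is only a projection; it is worth checking how much of the total variance the two components retain, since cluster overlap in the plot can be an artifact of the discarded dimensions:
# Total variance retained by the 2-D projection
print(f"Variance captured by PC1 + PC2: {pca.explained_variance_ratio_.sum():.1%}")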
# 3D visualization
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 — only needed on older Matplotlib versions
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')
for cluster in range(optimal_k):
    cluster_data = df[df['Cluster'] == cluster]
    ax.scatter(cluster_data['Annual_Income'],
               cluster_data['Spending_Score'],
               cluster_data['Purchase_Frequency'],
               label=f'Cluster {cluster}', s=50, alpha=0.6)
ax.set_xlabel('Annual Income')
ax.set_ylabel('Spending Score')
ax.set_zlabel('Purchase Frequency')
ax.set_title('3D Customer Segmentation')
ax.legend()
plt.show()
📊 Step 7: Analyze Customer Segments
# Statistical analysis by cluster
cluster_analysis = df.groupby('Cluster')[feature_columns].mean()
print("Average values per cluster:")
print(cluster_analysis)
# Visualize cluster characteristics
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()
for idx, feature in enumerate(feature_columns):
    cluster_means = df.groupby('Cluster')[feature].mean()
    axes[idx].bar(cluster_means.index, cluster_means.values,
                  color=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728'][:optimal_k])
    axes[idx].set_xlabel('Cluster')
    axes[idx].set_ylabel(f'Average {feature}')
    axes[idx].set_title(f'{feature} by Cluster')
    axes[idx].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Box plots for key metrics
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
key_metrics = ['Annual_Income', 'Spending_Score',
               'Purchase_Frequency', 'Average_Transaction']
for idx, metric in enumerate(key_metrics):
    ax = axes[idx // 2, idx % 2]
    df.boxplot(column=metric, by='Cluster', ax=ax)
    ax.set_title(f'{metric} Distribution by Cluster')
    ax.set_xlabel('Cluster')
    ax.set_ylabel(metric)
plt.suptitle('') # Remove default title
plt.tight_layout()
plt.show()
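The raw-scale bar charts are hard to compare across features with very different units. A common complement (a sketch, reusing cluster_analysis from above) is a z-scored heatmap of the cluster means, where each cell reads as standard deviations above or below the across-cluster average:
# Z-score each column of the cluster-mean table so features with very
# different units (income vs. tenure) share one color scale.
normalized = (cluster_analysis - cluster_analysis.mean()) / cluster_analysis.std()
plt.figure(figsize=(10, 4))
sns.heatmap(normalized, annot=True, fmt='.2f', cmap='coolwarm', center=0)
plt.title('Cluster Profiles (z-scored feature means)')
plt.tight_layout()
plt.show()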
💼 Step 8: Business Insights & Profiling
# Create detailed cluster profiles
def profile_clusters(df, centers_df):
    profiles = {}
    for cluster in range(optimal_k):
        cluster_data = df[df['Cluster'] == cluster]
        profile = {
            'Size': len(cluster_data),
            'Percentage': f"{len(cluster_data) / len(df) * 100:.1f}%",
            'Avg_Age': f"{centers_df.loc[cluster, 'Age']:.0f}",
            'Avg_Income': f"${centers_df.loc[cluster, 'Annual_Income']:,.0f}",
            'Avg_Spending': f"{centers_df.loc[cluster, 'Spending_Score']:.0f}",
            'Avg_Frequency': f"{centers_df.loc[cluster, 'Purchase_Frequency']:.0f}",
            'Avg_Transaction': f"${centers_df.loc[cluster, 'Average_Transaction']:.0f}",
            'Avg_Tenure': f"{centers_df.loc[cluster, 'Tenure_Months']:.0f} months"
        }
        profiles[f'Cluster {cluster}'] = profile
    return pd.DataFrame(profiles).T
profile_df = profile_clusters(df, centers_df)
print("\n" + "="*80)
print("CUSTOMER SEGMENT PROFILES")
print("="*80)
print(profile_df)
# Name the segments based on their characteristics. Note: cluster indices
# are arbitrary, so verify each label against the profiles above before relying on it.
segment_names = {
    0: "Budget Conscious",
    1: "High-Value Customers",
    2: "Occasional Shoppers",
    3: "Loyal & Frequent"
}
df['Segment_Name'] = df['Cluster'].map(segment_names)
print("\n" + "="*80)
print("SEGMENT NAMING")
print("="*80)
for cluster, name in segment_names.items():
    count = len(df[df['Cluster'] == cluster])
    pct = count / len(df) * 100
    print(f"Cluster {cluster}: {name} ({count} customers, {pct:.1f}%)")
# Marketing recommendations
recommendations = {
    "Budget Conscious": [
        "Offer discount coupons and loyalty programs",
        "Promote value-for-money products",
        "Send budget-friendly bundle offers"
    ],
    "High-Value Customers": [
        "Provide premium services and exclusive access",
        "Offer personalized shopping experiences",
        "Invite to VIP events and early product launches"
    ],
    "Occasional Shoppers": [
        "Send targeted re-engagement campaigns",
        "Offer limited-time promotions",
        "Provide incentives for repeat purchases"
    ],
    "Loyal & Frequent": [
        "Reward with points and cashback",
        "Provide early access to sales",
        "Create referral program incentives"
    ]
}
print("\n" + "="*80)
print("MARKETING RECOMMENDATIONS")
print("="*80)
for segment, recs in recommendations.items():
    print(f"\n{segment}:")
    for rec in recs:
        print(f"  • {rec}")
📈 Step 9: Model Evaluation
# Silhouette analysis for chosen k
from sklearn.metrics import silhouette_samples
silhouette_avg = silhouette_score(X_scaled, clusters)
sample_silhouette_values = silhouette_samples(X_scaled, clusters)
print(f"Average Silhouette Score: {silhouette_avg:.3f}")
# Visualize silhouette scores
fig, ax = plt.subplots(figsize=(10, 7))
y_lower = 10
for i in range(optimal_k):
    # Silhouette values for cluster i, sorted for a smooth profile
    cluster_silhouette_values = sample_silhouette_values[clusters == i]
    cluster_silhouette_values.sort()
    size_cluster_i = cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i
    color = plt.cm.viridis(float(i) / optimal_k)
    ax.fill_betweenx(np.arange(y_lower, y_upper),
                     0, cluster_silhouette_values,
                     facecolor=color, edgecolor=color, alpha=0.7)
    ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
    y_lower = y_upper + 10
ax.set_xlabel("Silhouette Coefficient")
ax.set_ylabel("Cluster")
ax.axvline(x=silhouette_avg, color="red", linestyle="--",
           label=f'Average: {silhouette_avg:.3f}')
ax.set_title("Silhouette Plot for Customer Segments")
ax.legend()
plt.tight_layout()
plt.show()
# Davies-Bouldin Index (lower is better)
from sklearn.metrics import davies_bouldin_score
db_score = davies_bouldin_score(X_scaled, clusters)
print(f"\nDavies-Bouldin Index: {db_score:.3f} (lower is better)")
# Calinski-Harabasz Index (higher is better)
from sklearn.metrics import calinski_harabasz_score
ch_score = calinski_harabasz_score(X_scaled, clusters)
print(f"Calinski-Harabasz Score: {ch_score:.2f} (higher is better)")
💾 Step 10: Save Results
import joblib
# Save the model and scaler
joblib.dump(kmeans, 'customer_segmentation_model.pkl')
joblib.dump(scaler, 'feature_scaler.pkl')
print("Model saved successfully!")
# Save segmented customer data
df.to_csv('customer_segments.csv', index=False)
print("Customer segments saved to CSV!")
# Function to predict segment for new customers
def predict_segment(new_customer_data):
    """
    Predict the segment for a single new customer.
    new_customer_data: dict with keys matching feature_columns.
    """
    # Convert to a 2-D array in the training column order
    features = np.array([[new_customer_data[col] for col in feature_columns]])
    # Apply the same scaling used at training time
    features_scaled = scaler.transform(features)
    # Assign to the nearest cluster center
    cluster = kmeans.predict(features_scaled)[0]
    segment_name = segment_names[cluster]
    return cluster, segment_name
# Example: Predict for new customer
new_customer = {
    'Age': 35,
    'Annual_Income': 85000,
    'Spending_Score': 75,
    'Purchase_Frequency': 25,
    'Average_Transaction': 250,
    'Tenure_Months': 24
}
cluster, segment = predict_segment(new_customer)
print(f"\nNew customer belongs to:")
print(f" Cluster: {cluster}")
print(f" Segment: {segment}")
🎓 Key Takeaways
- K-Means Clustering: Effective unsupervised learning for customer segmentation
- Feature Scaling: Critical for K-Means to work properly
- Elbow Method: Helps determine optimal number of clusters
- Business Value: Enables targeted marketing and personalization
- Multiple Metrics: Validate with silhouette score, Davies-Bouldin, and Calinski-Harabasz indices
- Interpretability: Cluster centers provide actionable insights
🚀 Next Steps
- Try other clustering algorithms (DBSCAN, hierarchical clustering)
- Experiment with different feature combinations
- Add more behavioral features (product preferences, channel usage)
- Implement real-time segmentation with streaming data
- Build A/B tests to validate marketing strategies
- Create a dashboard to monitor segment evolution