🎯 Project Overview
Use K-Means clustering to segment customers based on their purchasing behavior. This unsupervised learning project helps businesses identify distinct customer groups for targeted marketing strategies.
What You'll Learn:
- Unsupervised learning with K-Means
- Finding the optimal number of clusters (elbow method and silhouette analysis)
- Feature scaling and PCA
- Customer profile interpretation
- Business insights from clustering
🛠️ Technologies
- Python 3.8+
- Pandas
- NumPy
- Scikit-learn
- Matplotlib
- Seaborn
📦 Step 1: Import Libraries and Generate Data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import warnings
warnings.filterwarnings('ignore')
# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# Generate synthetic customer data
np.random.seed(42)
n_customers = 500
# Customer features
data = {
    'CustomerID': range(1, n_customers + 1),
    'Age': np.random.randint(18, 70, n_customers),
    'Annual_Income': np.random.randint(15, 150, n_customers) * 1000,
    'Spending_Score': np.random.randint(1, 100, n_customers),
    'Purchase_Frequency': np.random.randint(1, 50, n_customers),
    'Average_Transaction': np.random.randint(20, 500, n_customers),
    'Tenure_Months': np.random.randint(1, 60, n_customers)
}
df = pd.DataFrame(data)
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nStatistical Summary:")
print(df.describe())
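One caveat about this dataset: each feature is drawn independently, so the synthetic customers have no planted cluster structure, and K-Means will carve the space into segments of convenience rather than recover "true" groups. If you would rather practice on data with clearly recoverable clusters, scikit-learn's make_blobs is one option (a sketch; the parameter values here are illustrative):
from sklearn.datasets import make_blobs
# Generate 500 six-dimensional points around 4 planted centers
X_blobs, y_blobs = make_blobs(n_samples=500, n_features=6, centers=4,
                              cluster_std=1.5, random_state=42)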
📊 Step 2: Exploratory Data Analysis
# Check for missing values
print("Missing values:")
print(df.isnull().sum())
# Distributions
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()
features = ['Age', 'Annual_Income', 'Spending_Score',
            'Purchase_Frequency', 'Average_Transaction', 'Tenure_Months']
for idx, feature in enumerate(features):
    axes[idx].hist(df[feature], bins=30, edgecolor='black')
    axes[idx].set_title(f'{feature} Distribution')
    axes[idx].set_xlabel(feature)
    axes[idx].set_ylabel('Frequency')
plt.tight_layout()
plt.show()
# Correlation matrix
plt.figure(figsize=(10, 8))
correlation = df[features].corr()
sns.heatmap(correlation, annot=True, fmt='.2f', cmap='coolwarm',
            square=True, linewidths=0.5)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()
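Because the features were generated independently, every off-diagonal correlation should hover near zero, and the heatmap should confirm it. A quick programmatic check:
# With independently generated features, expect |r| near 0 off the diagonal
off_diag = correlation.where(~np.eye(len(features), dtype=bool))
print(f"Max absolute off-diagonal correlation: {off_diag.abs().max().max():.3f}")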
# Pairplot for key features
key_features = ['Annual_Income', 'Spending_Score', 'Purchase_Frequency']
sns.pairplot(df[key_features])
plt.suptitle('Pairplot of Key Features', y=1.02)
plt.show()
🔧 Step 3: Feature Preparation
# Select features for clustering (exclude CustomerID)
feature_columns = ['Age', 'Annual_Income', 'Spending_Score',
                   'Purchase_Frequency', 'Average_Transaction', 'Tenure_Months']
X = df[feature_columns].values
print("Feature matrix shape:", X.shape)
# Scale features (important for K-Means!)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print("\nOriginal data (first 3 samples):")
print(X[:3])
print("\nScaled data (first 3 samples):")
print(X_scaled[:3])
# Check scaling
print("\nScaled features - Mean:", X_scaled.mean(axis=0))
print("Scaled features - Std:", X_scaled.std(axis=0))
📈 Step 4: Find Optimal Number of Clusters
# Elbow Method
inertias = []
silhouette_scores = []
K_range = range(2, 11)
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))
# Plot Elbow Curve
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
# Inertia (Within-cluster sum of squares)
axes[0].plot(K_range, inertias, 'bo-', linewidth=2, markersize=8)
axes[0].set_xlabel('Number of Clusters (k)')
axes[0].set_ylabel('Inertia (WCSS)')
axes[0].set_title('Elbow Method - Inertia')
axes[0].grid(True)
# Silhouette Score
axes[1].plot(K_range, silhouette_scores, 'ro-', linewidth=2, markersize=8)
axes[1].set_xlabel('Number of Clusters (k)')
axes[1].set_ylabel('Silhouette Score')
axes[1].set_title('Silhouette Score by K')
axes[1].grid(True)
plt.tight_layout()
plt.show()
print("Inertia values:", inertias)
print("Silhouette scores:", silhouette_scores)
# Based on the elbow curve, cross-checked against the silhouette scores, choose k=4
optimal_k = 4
print(f"\nChosen number of clusters: {optimal_k}")
🎯 Step 5: Train K-Means Model
# Train K-Means with optimal k
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_scaled)
# Add cluster labels to dataframe
df['Cluster'] = clusters
print(f"Cluster assignments (first 10): {clusters[:10]}")
print(f"\nCluster distribution:")
print(df['Cluster'].value_counts().sort_index())
# Cluster centers (in original scale)
cluster_centers_scaled = kmeans.cluster_centers_
cluster_centers = scaler.inverse_transform(cluster_centers_scaled)
# Create DataFrame for cluster centers
centers_df = pd.DataFrame(cluster_centers, columns=feature_columns)
centers_df['Cluster'] = range(optimal_k)
print("\nCluster Centers (Original Scale):")
print(centers_df)
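K-Means depends on its random initialization, so it is worth confirming that the segmentation is stable across seeds before profiling it. A minimal sketch using the adjusted Rand index (ARI near 1.0 means two runs produce essentially the same partition):
from sklearn.metrics import adjusted_rand_score
# Refit with different seeds and compare each labeling to the original;
# ARI close to 1.0 indicates a stable partition.
for seed in [0, 1, 2]:
    alt_labels = KMeans(n_clusters=optimal_k, random_state=seed, n_init=10).fit_predict(X_scaled)
    print(f"seed={seed}: ARI vs. original = {adjusted_rand_score(clusters, alt_labels):.3f}")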
🎨 Step 6: Visualize Clusters
# 2D Visualization using PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
plt.figure(figsize=(12, 8))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters,
                      cmap='viridis', s=50, alpha=0.6, edgecolors='k')
plt.colorbar(scatter, label='Cluster')
# Plot cluster centers
centers_pca = pca.transform(cluster_centers_scaled)
plt.scatter(centers_pca[:, 0], centers_pca[:, 1], c='red',
            s=300, alpha=0.8, edgecolors='black', linewidths=2,
            marker='X', label='Centroids')
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
plt.title('Customer Segments (PCA Visualization)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
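Keep in mind that the 2-D scatter is only a projection; it is worth checking how much of the total variance the two components retain, since cluster overlap in the plot can be an artifact of the discarded dimensions:
# Total variance retained by the 2-D projection
print(f"Variance captured by PC1 + PC2: {pca.explained_variance_ratio_.sum():.1%}")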
# 3D visualization
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 — only needed on older Matplotlib versions
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')
for cluster in range(optimal_k):
    cluster_data = df[df['Cluster'] == cluster]
    ax.scatter(cluster_data['Annual_Income'],
               cluster_data['Spending_Score'],
               cluster_data['Purchase_Frequency'],
               label=f'Cluster {cluster}', s=50, alpha=0.6)
ax.set_xlabel('Annual Income')
ax.set_ylabel('Spending Score')
ax.set_zlabel('Purchase Frequency')
ax.set_title('3D Customer Segmentation')
ax.legend()
plt.show()
📊 Step 7: Analyze Customer Segments
# Statistical analysis by cluster
cluster_analysis = df.groupby('Cluster')[feature_columns].mean()
print("Average values per cluster:")
print(cluster_analysis)
# Visualize cluster characteristics
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()
for idx, feature in enumerate(feature_columns):
    cluster_means = df.groupby('Cluster')[feature].mean()
    axes[idx].bar(cluster_means.index, cluster_means.values,
                  color=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728'][:optimal_k])
    axes[idx].set_xlabel('Cluster')
    axes[idx].set_ylabel(f'Average {feature}')
    axes[idx].set_title(f'{feature} by Cluster')
    axes[idx].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Box plots for key metrics
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
key_metrics = ['Annual_Income', 'Spending_Score',
               'Purchase_Frequency', 'Average_Transaction']
for idx, metric in enumerate(key_metrics):
    ax = axes[idx // 2, idx % 2]
    df.boxplot(column=metric, by='Cluster', ax=ax)
    ax.set_title(f'{metric} Distribution by Cluster')
    ax.set_xlabel('Cluster')
    ax.set_ylabel(metric)
plt.suptitle('') # Remove default title
plt.tight_layout()
plt.show()
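The raw-scale bar charts are hard to compare across features with very different units. A common complement (a sketch, reusing cluster_analysis from above) is a z-scored heatmap of the cluster means, where each cell reads as standard deviations above or below the across-cluster average:
# Z-score each column of the cluster-mean table so features with very
# different units (income vs. tenure) share one color scale.
normalized = (cluster_analysis - cluster_analysis.mean()) / cluster_analysis.std()
plt.figure(figsize=(10, 4))
sns.heatmap(normalized, annot=True, fmt='.2f', cmap='coolwarm', center=0)
plt.title('Cluster Profiles (z-scored feature means)')
plt.tight_layout()
plt.show()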
💼 Step 8: Business Insights & Profiling
# Create detailed cluster profiles
def profile_clusters(df, centers_df):
    profiles = {}
    for cluster in range(optimal_k):
        cluster_data = df[df['Cluster'] == cluster]
        profile = {
            'Size': len(cluster_data),
            'Percentage': f"{len(cluster_data) / len(df) * 100:.1f}%",
            'Avg_Age': f"{centers_df.loc[cluster, 'Age']:.0f}",
            'Avg_Income': f"${centers_df.loc[cluster, 'Annual_Income']:,.0f}",
            'Avg_Spending': f"{centers_df.loc[cluster, 'Spending_Score']:.0f}",
            'Avg_Frequency': f"{centers_df.loc[cluster, 'Purchase_Frequency']:.0f}",
            'Avg_Transaction': f"${centers_df.loc[cluster, 'Average_Transaction']:.0f}",
            'Avg_Tenure': f"{centers_df.loc[cluster, 'Tenure_Months']:.0f} months"
        }
        profiles[f'Cluster {cluster}'] = profile
    return pd.DataFrame(profiles).T
profile_df = profile_clusters(df, centers_df)
print("\n" + "="*80)
print("CUSTOMER SEGMENT PROFILES")
print("="*80)
print(profile_df)
# Name the segments based on their characteristics. Note: cluster indices
# are arbitrary, so verify each label against the profiles above before relying on it.
segment_names = {
    0: "Budget Conscious",
    1: "High-Value Customers",
    2: "Occasional Shoppers",
    3: "Loyal & Frequent"
}
df['Segment_Name'] = df['Cluster'].map(segment_names)
print("\n" + "="*80)
print("SEGMENT NAMING")
print("="*80)
for cluster, name in segment_names.items():
    count = len(df[df['Cluster'] == cluster])
    pct = count / len(df) * 100
    print(f"Cluster {cluster}: {name} ({count} customers, {pct:.1f}%)")
# Marketing recommendations
recommendations = {
    "Budget Conscious": [
        "Offer discount coupons and loyalty programs",
        "Promote value-for-money products",
        "Send budget-friendly bundle offers"
    ],
    "High-Value Customers": [
        "Provide premium services and exclusive access",
        "Offer personalized shopping experiences",
        "Invite to VIP events and early product launches"
    ],
    "Occasional Shoppers": [
        "Send targeted re-engagement campaigns",
        "Offer limited-time promotions",
        "Provide incentives for repeat purchases"
    ],
    "Loyal & Frequent": [
        "Reward with points and cashback",
        "Provide early access to sales",
        "Create referral program incentives"
    ]
}
print("\n" + "="*80)
print("MARKETING RECOMMENDATIONS")
print("="*80)
for segment, recs in recommendations.items():
    print(f"\n{segment}:")
    for rec in recs:
        print(f"  • {rec}")
📈 Step 9: Model Evaluation
# Silhouette analysis for chosen k
from sklearn.metrics import silhouette_samples
silhouette_avg = silhouette_score(X_scaled, clusters)
sample_silhouette_values = silhouette_samples(X_scaled, clusters)
print(f"Average Silhouette Score: {silhouette_avg:.3f}")
# Visualize silhouette scores
fig, ax = plt.subplots(figsize=(10, 7))
y_lower = 10
for i in range(optimal_k):
    # Silhouette values for cluster i, sorted for a smooth profile
    cluster_silhouette_values = sample_silhouette_values[clusters == i]
    cluster_silhouette_values.sort()
    size_cluster_i = cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i
    color = plt.cm.viridis(float(i) / optimal_k)
    ax.fill_betweenx(np.arange(y_lower, y_upper),
                     0, cluster_silhouette_values,
                     facecolor=color, edgecolor=color, alpha=0.7)
    ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
    y_lower = y_upper + 10
ax.set_xlabel("Silhouette Coefficient")
ax.set_ylabel("Cluster")
ax.axvline(x=silhouette_avg, color="red", linestyle="--",
           label=f'Average: {silhouette_avg:.3f}')
ax.set_title("Silhouette Plot for Customer Segments")
ax.legend()
plt.tight_layout()
plt.show()
# Davies-Bouldin Index (lower is better)
from sklearn.metrics import davies_bouldin_score
db_score = davies_bouldin_score(X_scaled, clusters)
print(f"\nDavies-Bouldin Index: {db_score:.3f} (lower is better)")
# Calinski-Harabasz Index (higher is better)
from sklearn.metrics import calinski_harabasz_score
ch_score = calinski_harabasz_score(X_scaled, clusters)
print(f"Calinski-Harabasz Score: {ch_score:.2f} (higher is better)")
💾 Step 10: Save Results
import joblib
# Save the model and scaler
joblib.dump(kmeans, 'customer_segmentation_model.pkl')
joblib.dump(scaler, 'feature_scaler.pkl')
print("Model saved successfully!")
# Save segmented customer data
df.to_csv('customer_segments.csv', index=False)
print("Customer segments saved to CSV!")
# Function to predict segment for new customers
def predict_segment(new_customer_data):
    """
    Predict the segment for a single new customer.
    new_customer_data: dict with keys matching feature_columns.
    """
    # Convert to a 2-D array in the training column order
    features = np.array([[new_customer_data[col] for col in feature_columns]])
    # Apply the same scaling used at training time
    features_scaled = scaler.transform(features)
    # Assign to the nearest cluster center
    cluster = kmeans.predict(features_scaled)[0]
    segment_name = segment_names[cluster]
    return cluster, segment_name
# Example: Predict for new customer
new_customer = {
    'Age': 35,
    'Annual_Income': 85000,
    'Spending_Score': 75,
    'Purchase_Frequency': 25,
    'Average_Transaction': 250,
    'Tenure_Months': 24
}
cluster, segment = predict_segment(new_customer)
print(f"\nNew customer belongs to:")
print(f" Cluster: {cluster}")
print(f" Segment: {segment}")
🎓 Key Takeaways
- K-Means Clustering: Effective unsupervised learning for customer segmentation
- Feature Scaling: Critical for K-Means to work properly
- Elbow Method: Helps determine optimal number of clusters
- Business Value: Enables targeted marketing and personalization
- Multiple Metrics: Validate with silhouette score, Davies-Bouldin, and Calinski-Harabasz indices
- Interpretability: Cluster centers provide actionable insights
🚀 Next Steps
- Try other clustering algorithms (DBSCAN, hierarchical clustering)
- Experiment with different feature combinations
- Add more behavioral features (product preferences, channel usage)
- Implement real-time segmentation with streaming data
- Build A/B tests to validate marketing strategies
- Create a dashboard to monitor segment evolution