Why Data Preprocessing?
Real-world data is messy! Before feeding data to ML models, we need to clean and transform it. Good preprocessing can dramatically improve model performance.
💡 Data scientists are commonly estimated to spend 60-80% of their time on data preparation!
Common Data Problems
- ❌ Missing Values: empty cells or NULL values
- 📏 Different Scales: age (0-100) vs. salary (0-1M)
- 🔤 Text Data: categories need numeric encoding
- 📊 Outliers: extreme values that skew results
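A quick inspection usually surfaces all four problems before any modeling. Here is a minimal sketch, assuming the dataset has been loaded into a pandas DataFrame (the toy values are made up for illustration):

# Quick data inspection (a minimal sketch with made-up values)
import pandas as pd

df = pd.DataFrame({
    'age': [25, None, 120, 35],             # a missing value and a suspicious outlier
    'salary': [50000, 60000, 55000, None],  # a very different scale than age
    'city': ['NYC', 'LA', None, 'NYC']      # text data that will need encoding
})

print(df.isna().sum())   # missing values per column
print(df.describe())     # ranges reveal scale differences and outliers
print(df.dtypes)         # object columns are text/categorical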
1. Handling Missing Data
# Handling missing values
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
# Sample data with missing values
data = pd.DataFrame({
    'age': [25, 30, np.nan, 35, 40],
    'salary': [50000, 60000, 55000, np.nan, 75000],
    'city': ['NYC', 'LA', np.nan, 'Chicago', 'NYC']
})
print("Original data:")
print(data)
# Strategy 1: Remove rows with missing values
clean_data = data.dropna()
print("\nAfter dropping rows with NaN:")
print(clean_data)
# Strategy 2: Fill with mean (for numeric columns)
imputer = SimpleImputer(strategy='mean')
data[['age', 'salary']] = imputer.fit_transform(data[['age', 'salary']])
print("\nAfter filling with mean:")
print(data)
# Strategy 3: Fill with most frequent (for categorical)
data['city'] = data['city'].fillna(data['city'].mode()[0])
print("\nFinal clean data:")
print(data)
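As an alternative sketch (not part of the original example), SimpleImputer can also fill categorical columns with strategy='most_frequent', which is equivalent to the manual mode() fill above:

# Categorical imputation with SimpleImputer (alternative sketch to the manual mode fill)
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

cities = pd.DataFrame({'city': ['NYC', 'LA', np.nan, 'Chicago', 'NYC']})

cat_imputer = SimpleImputer(strategy='most_frequent')
cities['city'] = cat_imputer.fit_transform(cities[['city']]).ravel()
print(cities)  # NaN replaced by the most frequent city ('NYC')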
2. Feature Scaling
Many ML algorithms, especially distance-based and gradient-based ones, work better when features are on similar scales.
# Feature Scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np
# Sample data: age (20-70) and salary (30k-150k)
data = np.array([
    [25, 50000],
    [35, 60000],
    [45, 80000],
    [55, 120000],
    [65, 150000]
])
print("Original data:")
print(data)
# Method 1: Standardization (mean=0, std=1)
scaler = StandardScaler()
standardized = scaler.fit_transform(data)
print("\nStandardized (Z-score):")
print(standardized)
print(f"Mean: {standardized.mean(axis=0)}")
print(f"Std: {standardized.std(axis=0)}")
# Method 2: Normalization (scale to 0-1)
normalizer = MinMaxScaler()
normalized = normalizer.fit_transform(data)
print("\nNormalized (0-1 range):")
print(normalized)
print(f"Min: {normalized.min(axis=0)}")
print(f"Max: {normalized.max(axis=0)}")
3. Encoding Categorical Data
Convert text categories to numbers that ML models can understand.
# Encoding Categorical Variables
from sklearn.preprocessing import LabelEncoder
import pandas as pd
# Sample data
data = pd.DataFrame({
    'color': ['red', 'blue', 'green', 'blue', 'red'],
    'size': ['S', 'M', 'L', 'M', 'S'],
    'price': [10, 15, 20, 15, 10]
})
print("Original data:")
print(data)
# Method 1: Label Encoding (for ordinal data)
# Note: LabelEncoder assigns integers alphabetically (here L=0, M=1, S=2),
# so it does not preserve the S < M < L order by itself
label_encoder = LabelEncoder()
data['size_encoded'] = label_encoder.fit_transform(data['size'])
print("\nLabel Encoded size (S=2, M=1, L=0):")
print(data[['size', 'size_encoded']])
# Method 2: One-Hot Encoding (for nominal data)
color_encoded = pd.get_dummies(data['color'], prefix='color')
print("\nOne-Hot Encoded color:")
print(color_encoded)
# Combine with original data
data_final = pd.concat([data, color_encoded], axis=1)
print("\nFinal encoded data:")
print(data_final)
When to use which encoding?
- Label Encoding: Ordinal data (Small < Medium < Large); to encode a specific order explicitly, use OrdinalEncoder as in the sketch below
- One-Hot Encoding: Nominal data (Red, Blue, Green - no inherent order)
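Because LabelEncoder orders labels alphabetically rather than by meaning, OrdinalEncoder with an explicit category list is the usual way to get a true S < M < L mapping. A minimal sketch:

# Ordinal encoding with an explicit category order (a minimal sketch)
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

sizes = pd.DataFrame({'size': ['S', 'M', 'L', 'M', 'S']})

encoder = OrdinalEncoder(categories=[['S', 'M', 'L']])  # enforce S < M < L
sizes['size_encoded'] = encoder.fit_transform(sizes[['size']]).ravel()
print(sizes)  # S=0, M=1, L=2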
4. Splitting Data
Always split your data into training and testing sets!
# Train-Test Split
from sklearn.model_selection import train_test_split
import numpy as np
# Sample dataset
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]) # Features
y = np.array([0, 0, 1, 1, 1]) # Labels
# Split: 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,    # 20% for testing
    random_state=42   # for reproducibility
)
print("Training set:")
print(f"X_train: {X_train.shape}")
print(f"y_train: {y_train.shape}")
print("\nTest set:")
print(f"X_test: {X_test.shape}")
print(f"y_test: {y_test.shape}")
# Output:
# Training set:
# X_train: (4, 2)
# y_train: (4,)
# Test set:
# X_test: (1, 2)
# y_test: (1,)
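With small or imbalanced label sets, a purely random split can leave one class missing from the test set; passing the labels to the stratify parameter keeps the class proportions similar in both splits. A sketch with made-up data (the toy arrays above are too small for stratification):

# Stratified split keeps class proportions similar in both sets (a sketch with made-up data)
import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(20).reshape(10, 2)              # 10 samples, 2 features
y = np.array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1])  # imbalanced labels (70% / 30%)

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y        # preserve the 70/30 class ratio in train and test
)
print(y_train, y_test)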
5. Feature Engineering
Creating new features from existing ones to improve model performance.
# Feature Engineering Example
import pandas as pd
# Original data
data = pd.DataFrame({
    'length': [10, 15, 20, 25],
    'width': [5, 7, 10, 12],
    'height': [3, 4, 5, 6]
})
# Create new features
data['area'] = data['length'] * data['width']
data['volume'] = data['length'] * data['width'] * data['height']
data['aspect_ratio'] = data['length'] / data['width']
data['is_tall'] = (data['height'] > 5).astype(int)
print("Enhanced dataset with engineered features:")
print(data)
# Output shows original features plus:
# - area: derived from length × width
# - volume: derived from all three dimensions
# - aspect_ratio: relationship between dimensions
# - is_tall: binary feature based on condition
Complete Preprocessing Pipeline
# Complete Preprocessing Example
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# Load sample data
data = pd.DataFrame({
    'age': [25, 30, np.nan, 35, 40, 45, 50, 55],
    'income': [50000, 60000, 55000, np.nan, 75000, 80000, 90000, 100000],
    'education': ['Bachelor', 'Master', 'PhD', 'Bachelor', 'Master',
                  'PhD', 'Bachelor', 'Master'],
    'purchased': [0, 0, 1, 0, 1, 1, 1, 1]
})
print("Step 1: Original data")
print(data.head())
# Step 2: Handle missing values
# (For simplicity the imputer is fit on the full dataset here; in a strict
# workflow, fit it on the training split only to avoid leakage, as noted in Best Practices)
imputer = SimpleImputer(strategy='mean')
data[['age', 'income']] = imputer.fit_transform(data[['age', 'income']])
print("\nStep 2: Missing values filled")
# Step 3: Encode categorical variables
education_encoded = pd.get_dummies(data['education'], prefix='edu')
data = pd.concat([data, education_encoded], axis=1)
data.drop('education', axis=1, inplace=True)
print("\nStep 3: Categorical variables encoded")
# Step 4: Split features and target
X = data.drop('purchased', axis=1)
y = data['purchased']
# Step 5: Split into train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Step 6: Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("\nStep 6: Data ready for modeling!")
print(f"Training set shape: {X_train_scaled.shape}")
print(f"Test set shape: {X_test_scaled.shape}")
Best Practices
- ✅ Always split data before scaling (to prevent data leakage)
- ✅ Fit scalers/encoders on training data only
- ✅ Apply same transformations to test data
- ✅ Document your preprocessing steps
- ✅ Keep a copy of original data
- ✅ Understand your data before preprocessing
- ❌ Don't scale before splitting (causes data leakage)
- ❌ Don't remove outliers without investigating them first (see the sketch after this list)
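Investigating outliers often starts with a simple rule of thumb such as the IQR rule. A minimal sketch with made-up values, meant for inspection rather than automatic removal:

# Flagging potential outliers with the IQR rule (a sketch with made-up values)
import pandas as pd

values = pd.Series([48, 50, 52, 51, 49, 53, 50, 250])  # 250 looks suspicious

q1, q3 = values.quantile(0.25), values.quantile(0.75)
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr

outliers = values[(values < lower) | (values > upper)]
print(outliers)  # inspect these before deciding to drop, cap, or keep them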
Real-World Example: Customer Data
# Preprocessing customer churn dataset
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Simulated customer data
data = pd.DataFrame({
    'age': [25, 35, 45, 22, 50, 30, 40],
    'months_subscribed': [12, 24, 6, 36, 18, 48, 3],
    'monthly_spend': [50, 120, 80, 150, 90, 200, 40],
    'support_tickets': [0, 2, 5, 1, 3, 0, 8],
    'churned': [0, 0, 1, 0, 0, 0, 1]  # 1 = customer left
})
# Feature engineering
data['total_spend'] = data['monthly_spend'] * data['months_subscribed']
data['ticket_rate'] = data['support_tickets'] / data['months_subscribed']
# Prepare for modeling
X = data.drop('churned', axis=1)
y = data['churned']
# Split and scale
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Preprocessed customer data ready for churn prediction!")
print(f"Features: {X.columns.tolist()}")
print(f"Training samples: {len(X_train_scaled)}")
print(f"Test samples: {len(X_test_scaled)}")