Why Data Preprocessing?
Real-world data is messy! Before feeding data to ML models, we need to clean and transform it. Good preprocessing can dramatically improve model performance.
💡 Data scientists are commonly estimated to spend 60-80% of their time on data preparation!
Common Data Problems
- ❌ Missing Values: empty cells or NULL values
- 📏 Different Scales: age (0-100) vs. salary (0-1M)
- 🔤 Text Data: categories need numeric encoding
- 📊 Outliers: extreme values that skew results
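A quick inspection usually surfaces all four problems before any modeling. Here is a minimal sketch, assuming the dataset has been loaded into a pandas DataFrame (the toy values are made up for illustration):

# Quick data inspection (a minimal sketch with made-up values)
import pandas as pd

df = pd.DataFrame({
    'age': [25, None, 120, 35],             # a missing value and a suspicious outlier
    'salary': [50000, 60000, 55000, None],  # a very different scale than age
    'city': ['NYC', 'LA', None, 'NYC']      # text data that will need encoding
})

print(df.isna().sum())   # missing values per column
print(df.describe())     # ranges reveal scale differences and outliers
print(df.dtypes)         # object columns are text/categorical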
1. Handling Missing Data
# Handling missing values
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
# Sample data with missing values
data = pd.DataFrame({
    'age': [25, 30, np.nan, 35, 40],
    'salary': [50000, 60000, 55000, np.nan, 75000],
    'city': ['NYC', 'LA', np.nan, 'Chicago', 'NYC']
})
print("Original data:")
print(data)
# Strategy 1: Remove rows with missing values
clean_data = data.dropna()
print("\nAfter dropping rows with NaN:")
print(clean_data)
# Strategy 2: Fill with mean (for numeric columns)
imputer = SimpleImputer(strategy='mean')
data[['age', 'salary']] = imputer.fit_transform(data[['age', 'salary']])
print("\nAfter filling with mean:")
print(data)
# Strategy 3: Fill with most frequent (for categorical)
data['city'] = data['city'].fillna(data['city'].mode()[0])
print("\nFinal clean data:")
print(data)
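As an alternative sketch (not part of the original example), SimpleImputer can also fill categorical columns with strategy='most_frequent', which is equivalent to the manual mode() fill above:

# Categorical imputation with SimpleImputer (alternative sketch to the manual mode fill)
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

cities = pd.DataFrame({'city': ['NYC', 'LA', np.nan, 'Chicago', 'NYC']})

cat_imputer = SimpleImputer(strategy='most_frequent')
cities['city'] = cat_imputer.fit_transform(cities[['city']]).ravel()
print(cities)  # NaN replaced by the most frequent city ('NYC')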
2. Feature Scaling
Many ML algorithms, especially distance-based and gradient-based ones, work better when features are on similar scales.
# Feature Scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np
# Sample data: age (20-70) and salary (30k-150k)
data = np.array([
    [25, 50000],
    [35, 60000],
    [45, 80000],
    [55, 120000],
    [65, 150000]
])
print("Original data:")
print(data)
# Method 1: Standardization (mean=0, std=1)
scaler = StandardScaler()
standardized = scaler.fit_transform(data)
print("\nStandardized (Z-score):")
print(standardized)
print(f"Mean: {standardized.mean(axis=0)}")
print(f"Std: {standardized.std(axis=0)}")
# Method 2: Normalization (scale to 0-1)
normalizer = MinMaxScaler()
normalized = normalizer.fit_transform(data)
print("\nNormalized (0-1 range):")
print(normalized)
print(f"Min: {normalized.min(axis=0)}")
print(f"Max: {normalized.max(axis=0)}")
3. Encoding Categorical Data
Convert text categories to numbers that ML models can understand.
# Encoding Categorical Variables
from sklearn.preprocessing import LabelEncoder
import pandas as pd
# Sample data
data = pd.DataFrame({
    'color': ['red', 'blue', 'green', 'blue', 'red'],
    'size': ['S', 'M', 'L', 'M', 'S'],
    'price': [10, 15, 20, 15, 10]
})
print("Original data:")
print(data)
# Method 1: Label Encoding (for ordinal data)
# Note: LabelEncoder assigns integers alphabetically (here L=0, M=1, S=2),
# so it does not preserve the S < M < L order by itself
label_encoder = LabelEncoder()
data['size_encoded'] = label_encoder.fit_transform(data['size'])
print("\nLabel Encoded size (S=2, M=1, L=0):")
print(data[['size', 'size_encoded']])
# Method 2: One-Hot Encoding (for nominal data)
color_encoded = pd.get_dummies(data['color'], prefix='color')
print("\nOne-Hot Encoded color:")
print(color_encoded)
# Combine with original data
data_final = pd.concat([data, color_encoded], axis=1)
print("\nFinal encoded data:")
print(data_final)
When to use which encoding?
- Label Encoding: Ordinal data (Small < Medium < Large); to encode a specific order explicitly, use OrdinalEncoder as in the sketch below
- One-Hot Encoding: Nominal data (Red, Blue, Green - no inherent order)
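Because LabelEncoder orders labels alphabetically rather than by meaning, OrdinalEncoder with an explicit category list is the usual way to get a true S < M < L mapping. A minimal sketch:

# Ordinal encoding with an explicit category order (a minimal sketch)
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

sizes = pd.DataFrame({'size': ['S', 'M', 'L', 'M', 'S']})

encoder = OrdinalEncoder(categories=[['S', 'M', 'L']])  # enforce S < M < L
sizes['size_encoded'] = encoder.fit_transform(sizes[['size']]).ravel()
print(sizes)  # S=0, M=1, L=2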
4. Splitting Data
Always split your data into training and testing sets!
# Train-Test Split
from sklearn.model_selection import train_test_split
import numpy as np
# Sample dataset
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]) # Features
y = np.array([0, 0, 1, 1, 1]) # Labels
# Split: 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,    # 20% for testing
    random_state=42   # for reproducibility
)
print("Training set:")
print(f"X_train: {X_train.shape}")
print(f"y_train: {y_train.shape}")
print("\nTest set:")
print(f"X_test: {X_test.shape}")
print(f"y_test: {y_test.shape}")
# Output:
# Training set:
# X_train: (4, 2)
# y_train: (4,)
# Test set:
# X_test: (1, 2)
# y_test: (1,)
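With small or imbalanced label sets, a purely random split can leave one class missing from the test set; passing the labels to the stratify parameter keeps the class proportions similar in both splits. A sketch with made-up data (the toy arrays above are too small for stratification):

# Stratified split keeps class proportions similar in both sets (a sketch with made-up data)
import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(20).reshape(10, 2)              # 10 samples, 2 features
y = np.array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1])  # imbalanced labels (70% / 30%)

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y        # preserve the 70/30 class ratio in train and test
)
print(y_train, y_test)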
5. Feature Engineering
Creating new features from existing ones to improve model performance.
# Feature Engineering Example
import pandas as pd
# Original data
data = pd.DataFrame({
    'length': [10, 15, 20, 25],
    'width': [5, 7, 10, 12],
    'height': [3, 4, 5, 6]
})
# Create new features
data['area'] = data['length'] * data['width']
data['volume'] = data['length'] * data['width'] * data['height']
data['aspect_ratio'] = data['length'] / data['width']
data['is_tall'] = (data['height'] > 5).astype(int)
print("Enhanced dataset with engineered features:")
print(data)
# Output shows original features plus:
# - area: derived from length × width
# - volume: derived from all three dimensions
# - aspect_ratio: relationship between dimensions
# - is_tall: binary feature based on condition
Complete Preprocessing Pipeline
# Complete Preprocessing Example
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# Load sample data
data = pd.DataFrame({
    'age': [25, 30, np.nan, 35, 40, 45, 50, 55],
    'income': [50000, 60000, 55000, np.nan, 75000, 80000, 90000, 100000],
    'education': ['Bachelor', 'Master', 'PhD', 'Bachelor', 'Master',
                  'PhD', 'Bachelor', 'Master'],
    'purchased': [0, 0, 1, 0, 1, 1, 1, 1]
})
print("Step 1: Original data")
print(data.head())
# Step 2: Handle missing values
# (For simplicity the imputer is fit on the full dataset here; in a strict
# workflow, fit it on the training split only to avoid leakage, as noted in Best Practices)
imputer = SimpleImputer(strategy='mean')
data[['age', 'income']] = imputer.fit_transform(data[['age', 'income']])
print("\nStep 2: Missing values filled")
# Step 3: Encode categorical variables
education_encoded = pd.get_dummies(data['education'], prefix='edu')
data = pd.concat([data, education_encoded], axis=1)
data.drop('education', axis=1, inplace=True)
print("\nStep 3: Categorical variables encoded")
# Step 4: Split features and target
X = data.drop('purchased', axis=1)
y = data['purchased']
# Step 5: Split into train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Step 6: Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("\nStep 6: Data ready for modeling!")
print(f"Training set shape: {X_train_scaled.shape}")
print(f"Test set shape: {X_test_scaled.shape}")
Best Practices
- ✅ Always split data before scaling (to prevent data leakage)
- ✅ Fit scalers/encoders on training data only
- ✅ Apply same transformations to test data
- ✅ Document your preprocessing steps
- ✅ Keep a copy of original data
- ✅ Understand your data before preprocessing
- ❌ Don't scale before splitting (causes data leakage)
- ❌ Don't remove outliers without investigating them first (see the sketch after this list)
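Investigating outliers often starts with a simple rule of thumb such as the IQR rule. A minimal sketch with made-up values, meant for inspection rather than automatic removal:

# Flagging potential outliers with the IQR rule (a sketch with made-up values)
import pandas as pd

values = pd.Series([48, 50, 52, 51, 49, 53, 50, 250])  # 250 looks suspicious

q1, q3 = values.quantile(0.25), values.quantile(0.75)
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr

outliers = values[(values < lower) | (values > upper)]
print(outliers)  # inspect these before deciding to drop, cap, or keep them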
Real-World Example: Customer Data
# Preprocessing customer churn dataset
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Simulated customer data
data = pd.DataFrame({
    'age': [25, 35, 45, 22, 50, 30, 40],
    'months_subscribed': [12, 24, 6, 36, 18, 48, 3],
    'monthly_spend': [50, 120, 80, 150, 90, 200, 40],
    'support_tickets': [0, 2, 5, 1, 3, 0, 8],
    'churned': [0, 0, 1, 0, 0, 0, 1]  # 1 = customer left
})
# Feature engineering
data['total_spend'] = data['monthly_spend'] * data['months_subscribed']
data['ticket_rate'] = data['support_tickets'] / data['months_subscribed']
# Prepare for modeling
X = data.drop('churned', axis=1)
y = data['churned']
# Split and scale
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Preprocessed customer data ready for churn prediction!")
print(f"Features: {X.columns.tolist()}")
print(f"Training samples: {len(X_train_scaled)}")
print(f"Test samples: {len(X_test_scaled)}")