What are CNNs?
Convolutional Neural Networks (CNNs) are specialized deep learning architectures designed for processing grid-like data, especially images. They automatically learn hierarchical feature representations, from simple edges in early layers to more complex patterns in deeper ones.
Key Components:
- Convolutional Layers: Learn spatial features (edges, textures, patterns)
- Pooling Layers: Reduce spatial dimensions, increase invariance
- Fully Connected Layers: Make final classification decisions
- Activation Functions: Add non-linearity (ReLU most common)
🔍 How Convolution Works
A filter (kernel) slides over the input image; at each position, the filter's values are multiplied element-wise with the pixels beneath it and summed to produce a single output value. (Strictly speaking, deep learning frameworks compute cross-correlation, since the kernel is never flipped, but the name "convolution" has stuck.)
import numpy as np
# Example: 5x5 image
image = np.array([
    [1, 2, 3, 0, 1],
    [0, 1, 2, 3, 0],
    [1, 0, 1, 2, 3],
    [2, 1, 0, 1, 2],
    [3, 2, 1, 0, 1]
])
# 3x3 edge detection filter
kernel = np.array([
    [-1, -1, -1],
    [ 0,  0,  0],
    [ 1,  1,  1]
])
# Manual convolution (simplified)
def convolve2d(image, kernel):
    img_h, img_w = image.shape
    ker_h, ker_w = kernel.shape
    out_h = img_h - ker_h + 1
    out_w = img_w - ker_w + 1
    output = np.zeros((out_h, out_w))
    for i in range(out_h):
        for j in range(out_w):
            region = image[i:i+ker_h, j:j+ker_w]
            output[i, j] = np.sum(region * kernel)
    return output
result = convolve2d(image, kernel)
print("Convolved output:")
print(result)
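As a sanity check, the output spatial size follows the standard formula out = (in - kernel + 2*padding) // stride + 1. With a 5x5 input, a 3x3 kernel, no padding, and stride 1, this gives (5 - 3 + 0) // 1 + 1 = 3:

# (5 - 3 + 2*0) // 1 + 1 = 3, so the result is 3x3
assert result.shape == (3, 3)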
🏗️ Building a CNN with TensorFlow/Keras
Simple CNN for MNIST
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt
# Load MNIST dataset
(X_train, y_train), (X_test, y_test) = keras.datasets.mnist.load_data()
# Preprocess
X_train = X_train.reshape(-1, 28, 28, 1).astype('float32') / 255.0
X_test = X_test.reshape(-1, 28, 28, 1).astype('float32') / 255.0
y_train = keras.utils.to_categorical(y_train, 10)
y_test = keras.utils.to_categorical(y_test, 10)
print(f"Training shape: {X_train.shape}") # (60000, 28, 28, 1)
print(f"Test shape: {X_test.shape}") # (10000, 28, 28, 1)
# Build CNN
model = keras.Sequential([
    # First convolutional block
    layers.Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(28, 28, 1)),
    layers.MaxPooling2D(pool_size=(2, 2)),
    # Second convolutional block
    layers.Conv2D(64, kernel_size=(3, 3), activation='relu'),
    layers.MaxPooling2D(pool_size=(2, 2)),
    # Flatten and dense layers
    layers.Flatten(),
    layers.Dropout(0.5),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(10, activation='softmax')
])
# Compile
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
# Model summary
model.summary()
Train the Model
# Train
history = model.fit(
    X_train, y_train,
    batch_size=128,
    epochs=10,
    validation_split=0.1,
    verbose=1
)
# Evaluate
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
print(f"\nTest accuracy: {test_acc:.4f}")
# Plot training history
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Training')
plt.plot(history.history['val_accuracy'], label='Validation')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training')
plt.plot(history.history['val_loss'], label='Validation')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()
🔧 CNN Layers Explained
1. Convolutional Layer
# Conv2D parameters
layers.Conv2D(
    filters=32,                           # Number of output filters
    kernel_size=(3, 3),                   # Filter size (3x3)
    strides=(1, 1),                       # Step size (default 1)
    padding='valid',                      # 'valid' = no padding; 'same' = pad so output matches input size (at stride 1)
    activation='relu',                    # Activation function
    use_bias=True,                        # Add bias term
    kernel_initializer='glorot_uniform'   # Weight initialization
)
# Example: Different configurations
layers.Conv2D(64, (5, 5), activation='relu') # 64 filters, 5x5 kernel
layers.Conv2D(128, (3, 3), padding='same') # Same padding
layers.Conv2D(256, (1, 1)) # 1x1 conv (channel mixing)
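A handy back-of-the-envelope check (a standard formula, not specific to Keras): a Conv2D layer has (kernel_h * kernel_w * input_channels + 1) * filters parameters, the +1 being the bias. For example:

def conv2d_params(kh, kw, in_ch, filters, bias=True):
    # Each filter has kh*kw*in_ch weights, plus one bias if enabled
    return (kh * kw * in_ch + int(bias)) * filters

print(conv2d_params(3, 3, 1, 32))   # 320   (first conv in the MNIST model above)
print(conv2d_params(3, 3, 32, 64))  # 18496 (second conv in the MNIST model above)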
2. Pooling Layers
# Max Pooling (most common)
layers.MaxPooling2D(
    pool_size=(2, 2),  # 2x2 window
    strides=None,      # Default = pool_size
    padding='valid'
)
# Average Pooling
layers.AveragePooling2D(pool_size=(2, 2))
# Global pooling (reduces to 1x1)
layers.GlobalMaxPooling2D()
layers.GlobalAveragePooling2D()
# Why pooling?
# - Reduces spatial dimensions (fewer parameters)
# - Increases receptive field
# - Translation invariance
# - Controls overfitting
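To see the dimension reduction concretely, here is a quick shape check (a small sketch, assuming the tf and layers imports from earlier; the input shape mimics the output of the first Conv2D in the MNIST model):

x = tf.random.normal((1, 26, 26, 32))         # (batch, height, width, channels)
y = layers.MaxPooling2D(pool_size=(2, 2))(x)
print(y.shape)                                # (1, 13, 13, 32): spatial dims halved, channels unchanged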
3. Batch Normalization
# Add after Conv layers for faster training
model = keras.Sequential([
    layers.Conv2D(32, (3, 3), input_shape=(28, 28, 1)),
    layers.BatchNormalization(),  # Normalize activations
    layers.Activation('relu'),
    layers.MaxPooling2D((2, 2))
])
# Benefits:
# - Faster training
# - Higher learning rates possible
# - Less sensitive to initialization
# - Regularization effect
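Conceptually, batch normalization standardizes each channel over the batch and then rescales it with two learned parameters, gamma and beta. A rough NumPy sketch of the training-time computation (simplified; the real layer also tracks running statistics for inference):

import numpy as np

def batch_norm(x, gamma=1.0, beta=0.0, eps=1e-3):
    # Normalize per channel over the batch and spatial axes (training mode)
    mean = x.mean(axis=(0, 1, 2), keepdims=True)
    var = x.var(axis=(0, 1, 2), keepdims=True)
    return gamma * (x - mean) / np.sqrt(var + eps) + beta

activations = np.random.randn(8, 28, 28, 32)   # (batch, height, width, channels)
normed = batch_norm(activations)
print(normed.mean(), normed.std())             # approximately 0 and 1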
🎨 CNN for Color Images (CIFAR-10)
# Load CIFAR-10 (32x32 RGB images)
(X_train, y_train), (X_test, y_test) = keras.datasets.cifar10.load_data()
# Normalize
X_train = X_train.astype('float32') / 255.0
X_test = X_test.astype('float32') / 255.0
y_train = keras.utils.to_categorical(y_train, 10)
y_test = keras.utils.to_categorical(y_test, 10)
# Deeper CNN for CIFAR-10
model = keras.Sequential([
    # Block 1
    layers.Conv2D(32, (3, 3), padding='same', input_shape=(32, 32, 3)),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Conv2D(32, (3, 3), padding='same'),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Dropout(0.2),
    # Block 2
    layers.Conv2D(64, (3, 3), padding='same'),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Conv2D(64, (3, 3), padding='same'),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Dropout(0.3),
    # Block 3
    layers.Conv2D(128, (3, 3), padding='same'),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Conv2D(128, (3, 3), padding='same'),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Dropout(0.4),
    # Classifier
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    layers.Dense(10, activation='softmax')
])
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
# Train with data augmentation
datagen = keras.preprocessing.image.ImageDataGenerator(
    rotation_range=15,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True
)
history = model.fit(
    datagen.flow(X_train, y_train, batch_size=64),
    epochs=50,
    validation_data=(X_test, y_test),
    callbacks=[
        keras.callbacks.ReduceLROnPlateau(patience=5),
        keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
    ]
)
🚀 Transfer Learning
Use pre-trained models for faster training and better performance!
# Load pre-trained VGG16 (trained on ImageNet)
base_model = keras.applications.VGG16(
    weights='imagenet',
    include_top=False,  # Exclude the ImageNet classification head
    input_shape=(224, 224, 3)
)
# Freeze base model weights
base_model.trainable = False
# Add custom classifier
model = keras.Sequential([
    base_model,
    layers.GlobalAveragePooling2D(),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(10, activation='softmax')  # Your number of classes
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
# Train only the new layers (assumes X_train images are already resized to 224x224x3)
model.fit(X_train, y_train, epochs=10, validation_split=0.2)
# Optionally: Fine-tune by unfreezing some layers
base_model.trainable = True
# Freeze all but last 4 layers
for layer in base_model.layers[:-4]:
    layer.trainable = False
model.compile(
    optimizer=keras.optimizers.Adam(1e-5),  # Lower learning rate for fine-tuning
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
model.fit(X_train, y_train, epochs=5, validation_split=0.2)
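One detail not shown above: each keras.applications model expects inputs preprocessed the way it was trained, so prefer the model's own preprocess_input over a plain /255 scaling. A minimal sketch for VGG16 (the random array is just a placeholder batch):

import numpy as np
from tensorflow.keras.applications.vgg16 import preprocess_input

# preprocess_input expects raw pixel values in [0, 255] and applies the
# ImageNet mean subtraction / channel reordering VGG16 was trained with
raw = np.random.uniform(0, 255, size=(4, 224, 224, 3)).astype('float32')  # placeholder batch
ready = preprocess_input(raw)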
Popular Pre-trained Models
# Available in Keras
keras.applications.VGG16()
keras.applications.VGG19()
keras.applications.ResNet50()
keras.applications.ResNet101()
keras.applications.InceptionV3()
keras.applications.MobileNetV2()    # Lightweight, mobile-friendly
keras.applications.EfficientNetB0() # Strong accuracy/efficiency trade-off
# Usage example
base = keras.applications.ResNet50(weights='imagenet', include_top=False)
📊 Data Augmentation
from tensorflow.keras.preprocessing.image import ImageDataGenerator
# Create augmentation generator
datagen = ImageDataGenerator(
    rotation_range=20,       # Random rotation up to ±20 degrees
    width_shift_range=0.2,   # Horizontal shift up to 20%
    height_shift_range=0.2,  # Vertical shift up to 20%
    shear_range=0.2,         # Shear transformation
    zoom_range=0.2,          # Random zoom
    horizontal_flip=True,    # Random horizontal flip
    fill_mode='nearest'      # Fill empty pixels
)
# Fit to training data
datagen.fit(X_train)
# Train with augmentation
model.fit(
    datagen.flow(X_train, y_train, batch_size=32),
    steps_per_epoch=len(X_train) // 32,
    epochs=50,
    validation_data=(X_test, y_test)
)
# Visualize augmentation
fig, axes = plt.subplots(2, 5, figsize=(15, 6))
for i, ax in enumerate(axes.flat):
    augmented = datagen.random_transform(X_train[0])
    ax.imshow(augmented.squeeze(), cmap='gray')
    ax.axis('off')
plt.show()
🎯 Famous CNN Architectures
| Architecture | Year | Key Innovation | Depth |
|---|---|---|---|
| LeNet-5 | 1998 | Pioneering practical CNN (digit recognition) | 7 layers |
| AlexNet | 2012 | ReLU, Dropout, GPU training | 8 layers |
| VGG-16/19 | 2014 | Small 3x3 filters, deep network | 16-19 layers |
| Inception | 2014 | Multi-scale filters in parallel | 22 layers |
| ResNet | 2015 | Skip connections, very deep | 50-152 layers |
| MobileNet | 2017 | Depthwise separable convs | 28 layers |
| EfficientNet | 2019 | Compound scaling | Variable |
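To make ResNet's key innovation concrete, here is a minimal sketch of a residual block in the Keras functional API (an illustration, not the exact ResNet-50 block; it assumes the input already has `filters` channels so the addition lines up):

def residual_block(x, filters):
    shortcut = x
    # Main path: two 3x3 convolutions with batch norm
    y = layers.Conv2D(filters, (3, 3), padding='same')(x)
    y = layers.BatchNormalization()(y)
    y = layers.Activation('relu')(y)
    y = layers.Conv2D(filters, (3, 3), padding='same')(y)
    y = layers.BatchNormalization()(y)
    # Skip connection: add the input back before the final activation
    y = layers.Add()([shortcut, y])
    return layers.Activation('relu')(y)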
🔍 Visualizing CNN Features
# Visualize learned filters
def plot_filters(model, layer_name, n_filters=32):
    layer = model.get_layer(layer_name)
    filters, biases = layer.get_weights()
    # Normalize filters to [0, 1] for display
    f_min, f_max = filters.min(), filters.max()
    filters = (filters - f_min) / (f_max - f_min)
    # Plot
    n_cols = 8
    n_rows = n_filters // n_cols
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 8))
    for i, ax in enumerate(axes.flat):
        if i < n_filters:
            ax.imshow(filters[:, :, 0, i], cmap='gray')
        ax.axis('off')
    plt.show()

plot_filters(model, 'conv2d')
# Visualize feature maps
def visualize_activation(model, layer_name, img):
    # Create a model that outputs the intermediate layer
    activation_model = keras.Model(
        inputs=model.input,
        outputs=model.get_layer(layer_name).output
    )
    # Get activations for one image (add a batch dimension)
    activations = activation_model.predict(img[np.newaxis, ...])
    # Plot
    n_features = activations.shape[-1]
    n_cols = 8
    n_rows = n_features // n_cols
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 15))
    for i, ax in enumerate(axes.flat):
        if i < n_features:
            ax.imshow(activations[0, :, :, i], cmap='viridis')
        ax.axis('off')
    plt.show()

visualize_activation(model, 'conv2d', X_test[0])
💡 Best Practices
- Start simple: Begin with shallow CNN, add layers if needed
- Use transfer learning: Faster convergence, better performance
- Data augmentation: Essential for small datasets
- Batch normalization: Add after Conv layers for faster training
- Dropout: Use in fully connected layers (0.5 typical)
- ReLU activation: Default choice for hidden layers
- Adam optimizer: Good default, works well most of the time
- Early stopping: Monitor validation loss, stop when overfitting
- Learning rate scheduling: Reduce LR when plateauing (combined with early stopping in the sketch below)
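A minimal callback setup combining the last two practices (a sketch, mirroring the CIFAR-10 example above):

callbacks = [
    keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5)
]
model.fit(X_train, y_train, epochs=100, validation_split=0.1, callbacks=callbacks)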
⚠️ Common Pitfalls
- Not normalizing inputs: Always scale pixels to [0, 1]
- Too few training samples: Use data augmentation or transfer learning
- Forgetting to freeze: Freeze pre-trained layers initially
- Wrong input shape: Check model.summary() carefully
- Too large learning rate: Start with 0.001 for Adam
- Not using GPU: CNNs are slow on CPU (a quick check follows this list)
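A quick way to confirm TensorFlow can see a GPU:

import tensorflow as tf
print(tf.config.list_physical_devices('GPU'))  # an empty list means CPU-only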
🎯 Key Takeaways
- CNNs automatically learn hierarchical features from images
- Convolutional layers detect patterns, pooling reduces dimensions
- Transfer learning with pre-trained models saves time
- Data augmentation increases dataset size artificially
- Batch normalization speeds up training
- Dropout prevents overfitting in dense layers
- ResNet, VGG, EfficientNet are popular architectures