🖼️ Convolutional Neural Networks

Deep learning for computer vision

What are CNNs?

Convolutional Neural Networks are specialized deep learning architectures designed for processing grid-like data, especially images. They automatically learn hierarchical feature representations.

Key Components:

  • Convolutional Layers: Learn spatial features (edges, textures, patterns)
  • Pooling Layers: Reduce spatial dimensions, increase invariance
  • Fully Connected Layers: Make final classification decisions
  • Activation Functions: Add non-linearity (ReLU most common)

🔍 How Convolution Works

A filter (kernel) slides over the input image, performing element-wise multiplication and summing the results.

import numpy as np

# Example: 5x5 image
image = np.array([
    [1, 2, 3, 0, 1],
    [0, 1, 2, 3, 0],
    [1, 0, 1, 2, 3],
    [2, 1, 0, 1, 2],
    [3, 2, 1, 0, 1]
])

# 3x3 edge detection filter
# (Prewitt-style vertical gradient: each output is "sum of the row below
# minus sum of the row above", so it responds to horizontal edges)
kernel = np.array([
    [-1, -1, -1],
    [ 0,  0,  0],
    [ 1,  1,  1]
])

# Manual convolution (simplified)
# Manual convolution (simplified)
def convolve2d(image, kernel, stride=1):
    """Slide `kernel` over `image` and sum the element-wise products.

    Note: like deep-learning "convolution" layers, this is technically
    cross-correlation — the kernel is NOT flipped. Uses 'valid' padding,
    so the output shrinks by (kernel size - 1) along each axis.

    Args:
        image: 2D array (H, W).
        kernel: 2D array (kh, kw), no larger than `image`.
        stride: step size between window positions (default 1, matching
            the original behavior).

    Returns:
        2D float array of shape
        ((H - kh) // stride + 1, (W - kw) // stride + 1).

    Raises:
        ValueError: if the kernel is larger than the image.
    """
    img_h, img_w = image.shape
    ker_h, ker_w = kernel.shape

    if ker_h > img_h or ker_w > img_w:
        raise ValueError("kernel must not be larger than image")

    out_h = (img_h - ker_h) // stride + 1
    out_w = (img_w - ker_w) // stride + 1

    output = np.zeros((out_h, out_w))

    for i in range(out_h):
        for j in range(out_w):
            # Top-left corner of the current window
            r, c = i * stride, j * stride
            region = image[r:r + ker_h, c:c + ker_w]
            output[i, j] = np.sum(region * kernel)

    return output

# 'valid' convolution of a 5x5 image with a 3x3 kernel gives a 3x3 output
result = convolve2d(image, kernel)
print("Convolved output:")
print(result)

🏗️ Building a CNN with TensorFlow/Keras

Simple CNN for MNIST

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt

# Load MNIST dataset
(X_train, y_train), (X_test, y_test) = keras.datasets.mnist.load_data()

# Preprocess: add a trailing channel axis (Conv2D expects NHWC),
# scale pixels to [0, 1], and one-hot encode the 10 digit labels
X_train = X_train.reshape(-1, 28, 28, 1).astype('float32') / 255.0
X_test = X_test.reshape(-1, 28, 28, 1).astype('float32') / 255.0
y_train = keras.utils.to_categorical(y_train, 10)
y_test = keras.utils.to_categorical(y_test, 10)

print(f"Training shape: {X_train.shape}")  # (60000, 28, 28, 1)
print(f"Test shape: {X_test.shape}")        # (10000, 28, 28, 1)

# Build CNN
model = keras.Sequential([
    # First convolutional block ('valid' padding: 28x28 -> 26x26 -> 13x13)
    layers.Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(28, 28, 1)),
    layers.MaxPooling2D(pool_size=(2, 2)),
    
    # Second convolutional block (13x13 -> 11x11 -> 5x5)
    layers.Conv2D(64, kernel_size=(3, 3), activation='relu'),
    layers.MaxPooling2D(pool_size=(2, 2)),
    
    # Flatten and dense layers; dropout regularizes the classifier head
    layers.Flatten(),
    layers.Dropout(0.5),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(10, activation='softmax')  # one probability per digit class
])

# Compile — categorical_crossentropy pairs with the one-hot labels above
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Model summary
model.summary()

Train the Model

# Train — holds out 10% of the training set for validation
history = model.fit(
    X_train, y_train,
    batch_size=128,
    epochs=10,
    validation_split=0.1,
    verbose=1
)

# Evaluate on the untouched test set
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
print(f"\nTest accuracy: {test_acc:.4f}")

# Plot training history: accuracy and loss curves side by side
# (diverging train/validation curves indicate overfitting)
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Training')
plt.plot(history.history['val_accuracy'], label='Validation')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training')
plt.plot(history.history['val_loss'], label='Validation')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

🔧 CNN Layers Explained

1. Convolutional Layer

# Conv2D parameters (these standalone constructors are illustrative only —
# a layer must be placed in a model to do anything)
layers.Conv2D(
    filters=32,              # Number of output filters
    kernel_size=(3, 3),      # Filter size (3x3)
    strides=(1, 1),          # Step size (default 1)
    padding='valid',         # 'valid' = no padding, 'same' = output same size
    activation='relu',       # Activation function
    use_bias=True,           # Add bias term
    kernel_initializer='glorot_uniform'  # Weight initialization
)

# Example: Different configurations
layers.Conv2D(64, (5, 5), activation='relu')  # 64 filters, 5x5 kernel
layers.Conv2D(128, (3, 3), padding='same')    # Same padding
layers.Conv2D(256, (1, 1))  # 1x1 conv (channel mixing)

2. Pooling Layers

# Max Pooling (most common) — keeps the strongest response in each window
layers.MaxPooling2D(
    pool_size=(2, 2),  # 2x2 window
    strides=None,      # Default = pool_size
    padding='valid'
)

# Average Pooling — keeps the mean response instead of the max
layers.AveragePooling2D(pool_size=(2, 2))

# Global pooling (reduces each feature map to a single value;
# often used in place of Flatten before the classifier head)
layers.GlobalMaxPooling2D()
layers.GlobalAveragePooling2D()

# Why pooling?
# - Reduces spatial dimensions (fewer parameters)
# - Increases receptive field
# - Translation invariance
# - Controls overfitting

3. Batch Normalization

# Add after Conv layers for faster training.
# NOTE(review): BN placed before the activation is one common ordering;
# conv -> ReLU -> BN is also used in practice — both work, be consistent.
model = keras.Sequential([
    layers.Conv2D(32, (3, 3), input_shape=(28, 28, 1)),
    layers.BatchNormalization(),  # Normalize activations
    layers.Activation('relu'),
    layers.MaxPooling2D((2, 2))
])

# Benefits:
# - Faster training
# - Higher learning rates possible
# - Less sensitive to initialization
# - Regularization effect

🎨 CNN for Color Images (CIFAR-10)

# Load CIFAR-10 (32x32 RGB images)
(X_train, y_train), (X_test, y_test) = keras.datasets.cifar10.load_data()

# Normalize pixels to [0, 1] and one-hot encode the 10 class labels
X_train = X_train.astype('float32') / 255.0
X_test = X_test.astype('float32') / 255.0
y_train = keras.utils.to_categorical(y_train, 10)
y_test = keras.utils.to_categorical(y_test, 10)

# Deeper CNN for CIFAR-10: three conv blocks that halve the spatial size
# (32 -> 16 -> 8 -> 4) while doubling the filters (32 -> 64 -> 128);
# dropout rate increases with depth
model = keras.Sequential([
    # Block 1
    layers.Conv2D(32, (3, 3), padding='same', input_shape=(32, 32, 3)),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Conv2D(32, (3, 3), padding='same'),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Dropout(0.2),
    
    # Block 2
    layers.Conv2D(64, (3, 3), padding='same'),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Conv2D(64, (3, 3), padding='same'),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Dropout(0.3),
    
    # Block 3
    layers.Conv2D(128, (3, 3), padding='same'),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Conv2D(128, (3, 3), padding='same'),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Dropout(0.4),
    
    # Classifier
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    layers.Dense(10, activation='softmax')
])

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Train with data augmentation.
# NOTE(review): ImageDataGenerator is deprecated in newer TF releases —
# consider the keras preprocessing layers (RandomRotation, RandomFlip, ...).
datagen = keras.preprocessing.image.ImageDataGenerator(
    rotation_range=15,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True
)

# ReduceLROnPlateau lowers the LR when validation loss stalls;
# EarlyStopping halts training and restores the best weights seen
history = model.fit(
    datagen.flow(X_train, y_train, batch_size=64),
    epochs=50,
    validation_data=(X_test, y_test),
    callbacks=[
        keras.callbacks.ReduceLROnPlateau(patience=5),
        keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
    ]
)

🚀 Transfer Learning

Use pre-trained models for faster training and better performance!

# Load pre-trained VGG16 (trained on ImageNet)
base_model = keras.applications.VGG16(
    weights='imagenet',
    include_top=False,  # Exclude classification layer
    input_shape=(224, 224, 3)
)

# Freeze base model weights so only the new head trains
base_model.trainable = False

# Add custom classifier on top of the frozen feature extractor
model = keras.Sequential([
    base_model,
    layers.GlobalAveragePooling2D(),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(10, activation='softmax')  # Your number of classes
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Train only the new layers
model.fit(X_train, y_train, epochs=10, validation_split=0.2)

# Optionally: Fine-tune by unfreezing some layers
base_model.trainable = True
# Freeze all but last 4 layers
for layer in base_model.layers[:-4]:
    layer.trainable = False

# Re-compile — trainable-flag changes only take effect after compile
model.compile(
    optimizer=keras.optimizers.Adam(1e-5),  # Lower learning rate
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.fit(X_train, y_train, epochs=5, validation_split=0.2)

Popular Pre-trained Models

# Available in Keras (each downloads ImageNet weights by default)
keras.applications.VGG16()
keras.applications.VGG19()
keras.applications.ResNet50()
keras.applications.ResNet101()
keras.applications.InceptionV3()
keras.applications.MobileNetV2()  # Lightweight
keras.applications.EfficientNetB0()  # Strong accuracy/efficiency trade-off

# Usage example
base = keras.applications.ResNet50(weights='imagenet', include_top=False)

📊 Data Augmentation

from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Create augmentation generator — random transforms applied on the fly,
# so the model sees a slightly different dataset each epoch
datagen = ImageDataGenerator(
    rotation_range=20,           # Random rotation ±20 degrees
    width_shift_range=0.2,       # Horizontal shift 20%
    height_shift_range=0.2,      # Vertical shift 20%
    shear_range=0.2,            # Shear transformation
    zoom_range=0.2,             # Random zoom
    horizontal_flip=True,        # Random horizontal flip
    fill_mode='nearest'         # Fill empty pixels
)

# Fit to training data
# NOTE(review): fit() is only needed for featurewise_center/std or ZCA
# options — none are enabled here, so this is a no-op in practice
datagen.fit(X_train)

# Train with augmentation; steps_per_epoch caps one epoch at
# one full pass over the (infinite) generator
model.fit(
    datagen.flow(X_train, y_train, batch_size=32),
    steps_per_epoch=len(X_train) // 32,
    epochs=50,
    validation_data=(X_test, y_test)
)

# Visualize augmentation: ten random transforms of the same source image
fig, axes = plt.subplots(2, 5, figsize=(15, 6))
for i, ax in enumerate(axes.flat):
    augmented = datagen.random_transform(X_train[0])
    ax.imshow(augmented.squeeze(), cmap='gray')
    ax.axis('off')
plt.show()

🎯 Famous CNN Architectures

| Architecture | Year | Key Innovation                       | Depth         |
|--------------|------|--------------------------------------|---------------|
| LeNet-5      | 1998 | First practical CNN (digit recognition) | 7 layers   |
| AlexNet      | 2012 | ReLU, Dropout, GPU training          | 8 layers      |
| VGG-16/19    | 2014 | Small 3x3 filters, deep network      | 16-19 layers  |
| Inception    | 2014 | Multi-scale filters in parallel      | 22 layers     |
| ResNet       | 2015 | Skip connections, very deep          | 50-152 layers |
| MobileNet    | 2017 | Depthwise separable convolutions     | 28 layers     |
| EfficientNet | 2019 | Compound scaling                     | Variable      |

🔍 Visualizing CNN Features

# Visualize learned filters
def plot_filters(model, layer_name, n_filters=32):
    layer = model.get_layer(layer_name)
    filters, biases = layer.get_weights()
    
    # Normalize filters
    f_min, f_max = filters.min(), filters.max()
    filters = (filters - f_min) / (f_max - f_min)
    
    # Plot
    n_cols = 8
    n_rows = n_filters // n_cols
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 8))
    
    for i, ax in enumerate(axes.flat):
        if i < n_filters:
            ax.imshow(filters[:, :, 0, i], cmap='gray')
        ax.axis('off')
    plt.show()

plot_filters(model, 'conv2d')

# Visualize feature maps
def visualize_activation(model, layer_name, img):
    """Plot every feature map a given layer produces for one input image.

    Args:
        model: trained Keras model.
        layer_name: layer whose output (activations) to visualize.
        img: single input image (no batch dimension), shape matching
            the model's expected input.
    """
    # Build a sub-model that exposes the intermediate layer's output
    activation_model = keras.Model(
        inputs=model.input,
        outputs=model.get_layer(layer_name).output
    )

    # Add the batch dimension and run a forward pass
    activations = activation_model.predict(img[np.newaxis, ...])

    # Grid layout — ceiling division so a partial last row is kept
    # (plain // silently hid the trailing feature maps)
    n_features = activations.shape[-1]
    n_cols = 8
    n_rows = (n_features + n_cols - 1) // n_cols

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 15))
    for i, ax in enumerate(axes.flat):
        if i < n_features:
            ax.imshow(activations[0, :, :, i], cmap='viridis')
        ax.axis('off')
    plt.show()

visualize_activation(model, 'conv2d', X_test[0])

💡 Best Practices

⚠️ Common Pitfalls

🎯 Key Takeaways