👁️ Computer Vision & Image Recognition

CNNs, Image Classification, and Object Detection

What is Computer Vision?

Computer vision enables computers to understand and interpret visual information from images and videos. It powers facial recognition, self-driving cars, medical image analysis, and more. The breakthrough came with Convolutional Neural Networks (CNNs), which can automatically learn visual features.

Real-World Applications:

  • Healthcare: Detecting diseases from X-rays and MRIs
  • Autonomous vehicles: Recognizing pedestrians, signs, lanes
  • Security: Facial recognition, surveillance
  • Retail: Product recognition, inventory management
  • Agriculture: Crop disease detection

🖼️ Understanding Images

import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import cv2

# Images are just arrays of numbers!

# Load an image
img = plt.imread('sample_image.jpg')  # Or use Image.open()

print("Image shape:", img.shape)  # (height, width, channels)
print("Data type:", img.dtype)
print("Value range:", img.min(), "to", img.max())

# For RGB images: shape is (H, W, 3)
# Channel 0 = Red, Channel 1 = Green, Channel 2 = Blue

# Visualize image
plt.figure(figsize=(15, 5))

plt.subplot(1, 4, 1)
plt.imshow(img)
plt.title('Original Image')
plt.axis('off')

plt.subplot(1, 4, 2)
plt.imshow(img[:, :, 0], cmap='Reds')
plt.title('Red Channel')
plt.axis('off')

plt.subplot(1, 4, 3)
plt.imshow(img[:, :, 1], cmap='Greens')
plt.title('Green Channel')
plt.axis('off')

plt.subplot(1, 4, 4)
plt.imshow(img[:, :, 2], cmap='Blues')
plt.title('Blue Channel')
plt.axis('off')

plt.tight_layout()
plt.show()

# Grayscale conversion (note: plt.imread returns RGB, while cv2.imread returns BGR)
gray_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
print("\nGrayscale shape:", gray_img.shape)  # (H, W) - single channel

🔄 Image Preprocessing

from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Essential preprocessing steps

# 1. Resize images (models expect fixed size)
def resize_image(img, target_size=(224, 224)):
    """Resize to target dimensions"""
    return cv2.resize(img, target_size)

# 2. Normalize pixel values (0-255 → 0-1)
def normalize_image(img):
    """Scale pixel values to [0, 1]"""
    return img.astype('float32') / 255.0

# 3. Standardize (mean=0, std=1)
def standardize_image(img):
    """Center around mean with unit variance"""
    mean = np.mean(img)
    std = np.std(img)
    return (img - mean) / std
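
These steps are typically chained; here is a small sketch of a combined pipeline (use normalization or standardization depending on what the downstream model expects, not both):

# Combined pipeline: resize, then scale to [0, 1]
def preprocess(img, target_size=(224, 224)):
    """The most common resize + normalize combination"""
    return normalize_image(resize_image(img, target_size))

ready = preprocess(img)
print("Preprocessed:", ready.shape, ready.dtype)  # (224, 224, 3) float32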

# Image augmentation - Create variations for training
datagen = ImageDataGenerator(
    rotation_range=20,        # Rotate up to 20 degrees
    width_shift_range=0.2,    # Shift horizontally
    height_shift_range=0.2,   # Shift vertically
    horizontal_flip=True,     # Flip left-right
    zoom_range=0.2,           # Zoom in/out
    shear_range=0.2,          # Shear transformation
    brightness_range=[0.8, 1.2],  # Adjust brightness
    fill_mode='nearest'       # Fill empty pixels
)

# Apply augmentation
img_array = img.reshape((1,) + img.shape)  # Add batch dimension

plt.figure(figsize=(15, 3))
for i, batch in enumerate(datagen.flow(img_array, batch_size=1)):
    plt.subplot(1, 6, i + 1)
    # Clip before casting: brightness shifts can push values above 255,
    # which would wrap around when cast to uint8
    plt.imshow(np.clip(batch[0], 0, 255).astype('uint8'))
    plt.title(f'Augmented {i+1}')
    plt.axis('off')
    
    if i >= 5:  # Show 6 examples
        break

plt.tight_layout()
plt.show()

print("✓ Augmentation increases dataset size and prevents overfitting!")

🧠 Convolutional Neural Networks (CNNs)

# CNNs are designed for image data

# Key components:
# 1. Convolutional layers - Detect features
# 2. Pooling layers - Reduce dimensions
# 3. Fully connected layers - Make predictions

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Simple CNN architecture
def create_simple_cnn(input_shape=(32, 32, 3), num_classes=10):
    """
    Build a basic CNN
    
    Args:
        input_shape: Image dimensions (height, width, channels)
        num_classes: Number of output classes
    """
    
    model = keras.Sequential([
        # First convolutional block
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape, padding='same'),
        layers.BatchNormalization(),
        layers.Conv2D(32, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2)),
        layers.Dropout(0.25),
        
        # Second convolutional block
        layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2)),
        layers.Dropout(0.25),
        
        # Third convolutional block
        layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2)),
        layers.Dropout(0.25),
        
        # Fully connected layers
        layers.Flatten(),
        layers.Dense(256, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation='softmax')
    ])
    
    return model

# Create model
model = create_simple_cnn()
model.summary()

# Explanation of layers:
print("\nLayer Breakdown:")
print("Conv2D: Applies filters to detect features (edges, textures, etc.)")
print("MaxPooling2D: Reduces spatial dimensions, keeps important features")
print("BatchNormalization: Normalizes layer outputs (faster training)")
print("Dropout: Randomly disables neurons (prevents overfitting)")
print("Flatten: Converts 2D feature maps to 1D vector")
print("Dense: Fully connected layer for classification")

How Convolution Works:

# Convolution example: Apply a filter to detect edges

# Simple 3x3 horizontal-edge detector (responds to vertical intensity gradients)
edge_filter = np.array([
    [-1, -1, -1],
    [ 0,  0,  0],
    [ 1,  1,  1]
])

# Apply filter to image
def apply_filter(img, kernel):
    """Apply a kernel to an image (cv2.filter2D computes cross-correlation,
    the same operation that CNN 'convolution' layers actually use)"""
    return cv2.filter2D(img, -1, kernel)

# Load the image in grayscale
gray_img = cv2.imread('sample_image.jpg', cv2.IMREAD_GRAYSCALE)

# Apply edge detection
edges = apply_filter(gray_img, edge_filter)

plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.imshow(gray_img, cmap='gray')
plt.title('Original Image')

plt.subplot(1, 2, 2)
plt.imshow(edges, cmap='gray')
plt.title('After Edge Filter')

plt.show()

print("CNNs learn these filters automatically during training!")

🎯 Complete Image Classification Example

# Train CNN on CIFAR-10 dataset

from tensorflow.keras.datasets import cifar10
from tensorflow.keras.utils import to_categorical

# Load CIFAR-10 (60,000 32x32 color images in 10 classes)
(X_train, y_train), (X_test, y_test) = cifar10.load_data()

# Class names
class_names = ['airplane', 'automobile', 'bird', 'cat', 'deer',
               'dog', 'frog', 'horse', 'ship', 'truck']

print(f"Training images: {X_train.shape}")
print(f"Test images: {X_test.shape}")

# Preprocess data
X_train = X_train.astype('float32') / 255.0
X_test = X_test.astype('float32') / 255.0

y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)

# Create and compile model
model = create_simple_cnn(input_shape=(32, 32, 3), num_classes=10)

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Train model
print("\nTraining CNN...")
history = model.fit(
    X_train, y_train,
    batch_size=128,
    epochs=20,
    validation_split=0.1,
    callbacks=[
        keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
        keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=2)
    ]
)

# Evaluate
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"\nTest Accuracy: {test_accuracy:.2%}")

# Make predictions
predictions = model.predict(X_test[:10])

# Visualize predictions
plt.figure(figsize=(12, 5))  # 2 rows x 5 columns
for i in range(10):
    plt.subplot(2, 5, i + 1)
    plt.imshow(X_test[i])
    
    true_label = class_names[np.argmax(y_test[i])]
    pred_label = class_names[np.argmax(predictions[i])]
    confidence = np.max(predictions[i])
    
    color = 'green' if true_label == pred_label else 'red'
    plt.title(f'True: {true_label}\nPred: {pred_label}\n({confidence:.2f})', 
              color=color, fontsize=8)
    plt.axis('off')

plt.tight_layout()
plt.show()
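
The History object returned by fit() records per-epoch metrics; plotting them shows whether the model is still improving or starting to overfit:

# Plot learning curves from the History object
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='validation')
plt.title('Accuracy')
plt.xlabel('Epoch')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.title('Loss')
plt.xlabel('Epoch')
plt.legend()

plt.tight_layout()
plt.show()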

🔥 Transfer Learning

# Use pre-trained models (trained on millions of images)

from tensorflow.keras.applications import ResNet50, VGG16, MobileNetV2
from tensorflow.keras.applications.resnet50 import preprocess_input  # apply to inputs before feeding the network

# Load pre-trained ResNet50 (trained on ImageNet)
base_model = ResNet50(
    weights='imagenet',
    include_top=False,  # Remove classification layer
    input_shape=(224, 224, 3)
)

# Freeze pre-trained weights
base_model.trainable = False

# Add custom classification layers
model = keras.Sequential([
    base_model,
    layers.GlobalAveragePooling2D(),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(10, activation='softmax')  # 10 classes
])

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.0001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

print("✓ Transfer learning model ready!")
print(f"Total parameters: {model.count_params():,}")
print(f"Trainable parameters: {sum([tf.size(w).numpy() for w in model.trainable_weights]):,}")

# Why transfer learning?
benefits = {
    'Faster training': 'Pre-trained features already learned',
    'Better accuracy': 'Especially with small datasets',
    'Less data needed': 'Can work with 100s of images instead of millions',
    'Production ready': 'State-of-the-art architectures'
}

print("\nTransfer Learning Benefits:")
for benefit, reason in benefits.items():
    print(f"  • {benefit}: {reason}")

Popular Pre-trained Models:

# Compare popular CNN architectures

architectures = {
    'VGG16': {
        'Parameters': '138M',
        'Depth': '16 layers',
        'Best for': 'Simple, interpretable',
        'Speed': 'Slow'
    },
    'ResNet50': {
        'Parameters': '25M',
        'Depth': '50 layers',
        'Best for': 'High accuracy',
        'Speed': 'Medium'
    },
    'InceptionV3': {
        'Parameters': '24M',
        'Depth': '48 layers',
        'Best for': 'Efficiency',
        'Speed': 'Medium'
    },
    'MobileNetV2': {
        'Parameters': '3.5M',
        'Depth': '53 layers',
        'Best for': 'Mobile/edge devices',
        'Speed': 'Fast'
    },
    'EfficientNetB0': {
        'Parameters': '5.3M',
        'Depth': 'Variable',
        'Best for': 'Best accuracy/size ratio',
        'Speed': 'Fast'
    }
}

print("CNN Architecture Comparison:")
print("="*60)
for name, specs in architectures.items():
    print(f"\n{name}:")
    for key, value in specs.items():
        print(f"  {key:15s}: {value}")

🎯 Object Detection Basics

# Object detection: Find and classify multiple objects

# Popular approaches:
# 1. YOLO (You Only Look Once) - Fast, real-time
# 2. R-CNN family (R-CNN, Fast R-CNN, Faster R-CNN) - More accurate
# 3. SSD (Single Shot Detector) - Balance of speed and accuracy

# Example using pre-trained YOLO (conceptual)
"""
from ultralytics import YOLO

# Load pre-trained model
model = YOLO('yolov8n.pt')  # Nano version (fastest)

# Detect objects in image
results = model('image.jpg')

# Process results
for result in results:
    boxes = result.boxes
    for box in boxes:
        # Bounding box coordinates
        x1, y1, x2, y2 = box.xyxy[0]
        
        # Confidence and class
        confidence = box.conf[0]
        class_id = box.cls[0]
        class_name = model.names[int(class_id)]
        
        print(f"Found {class_name} with {confidence:.2f} confidence at ({x1}, {y1})")
"""

# Object detection output includes:
detection_output = {
    'Bounding boxes': '(x1, y1, x2, y2) corners or (x, y, w, h), depending on format',
    'Class labels': 'What object is it?',
    'Confidence scores': 'How sure is the model?',
    'Number of objects': 'Can detect multiple objects per image'
}

print("Object Detection Output:")
for component, description in detection_output.items():
    print(f"  • {component}: {description}")

# Use cases:
print("\nObject Detection Applications:")
print("  • Self-driving cars: Detect pedestrians, vehicles, signs")
print("  • Retail: Automated checkout, inventory tracking")
print("  • Security: Intrusion detection, crowd monitoring")
print("  • Healthcare: Tumor detection in medical images")
print("  • Agriculture: Crop disease detection, yield estimation")

📊 Visualizing CNN Features

# Understand what CNNs learn

def visualize_filters(model, layer_name):
    """Display filters from a convolutional layer"""
    
    # Get layer by name
    layer = model.get_layer(layer_name)
    filters, biases = layer.get_weights()
    
    # Normalize filters
    f_min, f_max = filters.min(), filters.max()
    filters = (filters - f_min) / (f_max - f_min)
    
    # Plot filters
    n_filters = min(filters.shape[3], 64)  # Show up to 64 filters
    
    plt.figure(figsize=(12, 12))
    for i in range(n_filters):
        plt.subplot(8, 8, i + 1)
        
        # Show the first input channel of filter i
        f = filters[:, :, 0, i]
        plt.imshow(f, cmap='viridis')
        plt.axis('off')
    
    plt.suptitle(f'Filters from {layer_name}')
    plt.tight_layout()
    plt.show()

# Visualize feature maps
def visualize_feature_maps(model, image, layer_name):
    """Show what features are detected in an image"""
    
    # Create model that outputs intermediate layer
    feature_model = keras.Model(
        inputs=model.input,
        outputs=model.get_layer(layer_name).output
    )
    
    # Get features
    features = feature_model.predict(image[np.newaxis, ...])
    
    # Plot feature maps
    n_features = min(features.shape[-1], 64)
    
    plt.figure(figsize=(12, 12))
    for i in range(n_features):
        plt.subplot(8, 8, i + 1)
        plt.imshow(features[0, :, :, i], cmap='viridis')
        plt.axis('off')
    
    plt.suptitle(f'Feature Maps from {layer_name}')
    plt.tight_layout()
    plt.show()

# Early layers: Detect edges, colors
# Middle layers: Detect textures, patterns
# Late layers: Detect complex objects, parts

print("CNN learns hierarchical features:")
print("  Layer 1: Edges, gradients, colors")
print("  Layer 2-3: Textures, patterns")
print("  Layer 4-5: Object parts (wheels, eyes, etc.)")
print("  Final layers: Whole objects (cars, faces, etc.)")

🎯 Key Takeaways