What is Computer Vision?
Computer vision enables computers to interpret and act on visual information from images and video. It powers facial recognition, self-driving cars, medical image analysis, and more. The field's breakthrough came with Convolutional Neural Networks (CNNs), which learn visual features automatically from raw pixels instead of relying on hand-engineered ones.
Real-World Applications:
- Healthcare: Detecting diseases from X-rays and MRIs
- Autonomous vehicles: Recognizing pedestrians, signs, lanes
- Security: Facial recognition, surveillance
- Retail: Product recognition, inventory management
- Agriculture: Crop disease detection
🖼️ Understanding Images
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import cv2
# Images are just arrays of numbers!
# Load an image
img = plt.imread('sample_image.jpg') # Or use Image.open()
print("Image shape:", img.shape) # (height, width, channels)
print("Data type:", img.dtype)
print("Value range:", img.min(), "to", img.max())
# For RGB images: shape is (H, W, 3)
# Channel 0 = Red, Channel 1 = Green, Channel 2 = Blue
# Visualize image
plt.figure(figsize=(15, 5))
plt.subplot(1, 4, 1)
plt.imshow(img)
plt.title('Original Image')
plt.axis('off')
plt.subplot(1, 4, 2)
plt.imshow(img[:, :, 0], cmap='Reds')
plt.title('Red Channel')
plt.axis('off')
plt.subplot(1, 4, 3)
plt.imshow(img[:, :, 1], cmap='Greens')
plt.title('Green Channel')
plt.axis('off')
plt.subplot(1, 4, 4)
plt.imshow(img[:, :, 2], cmap='Blues')
plt.title('Blue Channel')
plt.axis('off')
plt.tight_layout()
plt.show()
# Grayscale conversion (note: plt.imread returns RGB, while cv2.imread returns BGR)
gray_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
print("\nGrayscale shape:", gray_img.shape)  # (H, W) - single channel
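To make the "images are just arrays of numbers" point concrete, here is a minimal sketch that builds a tiny 2×2 RGB image by hand, with no image file needed (tiny is an illustrative name):

# Each pixel is three uint8 values: (R, G, B)
tiny = np.array([
    [[255, 0, 0],  [0, 255, 0]],       # red, green
    [[0, 0, 255],  [255, 255, 255]]    # blue, white
], dtype=np.uint8)
print("Shape:", tiny.shape)  # (2, 2, 3): height, width, channels
plt.imshow(tiny)
plt.axis('off')
plt.show()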
🔄 Image Preprocessing
from tensorflow.keras.preprocessing.image import ImageDataGenerator
# Essential preprocessing steps
# 1. Resize images (models expect a fixed input size)
def resize_image(img, target_size=(224, 224)):
    """Resize to target dimensions (cv2.resize takes (width, height))"""
    return cv2.resize(img, target_size)
# 2. Normalize pixel values (0-255 → 0-1)
def normalize_image(img):
    """Scale pixel values to [0, 1]"""
    return img.astype('float32') / 255.0
# 3. Standardize (mean=0, std=1)
def standardize_image(img):
    """Center around mean with unit variance"""
    mean = np.mean(img)
    std = np.std(img)
    return (img - mean) / std
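In practice these steps are chained before feeding a model. A minimal sketch of the usual order, assuming img is the RGB array loaded earlier:

# Resize first, then scale to [0, 1]
processed = normalize_image(resize_image(img))
print(processed.shape, processed.dtype)  # (224, 224, 3) float32
print(processed.min(), processed.max())  # roughly 0.0 to 1.0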
# Image augmentation - Create variations for training
datagen = ImageDataGenerator(
    rotation_range=20,            # Rotate up to 20 degrees
    width_shift_range=0.2,        # Shift horizontally
    height_shift_range=0.2,       # Shift vertically
    horizontal_flip=True,         # Flip left-right
    zoom_range=0.2,               # Zoom in/out
    shear_range=0.2,              # Shear transformation
    brightness_range=[0.8, 1.2],  # Adjust brightness
    fill_mode='nearest'           # Fill empty pixels
)
# Apply augmentation
img_array = img.reshape((1,) + img.shape) # Add batch dimension
plt.figure(figsize=(15, 3))
for i, batch in enumerate(datagen.flow(img_array, batch_size=1)):
    plt.subplot(1, 6, i + 1)
    plt.imshow(batch[0].astype('uint8'))
    plt.title(f'Augmented {i+1}')
    plt.axis('off')
    if i >= 5:  # Show 6 examples
        break
plt.tight_layout()
plt.show()
print("✓ Augmentation effectively enlarges the training set and helps prevent overfitting!")
🧠 Convolutional Neural Networks (CNNs)
# CNNs are designed for image data
# Key components:
# 1. Convolutional layers - Detect features
# 2. Pooling layers - Reduce dimensions
# 3. Fully connected layers - Make predictions
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# Simple CNN architecture
def create_simple_cnn(input_shape=(32, 32, 3), num_classes=10):
    """
    Build a basic CNN.

    Args:
        input_shape: Image dimensions (height, width, channels)
        num_classes: Number of output classes
    """
    model = keras.Sequential([
        # First convolutional block
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape, padding='same'),
        layers.BatchNormalization(),
        layers.Conv2D(32, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2)),
        layers.Dropout(0.25),
        # Second convolutional block
        layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2)),
        layers.Dropout(0.25),
        # Third convolutional block
        layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2)),
        layers.Dropout(0.25),
        # Fully connected layers
        layers.Flatten(),
        layers.Dense(256, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation='softmax')
    ])
    return model
# Create model
model = create_simple_cnn()
model.summary()
# Explanation of layers:
print("\nLayer Breakdown:")
print("Conv2D: Applies filters to detect features (edges, textures, etc.)")
print("MaxPooling2D: Reduces spatial dimensions, keeps important features")
print("BatchNormalization: Normalizes layer outputs (faster training)")
print("Dropout: Randomly disables neurons (prevents overfitting)")
print("Flatten: Converts 2D feature maps to 1D vector")
print("Dense: Fully connected layer for classification")
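The shapes in model.summary() follow from standard convolution arithmetic: output size = (input + 2·padding − kernel) / stride + 1. A minimal sketch of that formula (the helper name is illustrative):

def conv_output_size(input_size, kernel_size, stride=1, padding=0):
    """Spatial output size of a conv or pooling layer: (n + 2p - k) // s + 1"""
    return (input_size + 2 * padding - kernel_size) // stride + 1

# A 3x3 'same'-padded conv (p=1, s=1) preserves a 32x32 input
print(conv_output_size(32, 3, stride=1, padding=1))  # 32
# A 2x2 max pool with stride 2 halves it
print(conv_output_size(32, 2, stride=2))             # 16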
How Convolution Works:
# Convolution example: Apply a filter to detect edges
# Simple 3x3 filter that responds to horizontal edges (vertical intensity changes)
edge_filter = np.array([
    [-1, -1, -1],
    [ 0,  0,  0],
    [ 1,  1,  1]
])
# Apply filter to image
def apply_filter(img, kernel):
    """Convolve image with kernel"""
    return cv2.filter2D(img, -1, kernel)
# Load grayscale image
gray_img = cv2.imread('sample.jpg', cv2.IMREAD_GRAYSCALE)
# Apply edge detection
edges = apply_filter(gray_img, edge_filter)
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.imshow(gray_img, cmap='gray')
plt.title('Original Image')
plt.subplot(1, 2, 2)
plt.imshow(edges, cmap='gray')
plt.title('After Edge Filter')
plt.show()
print("CNNs learn these filters automatically during training!")
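To demystify what cv2.filter2D just did, here is a minimal NumPy sketch of the sliding-window operation (strictly speaking cross-correlation, which is what CNN libraries actually compute; no padding, stride 1; convolve2d and patch are illustrative names):

def convolve2d(img, kernel):
    """Slide the kernel over the image, summing elementwise products."""
    kh, kw = kernel.shape
    h, w = img.shape
    out = np.zeros((h - kh + 1, w - kw + 1))
    for i in range(out.shape[0]):
        for j in range(out.shape[1]):
            out[i, j] = np.sum(img[i:i + kh, j:j + kw] * kernel)
    return out

# Apply to a 64x64 patch of the grayscale image
patch = gray_img[:64, :64].astype('float32')
print(convolve2d(patch, edge_filter).shape)  # (62, 62) - no padding shrinks the output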
🎯 Complete Image Classification Example
# Train CNN on CIFAR-10 dataset
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.utils import to_categorical
# Load CIFAR-10 (60,000 32x32 color images in 10 classes)
(X_train, y_train), (X_test, y_test) = cifar10.load_data()
# Class names
class_names = ['airplane', 'automobile', 'bird', 'cat', 'deer',
               'dog', 'frog', 'horse', 'ship', 'truck']
print(f"Training images: {X_train.shape}")
print(f"Test images: {X_test.shape}")
# Preprocess data
X_train = X_train.astype('float32') / 255.0
X_test = X_test.astype('float32') / 255.0
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)
# Create and compile model
model = create_simple_cnn(input_shape=(32, 32, 3), num_classes=10)
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
# Train model
print("\nTraining CNN...")
history = model.fit(
    X_train, y_train,
    batch_size=128,
    epochs=20,
    validation_split=0.1,
    callbacks=[
        keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
        keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=2)
    ]
)
# Evaluate
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"\nTest Accuracy: {test_accuracy:.2%}")
# Make predictions
predictions = model.predict(X_test[:10])
# Visualize predictions
plt.figure(figsize=(12, 5))  # 2 rows x 5 columns of images
for i in range(10):
    plt.subplot(2, 5, i + 1)
    plt.imshow(X_test[i])
    true_label = class_names[np.argmax(y_test[i])]
    pred_label = class_names[np.argmax(predictions[i])]
    confidence = np.max(predictions[i])
    color = 'green' if true_label == pred_label else 'red'
    plt.title(f'True: {true_label}\nPred: {pred_label}\n({confidence:.2f})',
              color=color, fontsize=8)
    plt.axis('off')
plt.tight_layout()
plt.show()
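It is also worth inspecting the learning curves from the history object returned by fit above; a widening gap between the training and validation curves is the classic sign of overfitting:

plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='validation')
plt.title('Accuracy')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.title('Loss')
plt.legend()
plt.show()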
🔥 Transfer Learning
# Use pre-trained models (trained on millions of images)
from tensorflow.keras.applications import ResNet50, VGG16, MobileNetV2
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions
# Load pre-trained ResNet50 (trained on ImageNet)
base_model = ResNet50(
    weights='imagenet',
    include_top=False,  # Remove classification head
    input_shape=(224, 224, 3)
)
# Freeze pre-trained weights
base_model.trainable = False
# Add custom classification layers
model = keras.Sequential([
    base_model,
    layers.GlobalAveragePooling2D(),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(10, activation='softmax')  # 10 classes
])
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.0001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
print("✓ Transfer learning model ready!")
print(f"Total parameters: {model.count_params():,}")
print(f"Trainable parameters: {sum([tf.size(w).numpy() for w in model.trainable_weights]):,}")
# Why transfer learning?
benefits = {
    'Faster training': 'Pre-trained features already learned',
    'Better accuracy': 'Especially with small datasets',
    'Less data needed': 'Can work with hundreds of images instead of millions',
    'Production ready': 'State-of-the-art architectures'
}
print("\nTransfer Learning Benefits:")
for benefit, reason in benefits.items():
    print(f"  • {benefit}: {reason}")
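Once the new head has converged, a common second stage is fine-tuning: unfreeze the top of the base model and keep training with a much smaller learning rate so the pre-trained weights are only nudged. A minimal sketch (the number of layers to unfreeze is illustrative and dataset-dependent):

# Unfreeze the base, then re-freeze everything except the last ~20 layers
base_model.trainable = True
for layer in base_model.layers[:-20]:
    layer.trainable = False

# Recompile with a much lower learning rate before continuing training
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-5),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)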
Popular Pre-trained Models:
# Compare popular CNN architectures
architectures = {
    'VGG16': {
        'Parameters': '138M',
        'Depth': '16 layers',
        'Best for': 'Simple, interpretable baselines',
        'Speed': 'Slow'
    },
    'ResNet50': {
        'Parameters': '25M',
        'Depth': '50 layers',
        'Best for': 'High accuracy',
        'Speed': 'Medium'
    },
    'InceptionV3': {
        'Parameters': '24M',
        'Depth': '48 layers',
        'Best for': 'Efficiency',
        'Speed': 'Medium'
    },
    'MobileNetV2': {
        'Parameters': '3.5M',
        'Depth': '53 layers',
        'Best for': 'Mobile/edge devices',
        'Speed': 'Fast'
    },
    'EfficientNetB0': {
        'Parameters': '5.3M',
        'Depth': 'Scales across the B0-B7 family',
        'Best for': 'Best accuracy/size ratio',
        'Speed': 'Fast'
    }
}
print("CNN Architecture Comparison:")
print("=" * 60)
for name, specs in architectures.items():
    print(f"\n{name}:")
    for key, value in specs.items():
        print(f"  {key:15s}: {value}")
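Swapping backbones in Keras is nearly a one-line change; a minimal sketch using the MobileNetV2 import from earlier (note that each architecture has its own expected input size and preprocess_input function, so check the model's documentation):

# Same pattern as the ResNet50 example, with a far smaller backbone
mobile_base = MobileNetV2(
    weights='imagenet',
    include_top=False,
    input_shape=(224, 224, 3)
)
mobile_base.trainable = False
print(f"MobileNetV2 parameters: {mobile_base.count_params():,}")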
🎯 Object Detection Basics
# Object detection: Find and classify multiple objects
# Popular approaches:
# 1. YOLO (You Only Look Once) - Fast, real-time
# 2. R-CNN family (R-CNN, Fast R-CNN, Faster R-CNN) - More accurate
# 3. SSD (Single Shot Detector) - Balance of speed and accuracy
# Example using pre-trained YOLO (conceptual)
"""
from ultralytics import YOLO

# Load pre-trained model
model = YOLO('yolov8n.pt')  # Nano version (fastest)

# Detect objects in image
results = model('image.jpg')

# Process results
for result in results:
    boxes = result.boxes
    for box in boxes:
        # Bounding box coordinates
        x1, y1, x2, y2 = box.xyxy[0]
        # Confidence and class
        confidence = box.conf[0]
        class_id = box.cls[0]
        class_name = model.names[int(class_id)]
        print(f"Found {class_name} with {confidence:.2f} confidence at ({x1}, {y1})")
"""
# Object detection output includes:
detection_output = {
    'Bounding boxes': 'Object locations, as corners (x1, y1, x2, y2) or (x, y, width, height) depending on the format',
    'Class labels': 'What object is it?',
    'Confidence scores': 'How sure is the model?',
    'Number of objects': 'Can detect multiple objects per image'
}
print("Object Detection Output:")
for component, description in detection_output.items():
    print(f"  • {component}: {description}")
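Detections are matched to ground truth (and duplicate boxes suppressed) using Intersection over Union (IoU), the ratio of the boxes' overlap to their combined area. A minimal sketch for corner-format (x1, y1, x2, y2) boxes (iou is an illustrative helper):

def iou(box_a, box_b):
    """Intersection over Union of two (x1, y1, x2, y2) boxes."""
    x1 = max(box_a[0], box_b[0])
    y1 = max(box_a[1], box_b[1])
    x2 = min(box_a[2], box_b[2])
    y2 = min(box_a[3], box_b[3])
    inter = max(0, x2 - x1) * max(0, y2 - y1)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    return inter / (area_a + area_b - inter)

print(iou([0, 0, 10, 10], [5, 5, 15, 15]))  # 25 / 175 ≈ 0.14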
# Use cases:
print("\nObject Detection Applications:")
print(" • Self-driving cars: Detect pedestrians, vehicles, signs")
print(" • Retail: Automated checkout, inventory tracking")
print(" • Security: Intrusion detection, crowd monitoring")
print(" • Healthcare: Tumor detection in medical images")
print(" • Agriculture: Crop disease detection, yield estimation")
📊 Visualizing CNN Features
# Understand what CNNs learn
def visualize_filters(model, layer_name):
    """Display filters from a convolutional layer"""
    # Get layer by name
    layer = model.get_layer(layer_name)
    filters, biases = layer.get_weights()
    # Normalize filters to [0, 1] for display
    f_min, f_max = filters.min(), filters.max()
    filters = (filters - f_min) / (f_max - f_min)
    # Plot filters
    n_filters = min(filters.shape[3], 64)  # Show up to 64 filters
    plt.figure(figsize=(12, 12))
    for i in range(n_filters):
        plt.subplot(8, 8, i + 1)
        f = filters[:, :, 0, i]  # First input channel of filter i
        plt.imshow(f, cmap='viridis')
        plt.axis('off')
    plt.suptitle(f'Filters from {layer_name}')
    plt.tight_layout()
    plt.show()
# Visualize feature maps
def visualize_feature_maps(model, image, layer_name):
    """Show what features are detected in an image"""
    # Create a model that outputs the intermediate layer's activations
    feature_model = keras.Model(
        inputs=model.input,
        outputs=model.get_layer(layer_name).output
    )
    # Get features (add a batch dimension first)
    features = feature_model.predict(image[np.newaxis, ...])
    # Plot feature maps
    n_features = min(features.shape[-1], 64)
    plt.figure(figsize=(12, 12))
    for i in range(n_features):
        plt.subplot(8, 8, i + 1)
        plt.imshow(features[0, :, :, i], cmap='viridis')
        plt.axis('off')
    plt.suptitle(f'Feature Maps from {layer_name}')
    plt.tight_layout()
    plt.show()
# Early layers: Detect edges, colors
# Middle layers: Detect textures, patterns
# Late layers: Detect complex objects, parts
print("CNNs learn hierarchical features:")
print(" Layer 1: Edges, gradients, colors")
print(" Layer 2-3: Textures, patterns")
print(" Layer 4-5: Object parts (wheels, eyes, etc.)")
print(" Final layers: Whole objects (cars, faces, etc.)")
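To try these helpers on the CIFAR-10 CNN trained earlier (assuming model still refers to that network; Keras auto-generates layer names, so list them first rather than guessing):

# Find the convolutional layers by name
conv_layers = [l.name for l in model.layers if 'conv' in l.name]
print(conv_layers)

# Filters of the first conv layer, and its response to one test image
visualize_filters(model, conv_layers[0])
visualize_feature_maps(model, X_test[0], conv_layers[0])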
🎯 Key Takeaways
- Images are arrays: Height × Width × Channels (RGB)
- Preprocessing: Resize, normalize, augment data
- CNNs: Designed for images, learn features automatically
- Convolution: Apply filters to detect patterns
- Pooling: Reduce dimensions, keep important features
- Transfer learning: Use pre-trained models for faster, better results
- Data augmentation: Create variations to prevent overfitting
- Object detection: Find and classify multiple objects