What is Computer Vision?
Computer vision enables computers to interpret and act on visual information from images and video. It powers facial recognition, self-driving cars, medical image analysis, and more. The field's breakthrough came with Convolutional Neural Networks (CNNs), which learn visual features automatically from raw pixels instead of relying on hand-engineered ones.
Real-World Applications:
- Healthcare: Detecting diseases from X-rays and MRIs
- Autonomous vehicles: Recognizing pedestrians, signs, lanes
- Security: Facial recognition, surveillance
- Retail: Product recognition, inventory management
- Agriculture: Crop disease detection
🖼️ Understanding Images
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import cv2
# Images are just arrays of numbers!
# Load an image
img = plt.imread('sample_image.jpg') # Or use Image.open()
print("Image shape:", img.shape) # (height, width, channels)
print("Data type:", img.dtype)
print("Value range:", img.min(), "to", img.max())
# For RGB images: shape is (H, W, 3)
# Channel 0 = Red, Channel 1 = Green, Channel 2 = Blue
# Visualize image
plt.figure(figsize=(15, 5))
plt.subplot(1, 4, 1)
plt.imshow(img)
plt.title('Original Image')
plt.axis('off')
plt.subplot(1, 4, 2)
plt.imshow(img[:, :, 0], cmap='Reds')
plt.title('Red Channel')
plt.axis('off')
plt.subplot(1, 4, 3)
plt.imshow(img[:, :, 1], cmap='Greens')
plt.title('Green Channel')
plt.axis('off')
plt.subplot(1, 4, 4)
plt.imshow(img[:, :, 2], cmap='Blues')
plt.title('Blue Channel')
plt.axis('off')
plt.tight_layout()
plt.show()
# Grayscale conversion (note: plt.imread returns RGB, while cv2.imread returns BGR)
gray_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
print("\nGrayscale shape:", gray_img.shape)  # (H, W) - single channel
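To make the "images are just arrays of numbers" point concrete, here is a minimal sketch that builds a tiny 2×2 RGB image by hand, with no image file needed (tiny is an illustrative name):

# Each pixel is three uint8 values: (R, G, B)
tiny = np.array([
    [[255, 0, 0],  [0, 255, 0]],       # red, green
    [[0, 0, 255],  [255, 255, 255]]    # blue, white
], dtype=np.uint8)
print("Shape:", tiny.shape)  # (2, 2, 3): height, width, channels
plt.imshow(tiny)
plt.axis('off')
plt.show()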
🔄 Image Preprocessing
from tensorflow.keras.preprocessing.image import ImageDataGenerator
# Essential preprocessing steps
# 1. Resize images (models expect a fixed input size)
def resize_image(img, target_size=(224, 224)):
    """Resize to target dimensions (cv2.resize takes (width, height))"""
    return cv2.resize(img, target_size)
# 2. Normalize pixel values (0-255 → 0-1)
def normalize_image(img):
    """Scale pixel values to [0, 1]"""
    return img.astype('float32') / 255.0
# 3. Standardize (mean=0, std=1)
def standardize_image(img):
    """Center around mean with unit variance"""
    mean = np.mean(img)
    std = np.std(img)
    return (img - mean) / std
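In practice these steps are chained before feeding a model. A minimal sketch of the usual order, assuming img is the RGB array loaded earlier:

# Resize first, then scale to [0, 1]
processed = normalize_image(resize_image(img))
print(processed.shape, processed.dtype)  # (224, 224, 3) float32
print(processed.min(), processed.max())  # roughly 0.0 to 1.0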
# Image augmentation - Create variations for training
datagen = ImageDataGenerator(
    rotation_range=20,            # Rotate up to 20 degrees
    width_shift_range=0.2,        # Shift horizontally
    height_shift_range=0.2,       # Shift vertically
    horizontal_flip=True,         # Flip left-right
    zoom_range=0.2,               # Zoom in/out
    shear_range=0.2,              # Shear transformation
    brightness_range=[0.8, 1.2],  # Adjust brightness
    fill_mode='nearest'           # Fill empty pixels
)
# Apply augmentation
img_array = img.reshape((1,) + img.shape) # Add batch dimension
plt.figure(figsize=(15, 3))
for i, batch in enumerate(datagen.flow(img_array, batch_size=1)):
    plt.subplot(1, 6, i + 1)
    plt.imshow(batch[0].astype('uint8'))
    plt.title(f'Augmented {i+1}')
    plt.axis('off')
    if i >= 5:  # Show 6 examples
        break
plt.tight_layout()
plt.show()
print("✓ Augmentation effectively enlarges the training set and helps prevent overfitting!")
🧠 Convolutional Neural Networks (CNNs)
# CNNs are designed for image data
# Key components:
# 1. Convolutional layers - Detect features
# 2. Pooling layers - Reduce dimensions
# 3. Fully connected layers - Make predictions
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# Simple CNN architecture
def create_simple_cnn(input_shape=(32, 32, 3), num_classes=10):
    """
    Build a basic CNN.

    Args:
        input_shape: Image dimensions (height, width, channels)
        num_classes: Number of output classes
    """
    model = keras.Sequential([
        # First convolutional block
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape, padding='same'),
        layers.BatchNormalization(),
        layers.Conv2D(32, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2)),
        layers.Dropout(0.25),
        # Second convolutional block
        layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2)),
        layers.Dropout(0.25),
        # Third convolutional block
        layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2)),
        layers.Dropout(0.25),
        # Fully connected layers
        layers.Flatten(),
        layers.Dense(256, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation='softmax')
    ])
    return model
# Create model
model = create_simple_cnn()
model.summary()
# Explanation of layers:
print("\nLayer Breakdown:")
print("Conv2D: Applies filters to detect features (edges, textures, etc.)")
print("MaxPooling2D: Reduces spatial dimensions, keeps important features")
print("BatchNormalization: Normalizes layer outputs (faster training)")
print("Dropout: Randomly disables neurons (prevents overfitting)")
print("Flatten: Converts 2D feature maps to 1D vector")
print("Dense: Fully connected layer for classification")
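The shapes in model.summary() follow from standard convolution arithmetic: output size = (input + 2·padding − kernel) / stride + 1. A minimal sketch of that formula (the helper name is illustrative):

def conv_output_size(input_size, kernel_size, stride=1, padding=0):
    """Spatial output size of a conv or pooling layer: (n + 2p - k) // s + 1"""
    return (input_size + 2 * padding - kernel_size) // stride + 1

# A 3x3 'same'-padded conv (p=1, s=1) preserves a 32x32 input
print(conv_output_size(32, 3, stride=1, padding=1))  # 32
# A 2x2 max pool with stride 2 halves it
print(conv_output_size(32, 2, stride=2))             # 16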
How Convolution Works:
# Convolution example: Apply a filter to detect edges
# Simple 3x3 filter that responds to horizontal edges (vertical intensity changes)
edge_filter = np.array([
    [-1, -1, -1],
    [ 0,  0,  0],
    [ 1,  1,  1]
])
# Apply filter to image
def apply_filter(img, kernel):
    """Convolve image with kernel"""
    return cv2.filter2D(img, -1, kernel)
# Load grayscale image
gray_img = cv2.imread('sample.jpg', cv2.IMREAD_GRAYSCALE)
# Apply edge detection
edges = apply_filter(gray_img, edge_filter)
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.imshow(gray_img, cmap='gray')
plt.title('Original Image')
plt.subplot(1, 2, 2)
plt.imshow(edges, cmap='gray')
plt.title('After Edge Filter')
plt.show()
print("CNNs learn these filters automatically during training!")
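To demystify what cv2.filter2D just did, here is a minimal NumPy sketch of the sliding-window operation (strictly speaking cross-correlation, which is what CNN libraries actually compute; no padding, stride 1; convolve2d and patch are illustrative names):

def convolve2d(img, kernel):
    """Slide the kernel over the image, summing elementwise products."""
    kh, kw = kernel.shape
    h, w = img.shape
    out = np.zeros((h - kh + 1, w - kw + 1))
    for i in range(out.shape[0]):
        for j in range(out.shape[1]):
            out[i, j] = np.sum(img[i:i + kh, j:j + kw] * kernel)
    return out

# Apply to a 64x64 patch of the grayscale image
patch = gray_img[:64, :64].astype('float32')
print(convolve2d(patch, edge_filter).shape)  # (62, 62) - no padding shrinks the output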
🎯 Complete Image Classification Example
# Train CNN on CIFAR-10 dataset
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.utils import to_categorical
# Load CIFAR-10 (60,000 32x32 color images in 10 classes)
(X_train, y_train), (X_test, y_test) = cifar10.load_data()
# Class names
class_names = ['airplane', 'automobile', 'bird', 'cat', 'deer',
               'dog', 'frog', 'horse', 'ship', 'truck']
print(f"Training images: {X_train.shape}")
print(f"Test images: {X_test.shape}")
# Preprocess data
X_train = X_train.astype('float32') / 255.0
X_test = X_test.astype('float32') / 255.0
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)
# Create and compile model
model = create_simple_cnn(input_shape=(32, 32, 3), num_classes=10)
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
# Train model
print("\nTraining CNN...")
history = model.fit(
    X_train, y_train,
    batch_size=128,
    epochs=20,
    validation_split=0.1,
    callbacks=[
        keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
        keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=2)
    ]
)
# Evaluate
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"\nTest Accuracy: {test_accuracy:.2%}")
# Make predictions
predictions = model.predict(X_test[:10])
# Visualize predictions
plt.figure(figsize=(12, 5))  # 2 rows x 5 columns of images
for i in range(10):
    plt.subplot(2, 5, i + 1)
    plt.imshow(X_test[i])
    true_label = class_names[np.argmax(y_test[i])]
    pred_label = class_names[np.argmax(predictions[i])]
    confidence = np.max(predictions[i])
    color = 'green' if true_label == pred_label else 'red'
    plt.title(f'True: {true_label}\nPred: {pred_label}\n({confidence:.2f})',
              color=color, fontsize=8)
    plt.axis('off')
plt.tight_layout()
plt.show()
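It is also worth inspecting the learning curves from the history object returned by fit above; a widening gap between the training and validation curves is the classic sign of overfitting:

plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='validation')
plt.title('Accuracy')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.title('Loss')
plt.legend()
plt.show()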
🔥 Transfer Learning
# Use pre-trained models (trained on millions of images)
from tensorflow.keras.applications import ResNet50, VGG16, MobileNetV2
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions
# Load pre-trained ResNet50 (trained on ImageNet)
base_model = ResNet50(
    weights='imagenet',
    include_top=False,  # Remove classification head
    input_shape=(224, 224, 3)
)
# Freeze pre-trained weights
base_model.trainable = False
# Add custom classification layers
model = keras.Sequential([
    base_model,
    layers.GlobalAveragePooling2D(),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(10, activation='softmax')  # 10 classes
])
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.0001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
print("✓ Transfer learning model ready!")
print(f"Total parameters: {model.count_params():,}")
print(f"Trainable parameters: {sum([tf.size(w).numpy() for w in model.trainable_weights]):,}")
# Why transfer learning?
benefits = {
    'Faster training': 'Pre-trained features already learned',
    'Better accuracy': 'Especially with small datasets',
    'Less data needed': 'Can work with hundreds of images instead of millions',
    'Production ready': 'State-of-the-art architectures'
}
print("\nTransfer Learning Benefits:")
for benefit, reason in benefits.items():
    print(f"  • {benefit}: {reason}")
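Once the new head has converged, a common second stage is fine-tuning: unfreeze the top of the base model and keep training with a much smaller learning rate so the pre-trained weights are only nudged. A minimal sketch (the number of layers to unfreeze is illustrative and dataset-dependent):

# Unfreeze the base, then re-freeze everything except the last ~20 layers
base_model.trainable = True
for layer in base_model.layers[:-20]:
    layer.trainable = False

# Recompile with a much lower learning rate before continuing training
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-5),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)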
Popular Pre-trained Models:
# Compare popular CNN architectures
architectures = {
    'VGG16': {
        'Parameters': '138M',
        'Depth': '16 layers',
        'Best for': 'Simple, interpretable baselines',
        'Speed': 'Slow'
    },
    'ResNet50': {
        'Parameters': '25M',
        'Depth': '50 layers',
        'Best for': 'High accuracy',
        'Speed': 'Medium'
    },
    'InceptionV3': {
        'Parameters': '24M',
        'Depth': '48 layers',
        'Best for': 'Efficiency',
        'Speed': 'Medium'
    },
    'MobileNetV2': {
        'Parameters': '3.5M',
        'Depth': '53 layers',
        'Best for': 'Mobile/edge devices',
        'Speed': 'Fast'
    },
    'EfficientNetB0': {
        'Parameters': '5.3M',
        'Depth': 'Scales across the B0-B7 family',
        'Best for': 'Best accuracy/size ratio',
        'Speed': 'Fast'
    }
}
print("CNN Architecture Comparison:")
print("=" * 60)
for name, specs in architectures.items():
    print(f"\n{name}:")
    for key, value in specs.items():
        print(f"  {key:15s}: {value}")
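Swapping backbones in Keras is nearly a one-line change; a minimal sketch using the MobileNetV2 import from earlier (note that each architecture has its own expected input size and preprocess_input function, so check the model's documentation):

# Same pattern as the ResNet50 example, with a far smaller backbone
mobile_base = MobileNetV2(
    weights='imagenet',
    include_top=False,
    input_shape=(224, 224, 3)
)
mobile_base.trainable = False
print(f"MobileNetV2 parameters: {mobile_base.count_params():,}")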
🎯 Object Detection Basics
# Object detection: Find and classify multiple objects
# Popular approaches:
# 1. YOLO (You Only Look Once) - Fast, real-time
# 2. R-CNN family (R-CNN, Fast R-CNN, Faster R-CNN) - More accurate
# 3. SSD (Single Shot Detector) - Balance of speed and accuracy
# Example using pre-trained YOLO (conceptual)
"""
from ultralytics import YOLO

# Load pre-trained model
model = YOLO('yolov8n.pt')  # Nano version (fastest)

# Detect objects in image
results = model('image.jpg')

# Process results
for result in results:
    boxes = result.boxes
    for box in boxes:
        # Bounding box coordinates
        x1, y1, x2, y2 = box.xyxy[0]
        # Confidence and class
        confidence = box.conf[0]
        class_id = box.cls[0]
        class_name = model.names[int(class_id)]
        print(f"Found {class_name} with {confidence:.2f} confidence at ({x1}, {y1})")
"""
# Object detection output includes:
detection_output = {
    'Bounding boxes': 'Object locations, as corners (x1, y1, x2, y2) or (x, y, width, height) depending on the format',
    'Class labels': 'What object is it?',
    'Confidence scores': 'How sure is the model?',
    'Number of objects': 'Can detect multiple objects per image'
}
print("Object Detection Output:")
for component, description in detection_output.items():
    print(f"  • {component}: {description}")
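Detections are matched to ground truth (and duplicate boxes suppressed) using Intersection over Union (IoU), the ratio of the boxes' overlap to their combined area. A minimal sketch for corner-format (x1, y1, x2, y2) boxes (iou is an illustrative helper):

def iou(box_a, box_b):
    """Intersection over Union of two (x1, y1, x2, y2) boxes."""
    x1 = max(box_a[0], box_b[0])
    y1 = max(box_a[1], box_b[1])
    x2 = min(box_a[2], box_b[2])
    y2 = min(box_a[3], box_b[3])
    inter = max(0, x2 - x1) * max(0, y2 - y1)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    return inter / (area_a + area_b - inter)

print(iou([0, 0, 10, 10], [5, 5, 15, 15]))  # 25 / 175 ≈ 0.14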
# Use cases:
print("\nObject Detection Applications:")
print(" • Self-driving cars: Detect pedestrians, vehicles, signs")
print(" • Retail: Automated checkout, inventory tracking")
print(" • Security: Intrusion detection, crowd monitoring")
print(" • Healthcare: Tumor detection in medical images")
print(" • Agriculture: Crop disease detection, yield estimation")
📊 Visualizing CNN Features
# Understand what CNNs learn
def visualize_filters(model, layer_name):
    """Display filters from a convolutional layer"""
    # Get layer by name
    layer = model.get_layer(layer_name)
    filters, biases = layer.get_weights()
    # Normalize filters to [0, 1] for display
    f_min, f_max = filters.min(), filters.max()
    filters = (filters - f_min) / (f_max - f_min)
    # Plot filters
    n_filters = min(filters.shape[3], 64)  # Show up to 64 filters
    plt.figure(figsize=(12, 12))
    for i in range(n_filters):
        plt.subplot(8, 8, i + 1)
        f = filters[:, :, 0, i]  # First input channel of filter i
        plt.imshow(f, cmap='viridis')
        plt.axis('off')
    plt.suptitle(f'Filters from {layer_name}')
    plt.tight_layout()
    plt.show()
# Visualize feature maps
def visualize_feature_maps(model, image, layer_name):
    """Show what features are detected in an image"""
    # Create a model that outputs the intermediate layer's activations
    feature_model = keras.Model(
        inputs=model.input,
        outputs=model.get_layer(layer_name).output
    )
    # Get features (add a batch dimension first)
    features = feature_model.predict(image[np.newaxis, ...])
    # Plot feature maps
    n_features = min(features.shape[-1], 64)
    plt.figure(figsize=(12, 12))
    for i in range(n_features):
        plt.subplot(8, 8, i + 1)
        plt.imshow(features[0, :, :, i], cmap='viridis')
        plt.axis('off')
    plt.suptitle(f'Feature Maps from {layer_name}')
    plt.tight_layout()
    plt.show()
# Early layers: Detect edges, colors
# Middle layers: Detect textures, patterns
# Late layers: Detect complex objects, parts
print("CNNs learn hierarchical features:")
print(" Layer 1: Edges, gradients, colors")
print(" Layer 2-3: Textures, patterns")
print(" Layer 4-5: Object parts (wheels, eyes, etc.)")
print(" Final layers: Whole objects (cars, faces, etc.)")
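To try these helpers on the CIFAR-10 CNN trained earlier (assuming model still refers to that network; Keras auto-generates layer names, so list them first rather than guessing):

# Find the convolutional layers by name
conv_layers = [l.name for l in model.layers if 'conv' in l.name]
print(conv_layers)

# Filters of the first conv layer, and its response to one test image
visualize_filters(model, conv_layers[0])
visualize_feature_maps(model, X_test[0], conv_layers[0])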
🎯 Key Takeaways
- Images are arrays: Height × Width × Channels (RGB)
- Preprocessing: Resize, normalize, augment data
- CNNs: Designed for images, learn features automatically
- Convolution: Apply filters to detect patterns
- Pooling: Reduce dimensions, keep important features
- Transfer learning: Use pre-trained models for faster, better results
- Data augmentation: Create variations to prevent overfitting
- Object detection: Find and classify multiple objects