Computer Vision

What is Computer Vision?

Computer vision enables machines to extract information from images and videos. It powers facial recognition, self-driving cars, medical imaging, and more.

Tasks:
Image classification: What is in the image?
Object detection: Where are objects located?
Segmentation: Pixel-level classification
Face recognition: Identify individuals
Pose estimation: Track human body positions

            

📷 Image Basics

import cv2
import numpy as np
import matplotlib.pyplot as plt

# Read image. cv2.imread signals failure by returning None instead of
# raising, so check explicitly before the array is first used.
img = cv2.imread('image.jpg')
if img is None:
    raise FileNotFoundError("Could not read image file: 'image.jpg'")

# Reorder channels from BGR to RGB for display with matplotlib
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

print(f"Shape: {img_rgb.shape}")  # (height, width, channels)
print(f"Data type: {img_rgb.dtype}")  # uint8 (0-255)

# Display
plt.figure(figsize=(10, 6))
plt.imshow(img_rgb)
plt.axis('off')
plt.show()

# Access pixel: indexing is [row, column]; this raises IndexError if the
# image is smaller than 101x101
pixel = img_rgb[100, 100]  # (R, G, B)
print(f"Pixel at (100,100): {pixel}")

# Convert to grayscale (from the original BGR array, hence BGR2GRAY)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
print(f"Grayscale shape: {gray.shape}")

🎨 Image Preprocessing

from PIL import Image
from skimage import transform

# Resize to a fixed 224x224 target
resized = cv2.resize(img_rgb, (224, 224))
print(f"Resized: {resized.shape}")

# Crop with array slicing: rows 50-199, columns 100-299
cropped = img_rgb[50:200, 100:300]

# Rotate by 45 degrees
rotated = transform.rotate(img_rgb, angle=45)

# Flip: code 1 mirrors horizontally, code 0 mirrors vertically
flipped_h = cv2.flip(img_rgb, 1)
flipped_v = cv2.flip(img_rgb, 0)

# Normalize: scale pixel values to [0, 1]
normalized = img_rgb / 255.0

# Visualize every variant on a 2x3 grid, filled row by row
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
panels = [
    ('Original', img_rgb),
    ('Resized', resized),
    ('Cropped', cropped),
    ('Rotated', rotated),
    ('Flipped H', flipped_h),
    ('Flipped V', flipped_v),
]
for ax, (title, picture) in zip(axes.ravel(), panels):
    ax.imshow(picture)
    ax.set_title(title)
plt.tight_layout()
plt.show()

🔍 Feature Extraction

Edges (Canny)

# Detect edges on a grayscale version of the image
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
edges = cv2.Canny(gray, threshold1=100, threshold2=200)

# Show the grayscale input and the edge map side by side
plt.figure(figsize=(12, 6))
for position, (picture, title) in enumerate(
        [(gray, 'Original'), (edges, 'Edges')], start=1):
    plt.subplot(1, 2, position)
    plt.imshow(picture, cmap='gray')
    plt.title(title)
plt.show()

SIFT Features

# Scale-Invariant Feature Transform: detect keypoints and compute their
# descriptors on the grayscale image (no mask is supplied)
sift = cv2.SIFT_create()
no_mask = None
keypoints, descriptors = sift.detectAndCompute(gray, no_mask)

# Draw the detected keypoints over the RGB image and report how many exist
img_kp = cv2.drawKeypoints(img_rgb, keypoints, None)
plt.imshow(img_kp)
plt.title(f'SIFT Keypoints: {len(keypoints)}')
plt.show()

🖼️ Image Classification with CNN

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.datasets import cifar10

# Load CIFAR-10
(X_train, y_train), (X_test, y_test) = cifar10.load_data()

# Normalize to [0, 1]. Cast to float32 first: dividing the uint8 arrays
# directly would produce float64 and double the memory footprint.
X_train = X_train.astype("float32") / 255.0
X_test = X_test.astype("float32") / 255.0

print(f"Train: {X_train.shape}, Test: {X_test.shape}")

# Build CNN: three convolution stages followed by a small dense classifier
model = keras.Sequential([
    layers.Conv2D(32, (3,3), activation='relu', input_shape=(32,32,3)),
    layers.MaxPooling2D((2,2)),
    layers.Conv2D(64, (3,3), activation='relu'),
    layers.MaxPooling2D((2,2)),
    layers.Conv2D(64, (3,3), activation='relu'),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(10, activation='softmax')
])

# Labels are integer class ids, hence the sparse categorical loss
model.compile(optimizer='adam',
             loss='sparse_categorical_crossentropy',
             metrics=['accuracy'])

# Train. Validate on a slice held out from the training data rather than on
# the test set, so the test set stays unseen until the final evaluation.
history = model.fit(X_train, y_train, epochs=10,
                   validation_split=0.1,
                   batch_size=64)

# Evaluate once on the untouched test set
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test accuracy: {test_acc:.3f}")

🎯 Transfer Learning

from tensorflow.keras.applications import VGG16, ResNet50
from tensorflow.keras.preprocessing import image

# Load pre-trained model without its classification head
base_model = VGG16(weights='imagenet', include_top=False,
                   input_shape=(224, 224, 3))

# Freeze base layers so only the new head is trained at first
base_model.trainable = False

# Add custom layers
model = keras.Sequential([
    base_model,
    layers.GlobalAveragePooling2D(),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(10, activation='softmax')  # 10 classes
])

model.compile(optimizer='adam',
             loss='sparse_categorical_crossentropy',
             metrics=['accuracy'])

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
model.summary()

# Fine-tuning: Unfreeze last layers (all but the final four stay frozen)
base_model.trainable = True
for layer in base_model.layers[:-4]:
    layer.trainable = False

# Recompile so the changed trainable flags take effect; use a much lower
# learning rate for the pre-trained weights.
model.compile(optimizer=keras.optimizers.Adam(1e-5),
             loss='sparse_categorical_crossentropy',
             metrics=['accuracy'])

📦 Object Detection - YOLO Concept

# YOLO (You Only Look Once) - Single pass detection
# For actual implementation, use pre-trained models:
# pip install ultralytics

from ultralytics import YOLO

# Load pre-trained model
model = YOLO('yolov8n.pt')  # nano model

# Detect objects
results = model('image.jpg')

# Process results
for result in results:
    boxes = result.boxes
    for box in boxes:
        # Box fields are tensors; convert them to plain Python numbers so
        # the output shows values rather than tensor reprs.
        x1, y1, x2, y2 = box.xyxy[0].tolist()
        conf = float(box.conf[0])
        cls = int(box.cls[0])
        # result.names maps the integer class id to its readable label
        label = result.names[cls]
        print(f"Class: {label} ({cls}), Confidence: {conf:.2f}")
        print(f"Box: ({x1:.1f},{y1:.1f}) to ({x2:.1f},{y2:.1f})")

🎭 Face Detection

# Haar Cascade for face detection
face_cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
)
# A cascade file that fails to load yields an empty classifier rather than
# an exception, so check before using it.
if face_cascade.empty():
    raise RuntimeError(
        "Failed to load cascade: haarcascade_frontalface_default.xml"
    )

# Read image (cv2.imread returns None when the file cannot be read)
img = cv2.imread('people.jpg')
if img is None:
    raise FileNotFoundError("Could not read image file: 'people.jpg'")
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# Detect faces
faces = face_cascade.detectMultiScale(
    gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30)
)

print(f"Found {len(faces)} faces")

# Draw rectangles (green in BGR, 2 px thick) directly onto img
for (x, y, w, h) in faces:
    cv2.rectangle(img, (x, y), (x+w, y+h), (0, 255, 0), 2)

# Convert to RGB for display with matplotlib
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
plt.imshow(img_rgb)
plt.title('Face Detection')
plt.show()

🎨 Image Segmentation

# Simple thresholding segmentation on the grayscale image
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# Binary threshold with a fixed cut-off of 127
_, binary = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)

# Otsu's method (automatic threshold)
otsu_flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU
_, otsu = cv2.threshold(gray, 0, 255, otsu_flags)

# Adaptive threshold: Gaussian-weighted, block size 11, constant 2
adaptive = cv2.adaptiveThreshold(
    gray,
    255,
    cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
    cv2.THRESH_BINARY,
    11,
    2,
)

# Display the input next to the three segmentation results
fig, axes = plt.subplots(1, 4, figsize=(16, 4))
views = [
    (gray, 'Original'),
    (binary, 'Binary'),
    (otsu, 'Otsu'),
    (adaptive, 'Adaptive'),
]
for ax, (picture, title) in zip(axes, views):
    ax.imshow(picture, cmap='gray')
    ax.set_title(title)
plt.show()

🔄 Data Augmentation

from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Create augmentation: random rotations, shifts, shear, zoom and
# horizontal flips, with nearest-pixel fill
augment_options = dict(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)
datagen = ImageDataGenerator(**augment_options)

# Load sample image as a batch of one
sample_img = X_train[0].reshape(1, 32, 32, 3)

# Generate augmented images on a flattened 2x4 grid
fig, axes = plt.subplots(2, 4, figsize=(16, 8))
axes = axes.ravel()

# flow() yields batches indefinitely; zip stops after the eight axes are
# used, so exactly eight batches are drawn.
for ax, batch in zip(axes, datagen.flow(sample_img, batch_size=1)):
    ax.imshow(batch[0])
    ax.axis('off')

plt.suptitle('Data Augmentation Examples')
plt.show()

💡 Best Practices

Normalize inputs: Scale pixels to [0,1] or standardize
Use data augmentation: Prevents overfitting
Start with transfer learning: Pre-trained models save time
Resize images consistently: Fixed input size for CNNs
Use GPU: Essential for training CNNs
Monitor validation loss: Early stopping prevents overfitting
Try different architectures: VGG, ResNet, EfficientNet
Fine-tune carefully: Lower learning rate for pre-trained models

🎯 Key Takeaways

OpenCV & PIL for image processing
CNNs are standard for image tasks
Transfer learning with VGG16, ResNet50, EfficientNet
Data augmentation is essential for small datasets
YOLO for real-time object detection
Haar Cascades for fast face detection
Normalize and resize images before training