What is Computer Vision?
Computer vision enables machines to extract information from images and videos. It powers applications such as facial recognition, self-driving cars, medical imaging, and more.
Tasks:
- Image classification: What is in the image?
- Object detection: Where are objects located?
- Segmentation: Pixel-level classification
- Face recognition: Identify individuals
- Pose estimation: Track human body positions
📷 Image Basics
import cv2
import numpy as np
import matplotlib.pyplot as plt
# Read the image from disk. OpenCV decodes into BGR channel order and
# returns None (rather than raising) when the file is missing or unreadable,
# so fail early with a clear message instead of a cryptic cvtColor error.
image_path = 'image.jpg'
img = cv2.imread(image_path)
if img is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")
# Matplotlib expects RGB, so swap the channel order before displaying.
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
print(f"Shape: {img_rgb.shape}")  # (height, width, channels)
print(f"Data type: {img_rgb.dtype}")  # uint8 (0-255)
# Display
plt.figure(figsize=(10, 6))
plt.imshow(img_rgb)
plt.axis('off')
plt.show()
# Access pixel: NumPy indexing is [row, column], i.e. [y, x].
# NOTE: this raises IndexError if the image is smaller than 101x101.
pixel = img_rgb[100, 100]  # (R, G, B)
print(f"Pixel at (100,100): {pixel}")
# Convert to grayscale from the original BGR data; the result is 2-D
# (height, width) with no channel axis.
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
print(f"Grayscale shape: {gray.shape}")
🎨 Image Preprocessing
from PIL import Image
from skimage import transform
# Resize to a fixed 224x224; cv2.resize takes the target size as (width, height).
resized = cv2.resize(img_rgb, (224, 224))
print(f"Resized: {resized.shape}")
# Crop with plain NumPy slicing: rows 50-199, columns 100-299.
cropped = img_rgb[50:200, 100:300]
# Rotate by 45 degrees with scikit-image.
rotated = transform.rotate(img_rgb, angle=45)
# Flip: code 1 mirrors left-right, code 0 mirrors top-bottom.
flipped_h = cv2.flip(img_rgb, 1)  # Horizontal
flipped_v = cv2.flip(img_rgb, 0)  # Vertical
# Normalize
normalized = img_rgb / 255.0  # Scale to [0, 1]
# Visualize every variant in a 2x3 grid, filled row by row.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
panels = [
    ('Original', img_rgb),
    ('Resized', resized),
    ('Cropped', cropped),
    ('Rotated', rotated),
    ('Flipped H', flipped_h),
    ('Flipped V', flipped_v),
]
for ax, (title, picture) in zip(axes.flat, panels):
    ax.imshow(picture)
    ax.set_title(title)
plt.tight_layout()
plt.show()
🔍 Feature Extraction
Edges (Canny)
# Canny operates on a single-channel image, so start from grayscale.
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# threshold1 / threshold2 are the lower and upper hysteresis thresholds.
edges = cv2.Canny(gray, threshold1=100, threshold2=200)
# Show the input and the edge map side by side.
plt.figure(figsize=(12, 6))
for position, picture, title in ((121, gray, 'Original'), (122, edges, 'Edges')):
    plt.subplot(position)
    plt.imshow(picture, cmap='gray')
    plt.title(title)
plt.show()
SIFT Features
# SIFT (Scale-Invariant Feature Transform): detect keypoints and compute
# their descriptors on the grayscale image in one call.
sift = cv2.SIFT_create()
# mask=None means the whole image is searched.
keypoints, descriptors = sift.detectAndCompute(gray, mask=None)
# outImage=None lets OpenCV allocate the annotated copy itself.
img_kp = cv2.drawKeypoints(img_rgb, keypoints, outImage=None)
plt.imshow(img_kp)
plt.title(f'SIFT Keypoints: {len(keypoints)}')
plt.show()
🖼️ Image Classification with CNN
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.datasets import cifar10
# Load CIFAR-10 (32x32 RGB images, 10 classes, integer labels).
(X_train, y_train), (X_test, y_test) = cifar10.load_data()
# Normalize to [0, 1]. Cast to float32 first: dividing the uint8 arrays
# directly promotes them to float64, doubling memory for no accuracy gain.
X_train = X_train.astype('float32') / 255.0
X_test = X_test.astype('float32') / 255.0
print(f"Train: {X_train.shape}, Test: {X_test.shape}")
# Build CNN: three conv stages followed by a small dense classifier.
# The input shape is declared with keras.Input rather than the deprecated
# `input_shape=` argument on the first layer.
model = keras.Sequential([
    keras.Input(shape=(32, 32, 3)),
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(10, activation='softmax'),
])
# Labels are integer class ids, hence the *sparse* categorical loss.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
# Train. Validate on a 10% hold-out of the training data so the test set
# stays unseen until the final evaluation below (validating on the test
# set would make the reported test accuracy optimistic).
history = model.fit(X_train, y_train, epochs=10,
                    validation_split=0.1,
                    batch_size=64)
# Evaluate once on the untouched test set.
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test accuracy: {test_acc:.3f}")
🎯 Transfer Learning
from tensorflow.keras.applications import VGG16, ResNet50
from tensorflow.keras.preprocessing import image
# Load VGG16 pre-trained on ImageNet, without its classification head
# (include_top=False), for 224x224 RGB inputs.
base_model = VGG16(weights='imagenet', include_top=False,
                   input_shape=(224, 224, 3))
# Freeze base layers so only the new head is trained at first.
base_model.trainable = False
# Add custom layers on top of the frozen convolutional base.
model = keras.Sequential([
    base_model,
    layers.GlobalAveragePooling2D(),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(10, activation='softmax'),  # 10 classes
])
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
model.summary()
# Fine-tuning: unfreeze the base, then re-freeze everything except its
# last four layers.
base_model.trainable = True
for layer in base_model.layers[:-4]:
    layer.trainable = False
# Recompile so the trainable changes take effect, with a much lower
# learning rate to avoid destroying the pre-trained weights.
model.compile(optimizer=keras.optimizers.Adam(1e-5),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
📦 Object Detection - YOLO Concept
# YOLO (You Only Look Once) - Single pass detection
# For actual implementation, use pre-trained models:
# pip install ultralytics
from ultralytics import YOLO
# Load pre-trained model
model = YOLO('yolov8n.pt')  # nano model
# Detect objects; calling the model returns one result per input image.
results = model('image.jpg')
# Process results: each result holds the boxes detected in one image.
for result in results:
    for box in result.boxes:
        # The box attributes are tensors; convert them to plain Python
        # numbers so they print cleanly instead of as `tensor(...)`.
        x1, y1, x2, y2 = box.xyxy[0].tolist()
        conf = float(box.conf[0])
        cls = int(box.cls[0])
        # result.names maps the integer class id to its readable label.
        label = result.names[cls]
        print(f"Class: {label}, Confidence: {conf:.2f}")
        print(f"Box: ({x1:.0f},{y1:.0f}) to ({x2:.0f},{y2:.0f})")
🎭 Face Detection
# Haar Cascade for face detection, using the frontal-face model that
# ships with OpenCV.
face_cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
)
# CascadeClassifier does not raise on a bad path; it just stays empty.
if face_cascade.empty():
    raise RuntimeError("Failed to load Haar cascade: haarcascade_frontalface_default.xml")
# Read image; imread returns None when the file cannot be read.
image_path = 'people.jpg'
img = cv2.imread(image_path)
if img is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Detect faces; each detection is an (x, y, width, height) rectangle.
faces = face_cascade.detectMultiScale(
    gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30)
)
print(f"Found {len(faces)} faces")
# Draw rectangles. `img` is still BGR here, so (0, 255, 0) is green.
for (x, y, w, h) in faces:
    cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
# Convert once, after all boxes are drawn, for display with Matplotlib.
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
plt.imshow(img_rgb)
plt.title('Face Detection')
plt.show()
🎨 Image Segmentation
# Simple thresholding segmentation on the grayscale version of `img`
# (the image loaded earlier in the script).
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Binary threshold: pixels above 127 become 255, the rest 0.
_, binary = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)
# Otsu's method picks the threshold automatically; the 0 passed as the
# threshold value is ignored when THRESH_OTSU is set.
otsu_flags = cv2.THRESH_BINARY | cv2.THRESH_OTSU
_, otsu = cv2.threshold(gray, 0, 255, otsu_flags)
# Adaptive threshold: a per-pixel threshold from a Gaussian-weighted
# 11x11 neighbourhood, minus the constant 2.
adaptive = cv2.adaptiveThreshold(
    gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
    cv2.THRESH_BINARY, 11, 2
)
# Display all four results in one row.
fig, axes = plt.subplots(1, 4, figsize=(16, 4))
panels = (
    (gray, 'Original'),
    (binary, 'Binary'),
    (otsu, 'Otsu'),
    (adaptive, 'Adaptive'),
)
for ax, (picture, title) in zip(axes, panels):
    ax.imshow(picture, cmap='gray')
    ax.set_title(title)
plt.show()
🔄 Data Augmentation
from tensorflow.keras.preprocessing.image import ImageDataGenerator
# Create augmentation: random rotations, shifts, shear, zoom and
# horizontal flips, filling newly exposed pixels with the nearest value.
datagen = ImageDataGenerator(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)
# Load sample image; flow() needs a leading batch axis, hence (1, 32, 32, 3).
sample_img = X_train[0].reshape((1, 32, 32, 3))
# Generate augmented images into a 2x4 grid.
fig, axes = plt.subplots(2, 4, figsize=(16, 8))
axes = axes.ravel()
# flow() yields batches endlessly; pairing it with the eight axes stops
# the loop after exactly eight augmented samples.
augmented_batches = datagen.flow(sample_img, batch_size=1)
for ax, batch in zip(axes, augmented_batches):
    ax.imshow(batch[0])
    ax.axis('off')
plt.suptitle('Data Augmentation Examples')
plt.show()
💡 Best Practices
- Normalize inputs: Scale pixels to [0,1] or standardize
- Use data augmentation: Prevents overfitting
- Start with transfer learning: Pre-trained models save time
- Resize images consistently: Fixed input size for CNNs
- Use GPU: Essential for training CNNs
- Monitor validation loss: Early stopping prevents overfitting
- Try different architectures: VGG, ResNet, EfficientNet
- Fine-tune carefully: Lower learning rate for pre-trained models
🎯 Key Takeaways
- OpenCV & PIL for image processing
- CNNs are standard for image tasks
- Transfer learning with VGG16, ResNet50, EfficientNet
- Data augmentation essential for small datasets
- YOLO for real-time object detection
- Haar Cascades for fast face detection
- Normalize and resize images before training