What is Deep Learning?
Deep learning is a subset of machine learning that uses neural networks with multiple layers (hence "deep"). These networks automatically learn hierarchical representations of data, which makes them well suited to complex tasks such as image recognition, language understanding, and game playing.
Why "Deep"?
- Multiple layers: Input → Hidden Layer 1 → Hidden Layer 2 → ... → Output
- Hierarchical learning: Early layers learn simple patterns, later layers learn complex concepts
- Automatic features: No manual feature engineering needed
- Scalable: Performance improves with more data
🏗️ Neural Network Architecture
import numpy as np
import matplotlib.pyplot as plt

# Visualize network architecture
def visualize_network():
    """Display neural network structure"""
    layers = {
        'Input Layer': 4,      # 4 input features
        'Hidden Layer 1': 8,   # 8 neurons
        'Hidden Layer 2': 6,   # 6 neurons
        'Output Layer': 3      # 3 classes
    }

    print("Neural Network Architecture:")
    print("=" * 50)
    for layer_name, neurons in layers.items():
        print(f"{layer_name:20s}: {neurons} neurons")
        if 'Hidden' in layer_name:
            print(f"{'':20s} Activation: ReLU")
        elif 'Output' in layer_name:
            print(f"{'':20s} Activation: Softmax")

    # Calculate total parameters: (inputs x neurons + biases) for each weight matrix
    total_params = (4 * 8 + 8) + (8 * 6 + 6) + (6 * 3 + 3)
    print(f"\nTotal Parameters: {total_params}")

visualize_network()

# What each layer does:
# Input Layer:   Receives raw data
# Hidden Layers: Learn increasingly complex patterns
# Output Layer:  Makes the final prediction
Layer Types (a short Keras sketch follows this list):
- Dense (Fully Connected): Every neuron connects to all previous neurons
- Convolutional: For image data, learns spatial patterns
- Recurrent: For sequential data, has memory
- Dropout: Randomly disables neurons during training (prevents overfitting)
- Batch Normalization: Normalizes layer inputs (speeds up training)
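These layer types compose naturally. Below is a minimal Keras sketch of how they stack; the layer sizes, dropout rates, and 10-feature input shape are arbitrary illustration values, not recommendations.
# Minimal sketch: stacking the layer types above in Keras (sizes are illustrative)
from tensorflow import keras

layer_demo = keras.Sequential([
    keras.layers.Dense(32, activation='relu', input_shape=(10,)),  # Dense (fully connected)
    keras.layers.BatchNormalization(),                             # Normalizes layer inputs
    keras.layers.Dropout(0.3),                                     # Randomly disables 30% of neurons while training
    keras.layers.Dense(3, activation='softmax')                    # Output layer
])
layer_demo.summary()

# Convolutional and recurrent layers follow the same pattern, e.g.:
#   keras.layers.Conv2D(16, kernel_size=3, activation='relu')   # spatial patterns in images
#   keras.layers.LSTM(32)                                        # memory over sequences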
⚡ Activation Functions
# Common activation functions

# 1. ReLU (Rectified Linear Unit) - most popular
def relu(x):
    """f(x) = max(0, x)"""
    return np.maximum(0, x)

# 2. Sigmoid - for binary classification
def sigmoid(x):
    """f(x) = 1 / (1 + e^-x)"""
    return 1 / (1 + np.exp(-x))

# 3. Tanh (hyperbolic tangent) - centered around 0
def tanh(x):
    """f(x) = (e^x - e^-x) / (e^x + e^-x)"""
    return np.tanh(x)

# 4. Softmax - for multi-class classification
def softmax(x):
    """Converts scores to probabilities (row-wise, so it also works on batches)"""
    exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))  # Subtract max for numerical stability
    return exp_x / exp_x.sum(axis=-1, keepdims=True)

# Visualize activation functions
x = np.linspace(-5, 5, 100)
plt.figure(figsize=(12, 8))

plt.subplot(2, 2, 1)
plt.plot(x, relu(x), 'b-', linewidth=2)
plt.title('ReLU: f(x) = max(0, x)')
plt.grid(True)
plt.axhline(y=0, color='k', linestyle='--', alpha=0.3)
plt.axvline(x=0, color='k', linestyle='--', alpha=0.3)

plt.subplot(2, 2, 2)
plt.plot(x, sigmoid(x), 'r-', linewidth=2)
plt.title('Sigmoid: f(x) = 1/(1+e^-x)')
plt.grid(True)
plt.axhline(y=0, color='k', linestyle='--', alpha=0.3)

plt.subplot(2, 2, 3)
plt.plot(x, tanh(x), 'g-', linewidth=2)
plt.title('Tanh: f(x) = (e^x - e^-x)/(e^x + e^-x)')
plt.grid(True)
plt.axhline(y=0, color='k', linestyle='--', alpha=0.3)

plt.subplot(2, 2, 4)
test_scores = np.array([2.0, 1.0, 0.1])
probs = softmax(test_scores)
plt.bar(['Class A', 'Class B', 'Class C'], probs)
plt.title('Softmax: Converts scores to probabilities')
plt.ylabel('Probability')

plt.tight_layout()
plt.show()

# When to use which?
activation_guide = {
    'ReLU': 'Hidden layers (default choice)',
    'Sigmoid': 'Binary classification output',
    'Softmax': 'Multi-class classification output',
    'Tanh': 'Hidden layers (when data is centered at 0)'
}
print("\nActivation Function Guide:")
for func, usage in activation_guide.items():
    print(f"  {func:10s}: {usage}")
🔄 Forward Propagation
# Forward pass: Input → Hidden → Output
class NeuralNetwork:
    def __init__(self, input_size, hidden_size, output_size):
        """Initialize network with random weights"""
        # Weights (He initialization, suited to ReLU)
        self.W1 = np.random.randn(input_size, hidden_size) * np.sqrt(2.0 / input_size)
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * np.sqrt(2.0 / hidden_size)
        self.b2 = np.zeros((1, output_size))

    def forward(self, X):
        """Forward propagation"""
        # Layer 1: Input → Hidden
        self.z1 = np.dot(X, self.W1) + self.b1   # Linear transformation
        self.a1 = relu(self.z1)                  # Apply activation
        # Layer 2: Hidden → Output
        self.z2 = np.dot(self.a1, self.W2) + self.b2
        self.a2 = softmax(self.z2)               # Output probabilities
        return self.a2

    def predict(self, X):
        """Make predictions"""
        probs = self.forward(X)
        return np.argmax(probs, axis=1)

# Example usage
nn = NeuralNetwork(input_size=4, hidden_size=8, output_size=3)

# Sample input
sample_input = np.array([[1.0, 2.0, 3.0, 4.0]])

# Forward pass
output = nn.forward(sample_input)
print("Input shape:", sample_input.shape)
print("Output probabilities:", output)
print("Predicted class:", np.argmax(output))
What Happens in a Forward Pass:
- Linear transformation: z = W·x + b (worked through on a tiny example after this list)
- Activation function: a = activation(z)
- Repeat for each layer
- Final output: Predictions or probabilities
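As a sanity check on the first two steps, here is one layer worked by hand; the weights, bias, and input values below are made up purely for illustration.
# Worked example of one layer: z = W·x + b, then a = ReLU(z) (numbers are illustrative)
x_tiny = np.array([[1.0, 2.0]])            # 1 sample, 2 features
W_tiny = np.array([[0.5, -1.0,  0.2],
                   [0.1,  0.3, -0.4]])     # 2 inputs → 3 neurons
b_tiny = np.array([[0.0,  0.1, -0.2]])

z_tiny = np.dot(x_tiny, W_tiny) + b_tiny   # [[ 0.7, -0.3, -0.8]]
a_tiny = relu(z_tiny)                      # [[ 0.7,  0.0,  0.0]]
print("z:", z_tiny)
print("a:", a_tiny)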
🎯 Loss Functions
# Loss functions measure prediction error

# 1. Mean Squared Error (MSE) - for regression
def mse_loss(y_true, y_pred):
    """Average of squared differences"""
    return np.mean((y_true - y_pred) ** 2)

# 2. Binary Cross-Entropy - for binary classification
def binary_crossentropy(y_true, y_pred):
    """Loss for binary (0 or 1) predictions"""
    epsilon = 1e-7  # Avoid log(0)
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

# 3. Categorical Cross-Entropy - for multi-class classification
def categorical_crossentropy(y_true, y_pred):
    """Loss for multi-class predictions (y_true is one-hot encoded)"""
    epsilon = 1e-7
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    return -np.sum(y_true * np.log(y_pred)) / y_true.shape[0]

# Example: Multi-class classification
y_true = np.array([[1, 0, 0]])        # True class: 0
y_pred = np.array([[0.7, 0.2, 0.1]])  # Predicted probabilities
loss = categorical_crossentropy(y_true, y_pred)
print(f"Loss: {loss:.4f}")
print("\nLower loss = Better predictions")
print("Perfect prediction (1.0, 0.0, 0.0) would give loss ≈ 0")
🔁 Backpropagation
# Backpropagation: Calculate gradients and update weights
class NeuralNetworkWithBackprop(NeuralNetwork):
    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01):
        super().__init__(input_size, hidden_size, output_size)
        self.learning_rate = learning_rate

    def backward(self, X, y):
        """Backpropagation: Compute gradients"""
        m = X.shape[0]  # Number of samples

        # Output layer gradients
        dz2 = self.a2 - y                 # Derivative of softmax + cross-entropy
        dW2 = np.dot(self.a1.T, dz2) / m
        db2 = np.sum(dz2, axis=0, keepdims=True) / m

        # Hidden layer gradients
        da1 = np.dot(dz2, self.W2.T)
        dz1 = da1 * (self.z1 > 0)         # ReLU derivative
        dW1 = np.dot(X.T, dz1) / m
        db1 = np.sum(dz1, axis=0, keepdims=True) / m

        # Store gradients
        self.dW1, self.db1 = dW1, db1
        self.dW2, self.db2 = dW2, db2

    def update_weights(self):
        """Gradient descent: Update weights"""
        self.W1 -= self.learning_rate * self.dW1
        self.b1 -= self.learning_rate * self.db1
        self.W2 -= self.learning_rate * self.dW2
        self.b2 -= self.learning_rate * self.db2

    def train_step(self, X, y):
        """One training iteration"""
        # Forward pass
        predictions = self.forward(X)
        # Calculate loss
        loss = categorical_crossentropy(y, predictions)
        # Backward pass
        self.backward(X, y)
        # Update weights
        self.update_weights()
        return loss

# Example training loop
def train_network(X_train, y_train, epochs=100):
    """Train the neural network (y_train is one-hot encoded)"""
    input_size = X_train.shape[1]
    output_size = y_train.shape[1]
    nn = NeuralNetworkWithBackprop(
        input_size=input_size,
        hidden_size=16,
        output_size=output_size,
        learning_rate=0.1
    )
    losses = []
    for epoch in range(epochs):
        loss = nn.train_step(X_train, y_train)
        losses.append(loss)
        if (epoch + 1) % 10 == 0:
            accuracy = np.mean(nn.predict(X_train) == np.argmax(y_train, axis=1))
            print(f"Epoch {epoch+1}/{epochs} - Loss: {loss:.4f}, Accuracy: {accuracy:.2%}")
    return nn, losses

# Visualize training (after calling: nn, losses = train_network(X_train, y_train))
# plt.plot(losses)
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.title('Training Progress')
# plt.show()
Backpropagation Steps:
- Compute loss: How wrong are the predictions?
- Calculate gradients: Use the chain rule to find ∂Loss/∂W
- Update weights: W = W - learning_rate × gradient (a one-parameter sketch follows this list)
- Repeat: Until the loss stops decreasing
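The update rule is easiest to see with a single parameter. The sketch below minimizes L(w) = (w - 3)^2, a toy loss chosen purely for illustration.
# Gradient descent on one parameter: minimize L(w) = (w - 3)**2 (toy example)
w = 0.0                  # Start far from the optimum at w = 3
learning_rate = 0.1
for step in range(25):
    gradient = 2 * (w - 3)               # dL/dw
    w = w - learning_rate * gradient     # W = W - learning_rate × gradient
print(f"w after 25 steps: {w:.4f}  (optimum: 3.0)")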
🚀 Complete Deep Learning Example
# Using TensorFlow/Keras for real-world deep learning
import tensorflow as tf
from tensorflow import keras
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Create synthetic dataset
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_classes=3,
    random_state=42
)

# Split and scale
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build deep neural network
model = keras.Sequential([
    # Hidden layer 1 (receives the 20 input features)
    keras.layers.Dense(64, activation='relu', input_shape=(20,)),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.3),
    # Hidden layer 2
    keras.layers.Dense(32, activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.3),
    # Hidden layer 3
    keras.layers.Dense(16, activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.2),
    # Output layer
    keras.layers.Dense(3, activation='softmax')
])

# Compile model
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',  # Labels are integers, not one-hot
    metrics=['accuracy']
)

# Display architecture
model.summary()

# Train model
print("\nTraining deep neural network...")
history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    verbose=0
)

# Evaluate
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"\nTest Accuracy: {test_accuracy:.2%}")

# Plot training history
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.title('Loss During Training')

plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.title('Accuracy During Training')

plt.tight_layout()
plt.show()
📈 Optimization Techniques
# Modern optimization algorithms

# 1. Stochastic Gradient Descent (SGD)
#    W = W - learning_rate × gradient

# 2. SGD with Momentum
#    velocity = beta × velocity + gradient
#    W = W - learning_rate × velocity

# 3. Adam (Adaptive Moment Estimation) - most popular
#    Combines momentum + adaptive learning rates
#    Works well for most problems

# Compare optimizers
optimizers = {
    'SGD': keras.optimizers.SGD(learning_rate=0.01),
    'SGD + Momentum': keras.optimizers.SGD(learning_rate=0.01, momentum=0.9),
    'RMSprop': keras.optimizers.RMSprop(learning_rate=0.001),
    'Adam': keras.optimizers.Adam(learning_rate=0.001)
}

print("Optimizer Comparison:")
print("=" * 50)
for name, optimizer in optimizers.items():
    print(f"\n{name}:")
    if name == 'SGD':
        print("  • Simple, stable")
        print("  • Can be slow")
    elif 'Momentum' in name:
        print("  • Faster than SGD")
        print("  • Better for noisy gradients")
    elif name == 'RMSprop':
        print("  • Adaptive learning rates")
        print("  • Good for RNNs")
    elif name == 'Adam':
        print("  • Combines the best of momentum + RMSprop")
        print("  • Default choice for most problems")
        print("  • ✅ Recommended for beginners")
🎯 Key Takeaways
- Deep learning: Multiple layers learn hierarchical features
- Forward propagation: Input → Hidden → Output
- Activation functions: ReLU for hidden, softmax for output
- Loss functions: MSE for regression, cross-entropy for classification
- Backpropagation: Calculate gradients using chain rule
- Gradient descent: Update weights to minimize loss
- Regularization: Dropout prevents overfitting; batch normalization stabilizes and speeds up training
- Adam optimizer: Good default choice