What is Deep Learning?
Deep learning is a subset of machine learning that uses neural networks with multiple layers (hence "deep"). These networks automatically learn hierarchical representations of data, which makes them well suited to complex tasks such as image recognition, language understanding, and game playing.
Why "Deep"?
- Multiple layers: Input → Hidden Layer 1 → Hidden Layer 2 → ... → Output
- Hierarchical learning: Early layers learn simple patterns, later layers learn complex concepts
- Automatic features: No manual feature engineering needed
- Scalable: Performance improves with more data
🏗️ Neural Network Architecture
import numpy as np
import matplotlib.pyplot as plt

# Visualize network architecture
def visualize_network():
    """Display neural network structure"""
    layers = {
        'Input Layer': 4,      # 4 input features
        'Hidden Layer 1': 8,   # 8 neurons
        'Hidden Layer 2': 6,   # 6 neurons
        'Output Layer': 3      # 3 classes
    }

    print("Neural Network Architecture:")
    print("=" * 50)
    for layer_name, neurons in layers.items():
        print(f"{layer_name:20s}: {neurons} neurons")
        if 'Hidden' in layer_name:
            print(f"{'':20s} Activation: ReLU")
        elif 'Output' in layer_name:
            print(f"{'':20s} Activation: Softmax")

    # Calculate total parameters: (inputs x neurons + biases) for each weight matrix
    total_params = (4 * 8 + 8) + (8 * 6 + 6) + (6 * 3 + 3)
    print(f"\nTotal Parameters: {total_params}")

visualize_network()

# What each layer does:
# Input Layer:   Receives raw data
# Hidden Layers: Learn increasingly complex patterns
# Output Layer:  Makes the final prediction
Layer Types (a short Keras sketch follows this list):
- Dense (Fully Connected): Every neuron connects to all previous neurons
- Convolutional: For image data, learns spatial patterns
- Recurrent: For sequential data, has memory
- Dropout: Randomly disables neurons during training (prevents overfitting)
- Batch Normalization: Normalizes layer inputs (speeds up training)
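These layer types compose naturally. Below is a minimal Keras sketch of how they stack; the layer sizes, dropout rates, and 10-feature input shape are arbitrary illustration values, not recommendations.
# Minimal sketch: stacking the layer types above in Keras (sizes are illustrative)
from tensorflow import keras

layer_demo = keras.Sequential([
    keras.layers.Dense(32, activation='relu', input_shape=(10,)),  # Dense (fully connected)
    keras.layers.BatchNormalization(),                             # Normalizes layer inputs
    keras.layers.Dropout(0.3),                                     # Randomly disables 30% of neurons while training
    keras.layers.Dense(3, activation='softmax')                    # Output layer
])
layer_demo.summary()

# Convolutional and recurrent layers follow the same pattern, e.g.:
#   keras.layers.Conv2D(16, kernel_size=3, activation='relu')   # spatial patterns in images
#   keras.layers.LSTM(32)                                        # memory over sequences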
⚡ Activation Functions
# Common activation functions

# 1. ReLU (Rectified Linear Unit) - most popular
def relu(x):
    """f(x) = max(0, x)"""
    return np.maximum(0, x)

# 2. Sigmoid - for binary classification
def sigmoid(x):
    """f(x) = 1 / (1 + e^-x)"""
    return 1 / (1 + np.exp(-x))

# 3. Tanh (hyperbolic tangent) - centered around 0
def tanh(x):
    """f(x) = (e^x - e^-x) / (e^x + e^-x)"""
    return np.tanh(x)

# 4. Softmax - for multi-class classification
def softmax(x):
    """Converts scores to probabilities (row-wise, so it also works on batches)"""
    exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))  # Subtract max for numerical stability
    return exp_x / exp_x.sum(axis=-1, keepdims=True)

# Visualize activation functions
x = np.linspace(-5, 5, 100)
plt.figure(figsize=(12, 8))

plt.subplot(2, 2, 1)
plt.plot(x, relu(x), 'b-', linewidth=2)
plt.title('ReLU: f(x) = max(0, x)')
plt.grid(True)
plt.axhline(y=0, color='k', linestyle='--', alpha=0.3)
plt.axvline(x=0, color='k', linestyle='--', alpha=0.3)

plt.subplot(2, 2, 2)
plt.plot(x, sigmoid(x), 'r-', linewidth=2)
plt.title('Sigmoid: f(x) = 1/(1+e^-x)')
plt.grid(True)
plt.axhline(y=0, color='k', linestyle='--', alpha=0.3)

plt.subplot(2, 2, 3)
plt.plot(x, tanh(x), 'g-', linewidth=2)
plt.title('Tanh: f(x) = (e^x - e^-x)/(e^x + e^-x)')
plt.grid(True)
plt.axhline(y=0, color='k', linestyle='--', alpha=0.3)

plt.subplot(2, 2, 4)
test_scores = np.array([2.0, 1.0, 0.1])
probs = softmax(test_scores)
plt.bar(['Class A', 'Class B', 'Class C'], probs)
plt.title('Softmax: Converts scores to probabilities')
plt.ylabel('Probability')

plt.tight_layout()
plt.show()

# When to use which?
activation_guide = {
    'ReLU': 'Hidden layers (default choice)',
    'Sigmoid': 'Binary classification output',
    'Softmax': 'Multi-class classification output',
    'Tanh': 'Hidden layers (when data is centered at 0)'
}
print("\nActivation Function Guide:")
for func, usage in activation_guide.items():
    print(f"  {func:10s}: {usage}")
🔄 Forward Propagation
# Forward pass: Input → Hidden → Output
class NeuralNetwork:
    def __init__(self, input_size, hidden_size, output_size):
        """Initialize network with random weights"""
        # Weights (He initialization, suited to ReLU)
        self.W1 = np.random.randn(input_size, hidden_size) * np.sqrt(2.0 / input_size)
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * np.sqrt(2.0 / hidden_size)
        self.b2 = np.zeros((1, output_size))

    def forward(self, X):
        """Forward propagation"""
        # Layer 1: Input → Hidden
        self.z1 = np.dot(X, self.W1) + self.b1   # Linear transformation
        self.a1 = relu(self.z1)                  # Apply activation
        # Layer 2: Hidden → Output
        self.z2 = np.dot(self.a1, self.W2) + self.b2
        self.a2 = softmax(self.z2)               # Output probabilities
        return self.a2

    def predict(self, X):
        """Make predictions"""
        probs = self.forward(X)
        return np.argmax(probs, axis=1)

# Example usage
nn = NeuralNetwork(input_size=4, hidden_size=8, output_size=3)

# Sample input
sample_input = np.array([[1.0, 2.0, 3.0, 4.0]])

# Forward pass
output = nn.forward(sample_input)
print("Input shape:", sample_input.shape)
print("Output probabilities:", output)
print("Predicted class:", np.argmax(output))
What Happens in a Forward Pass:
- Linear transformation: z = W·x + b (worked through on a tiny example after this list)
- Activation function: a = activation(z)
- Repeat for each layer
- Final output: Predictions or probabilities
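As a sanity check on the first two steps, here is one layer worked by hand; the weights, bias, and input values below are made up purely for illustration.
# Worked example of one layer: z = W·x + b, then a = ReLU(z) (numbers are illustrative)
x_tiny = np.array([[1.0, 2.0]])            # 1 sample, 2 features
W_tiny = np.array([[0.5, -1.0,  0.2],
                   [0.1,  0.3, -0.4]])     # 2 inputs → 3 neurons
b_tiny = np.array([[0.0,  0.1, -0.2]])

z_tiny = np.dot(x_tiny, W_tiny) + b_tiny   # [[ 0.7, -0.3, -0.8]]
a_tiny = relu(z_tiny)                      # [[ 0.7,  0.0,  0.0]]
print("z:", z_tiny)
print("a:", a_tiny)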
🎯 Loss Functions
# Loss functions measure prediction error

# 1. Mean Squared Error (MSE) - for regression
def mse_loss(y_true, y_pred):
    """Average of squared differences"""
    return np.mean((y_true - y_pred) ** 2)

# 2. Binary Cross-Entropy - for binary classification
def binary_crossentropy(y_true, y_pred):
    """Loss for binary (0 or 1) predictions"""
    epsilon = 1e-7  # Avoid log(0)
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

# 3. Categorical Cross-Entropy - for multi-class classification
def categorical_crossentropy(y_true, y_pred):
    """Loss for multi-class predictions (y_true is one-hot encoded)"""
    epsilon = 1e-7
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    return -np.sum(y_true * np.log(y_pred)) / y_true.shape[0]

# Example: Multi-class classification
y_true = np.array([[1, 0, 0]])        # True class: 0
y_pred = np.array([[0.7, 0.2, 0.1]])  # Predicted probabilities
loss = categorical_crossentropy(y_true, y_pred)
print(f"Loss: {loss:.4f}")
print("\nLower loss = Better predictions")
print("Perfect prediction (1.0, 0.0, 0.0) would give loss ≈ 0")
🔁 Backpropagation
# Backpropagation: Calculate gradients and update weights
class NeuralNetworkWithBackprop(NeuralNetwork):
    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01):
        super().__init__(input_size, hidden_size, output_size)
        self.learning_rate = learning_rate

    def backward(self, X, y):
        """Backpropagation: Compute gradients"""
        m = X.shape[0]  # Number of samples

        # Output layer gradients
        dz2 = self.a2 - y                 # Derivative of softmax + cross-entropy
        dW2 = np.dot(self.a1.T, dz2) / m
        db2 = np.sum(dz2, axis=0, keepdims=True) / m

        # Hidden layer gradients
        da1 = np.dot(dz2, self.W2.T)
        dz1 = da1 * (self.z1 > 0)         # ReLU derivative
        dW1 = np.dot(X.T, dz1) / m
        db1 = np.sum(dz1, axis=0, keepdims=True) / m

        # Store gradients
        self.dW1, self.db1 = dW1, db1
        self.dW2, self.db2 = dW2, db2

    def update_weights(self):
        """Gradient descent: Update weights"""
        self.W1 -= self.learning_rate * self.dW1
        self.b1 -= self.learning_rate * self.db1
        self.W2 -= self.learning_rate * self.dW2
        self.b2 -= self.learning_rate * self.db2

    def train_step(self, X, y):
        """One training iteration"""
        # Forward pass
        predictions = self.forward(X)
        # Calculate loss
        loss = categorical_crossentropy(y, predictions)
        # Backward pass
        self.backward(X, y)
        # Update weights
        self.update_weights()
        return loss

# Example training loop
def train_network(X_train, y_train, epochs=100):
    """Train the neural network (y_train is one-hot encoded)"""
    input_size = X_train.shape[1]
    output_size = y_train.shape[1]
    nn = NeuralNetworkWithBackprop(
        input_size=input_size,
        hidden_size=16,
        output_size=output_size,
        learning_rate=0.1
    )
    losses = []
    for epoch in range(epochs):
        loss = nn.train_step(X_train, y_train)
        losses.append(loss)
        if (epoch + 1) % 10 == 0:
            accuracy = np.mean(nn.predict(X_train) == np.argmax(y_train, axis=1))
            print(f"Epoch {epoch+1}/{epochs} - Loss: {loss:.4f}, Accuracy: {accuracy:.2%}")
    return nn, losses

# Visualize training (after calling: nn, losses = train_network(X_train, y_train))
# plt.plot(losses)
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.title('Training Progress')
# plt.show()
Backpropagation Steps:
- Compute loss: How wrong are the predictions?
- Calculate gradients: Use the chain rule to find ∂Loss/∂W
- Update weights: W = W - learning_rate × gradient (a one-parameter sketch follows this list)
- Repeat: Until the loss stops decreasing
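The update rule is easiest to see with a single parameter. The sketch below minimizes L(w) = (w - 3)^2, a toy loss chosen purely for illustration.
# Gradient descent on one parameter: minimize L(w) = (w - 3)**2 (toy example)
w = 0.0                  # Start far from the optimum at w = 3
learning_rate = 0.1
for step in range(25):
    gradient = 2 * (w - 3)               # dL/dw
    w = w - learning_rate * gradient     # W = W - learning_rate × gradient
print(f"w after 25 steps: {w:.4f}  (optimum: 3.0)")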
🚀 Complete Deep Learning Example
# Using TensorFlow/Keras for real-world deep learning
import tensorflow as tf
from tensorflow import keras
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Create synthetic dataset
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_classes=3,
    random_state=42
)

# Split and scale
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build deep neural network
model = keras.Sequential([
    # Hidden layer 1 (receives the 20 input features)
    keras.layers.Dense(64, activation='relu', input_shape=(20,)),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.3),
    # Hidden layer 2
    keras.layers.Dense(32, activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.3),
    # Hidden layer 3
    keras.layers.Dense(16, activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.2),
    # Output layer
    keras.layers.Dense(3, activation='softmax')
])

# Compile model
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',  # Labels are integers, not one-hot
    metrics=['accuracy']
)

# Display architecture
model.summary()

# Train model
print("\nTraining deep neural network...")
history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    verbose=0
)

# Evaluate
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"\nTest Accuracy: {test_accuracy:.2%}")

# Plot training history
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.title('Loss During Training')

plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.title('Accuracy During Training')

plt.tight_layout()
plt.show()
📈 Optimization Techniques
# Modern optimization algorithms

# 1. Stochastic Gradient Descent (SGD)
#    W = W - learning_rate × gradient

# 2. SGD with Momentum
#    velocity = beta × velocity + gradient
#    W = W - learning_rate × velocity

# 3. Adam (Adaptive Moment Estimation) - most popular
#    Combines momentum + adaptive learning rates
#    Works well for most problems

# Compare optimizers
optimizers = {
    'SGD': keras.optimizers.SGD(learning_rate=0.01),
    'SGD + Momentum': keras.optimizers.SGD(learning_rate=0.01, momentum=0.9),
    'RMSprop': keras.optimizers.RMSprop(learning_rate=0.001),
    'Adam': keras.optimizers.Adam(learning_rate=0.001)
}

print("Optimizer Comparison:")
print("=" * 50)
for name, optimizer in optimizers.items():
    print(f"\n{name}:")
    if name == 'SGD':
        print("  • Simple, stable")
        print("  • Can be slow")
    elif 'Momentum' in name:
        print("  • Faster than SGD")
        print("  • Better for noisy gradients")
    elif name == 'RMSprop':
        print("  • Adaptive learning rates")
        print("  • Good for RNNs")
    elif name == 'Adam':
        print("  • Combines the best of momentum + RMSprop")
        print("  • Default choice for most problems")
        print("  • ✅ Recommended for beginners")
🎯 Key Takeaways
- Deep learning: Multiple layers learn hierarchical features
- Forward propagation: Input → Hidden → Output
- Activation functions: ReLU for hidden, softmax for output
- Loss functions: MSE for regression, cross-entropy for classification
- Backpropagation: Calculate gradients using chain rule
- Gradient descent: Update weights to minimize loss
- Regularization: Dropout prevents overfitting; batch normalization stabilizes and speeds up training
- Adam optimizer: Good default choice