Introduction
Neural Networks are computational models inspired by the human brain. They consist of interconnected nodes (neurons) organized in layers that learn to recognize patterns through training. They're the foundation of modern deep learning!
Neural Network Architecture
[Diagram: a fully connected network in which the Input Layer (X) receives the features, the Hidden Layer (H) processes them, and the Output Layer (Y) produces the predictions.]
Key Components:
- Input Layer: Receives raw features
- Hidden Layers: Process and transform data
- Output Layer: Produces final predictions
- Weights: Parameters learned during training
- Biases: Shifts to allow better fitting
- Activation Functions: Add non-linearity (see the sketch just below)
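To make the role of weights, biases, and the activation function concrete, here is a minimal sketch of one hidden layer acting on a single sample (the feature values and layer size are made up for illustration):

import numpy as np

x = np.array([0.5, -1.2, 3.0])   # one sample with 3 input features
W = np.random.randn(3, 4)        # weights: 3 inputs -> 4 hidden neurons
b = np.zeros(4)                  # one bias per hidden neuron
z = x @ W + b                    # weighted sum plus bias
a = np.maximum(0, z)             # ReLU activation adds non-linearity
print(a.shape)                   # (4,) -- one activation per hidden neuron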
Activation Functions
import numpy as np
import matplotlib.pyplot as plt
x = np.linspace(-5, 5, 100)
# Sigmoid
sigmoid = 1 / (1 + np.exp(-x))
# Tanh
tanh = np.tanh(x)
# ReLU
relu = np.maximum(0, x)
# Leaky ReLU
leaky_relu = np.where(x > 0, x, 0.01 * x)
# Softmax (for 3 classes)
def softmax(x):
    exp_x = np.exp(x - np.max(x))
    return exp_x / exp_x.sum()
# Plot
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes[0, 0].plot(x, sigmoid)
axes[0, 0].set_title('Sigmoid: σ(x) = 1/(1+e⁻ˣ)')
axes[0, 0].grid(True)
axes[0, 1].plot(x, tanh)
axes[0, 1].set_title('Tanh: tanh(x)')
axes[0, 1].grid(True)
axes[0, 2].plot(x, relu)
axes[0, 2].set_title('ReLU: max(0, x)')
axes[0, 2].grid(True)
axes[1, 0].plot(x, leaky_relu)
axes[1, 0].set_title('Leaky ReLU: max(0.01x, x)')
axes[1, 0].grid(True)
# ELU
elu = np.where(x > 0, x, np.exp(x) - 1)
axes[1, 1].plot(x, elu)
axes[1, 1].set_title('ELU')
axes[1, 1].grid(True)
# Comparison
axes[1, 2].plot(x, sigmoid, label='Sigmoid')
axes[1, 2].plot(x, tanh, label='Tanh')
axes[1, 2].plot(x, relu, label='ReLU')
axes[1, 2].set_title('Comparison')
axes[1, 2].legend()
axes[1, 2].grid(True)
plt.tight_layout()
plt.show()
- Sigmoid: Range (0, 1). Used in the output layer for binary classification
- Tanh: Range (-1, 1). Zero-centered, often works better than sigmoid in hidden layers
- ReLU: Most popular! Fast and helps gradient flow. f(x) = max(0, x)
- Leaky ReLU: Fixes the "dying ReLU" problem. f(x) = max(0.01x, x)
- Softmax: Multi-class output. Converts raw scores to probabilities (see the sketch below)
- ELU: Exponential Linear Unit. Smooth and allows negative values
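As a quick illustration of how softmax turns raw scores into a probability distribution, here is a minimal sketch reusing the softmax function defined above (the example logits are made up):

# Hypothetical logits for a 3-class problem
logits = np.array([2.0, 1.0, 0.1])
probs = softmax(logits)
print("Probabilities:", np.round(probs, 3))  # non-negative and sum to 1.0
print("Predicted class:", np.argmax(probs))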
Forward Propagation
Process of passing input through the network to get output.
Single Neuron:
z = w₁x₁ + w₂x₂ + ... + wₙxₙ + b
a = σ(z)
where σ is the activation function
# Simple forward pass example
def forward_pass(X, W1, b1, W2, b2):
    """
    X:  input (n_samples, n_features)
    W1: weights layer 1 (n_features, n_hidden)
    b1: bias layer 1 (n_hidden,)
    W2: weights layer 2 (n_hidden, n_output)
    b2: bias layer 2 (n_output,)
    """
    # Layer 1
    Z1 = np.dot(X, W1) + b1
    A1 = np.maximum(0, Z1)  # ReLU activation
    # Layer 2 (output)
    Z2 = np.dot(A1, W2) + b2
    A2 = 1 / (1 + np.exp(-Z2))  # Sigmoid activation
    return A2, A1, Z1
# Example
X = np.array([[1, 2], [3, 4]]) # 2 samples, 2 features
W1 = np.random.randn(2, 3) # 3 hidden neurons
b1 = np.zeros(3)
W2 = np.random.randn(3, 1) # 1 output
b2 = np.zeros(1)
output, _, _ = forward_pass(X, W1, b1, W2, b2)
print("Input shape:", X.shape)
print("Output shape:", output.shape)
print("Predictions:\n", output)
Backpropagation
Algorithm for training neural networks by propagating errors backward to update weights.
- Forward Pass: Compute predictions
- Calculate Loss: Measure error
- Backward Pass: Compute gradients using chain rule
- Update Weights: Adjust parameters using gradient descent
- Repeat: Until convergence
Weight Update Rule:
W = W − α · ∂L/∂W
b = b − α · ∂L/∂b
where α is the learning rate
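To make the update rule concrete, here is a minimal sketch of one gradient-descent step on a single weight, assuming a simple squared-error loss L = (w·x − y)²; the numbers are made up for illustration:

x, y_true = 2.0, 3.0               # one training example (made up)
w = 0.5                            # current weight
alpha = 0.1                        # learning rate
y_pred = w * x                     # forward pass
grad = 2 * (y_pred - y_true) * x   # dL/dw for L = (y_pred - y_true)^2
w = w - alpha * grad               # update rule: w = w - α * dL/dw
print(f"Updated weight: {w:.3f}")  # moves toward y_true / x = 1.5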
Neural Network from Scratch
class NeuralNetwork:
    def __init__(self, input_size, hidden_size, output_size):
        # Initialize weights randomly
        self.W1 = np.random.randn(input_size, hidden_size) * 0.01
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * 0.01
        self.b2 = np.zeros((1, output_size))

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    def sigmoid_derivative(self, z):
        s = self.sigmoid(z)
        return s * (1 - s)

    def relu(self, z):
        return np.maximum(0, z)

    def relu_derivative(self, z):
        return (z > 0).astype(float)

    def forward(self, X):
        # Layer 1
        self.Z1 = np.dot(X, self.W1) + self.b1
        self.A1 = self.relu(self.Z1)
        # Layer 2 (output)
        self.Z2 = np.dot(self.A1, self.W2) + self.b2
        self.A2 = self.sigmoid(self.Z2)
        return self.A2

    def backward(self, X, y, learning_rate):
        m = X.shape[0]
        # Output layer gradients
        dZ2 = self.A2 - y
        dW2 = (1/m) * np.dot(self.A1.T, dZ2)
        db2 = (1/m) * np.sum(dZ2, axis=0, keepdims=True)
        # Hidden layer gradients
        dA1 = np.dot(dZ2, self.W2.T)
        dZ1 = dA1 * self.relu_derivative(self.Z1)
        dW1 = (1/m) * np.dot(X.T, dZ1)
        db1 = (1/m) * np.sum(dZ1, axis=0, keepdims=True)
        # Update weights
        self.W2 -= learning_rate * dW2
        self.b2 -= learning_rate * db2
        self.W1 -= learning_rate * dW1
        self.b1 -= learning_rate * db1

    def compute_loss(self, y_pred, y_true):
        loss = -np.mean(y_true * np.log(y_pred + 1e-8) +
                        (1 - y_true) * np.log(1 - y_pred + 1e-8))
        return loss

    def train(self, X, y, epochs=1000, learning_rate=0.1):
        losses = []
        for epoch in range(epochs):
            # Forward pass
            y_pred = self.forward(X)
            # Compute loss
            loss = self.compute_loss(y_pred, y)
            losses.append(loss)
            # Backward pass
            self.backward(X, y, learning_rate)
            if epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss: {loss:.4f}")
        return losses

    def predict(self, X):
        y_pred = self.forward(X)
        return (y_pred > 0.5).astype(int)
# Test on synthetic data
np.random.seed(42)
X = np.random.randn(100, 2)
y = ((X[:, 0] + X[:, 1]) > 0).astype(int).reshape(-1, 1)
# Create and train network
nn = NeuralNetwork(input_size=2, hidden_size=4, output_size=1)
losses = nn.train(X, y, epochs=1000, learning_rate=0.1)
# Evaluate
predictions = nn.predict(X)
accuracy = np.mean(predictions == y)
print(f"\nFinal Accuracy: {accuracy:.2%}")
# Plot loss curve
plt.figure(figsize=(10, 6))
plt.plot(losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss Over Time')
plt.grid(True)
plt.show()
Using Keras/TensorFlow
import tensorflow as tf
from tensorflow import keras
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# Generate dataset
X, y = make_classification(n_samples=1000, n_features=20,
                           n_informative=15, n_redundant=5,
                           random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Build model
model = keras.Sequential([
    keras.layers.Dense(64, activation='relu', input_shape=(20,)),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(1, activation='sigmoid')
])
# Compile
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)
# Train
history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    verbose=0
)
# Evaluate
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.2%}")
# Plot training history
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Loss Over Time')
plt.legend()
plt.grid(True)
plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Train Acc')
plt.plot(history.history['val_accuracy'], label='Val Acc')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Accuracy Over Time')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
# Model summary
model.summary()
Hyperparameters
- Learning Rate: Step size for weight updates. Typical: 0.001 - 0.1
- Batch Size: Samples per gradient update. Common: 32, 64, 128
- Epochs: Number of full passes through the training data
- Hidden Layers: Number and size of hidden layers
- Dropout Rate: Fraction of neurons to drop during training. Prevents overfitting
- Optimizer: Algorithm for the weight updates: SGD, Adam, RMSprop, etc. (see the sketch below)
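As a rough sketch of where these hyperparameters appear in Keras code, here is one way the model above could be configured with explicit values; the specific numbers are illustrative assumptions, not tuned recommendations:

from tensorflow import keras

# Learning rate is set on the optimizer object
optimizer = keras.optimizers.Adam(learning_rate=0.001)

# Hidden layer sizes and dropout rate are set in the architecture
model = keras.Sequential([
    keras.layers.Dense(32, activation='relu', input_shape=(20,)),  # hidden layer size
    keras.layers.Dropout(0.3),                                     # dropout rate
    keras.layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Batch size and epochs are passed to fit(), e.g.:
# model.fit(X_train, y_train, epochs=50, batch_size=64)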
Best Practices
- Normalize inputs: Scale features to similar ranges
- Use ReLU: Start with ReLU activations in hidden layers
- Start simple: Begin with fewer layers, add complexity if needed
- Use dropout: Prevents overfitting (0.2-0.5 rate)
- Monitor validation: Watch for overfitting
- Try Adam optimizer: Usually works well out of the box
- Use early stopping: Stop training when validation loss stops improving
- Batch normalization: Normalize layer inputs for faster, more stable training (see the sketch after this list)
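Here is a rough sketch of how a few of these practices fit together in Keras, reusing X_train, y_train, and X_test from the example above; the layer sizes and patience value are illustrative assumptions:

from sklearn.preprocessing import StandardScaler
from tensorflow import keras

# Normalize inputs: scale features to zero mean and unit variance
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Batch normalization and dropout inside the network
model = keras.Sequential([
    keras.layers.Dense(64, activation='relu', input_shape=(20,)),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping: halt when validation loss stops improving
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5,
                                           restore_best_weights=True)
model.fit(X_train_scaled, y_train, epochs=100, batch_size=32,
          validation_split=0.2, callbacks=[early_stop], verbose=0)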