What is Reinforcement Learning?
Reinforcement Learning (RL) is about learning from experience. An agent learns to make decisions by interacting with an environment, receiving rewards for good actions and penalties for bad ones. It's how AlphaGo learned to beat world champions, how robots learn to walk, and how recommendation systems optimize engagement.
Key Differences from Other ML:
- Supervised Learning: Learn from labeled examples (X → Y)
- Unsupervised Learning: Find patterns in data
- Reinforcement Learning: Learn from rewards through trial and error
Core Concepts
# RL Components
rl_components = {
'Agent': 'The learner/decision maker (e.g., game player, robot)',
'Environment': 'The world the agent interacts with',
'State (s)': 'Current situation of the agent',
'Action (a)': 'What the agent can do',
'Reward (r)': 'Feedback signal (positive or negative)',
'Policy (π)': 'Strategy for choosing actions',
'Value Function (V)': 'Expected long-term reward from a state',
'Q-Function (Q)': 'Expected reward for taking action a in state s'
}
print("Reinforcement Learning Components:")
print("="*60)
for component, description in rl_components.items():
print(f"{component:20s}: {description}")
# RL Loop:
# 1. Agent observes state s
# 2. Agent takes action a
# 3. Environment gives reward r and new state s'
# 4. Agent learns from experience
# 5. Repeat
print("\n" + "="*60)
print("RL Training Loop:")
print(" State ā Action ā Reward ā New State ā Learn ā Repeat")
print("="*60)
Real-World Examples:
- Game Playing: State = board position, Action = move, Reward = win/lose
- Robotics: State = sensor data, Action = motor commands, Reward = task completion
- Trading: State = market data, Action = buy/sell, Reward = profit/loss
- Recommendation: State = user history, Action = recommend item, Reward = engagement
Markov Decision Process (MDP)
# MDP: Mathematical framework for RL
import numpy as np
import matplotlib.pyplot as plt
# Simple grid world example
# Agent navigates 4x4 grid to reach goal
class GridWorld:
def __init__(self, size=4):
self.size = size
self.state = (0, 0) # Start position
self.goal = (3, 3) # Goal position
def reset(self):
"""Reset to start state"""
self.state = (0, 0)
return self.state
def step(self, action):
"""
Take action and return (new_state, reward, done)
Actions: 0=up, 1=right, 2=down, 3=left
"""
x, y = self.state
# Take action
if action == 0 and x > 0: # Up
x -= 1
elif action == 1 and y < self.size - 1: # Right
y += 1
elif action == 2 and x < self.size - 1: # Down
x += 1
elif action == 3 and y > 0: # Left
y -= 1
# Update state
self.state = (x, y)
# Calculate reward
if self.state == self.goal:
reward = 10 # Reached goal!
done = True
else:
reward = -1 # Small penalty for each step
done = False
return self.state, reward, done
def render(self):
"""Visualize grid"""
grid = np.zeros((self.size, self.size))
grid[self.state] = 1 # Agent
grid[self.goal] = 2 # Goal
plt.imshow(grid, cmap='RdYlGn', vmin=0, vmax=2)
plt.title(f'Agent at {self.state}, Goal at {self.goal}')
plt.colorbar(ticks=[0, 1, 2], label='0=Empty, 1=Agent, 2=Goal')
plt.show()
# Create environment
env = GridWorld()
print("Grid World Environment:")
print(f" Size: {env.size}x{env.size}")
print(f" Start: {env.state}")
print(f" Goal: {env.goal}")
print(f" Actions: 0=up, 1=right, 2=down, 3=left")
print()
# Test environment
state = env.reset()
print(f"Initial state: {state}")
state, reward, done = env.step(1) # Move right
print(f"After moving right: state={state}, reward={reward}, done={done}")
state, reward, done = env.step(2) # Move down
print(f"After moving down: state={state}, reward={reward}, done={done}")
Exploration vs Exploitation
# The RL dilemma: Explore new actions or exploit known good ones?
class EpsilonGreedy:
"""ε-greedy strategy for balancing exploration and exploitation"""
def __init__(self, epsilon=0.1):
self.epsilon = epsilon # Probability of exploring
def select_action(self, q_values):
"""
Select action using ε-greedy strategy
Args:
q_values: Array of Q-values for each action
Returns:
Selected action index
"""
if np.random.random() < self.epsilon:
# Explore: Choose random action
return np.random.randint(len(q_values))
else:
# Exploit: Choose best known action
return np.argmax(q_values)
def decay_epsilon(self, decay_rate=0.995):
"""Reduce exploration over time"""
self.epsilon *= decay_rate
# Example usage
strategy = EpsilonGreedy(epsilon=0.3)
# Simulate Q-values for 4 actions
q_values = np.array([0.5, 0.8, 0.3, 0.6])
print("Q-values for each action:", q_values)
print(f"Best action: {np.argmax(q_values)} (Q={q_values[np.argmax(q_values)]:.2f})")
print()
# Test ε-greedy selection
actions_selected = []
for _ in range(100):
action = strategy.select_action(q_values)
actions_selected.append(action)
print(f"With ε={strategy.epsilon}:")
for action in range(len(q_values)):
count = actions_selected.count(action)
print(f" Action {action} selected: {count}% of time")
print("\nā Higher ε = more exploration")
print("ā Lower ε = more exploitation")
print("ā Typically start high (0.3-1.0) and decay over time")
Q-Learning Algorithm
# Q-Learning: Learn optimal action-value function
class QLearningAgent:
def __init__(self, n_states, n_actions, learning_rate=0.1,
discount_factor=0.95, epsilon=0.1):
"""
Initialize Q-Learning agent
Args:
n_states: Number of possible states
n_actions: Number of possible actions
learning_rate (α): How much to update Q-values (0-1)
discount_factor (γ): Importance of future rewards (0-1)
epsilon (ε): Exploration rate (0-1)
"""
self.n_states = n_states
self.n_actions = n_actions
self.alpha = learning_rate
self.gamma = discount_factor
self.epsilon = epsilon
# Initialize Q-table with zeros
self.Q = np.zeros((n_states, n_actions))
def get_action(self, state):
"""Choose action using ε-greedy policy"""
if np.random.random() < self.epsilon:
return np.random.randint(self.n_actions) # Explore
else:
return np.argmax(self.Q[state]) # Exploit
def update(self, state, action, reward, next_state, done):
"""
Update Q-value using Q-Learning formula:
Q(s,a) ← Q(s,a) + α[r + γ·max_a' Q(s',a') - Q(s,a)]
"""
if done:
# No future rewards if episode ended
target = reward
else:
# Best possible future reward
target = reward + self.gamma * np.max(self.Q[next_state])
# Update Q-value
self.Q[state, action] += self.alpha * (target - self.Q[state, action])
def decay_epsilon(self, decay_rate=0.995):
"""Reduce exploration over time"""
self.epsilon = max(0.01, self.epsilon * decay_rate)
# Example Q-table for 4x4 grid
n_states = 16 # 4x4 grid
n_actions = 4 # up, right, down, left
agent = QLearningAgent(n_states, n_actions)
print("Q-Learning Agent initialized!")
print(f" States: {n_states}")
print(f" Actions: {n_actions}")
print(f" Learning rate (α): {agent.alpha}")
print(f" Discount factor (γ): {agent.gamma}")
print(f" Exploration (ε): {agent.epsilon}")
print()
print("Q-Table shape:", agent.Q.shape)
print("Initial Q-values (all zeros):")
print(agent.Q[:4, :]) # Show first 4 states
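Before running a full training loop, here is one Q-Learning update worked by hand on this agent. The reward and next-state values are made up purely to show the arithmetic of the update rule (a fresh agent is created before training in the next section, so this does not affect the results there).
# Sketch: a single Q-Learning update worked by hand with fabricated values.
s, a, r, s_next = 0, 1, -1, 1
agent.Q[s_next] = np.array([0.0, 2.0, 0.0, 0.0])       # pretend state 1 already has estimates
target = r + agent.gamma * np.max(agent.Q[s_next])      # -1 + 0.95 * 2.0 = 0.9
print(f"Target: {target:.2f}")
agent.update(s, a, r, s_next, done=False)
print(f"Q(0,1) after one update: {agent.Q[s, a]:.3f}")  # 0 + 0.1 * (0.9 - 0) = 0.09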
Training a Q-Learning Agent
# Complete training loop
def train_q_learning(env, agent, episodes=1000, max_steps=100):
"""
Train Q-Learning agent
Args:
env: Environment
agent: Q-Learning agent
episodes: Number of training episodes
max_steps: Maximum steps per episode
Returns:
rewards_history: List of total rewards per episode
"""
rewards_history = []
for episode in range(episodes):
state = env.reset()
state_idx = state[0] * env.size + state[1] # Convert (x,y) to index
total_reward = 0
for step in range(max_steps):
# Choose action
action = agent.get_action(state_idx)
# Take action
next_state, reward, done = env.step(action)
next_state_idx = next_state[0] * env.size + next_state[1]
# Update Q-table
agent.update(state_idx, action, reward, next_state_idx, done)
# Update state and reward
state_idx = next_state_idx
total_reward += reward
if done:
break
# Decay exploration
agent.decay_epsilon()
# Record episode reward
rewards_history.append(total_reward)
# Print progress
if (episode + 1) % 100 == 0:
avg_reward = np.mean(rewards_history[-100:])
print(f"Episode {episode+1}/{episodes} - "
f"Avg Reward: {avg_reward:.2f}, "
f"ε: {agent.epsilon:.3f}")
return rewards_history
# Train agent
print("Training Q-Learning agent...\n")
env = GridWorld(size=4)
agent = QLearningAgent(n_states=16, n_actions=4)
rewards = train_q_learning(env, agent, episodes=500)
# Plot learning curve
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(rewards, alpha=0.3)
plt.plot(np.convolve(rewards, np.ones(50)/50, mode='valid'), linewidth=2)
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Learning Curve')
plt.grid(True)
plt.subplot(1, 2, 2)
plt.imshow(agent.Q, cmap='RdYlGn', aspect='auto')
plt.colorbar(label='Q-value')
plt.xlabel('Action')
plt.ylabel('State')
plt.title('Learned Q-Table')
plt.tight_layout()
plt.show()
print("\nā Agent trained!")
Using OpenAI Gym
# OpenAI Gym: Standard RL environments
import gym
# Create environment
# env = gym.make('CartPole-v1') # Balance pole on cart
# Available environments:
popular_envs = {
'CartPole-v1': 'Balance a pole on a moving cart',
'MountainCar-v0': 'Drive car up a steep mountain',
'LunarLander-v2': 'Land spacecraft safely',
'FrozenLake-v1': 'Navigate frozen lake to goal',
'Taxi-v3': 'Pick up and drop off passengers',
'Acrobot-v1': 'Swing up a two-link robot',
'Pong-v0': 'Classic Atari Pong game',
'Breakout-v0': 'Classic Atari Breakout game'
}
print("Popular Gym Environments:")
for env_name, description in popular_envs.items():
print(f" {env_name:20s}: {description}")
# Example: CartPole
print("\n" + "="*60)
print("CartPole-v1 Environment:")
# Pseudocode (requires gym installation; uses the classic Gym API, see the Gymnasium note below)
"""
env = gym.make('CartPole-v1')
# Environment info
print(f"Observation space: {env.observation_space}") # State
print(f"Action space: {env.action_space}") # Actions
# Reset environment
state = env.reset()
# Take action
action = env.action_space.sample() # Random action
next_state, reward, done, info = env.step(action)
# Render (visualize)
env.render()
# Close
env.close()
"""
print(" State: [cart_position, cart_velocity, pole_angle, pole_velocity]")
print(" Actions: 0=push left, 1=push right")
print(" Goal: Keep pole balanced for 500 steps")
print(" Reward: +1 for each step balanced")
print("="*60)
Deep Q-Networks (DQN)
# DQN: Use neural network instead of Q-table
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from collections import deque
import random
class DQNAgent:
def __init__(self, state_size, action_size):
"""
Deep Q-Network agent
Args:
state_size: Dimension of state space
action_size: Number of actions
"""
self.state_size = state_size
self.action_size = action_size
# Hyperparameters
self.gamma = 0.95 # Discount factor
self.epsilon = 1.0 # Exploration rate
self.epsilon_min = 0.01
self.epsilon_decay = 0.995
self.learning_rate = 0.001
# Experience replay buffer
self.memory = deque(maxlen=2000)
self.batch_size = 32
# Q-Network
self.model = self._build_model()
def _build_model(self):
"""Build neural network to approximate Q-function"""
model = keras.Sequential([
layers.Dense(24, activation='relu', input_shape=(self.state_size,)),
layers.Dense(24, activation='relu'),
layers.Dense(self.action_size, activation='linear')
])
model.compile(
optimizer=keras.optimizers.Adam(learning_rate=self.learning_rate),
loss='mse'
)
return model
def remember(self, state, action, reward, next_state, done):
"""Store experience in replay buffer"""
self.memory.append((state, action, reward, next_state, done))
def act(self, state):
"""Choose action using ε-greedy policy"""
if np.random.random() <= self.epsilon:
return random.randrange(self.action_size)
q_values = self.model.predict(state, verbose=0)
return np.argmax(q_values[0])
def replay(self):
"""Train on random batch from memory"""
if len(self.memory) < self.batch_size:
return
# Sample random batch
batch = random.sample(self.memory, self.batch_size)
for state, action, reward, next_state, done in batch:
target = reward
if not done:
target += self.gamma * np.amax(
self.model.predict(next_state, verbose=0)[0]
)
# Update Q-value for taken action
target_f = self.model.predict(state, verbose=0)
target_f[0][action] = target
# Train network
self.model.fit(state, target_f, epochs=1, verbose=0)
# Decay exploration
if self.epsilon > self.epsilon_min:
self.epsilon *= self.epsilon_decay
# Create DQN agent
state_size = 4 # e.g., CartPole state
action_size = 2 # e.g., left or right
dqn_agent = DQNAgent(state_size, action_size)
print("Deep Q-Network Agent:")
print(f" State size: {state_size}")
print(f" Action size: {action_size}")
print(f" Network architecture:")
dqn_agent.model.summary()
print("\nDQN Advantages over Q-Learning:")
print(" ⢠Handles large/continuous state spaces")
print(" ⢠Learns feature representations automatically")
print(" ⢠Can generalize to unseen states")
print(" ⢠Scales to complex problems (e.g., Atari games)")
Advanced RL Algorithms
# Overview of modern RL algorithms
rl_algorithms = {
'Q-Learning': {
'Type': 'Value-based',
'Pros': 'Simple, well-understood',
'Cons': 'Only for discrete actions, Q-table can be huge',
'Use case': 'Small state/action spaces'
},
'DQN': {
'Type': 'Value-based',
'Pros': 'Handles large state spaces, image inputs',
'Cons': 'Discrete actions only, can be unstable',
'Use case': 'Atari games, complex environments'
},
'Policy Gradient': {
'Type': 'Policy-based',
'Pros': 'Works with continuous actions, stochastic policies',
'Cons': 'High variance, slow convergence',
'Use case': 'Robotics, continuous control'
},
'Actor-Critic': {
'Type': 'Hybrid',
'Pros': 'Lower variance than policy gradient, flexible',
'Cons': 'More complex to implement',
'Use case': 'General purpose, good default choice'
},
'PPO': {
'Type': 'Policy-based',
'Pros': 'Stable, robust, widely used',
'Cons': 'On-policy (less sample efficient), requires tuning',
'Use case': 'Current industry standard'
},
'A3C': {
'Type': 'Actor-Critic',
'Pros': 'Parallel training, fast',
'Cons': 'Needs multiple cores',
'Use case': 'Large-scale training'
},
'SAC': {
'Type': 'Actor-Critic',
'Pros': 'Very sample efficient, stable',
'Cons': 'Complex implementation',
'Use case': 'Robotics, continuous control'
}
}
print("Advanced RL Algorithms:")
print("="*70)
for name, info in rl_algorithms.items():
print(f"\n{name}:")
for key, value in info.items():
print(f" {key:12s}: {value}")
print("\n" + "="*70)
print("Recommendation:")
print(" Beginners: Start with Q-Learning")
print(" Image inputs: Use DQN")
print(" Continuous actions: Use PPO or SAC")
print(" Production: Use PPO (most reliable)")
print("="*70)
Key Takeaways
- RL learns from rewards: Trial and error, not labeled data
- Agent interacts with environment: State → Action → Reward loop
- Exploration vs Exploitation: Balance trying new things vs using knowledge
- Q-Learning: Learn action-value function, ε-greedy policy
- Q-table: Works for small discrete problems
- DQN: Use neural networks for large/continuous states
- OpenAI Gym: Standard environments for testing RL algorithms
- Modern algorithms: PPO, SAC for production systems
Real-World Applications
- Gaming: AlphaGo, OpenAI Five (Dota 2), StarCraft AI
- Robotics: Manipulation, locomotion, autonomous navigation
- Autonomous vehicles: Path planning, decision making
- Finance: Trading strategies, portfolio optimization
- Recommendation: Personalized content, ads
- Healthcare: Treatment planning, drug discovery
- Energy: Smart grid management, HVAC optimization
- Manufacturing: Resource allocation, scheduling