What is Reinforcement Learning?
Reinforcement Learning (RL) trains agents to make decisions by rewarding desired behaviors. It powers game AI, robotics, and autonomous systems.
Key Concepts:
- Agent: The learner and decision maker
- Environment: The world the agent interacts with
- State: The current situation the agent observes
- Action: What the agent can do in a given state
- Reward: Feedback signal from the environment after each action
- Policy: The agent's strategy for choosing actions
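These pieces interact in a loop: the agent observes the current state, its policy picks an action, and the environment responds with a reward and the next state. A minimal sketch of that loop, assuming hypothetical `env` and `agent` objects with the same interface used in the examples below:

```python
# Generic agent-environment loop (sketch; `env` and `agent` are placeholders
# for any environment/agent exposing this interface)
state = env.reset()                                   # environment returns the initial state
done = False
while not done:
    action = agent.get_action(state)                  # policy selects an action
    next_state, reward, done = env.step(action)       # environment gives feedback
    agent.update(state, action, reward, next_state)   # agent learns from the transition
    state = next_state
```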
🎯 Basic RL Framework
```python
import numpy as np
import matplotlib.pyplot as plt

# Simple grid world environment
class GridWorld:
    def __init__(self, size=5):
        self.size = size
        self.state = (0, 0)                 # Start position
        self.goal = (size - 1, size - 1)    # Goal position

    def reset(self):
        self.state = (0, 0)
        return self.state

    def step(self, action):
        # Actions: 0=up, 1=right, 2=down, 3=left
        x, y = self.state
        if action == 0 and x > 0:
            x -= 1
        elif action == 1 and y < self.size - 1:
            y += 1
        elif action == 2 and x < self.size - 1:
            x += 1
        elif action == 3 and y > 0:
            y -= 1
        self.state = (x, y)

        # Reward
        if self.state == self.goal:
            reward = 10
            done = True
        else:
            reward = -1
            done = False
        return self.state, reward, done

# Test environment
env = GridWorld(size=5)
state = env.reset()
print(f"Start: {state}")

state, reward, done = env.step(1)  # Move right
print(f"After right: {state}, reward: {reward}")
```
📊 Q-Learning
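Q-learning maintains a table of action values Q(s, a) and, after each transition (s, a, r, s'), nudges the entry toward a bootstrapped target:

Q(s, a) ← Q(s, a) + α · [ r + γ · max_a' Q(s', a') − Q(s, a) ]

where α is the learning rate and γ the discount factor. The `update` method below implements exactly this rule.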
```python
# Q-Learning: Learn action-value function Q(s, a)
class QLearningAgent:
    def __init__(self, n_states, n_actions, learning_rate=0.1,
                 discount=0.95, epsilon=0.1):
        self.q_table = np.zeros((n_states, n_actions))
        self.lr = learning_rate
        self.gamma = discount
        self.epsilon = epsilon

    def get_action(self, state):
        # Epsilon-greedy policy
        if np.random.random() < self.epsilon:
            return np.random.randint(self.q_table.shape[1])
        else:
            return np.argmax(self.q_table[state])

    def update(self, state, action, reward, next_state):
        # Q-learning update
        best_next = np.max(self.q_table[next_state])
        td_target = reward + self.gamma * best_next
        td_error = td_target - self.q_table[state, action]
        self.q_table[state, action] += self.lr * td_error

# Train agent
def state_to_index(state, size=5):
    return state[0] * size + state[1]

env = GridWorld(size=5)
agent = QLearningAgent(n_states=25, n_actions=4)

episodes = 500
rewards_history = []

for episode in range(episodes):
    state = env.reset()
    total_reward = 0
    for step in range(100):
        state_idx = state_to_index(state)
        action = agent.get_action(state_idx)
        next_state, reward, done = env.step(action)
        next_state_idx = state_to_index(next_state)
        agent.update(state_idx, action, reward, next_state_idx)
        total_reward += reward
        state = next_state
        if done:
            break
    rewards_history.append(total_reward)

# Plot learning curve
plt.figure(figsize=(10, 6))
plt.plot(rewards_history)
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Q-Learning Training Progress')
plt.show()

print(f"Final Q-table shape: {agent.q_table.shape}")
print(f"Average reward (last 100): {np.mean(rewards_history[-100:]):.2f}")
```
🎲 OpenAI Gym
```python
# pip install gym  (or gymnasium, the maintained fork)
import gym

# CartPole environment
env = gym.make('CartPole-v1')
print(f"Observation space: {env.observation_space}")
print(f"Action space: {env.action_space}")

# Random agent
# Note: since gym 0.26, reset() returns (obs, info) and
# step() returns (obs, reward, terminated, truncated, info)
state, _ = env.reset()
total_reward = 0
for _ in range(100):
    action = env.action_space.sample()  # Random action
    state, reward, terminated, truncated, _ = env.step(action)
    total_reward += reward
    if terminated or truncated:
        break
print(f"Total reward: {total_reward}")
env.close()
```
🧠 Deep Q-Network (DQN)
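DQN replaces the Q-table with a neural network Q(s, a; θ), which makes large or continuous state spaces tractable. For each transition (s, a, r, s') sampled from a replay buffer, the network's output for the taken action is regressed toward the TD target

y = r + γ · max_a' Q(s', a'; θ)    (y = r if the episode ended),

which is what the `replay` method below computes before calling `fit`.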
```python
from tensorflow import keras
from tensorflow.keras import layers
from collections import deque
import random

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)   # Replay buffer
        self.gamma = 0.95                  # Discount factor
        self.epsilon = 1.0                 # Exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        model = keras.Sequential([
            layers.Dense(24, activation='relu',
                         input_shape=(self.state_size,)),
            layers.Dense(24, activation='relu'),
            layers.Dense(self.action_size, activation='linear')
        ])
        # Recent Keras versions use `learning_rate`, not the old `lr` argument
        model.compile(optimizer=keras.optimizers.Adam(learning_rate=self.learning_rate),
                      loss='mse')
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.random() <= self.epsilon:
            return random.randrange(self.action_size)
        q_values = self.model.predict(state, verbose=0)
        return np.argmax(q_values[0])

    def replay(self, batch_size):
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = reward + self.gamma * \
                    np.amax(self.model.predict(next_state, verbose=0)[0])
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
```
```python
# Train DQN
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)

episodes = 100
batch_size = 32

for episode in range(episodes):
    state, _ = env.reset()                       # gym >= 0.26: reset() returns (obs, info)
    state = np.reshape(state, [1, state_size])
    total_reward = 0
    for time in range(500):
        action = agent.act(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward
        if done:
            print(f"Episode {episode + 1}/{episodes}, Score: {total_reward}, "
                  f"Epsilon: {agent.epsilon:.2f}")
            break
        agent.replay(batch_size)

env.close()
```
🎯 Policy Gradient
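Policy gradient methods skip value tables entirely: they parameterize the policy π(a | s; θ) as a network with a softmax output and adjust θ to make high-return actions more probable. The REINFORCE gradient estimate is

∇θ J(θ) ≈ Σ_t ∇θ log π(a_t | s_t; θ) · G_t

where G_t is the discounted return from step t (normalized here to reduce variance). The implementation below realizes this with a categorical cross-entropy loss whose one-hot action targets are scaled by G_t, which yields the same gradient.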
```python
# Policy gradient: directly optimize the policy
class PolicyGradientAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.99
        self.learning_rate = 0.01

        # Build policy network (outputs action probabilities)
        self.model = keras.Sequential([
            layers.Dense(24, activation='relu', input_shape=(state_size,)),
            layers.Dense(24, activation='relu'),
            layers.Dense(action_size, activation='softmax')
        ])
        self.model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=self.learning_rate),
            loss='categorical_crossentropy'
        )

        # Episode memory
        self.states = []
        self.actions = []
        self.rewards = []

    def act(self, state):
        state = np.reshape(state, [1, self.state_size])
        probs = self.model.predict(state, verbose=0)[0]
        action = np.random.choice(self.action_size, p=probs)
        return action

    def remember(self, state, action, reward):
        self.states.append(state)
        action_onehot = np.zeros(self.action_size)
        action_onehot[action] = 1
        self.actions.append(action_onehot)
        self.rewards.append(reward)

    def discount_rewards(self):
        discounted = np.zeros_like(self.rewards, dtype=np.float32)
        running_add = 0
        for t in reversed(range(len(self.rewards))):
            running_add = running_add * self.gamma + self.rewards[t]
            discounted[t] = running_add
        # Normalize returns to reduce variance
        discounted -= np.mean(discounted)
        discounted /= (np.std(discounted) + 1e-8)
        return discounted

    def train(self):
        states = np.vstack(self.states)
        actions = np.vstack(self.actions)
        rewards = self.discount_rewards()
        # Weight each one-hot action by its discounted return (REINFORCE)
        weighted_actions = actions * rewards[:, np.newaxis]
        self.model.train_on_batch(states, weighted_actions)
        # Clear episode memory
        self.states = []
        self.actions = []
        self.rewards = []
```
```python
# Train policy gradient agent
env = gym.make('CartPole-v1')
agent = PolicyGradientAgent(state_size=4, action_size=2)

for episode in range(100):
    state, _ = env.reset()                       # gym >= 0.26 API
    total_reward = 0
    for time in range(500):
        action = agent.act(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        agent.remember(state, action, reward)
        state = next_state
        total_reward += reward
        if done:
            agent.train()
            print(f"Episode {episode + 1}, Score: {total_reward}")
            break

env.close()
```
📊 RL Algorithms Comparison
| Algorithm | Type | Pros | Cons |
|---|---|---|---|
| Q-Learning | Value-based | Simple, off-policy | Tabular; discrete states and actions only |
| DQN | Value-based | Works with large states | Sample inefficient |
| Policy Gradient | Policy-based | Continuous actions | High variance |
| Actor-Critic | Hybrid | Lower variance | More complex |
| PPO | Policy-based | Stable, efficient | Hyperparameter sensitive |
💡 Best Practices
- Start simple: Q-learning on small problems first
- Use experience replay: Breaks correlation in samples
- Normalize rewards: Makes training more stable
- Tune epsilon decay: Balance exploration vs exploitation
- Use target networks: Stabilizes DQN training (see the sketch after this list)
- Monitor learning curves: Average reward over time
- Save checkpoints: Training can be unstable
- Use gym environments: Standard benchmarks
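For example, a target network is just a frozen copy of the Q-network that is used to compute TD targets and synced with the online network every few episodes. Here is a minimal sketch of how the `DQNAgent` above could be extended; the `target_model` attribute and `update_target` method are additions for illustration, not part of the code above:

```python
# Hypothetical extension of the DQNAgent above with a target network
class DQNAgentWithTarget(DQNAgent):
    def __init__(self, state_size, action_size):
        super().__init__(state_size, action_size)
        self.target_model = self._build_model()              # frozen copy of the Q-network
        self.target_model.set_weights(self.model.get_weights())

    def update_target(self):
        # Sync the frozen copy with the online network (call every few episodes)
        self.target_model.set_weights(self.model.get_weights())

    def replay(self, batch_size):
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                # TD target comes from the target network, not the online one
                target = reward + self.gamma * \
                    np.amax(self.target_model.predict(next_state, verbose=0)[0])
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
```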
🎯 Key Takeaways
- RL learns through trial and error with rewards
- Q-Learning learns value of state-action pairs
- DQN uses neural networks for large state spaces
- Policy Gradient directly optimizes policy
- OpenAI Gym provides standard environments
- Exploration vs. exploitation is the key tradeoff
- Deep RL combines deep learning with RL