🎮 Reinforcement Learning

Learning through trial and error

What is Reinforcement Learning?

Reinforcement Learning (RL) trains agents to make decisions by rewarding desired behaviors. It powers game AI, robotics, and autonomous systems.

Key Concepts:

  • Agent: The learner/decision maker
  • Environment: The world the agent interacts with
  • State: The agent's current situation
  • Action: What the agent can do
  • Reward: Feedback from the environment
  • Policy: The agent's strategy for choosing actions
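
These pieces come together in a single interaction loop: the agent observes a state, picks an action with its policy, and the environment returns a reward and the next state. A minimal sketch of that loop (env and agent here are placeholders for any environment/agent pair, such as the GridWorld and Q-learning agent built in the sections below):

# The canonical RL loop: observe state, act, receive reward, learn, repeat.
# 'env' and 'agent' are placeholders for any environment/agent pair.
state = env.reset()                                   # initial state
done = False
while not done:
    action = agent.get_action(state)                  # policy picks an action
    next_state, reward, done = env.step(action)       # environment responds
    agent.update(state, action, reward, next_state)   # learn from the feedback
    state = next_state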

🎯 Basic RL Framework

import numpy as np
import matplotlib.pyplot as plt

# Simple grid world environment
class GridWorld:
    def __init__(self, size=5):
        self.size = size
        self.state = (0, 0)  # Start position
        self.goal = (size-1, size-1)  # Goal position
        
    def reset(self):
        self.state = (0, 0)
        return self.state
    
    def step(self, action):
        # Actions: 0=up, 1=right, 2=down, 3=left
        x, y = self.state
        
        if action == 0 and x > 0:
            x -= 1
        elif action == 1 and y < self.size - 1:
            y += 1
        elif action == 2 and x < self.size - 1:
            x += 1
        elif action == 3 and y > 0:
            y -= 1
        
        self.state = (x, y)
        
        # Reward: +10 for reaching the goal, -1 per step otherwise
        if self.state == self.goal:
            reward = 10
            done = True
        else:
            reward = -1
            done = False
        
        return self.state, reward, done

# Test environment
env = GridWorld(size=5)
state = env.reset()
print(f"Start: {state}")

state, reward, done = env.step(1)  # Move right
print(f"After right: {state}, reward: {reward}")

📊 Q-Learning

# Q-Learning: Learn action-value function Q(s, a)
class QLearningAgent:
    def __init__(self, n_states, n_actions, learning_rate=0.1, 
                 discount=0.95, epsilon=0.1):
        self.q_table = np.zeros((n_states, n_actions))
        self.lr = learning_rate
        self.gamma = discount
        self.epsilon = epsilon
        
    def get_action(self, state):
        # Epsilon-greedy policy
        if np.random.random() < self.epsilon:
            return np.random.randint(self.q_table.shape[1])
        else:
            return np.argmax(self.q_table[state])
    
    def update(self, state, action, reward, next_state):
        # Q-learning update: Q(s,a) += lr * (r + gamma * max_a' Q(s',a') - Q(s,a))
        best_next = np.max(self.q_table[next_state])
        td_target = reward + self.gamma * best_next
        td_error = td_target - self.q_table[state, action]
        self.q_table[state, action] += self.lr * td_error

# Train agent
def state_to_index(state, size=5):
    return state[0] * size + state[1]

env = GridWorld(size=5)
agent = QLearningAgent(n_states=25, n_actions=4)

episodes = 500
rewards_history = []

for episode in range(episodes):
    state = env.reset()
    total_reward = 0
    
    for step in range(100):
        state_idx = state_to_index(state)
        action = agent.get_action(state_idx)
        next_state, reward, done = env.step(action)
        next_state_idx = state_to_index(next_state)
        
        agent.update(state_idx, action, reward, next_state_idx)
        
        total_reward += reward
        state = next_state
        
        if done:
            break
    
    rewards_history.append(total_reward)

# Plot learning curve
plt.figure(figsize=(10, 6))
plt.plot(rewards_history)
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Q-Learning Training Progress')
plt.show()

print(f"Final Q-table shape: {agent.q_table.shape}")
print(f"Average reward (last 100): {np.mean(rewards_history[-100:]):.2f}")

🎲 OpenAI Gym

# pip install gym
# Note: this uses the classic gym (< 0.26) API; a gymnasium variant is shown below
import gym

# CartPole environment
env = gym.make('CartPole-v1')

print(f"Observation space: {env.observation_space}")
print(f"Action space: {env.action_space}")

# Random agent
state = env.reset()
total_reward = 0

for _ in range(100):
    action = env.action_space.sample()  # Random action
    state, reward, done, info = env.step(action)
    total_reward += reward
    
    if done:
        break

print(f"Total reward: {total_reward}")
env.close()
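
The snippet above (and the later examples) uses the classic gym (< 0.26) API. If you have the maintained gymnasium package instead, reset() also returns an info dict and step() splits done into terminated/truncated; a minimal equivalent sketch:

# pip install gymnasium
import gymnasium

env = gymnasium.make('CartPole-v1')
state, info = env.reset()                     # reset() returns (observation, info)
total_reward = 0

for _ in range(100):
    action = env.action_space.sample()        # random action
    state, reward, terminated, truncated, info = env.step(action)
    total_reward += reward
    if terminated or truncated:               # 'done' is split into two flags
        break

print(f"Total reward: {total_reward}")
env.close()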

🧠 Deep Q-Network (DQN)

from tensorflow import keras
from tensorflow.keras import layers
from collections import deque
import random

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()
        
    def _build_model(self):
        model = keras.Sequential([
            layers.Dense(24, activation='relu', 
                        input_shape=(self.state_size,)),
            layers.Dense(24, activation='relu'),
            layers.Dense(self.action_size, activation='linear')
        ])
        model.compile(optimizer=keras.optimizers.Adam(learning_rate=self.learning_rate),
                      loss='mse')
        return model
    
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
    
    def act(self, state):
        if np.random.random() <= self.epsilon:
            return random.randrange(self.action_size)
        q_values = self.model.predict(state, verbose=0)
        return np.argmax(q_values[0])
    
    def replay(self, batch_size):
        if len(self.memory) < batch_size:
            return
        
        minibatch = random.sample(self.memory, batch_size)
        
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = reward + self.gamma * \
                        np.amax(self.model.predict(next_state, verbose=0)[0])
            
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

# Train DQN
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)

episodes = 100
batch_size = 32

for episode in range(episodes):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    total_reward = 0
    
    for time in range(500):
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward
        
        if done:
            print(f"Episode {episode + 1}/{episodes}, Score: {total_reward}, "
                  f"Epsilon: {agent.epsilon:.2f}")
            break
        
        agent.replay(batch_size)

env.close()
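
The replay() method above calls predict() and fit() once per sampled transition, which is easy to read but slow. A batched variant (an optional sketch; replay_batched is not part of the original agent) computes every target in a single pass:

def replay_batched(agent, batch_size):
    # Batched experience replay: one predict/fit per minibatch instead of per sample
    if len(agent.memory) < batch_size:
        return
    minibatch = random.sample(agent.memory, batch_size)
    states = np.vstack([s for s, _, _, _, _ in minibatch])
    actions = np.array([a for _, a, _, _, _ in minibatch])
    rewards = np.array([r for _, _, r, _, _ in minibatch])
    next_states = np.vstack([ns for _, _, _, ns, _ in minibatch])
    dones = np.array([d for _, _, _, _, d in minibatch], dtype=np.float32)

    targets = agent.model.predict(states, verbose=0)
    next_q = agent.model.predict(next_states, verbose=0)
    # Bellman target: r + gamma * max_a' Q(s', a'), with no bootstrap on terminal states
    targets[np.arange(batch_size), actions] = (
        rewards + (1.0 - dones) * agent.gamma * np.max(next_q, axis=1)
    )
    agent.model.fit(states, targets, epochs=1, verbose=0)

    if agent.epsilon > agent.epsilon_min:
        agent.epsilon *= agent.epsilon_decay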

🎯 Policy Gradient

# Policy gradient: Directly optimize policy
class PolicyGradientAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.99
        self.learning_rate = 0.01
        
        # Build policy network
        self.model = keras.Sequential([
            layers.Dense(24, activation='relu', input_shape=(state_size,)),
            layers.Dense(24, activation='relu'),
            layers.Dense(action_size, activation='softmax')
        ])
        self.model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=self.learning_rate),
            loss='categorical_crossentropy'
        )
        
        self.states = []
        self.actions = []
        self.rewards = []
    
    def act(self, state):
        state = np.reshape(state, [1, self.state_size])
        probs = self.model.predict(state, verbose=0)[0]
        action = np.random.choice(self.action_size, p=probs)
        return action
    
    def remember(self, state, action, reward):
        self.states.append(state)
        action_onehot = np.zeros(self.action_size)
        action_onehot[action] = 1
        self.actions.append(action_onehot)
        self.rewards.append(reward)
    
    def discount_rewards(self):
        discounted = np.zeros_like(self.rewards, dtype=np.float32)
        running_add = 0
        for t in reversed(range(len(self.rewards))):
            running_add = running_add * self.gamma + self.rewards[t]
            discounted[t] = running_add
        
        # Normalize
        discounted -= np.mean(discounted)
        discounted /= (np.std(discounted) + 1e-8)
        return discounted
    
    def train(self):
        states = np.vstack(self.states)
        actions = np.vstack(self.actions)
        rewards = self.discount_rewards()
        
        # Weight actions by discounted rewards
        weighted_actions = actions * rewards[:, np.newaxis]
        
        self.model.train_on_batch(states, weighted_actions)
        
        # Clear memory
        self.states = []
        self.actions = []
        self.rewards = []

# Train policy gradient agent
env = gym.make('CartPole-v1')
agent = PolicyGradientAgent(state_size=4, action_size=2)

for episode in range(100):
    state = env.reset()
    total_reward = 0
    
    for time in range(500):
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        
        agent.remember(state, action, reward)
        state = next_state
        total_reward += reward
        
        if done:
            agent.train()
            print(f"Episode {episode + 1}, Score: {total_reward}")
            break

env.close()
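
To make the return computation concrete, here is a quick check of discount_rewards() with made-up rewards (the numbers are purely illustrative):

# Illustration of discounted returns (hypothetical reward sequence)
demo = PolicyGradientAgent(state_size=4, action_size=2)
demo.rewards = [1.0, 1.0, 1.0]
# With gamma = 0.99 the raw discounted returns are:
#   G_2 = 1.0
#   G_1 = 1.0 + 0.99 * 1.0  = 1.99
#   G_0 = 1.0 + 0.99 * 1.99 = 2.9701
# discount_rewards() then normalizes these to zero mean and unit variance.
print(demo.discount_rewards())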

📊 RL Algorithms Comparison

Algorithm        Type          Pros                     Cons
Q-Learning       Value-based   Simple, off-policy       Discrete actions only
DQN              Value-based   Works with large states  Sample inefficient
Policy Gradient  Policy-based  Continuous actions       High variance
Actor-Critic     Hybrid        Lower variance           More complex
PPO              Policy-based  Stable, efficient        Hyperparameter sensitive

💡 Best Practices

  • Start with simple environments (grid worlds, CartPole) before scaling up
  • Balance exploration and exploitation, e.g. epsilon-greedy with a decaying epsilon
  • Discount and normalize returns to keep updates stable
  • Reuse experience with a replay buffer instead of discarding each transition
  • Track total reward per episode and plot the learning curve to confirm progress

🎯 Key Takeaways

  • RL agents learn by trial and error, guided only by rewards from the environment
  • Q-learning stores action values in a table and suits small, discrete problems
  • DQN replaces the Q-table with a neural network to handle large state spaces
  • Policy gradient methods optimize the policy directly and extend to continuous actions