šŸŽ® Reinforcement Learning Basics

Agents, Rewards, and Q-Learning

What is Reinforcement Learning?

Reinforcement Learning (RL) is about learning from experience. An agent learns to make decisions by interacting with an environment, receiving rewards for good actions and penalties for bad ones. It's how AlphaGo learned to beat world champions, how robots learn to walk, and how recommendation systems optimize engagement.

Key Differences from Other ML Approaches:

  • Supervised Learning: Learn from labeled examples (X → Y)
  • Unsupervised Learning: Find patterns in data
  • Reinforcement Learning: Learn from rewards through trial and error

🧩 Core Concepts

# RL Components

rl_components = {
    'Agent': 'The learner/decision maker (e.g., game player, robot)',
    'Environment': 'The world the agent interacts with',
    'State (s)': 'Current situation of the agent',
    'Action (a)': 'What the agent can do',
    'Reward (r)': 'Feedback signal (positive or negative)',
    'Policy (Ļ€)': 'Strategy for choosing actions',
    'Value Function (V)': 'Expected long-term reward from a state',
    'Q-Function (Q)': 'Expected long-term reward for taking action a in state s'
}

print("Reinforcement Learning Components:")
print("="*60)
for component, description in rl_components.items():
    print(f"{component:20s}: {description}")

# RL Loop:
# 1. Agent observes state s
# 2. Agent takes action a
# 3. Environment gives reward r and new state s'
# 4. Agent learns from experience
# 5. Repeat

print("\n" + "="*60)
print("RL Training Loop:")
print("  State → Action → Reward → New State → Learn → Repeat")
print("="*60)

Real-World Examples:

  • Game playing: AlphaGo and Atari agents learn winning strategies by playing millions of games
  • Robotics: Robots learn to walk and manipulate objects through trial and error
  • Recommendation systems: User engagement is treated as a reward signal to optimize

šŸŽÆ Markov Decision Process (MDP)

# MDP: Mathematical framework for RL

import numpy as np
import matplotlib.pyplot as plt

# Simple grid world example
# Agent navigates 4x4 grid to reach goal

class GridWorld:
    def __init__(self, size=4):
        self.size = size
        self.state = (0, 0)  # Start position
        self.goal = (3, 3)   # Goal position
        
    def reset(self):
        """Reset to start state"""
        self.state = (0, 0)
        return self.state
    
    def step(self, action):
        """
        Take action and return (new_state, reward, done)
        
        Actions: 0=up, 1=right, 2=down, 3=left
        """
        x, y = self.state
        
        # Take action
        if action == 0 and x > 0:  # Up
            x -= 1
        elif action == 1 and y < self.size - 1:  # Right
            y += 1
        elif action == 2 and x < self.size - 1:  # Down
            x += 1
        elif action == 3 and y > 0:  # Left
            y -= 1
        
        # Update state
        self.state = (x, y)
        
        # Calculate reward
        if self.state == self.goal:
            reward = 10  # Reached goal!
            done = True
        else:
            reward = -1  # Small penalty for each step
            done = False
        
        return self.state, reward, done
    
    def render(self):
        """Visualize grid"""
        grid = np.zeros((self.size, self.size))
        grid[self.state] = 1  # Agent
        grid[self.goal] = 2   # Goal
        
        plt.imshow(grid, cmap='RdYlGn', vmin=0, vmax=2)
        plt.title(f'Agent at {self.state}, Goal at {self.goal}')
        plt.colorbar(ticks=[0, 1, 2], label='0=Empty, 1=Agent, 2=Goal')
        plt.show()

# Create environment
env = GridWorld()

print("Grid World Environment:")
print(f"  Size: {env.size}x{env.size}")
print(f"  Start: {env.state}")
print(f"  Goal: {env.goal}")
print(f"  Actions: 0=up, 1=right, 2=down, 3=left")
print()

# Test environment
state = env.reset()
print(f"Initial state: {state}")

state, reward, done = env.step(1)  # Move right
print(f"After moving right: state={state}, reward={reward}, done={done}")

state, reward, done = env.step(2)  # Move down
print(f"After moving down: state={state}, reward={reward}, done={done}")

šŸ” Exploration vs Exploitation

# The RL dilemma: Explore new actions or exploit known good ones?

class EpsilonGreedy:
    """ε-greedy strategy for balancing exploration and exploitation"""
    
    def __init__(self, epsilon=0.1):
        self.epsilon = epsilon  # Probability of exploring
    
    def select_action(self, q_values):
        """
        Select action using ε-greedy strategy
        
        Args:
            q_values: Array of Q-values for each action
        
        Returns:
            Selected action index
        """
        if np.random.random() < self.epsilon:
            # Explore: Choose random action
            return np.random.randint(len(q_values))
        else:
            # Exploit: Choose best known action
            return np.argmax(q_values)
    
    def decay_epsilon(self, decay_rate=0.995):
        """Reduce exploration over time"""
        self.epsilon *= decay_rate

# Example usage
strategy = EpsilonGreedy(epsilon=0.3)

# Simulate Q-values for 4 actions
q_values = np.array([0.5, 0.8, 0.3, 0.6])

print("Q-values for each action:", q_values)
print(f"Best action: {np.argmax(q_values)} (Q={q_values[np.argmax(q_values)]:.2f})")
print()

# Test ε-greedy selection
actions_selected = []
for _ in range(100):
    action = strategy.select_action(q_values)
    actions_selected.append(action)

print(f"With ε={strategy.epsilon}:")
for action in range(len(q_values)):
    count = actions_selected.count(action)
    print(f"  Action {action} selected: {count}% of time")

print("\nāœ“ Higher ε = more exploration")
print("āœ“ Lower ε = more exploitation")
print("āœ“ Typically start high (0.3-1.0) and decay over time")

šŸ“Š Q-Learning Algorithm

# Q-Learning: Learn optimal action-value function

class QLearningAgent:
    def __init__(self, n_states, n_actions, learning_rate=0.1, 
                 discount_factor=0.95, epsilon=0.1):
        """
        Initialize Q-Learning agent
        
        Args:
            n_states: Number of possible states
            n_actions: Number of possible actions
            learning_rate (α): How much to update Q-values (0-1)
            discount_factor (γ): Importance of future rewards (0-1)
            epsilon (ε): Exploration rate (0-1)
        """
        self.n_states = n_states
        self.n_actions = n_actions
        self.alpha = learning_rate
        self.gamma = discount_factor
        self.epsilon = epsilon
        
        # Initialize Q-table with zeros
        self.Q = np.zeros((n_states, n_actions))
    
    def get_action(self, state):
        """Choose action using ε-greedy policy"""
        if np.random.random() < self.epsilon:
            return np.random.randint(self.n_actions)  # Explore
        else:
            return np.argmax(self.Q[state])  # Exploit
    
    def update(self, state, action, reward, next_state, done):
        """
        Update Q-value using Q-Learning formula:
        Q(s,a) ← Q(s,a) + α[r + γ·max Q(s',a') - Q(s,a)]
        """
        if done:
            # No future rewards if episode ended
            target = reward
        else:
            # Best possible future reward
            target = reward + self.gamma * np.max(self.Q[next_state])
        
        # Update Q-value
        self.Q[state, action] += self.alpha * (target - self.Q[state, action])
    
    def decay_epsilon(self, decay_rate=0.995):
        """Reduce exploration over time"""
        self.epsilon = max(0.01, self.epsilon * decay_rate)

# Example Q-table for 4x4 grid
n_states = 16  # 4x4 grid
n_actions = 4  # up, right, down, left

agent = QLearningAgent(n_states, n_actions)

print("Q-Learning Agent initialized!")
print(f"  States: {n_states}")
print(f"  Actions: {n_actions}")
print(f"  Learning rate (α): {agent.alpha}")
print(f"  Discount factor (γ): {agent.gamma}")
print(f"  Exploration (ε): {agent.epsilon}")
print()

print("Q-Table shape:", agent.Q.shape)
print("Initial Q-values (all zeros):")
print(agent.Q[:4, :])  # Show first 4 states
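
To see the update formula in action, the sketch below applies it once by hand to the freshly initialized table. The transition (state 0, action "right", reward -1, next state 1) is made up purely for illustration.

# One hand-worked Q-Learning update on the fresh table (illustrative transition)
s, a, r, s_next = 0, 1, -1, 1     # hypothetical: state 0, action 'right', reward -1

old_q = agent.Q[s, a]
target = r + agent.gamma * np.max(agent.Q[s_next])        # r + γ·max_a' Q(s', a')
agent.Q[s, a] += agent.alpha * (target - agent.Q[s, a])   # move Q(s,a) toward the target

print(f"\nQ({s},{a}) before update: {old_q:.3f}")
print(f"Target (r + γ·max Q(s',·)): {target:.3f}")
print(f"Q({s},{a}) after one update: {agent.Q[s, a]:.3f}")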

šŸ‹ļø Training Q-Learning Agent

# Complete training loop

def train_q_learning(env, agent, episodes=1000, max_steps=100):
    """
    Train Q-Learning agent
    
    Args:
        env: Environment
        agent: Q-Learning agent
        episodes: Number of training episodes
        max_steps: Maximum steps per episode
    
    Returns:
        rewards_history: List of total rewards per episode
    """
    rewards_history = []
    
    for episode in range(episodes):
        state = env.reset()
        state_idx = state[0] * env.size + state[1]  # Convert (x,y) to index
        
        total_reward = 0
        
        for step in range(max_steps):
            # Choose action
            action = agent.get_action(state_idx)
            
            # Take action
            next_state, reward, done = env.step(action)
            next_state_idx = next_state[0] * env.size + next_state[1]
            
            # Update Q-table
            agent.update(state_idx, action, reward, next_state_idx, done)
            
            # Update state and reward
            state_idx = next_state_idx
            total_reward += reward
            
            if done:
                break
        
        # Decay exploration
        agent.decay_epsilon()
        
        # Record episode reward
        rewards_history.append(total_reward)
        
        # Print progress
        if (episode + 1) % 100 == 0:
            avg_reward = np.mean(rewards_history[-100:])
            print(f"Episode {episode+1}/{episodes} - "
                  f"Avg Reward: {avg_reward:.2f}, "
                  f"ε: {agent.epsilon:.3f}")
    
    return rewards_history

# Train agent
print("Training Q-Learning agent...\n")
env = GridWorld(size=4)
agent = QLearningAgent(n_states=16, n_actions=4)

rewards = train_q_learning(env, agent, episodes=500)

# Plot learning curve
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(rewards, alpha=0.3)
plt.plot(np.convolve(rewards, np.ones(50)/50, mode='valid'), linewidth=2)
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Learning Curve')
plt.grid(True)

plt.subplot(1, 2, 2)
plt.imshow(agent.Q, cmap='RdYlGn', aspect='auto')
plt.colorbar(label='Q-value')
plt.xlabel('Action')
plt.ylabel('State')
plt.title('Learned Q-Table')

plt.tight_layout()
plt.show()

print("\nāœ“ Agent trained!")

šŸŽ® Using OpenAI Gym

# OpenAI Gym: Standard RL environments
# (requires installation: pip install gym, or its maintained successor Gymnasium)

# import gym                     # uncomment once the library is installed

# Create environment
# env = gym.make('CartPole-v1')  # Balance pole on cart

# Available environments:
popular_envs = {
    'CartPole-v1': 'Balance a pole on a moving cart',
    'MountainCar-v0': 'Drive car up a steep mountain',
    'LunarLander-v2': 'Land spacecraft safely',
    'FrozenLake-v1': 'Navigate frozen lake to goal',
    'Taxi-v3': 'Pick up and drop off passengers',
    'Acrobot-v1': 'Swing up a two-link robot',
    'Pong-v0': 'Classic Atari Pong game (legacy ID; newer releases use ALE/Pong-v5)',
    'Breakout-v0': 'Classic Atari Breakout game (legacy ID; newer releases use ALE/Breakout-v5)'
}

print("Popular Gym Environments:")
for env_name, description in popular_envs.items():
    print(f"  {env_name:20s}: {description}")

# Example: CartPole
print("\n" + "="*60)
print("CartPole-v1 Environment:")

# Pseudocode (requires gym installation)
"""
env = gym.make('CartPole-v1')

# Environment info
print(f"Observation space: {env.observation_space}")  # State
print(f"Action space: {env.action_space}")  # Actions

# Reset environment
state = env.reset()

# Take action
action = env.action_space.sample()  # Random action
next_state, reward, done, info = env.step(action)
# Note: Gym >= 0.26 and Gymnasium instead return (obs, info) from reset()
# and (obs, reward, terminated, truncated, info) from step()

# Render (visualize)
env.render()

# Close
env.close()
"""

print("  State: [cart_position, cart_velocity, pole_angle, pole_velocity]")
print("  Actions: 0=push left, 1=push right")
print("  Goal: Keep pole balanced for 500 steps")
print("  Reward: +1 for each step balanced")
print("="*60)

šŸš€ Deep Q-Networks (DQN)

# DQN: Use neural network instead of Q-table

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from collections import deque
import random

class DQNAgent:
    def __init__(self, state_size, action_size):
        """
        Deep Q-Network agent
        
        Args:
            state_size: Dimension of state space
            action_size: Number of actions
        """
        self.state_size = state_size
        self.action_size = action_size
        
        # Hyperparameters
        self.gamma = 0.95      # Discount factor
        self.epsilon = 1.0     # Exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        
        # Experience replay buffer
        self.memory = deque(maxlen=2000)
        self.batch_size = 32
        
        # Q-Network
        self.model = self._build_model()
    
    def _build_model(self):
        """Build neural network to approximate Q-function"""
        model = keras.Sequential([
            layers.Dense(24, activation='relu', input_shape=(self.state_size,)),
            layers.Dense(24, activation='relu'),
            layers.Dense(self.action_size, activation='linear')
        ])
        
        model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=self.learning_rate),
            loss='mse'
        )
        
        return model
    
    def remember(self, state, action, reward, next_state, done):
        """Store experience in replay buffer"""
        self.memory.append((state, action, reward, next_state, done))
    
    def act(self, state):
        """Choose action using ε-greedy policy (state must have shape (1, state_size))"""
        if np.random.random() <= self.epsilon:
            return random.randrange(self.action_size)
        
        q_values = self.model.predict(state, verbose=0)
        return np.argmax(q_values[0])
    
    def replay(self):
        """Train on random batch from memory"""
        if len(self.memory) < self.batch_size:
            return
        
        # Sample random batch
        batch = random.sample(self.memory, self.batch_size)
        
        for state, action, reward, next_state, done in batch:
            target = reward
            if not done:
                target += self.gamma * np.amax(
                    self.model.predict(next_state, verbose=0)[0]
                )
            
            # Update Q-value for taken action
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target
            
            # Train network on this one transition
            # (fitting one sample at a time keeps the code simple but is slow;
            #  practical DQN implementations build a single batched target array)
            self.model.fit(state, target_f, epochs=1, verbose=0)
        
        # Decay exploration
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

# Create DQN agent
state_size = 4  # e.g., CartPole state
action_size = 2  # e.g., left or right

dqn_agent = DQNAgent(state_size, action_size)

print("Deep Q-Network Agent:")
print(f"  State size: {state_size}")
print(f"  Action size: {action_size}")
print(f"  Network architecture:")
dqn_agent.model.summary()

print("\nDQN Advantages over Q-Learning:")
print("  • Handles large/continuous state spaces")
print("  • Learns feature representations automatically")
print("  • Can generalize to unseen states")
print("  • Scales to complex problems (e.g., Atari games)")

šŸŽÆ Advanced RL Algorithms

# Overview of modern RL algorithms

rl_algorithms = {
    'Q-Learning': {
        'Type': 'Value-based',
        'Pros': 'Simple, well-understood',
        'Cons': 'Only for discrete actions, Q-table can be huge',
        'Use case': 'Small state/action spaces'
    },
    'DQN': {
        'Type': 'Value-based',
        'Pros': 'Handles large state spaces, image inputs',
        'Cons': 'Discrete actions only, can be unstable',
        'Use case': 'Atari games, complex environments'
    },
    'Policy Gradient': {
        'Type': 'Policy-based',
        'Pros': 'Works with continuous actions, stochastic policies',
        'Cons': 'High variance, slow convergence',
        'Use case': 'Robotics, continuous control'
    },
    'Actor-Critic': {
        'Type': 'Hybrid',
        'Pros': 'Lower variance than policy gradient, flexible',
        'Cons': 'More complex to implement',
        'Use case': 'General purpose, good default choice'
    },
    'PPO': {
        'Type': 'Policy-based',
        'Pros': 'Stable, sample efficient, widely used',
        'Cons': 'Requires tuning',
        'Use case': 'General-purpose default for most new projects'
    },
    'A3C': {
        'Type': 'Actor-Critic',
        'Pros': 'Parallel training, fast',
        'Cons': 'Needs multiple cores',
        'Use case': 'Large-scale training'
    },
    'SAC': {
        'Type': 'Actor-Critic',
        'Pros': 'Very sample efficient, stable',
        'Cons': 'Complex implementation',
        'Use case': 'Robotics, continuous control'
    }
}

print("Advanced RL Algorithms:")
print("="*70)
for name, info in rl_algorithms.items():
    print(f"\n{name}:")
    for key, value in info.items():
        print(f"  {key:12s}: {value}")

print("\n" + "="*70)
print("Recommendation:")
print("  Beginners: Start with Q-Learning")
print("  Image inputs: Use DQN")
print("  Continuous actions: Use PPO or SAC")
print("  Production: Use PPO (most reliable)")
print("="*70)

šŸŽÆ Key Takeaways

šŸš€ Real-World Applications