Project Overview
Learn to fine-tune pre-trained language models for your specific domain or task. This project covers data preparation, training, evaluation, and deployment of custom models using OpenAI's fine-tuning API and Hugging Face Transformers.
What You'll Learn:
- Data preparation: Format and clean training data
- Fine-tuning strategies: When and how to fine-tune
- OpenAI fine-tuning: Custom GPT-3.5-turbo models
- Hugging Face training: Fine-tune BERT, GPT-2, etc.
- Evaluation: Measure model performance
- Deployment: Serve your custom model
When to Fine-Tune vs. Alternatives
# Decision tree for choosing the right approach
decision_framework = {
'Prompt Engineering': {
'When': 'Task fits within context window, simple customization',
'Cost': 'Low (pay per API call)',
'Time': 'Minutes',
'Examples': 'Change tone, format, simple instructions',
},
'RAG (Retrieval Augmented Generation)': {
'When': 'Need external knowledge, company docs, latest info',
'Cost': 'Low-Medium (embeddings + LLM calls)',
'Time': 'Hours to days',
'Examples': 'Q&A over documents, knowledge base chatbot',
},
'Fine-Tuning': {
'When': 'Need consistent behavior, specialized domain, repeated patterns',
'Cost': 'Medium-High (training + inference)',
'Time': 'Days to weeks',
'Examples': 'Custom writing style, domain-specific language, task automation',
},
'Training from Scratch': {
'When': 'Completely new domain, maximum control, huge dataset',
'Cost': 'Very High (compute intensive)',
'Time': 'Weeks to months',
'Examples': 'New language, proprietary data, research',
}
}
print("Choose the Right Approach:\n")
for approach, details in decision_framework.items():
print(f"{approach}:")
for key, value in details.items():
print(f" {key}: {value}")
print()
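The framework above is descriptive; as a quick sanity check you can also encode it as a small helper. This is a minimal sketch with illustrative question names and rules, not a formal rubric.
def recommend_approach(needs_external_knowledge: bool,
                       needs_consistent_style: bool,
                       has_hundreds_of_examples: bool) -> str:
    """Rough recommendation derived from the framework above (illustrative rules only)."""
    if needs_external_knowledge:
        return 'RAG (Retrieval Augmented Generation)'
    if needs_consistent_style and has_hundreds_of_examples:
        return 'Fine-Tuning'
    return 'Prompt Engineering'

# Example: a documentation Q&A bot needs external knowledge
print(recommend_approach(True, False, False))  # -> RAG (Retrieval Augmented Generation)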
Data Preparation
import json
import pandas as pd
from typing import List, Dict
class DataPreparator:
"""Prepare data for fine-tuning"""
def __init__(self):
self.training_data = []
self.validation_data = []
def create_training_example(self,
prompt: str,
completion: str,
system_message: str = None) -> Dict:
"""
Create a single training example
For OpenAI fine-tuning format:
{"messages": [
{"role": "system", "content": "..."},
{"role": "user", "content": "..."},
{"role": "assistant", "content": "..."}
]}
"""
messages = []
if system_message:
messages.append({"role": "system", "content": system_message})
messages.append({"role": "user", "content": prompt})
messages.append({"role": "assistant", "content": completion})
return {"messages": messages}
def load_from_csv(self,
csv_path: str,
prompt_column: str,
completion_column: str,
system_column: str = None,
train_split: float = 0.8):
"""Load and split data from CSV"""
df = pd.read_csv(csv_path)
print(f"Loaded {len(df)} examples from {csv_path}")
# Create training examples
examples = []
for _, row in df.iterrows():
system = row[system_column] if system_column else None
example = self.create_training_example(
prompt=row[prompt_column],
completion=row[completion_column],
system_message=system
)
examples.append(example)
# Split train/validation
split_idx = int(len(examples) * train_split)
self.training_data = examples[:split_idx]
self.validation_data = examples[split_idx:]
print(f"Split: {len(self.training_data)} training, {len(self.validation_data)} validation")
return self.training_data, self.validation_data
def validate_data(self, examples: List[Dict]) -> Dict:
"""
Validate training data format and quality
Returns:
Dict with validation results and warnings
"""
issues = {
'total_examples': len(examples),
'errors': [],
'warnings': [],
'stats': {}
}
# Check format
for i, example in enumerate(examples):
if 'messages' not in example:
issues['errors'].append(f"Example {i}: Missing 'messages' key")
continue
messages = example['messages']
# Check message structure
if not messages or len(messages) < 2:
issues['errors'].append(f"Example {i}: Need at least user and assistant message")
# Check roles
roles = [m['role'] for m in messages]
if 'assistant' not in roles:
issues['errors'].append(f"Example {i}: Missing assistant response")
# Calculate statistics
prompt_lengths = []
completion_lengths = []
for example in examples:
for msg in example['messages']:
if msg['role'] == 'user':
prompt_lengths.append(len(msg['content']))
elif msg['role'] == 'assistant':
completion_lengths.append(len(msg['content']))
issues['stats'] = {
'avg_prompt_length': sum(prompt_lengths) / len(prompt_lengths) if prompt_lengths else 0,
'avg_completion_length': sum(completion_lengths) / len(completion_lengths) if completion_lengths else 0,
'min_examples_recommended': 50,
'your_examples': len(examples)
}
# Warnings
if len(examples) < 50:
issues['warnings'].append(f"Only {len(examples)} examples. Recommend 50+ for good results")
if issues['stats']['avg_prompt_length'] > 2000:
issues['warnings'].append("Average prompt length is high. Consider summarizing.")
return issues
def save_jsonl(self, examples: List[Dict], output_path: str):
"""Save examples in JSONL format"""
with open(output_path, 'w') as f:
for example in examples:
f.write(json.dumps(example) + '\n')
print(f"Saved {len(examples)} examples to {output_path}")
# Example usage
preparator = DataPreparator()
# Create sample training data
training_examples = [
preparator.create_training_example(
prompt="Write a product description for wireless headphones",
completion="Experience premium sound with our wireless headphones featuring active noise cancellation, 30-hour battery life, and crystal-clear audio quality.",
system_message="You are a professional product description writer."
),
# Add more examples...
]
# Validate data
validation_results = preparator.validate_data(training_examples)
print("\nValidation Results:")
print(f"Total Examples: {validation_results['total_examples']}")
print(f"Errors: {len(validation_results['errors'])}")
print(f"Warnings: {len(validation_results['warnings'])}")
print(f"\nStats:")
for key, value in validation_results['stats'].items():
print(f" {key}: {value}")
# Save to file
preparator.save_jsonl(training_examples, 'training_data.jsonl')
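The OpenAI section below also uploads a validation file. Here is a sketch of producing both splits from a CSV with load_from_csv, commented out because the CSV path and column names are placeholder assumptions.
# Sketch: build both JSONL files from a CSV (path and column names are assumptions)
# train_data, val_data = preparator.load_from_csv(
#     csv_path='product_descriptions.csv',
#     prompt_column='prompt',
#     completion_column='completion',
#     train_split=0.8
# )
# preparator.save_jsonl(train_data, 'training_data.jsonl')
# preparator.save_jsonl(val_data, 'validation_data.jsonl')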
OpenAI Fine-Tuning
from openai import OpenAI
import time
import os
class OpenAIFineTuner:
"""Fine-tune OpenAI models"""
def __init__(self, api_key=None):
self.client = OpenAI(api_key=api_key or os.getenv('OPENAI_API_KEY'))
def upload_training_file(self, file_path: str) -> str:
"""Upload training data to OpenAI"""
print(f"Uploading {file_path}...")
with open(file_path, 'rb') as f:
response = self.client.files.create(
file=f,
purpose='fine-tune'
)
file_id = response.id
print(f"ā File uploaded: {file_id}")
return file_id
def create_fine_tune_job(self,
training_file_id: str,
validation_file_id: str = None,
model: str = "gpt-3.5-turbo",
n_epochs: int = 3,
learning_rate_multiplier: float = None,
suffix: str = None) -> str:
"""
Create fine-tuning job
Args:
training_file_id: ID of uploaded training file
validation_file_id: Optional validation file
model: Base model to fine-tune
n_epochs: Number of training epochs (1-50)
learning_rate_multiplier: Learning rate multiplier (0.02-2.0)
suffix: Custom suffix for model name (up to 40 chars)
"""
print(f"Creating fine-tune job for {model}...")
hyperparameters = {"n_epochs": n_epochs}
if learning_rate_multiplier:
hyperparameters["learning_rate_multiplier"] = learning_rate_multiplier
response = self.client.fine_tuning.jobs.create(
training_file=training_file_id,
validation_file=validation_file_id,
model=model,
hyperparameters=hyperparameters,
suffix=suffix
)
job_id = response.id
print(f"ā Fine-tune job created: {job_id}")
print(f" Status: {response.status}")
return job_id
def monitor_fine_tune_job(self, job_id: str, poll_interval: int = 60):
"""
Monitor fine-tuning progress
Args:
job_id: Fine-tune job ID
poll_interval: Seconds between status checks
"""
print(f"\nMonitoring job {job_id}...")
print("This may take 10-60 minutes depending on data size.\n")
while True:
job = self.client.fine_tuning.jobs.retrieve(job_id)
status = job.status
print(f"Status: {status}")
if status == 'succeeded':
print(f"\nā Fine-tuning completed!")
print(f"Model ID: {job.fine_tuned_model}")
return job.fine_tuned_model
elif status == 'failed':
print(f"\nā Fine-tuning failed")
print(f"Error: {job.error}")
return None
elif status == 'cancelled':
print(f"\nā Fine-tuning cancelled")
return None
# Still running
time.sleep(poll_interval)
def list_fine_tune_jobs(self, limit: int = 10):
"""List recent fine-tune jobs"""
jobs = self.client.fine_tuning.jobs.list(limit=limit)
print(f"Recent Fine-Tune Jobs:\n")
for job in jobs.data:
print(f"ID: {job.id}")
print(f" Status: {job.status}")
print(f" Model: {job.model}")
if job.fine_tuned_model:
print(f" Fine-tuned model: {job.fine_tuned_model}")
print()
def use_fine_tuned_model(self, model_id: str, prompt: str) -> str:
"""Use your fine-tuned model"""
response = self.client.chat.completions.create(
model=model_id,
messages=[
{"role": "user", "content": prompt}
],
temperature=0.7
)
return response.choices[0].message.content
# Example usage
fine_tuner = OpenAIFineTuner()
# Step 1: Upload training data (and, optionally, a validation split saved the same way)
training_file_id = fine_tuner.upload_training_file('training_data.jsonl')
validation_file_id = fine_tuner.upload_training_file('validation_data.jsonl')  # assumes you saved a validation split
# Step 2: Create fine-tune job
job_id = fine_tuner.create_fine_tune_job(
training_file_id=training_file_id,
validation_file_id=validation_file_id,
model="gpt-3.5-turbo",
n_epochs=3,
suffix="my-custom-model"
)
# Step 3: Monitor progress
# model_id = fine_tuner.monitor_fine_tune_job(job_id)
# Step 4: Use fine-tuned model
# result = fine_tuner.use_fine_tuned_model(
# model_id=model_id,
# prompt="Write a product description for a laptop"
# )
# print(result)
print("\nā OpenAI fine-tuning setup complete!")
Hugging Face Fine-Tuning
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling
)
from datasets import load_dataset, Dataset
import torch
class HuggingFaceFineTuner:
"""Fine-tune Hugging Face models"""
def __init__(self, model_name: str = "gpt2"):
"""
Initialize with base model
Popular options:
- "gpt2" / "gpt2-medium" / "gpt2-large"
- "EleutherAI/gpt-neo-125M"
- "facebook/opt-350m"
- "google/flan-t5-base"
"""
self.model_name = model_name
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
self.model = AutoModelForCausalLM.from_pretrained(model_name)
# Add padding token if missing
if self.tokenizer.pad_token is None:
self.tokenizer.pad_token = self.tokenizer.eos_token
print(f"ā Loaded {model_name}")
print(f" Parameters: {self.model.num_parameters():,}")
def prepare_dataset(self, texts: List[str], max_length: int = 512):
"""Prepare dataset for training"""
def tokenize_function(examples):
return self.tokenizer(
examples['text'],
truncation=True,
max_length=max_length,
padding='max_length'
)
# Create dataset
dataset = Dataset.from_dict({'text': texts})
# Tokenize
tokenized_dataset = dataset.map(
tokenize_function,
batched=True,
remove_columns=['text']
)
print(f"ā Prepared {len(texts)} examples")
return tokenized_dataset
def train(self,
train_dataset,
output_dir: str = "./fine-tuned-model",
num_epochs: int = 3,
batch_size: int = 4,
learning_rate: float = 2e-5,
save_steps: int = 500,
eval_dataset = None):
"""
Fine-tune the model
Args:
train_dataset: Training dataset
output_dir: Where to save the model
num_epochs: Number of training epochs
batch_size: Training batch size
learning_rate: Learning rate
save_steps: Save checkpoint every N steps
eval_dataset: Optional validation dataset
"""
# Training arguments
training_args = TrainingArguments(
output_dir=output_dir,
num_train_epochs=num_epochs,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
learning_rate=learning_rate,
warmup_steps=100,
weight_decay=0.01,
logging_dir=f"{output_dir}/logs",
logging_steps=50,
save_steps=save_steps,
save_total_limit=3,
evaluation_strategy="steps" if eval_dataset else "no",
eval_steps=save_steps if eval_dataset else None,
fp16=torch.cuda.is_available(), # Use mixed precision if GPU available
)
# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
tokenizer=self.tokenizer,
mlm=False # Causal language modeling
)
# Create trainer
trainer = Trainer(
model=self.model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
data_collator=data_collator,
)
print(f"\nStarting training...")
print(f" Epochs: {num_epochs}")
print(f" Batch size: {batch_size}")
print(f" Learning rate: {learning_rate}")
print(f" Device: {'GPU' if torch.cuda.is_available() else 'CPU'}")
print()
# Train
trainer.train()
# Save final model
trainer.save_model(output_dir)
self.tokenizer.save_pretrained(output_dir)
print(f"\nā Training complete! Model saved to {output_dir}")
return trainer
def generate_text(self,
prompt: str,
max_length: int = 100,
temperature: float = 0.7,
top_p: float = 0.9,
num_return_sequences: int = 1):
"""Generate text with the fine-tuned model"""
inputs = self.tokenizer(prompt, return_tensors="pt")
outputs = self.model.generate(
**inputs,
max_length=max_length,
temperature=temperature,
top_p=top_p,
num_return_sequences=num_return_sequences,
do_sample=True,
pad_token_id=self.tokenizer.eos_token_id
)
generated_texts = [
self.tokenizer.decode(output, skip_special_tokens=True)
for output in outputs
]
return generated_texts[0] if num_return_sequences == 1 else generated_texts
# Example usage
fine_tuner = HuggingFaceFineTuner(model_name="gpt2")
# Prepare training data
training_texts = [
"Product: Wireless Mouse. Description: Ergonomic design with 6 buttons...",
"Product: Mechanical Keyboard. Description: RGB backlit keys with Cherry MX switches...",
# Add more examples...
]
train_dataset = fine_tuner.prepare_dataset(training_texts)
# Fine-tune
# trainer = fine_tuner.train(
# train_dataset=train_dataset,
# output_dir="./my-fine-tuned-gpt2",
# num_epochs=3,
# batch_size=4
# )
# Generate with fine-tuned model
# text = fine_tuner.generate_text(
# prompt="Product: Wireless Headphones. Description:",
# max_length=100
# )
# print(text)
print("\nā Hugging Face fine-tuning ready!")
Evaluation Metrics
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
class ModelEvaluator:
"""Evaluate fine-tuned model performance"""
def __init__(self, base_model, fine_tuned_model):
self.base_model = base_model
self.fine_tuned_model = fine_tuned_model
def compare_outputs(self, test_prompts: List[str], expected_outputs: List[str]):
"""
Compare base model vs fine-tuned model
Returns comparison metrics
"""
print("Evaluating models on test set...\n")
base_outputs = []
finetuned_outputs = []
for prompt in test_prompts:
# Get outputs from both models
base_out = self.base_model.generate(prompt)
ft_out = self.fine_tuned_model.generate(prompt)
base_outputs.append(base_out)
finetuned_outputs.append(ft_out)
# Calculate metrics
results = {
'test_cases': len(test_prompts),
'base_model': self._evaluate_outputs(base_outputs, expected_outputs),
'fine_tuned': self._evaluate_outputs(finetuned_outputs, expected_outputs),
}
# Calculate improvement
results['improvement'] = {
'accuracy': results['fine_tuned']['accuracy'] - results['base_model']['accuracy'],
'avg_similarity': results['fine_tuned']['avg_similarity'] - results['base_model']['avg_similarity']
}
return results
def _evaluate_outputs(self, outputs: List[str], expected: List[str]) -> Dict:
"""Calculate evaluation metrics"""
from difflib import SequenceMatcher
similarities = []
for output, expect in zip(outputs, expected):
similarity = SequenceMatcher(None, output, expect).ratio()
similarities.append(similarity)
return {
'accuracy': sum(1 for s in similarities if s > 0.8) / len(similarities),
'avg_similarity': np.mean(similarities),
'min_similarity': np.min(similarities),
'max_similarity': np.max(similarities),
}
def calculate_perplexity(self, model, test_texts: List[str]) -> float:
"""
Calculate perplexity (lower is better)
Measures how well model predicts the text
"""
total_loss = 0
total_tokens = 0
for text in test_texts:
inputs = model.tokenizer(text, return_tensors="pt")
with torch.no_grad():
outputs = model.model(**inputs, labels=inputs["input_ids"])
loss = outputs.loss
total_loss += loss.item() * inputs["input_ids"].size(1)
total_tokens += inputs["input_ids"].size(1)
avg_loss = total_loss / total_tokens
perplexity = np.exp(avg_loss)
return perplexity
def generate_evaluation_report(self, results: Dict) -> str:
"""Generate readable evaluation report"""
report = "Model Evaluation Report\n"
report += "=" * 50 + "\n\n"
report += f"Test Cases: {results['test_cases']}\n\n"
report += "Base Model:\n"
for metric, value in results['base_model'].items():
report += f" {metric}: {value:.3f}\n"
report += "\nFine-Tuned Model:\n"
for metric, value in results['fine_tuned'].items():
report += f" {metric}: {value:.3f}\n"
report += "\nImprovement:\n"
for metric, value in results['improvement'].items():
sign = "+" if value > 0 else ""
report += f" {metric}: {sign}{value:.3f}\n"
return report
print("Evaluation tools ready!")
Deployment
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import torch
import uvicorn
class ModelServer:
"""Deploy fine-tuned model as API"""
def __init__(self, model_path: str):
"""Load fine-tuned model"""
from transformers import AutoModelForCausalLM, AutoTokenizer
self.tokenizer = AutoTokenizer.from_pretrained(model_path)
self.model = AutoModelForCausalLM.from_pretrained(model_path)
self.model.eval()
print(f"ā Model loaded from {model_path}")
def generate(self, prompt: str, max_length: int = 100) -> str:
"""Generate text"""
inputs = self.tokenizer(prompt, return_tensors="pt")
with torch.no_grad():
outputs = self.model.generate(
**inputs,
max_length=max_length,
temperature=0.7,
top_p=0.9,
do_sample=True
)
text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
return text
# Create FastAPI app
app = FastAPI(title="Fine-Tuned Model API")
# Load the fine-tuned model (point this at the directory you saved to during training)
model_server = ModelServer("./my-fine-tuned-gpt2")
class GenerateRequest(BaseModel):
prompt: str
max_length: int = 100
class GenerateResponse(BaseModel):
generated_text: str
@app.post("/generate", response_model=GenerateResponse)
async def generate_text(request: GenerateRequest):
"""Generate text endpoint"""
try:
text = model_server.generate(
prompt=request.prompt,
max_length=request.max_length
)
return GenerateResponse(generated_text=text)
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.get("/health")
async def health_check():
"""Health check endpoint"""
return {"status": "healthy", "model": "loaded"}
# Run with: uvicorn server:app --host 0.0.0.0 --port 8000
print("API server ready!")
print("Run with: uvicorn server:app --reload")
print("\nTest with:")
print(" curl -X POST http://localhost:8000/generate \\")
print(' -H "Content-Type: application/json" \\')
print(' -d \'{"prompt": "Your prompt here"}\'')
Cost & Performance Comparison
| Platform | Training Cost | Training Time | Inference Cost | Best For |
|---|---|---|---|---|
| OpenAI Fine-Tuning | $0.008/1K tokens | 10-60 min | $0.012/1K tokens | Quick deployment, GPT quality |
| Hugging Face (Cloud) | $0.50-2/hour GPU | Hours to days | Self-hosted or API | Full control, various models |
| Local Training | Free (own hardware) | Varies by GPU | Free | Privacy, experimentation |
| Google Colab | Free or $10/month | Hours | Self-hosted | Learning, prototyping |
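For a rough back-of-the-envelope estimate of OpenAI training cost, multiply your token count by the number of epochs and the training price. The sketch below uses the table's $0.008/1K figure, which may be outdated, so verify current pricing.
# Rough OpenAI training cost estimate (price taken from the table above; verify current rates)
def estimate_openai_training_cost(total_training_tokens: int,
                                  n_epochs: int = 3,
                                  price_per_1k_tokens: float = 0.008) -> float:
    """Cost is approximately tokens * epochs * price per 1K tokens."""
    return total_training_tokens * n_epochs * price_per_1k_tokens / 1000

# Example: 100 examples averaging 500 tokens each, 3 epochs -> about $1.20
print(f"Estimated training cost: ${estimate_openai_training_cost(100 * 500):.2f}")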
Key Takeaways
- Choose wisely: Prompt engineering → RAG → Fine-tuning
- Data quality: 50+ high-quality examples minimum
- OpenAI: Easiest, fastest, GPT-3.5-turbo quality
- Hugging Face: Full control, many model options
- Evaluation: Always compare base vs fine-tuned
- Cost-effective: Fine-tuning often cheaper at scale
- Deployment: API, self-hosted, or cloud
- Iteration: Start small, measure, improve
Use Cases
- Customer support: Company-specific responses
- Content creation: Consistent brand voice
- Code generation: Framework-specific patterns
- Legal/Medical: Domain-specific language
- Translation: Specialized terminology
- Summarization: Format-specific summaries
- Classification: Custom categories
- Data extraction: Structured output format (see the sketch below)
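For the data-extraction use case, a training example can pair free text with a JSON completion so the fine-tuned model learns to return structured output. The fields and product text below are illustrative; the sketch reuses the DataPreparator defined earlier.
# Sketch: structured-extraction training example (fields and text are illustrative)
extraction_example = preparator.create_training_example(
    prompt="The Acme X200 wireless mouse is on sale for $29.99 in our electronics section.",
    completion='{"name": "Acme X200 wireless mouse", "price": 29.99, "category": "electronics"}',
    system_message="Extract product details as JSON with keys: name, price, category."
)
print(json.dumps(extraction_example, indent=2))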