Project: Document Q&A with RAG
Retrieval-Augmented Generation (RAG) combines the generation abilities of LLMs with retrieval over your own documents, so answers are grounded in content the model never saw during training. Build a system that can answer questions based on your PDFs, text files, or internal knowledge base.
What You'll Build:
- Document ingestion pipeline (PDF, TXT, Markdown)
- Vector database for semantic search
- LLM-powered question answering
- Web interface for asking questions
🧠 How RAG Works
1. Document Processing
Load documents → Split into chunks → Create embeddings
2. Store in Vector DB
Store embeddings for semantic search
3. Query Time
User asks question → Find relevant chunks → Send to LLM with context
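Each stage is only a few operations. Here is a minimal, framework-free sketch of the query path using sentence-transformers (which appears in the install step below) and cosine similarity; the sample chunks and model name are placeholders, and the final prompt would go to whatever LLM you use:
from sentence_transformers import SentenceTransformer
import numpy as np
# 1. Document processing: in the real pipeline these come from the splitter
chunks = ["Chunk about pricing...", "Chunk about the API...", "Chunk about support..."]
# 2. Store: embed every chunk once (a vector DB just persists and indexes these)
model = SentenceTransformer("all-MiniLM-L6-v2")  # placeholder local model
chunk_vecs = model.encode(chunks, normalize_embeddings=True)
# 3. Query time: embed the question, rank chunks by cosine similarity
question = "How much does it cost?"
q_vec = model.encode([question], normalize_embeddings=True)[0]
top_ids = np.argsort(chunk_vecs @ q_vec)[::-1][:2]  # dot product = cosine (vectors are normalized)
# The winning chunks become the context pasted into the LLM prompt
context = "\n".join(chunks[i] for i in top_ids)
prompt = f"Answer using this context:\n{context}\n\nQuestion: {question}"
print(prompt)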
🔧 Implementation
Install Dependencies
pip install langchain openai chromadb pypdf sentence-transformers streamlit
Step 1: Load Documents
from langchain.document_loaders import PyPDFLoader, TextLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Load PDFs from directory
loader = DirectoryLoader("./documents/", glob="**/*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()
print(f"Loaded {len(documents)} documents")
# Split into chunks
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000, # Characters per chunk
chunk_overlap=200, # Overlap for context
length_function=len
)
chunks = text_splitter.split_documents(documents)
print(f"Created {len(chunks)} chunks")
Step 2: Create Embeddings & Vector Store
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
import os
os.environ["OPENAI_API_KEY"] = "your-key-here"  # better: export this in your shell instead of hardcoding it
# Create embeddings
embeddings = OpenAIEmbeddings()
# Create vector store
vectorstore = Chroma.from_documents(
documents=chunks,
embedding=embeddings,
persist_directory="./chroma_db" # Persists to disk
)
print("Vector store created!")
Step 3: Build Q&A Chain
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
# Initialize LLM
llm = ChatOpenAI(model="gpt-4", temperature=0)
# Create QA chain
qa_chain = RetrievalQA.from_chain_type(
llm=llm,
chain_type="stuff", # "stuff", "map_reduce", "refine", or "map_rerank"
retriever=vectorstore.as_retriever(
search_kwargs={"k": 3} # Return top 3 chunks
),
return_source_documents=True
)
# Ask questions!
def ask_question(question):
    result = qa_chain({"query": question})
    answer = result["result"]
    sources = result["source_documents"]
    print(f"\nQ: {question}")
    print(f"A: {answer}\n")
    print("Sources:")
    for i, doc in enumerate(sources):
        print(f"{i+1}. {doc.metadata['source']} (page {doc.metadata.get('page', 'N/A')})")
    return answer
# Example
ask_question("What is the main topic of the documents?")
ask_question("Summarize the key findings")
Step 4: Advanced RAG with Custom Prompts
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
# Custom prompt
template = """
Use the following context to answer the question at the end.
If you don't know the answer, say "I don't have enough information to answer that."
Don't make up answers.
Context:
{context}
Question: {question}
Detailed Answer:"""
PROMPT = PromptTemplate(
template=template,
input_variables=["context", "question"]
)
qa_chain = RetrievalQA.from_chain_type(
llm=llm,
retriever=vectorstore.as_retriever(),
chain_type_kwargs={"prompt": PROMPT},
return_source_documents=True
)
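Querying works exactly as before; the custom prompt only changes how the model is instructed. A question your documents can't answer should now trigger the refusal text instead of a guess:
result = qa_chain({"query": "What's the weather going to be tomorrow?"})
print(result["result"])  # should fall back to "I don't have enough information to answer that."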
💡 Advanced Features
Conversational RAG (with Memory)
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(
memory_key="chat_history",
return_messages=True,
output_key="answer"
)
conversational_chain = ConversationalRetrievalChain.from_llm(
llm=llm,
retriever=vectorstore.as_retriever(),
memory=memory,
return_source_documents=True
)
# Multi-turn conversation
conversational_chain({"question": "What products does the company offer?"})
conversational_chain({"question": "What are their prices?"}) # Remembers context!
conversational_chain({"question": "Which one is cheapest?"})
Metadata Filtering
# Add metadata when loading
from langchain.schema import Document
docs = [
Document(
page_content="...",
metadata={"source": "doc1.pdf", "category": "technical", "date": "2024-01-15"}
),
# ... more docs
]
# Filter during retrieval
retriever = vectorstore.as_retriever(
search_kwargs={
"k": 5,
"filter": {"category": "technical"} # Only search technical docs
}
)
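Chroma also accepts compound where clauses, so several metadata conditions can be combined. A sketch using the explicit operator form (this syntax is Chroma-specific; other vector stores expose filtering differently):
# Only technical docs that come from a specific file
retriever = vectorstore.as_retriever(
    search_kwargs={
        "k": 5,
        "filter": {
            "$and": [
                {"category": {"$eq": "technical"}},
                {"source": {"$eq": "doc1.pdf"}}
            ]
        }
    }
)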
🌐 Web Interface with Streamlit
# app.py
import streamlit as st
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
st.title("📚 Document Q&A System")
# Initialize (cache for performance)
@st.cache_resource
def load_qa_chain():
    embeddings = OpenAIEmbeddings()
    vectorstore = Chroma(
        persist_directory="./chroma_db",
        embedding_function=embeddings
    )
    llm = ChatOpenAI(model="gpt-4", temperature=0)
    return RetrievalQA.from_chain_type(
        llm=llm,
        retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
        return_source_documents=True
    )
qa_chain = load_qa_chain()
# Chat interface
if "messages" not in st.session_state:
st.session_state.messages = []
# Display chat history
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])
# User input
if prompt := st.chat_input("Ask a question about your documents"):
    # Add user message
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)
    # Get answer
    with st.chat_message("assistant"):
        with st.spinner("Thinking..."):
            result = qa_chain({"query": prompt})
            answer = result["result"]
            sources = result["source_documents"]
        st.markdown(answer)
        # Show sources
        with st.expander("📄 Sources"):
            for i, doc in enumerate(sources):
                st.write(f"**Source {i+1}:** {doc.metadata['source']}")
                st.write(f"*Content:* {doc.page_content[:200]}...")
    # Add to history
    st.session_state.messages.append({"role": "assistant", "content": answer})
# Run: streamlit run app.py
⚡ Optimization Tips
- Chunk Size: 500-1500 characters works best (test for your use case)
- Overlap: 10-20% of chunk size maintains context
- Top-k: 3-5 chunks balances context vs token cost
- Embeddings: OpenAI embeddings are strong but billed per token; local sentence-transformers models are free to run yourself
- Reranking: For better precision, rerank the retrieved chunks (e.g., with Cohere Rerank) before they reach the LLM (see the sketch below)
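The reranking tip can be wired in without touching the QA chain by wrapping the retriever. A sketch using LangChain's Cohere reranker (assumes the cohere package is installed and COHERE_API_KEY is set; any other reranker can slot in the same way):
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CohereRerank
# Over-fetch with vector search, then keep only the best few after reranking
reranker = CohereRerank(top_n=3)
rerank_retriever = ContextualCompressionRetriever(
    base_compressor=reranker,
    base_retriever=vectorstore.as_retriever(search_kwargs={"k": 10})
)
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=rerank_retriever,
    return_source_documents=True
)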
🎯 Key Takeaways
- RAG grounds LLM responses in your documents
- Vector search finds semantically relevant context
- Chunking strategy is critical for good results
- Source attribution builds trust in answers
- Combine with conversation memory for natural chat