Project: Build a Knowledge Base
Capstone Project Overview
In this final lesson, you will build a complete knowledge base system from scratch. This project brings together everything you have learned: document processing, chunking, embeddings, vector storage, retrieval, prompt engineering, and quality measurement. By the end, you will have a working system that ingests documents and answers questions about them with source citations.
What You Will Build
- A document ingestion pipeline that handles PDFs and web pages
- A ChromaDB-backed vector store with metadata
- A retrieval pipeline with MMR-based diverse search and metadata filtering (re-ranking is left as an extension)
- A chat interface that generates grounded answers
- A quality check suite to verify your system works correctly
Step 1: Project Setup
Create the project structure and install dependencies:
mkdir knowledge-base && cd knowledge-base
python -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
pip install langchain langchain-openai langchain-community chromadb
pip install pypdf beautifulsoup4 requests ragas
Create the project structure:
mkdir -p src data/pdfs data/web
touch src/__init__.py src/ingest.py src/retriever.py src/chain.py src/app.py
Set up your environment variables:
export OPENAI_API_KEY="your-api-key-here"
Step 2: Document Ingestion Pipeline
Create src/ingest.py -- the pipeline that loads, chunks, and stores documents:
"""Document ingestion pipeline for the knowledge base."""
from pathlib import Path
from langchain_community.document_loaders import (
PyPDFLoader,
WebBaseLoader,
DirectoryLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from datetime import datetime
# Configuration
# Directory handed to Chroma as persist_directory when the store is created.
CHROMA_DIR = "./chroma_db"
# Passed to the text splitter as chunk_size and chunk_overlap respectively.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
# Model name given to OpenAIEmbeddings when embedding chunks.
# NOTE(review): src/retriever.py declares its own CHROMA_DIR and
# EMBEDDING_MODEL; the two copies presumably must stay identical -- confirm.
EMBEDDING_MODEL = "text-embedding-3-small"
def load_pdfs(pdf_dir: str) -> list:
    """Load every PDF found beneath *pdf_dir* and tag the resulting documents.

    Files are matched with the recursive ``**/*.pdf`` pattern and parsed with
    ``PyPDFLoader``. Each returned document gets ``source_type="pdf"`` and an
    ``indexed_at`` ISO-format timestamp taken when it is tagged.
    """
    pages = DirectoryLoader(
        pdf_dir, glob="**/*.pdf", loader_cls=PyPDFLoader
    ).load()
    for page in pages:
        page.metadata.update(
            source_type="pdf",
            indexed_at=datetime.now().isoformat(),
        )
    print(f"Loaded {len(pages)} pages from PDFs")
    return pages
def load_web_pages(urls: list[str]) -> list:
    """Fetch the given URLs with ``WebBaseLoader`` and tag the documents.

    Each returned document gets ``source_type="web"`` and an ``indexed_at``
    ISO-format timestamp, and its text is whitespace-normalized.
    """
    pages = WebBaseLoader(web_paths=urls).load()
    for page in pages:
        page.metadata.update(
            source_type="web",
            indexed_at=datetime.now().isoformat(),
        )
        # split() with no argument breaks on any whitespace run, so joining
        # with a single space collapses newlines, tabs and repeated spaces.
        page.page_content = " ".join(page.page_content.split())
    print(f"Loaded {len(pages)} web pages")
    return pages
def chunk_documents(docs: list) -> list:
    """Split *docs* into smaller chunks ready for embedding.

    Uses ``RecursiveCharacterTextSplitter`` configured from ``CHUNK_SIZE`` and
    ``CHUNK_OVERLAP``, trying separators from paragraph breaks down to single
    characters in the order listed.
    """
    splitter_options = {
        "chunk_size": CHUNK_SIZE,
        "chunk_overlap": CHUNK_OVERLAP,
        "separators": ["\n\n", "\n", ". ", " ", ""],
    }
    pieces = RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)
    print(f"Split into {len(pieces)} chunks")
    return pieces
def create_vectorstore(chunks: list) -> Chroma:
    """Embed *chunks* with OpenAI embeddings and store them in ChromaDB.

    The collection is persisted under ``CHROMA_DIR`` and created with the
    ``hnsw:space`` metadata set to ``"cosine"``.

    NOTE(review): running this again against an existing ``CHROMA_DIR``
    presumably appends to the stored collection rather than replacing it, so
    repeated ingests may duplicate chunks -- confirm before re-ingesting.
    """
    store = Chroma.from_documents(
        documents=chunks,
        embedding=OpenAIEmbeddings(model=EMBEDDING_MODEL),
        persist_directory=CHROMA_DIR,
        collection_metadata={"hnsw:space": "cosine"},
    )
    print(f"Stored {len(chunks)} chunks in ChromaDB")
    return store
def ingest(pdf_dir: str = "./data/pdfs", urls: list[str] = None) -> Chroma:
    """Run the full ingestion pipeline: load, chunk, embed, and store.

    Args:
        pdf_dir: Directory searched recursively for ``*.pdf`` files. It is
            skipped when it does not exist or contains no PDFs.
        urls: Optional web page URLs to load. ``None`` or an empty list means
            no web pages are loaded.

    Returns:
        The populated vector store, or ``None`` when neither PDFs nor URLs
        produced any documents (a hint is printed in that case).
    """
    all_docs = []

    # Load PDFs if the directory exists and has files. any() stops at the
    # first match instead of materializing the whole recursive listing.
    pdf_path = Path(pdf_dir)
    if pdf_path.exists() and any(pdf_path.glob("**/*.pdf")):
        all_docs.extend(load_pdfs(pdf_dir))

    # Load web pages if URLs were provided.
    if urls:
        all_docs.extend(load_web_pages(urls))

    if not all_docs:
        print("No documents found. Add PDFs to ./data/pdfs or provide URLs.")
        return None

    # Chunk and store.
    chunks = chunk_documents(all_docs)
    return create_vectorstore(chunks)
if __name__ == "__main__":
    # Example: ingest PDFs and some web pages.
    # Only urls is passed, so PDFs are read from ingest()'s default pdf_dir
    # ("./data/pdfs").
    urls = [
        "https://docs.python.org/3/tutorial/index.html",
        "https://docs.python.org/3/tutorial/introduction.html",
    ]
    ingest(urls=urls)
Step 3: Retrieval Pipeline
Create src/retriever.py -- the retrieval layer with MMR search, scored lookups, and metadata filtering:
"""Retrieval pipeline: MMR retrieval, scored search, and metadata filtering."""
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
# Location of the persisted Chroma collection and the embedding model used to
# query it.
# NOTE(review): src/ingest.py declares the same two constants; they presumably
# must match the values used at ingestion time -- confirm.
CHROMA_DIR = "./chroma_db"
EMBEDDING_MODEL = "text-embedding-3-small"
def get_vectorstore() -> Chroma:
    """Open the Chroma store persisted under ``CHROMA_DIR``.

    Queries are embedded with ``OpenAIEmbeddings`` using ``EMBEDDING_MODEL``.
    """
    query_embedder = OpenAIEmbeddings(model=EMBEDDING_MODEL)
    store = Chroma(
        persist_directory=CHROMA_DIR,
        embedding_function=query_embedder,
    )
    return store
def create_retriever(vectorstore: Chroma, k: int = 5):
    """Create an MMR retriever that trades some relevance for diversity.

    Args:
        vectorstore: Store to wrap; only its ``as_retriever`` method is used.
        k: Number of documents the retriever should return.

    Returns:
        Whatever ``vectorstore.as_retriever`` returns for an ``"mmr"`` search
        with ``lambda_mult=0.7``.
    """
    # MMR picks k results out of fetch_k candidates. A fixed pool of 20 would
    # silently cap the result count whenever k > 20, so grow the pool with k.
    fetch_k = max(20, k)
    return vectorstore.as_retriever(
        search_type="mmr",
        search_kwargs={
            "k": k,
            "fetch_k": fetch_k,
            "lambda_mult": 0.7,
        },
    )
def retrieve_with_scores(vectorstore: Chroma, query: str, k: int = 5):
    """Return the top *k* matches for *query* with their scores, for debugging.

    The result is passed through unchanged from
    ``vectorstore.similarity_search_with_score``.

    NOTE(review): for Chroma the score is presumably a distance (lower means
    closer) rather than a similarity -- confirm before interpreting values.
    """
    return vectorstore.similarity_search_with_score(query, k=k)
def filtered_retrieve(
    vectorstore: Chroma, query: str, source_type: str = None, k: int = 5
):
    """Run an MMR retrieval, optionally restricted to one ``source_type``.

    Args:
        vectorstore: Store to query; only its ``as_retriever`` method is used.
        query: Text to search for.
        source_type: When truthy, only documents whose ``source_type``
            metadata equals this value are considered (for example ``"pdf"``
            or ``"web"``).
        k: Number of documents to return.

    Returns:
        The result of invoking the retriever with *query*.
    """
    # Without an explicit fetch_k the MMR candidate pool stays at the library
    # default of 20, which would cap the result count for k > 20.
    search_kwargs = {"k": k, "fetch_k": max(20, k)}
    if source_type:
        search_kwargs["filter"] = {"source_type": source_type}
    retriever = vectorstore.as_retriever(
        search_type="mmr", search_kwargs=search_kwargs
    )
    return retriever.invoke(query)
Step 4: RAG Chain With Source Citations
Create src/chain.py -- the generation layer:
"""RAG chain with source citations."""
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
LLM_MODEL = "gpt-4o"
def format_docs_with_sources(docs) -> str:
    """Format retrieved documents as numbered, source-labelled context blocks.

    Each document becomes a ``[Source N: <source>[, page <page>] (<type>)]``
    header followed by its text; blocks are separated by a ``---`` rule.

    Args:
        docs: Iterable of objects exposing ``metadata`` (a mapping) and
            ``page_content`` (a string).

    Returns:
        The joined context string, or ``""`` when *docs* is empty.
    """
    formatted = []
    for number, doc in enumerate(docs, start=1):
        source = doc.metadata.get("source", "Unknown")
        source_type = doc.metadata.get("source_type", "unknown")
        page = doc.metadata.get("page")
        # Compare against "missing" explicitly: a plain truthiness test would
        # drop page 0, which is a legitimate page value.
        page_info = f", page {page}" if page not in (None, "") else ""
        formatted.append(
            f"[Source {number}: {source}{page_info} ({source_type})]\n"
            f"{doc.page_content}"
        )
    return "\n\n---\n\n".join(formatted)
def create_rag_chain(retriever):
    """Create the RAG chain that returns an answer together with its sources.

    The retriever runs exactly once per question. The documents it returns
    are both formatted into the prompt context and reported as ``sources``,
    so the reported sources are always the ones the answer was generated from.

    Args:
        retriever: Runnable that maps a question string to a list of documents.

    Returns:
        A runnable that takes a question string and yields a dict with
        ``answer`` (str) and ``sources`` (the retrieved documents). The dict
        also carries the original ``question`` as a convenience.
    """
    prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a knowledgeable assistant that answers questions
using the provided context. Follow these rules:
1. Answer ONLY based on the context provided below.
2. If the context does not contain the answer, say:
"I don't have enough information to answer this question."
3. Cite your sources using [Source N] references.
4. Be concise but thorough.
5. If sources provide different information, mention both perspectives.
Context:
{context}"""),
        ("human", "{question}"),
    ])
    llm = ChatOpenAI(model=LLM_MODEL, temperature=0)

    # Generates the answer from an already-retrieved {"sources", "question"}
    # state instead of calling the retriever a second time.
    answer_chain = (
        {
            "context": lambda state: format_docs_with_sources(state["sources"]),
            "question": lambda state: state["question"],
        }
        | prompt
        | llm
        | StrOutputParser()
    )

    # Step 1 retrieves once; step 2 adds the answer to that same state.
    chain = RunnableParallel(
        sources=retriever,
        question=RunnablePassthrough(),
    ) | RunnablePassthrough.assign(answer=answer_chain)
    return chain
def create_simple_chain(retriever):
    """Build a chain that maps a question string to an answer string only.

    Retrieved documents are formatted with ``format_docs_with_sources`` and
    placed in the system prompt; the model output is parsed to plain text.
    """
    messages = [
        ("system", """Answer based on the context below. Cite sources as
[Source: filename]. If unsure, say so.
Context:
{context}"""),
        ("human", "{question}"),
    ]
    prompt = ChatPromptTemplate.from_messages(messages)
    llm = ChatOpenAI(model=LLM_MODEL, temperature=0)
    # The incoming question feeds both the retriever and the prompt.
    prompt_inputs = {
        "context": retriever | format_docs_with_sources,
        "question": RunnablePassthrough(),
    }
    return prompt_inputs | prompt | llm | StrOutputParser()
Step 5: Chat Interface
Create src/app.py -- a simple interactive interface:
"""Interactive chat interface for the knowledge base."""
from src.retriever import get_vectorstore, create_retriever, retrieve_with_scores
from src.chain import create_rag_chain
def run_chat():
    """Run an interactive chat session on the terminal.

    Reads questions from stdin until the user types ``quit`` or ends input
    with Ctrl-D / Ctrl-C. Typing ``debug`` toggles debug mode, which prints
    the retrieved chunks and the raw scores of the top three matches after
    each answer.
    """
    print("Knowledge Base Chat")
    print("=" * 50)
    print("Type 'quit' to exit, 'debug' to toggle debug mode")
    print()

    vectorstore = get_vectorstore()
    retriever = create_retriever(vectorstore, k=5)
    chain = create_rag_chain(retriever)
    debug_mode = False

    while True:
        try:
            question = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or an interrupt at the prompt is a normal way to
            # leave a terminal chat; exit cleanly instead of with a traceback.
            print("\nGoodbye!")
            break

        if not question:
            continue
        if question.lower() == "quit":
            print("Goodbye!")
            break
        if question.lower() == "debug":
            debug_mode = not debug_mode
            print(f"Debug mode: {'ON' if debug_mode else 'OFF'}")
            continue

        # Get answer with sources
        result = chain.invoke(question)
        print(f"\nAssistant: {result['answer']}")

        if debug_mode:
            print("\n--- Debug Info ---")
            print(f"Retrieved {len(result['sources'])} chunks:")
            for i, doc in enumerate(result["sources"]):
                source = doc.metadata.get("source", "Unknown")
                print(f" [{i+1}] {source}")
                print(f" {doc.page_content[:100]}...")
            # Chroma reports distances here, not similarities, so the label
            # states the direction explicitly.
            scored = retrieve_with_scores(vectorstore, question, k=3)
            print("\nDistance scores (lower = more similar):")
            for doc, score in scored:
                print(f" Score: {score:.4f} | {doc.metadata.get('source', '?')}")
            print("--- End Debug ---")
        print()
# Start the chat loop only when this module is executed as a script.
if __name__ == "__main__":
    run_chat()
Step 6: Quality Checks
Create src/check_quality.py -- automated checks to verify your system:
"""Quality checks for the knowledge base."""
import json
from datetime import datetime
from src.retriever import get_vectorstore, create_retriever
from src.chain import create_simple_chain
def run_quality_checks(
    test_questions: list[dict], pass_threshold: float = 0.5
) -> dict:
    """Run keyword-coverage checks against a set of test questions.

    Each question is answered by the simple RAG chain, and the answer is
    scored by the fraction of expected keywords it contains
    (case-insensitive substring match). A question with no expected keywords
    scores 1.0.

    Args:
        test_questions: Dicts with a ``"question"`` string and an optional
            ``"expected_keywords"`` list of strings.
        pass_threshold: Minimum keyword coverage for a question to pass.

    Returns:
        A summary dict with a timestamp, pass/fail counts, the average
        coverage, and the per-question results. The same summary is written
        to ``quality_results.json`` in the current directory.
    """
    vectorstore = get_vectorstore()
    retriever = create_retriever(vectorstore, k=5)
    chain = create_simple_chain(retriever)

    results = []
    for item in test_questions:
        question = item["question"]
        expected_keywords = item.get("expected_keywords", [])

        # Get answer
        answer = chain.invoke(question)

        # Partition the expected keywords in one pass, lower-casing the
        # answer only once.
        answer_lower = answer.lower()
        found_keywords = []
        missing_keywords = []
        for kw in expected_keywords:
            if kw.lower() in answer_lower:
                found_keywords.append(kw)
            else:
                missing_keywords.append(kw)

        keyword_coverage = (
            len(found_keywords) / len(expected_keywords)
            if expected_keywords
            else 1.0
        )
        results.append({
            "question": question,
            "answer": answer,
            "keyword_coverage": keyword_coverage,
            "found_keywords": found_keywords,
            "missing_keywords": missing_keywords,
        })

        status = "PASS" if keyword_coverage >= pass_threshold else "FAIL"
        print(f"[{status}] {question}")
        print(f" Coverage: {keyword_coverage:.0%}")
        if keyword_coverage < 1.0:
            print(f" Missing: {missing_keywords}")
        print()

    # Summary
    pass_count = sum(
        1 for r in results if r["keyword_coverage"] >= pass_threshold
    )
    total = len(results)
    # An empty question list would otherwise divide by zero.
    avg_coverage = (
        sum(r["keyword_coverage"] for r in results) / total if total else 0.0
    )
    summary = {
        "timestamp": datetime.now().isoformat(),
        "total_questions": total,
        "passed": pass_count,
        "failed": total - pass_count,
        "average_keyword_coverage": round(avg_coverage, 2),
        "results": results,
    }
    print("=" * 50)
    print(f"Results: {pass_count}/{total} passed ({avg_coverage:.0%} avg coverage)")

    # Save results
    with open("quality_results.json", "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)
    return summary
# Example test questions - customize for your knowledge base.
# Each entry has a "question" and the "expected_keywords" its answer should
# contain; matching in run_quality_checks is a case-insensitive substring test.
TEST_QUESTIONS = [
    {
        "question": "What topics does the knowledge base cover?",
        "expected_keywords": ["python", "tutorial"],
    },
    {
        "question": "How do you define variables in Python?",
        "expected_keywords": ["variable", "assignment", "="],
    },
    {
        "question": "What is a completely unrelated topic like quantum physics?",
        # NOTE: an empty keyword list is scored as full coverage, so this
        # entry always passes; the refusal itself is not verified.
        "expected_keywords": [],  # Should say "I don't know"
    },
]
# Run the example checks only when this module is executed as a script.
if __name__ == "__main__":
    run_quality_checks(TEST_QUESTIONS)
Step 7: Putting It All Together
Create a main.py entry point:
"""Main entry point for the knowledge base system."""
import sys
from src.ingest import ingest
from src.app import run_chat
from src.check_quality import run_quality_checks, TEST_QUESTIONS
def main():
    """Dispatch the command-line sub-command: ``ingest``, ``chat`` or ``check``.

    With no sub-command the usage text is printed. For ``ingest``, everything
    after a ``--urls`` flag is treated as a URL to load.
    """
    cli_args = sys.argv[1:]
    if not cli_args:
        usage_lines = (
            "Usage:",
            " python main.py ingest - Ingest documents",
            " python main.py chat - Start chat interface",
            " python main.py check - Run quality checks",
            " python main.py ingest --urls URL1 URL2",
        )
        for line in usage_lines:
            print(line)
        return

    command = cli_args[0]
    if command == "ingest":
        if "--urls" in sys.argv:
            flag_pos = sys.argv.index("--urls")
            url_args = sys.argv[flag_pos + 1:]
        else:
            url_args = []
        # An empty URL list is passed on as None.
        ingest(urls=url_args or None)
        return
    if command == "chat":
        run_chat()
        return
    if command == "check":
        run_quality_checks(TEST_QUESTIONS)
        return
    print(f"Unknown command: {command}")
# Run the command-line dispatcher only when executed as a script.
if __name__ == "__main__":
    main()
Running the System
# Step 1: Ingest some documents
python main.py ingest --urls https://docs.python.org/3/tutorial/introduction.html
# Step 2: Start chatting
python main.py chat
# Step 3: Run quality checks
python main.py check
Extending the Project
Now that you have a working base, here are ways to extend it:
- Add more document types. Integrate Markdown loaders, code file loaders, or CSV parsers.
- Implement a web UI. Use Streamlit or Gradio to build a browser-based interface with streaming responses.
- Add re-ranking. Integrate Cohere Rerank or a cross-encoder to improve retrieval precision.
- Implement access control. Add user roles and filter documents based on permissions.
- Set up monitoring. Log every query, its latency, the retrieved documents, and user feedback.
- Build an API. Wrap the chain in a FastAPI endpoint for integration with other systems.
Course Recap
Over twelve lessons, you have learned:
- Why RAG matters and what problems it solves
- How embeddings encode meaning into searchable vectors
- How vector databases store and retrieve those vectors at scale
- How to process documents from PDFs, HTML, code, and more
- How chunking strategies affect retrieval quality
- How advanced retrieval techniques like hybrid search and re-ranking improve results
- How to build a complete RAG pipeline with LangChain
- How advanced patterns like HyDE, self-RAG, and agentic RAG push beyond basic retrieval
- How to build code-specific RAG systems
- How to measure your system with real metrics
- How to harden everything for production
- How to build a complete knowledge base from scratch
RAG is one of the most practical and impactful skills in AI engineering today. The systems you build with these techniques will connect powerful language models to real-world data, producing answers that are accurate, current, and verifiable. Go build something useful.