Barclays-banner-builder/backend/scripts/ingest_rag.py
Vadym Samoilenko 735b2ef141 Add full Sprint 0+1 implementation: Docker, FastAPI, React, RAG, deploy
- Backend: FastAPI + SQLAlchemy async, pgvector RAG, RQ workers, OpenAI gpt-5.4-mini structured output
- Frontend: React 18 + Vite + TypeScript + TailwindCSS + shadcn/ui, job polling pattern (no WebSocket)
- Admin panel: editable SystemPrompt with version history for FCA audit trail
- Deploy: idempotent deploy.sh with hash-based cache, Apache Include fragment, alembic migrations
- Docker: dev + prod compose configs, port 8010 (API) to avoid OliVAS conflict on host

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-17 12:01:42 +01:00

80 lines
2.5 KiB
Python

#!/usr/bin/env python3
"""Ingest RAG corpus documents into the database.
Run once on deploy (or after updating documents):
docker compose exec api python scripts/ingest_rag.py
Reads .docx files from rag-corpus/, chunks text, embeds via OpenAI text-embedding-3-small.
"""
import asyncio
import sys
from pathlib import Path
# Prepend the directory two levels above this file to sys.path; main() imports
# the `app` package, which is presumably located there (verify against the
# repository layout).
sys.path.insert(0, str(Path(__file__).parent.parent))
# Directory scanned for .docx source documents: `rag-corpus`, a sibling of the
# directory that was just added to sys.path.
RAG_CORPUS = Path(__file__).parent.parent.parent / "rag-corpus"
# Default length of each chunk produced by chunk_text().
CHUNK_SIZE = 800 # characters
# Default number of characters that consecutive chunks share.
CHUNK_OVERLAP = 100
def extract_text_from_docx(path: Path) -> str:
    """Return the text of a .docx file, one non-blank paragraph per line.

    Args:
        path: Location of the .docx document to read.

    Returns:
        The text of every paragraph in ``Document.paragraphs`` whose text is
        not empty or whitespace-only, joined with ``"\\n"``. Paragraph text is
        kept as-is (not stripped).
    """
    # Imported lazily so the module can be loaded without python-docx present.
    from docx import Document

    document = Document(str(path))
    kept_lines = []
    for paragraph in document.paragraphs:
        paragraph_text = paragraph.text
        # Skip paragraphs that contain nothing but whitespace.
        if paragraph_text.strip():
            kept_lines.append(paragraph_text)
    return "\n".join(kept_lines)
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> list[str]:
    """Split *text* into fixed-size character windows that overlap.

    Each chunk starts ``chunk_size - overlap`` characters after the previous
    one, so consecutive chunks share ``overlap`` characters. The last chunk
    may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must be
            smaller than ``chunk_size`` so that every step moves forward.

    Returns:
        The chunks in document order; an empty list for empty *text*.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (either would prevent the window from
            advancing and loop forever).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    step = chunk_size - overlap
    text_length = len(text)
    chunks = []
    start = 0
    while start < text_length:
        end = start + chunk_size
        chunks.append(text[start:end])
        # Once a chunk reaches the end of the text, stop: stepping back by
        # `overlap` would only emit a suffix of the chunk just appended.
        if end >= text_length:
            break
        start += step
    return chunks
async def main():
    """Rebuild the RagChunk table from the .docx files in RAG_CORPUS.

    Steps:
        1. Collect every ``*.docx`` file directly inside RAG_CORPUS.
        2. Extract each document's text and split it into chunks.
        3. Compute embeddings for all chunks in batches via ``embed``.
        4. In one session, delete all existing RagChunk rows, add the new
           ones and commit.

    Embeddings are computed before the database is touched, so a failure
    while embedding leaves the existing rows in place.

    Raises:
        SystemExit: With status 1 when no .docx files are found, or when the
            documents yield no chunks (so the existing rows are never deleted
            in favour of an empty corpus).
        ValueError: If the number of embeddings returned differs from the
            number of chunks (``zip(..., strict=True)``).
    """
    # Project imports are deferred until call time, after the module-level
    # sys.path adjustment has run.
    from app.database import AsyncSessionLocal
    from app.llm.openai_client import embed
    from app.models.rag import RagChunk
    from sqlalchemy import delete

    # glob() order depends on the filesystem; sort for reproducible runs.
    docs = sorted(RAG_CORPUS.glob("*.docx"))
    if not docs:
        print(f"ERROR: no .docx files found in {RAG_CORPUS}")
        sys.exit(1)

    all_chunks = []
    for doc_path in docs:
        print(f"Extracting: {doc_path.name}")
        text = extract_text_from_docx(doc_path)
        # chunk_index restarts at 0 for every source document.
        all_chunks.extend(
            {"source_doc": doc_path.name, "chunk_text": chunk, "chunk_index": i}
            for i, chunk in enumerate(chunk_text(text))
        )

    # Guard: without this, documents containing no text would cause the
    # existing rows to be deleted and replaced with nothing.
    if not all_chunks:
        print(f"ERROR: no text extracted from .docx files in {RAG_CORPUS}")
        sys.exit(1)

    print(f"Total chunks: {len(all_chunks)} — computing embeddings...")

    # Batch in groups of 100 to avoid rate limits
    batch_size = 100
    embeddings = []
    for i in range(0, len(all_chunks), batch_size):
        batch = all_chunks[i : i + batch_size]
        embs = await embed([c["chunk_text"] for c in batch])
        embeddings.extend(embs)
        print(f" Embedded {min(i + batch_size, len(all_chunks))}/{len(all_chunks)}")

    async with AsyncSessionLocal() as db:
        # Delete and re-insert happen in the same session and are committed
        # together at the end.
        await db.execute(delete(RagChunk))
        for chunk, emb in zip(all_chunks, embeddings, strict=True):
            db.add(RagChunk(
                source_doc=chunk["source_doc"],
                chunk_text=chunk["chunk_text"],
                chunk_index=chunk["chunk_index"],
                embedding=emb,
            ))
        await db.commit()

    print(f"Ingested {len(all_chunks)} RAG chunks.")
# Run the ingestion only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    ingest_coro = main()
    asyncio.run(ingest_coro)