- Backend: FastAPI + SQLAlchemy async, pgvector RAG, RQ workers, OpenAI gpt-5.4-mini structured output - Frontend: React 18 + Vite + TypeScript + TailwindCSS + shadcn/ui, job polling pattern (no WebSocket) - Admin panel: editable SystemPrompt with version history for FCA audit trail - Deploy: idempotent deploy.sh with hash-based cache, Apache Include fragment, alembic migrations - Docker: dev + prod compose configs, port 8010 (API) to avoid OliVAS conflict on host Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
80 lines
2.5 KiB
Python
80 lines
2.5 KiB
Python
#!/usr/bin/env python3
|
|
"""Ingest RAG corpus documents into the database.
|
|
|
|
Run once on deploy (or after updating documents):
|
|
docker compose exec api python scripts/ingest_rag.py
|
|
|
|
Reads .docx files from rag-corpus/, chunks text, embeds via OpenAI text-embedding-3-small.
|
|
"""
|
|
import asyncio
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Put the parent of this script's directory at the front of sys.path.
# NOTE(review): presumably this is what lets the `app.*` imports inside main()
# resolve when the script is run directly — confirm against the repo layout.
sys.path.insert(0, str(Path(__file__).parent.parent))

# Directory scanned for .docx files: "rag-corpus" under the grandparent of this
# script's directory.
RAG_CORPUS = Path(__file__).parent.parent.parent / "rag-corpus"
CHUNK_SIZE = 800  # characters per chunk (default for chunk_text)
CHUNK_OVERLAP = 100  # characters shared by consecutive chunks (default for chunk_text)
|
|
|
|
|
|
def extract_text_from_docx(path: Path) -> str:
    """Return the text of the .docx file at *path*, one paragraph per line.

    Paragraphs whose text is empty or whitespace-only are skipped; the
    remaining paragraph texts are joined with newlines in document order.
    """
    # Imported lazily so the module can be loaded without python-docx present.
    from docx import Document

    document = Document(str(path))
    kept = []
    for paragraph in document.paragraphs:
        content = paragraph.text
        if content.strip():
            kept.append(content)
    return "\n".join(kept)
|
|
|
|
|
|
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> list[str]:
    """Split *text* into overlapping fixed-size character windows.

    Every chunk holds at most ``chunk_size`` characters. Each chunk after the
    first starts ``overlap`` characters before the end of the previous one.
    Chunking stops as soon as a window reaches the end of the text, so the
    final chunk is never just a repeat of the previous chunk's tail.

    Args:
        text: The text to split. An empty string yields an empty list.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        The chunks in order of their position in *text*.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    step = chunk_size - overlap
    text_len = len(text)
    chunks = []
    start = 0
    while start < text_len:
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= text_len:
            # This window already covers the end of the text; stepping back by
            # `overlap` would only re-emit characters the chunk above contains.
            break
        start += step
    return chunks
|
|
|
|
|
|
async def main():
    """Rebuild the stored RAG chunks from the .docx files in ``RAG_CORPUS``.

    Extracts and chunks every document, embeds the chunk texts in batches via
    ``embed``, then deletes all existing ``RagChunk`` rows and adds the new
    ones on the same session, followed by a single ``commit()``.

    Exits the process with status 1 when no .docx files are found, or when the
    documents yield no chunks at all, so existing rows are never deleted
    without anything to replace them.
    """
    # Project and SQLAlchemy imports are resolved at call time, not at module
    # import time.
    from app.database import AsyncSessionLocal
    from app.llm.openai_client import embed
    from app.models.rag import RagChunk
    from sqlalchemy import delete

    # glob() order depends on the filesystem; sort for a reproducible run.
    docs = sorted(RAG_CORPUS.glob("*.docx"))
    if not docs:
        print(f"ERROR: no .docx files found in {RAG_CORPUS}")
        sys.exit(1)

    all_chunks = []
    for doc_path in docs:
        print(f"Extracting: {doc_path.name}")
        text = extract_text_from_docx(doc_path)
        all_chunks.extend(
            {"source_doc": doc_path.name, "chunk_text": chunk, "chunk_index": i}
            for i, chunk in enumerate(chunk_text(text))
        )

    if not all_chunks:
        # Bail out before touching the database: continuing would delete every
        # stored chunk and commit an empty corpus.
        print(f"ERROR: no text extracted from the .docx files in {RAG_CORPUS}")
        sys.exit(1)

    print(f"Total chunks: {len(all_chunks)} — computing embeddings...")

    # Batch in groups of 100 to avoid rate limits
    batch_size = 100
    embeddings = []
    for i in range(0, len(all_chunks), batch_size):
        batch = all_chunks[i : i + batch_size]
        embs = await embed([c["chunk_text"] for c in batch])
        embeddings.extend(embs)
        print(f" Embedded {min(i + batch_size, len(all_chunks))}/{len(all_chunks)}")

    async with AsyncSessionLocal() as db:
        # Replace the whole corpus: the delete and the inserts are issued on
        # the same session and committed together below.
        await db.execute(delete(RagChunk))
        db.add_all(
            RagChunk(
                source_doc=chunk["source_doc"],
                chunk_text=chunk["chunk_text"],
                chunk_index=chunk["chunk_index"],
                embedding=emb,
            )
            # strict=True fails loudly if embed() returned a different number
            # of vectors than chunks were sent.
            for chunk, emb in zip(all_chunks, embeddings, strict=True)
        )
        await db.commit()

    print(f"Ingested {len(all_chunks)} RAG chunks.")
|
|
|
|
|
|
# Script entry point: run the async ingestion only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())
|