Oliver-ai-bot_2.0/backend/scripts/ingest_local_file.py
Vadym Samoilenko 44a512c41f Phase 1 Complete: Dual-bot architecture, knowledge base, access control
- Remove notebook mode, add RAG + Personal Assistant dual-bot setup
- Add knowledge base management (upload, URL scraping, document processing)
- Add user feature access control (allowed_features, features_override)
- Update admin dashboard with knowledge base tab
- Redesign login page, sidebar, and profile
- Add Celery tasks for async document processing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-04 21:26:40 +00:00

351 lines
9.9 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Manual Document Ingestion Script for Testing RAG

This script lets you manually upload a local document (PDF, DOCX, PPTX, XLSX,
CSV, or TXT) to Qdrant so the chat/RAG logic can be tested without needing
SharePoint integration.

Usage:
    python scripts/ingest_local_file.py <file_path> [options]

Example:
    python scripts/ingest_local_file.py documents/company_policy.pdf --title "Company Policy" --department-id <uuid>
"""
import asyncio
import argparse
import sys
import os
from pathlib import Path
from uuid import uuid4
from datetime import datetime
# Add parent directory to path to import app modules
sys.path.insert(0, str(Path(__file__).parent.parent))
from markitdown import MarkItDown
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, Distance, VectorParams
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
def extract_text_from_file(file_path: str) -> str:
    """
    Convert a file with MarkItDown and return its text content.

    Args:
        file_path: Path of the file to convert

    Returns:
        The ``text_content`` of the conversion result, or an empty string
        when that content is missing/empty

    Raises:
        Exception: Any error from the conversion is reported on stdout and
            then re-raised unchanged
    """
    print(f"📄 Extracting text from: {file_path}")
    try:
        converted = MarkItDown().convert(file_path)
        # Normalise a missing/empty result to "" so callers always get a str.
        markdown = converted.text_content or ""
        char_count = len(markdown)
        print(f" ✓ Total characters extracted: {char_count}")
    except Exception as exc:
        print(f" ✗ Error extracting text: {exc}")
        raise
    return markdown
def chunk_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> list[str]:
    """
    Break text into overlapping pieces with RecursiveCharacterTextSplitter.

    Args:
        text: Text to split
        chunk_size: Maximum length of a chunk, measured with ``len``
        chunk_overlap: Overlap between neighbouring chunks

    Returns:
        The chunks produced by the splitter, in order
    """
    print(f"📝 Chunking text (size={chunk_size}, overlap={chunk_overlap})")
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        # Separators are tried in this order: paragraph, line, sentence,
        # word, and finally the empty string.
        "separators": ["\n\n", "\n", ". ", " ", ""],
    }
    pieces = RecursiveCharacterTextSplitter(**splitter_options).split_text(text)
    print(f" ✓ Created {len(pieces)} chunks")
    return pieces
async def generate_embeddings(
    chunks: list[str],
    api_key: str,
    *,
    model: str = "text-embedding-3-large",
    dimensions: int = 3072,
    batch_size: int = 100,
) -> list[list[float]]:
    """
    Generate embeddings for text chunks

    Chunks are sent to the embeddings model in consecutive batches that are
    awaited one after another, and the resulting vectors are returned in the
    same order as the input chunks.

    Args:
        chunks: List of text chunks
        api_key: OpenAI API key
        model: Name of the embedding model to use
        dimensions: Size of each embedding vector requested from the model
        batch_size: Maximum number of chunks sent per request

    Returns:
        List of embedding vectors (empty when ``chunks`` is empty)

    Raises:
        ValueError: If ``batch_size`` is smaller than 1
    """
    # A zero step would make range() raise and a negative one would silently
    # yield no batches, so reject both up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    print(f"🔢 Generating embeddings for {len(chunks)} chunks")
    if not chunks:
        # Nothing to embed: skip constructing the client altogether.
        print(" ✓ Generated 0 embeddings")
        return []

    embeddings_model = OpenAIEmbeddings(
        model=model,
        openai_api_key=api_key,
        dimensions=dimensions,
    )

    # Process in batches to avoid rate limits
    total_batches = (len(chunks) - 1) // batch_size + 1
    all_embeddings = []
    for batch_number, start in enumerate(range(0, len(chunks), batch_size), start=1):
        batch = chunks[start:start + batch_size]
        all_embeddings.extend(await embeddings_model.aembed_documents(batch))
        print(f" ✓ Processed batch {batch_number}/{total_batches}")

    print(f" ✓ Generated {len(all_embeddings)} embeddings")
    return all_embeddings
def ensure_collection(client: QdrantClient, collection_name: str):
    """
    Create the Qdrant collection and its payload indexes if it is missing.

    When a collection with this name is already listed by the server nothing
    is created or modified.

    Args:
        client: Qdrant client
        collection_name: Name of collection

    Raises:
        Exception: Any client error is reported on stdout and re-raised
    """
    print(f"🗄️ Checking Qdrant collection: {collection_name}")

    # (payload field, index schema) pairs created for a new collection.
    payload_indexes = (
        ("department_id", "keyword"),
        ("region_code", "keyword"),
        ("is_active", "bool"),
    )

    try:
        existing_names = [entry.name for entry in client.get_collections().collections]
        if collection_name in existing_names:
            print(" ✓ Collection exists")
            return

        print(f" Creating collection: {collection_name}")
        vector_settings = VectorParams(size=3072, distance=Distance.COSINE)
        client.create_collection(
            collection_name=collection_name,
            vectors_config=vector_settings,
        )

        # Create payload indexes
        for field, schema in payload_indexes:
            client.create_payload_index(
                collection_name=collection_name,
                field_name=field,
                field_schema=schema,
            )
        print(" ✓ Collection created with indexes")
    except Exception as exc:
        print(f" ✗ Error ensuring collection: {exc}")
        raise
def upload_to_qdrant(
    client: QdrantClient,
    collection_name: str,
    chunks: list[str],
    embeddings: list[list[float]],
    file_name: str,
    department_id: str = None,
    region_code: str = None,
    file_type: str = None,
) -> list[str]:
    """
    Upload chunks and embeddings to Qdrant

    Each (chunk, embedding) pair becomes one point with a fresh UUID. All
    points of one call share a newly generated document ID and a single
    ``last_modified`` timestamp, and are sent in one upsert request.

    Args:
        client: Qdrant client
        collection_name: Collection name
        chunks: Text chunks
        embeddings: Embedding vectors, one per chunk and in the same order
        file_name: Original file name
        department_id: Optional department UUID
        region_code: Optional region code
        file_type: Optional lower-case file extension without the leading
            dot (e.g. "pdf"). When omitted it is derived from the suffix of
            ``file_name``.

    Returns:
        List of point IDs

    Raises:
        ValueError: If ``chunks`` and ``embeddings`` differ in length
        Exception: Any error from the upsert is reported on stdout and
            re-raised
    """
    # zip() would silently drop the surplus items and leave "total_chunks"
    # inconsistent with what is actually stored, so fail loudly instead.
    if len(chunks) != len(embeddings):
        raise ValueError(
            f"Got {len(chunks)} chunks but {len(embeddings)} embeddings; "
            "each chunk needs exactly one embedding"
        )

    # file_type used to be read from an undefined name (NameError on the
    # first chunk). It is now a parameter with a filename-based fallback.
    if file_type is None:
        file_type = Path(file_name).suffix.lstrip(".").lower()

    print(f"⬆️ Uploading to Qdrant collection: {collection_name}")
    document_id = str(uuid4())
    total_chunks = len(chunks)
    # One naive-UTC timestamp for the whole document keeps every chunk's
    # "last_modified" identical.
    last_modified = datetime.utcnow().isoformat()

    points = []
    point_ids = []
    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
        point_id = str(uuid4())
        point_ids.append(point_id)
        points.append(PointStruct(
            id=point_id,
            vector=embedding,
            payload={
                "document_id": document_id,
                "sharepoint_id": document_id,  # Use same ID for manual uploads
                "file_name": file_name,
                "file_url": f"local://{file_name}",
                "chunk_index": i,
                "total_chunks": total_chunks,
                "text": chunk,
                "department_id": department_id,
                "region_code": region_code,
                "file_type": file_type,
                "last_modified": last_modified,
                "is_active": True,
            },
        ))
        if (i + 1) % 10 == 0:
            print(f" Prepared {i + 1}/{total_chunks} points")

    # Upload all points
    try:
        client.upsert(
            collection_name=collection_name,
            points=points,
        )
        print(f" ✓ Uploaded {len(points)} points to Qdrant")
        print(f" ✓ Document ID: {document_id}")
    except Exception as e:
        print(f" ✗ Error uploading to Qdrant: {e}")
        raise
    return point_ids


def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the ingestion script."""
    parser = argparse.ArgumentParser(
        description="Ingest a local PDF file into Qdrant for RAG testing"
    )
    parser.add_argument(
        "file_path",
        help="Path to the file to ingest (PDF, DOCX, PPTX, XLSX, CSV, TXT)"
    )
    parser.add_argument(
        "--title",
        help="Document title (defaults to filename)",
        default=None
    )
    parser.add_argument(
        "--department-id",
        help="Department UUID for filtering",
        default=None
    )
    parser.add_argument(
        "--region-code",
        help="Region code (e.g., 'UK', 'US', 'APAC')",
        default=None
    )
    parser.add_argument(
        "--chunk-size",
        type=int,
        default=1000,
        help="Chunk size in characters (default: 1000)"
    )
    parser.add_argument(
        "--chunk-overlap",
        type=int,
        default=200,
        help="Chunk overlap in characters (default: 200)"
    )
    return parser


async def main():
    """
    Main execution function

    Parses the command line, then extracts, chunks, embeds and uploads the
    given file. Exits with status 1 when the path is not a file, when
    OPENAI_API_KEY is unset, when no text could be extracted, or when any
    pipeline step raises.
    """
    args = _build_arg_parser().parse_args()

    # Validate file exists (is_file() also rejects directories, which
    # os.path.exists would have let through to the converter).
    file_path = Path(args.file_path)
    if not file_path.is_file():
        print(f"❌ Error: File not found: {args.file_path}")
        sys.exit(1)

    # Get environment variables
    openai_api_key = os.getenv("OPENAI_API_KEY")
    qdrant_url = os.getenv("QDRANT_URL", "http://localhost:6333")
    if not openai_api_key:
        print("❌ Error: OPENAI_API_KEY environment variable not set")
        sys.exit(1)

    # Extract file name and type. The type always comes from the real path,
    # because --title may replace the name with text that has no extension.
    file_name = args.title or file_path.name
    file_type = file_path.suffix.lstrip(".").lower()

    print("=" * 80)
    print("📚 Document Ingestion Script")
    print("=" * 80)
    print(f"File: {args.file_path}")
    print(f"Title: {file_name}")
    print(f"Type: {file_type}")
    print(f"Department ID: {args.department_id or 'None'}")
    print(f"Region: {args.region_code or 'None'}")
    print(f"Qdrant URL: {qdrant_url}")
    print("=" * 80)
    print()

    try:
        # 1. Extract text
        text = extract_text_from_file(args.file_path)

        # 2. Chunk text
        chunks = chunk_text(text, args.chunk_size, args.chunk_overlap)
        if not chunks:
            # Stop before calling OpenAI/Qdrant and before reporting a
            # misleading "Ingestion Complete" for an empty document.
            raise ValueError(
                f"No text could be extracted from {args.file_path}; nothing to ingest"
            )

        # 3. Generate embeddings
        embeddings = await generate_embeddings(chunks, openai_api_key)

        # 4. Connect to Qdrant
        client = QdrantClient(url=qdrant_url)
        collection_name = "sharepoint_docs"
        ensure_collection(client, collection_name)

        # 5. Upload to Qdrant
        point_ids = upload_to_qdrant(
            client=client,
            collection_name=collection_name,
            chunks=chunks,
            embeddings=embeddings,
            file_name=file_name,
            department_id=args.department_id,
            region_code=args.region_code,
            file_type=file_type,
        )

        print()
        print("=" * 80)
        print("✅ Ingestion Complete!")
        print("=" * 80)
        print(f"Total chunks: {len(chunks)}")
        print(f"Total embeddings: {len(embeddings)}")
        print(f"Qdrant points: {len(point_ids)}")
        print()
        print("You can now test the RAG chat endpoint with queries related to this document.")
        print("=" * 80)
    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit code.
        print()
        print("=" * 80)
        print(f"❌ Error: {e}")
        print("=" * 80)
        sys.exit(1)


if __name__ == "__main__":
    asyncio.run(main())