- Remove notebook mode, add RAG + Personal Assistant dual-bot setup - Add knowledge base management (upload, URL scraping, document processing) - Add user feature access control (allowed_features, features_override) - Update admin dashboard with knowledge base tab - Redesign login page, sidebar, and profile - Add Celery tasks for async document processing Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
351 lines
9.9 KiB
Python
Executable file
351 lines
9.9 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Manual Document Ingestion Script for Testing RAG
|
|
|
|
This script allows you to manually upload a local file (PDF, DOCX, PPTX, XLSX, CSV, TXT) to Qdrant for testing
|
|
the chat/RAG logic without needing SharePoint integration.
|
|
|
|
Usage:
|
|
python scripts/ingest_local_file.py <file_path> [options]
|
|
|
|
Example:
|
|
python scripts/ingest_local_file.py documents/company_policy.pdf --title "Company Policy" --department-id <uuid>
|
|
"""
|
|
import asyncio
|
|
import argparse
|
|
import sys
|
|
import os
|
|
from pathlib import Path
|
|
from uuid import uuid4
|
|
from datetime import datetime
|
|
|
|
# Add parent directory to path to import app modules
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from markitdown import MarkItDown
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
from langchain_openai import OpenAIEmbeddings
|
|
from qdrant_client import QdrantClient
|
|
from qdrant_client.models import PointStruct, Distance, VectorParams
|
|
from dotenv import load_dotenv
|
|
|
|
# Load environment variables
|
|
load_dotenv()
|
|
|
|
|
|
def extract_text_from_file(file_path: str) -> str:
    """
    Convert a local document into text via MarkItDown

    Args:
        file_path: Location of the document to convert

    Returns:
        The text content of the conversion result, or an empty string
        when the result carries no text

    Raises:
        Exception: Any failure during conversion is printed and then
            re-raised unchanged
    """
    print(f"📄 Extracting text from: {file_path}")

    try:
        converter = MarkItDown()
        conversion = converter.convert(file_path)

        # A falsy text_content (e.g. None) is normalised to an empty string
        extracted = conversion.text_content
        if not extracted:
            extracted = ""

        print(f" ✓ Total characters extracted: {len(extracted)}")
        return extracted

    except Exception as err:
        print(f" ✗ Error extracting text: {err}")
        raise
|
|
|
|
|
|
def chunk_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> list[str]:
    """
    Break text into overlapping pieces with RecursiveCharacterTextSplitter

    Args:
        text: The text to split
        chunk_size: Upper bound on the length of a piece, measured with len()
        chunk_overlap: Number of characters shared between neighbouring pieces

    Returns:
        The pieces produced by the splitter
    """
    print(f"📝 Chunking text (size={chunk_size}, overlap={chunk_overlap})")

    # Separators run from paragraph break down to the empty string
    boundary_candidates = ["\n\n", "\n", ". ", " ", ""]

    pieces = RecursiveCharacterTextSplitter(
        separators=boundary_candidates,
        length_function=len,
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    ).split_text(text)

    print(f" ✓ Created {len(pieces)} chunks")
    return pieces
|
|
|
|
|
|
async def generate_embeddings(chunks: list[str], api_key: str) -> list[list[float]]:
    """
    Embed every text chunk with the text-embedding-3-large model

    Args:
        chunks: Text chunks to embed
        api_key: OpenAI API key handed to the embeddings client

    Returns:
        One embedding vector per chunk, in the same order as chunks
    """
    print(f"🔢 Generating embeddings for {len(chunks)} chunks")

    embedder = OpenAIEmbeddings(
        model="text-embedding-3-large",
        openai_api_key=api_key,
        dimensions=3072,
    )

    # Requests are sent in fixed-size batches (intended to stay clear of
    # rate limits); the batch total is only used for progress output.
    batch_size = 100
    total_batches = (len(chunks) - 1) // batch_size + 1

    vectors = []
    batch_starts = range(0, len(chunks), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        current_batch = chunks[start:start + batch_size]
        vectors.extend(await embedder.aembed_documents(current_batch))
        print(f" ✓ Processed batch {batch_number}/{total_batches}")

    print(f" ✓ Generated {len(vectors)} embeddings")
    return vectors
|
|
|
|
|
|
def ensure_collection(client: QdrantClient, collection_name: str):
    """
    Make sure the named Qdrant collection is present

    When the collection is missing it is created with 3072-dimensional
    cosine-distance vectors, followed by payload indexes on
    department_id (keyword), region_code (keyword) and is_active (bool).
    An existing collection is left untouched.

    Args:
        client: Qdrant client
        collection_name: Name of the collection to look up or create

    Raises:
        Exception: Any client error is printed and then re-raised unchanged
    """
    print(f"🗄️ Checking Qdrant collection: {collection_name}")

    # (payload field, index schema) pairs created for a new collection
    payload_indexes = (
        ("department_id", "keyword"),
        ("region_code", "keyword"),
        ("is_active", "bool"),
    )

    try:
        known = client.get_collections().collections
        if any(entry.name == collection_name for entry in known):
            print(" ✓ Collection exists")
            return

        print(f" Creating collection: {collection_name}")
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=3072, distance=Distance.COSINE)
        )

        for field_name, field_schema in payload_indexes:
            client.create_payload_index(
                collection_name=collection_name,
                field_name=field_name,
                field_schema=field_schema
            )
        print(" ✓ Collection created with indexes")

    except Exception as err:
        print(f" ✗ Error ensuring collection: {err}")
        raise
|
|
|
|
|
|
def upload_to_qdrant(
    client: QdrantClient,
    collection_name: str,
    chunks: list[str],
    embeddings: list[list[float]],
    file_name: str,
    department_id: str = None,
    region_code: str = None,
    file_type: str = None
) -> list[str]:
    """
    Upload chunks and embeddings to Qdrant as a single document

    A fresh document ID is generated per call and stored on every point;
    the same value is reused as "sharepoint_id" because manual uploads have
    no SharePoint counterpart. Each point gets its own random UUID.

    Args:
        client: Qdrant client
        collection_name: Collection name
        chunks: Text chunks
        embeddings: Embedding vectors, one per chunk and in the same order
        file_name: Name stored in the payload and used to build the
            "local://" file URL
        department_id: Optional department UUID
        region_code: Optional region code
        file_type: Optional lowercase file extension without the dot
            (e.g. "pdf"). When omitted it is derived, best-effort, from
            the suffix of file_name.

    Returns:
        List of point IDs (empty when there were no chunks to upload)

    Raises:
        ValueError: If chunks and embeddings differ in length
        Exception: Any error from the Qdrant upsert is printed and re-raised
    """
    print(f"⬆️ Uploading to Qdrant collection: {collection_name}")

    # zip() would silently drop the surplus items while "total_chunks"
    # still reported len(chunks), so reject mismatched input up front.
    if len(chunks) != len(embeddings):
        raise ValueError(
            f"chunks ({len(chunks)}) and embeddings ({len(embeddings)}) "
            "must have the same length"
        )

    # Nothing to store: skip the request instead of sending an empty upsert.
    if not chunks:
        print(" ⚠ No chunks to upload; skipping Qdrant upsert")
        return []

    # Previously file_type was read without ever being defined in this
    # function, which raised NameError on the first chunk.
    if file_type is None:
        file_type = Path(file_name).suffix.lstrip(".").lower()

    document_id = str(uuid4())
    points = []
    point_ids = []

    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
        point_id = str(uuid4())
        point_ids.append(point_id)

        points.append(PointStruct(
            id=point_id,
            vector=embedding,
            payload={
                "document_id": document_id,
                "sharepoint_id": document_id,  # Use same ID for manual uploads
                "file_name": file_name,
                "file_url": f"local://{file_name}",
                "chunk_index": i,
                "total_chunks": len(chunks),
                "text": chunk,
                "department_id": department_id,
                "region_code": region_code,
                "file_type": file_type,
                "last_modified": datetime.utcnow().isoformat(),
                "is_active": True
            }
        ))

        # Progress output every ten prepared points
        if (i + 1) % 10 == 0:
            print(f" Prepared {i + 1}/{len(chunks)} points")

    # Upload all points
    try:
        client.upsert(
            collection_name=collection_name,
            points=points
        )
        print(f" ✓ Uploaded {len(points)} points to Qdrant")
        print(f" ✓ Document ID: {document_id}")

    except Exception as e:
        print(f" ✗ Error uploading to Qdrant: {e}")
        raise

    return point_ids


def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the ingestion script"""
    parser = argparse.ArgumentParser(
        description="Ingest a local PDF file into Qdrant for RAG testing"
    )
    parser.add_argument(
        "file_path",
        help="Path to the file to ingest (PDF, DOCX, PPTX, XLSX, CSV, TXT)"
    )
    parser.add_argument(
        "--title",
        help="Document title (defaults to filename)",
        default=None
    )
    parser.add_argument(
        "--department-id",
        help="Department UUID for filtering",
        default=None
    )
    parser.add_argument(
        "--region-code",
        help="Region code (e.g., 'UK', 'US', 'APAC')",
        default=None
    )
    parser.add_argument(
        "--chunk-size",
        type=int,
        default=1000,
        help="Chunk size in characters (default: 1000)"
    )
    parser.add_argument(
        "--chunk-overlap",
        type=int,
        default=200,
        help="Chunk overlap in characters (default: 200)"
    )
    return parser


async def main():
    """
    Run the ingestion pipeline for one local file

    Steps: parse the command line, extract text, chunk it, embed the
    chunks, make sure the "sharepoint_docs" collection exists and upload
    the points. Reads OPENAI_API_KEY (required) and QDRANT_URL (defaults
    to http://localhost:6333) from the environment. Exits with status 1
    on a missing file, a missing API key or any pipeline error.
    """
    args = _build_arg_parser().parse_args()

    # Validate that the path points at a regular file (a directory would
    # pass a plain existence check and only fail later during extraction)
    file_path = Path(args.file_path)
    if not file_path.is_file():
        print(f"❌ Error: File not found: {args.file_path}")
        sys.exit(1)

    # Get environment variables
    openai_api_key = os.getenv("OPENAI_API_KEY")
    qdrant_url = os.getenv("QDRANT_URL", "http://localhost:6333")

    if not openai_api_key:
        print("❌ Error: OPENAI_API_KEY environment variable not set")
        sys.exit(1)

    # Extract file name and type
    file_name = args.title or file_path.name
    file_type = file_path.suffix.lstrip(".").lower()

    print("=" * 80)
    print("📚 Document Ingestion Script")
    print("=" * 80)
    print(f"File: {args.file_path}")
    print(f"Title: {file_name}")
    print(f"Type: {file_type}")
    print(f"Department ID: {args.department_id or 'None'}")
    print(f"Region: {args.region_code or 'None'}")
    print(f"Qdrant URL: {qdrant_url}")
    print("=" * 80)
    print()

    try:
        # 1. Extract text
        text = extract_text_from_file(args.file_path)

        # 2. Chunk text
        chunks = chunk_text(text, args.chunk_size, args.chunk_overlap)

        # Stop before spending an embedding call on a document that
        # produced no text; otherwise the run would report success with
        # zero points stored.
        if not chunks:
            raise ValueError(
                "No text could be extracted from the file; nothing to ingest"
            )

        # 3. Generate embeddings
        embeddings = await generate_embeddings(chunks, openai_api_key)

        # 4. Connect to Qdrant
        client = QdrantClient(url=qdrant_url)
        collection_name = "sharepoint_docs"
        ensure_collection(client, collection_name)

        # 5. Upload to Qdrant
        point_ids = upload_to_qdrant(
            client=client,
            collection_name=collection_name,
            chunks=chunks,
            embeddings=embeddings,
            file_name=file_name,
            department_id=args.department_id,
            region_code=args.region_code,
            # Taken from the real path: file_name may be a free-form title
            file_type=file_type
        )

        print()
        print("=" * 80)
        print("✅ Ingestion Complete!")
        print("=" * 80)
        print(f"Total chunks: {len(chunks)}")
        print(f"Total embeddings: {len(embeddings)}")
        print(f"Qdrant points: {len(point_ids)}")
        print()
        print("You can now test the RAG chat endpoint with queries related to this document.")
        print("=" * 80)

    except Exception as e:
        print()
        print("=" * 80)
        print(f"❌ Error: {e}")
        print("=" * 80)
        sys.exit(1)
|
|
|
|
|
|
if __name__ == "__main__":
    # Start the async pipeline only when executed as a script, not on import
    ingestion = main()
    asyncio.run(ingestion)
|