# Oliver-ai-bot_2.0/backend/scripts/ingest_local_file.py

#!/usr/bin/env python3
"""
Manual Document Ingestion Script for Testing RAG

This script lets you manually upload a local file (PDF, DOCX, PPTX, XLSX, CSV
or TXT) to Qdrant for testing the chat/RAG logic without needing SharePoint
integration.

Usage:
    python scripts/ingest_local_file.py <pdf_path> [options]

Example:
    python scripts/ingest_local_file.py documents/company_policy.pdf --title "Company Policy" --department-id <uuid>
"""
import asyncio
import argparse
import sys
import os
from pathlib import Path
from uuid import uuid4
from datetime import datetime

# Add parent directory to path to import app modules
sys.path.insert(0, str(Path(__file__).parent.parent))

from markitdown import MarkItDown
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, Distance, VectorParams
from dotenv import load_dotenv

# Load environment variables
load_dotenv()


def extract_text_from_file(file_path: str) -> str:
    """
    Convert a local document to markdown text via MarkItDown

    Args:
        file_path: Location of the document on disk

    Returns:
        The converted text, or an empty string when the converter
        produced no text content

    Raises:
        Exception: Any conversion failure is reported on stdout and re-raised
    """
    print(f"📄 Extracting text from: {file_path}")

    try:
        converted = MarkItDown().convert(file_path)
        # Normalise a missing/empty text_content to "" so callers always get a str
        extracted = converted.text_content or ""
        print(f"   ✓ Total characters extracted: {len(extracted)}")
    except Exception as exc:
        print(f"   ✗ Error extracting text: {exc}")
        raise

    return extracted


def chunk_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> list[str]:
    """
    Break a document into overlapping pieces ready for embedding

    Args:
        text: Full document text
        chunk_size: Upper bound on chunk length, measured with len()
        chunk_overlap: Number of characters shared by neighbouring chunks

    Returns:
        The chunks produced by RecursiveCharacterTextSplitter
    """
    print(f"📝 Chunking text (size={chunk_size}, overlap={chunk_overlap})")

    # Separator list handed to the splitter: paragraph breaks, line breaks,
    # sentence ends, spaces and finally the empty string.
    separator_priority = ["\n\n", "\n", ". ", " ", ""]
    pieces = RecursiveCharacterTextSplitter(
        separators=separator_priority,
        length_function=len,
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    ).split_text(text)

    print(f"   ✓ Created {len(pieces)} chunks")
    return pieces


async def generate_embeddings(
    chunks: list[str],
    api_key: str,
    *,
    model: str = "text-embedding-3-large",
    dimensions: int = 3072,
    batch_size: int = 100,
) -> list[list[float]]:
    """
    Generate embeddings for text chunks

    Args:
        chunks: List of text chunks
        api_key: OpenAI API key
        model: OpenAI embedding model name
        dimensions: Size of each embedding vector. The collection created by
            this script uses 3072-dimensional vectors, so keep the default
            unless the collection is configured to match.
        batch_size: Number of chunks sent per embedding request

    Returns:
        List of embedding vectors, one per chunk, in the same order

    Raises:
        ValueError: If batch_size is not a positive integer
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    print(f"🔢 Generating embeddings for {len(chunks)} chunks")

    all_embeddings: list[list[float]] = []
    if not chunks:
        # Nothing to embed: skip building the OpenAI client entirely.
        print(f"   ✓ Generated {len(all_embeddings)} embeddings")
        return all_embeddings

    embeddings_model = OpenAIEmbeddings(
        model=model,
        openai_api_key=api_key,
        dimensions=dimensions
    )

    # Process in batches to avoid rate limits. The batch count is a ceiling
    # division, computed once instead of on every iteration.
    total_batches = (len(chunks) + batch_size - 1) // batch_size
    batch_starts = range(0, len(chunks), batch_size)

    for batch_number, start in enumerate(batch_starts, start=1):
        batch = chunks[start:start + batch_size]
        all_embeddings.extend(await embeddings_model.aembed_documents(batch))
        print(f"   ✓ Processed batch {batch_number}/{total_batches}")

    print(f"   ✓ Generated {len(all_embeddings)} embeddings")
    return all_embeddings


def ensure_collection(client: QdrantClient, collection_name: str):
    """
    Create the Qdrant collection and its payload indexes when it is missing

    A new collection is created with 3072-dimensional cosine vectors and
    payload indexes on department_id, region_code and is_active. An
    existing collection is left untouched.

    Args:
        client: Qdrant client used for all calls
        collection_name: Collection to look up or create

    Raises:
        Exception: Any client failure is reported on stdout and re-raised
    """
    print(f"🗄️  Checking Qdrant collection: {collection_name}")

    try:
        existing_names = [entry.name for entry in client.get_collections().collections]
        if collection_name in existing_names:
            print(f"   ✓ Collection exists")
            return

        print(f"   Creating collection: {collection_name}")
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=3072, distance=Distance.COSINE)
        )

        # Create payload indexes: (payload field, index schema) pairs
        indexed_fields = (
            ("department_id", "keyword"),
            ("region_code", "keyword"),
            ("is_active", "bool"),
        )
        for field, schema in indexed_fields:
            client.create_payload_index(
                collection_name=collection_name,
                field_name=field,
                field_schema=schema
            )
        print(f"   ✓ Collection created with indexes")

    except Exception as exc:
        print(f"   ✗ Error ensuring collection: {exc}")
        raise


def upload_to_qdrant(
    client: QdrantClient,
    collection_name: str,
    chunks: list[str],
    embeddings: list[list[float]],
    file_name: str,
    department_id: str = None,
    region_code: str = None,
    file_type: str = None,
    batch_size: int = 100
) -> list[str]:
    """
    Upload chunks and embeddings to Qdrant

    Every chunk becomes one point with a fresh UUID. All points of one call
    share a generated document ID and a single last_modified timestamp.

    Args:
        client: Qdrant client
        collection_name: Collection name
        chunks: Text chunks
        embeddings: Embedding vectors, one per chunk, in the same order
        file_name: Original file name (or display title)
        department_id: Optional department UUID
        region_code: Optional region code
        file_type: Optional file extension without the dot (e.g. "pdf").
            When omitted it is derived from the suffix of file_name.
        batch_size: Maximum number of points sent per upsert request

    Returns:
        List of point IDs

    Raises:
        ValueError: If chunks and embeddings differ in length, or batch_size
            is not a positive integer
        Exception: Any upsert failure is reported on stdout and re-raised
    """
    # Imported here so the function does not rely on changes to the
    # file-level import block.
    from datetime import timezone

    # zip() would silently drop the unmatched tail and leave total_chunks wrong.
    if len(chunks) != len(embeddings):
        raise ValueError(
            f"Got {len(chunks)} chunks but {len(embeddings)} embeddings; "
            "they must match one-to-one"
        )
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    print(f"⬆️  Uploading to Qdrant collection: {collection_name}")

    if file_type is None:
        # Previously this name was read without ever being defined in this
        # scope (NameError). Fall back to the extension of file_name.
        file_type = Path(file_name).suffix.lstrip(".").lower()

    document_id = str(uuid4())
    # One timestamp for the whole document. Same naive-UTC ISO format as
    # datetime.utcnow().isoformat(), without the deprecated call.
    last_modified = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    total_chunks = len(chunks)
    points = []
    point_ids = []

    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
        point_id = str(uuid4())
        point_ids.append(point_id)

        points.append(PointStruct(
            id=point_id,
            vector=embedding,
            payload={
                "document_id": document_id,
                "sharepoint_id": document_id,  # Use same ID for manual uploads
                "file_name": file_name,
                "file_url": f"local://{file_name}",
                "chunk_index": i,
                "total_chunks": total_chunks,
                "text": chunk,
                "department_id": department_id,
                "region_code": region_code,
                "file_type": file_type,
                "last_modified": last_modified,
                "is_active": True
            }
        ))

        if (i + 1) % 10 == 0:
            print(f"   Prepared {i + 1}/{total_chunks} points")

    # Upload in bounded batches so a large document does not become one
    # oversized request. An empty document sends no request at all.
    try:
        for start in range(0, len(points), batch_size):
            client.upsert(
                collection_name=collection_name,
                points=points[start:start + batch_size]
            )
        print(f"   ✓ Uploaded {len(points)} points to Qdrant")
        print(f"   ✓ Document ID: {document_id}")

    except Exception as e:
        print(f"   ✗ Error uploading to Qdrant: {e}")
        raise

    return point_ids


async def main():
    """
    Main execution function

    Parses the command line, then runs the pipeline: extract text, chunk,
    embed, ensure the "sharepoint_docs" collection exists, and upload.
    Reads OPENAI_API_KEY (required) and QDRANT_URL (defaults to
    http://localhost:6333) from the environment. Exits with status 1 when
    the input file is missing, the API key is unset, or any step fails.
    """
    parser = argparse.ArgumentParser(
        description="Ingest a local PDF file into Qdrant for RAG testing"
    )
    parser.add_argument(
        "file_path",
        help="Path to the file to ingest (PDF, DOCX, PPTX, XLSX, CSV, TXT)"
    )
    parser.add_argument(
        "--title",
        help="Document title (defaults to filename)",
        default=None
    )
    parser.add_argument(
        "--department-id",
        help="Department UUID for filtering",
        default=None
    )
    parser.add_argument(
        "--region-code",
        help="Region code (e.g., 'UK', 'US', 'APAC')",
        default=None
    )
    parser.add_argument(
        "--chunk-size",
        type=int,
        default=1000,
        help="Chunk size in characters (default: 1000)"
    )
    parser.add_argument(
        "--chunk-overlap",
        type=int,
        default=200,
        help="Chunk overlap in characters (default: 200)"
    )

    args = parser.parse_args()

    # Validate that the path is an existing regular file; a directory
    # would pass a plain existence check and fail later during extraction.
    file_path = Path(args.file_path)
    if not file_path.is_file():
        print(f"❌ Error: File not found: {args.file_path}")
        sys.exit(1)

    # Get environment variables
    openai_api_key = os.getenv("OPENAI_API_KEY")
    qdrant_url = os.getenv("QDRANT_URL", "http://localhost:6333")

    if not openai_api_key:
        print("❌ Error: OPENAI_API_KEY environment variable not set")
        sys.exit(1)

    # Extract file name and type. The type always comes from the real path,
    # because --title may replace the name and carry no extension.
    file_name = args.title or file_path.name
    file_type = file_path.suffix.lstrip(".").lower()

    print("=" * 80)
    print("📚 Document Ingestion Script")
    print("=" * 80)
    print(f"File: {args.file_path}")
    print(f"Title: {file_name}")
    print(f"Type: {file_type}")
    print(f"Department ID: {args.department_id or 'None'}")
    print(f"Region: {args.region_code or 'None'}")
    print(f"Qdrant URL: {qdrant_url}")
    print("=" * 80)
    print()

    client = None
    try:
        # 1. Extract text
        text = extract_text_from_file(args.file_path)

        # 2. Chunk text
        chunks = chunk_text(text, args.chunk_size, args.chunk_overlap)
        if not chunks:
            # Stop before spending API calls or sending an empty upload.
            raise ValueError(
                f"No text could be extracted from {args.file_path}; nothing to ingest"
            )

        # 3. Generate embeddings
        embeddings = await generate_embeddings(chunks, openai_api_key)

        # 4. Connect to Qdrant
        client = QdrantClient(url=qdrant_url)
        collection_name = "sharepoint_docs"
        ensure_collection(client, collection_name)

        # 5. Upload to Qdrant
        point_ids = upload_to_qdrant(
            client=client,
            collection_name=collection_name,
            chunks=chunks,
            embeddings=embeddings,
            file_name=file_name,
            department_id=args.department_id,
            region_code=args.region_code,
            file_type=file_type
        )

        print()
        print("=" * 80)
        print("✅ Ingestion Complete!")
        print("=" * 80)
        print(f"Total chunks: {len(chunks)}")
        print(f"Total embeddings: {len(embeddings)}")
        print(f"Qdrant points: {len(point_ids)}")
        print()
        print("You can now test the RAG chat endpoint with queries related to this document.")
        print("=" * 80)

    except Exception as e:
        print()
        print("=" * 80)
        print(f"❌ Error: {e}")
        print("=" * 80)
        sys.exit(1)

    finally:
        # Release the client's connections on success and on failure alike.
        if client is not None:
            client.close()


# Run the async ingestion pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())