""" Celery Task for processing Knowledge Base documents uploaded via admin panel. Reads the uploaded file from disk, runs it through DocumentProcessor (same pipeline as SharePoint sync), and updates the DB record. """ import asyncio import logging import os from datetime import datetime from typing import Any, Dict from uuid import UUID from sqlalchemy import select from app.core.document_processor import DocumentProcessor, DocumentProcessingError from app.database import AsyncSessionLocal from app.models.knowledge_document import KnowledgeDocument, DocumentStatus from celery_app import celery_app logger = logging.getLogger(__name__) @celery_app.task( name="app.tasks.knowledge_processing.process_knowledge_document", bind=True, max_retries=2, default_retry_delay=30, ) def process_knowledge_document(self, document_id: str, file_path: str) -> Dict[str, Any]: """ Process an uploaded knowledge base document. Args: document_id: UUID string of the KnowledgeDocument record file_path: Path to the uploaded temp file on disk Returns: Dict with processing result """ return asyncio.run(_async_process_knowledge_document(document_id, file_path)) async def _async_process_knowledge_document(document_id: str, file_path: str) -> Dict[str, Any]: """Async implementation of process_knowledge_document.""" async with AsyncSessionLocal() as session: # Load document record result = await session.execute( select(KnowledgeDocument).where(KnowledgeDocument.id == UUID(document_id)) ) doc = result.scalar_one_or_none() if not doc: logger.error("KnowledgeDocument %s not found", document_id) return {"error": "document_not_found"} # Update status to PROCESSING doc.status = DocumentStatus.PROCESSING await session.commit() try: # Read file bytes with open(file_path, "rb") as f: file_bytes = f.read() # Process through DocumentProcessor (same pipeline as SharePoint) processor = DocumentProcessor() vector_count = await processor.process_document( file_bytes=file_bytes, file_name=doc.file_name, file_type=doc.file_type, sharepoint_id=doc.document_key, # Maps to sharepoint_id in Qdrant file_url="", # No URL for uploaded files source_id="knowledge_base", # Distinguishes from SharePoint docs department_id=str(doc.department_id) if doc.department_id else None, region_code=doc.region_code, ) # Success: update record doc.status = DocumentStatus.COMPLETED doc.vector_count = vector_count doc.processed_at = datetime.utcnow() doc.error_message = None await session.commit() logger.info( "Knowledge document %s processed: %d vectors", document_id, vector_count, ) return { "document_id": document_id, "vector_count": vector_count, "status": "completed", } except Exception as exc: logger.exception("Failed to process knowledge document %s: %s", document_id, exc) doc.status = DocumentStatus.FAILED doc.error_message = str(exc)[:2000] await session.commit() return {"error": str(exc), "document_id": document_id} finally: # Clean up temp file try: if os.path.exists(file_path): os.remove(file_path) except OSError as e: logger.warning("Failed to remove temp file %s: %s", file_path, e)