54 lines
1.7 KiB
Python
54 lines
1.7 KiB
Python
"""
Cloud Run HTTP service for document processing.

Endpoints:
    GET  /health            → Health check; returns {"status": "ok"}
    POST /process-document  → Extract text + chunk → return
                              {"status": "completed", "chunks": [...], "total_chunks": N}
                              (stateless, no Qdrant)
"""
|
|
import logging
|
|
from typing import Optional
|
|
|
|
from fastapi import FastAPI, File, Form, UploadFile, HTTPException
|
|
|
|
# Configure root logging at INFO level for the whole process.
logging.basicConfig(level=logging.INFO)
# Module-scoped logger used by the request handlers in this file.
logger = logging.getLogger(__name__)

# Application object; the route decorators in this module register on it.
app = FastAPI(title="Nexus Document Processor")
|
|
|
|
|
|
@app.get("/health")
def health():
    """Health check endpoint: always answers with a constant "ok" status."""
    payload = {"status": "ok"}
    return payload
|
|
|
|
|
|
@app.post("/process-document")
async def process_document(
    file: UploadFile = File(...),
    file_name: str = Form(...),
    file_type: str = Form(...),
    sharepoint_id: str = Form(...),
    department_id: Optional[str] = Form(None),
    region_code: Optional[str] = Form(None),
):
    """
    Extract text and chunk a document. Returns chunks as JSON — no Qdrant, no embeddings.

    The calling backend (on GCE) handles embed + upsert to Qdrant.

    Args:
        file: Uploaded document; its full content is read into memory.
        file_name: Name passed through to the processor and used in log output.
        file_type: Type passed through to the processor and used in log output.
        sharepoint_id: Required form field. Accepted but not used by this handler.
        department_id: Optional form field. Accepted but not used by this handler.
        region_code: Optional form field. Accepted but not used by this handler.

    Returns:
        A dict with ``"status": "completed"``, the extracted ``"chunks"`` and
        their count under ``"total_chunks"``.

    Raises:
        HTTPException: 422 with the processor's message when it raises
            ``DocumentProcessingError``; 500 with a generic message for any
            other failure (the full traceback is logged, not returned).
    """
    # Function-scope imports, matching the original's lazy import of the processor.
    from fastapi.concurrency import run_in_threadpool

    from app.core.document_processor import DocumentProcessor, DocumentProcessingError

    try:
        file_bytes = await file.read()
        processor = DocumentProcessor()
        # extract_and_chunk is a synchronous call. Running it directly inside
        # this coroutine would block the event loop (and therefore /health)
        # for the whole extraction, so it is offloaded to the worker threadpool.
        chunks = await run_in_threadpool(
            processor.extract_and_chunk, file_bytes, file_name, file_type
        )
        logger.info(
            "Extracted %d chunks from '%s' (type=%s)", len(chunks), file_name, file_type
        )
        return {
            "status": "completed",
            "chunks": chunks,
            "total_chunks": len(chunks),
        }
    except DocumentProcessingError as e:
        logger.error("Document processing failed: %s", e)
        raise HTTPException(status_code=422, detail=str(e)) from e
    except Exception as e:
        logger.exception("Unexpected error processing document: %s", e)
        # Do not echo arbitrary exception text to the client; it can expose
        # internal paths or library details. The traceback is in the log.
        raise HTTPException(
            status_code=500,
            detail="Internal error while processing document",
        ) from e
|