"""HTTP endpoints under ``/files``: upload, decompose, URL fetch and update."""

import json
import logging
import mimetypes
import os
import uuid
from typing import Annotated, List, Optional

from fastapi import APIRouter, Body, Depends, File, HTTPException, UploadFile
from pydantic import BaseModel

from models.sql.user import UserModel
from utils.auth_dependencies import get_current_user
from constants.documents import (
    EXCEL_TYPES,
    IMAGE_UPLOAD_TYPES,
    SPREADSHEET_TYPES,
    UPLOAD_ACCEPTED_FILE_TYPES,
)
from models.decomposed_file_info import DecomposedFileInfo
from services.attachment_parser_service import (
    extract_images_metadata,
    parse_csv,
    parse_excel,
    parse_url,
)
from services.documents_loader import DocumentsLoader
from services.temp_file_service import TEMP_FILE_SERVICE
from utils.validators import validate_files

LOGGER = logging.getLogger(__name__)

FILES_ROUTER = APIRouter(prefix="/files", tags=["Files"])

# Extensions routed to parse_excel; every other spreadsheet goes to parse_csv.
_EXCEL_EXTENSIONS = (".xlsx", ".xls")
_SPREADSHEET_EXTENSIONS = _EXCEL_EXTENSIONS + (".csv",)


def _is_spreadsheet(file_path: str) -> bool:
    """Return True when the guessed MIME type or the extension is a spreadsheet."""
    mime, _ = mimetypes.guess_type(file_path)
    ext = os.path.splitext(file_path)[1].lower()
    return (
        mime in EXCEL_TYPES
        or mime in SPREADSHEET_TYPES
        or ext in _SPREADSHEET_EXTENSIONS
    )


def _is_image(file_path: str) -> bool:
    """Return True when the guessed MIME type is an accepted image upload type."""
    mime, _ = mimetypes.guess_type(file_path)
    return mime in IMAGE_UPLOAD_TYPES


def _safe_upload_name(filename: Optional[str]) -> str:
    """Reduce a client-supplied filename to a bare, non-empty file name.

    The name comes straight from the multipart request, so it may be missing
    or carry directory components (``../``, absolute paths, Windows
    separators). Only the final path component is kept; when nothing usable
    remains a random UUID is returned instead.
    """
    # Normalise backslashes so basename() also strips Windows-style prefixes.
    name = os.path.basename((filename or "").replace("\\", "/"))
    if name in ("", ".", ".."):
        return str(uuid.uuid4())
    return name


@FILES_ROUTER.post("/upload", response_model=List[str])
async def upload_files(files: Optional[List[UploadFile]]):
    """Store the uploaded files in a fresh temp directory.

    Returns:
        The paths of the stored files, in upload order.

    Raises:
        HTTPException: 400 when no files are supplied. ``validate_files`` is
            also called and may reject the upload (its behavior is defined
            outside this module).
    """
    if not files:
        raise HTTPException(status_code=400, detail="Documents are required")

    # Validate before touching the disk so a rejected upload leaves no
    # empty temp directory behind.
    validate_files(files, True, True, 100, UPLOAD_ACCEPTED_FILE_TYPES)
    temp_dir = TEMP_FILE_SERVICE.create_temp_dir(str(uuid.uuid4()))

    temp_files: List[str] = []
    for each_file in files:
        temp_path = TEMP_FILE_SERVICE.create_temp_file_path(
            _safe_upload_name(each_file.filename), temp_dir
        )
        # Read first: a failed read then never leaves a truncated file.
        content = await each_file.read()
        with open(temp_path, "wb") as f:
            f.write(content)
        temp_files.append(temp_path)

    return temp_files


def _write_text_file(temp_dir, text: str) -> str:
    """Write *text* to a new UUID-named ``.txt`` file in *temp_dir*; return its path."""
    out_path = TEMP_FILE_SERVICE.create_temp_file_path(
        f"{uuid.uuid4()}.txt", temp_dir
    )
    # Explicit UTF-8: extracted text routinely contains characters that the
    # platform default encoding cannot represent on non-UTF-8 locales.
    with open(out_path, "w", encoding="utf-8") as text_file:
        text_file.write(text)
    return out_path


async def _decompose_documents(
    file_paths: List[str], temp_dir
) -> List[DecomposedFileInfo]:
    """Run *file_paths* through ``DocumentsLoader`` and save each result as text."""
    if not file_paths:
        return []

    documents_loader = DocumentsLoader(file_paths=file_paths)
    await documents_loader.load_documents(temp_dir)

    results: List[DecomposedFileInfo] = []
    # Assumes `documents` is ordered like `file_paths` (the original code
    # indexed both with the same counter) -- TODO confirm.
    for source_path, parsed_doc in zip(file_paths, documents_loader.documents):
        # NOTE(review): the search token was lost when this file was
        # flattened (it showed up as a raw line break, which is not valid in
        # a Python string literal). "<br>" is the reconstructed value --
        # confirm against version control.
        text = parsed_doc.replace("<br>", "\n")
        results.append(
            DecomposedFileInfo(
                name=os.path.basename(source_path),
                file_path=_write_text_file(temp_dir, text),
                file_type="text",
            )
        )
    return results


def _decompose_spreadsheet(sp_path: str, temp_dir) -> DecomposedFileInfo:
    """Parse one spreadsheet and store its tables as a JSON file."""
    ext = os.path.splitext(sp_path)[1].lower()
    if ext in _EXCEL_EXTENSIONS:
        tables = parse_excel(sp_path)
    else:
        tables = [parse_csv(sp_path)]

    serialized = [table.model_dump() for table in tables]

    # Store parsed table data as a JSON file for downstream use.
    json_path = TEMP_FILE_SERVICE.create_temp_file_path(
        f"{uuid.uuid4()}.json", temp_dir
    )
    with open(json_path, "w", encoding="utf-8") as jf:
        json.dump(serialized, jf)

    return DecomposedFileInfo(
        name=os.path.basename(sp_path),
        file_path=json_path,
        file_type="table",
        table_data=serialized,
    )


async def _decompose_image(img_path: str, temp_dir) -> DecomposedFileInfo:
    """Describe one image, preferring extracted text when it is available.

    Text extraction is best effort: any failure (including a failed import
    of the vision service) is logged and the image is returned as an image.
    """
    info = extract_images_metadata(img_path)

    extracted_text = None
    try:
        # Imported lazily inside the try so a missing/broken vision service
        # degrades to the plain-image result instead of failing the request.
        from services.docling_service import extract_text_from_image_via_vision

        mime_type, _ = mimetypes.guess_type(img_path)
        with open(img_path, "rb") as f:
            image_bytes = f.read()
        extracted_text = await extract_text_from_image_via_vision(
            image_bytes, mime_type or "image/png"
        )
    except Exception as exc:  # deliberate best-effort boundary
        LOGGER.warning(
            "[decompose] Vision text extraction failed for %s: %s", img_path, exc
        )

    if extracted_text:
        # The extracted text replaces the image in the response.
        return DecomposedFileInfo(
            name=os.path.basename(img_path),
            file_path=_write_text_file(temp_dir, extracted_text),
            file_type="text",
        )

    return DecomposedFileInfo(
        name=info.filename,
        file_path=img_path,
        file_type="image",
        image_info=info.model_dump(),
    )


@FILES_ROUTER.post("/decompose", response_model=List[DecomposedFileInfo])
async def decompose_files(file_paths: Annotated[List[str], Body(embed=True)]):
    """Convert previously stored files into text, table or image descriptors.

    Files are grouped by kind and reported in this order: documents handled
    by ``DocumentsLoader``, plain ``.txt`` files (returned as-is),
    spreadsheets, then images.

    NOTE(review): ``file_paths`` comes from the request body and is read
    from disk without being confined to the upload directory -- confirm
    that callers are trusted or add a path check.
    """
    temp_dir = TEMP_FILE_SERVICE.create_temp_dir(str(uuid.uuid4()))

    txt_files: List[str] = []
    spreadsheet_files: List[str] = []
    image_files: List[str] = []
    other_files: List[str] = []
    for file_path in file_paths:
        # Case-insensitive, matching the extension checks in _is_spreadsheet.
        if file_path.lower().endswith(".txt"):
            txt_files.append(file_path)
        elif _is_spreadsheet(file_path):
            spreadsheet_files.append(file_path)
        elif _is_image(file_path):
            image_files.append(file_path)
        else:
            other_files.append(file_path)

    # --- Document files (everything not matched above) via DocumentsLoader ---
    response: List[DecomposedFileInfo] = await _decompose_documents(
        other_files, temp_dir
    )

    # --- Plain text files: already text, so the original path is reused ---
    response.extend(
        DecomposedFileInfo(
            name=os.path.basename(txt_path),
            file_path=txt_path,
            file_type="text",
        )
        for txt_path in txt_files
    )

    # --- Spreadsheet files (Excel, CSV) ---
    for sp_path in spreadsheet_files:
        response.append(_decompose_spreadsheet(sp_path, temp_dir))

    # --- Image files (with best-effort text extraction) ---
    for img_path in image_files:
        response.append(await _decompose_image(img_path, temp_dir))

    return response


class UrlParseRequest(BaseModel):
    """Request body for ``POST /files/url``."""

    url: str


class UrlParseResponse(BaseModel):
    """Response body for ``POST /files/url``: extracted content and its URL."""

    content: str
    url: str


@FILES_ROUTER.post("/url", response_model=UrlParseResponse)
async def parse_url_endpoint(body: UrlParseRequest):
    """Fetch a URL and extract its article content as text.

    Raises:
        HTTPException: 400 when the URL is empty or whitespace; 422 when
            ``parse_url`` returns no content.

    NOTE(review): unlike ``/fetch-url`` this route neither checks the URL
    scheme nor requires an authenticated user -- confirm that is intended.
    """
    if not body.url or not body.url.strip():
        raise HTTPException(status_code=400, detail="URL is required")

    content = await parse_url(body.url)
    if not content:
        raise HTTPException(
            status_code=422, detail="Could not extract content from the provided URL"
        )

    return UrlParseResponse(content=content, url=body.url)


@FILES_ROUTER.post("/fetch-url")
async def fetch_url_content(
    url: str = Body(..., embed=True),
    _current_user: UserModel = Depends(get_current_user),
):
    """Fetch a URL and extract its text content.

    Raises:
        HTTPException: 400 when the URL does not start with ``http://`` or
            ``https://``; 422 when ``parse_url`` returns no content.
    """
    if not url.startswith(("http://", "https://")):
        raise HTTPException(status_code=400, detail="Invalid URL")

    text = await parse_url(url)
    if not text:
        raise HTTPException(
            status_code=422, detail="Could not extract content from URL"
        )

    return {"text": text, "url": url}


@FILES_ROUTER.post("/update")
async def update_files(
    file_path: Annotated[str, Body()],
    file: Annotated[UploadFile, File()],
):
    """Overwrite the file at ``file_path`` with the uploaded content.

    NOTE(review): ``file_path`` is taken verbatim from the request, so this
    endpoint can write to any location the server process can write to.
    It should be restricted to the temp/upload directory; the fix is left
    to a follow-up because that directory is not visible from this module.
    """
    content = await file.read()
    with open(file_path, "wb") as f:
        f.write(content)

    return {"message": "File updated successfully"}