# ppt-tool/backend/api/v1/ppt/endpoints/files.py

import json
import mimetypes
import os
import uuid
from typing import Annotated, List, Optional

from fastapi import APIRouter, Body, Depends, File, HTTPException, UploadFile
from pydantic import BaseModel

from models.sql.user import UserModel
from utils.auth_dependencies import get_current_user

from constants.documents import (
    EXCEL_TYPES,
    IMAGE_UPLOAD_TYPES,
    SPREADSHEET_TYPES,
    UPLOAD_ACCEPTED_FILE_TYPES,
)
from models.decomposed_file_info import DecomposedFileInfo
from services.attachment_parser_service import (
    extract_images_metadata,
    parse_csv,
    parse_excel,
    parse_url,
)
from services.documents_loader import DocumentsLoader
from services.temp_file_service import TEMP_FILE_SERVICE
from utils.validators import validate_files

FILES_ROUTER = APIRouter(prefix="/files", tags=["Files"])


def _is_spreadsheet(file_path: str) -> bool:
    """Return True when *file_path* looks like an Excel or CSV file.

    A path qualifies if its guessed MIME type is listed in ``EXCEL_TYPES`` or
    ``SPREADSHEET_TYPES``, or if its extension (compared case-insensitively)
    is ``.xlsx``, ``.xls`` or ``.csv``.
    """
    guessed_type = mimetypes.guess_type(file_path)[0]
    if guessed_type in EXCEL_TYPES or guessed_type in SPREADSHEET_TYPES:
        return True

    # Fall back to the extension when the MIME type is unknown or unlisted.
    _, suffix = os.path.splitext(file_path)
    return suffix.lower() in (".xlsx", ".xls", ".csv")


def _is_image(file_path: str) -> bool:
    """Return True when the MIME type guessed from *file_path* is an accepted image type.

    The decision is based solely on ``mimetypes.guess_type`` (i.e. the file
    name), checked against ``IMAGE_UPLOAD_TYPES``.
    """
    guessed_type = mimetypes.guess_type(file_path)[0]
    return guessed_type in IMAGE_UPLOAD_TYPES


@FILES_ROUTER.post("/upload", response_model=List[str])
async def upload_files(files: Optional[List[UploadFile]]):
    """Store the uploaded files in a fresh temp directory and return their paths.

    Args:
        files: Uploaded files from the multipart request.

    Returns:
        The path of each stored file, in upload order.

    Raises:
        HTTPException: 400 when no files are supplied or when an upload has
            no usable file name. ``validate_files`` presumably raises for
            files that fail validation (its return value is not used here).
    """
    if not files:
        raise HTTPException(status_code=400, detail="Documents are required")

    # Validate before touching the filesystem so a rejected request does not
    # leave an empty temp directory behind.
    validate_files(files, True, True, 100, UPLOAD_ACCEPTED_FILE_TYPES)

    # The file name comes from the client; keep only its final path component
    # so it cannot point outside the temp directory (e.g. "../../x" or a
    # Windows-style "C:\\dir\\x").
    safe_names: List[str] = []
    for each_file in files:
        raw_name = (each_file.filename or "").replace("\\", "/")
        safe_name = os.path.basename(raw_name)
        if safe_name in ("", ".", ".."):
            raise HTTPException(status_code=400, detail="Invalid file name")
        safe_names.append(safe_name)

    temp_dir = TEMP_FILE_SERVICE.create_temp_dir(str(uuid.uuid4()))

    temp_files: List[str] = []
    for each_file, safe_name in zip(files, safe_names):
        temp_path = TEMP_FILE_SERVICE.create_temp_file_path(safe_name, temp_dir)
        # Read the upload before opening the destination so a failed read
        # does not leave an empty file on disk.
        content = await each_file.read()
        with open(temp_path, "wb") as f:
            f.write(content)
        temp_files.append(temp_path)

    return temp_files


def _write_temp_text(text: str, temp_dir) -> str:
    """Write *text* to a new uniquely named ``.txt`` file under *temp_dir*.

    Returns the path produced by ``TEMP_FILE_SERVICE.create_temp_file_path``.
    """
    text_path = TEMP_FILE_SERVICE.create_temp_file_path(
        f"{uuid.uuid4()}.txt", temp_dir
    )
    # Explicit UTF-8: the platform default encoding can fail on non-ASCII text.
    with open(text_path, "w", encoding="utf-8") as text_file:
        text_file.write(text)
    return text_path


async def _decompose_documents(
    doc_paths: List[str], temp_dir
) -> List[DecomposedFileInfo]:
    """Load *doc_paths* through ``DocumentsLoader`` and save each parsed text to a file."""
    documents_loader = DocumentsLoader(file_paths=doc_paths)
    await documents_loader.load_documents(temp_dir)

    results: List[DecomposedFileInfo] = []
    # Assumes ``documents_loader.documents`` is ordered like ``doc_paths`` so
    # each parsed text can be paired with its source name — TODO confirm.
    for source_path, parsed_doc in zip(doc_paths, documents_loader.documents):
        text_path = _write_temp_text(parsed_doc.replace("<br>", "\n"), temp_dir)
        results.append(
            DecomposedFileInfo(
                name=os.path.basename(source_path),
                file_path=text_path,
                file_type="text",
            )
        )
    return results


def _decompose_spreadsheet(sheet_path: str, temp_dir) -> DecomposedFileInfo:
    """Parse one Excel/CSV file and store its tables as a JSON file in *temp_dir*."""
    ext = os.path.splitext(sheet_path)[1].lower()
    if ext in (".xlsx", ".xls"):
        tables = parse_excel(sheet_path)
    else:
        # NOTE(review): any spreadsheet matched only by MIME type (not one of
        # the two Excel extensions) is parsed as CSV — confirm this is intended.
        tables = [parse_csv(sheet_path)]

    # Store parsed table data as JSON file for downstream use
    json_path = TEMP_FILE_SERVICE.create_temp_file_path(
        f"{uuid.uuid4()}.json", temp_dir
    )
    serialized = [table.model_dump() for table in tables]
    with open(json_path, "w", encoding="utf-8") as json_file:
        json.dump(serialized, json_file)

    return DecomposedFileInfo(
        name=os.path.basename(sheet_path),
        file_path=json_path,
        file_type="table",
        table_data=serialized,
    )


async def _decompose_image(img_path: str, temp_dir) -> DecomposedFileInfo:
    """Describe one image, preferring vision-extracted text when available.

    When text extraction succeeds and yields non-empty text, the text is saved
    to a file and returned as a ``"text"`` entry; otherwise the image itself is
    returned as an ``"image"`` entry with its metadata.
    """
    info = extract_images_metadata(img_path)

    # Best-effort: any failure here (including the lazy import) falls back to
    # returning the plain image entry.
    extracted_text = None
    try:
        from services.docling_service import extract_text_from_image_via_vision

        mime_type, _ = mimetypes.guess_type(img_path)
        with open(img_path, "rb") as f:
            image_bytes = f.read()
        extracted_text = await extract_text_from_image_via_vision(
            image_bytes, mime_type or "image/png"
        )
    except Exception as e:
        print(f"[decompose] Vision text extraction failed for {img_path}: {e}")

    if extracted_text:
        return DecomposedFileInfo(
            name=os.path.basename(img_path),
            file_path=_write_temp_text(extracted_text, temp_dir),
            file_type="text",
        )

    return DecomposedFileInfo(
        name=info.filename,
        file_path=img_path,
        file_type="image",
        image_info=info.model_dump(),
    )


@FILES_ROUTER.post("/decompose", response_model=List[DecomposedFileInfo])
async def decompose_files(file_paths: Annotated[List[str], Body(embed=True)]):
    """Convert the given files into text, table, or image descriptors.

    Each path is classified by name: ``.txt`` files are passed through as-is,
    spreadsheets are parsed to JSON tables, images go through vision-based text
    extraction, and everything else is handed to ``DocumentsLoader``.

    Args:
        file_paths: Paths of the files to decompose (request body key
            ``file_paths``).

    Returns:
        Entries grouped in this order: documents, plain text files,
        spreadsheets, images.
    """
    # NOTE(review): ``file_paths`` is client-supplied and read as-is; nothing
    # in this block restricts it to previously uploaded temp files.
    temp_dir = TEMP_FILE_SERVICE.create_temp_dir(str(uuid.uuid4()))

    txt_files: List[str] = []
    spreadsheet_files: List[str] = []
    image_files: List[str] = []
    other_files: List[str] = []

    for file_path in file_paths:
        # Case-insensitive, matching how spreadsheet and image names are checked.
        if file_path.lower().endswith(".txt"):
            txt_files.append(file_path)
        elif _is_spreadsheet(file_path):
            spreadsheet_files.append(file_path)
        elif _is_image(file_path):
            image_files.append(file_path)
        else:
            other_files.append(file_path)

    response: List[DecomposedFileInfo] = []

    # --- Document files (PDF, DOCX, PPTX) via DocumentsLoader ---
    if other_files:
        response.extend(await _decompose_documents(other_files, temp_dir))

    # --- Plain text files ---
    response.extend(
        DecomposedFileInfo(
            name=os.path.basename(each_file),
            file_path=each_file,
            file_type="text",
        )
        for each_file in txt_files
    )

    # --- Spreadsheet files (Excel, CSV) ---
    for sp_path in spreadsheet_files:
        response.append(_decompose_spreadsheet(sp_path, temp_dir))

    # --- Image files (with vision-based text extraction) ---
    for img_path in image_files:
        response.append(await _decompose_image(img_path, temp_dir))

    return response


class UrlParseRequest(BaseModel):
    """Request body for the ``/url`` route of ``FILES_ROUTER``."""

    # URL whose content should be fetched and extracted.
    url: str


class UrlParseResponse(BaseModel):
    """Response body for the ``/url`` route of ``FILES_ROUTER``."""

    # Text extracted from the page by ``parse_url``.
    content: str
    # The URL exactly as it was supplied in the request.
    url: str


@FILES_ROUTER.post("/url", response_model=UrlParseResponse)
async def parse_url_endpoint(body: UrlParseRequest):
    """Fetch the URL in the request body and return its extracted text.

    Raises:
        HTTPException: 400 when the URL is empty or whitespace-only; 422 when
            ``parse_url`` yields no content.
    """
    requested_url = body.url
    if not (requested_url and requested_url.strip()):
        raise HTTPException(status_code=400, detail="URL is required")

    extracted = await parse_url(requested_url)
    if extracted:
        return UrlParseResponse(content=extracted, url=requested_url)

    raise HTTPException(
        status_code=422, detail="Could not extract content from the provided URL"
    )


@FILES_ROUTER.post("/fetch-url")
async def fetch_url_content(
    url: str = Body(..., embed=True),
    _current_user: UserModel = Depends(get_current_user),
):
    """Fetch an http(s) URL for the current user and return its extracted text.

    Raises:
        HTTPException: 400 when *url* does not start with ``http://`` or
            ``https://``; 422 when ``parse_url`` yields no content.
    """
    has_http_scheme = url.startswith("http://") or url.startswith("https://")
    if not has_http_scheme:
        raise HTTPException(status_code=400, detail="Invalid URL")

    extracted = await parse_url(url)
    if extracted:
        return {"text": extracted, "url": url}

    raise HTTPException(
        status_code=422, detail="Could not extract content from URL"
    )


@FILES_ROUTER.post("/update")
async def update_files(
    file_path: Annotated[str, Body()],
    file: Annotated[UploadFile, File()],
):
    """Overwrite the file at *file_path* with the contents of the uploaded *file*.

    Args:
        file_path: Destination path, taken from the request body.
        file: Uploaded file whose bytes replace the destination's contents.

    Returns:
        A dict with a confirmation ``message``.
    """
    # NOTE(review): ``file_path`` is client-supplied and written to as-is, so
    # a caller can overwrite any file this process may write. Consider
    # confining it to the temp directories this service creates.

    # Read the upload first: opening the destination in "wb" mode truncates
    # it, so reading afterwards would leave it empty if the read failed.
    payload = await file.read()
    with open(file_path, "wb") as f:
        f.write(payload)

    return {"message": "File updated successfully"}