"""Helpers for validating uploaded files and building their storage paths."""

import mimetypes
import os
from pathlib import Path
from typing import Any, Dict, Optional

from fastapi import UploadFile

# MIME types accepted by validate_file(). Built once at import time so the
# membership test does not rebuild the set on every call.
_ALLOWED_MIME_TYPES = frozenset({
    'application/pdf',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    'application/msword',
    'text/plain',
    'text/csv',
    'application/json',
    'text/html',
    'text/markdown',
    'application/rtf',
})

# Characters that clean_filename() replaces with "_".
_INVALID_FILENAME_CHARS = '<>:"/\\|?*'
_FILENAME_TRANSLATION = str.maketrans(
    _INVALID_FILENAME_CHARS, '_' * len(_INVALID_FILENAME_CHARS)
)


def validate_file(file: UploadFile) -> Dict[str, Any]:
    """Validate an uploaded file and return basic information about it.

    The MIME type is guessed from the filename only (via ``mimetypes``);
    the file content is not inspected.

    Args:
        file: The uploaded file. Only ``filename`` and ``size`` are read.

    Returns:
        A dict with the keys ``filename``, ``extension`` (lower-cased suffix,
        including the leading dot), ``mime_type`` and ``size`` (whatever the
        upload object reports).

    Raises:
        ValueError: If the upload has no filename, or if the guessed MIME
            type (possibly ``None`` for unknown extensions) is not allowed.
    """
    if not file.filename:
        raise ValueError("No filename provided")

    extension = Path(file.filename).suffix.lower()
    mime_type, _ = mimetypes.guess_type(file.filename)

    if mime_type not in _ALLOWED_MIME_TYPES:
        raise ValueError(f"MIME type {mime_type} not supported")

    return {
        'filename': file.filename,
        'extension': extension,
        'mime_type': mime_type,
        'size': file.size,
    }


def get_file_info(file_path: Path) -> Dict[str, Any]:
    """Return metadata about a file on disk.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        A dict with the keys ``filename``, ``extension``, ``mime_type``
        (guessed from the name, ``None`` if unknown), ``size`` in bytes,
        ``created_at`` and ``modified_at`` (both POSIX timestamps).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    if not file_path.exists():
        raise FileNotFoundError(f"File {file_path} not found")

    stat = file_path.stat()
    mime_type, _ = mimetypes.guess_type(str(file_path))

    return {
        'filename': file_path.name,
        'extension': file_path.suffix.lower(),
        'mime_type': mime_type,
        'size': stat.st_size,
        # NOTE: st_ctime is platform-dependent; on Unix it is the time of the
        # last metadata change, not the creation time.
        'created_at': stat.st_ctime,
        'modified_at': stat.st_mtime,
    }


def ensure_directory(directory: Path) -> None:
    """Create ``directory`` (and any missing parents) if it does not exist."""
    directory.mkdir(parents=True, exist_ok=True)


def clean_filename(filename: str) -> str:
    """Return a filesystem-safe version of ``filename``.

    Each of the characters ``<>:"/\\|?*`` is replaced with ``_``; leading and
    trailing dots, spaces and newlines are then removed. If nothing is left,
    ``"unnamed_file"`` is returned.
    """
    # One translate() pass replaces all problematic characters at once.
    cleaned = filename.translate(_FILENAME_TRANSLATION).strip('. \n')
    return cleaned or "unnamed_file"


def get_upload_path(index_id: str, filename: str, base_dir: str) -> Path:
    """Build the storage path for an uploaded file, creating its directory.

    The file is placed at ``<base_dir>/<index_id>/<cleaned filename>``.

    Args:
        index_id: Name of the sub-directory under ``base_dir``.
        filename: Original filename; sanitised with ``clean_filename``.
        base_dir: Root directory for uploads.

    Returns:
        The full path the file should be written to.

    Raises:
        ValueError: If ``index_id`` would place the directory outside
            ``base_dir`` (for example ``"../x"`` or an absolute path).
    """
    base_path = Path(base_dir)
    index_path = base_path / index_id

    # Reject index_id values that escape base_dir *before* creating anything.
    # The comparison is lexical (no symlink resolution).
    base_abs = os.path.abspath(base_path)
    index_abs = os.path.abspath(index_path)
    try:
        inside_base = os.path.commonpath([base_abs, index_abs]) == base_abs
    except ValueError:
        # commonpath() raises when the paths are on different drives.
        inside_base = False
    if not inside_base:
        raise ValueError(
            f"index_id {index_id!r} resolves outside base directory {base_dir!r}"
        )

    ensure_directory(index_path)
    return index_path / clean_filename(filename)