contract-query/backend/app/utils/file_utils.py
2025-08-14 15:03:33 -05:00

87 lines
No EOL
2.4 KiB
Python

import os
import mimetypes
from pathlib import Path
from typing import Dict, Any, Optional
from fastapi import UploadFile
def validate_file(file: UploadFile) -> Dict[str, Any]:
    """Validate an uploaded file by its name and return basic file info.

    The MIME type is inferred from the filename only; the file content is
    not inspected. The inferred type must be one of the supported document
    types listed below.

    Args:
        file: The uploaded file. Only ``filename`` and ``size`` are read.

    Returns:
        A dict with the keys ``filename``, ``extension`` (lower-cased suffix
        including the leading dot, or ``''`` when there is none),
        ``mime_type`` and ``size``.

    Raises:
        ValueError: If the filename is missing or empty, if no MIME type can
            be inferred from it, or if the inferred type is not supported.
    """
    if not file.filename:
        raise ValueError("No filename provided")

    allowed_mime_types = {
        'application/pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/msword',
        'text/plain',
        'text/csv',
        'application/json',
        'text/html',
        'text/markdown',
        'application/rtf',
    }

    extension = Path(file.filename).suffix.lower()

    # strict=False also consults the table of non-standard but commonly used
    # types. Depending on the Python version and platform, some extensions
    # the allow-list explicitly supports (e.g. ``.rtf``) may only be known
    # there, and a strict lookup would report them as unknown.
    mime_type, _ = mimetypes.guess_type(file.filename, strict=False)

    if mime_type is None:
        # Distinguish "unknown type" from "known but unsupported type" so the
        # caller does not see the confusing "MIME type None not supported".
        raise ValueError(
            f"Could not determine MIME type for file {file.filename!r}"
        )
    if mime_type not in allowed_mime_types:
        raise ValueError(f"MIME type {mime_type} not supported")

    return {
        'filename': file.filename,
        'extension': extension,
        'mime_type': mime_type,
        'size': file.size,
    }
def get_file_info(file_path: Path) -> Dict[str, Any]:
    """Collect basic metadata about a file on disk.

    Args:
        file_path: Location of the file. A ``Path`` is expected; a plain
            string path is accepted as well.

    Returns:
        A dict with the keys ``filename``, ``extension`` (lower-cased suffix
        including the leading dot), ``mime_type`` (guessed from the name, may
        be ``None``), ``size`` in bytes, ``created_at`` and ``modified_at``
        (both as POSIX timestamps).

    Raises:
        FileNotFoundError: If the path does not exist.
    """
    path = Path(file_path)

    # Stat once and translate the failure instead of exists() + stat(): this
    # avoids a second syscall and the window in which the file could vanish
    # between the check and the stat.
    try:
        stat = path.stat()
    except (FileNotFoundError, NotADirectoryError) as err:
        raise FileNotFoundError(f"File {file_path} not found") from err

    mime_type, _ = mimetypes.guess_type(str(path))

    return {
        'filename': path.name,
        'extension': path.suffix.lower(),
        'mime_type': mime_type,
        'size': stat.st_size,
        # On Unix st_ctime is the time of the last metadata change, not the
        # creation time; prefer the real creation time where the platform
        # exposes it and fall back to st_ctime otherwise.
        'created_at': getattr(stat, 'st_birthtime', stat.st_ctime),
        'modified_at': stat.st_mtime,
    }
def ensure_directory(directory: Path) -> None:
    """Create ``directory`` (and any missing parents) if it does not exist.

    Calling this for a directory that already exists is a no-op.

    Args:
        directory: Directory to create. A ``Path`` is expected; a plain
            string path is accepted as well.

    Raises:
        FileExistsError: If the path exists but is not a directory.
    """
    # Coerce so that callers holding a plain string path do not fail with
    # AttributeError on ``.mkdir``.
    Path(directory).mkdir(parents=True, exist_ok=True)
# Maps every character that is unsafe in a filename to '_': the characters
# reserved on common filesystems plus the ASCII control characters (NUL,
# newlines, DEL, ...), which can break path handling further down the line.
_UNSAFE_FILENAME_TABLE = str.maketrans({
    char: '_'
    for char in '<>:"/\\|?*' + ''.join(map(chr, range(32))) + '\x7f'
})


def clean_filename(filename: str) -> str:
    """Return a filesystem-safe version of ``filename``.

    Reserved characters (``<>:"/\\|?*``) and ASCII control characters are
    replaced with ``_``; leading and trailing dots and spaces are removed.

    Args:
        filename: The raw, possibly user-supplied, file name.

    Returns:
        The cleaned name, or ``"unnamed_file"`` when nothing is left after
        cleaning (e.g. for ``""`` or ``".."``).
    """
    # Single pass over the string instead of one replace() per character.
    cleaned = filename.translate(_UNSAFE_FILENAME_TABLE)

    # Stripping the dots also collapses names such as "." and ".." to empty.
    cleaned = cleaned.strip('. ')

    return cleaned or "unnamed_file"
def get_upload_path(index_id: str, filename: str, base_dir: str) -> Path:
    """Build the destination path for an uploaded file.

    The file is placed at ``<base_dir>/<index_id>/<cleaned filename>``. The
    index directory is created if it does not exist yet.

    Args:
        index_id: Name of the index; used as the sub-directory of
            ``base_dir``.
        filename: Original file name; it is passed through
            ``clean_filename`` before being used.
        base_dir: Root directory for uploads.

    Returns:
        The full path where the file should be stored.

    Raises:
        ValueError: If ``index_id`` would place the index directory outside
            ``base_dir`` (e.g. it contains ``..`` segments or is absolute).
    """
    base_path = Path(base_dir)
    index_path = base_path / index_id

    # Refuse to leave the upload root. The comparison is purely lexical
    # (abspath does not follow symlinks), so a deployment that symlinks index
    # directories elsewhere keeps working. This runs before any directory is
    # created so a rejected id leaves no trace on disk.
    base_abs = os.path.abspath(base_path)
    index_abs = os.path.abspath(index_path)
    try:
        inside_base = os.path.commonpath([base_abs, index_abs]) == base_abs
    except ValueError:
        # Raised e.g. for paths on different drives on Windows.
        inside_base = False
    if not inside_base:
        raise ValueError(
            f"Invalid index_id {index_id!r}: resolves outside of {base_dir!r}"
        )

    ensure_directory(index_path)
    return index_path / clean_filename(filename)