"""Metadata importer for external files (CSV, Excel, JSON)."""

import json
from pathlib import Path
from typing import Dict, List, Optional

import pandas as pd

if __package__:
    from .utils import get_logger
else:
    # Loaded as a top-level module (stand-alone script, ad-hoc test run):
    # a relative import cannot resolve without a parent package, so use the
    # standard library logger factory instead.
    from logging import getLogger as get_logger

logger = get_logger(__name__)


class MetadataImporter:
    """Import metadata from various file formats (CSV, Excel, JSON).

    Every ``import_*`` method returns a dictionary keyed by the lower-cased
    filename stem (``"Report.PDF"`` -> ``"report"``). Each value is a dict
    with the string keys ``title``, ``subject`` and ``keywords``.
    """

    # Accepted field/column names, in priority order.
    _FILENAME_KEYS = ('filename', 'file', 'name', 'file_name', 'path')
    _TITLE_KEYS = ('title', 'heading', 'name', 'document_title')
    _SUBJECT_KEYS = ('subject', 'description', 'summary', 'desc',
                     'external_description', 'alt_text')
    _KEYWORD_KEYS = ('keywords', 'tags', 'categories', 'labels')

    # cp1252 must come before latin1: latin1 decodes *any* byte sequence, so
    # nothing listed after it can ever be tried.
    _CSV_ENCODINGS = ('utf-8', 'cp1252', 'latin1')

    @staticmethod
    def _stem_key(filename) -> str:
        """Return the lookup key for *filename*: its stripped, lower-cased stem."""
        return Path(str(filename).strip()).stem.lower()

    def import_from_csv(self, csv_path: str) -> Dict[str, Dict]:
        """
        Import metadata from CSV file.
        Expected columns: filename, title, subject/description, keywords

        The file is decoded as UTF-8 first, then with the fallbacks listed in
        ``_CSV_ENCODINGS``. Only decoding failures trigger a fallback; any
        other error (missing file, malformed CSV, missing filename column)
        is logged and re-raised unchanged.

        Args:
            csv_path: Path to CSV file

        Returns:
            Dictionary mapping filename stems to metadata dicts

        Raises:
            ValueError: If no supported encoding can decode the file, or the
                data has no filename column.
        """
        try:
            df = self._read_csv(csv_path)
            return self._parse_dataframe(df)
        except Exception as e:
            logger.error(f"Error importing from CSV: {e}")
            raise

    def _read_csv(self, csv_path: str) -> pd.DataFrame:
        """Read *csv_path* with the first encoding that decodes it cleanly."""
        for encoding in self._CSV_ENCODINGS:
            try:
                df = pd.read_csv(csv_path, encoding=encoding)
            except UnicodeDecodeError:
                # Wrong guess for this file; try the next encoding.
                continue

            if encoding == 'utf-8':
                logger.info(f"Loaded CSV with {len(df)} rows from {csv_path}")
            else:
                logger.info(f"Loaded CSV with {len(df)} rows using {encoding} encoding")
            return df

        raise ValueError("Could not read CSV file with any supported encoding")

    def import_from_excel(self, excel_path: str, sheet_name: Optional[str] = None) -> Dict[str, Dict]:
        """
        Import metadata from Excel file.

        Args:
            excel_path: Path to Excel file (.xlsx, .xls)
            sheet_name: Name of sheet to read (None = first sheet)

        Returns:
            Dictionary mapping filename stems to metadata dicts

        Raises:
            ValueError: If the sheet has no filename column.
        """
        try:
            if sheet_name:
                df = pd.read_excel(excel_path, sheet_name=sheet_name)
                logger.info(f"Loaded Excel sheet '{sheet_name}' with {len(df)} rows")
            else:
                # Do not forward sheet_name=None: pandas would then return a
                # dict of all sheets instead of a single DataFrame.
                df = pd.read_excel(excel_path)
                logger.info(f"Loaded Excel with {len(df)} rows from first sheet")

            return self._parse_dataframe(df)

        except Exception as e:
            logger.error(f"Error importing from Excel: {e}")
            raise

    def import_from_json(self, json_path: str) -> Dict[str, Dict]:
        """
        Import metadata from JSON file.

        Expected format:
        {
            "filename.pdf": {"title": "...", "subject": "...", "keywords": "..."},
            "image.jpg": {"title": "...", "subject": "...", "keywords": "..."}
        }

        Or array format:
        [
            {"filename": "file.pdf", "title": "...", "subject": "...", "keywords": "..."},
            {"filename": "image.jpg", "title": "...", "subject": "...", "keywords": "..."}
        ]

        Entries whose metadata is not a JSON object, and array items without
        a usable filename, are skipped.

        Args:
            json_path: Path to JSON file

        Returns:
            Dictionary mapping filename stems to metadata dicts

        Raises:
            ValueError: If the top-level JSON value is neither an object nor
                an array.
        """
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            metadata_map = {}

            if isinstance(data, dict):
                # Object format: {"filename": {metadata}}
                for filename, metadata in data.items():
                    if not isinstance(metadata, dict):
                        logger.warning(f"Skipping non-object metadata for: {filename}")
                        continue
                    stem = self._stem_key(filename)
                    if stem:
                        metadata_map[stem] = self._normalize_metadata(metadata)

            elif isinstance(data, list):
                # Array format: [{filename, metadata}]
                for item in data:
                    if not isinstance(item, dict):
                        continue

                    # First filename-like field that actually holds a value.
                    name_key = next((k for k in self._FILENAME_KEYS if item.get(k)), None)
                    stem = self._stem_key(item[name_key]) if name_key is not None else ''
                    if not stem:
                        logger.warning(f"Skipping item without filename: {item}")
                        continue

                    # Drop the filename field before normalizing so that a
                    # "name" used as the filename is not reused as the title.
                    fields = {k: v for k, v in item.items() if k != name_key}
                    metadata_map[stem] = self._normalize_metadata(fields)

            else:
                raise ValueError("JSON must be an object or array")

            logger.info(f"Loaded {len(metadata_map)} metadata records from JSON")
            return metadata_map

        except Exception as e:
            logger.error(f"Error importing from JSON: {e}")
            raise

    def _parse_dataframe(self, df: pd.DataFrame) -> Dict[str, Dict]:
        """
        Parse pandas DataFrame into metadata map.

        Rows whose filename cell is missing (NaN/None) or blank are skipped.

        Args:
            df: DataFrame with metadata

        Returns:
            Dictionary mapping filename stems to metadata dicts

        Raises:
            ValueError: If no filename column can be detected.
        """
        # Detect filename column (try common names)
        filename_col = self._detect_column(df, self._FILENAME_KEYS)

        if filename_col is None:
            raise ValueError("Could not find filename column in data. Tried: filename, file, name, file_name, path")

        # "name" is both a filename and a title candidate; never let the
        # column already chosen for filenames double as the title.
        title_col = self._detect_column(df, self._TITLE_KEYS, exclude=filename_col)
        subject_col = self._detect_column(df, self._SUBJECT_KEYS)
        keywords_col = self._detect_column(df, self._KEYWORD_KEYS)

        logger.info(f"Detected columns - filename: {filename_col}, title: {title_col}, subject: {subject_col}, keywords: {keywords_col}")

        metadata_map = {}
        for _, row in df.iterrows():
            raw_name = row.get(filename_col)
            # Test for missing values *before* str(): str(NaN) is the truthy
            # string 'nan', which would otherwise become a record of its own.
            if pd.isna(raw_name):
                continue

            filename_stem = self._stem_key(raw_name)
            if not filename_stem:
                continue

            metadata_map[filename_stem] = {
                'title': self._get_value(row, title_col),
                'subject': self._get_value(row, subject_col),
                'keywords': self._get_value(row, keywords_col)
            }

        logger.info(f"Parsed {len(metadata_map)} metadata records from DataFrame")
        return metadata_map

    def _detect_column(self, df: pd.DataFrame, candidates: List[str],
                       exclude: Optional[str] = None) -> Optional[str]:
        """
        Detect column name from a list of candidates.

        Matching ignores case and surrounding whitespace, and tolerates
        non-string column labels (e.g. integer headers).

        Args:
            df: DataFrame to search
            candidates: Possible column names, in priority order
            exclude: Actual column label that must not be returned

        Returns:
            Actual column label if found, None otherwise
        """
        col_map = {str(col).strip().lower(): col for col in df.columns}

        for candidate in candidates:
            wanted = str(candidate).strip().lower()
            if wanted not in col_map:
                continue
            actual = col_map[wanted]
            if exclude is not None and actual == exclude:
                continue
            return actual

        return None

    def _get_value(self, row: pd.Series, column: Optional[str]) -> str:
        """
        Get value from row, handling None column and NaN values.

        Args:
            row: DataFrame row
            column: Column name (can be None)

        Returns:
            Stripped string value, or empty string when the column is None
            or the cell is missing
        """
        if column is None:
            return ''

        value = row.get(column, '')
        return '' if pd.isna(value) else str(value).strip()

    def _normalize_metadata(self, metadata: Dict) -> Dict[str, str]:
        """
        Normalize metadata dictionary to standard format.

        For each output field the first candidate key holding a truthy value
        wins. A list of keywords is joined with ", ".

        Args:
            metadata: Raw metadata dict

        Returns:
            Normalized metadata with title, subject, keywords keys
        """
        normalized = {
            'title': '',
            'subject': '',
            'keywords': ''
        }
        if not isinstance(metadata, dict):
            return normalized

        def first_value(keys):
            return next((metadata[k] for k in keys if metadata.get(k)), None)

        title = first_value(self._TITLE_KEYS)
        if title is not None:
            normalized['title'] = str(title).strip()

        subject = first_value(self._SUBJECT_KEYS)
        if subject is not None:
            normalized['subject'] = str(subject).strip()

        keywords = first_value(self._KEYWORD_KEYS)
        if isinstance(keywords, list):
            normalized['keywords'] = ', '.join(str(v) for v in keywords)
        elif keywords is not None:
            normalized['keywords'] = str(keywords).strip()

        return normalized

    def get_metadata_for_file(self, metadata_map: Dict[str, Dict], filename: str) -> Optional[Dict[str, str]]:
        """
        Get metadata for a specific file from imported map.

        Args:
            metadata_map: Dictionary returned by import_* methods
            filename: Filename to look up (with or without extension)

        Returns:
            Metadata dict if found, None otherwise
        """
        return metadata_map.get(self._stem_key(filename))

    def validate_import(self, metadata_map: Dict[str, Dict]) -> Dict:
        """
        Validate imported metadata and return statistics.

        Args:
            metadata_map: Dictionary returned by import_* methods

        Returns:
            Counts of total records, records with a non-empty title /
            subject / keywords, and records where all three are empty
        """
        stats = {
            'total_records': len(metadata_map),
            'with_title': 0,
            'with_subject': 0,
            'with_keywords': 0,
            'empty_records': 0
        }

        for metadata in metadata_map.values():
            has_title = bool(metadata.get('title'))
            has_subject = bool(metadata.get('subject'))
            has_keywords = bool(metadata.get('keywords'))

            stats['with_title'] += has_title
            stats['with_subject'] += has_subject
            stats['with_keywords'] += has_keywords

            if not (has_title or has_subject or has_keywords):
                stats['empty_records'] += 1

        return stats
- @@ -392,6 +431,18 @@ ℹ️ Choose how to generate metadata
+ +
📁

Drop files here or click to browse

@@ -438,6 +489,7 @@