solventum-image-metadata/backend/app/processors/field_mapper.py
SamoilenkoVadym 563d476a94 feat(backend): migrate from Flask to FastAPI with Redis sessions
- Create FastAPI application with async I/O
- Implement Redis session storage (fixes session loss on restart)
- Add JWT authentication with refresh tokens
- Add Microsoft SSO support via MSAL
- Copy all processors from src/ (100% reused, no changes)
- Create file upload/download endpoints
- Create metadata update endpoints
- Create template CRUD endpoints
- Add SQLAlchemy async database models
- Add Docker Compose configuration with Redis

Solves critical issues:
- Session management: Redis replaces in-memory dicts
- Scalability: Async FastAPI + microservices architecture
- File handling: Persistent storage with auto-cleanup

Key files:
- backend/app/main.py - FastAPI entry point
- backend/app/core/redis_client.py - Session store
- backend/app/core/auth.py - JWT authentication
- backend/app/api/* - All REST endpoints
- backend/app/processors/ - Reused from src/

Co-Authored-By: Claude Sonnet 4.5 (1M context) <noreply@anthropic.com>
2026-02-09 13:14:37 +00:00

409 lines
13 KiB
Python

"""Field mapping with automatic detection and manual override."""
import json
from typing import Dict, List, Optional, Tuple
from difflib import SequenceMatcher
from pathlib import Path
from .utils import get_logger
# Module-level logger obtained from the package's get_logger helper;
# FieldMapper methods below log through it.
logger = get_logger(__name__)
class FieldMapper:
    """Map source fields to standard metadata fields with fuzzy matching.

    Source field names are normalized (trimmed, lower-cased, spaces and
    hyphens turned into underscores) and compared against the alias lists in
    ``FIELD_ALIASES``. Mappings can be validated, applied to data rows and
    stored as named presets in a JSON file.
    """

    # Standard metadata fields used in Oliver Metadata Tool
    STANDARD_FIELDS = ['title', 'subject', 'keywords', 'description']

    # Common aliases for fuzzy matching (case-insensitive).
    # NOTE: some aliases ('description', 'desc', 'summary', 'details',
    # 'about') are listed under both 'subject' and 'description'. On a tie
    # the field declared first wins, so those names resolve to 'subject'.
    FIELD_ALIASES = {
        'title': [
            'title', 'name', 'heading', 'filename', 'file_name', 'document_title',
            'asset_title', 'resource_title', 'object_name', 'label'
        ],
        'subject': [
            'subject', 'description', 'summary', 'abstract', 'alt_text',
            'external_description', 'caption', 'about', 'overview', 'details',
            'desc', 'long_description', 'content'
        ],
        'keywords': [
            'keywords', 'tags', 'categories', 'labels', 'subjects', 'topics',
            'taxonomy', 'classification', 'key_words', 'search_terms'
        ],
        'description': [
            'description', 'desc', 'summary', 'notes', 'comments', 'remarks',
            'details', 'about', 'information', 'info'
        ]
    }

    # Similarity threshold for fuzzy matching (0.0 to 1.0)
    SIMILARITY_THRESHOLD = 0.6

    # Threshold used by auto_map() when strict=True
    STRICT_THRESHOLD = 0.8

    # Minimum score granted when an alias contains the field name or vice versa
    SUBSTRING_SCORE = 0.85

    def __init__(self, presets_path: Optional[str] = None):
        """
        Initialize field mapper.

        Args:
            presets_path: Path to JSON file for saving/loading mapping presets.
                Defaults to 'field_mapping_presets.json'.
        """
        self.presets_path = presets_path or 'field_mapping_presets.json'

    def auto_map(self, source_fields: List[str], strict: bool = False) -> Dict[str, Tuple[str, float]]:
        """
        Automatically map source fields to standard fields using fuzzy matching.

        Args:
            source_fields: List of field names from source data
            strict: If True, only accept matches above high confidence threshold (0.8)

        Returns:
            Dictionary mapping {source_field: (target_field, confidence_score)}.
            Fields without a match at or above the threshold are omitted.
            Example: {'File Name': ('title', 1.0), 'Image Tags': ('keywords', 0.85)}
        """
        threshold = self.STRICT_THRESHOLD if strict else self.SIMILARITY_THRESHOLD
        mapping = {}
        for source_field in source_fields:
            best_match = self._find_best_match(source_field, threshold)
            if best_match is None:
                continue
            target_field, score = best_match
            mapping[source_field] = (target_field, score)
            logger.info(f"Auto-mapped '{source_field}' -> '{target_field}' (confidence: {score:.2f})")
        return mapping

    @staticmethod
    def _normalize(field_name: str) -> str:
        """Return the comparison form of a field name: trimmed, lower-case, snake-style."""
        return field_name.strip().lower().replace(' ', '_').replace('-', '_')

    def _score_field(self, normalized: str, aliases: List[str]) -> float:
        """
        Return the best similarity between a normalized name and a list of aliases.

        An exact alias match scores 1.0; a substring relationship in either
        direction scores at least SUBSTRING_SCORE; anything else scores the
        SequenceMatcher ratio.
        """
        best = 0.0
        for alias in aliases:
            if normalized == alias:
                # Nothing can beat an exact match, so stop early.
                return 1.0
            score = SequenceMatcher(None, normalized, alias).ratio()
            if alias in normalized or normalized in alias:
                score = max(score, self.SUBSTRING_SCORE)
            best = max(best, score)
        return best

    def _find_best_match(self, source_field: str, threshold: float = 0.6) -> Optional[Tuple[str, float]]:
        """
        Find best matching standard field for source field.

        Args:
            source_field: Source field name
            threshold: Minimum similarity score (0.0 to 1.0)

        Returns:
            Tuple of (target_field, confidence_score) or None. When several
            standard fields reach the same score, the one declared first in
            FIELD_ALIASES is returned.
        """
        normalized = self._normalize(source_field)
        if not normalized:
            # An empty string is a substring of every alias; without this
            # guard a blank column header would map to 'title' at 0.85.
            return None

        best_field = None
        best_score = 0.0
        for standard_field, aliases in self.FIELD_ALIASES.items():
            score = self._score_field(normalized, aliases)
            # Strict '>' keeps the earliest field on ties.
            if score >= threshold and score > best_score:
                best_field, best_score = standard_field, score

        if best_field is None:
            return None
        return (best_field, best_score)

    def validate_mapping(self, mapping: Dict[str, str]) -> Dict[str, List[str]]:
        """
        Validate a field mapping configuration.

        Args:
            mapping: Dictionary mapping {source_field: target_field}

        Returns:
            Dictionary with validation results:
            {
                'valid': [list of valid mappings],
                'invalid': [list of invalid mappings],
                'warnings': [list of warnings]
            }
        """
        result = {
            'valid': [],
            'invalid': [],
            'warnings': []
        }
        # Sources grouped by the (valid) target they map to
        target_usage = {}
        for source_field, target_field in mapping.items():
            if target_field not in self.STANDARD_FIELDS:
                result['invalid'].append(
                    f"'{target_field}' is not a valid target field (source: '{source_field}')"
                )
                continue
            result['valid'].append(f"'{source_field}' -> '{target_field}'")
            target_usage.setdefault(target_field, []).append(source_field)

        # Warn about multiple sources mapping to same target
        for target_field, sources in target_usage.items():
            if len(sources) > 1:
                result['warnings'].append(
                    f"Multiple source fields map to '{target_field}': {', '.join(sources)}"
                )
        return result

    def apply_mapping(self, data: Dict[str, str], mapping: Dict[str, str]) -> Dict[str, str]:
        """
        Apply field mapping to transform source data to standard format.

        Args:
            data: Source data dictionary
            mapping: Field mapping {source_field: target_field}

        Returns:
            Transformed data with every standard field present. Values of
            several sources mapped to the same target are joined with '; '.
            Missing, None and empty values are skipped so they never leave a
            dangling separator; other non-string values are converted with
            str().
        """
        result = {field: '' for field in self.STANDARD_FIELDS}
        for source_field, target_field in mapping.items():
            if source_field not in data or target_field not in self.STANDARD_FIELDS:
                continue
            value = data[source_field]
            if value is None:
                continue
            text = value if isinstance(value, str) else str(value)
            if not text:
                continue
            # Handle multiple values mapping to same target (concatenate)
            if result[target_field]:
                result[target_field] += f"; {text}"
            else:
                result[target_field] = text
        return result

    def save_preset(self, name: str, mapping: Dict[str, str], description: str = ""):
        """
        Save mapping preset to file.

        Args:
            name: Preset name; an existing preset with this name is replaced
            mapping: Field mapping dictionary
            description: Optional description

        Raises:
            Exception: Whatever serialization or file writing raised, after
                the failure has been logged.
        """
        presets = self._load_presets()
        presets[name] = {
            'mapping': mapping,
            'description': description,
            'created_at': self._get_timestamp()
        }
        try:
            self._write_presets(presets)
        except Exception as e:
            logger.error(f"Failed to save preset '{name}': {e}")
            raise
        logger.info(f"Saved mapping preset: {name}")

    def load_preset(self, name: str) -> Optional[Dict[str, str]]:
        """
        Load mapping preset from file.

        Args:
            name: Preset name

        Returns:
            Mapping dictionary or None if not found
        """
        presets = self._load_presets()
        if name in presets:
            logger.info(f"Loaded mapping preset: {name}")
            return presets[name].get('mapping', {})
        logger.warning(f"Preset not found: {name}")
        return None

    def list_presets(self) -> List[Dict[str, str]]:
        """
        List all saved presets.

        Returns:
            List of preset information dictionaries with the keys 'name',
            'description', 'created_at' and 'fields' (number of mapped fields)
        """
        presets = self._load_presets()
        return [
            {
                'name': name,
                'description': data.get('description', ''),
                'created_at': data.get('created_at', ''),
                'fields': len(data.get('mapping', {}))
            }
            for name, data in presets.items()
        ]

    def delete_preset(self, name: str) -> bool:
        """
        Delete a mapping preset.

        Args:
            name: Preset name

        Returns:
            True if deleted, False if not found

        Raises:
            Exception: Whatever file writing raised, after the failure has
                been logged.
        """
        presets = self._load_presets()
        if name not in presets:
            return False
        del presets[name]
        try:
            self._write_presets(presets)
        except Exception as e:
            logger.error(f"Failed to delete preset '{name}': {e}")
            raise
        logger.info(f"Deleted mapping preset: {name}")
        return True

    def suggest_mapping(self, source_fields: List[str]) -> Dict:
        """
        Generate mapping suggestions with confidence scores and alternatives.

        Args:
            source_fields: List of source field names

        Returns:
            Dictionary with suggestions:
            {
                'source_field': {
                    'best_match': 'target_field',
                    'confidence': 0.85,
                    'alternatives': [
                        {'field': 'other_target', 'confidence': 0.65},
                        ...
                    ]
                }
            }
            Fields without any candidate get best_match None, confidence 0.0
            and an empty alternatives list.
        """
        suggestions = {}
        for source_field in source_fields:
            matches = self._find_all_matches(source_field)
            if not matches:
                suggestions[source_field] = {
                    'best_match': None,
                    'confidence': 0.0,
                    'alternatives': []
                }
                continue
            best_field, best_score = matches[0]
            suggestions[source_field] = {
                'best_match': best_field,
                'confidence': best_score,
                'alternatives': [
                    {'field': field, 'confidence': score}
                    for field, score in matches[1:3]  # Top 2 alternatives
                ]
            }
        return suggestions

    def _find_all_matches(self, source_field: str, min_threshold: float = 0.4) -> List[Tuple[str, float]]:
        """
        Find all matching standard fields above threshold, sorted by score.

        Args:
            source_field: Source field name
            min_threshold: Minimum similarity score

        Returns:
            List of (target_field, score) tuples sorted by score descending;
            equal scores keep the FIELD_ALIASES declaration order.
        """
        normalized = self._normalize(source_field)
        if not normalized:
            # Same guard as _find_best_match: a blank name matches nothing.
            return []

        matches = []
        for standard_field, aliases in self.FIELD_ALIASES.items():
            score = self._score_field(normalized, aliases)
            if score >= min_threshold:
                matches.append((standard_field, score))
        # list.sort is stable, so ties stay in declaration order.
        matches.sort(key=lambda match: match[1], reverse=True)
        return matches

    def _load_presets(self) -> Dict:
        """
        Load all presets from file.

        Returns:
            The stored presets, or an empty dict when the file is missing,
            unreadable, not valid JSON, or does not contain a JSON object.
        """
        try:
            with open(self.presets_path, 'r', encoding='utf-8') as f:
                presets = json.load(f)
        except FileNotFoundError:
            # No presets saved yet; not an error.
            return {}
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            logger.error(f"Failed to load presets: {e}")
            return {}
        if not isinstance(presets, dict):
            # Callers index and iterate the result as a dict.
            logger.error(
                f"Failed to load presets: expected a JSON object, got {type(presets).__name__}"
            )
            return {}
        return presets

    def _write_presets(self, presets: Dict) -> None:
        """Serialize presets and write them to the presets file."""
        # Serialize before opening the file: opening in 'w' mode truncates it,
        # so a non-serializable preset must fail before that happens.
        payload = json.dumps(presets, indent=2)
        with open(self.presets_path, 'w', encoding='utf-8') as f:
            f.write(payload)

    def _get_timestamp(self) -> str:
        """Get current local timestamp as ISO format string."""
        from datetime import datetime
        return datetime.now().isoformat()

    def get_unmapped_fields(self, source_fields: List[str], mapping: Dict[str, str]) -> List[str]:
        """
        Get list of source fields that are not mapped.

        Args:
            source_fields: All source field names
            mapping: Current mapping dictionary

        Returns:
            List of unmapped source fields, in source order
        """
        return [field for field in source_fields if field not in mapping]

    def get_mapping_coverage(self, source_fields: List[str], mapping: Dict[str, str]) -> Dict:
        """
        Calculate mapping coverage statistics.

        Only mapping entries whose source appears in source_fields are
        counted, so mapped + unmapped always equals the total and the
        coverage never exceeds 100%.

        Args:
            source_fields: All source field names
            mapping: Current mapping dictionary

        Returns:
            Statistics dictionary with coverage info
        """
        total_fields = len(source_fields)
        unmapped = self.get_unmapped_fields(source_fields, mapping)
        mapped_fields = total_fields - len(unmapped)
        # Count unique target fields used by the fields that are present
        unique_targets = len({mapping[field] for field in source_fields if field in mapping})
        return {
            'total_source_fields': total_fields,
            'mapped_fields': mapped_fields,
            'unmapped_fields': len(unmapped),
            'coverage_percent': (mapped_fields / total_fields * 100) if total_fields > 0 else 0,
            'unique_targets_used': unique_targets,
            'unmapped_field_list': unmapped
        }