- Create FastAPI application with async I/O
- Implement Redis session storage (fixes session loss on restart)
- Add JWT authentication with refresh tokens
- Add Microsoft SSO support via MSAL
- Copy all processors from src/ (100% reused, no changes)
- Create file upload/download endpoints
- Create metadata update endpoints
- Create template CRUD endpoints
- Add SQLAlchemy async database models
- Add Docker Compose configuration with Redis

Solves critical issues:
- Session management: Redis replaces in-memory dicts
- Scalability: Async FastAPI + microservices architecture
- File handling: Persistent storage with auto-cleanup

Key files:
- backend/app/main.py - FastAPI entry point
- backend/app/core/redis_client.py - Session store
- backend/app/core/auth.py - JWT authentication
- backend/app/api/* - All REST endpoints
- backend/app/processors/ - Reused from src/

Co-Authored-By: Claude Sonnet 4.5 (1M context) <noreply@anthropic.com>
409 lines
13 KiB
Python
409 lines
13 KiB
Python
"""Field mapping with automatic detection and manual override."""
|
|
|
|
import json
|
|
from typing import Dict, List, Optional, Tuple
|
|
from difflib import SequenceMatcher
|
|
from pathlib import Path
|
|
from .utils import get_logger
|
|
|
|
# Module-level logger, created through the package's `get_logger` helper
# (imported from `.utils`) and keyed by this module's dotted name.
logger = get_logger(__name__)
|
|
|
|
|
|
class FieldMapper:
    """Map source fields to standard metadata fields with fuzzy matching.

    Source field names are normalized (trimmed, lower-cased, spaces and
    hyphens turned into underscores) and compared against the aliases in
    ``FIELD_ALIASES``. Mappings can be validated, applied to data, and
    stored as named presets in a JSON file.
    """

    # Standard metadata fields used in Oliver Metadata Tool
    STANDARD_FIELDS = ['title', 'subject', 'keywords', 'description']

    # Common aliases for fuzzy matching (case-insensitive).
    # Some aliases (e.g. 'desc', 'summary') appear under more than one
    # standard field; ties are resolved in favour of the field listed first.
    FIELD_ALIASES = {
        'title': [
            'title', 'name', 'heading', 'filename', 'file_name', 'document_title',
            'asset_title', 'resource_title', 'object_name', 'label'
        ],
        'subject': [
            'subject', 'description', 'summary', 'abstract', 'alt_text',
            'external_description', 'caption', 'about', 'overview', 'details',
            'desc', 'long_description', 'content'
        ],
        'keywords': [
            'keywords', 'tags', 'categories', 'labels', 'subjects', 'topics',
            'taxonomy', 'classification', 'key_words', 'search_terms'
        ],
        'description': [
            'description', 'desc', 'summary', 'notes', 'comments', 'remarks',
            'details', 'about', 'information', 'info'
        ]
    }

    # Similarity threshold for fuzzy matching (0.0 to 1.0)
    SIMILARITY_THRESHOLD = 0.6

    # Threshold applied by auto_map(strict=True)
    STRICT_SIMILARITY_THRESHOLD = 0.8

    # Minimum score granted when an alias contains, or is contained in,
    # the normalized source field name
    SUBSTRING_MATCH_SCORE = 0.85

    def __init__(self, presets_path: Optional[str] = None):
        """
        Initialize field mapper.

        Args:
            presets_path: Path to JSON file for saving/loading mapping presets.
                Falls back to 'field_mapping_presets.json' when omitted or empty.
        """
        self.presets_path = presets_path or 'field_mapping_presets.json'

    def auto_map(self, source_fields: List[str], strict: bool = False) -> Dict[str, Tuple[str, float]]:
        """
        Automatically map source fields to standard fields using fuzzy matching.

        Args:
            source_fields: List of field names from source data
            strict: If True, only accept matches above high confidence threshold (0.8)

        Returns:
            Dictionary mapping {source_field: (target_field, confidence_score)}
            Example: {'File Name': ('title', 1.0), 'Alt Text': ('subject', 1.0)}
            Source fields without a match at or above the threshold are omitted.
        """
        threshold = self.STRICT_SIMILARITY_THRESHOLD if strict else self.SIMILARITY_THRESHOLD
        mapping = {}

        for source_field in source_fields:
            best_match = self._find_best_match(source_field, threshold)
            if best_match is None:
                continue
            target_field, score = best_match
            mapping[source_field] = (target_field, score)
            logger.info(f"Auto-mapped '{source_field}' -> '{target_field}' (confidence: {score:.2f})")

        return mapping

    @staticmethod
    def _normalize(source_field: str) -> str:
        """Return the canonical form of a field name used for comparisons."""
        # Strip first so padded headers such as " Title " still match exactly.
        return source_field.strip().lower().replace(' ', '_').replace('-', '_')

    def _score_field(self, normalized: str, standard_field: str) -> float:
        """Return the best similarity of ``normalized`` against one field's aliases."""
        best = 0.0
        for alias in self.FIELD_ALIASES[standard_field]:
            if normalized == alias:
                # Exact alias match: nothing can score higher.
                return 1.0
            score = SequenceMatcher(None, normalized, alias).ratio()
            if alias in normalized or normalized in alias:
                score = max(score, self.SUBSTRING_MATCH_SCORE)
            best = max(best, score)
        return best

    def _rank_matches(self, source_field: str, threshold: float) -> List[Tuple[str, float]]:
        """Return (field, score) pairs scoring >= threshold, best first."""
        normalized = self._normalize(source_field)
        # A name with no letters or digits (empty, blank, "-") carries no
        # information; without this guard "" is a substring of every alias
        # and would be mapped with the substring bonus.
        if not any(ch.isalnum() for ch in normalized):
            return []

        matches = []
        for standard_field in self.FIELD_ALIASES:
            score = self._score_field(normalized, standard_field)
            if score >= threshold:
                matches.append((standard_field, score))

        # Highest score first. Among equal scores, a standard field whose own
        # name equals the source name wins (so 'description' maps to
        # 'description', not to 'subject' which merely lists it as an alias);
        # remaining ties keep FIELD_ALIASES order because the sort is stable.
        matches.sort(key=lambda match: (-match[1], match[0] != normalized))
        return matches

    def _find_best_match(self, source_field: str, threshold: float = 0.6) -> Optional[Tuple[str, float]]:
        """
        Find best matching standard field for source field.

        Args:
            source_field: Source field name
            threshold: Minimum similarity score (0.0 to 1.0)

        Returns:
            Tuple of (target_field, confidence_score) or None
        """
        matches = self._rank_matches(source_field, threshold)
        # A zero score is never a match, even with a threshold of 0.
        if matches and matches[0][1] > 0.0:
            return matches[0]
        return None

    def validate_mapping(self, mapping: Dict[str, str]) -> Dict[str, List[str]]:
        """
        Validate a field mapping configuration.

        Args:
            mapping: Dictionary mapping {source_field: target_field}

        Returns:
            Dictionary with validation results:
            {
                'valid': [list of valid mappings],
                'invalid': [list of invalid mappings],
                'warnings': [list of warnings]
            }
        """
        result = {
            'valid': [],
            'invalid': [],
            'warnings': []
        }

        # Sources grouped by the (valid) target they map to
        target_usage = {}

        for source_field, target_field in mapping.items():
            if target_field not in self.STANDARD_FIELDS:
                result['invalid'].append(
                    f"'{target_field}' is not a valid target field (source: '{source_field}')"
                )
                continue

            result['valid'].append(f"'{source_field}' -> '{target_field}'")
            target_usage.setdefault(target_field, []).append(source_field)

        # Warn about multiple sources mapping to same target
        for target_field, sources in target_usage.items():
            if len(sources) > 1:
                result['warnings'].append(
                    f"Multiple source fields map to '{target_field}': {', '.join(sources)}"
                )

        return result

    def apply_mapping(self, data: Dict[str, str], mapping: Dict[str, str]) -> Dict[str, str]:
        """
        Apply field mapping to transform source data to standard format.

        Args:
            data: Source data dictionary
            mapping: Field mapping {source_field: target_field}

        Returns:
            Transformed data with standard field names. Every standard field
            is present; fields without data are empty strings. When several
            sources map to the same target their non-empty values are joined
            with "; ".
        """
        result = {field: '' for field in self.STANDARD_FIELDS}

        for source_field, target_field in mapping.items():
            if target_field not in self.STANDARD_FIELDS or source_field not in data:
                continue

            value = data[source_field]
            if value is None:
                continue
            # Coerce to text so non-string cells cannot break concatenation.
            text = str(value)
            if not text:
                # Skip empty values so they do not leave a dangling "; ".
                continue

            if result[target_field]:
                result[target_field] = f"{result[target_field]}; {text}"
            else:
                result[target_field] = text

        return result

    def save_preset(self, name: str, mapping: Dict[str, str], description: str = ""):
        """
        Save mapping preset to file.

        Args:
            name: Preset name (an existing preset with this name is replaced)
            mapping: Field mapping dictionary
            description: Optional description

        Raises:
            Exception: Any serialization or I/O error is logged and re-raised.
        """
        presets = self._load_presets()

        presets[name] = {
            'mapping': mapping,
            'description': description,
            'created_at': self._get_timestamp()
        }

        try:
            self._write_presets(presets)
            logger.info(f"Saved mapping preset: {name}")
        except Exception as e:
            logger.error(f"Failed to save preset '{name}': {e}")
            raise

    def load_preset(self, name: str) -> Optional[Dict[str, str]]:
        """
        Load mapping preset from file.

        Args:
            name: Preset name

        Returns:
            Mapping dictionary or None if not found
        """
        presets = self._load_presets()

        if name in presets:
            logger.info(f"Loaded mapping preset: {name}")
            return presets[name].get('mapping', {})

        logger.warning(f"Preset not found: {name}")
        return None

    def list_presets(self) -> List[Dict[str, str]]:
        """
        List all saved presets.

        Returns:
            List of preset information dictionaries with the keys
            'name', 'description', 'created_at' and 'fields' (number of
            mapped fields in the preset).
        """
        presets = self._load_presets()

        return [
            {
                'name': name,
                'description': data.get('description', ''),
                'created_at': data.get('created_at', ''),
                'fields': len(data.get('mapping', {}))
            }
            for name, data in presets.items()
        ]

    def delete_preset(self, name: str) -> bool:
        """
        Delete a mapping preset.

        Args:
            name: Preset name

        Returns:
            True if deleted, False if not found

        Raises:
            Exception: Any serialization or I/O error is logged and re-raised.
        """
        presets = self._load_presets()

        if name not in presets:
            return False

        del presets[name]

        try:
            self._write_presets(presets)
            logger.info(f"Deleted mapping preset: {name}")
            return True
        except Exception as e:
            logger.error(f"Failed to delete preset '{name}': {e}")
            raise

    def suggest_mapping(self, source_fields: List[str]) -> Dict:
        """
        Generate mapping suggestions with confidence scores and alternatives.

        Args:
            source_fields: List of source field names

        Returns:
            Dictionary with suggestions:
            {
                'source_field': {
                    'best_match': 'target_field',
                    'confidence': 0.85,
                    'alternatives': [
                        {'field': 'other_target', 'confidence': 0.65},
                        ...
                    ]
                }
            }
            Fields without any candidate get best_match None, confidence 0.0
            and an empty alternatives list.
        """
        suggestions = {}

        for source_field in source_fields:
            matches = self._find_all_matches(source_field)

            if not matches:
                suggestions[source_field] = {
                    'best_match': None,
                    'confidence': 0.0,
                    'alternatives': []
                }
                continue

            best_field, best_score = matches[0]
            suggestions[source_field] = {
                'best_match': best_field,
                'confidence': best_score,
                'alternatives': [
                    {'field': field, 'confidence': score}
                    for field, score in matches[1:3]  # Top 2 alternatives
                ]
            }

        return suggestions

    def _find_all_matches(self, source_field: str, min_threshold: float = 0.4) -> List[Tuple[str, float]]:
        """
        Find all matching standard fields above threshold, sorted by score.

        Args:
            source_field: Source field name
            min_threshold: Minimum similarity score

        Returns:
            List of (target_field, score) tuples sorted by score descending
        """
        return self._rank_matches(source_field, min_threshold)

    def _load_presets(self) -> Dict:
        """Load all presets from file.

        Returns an empty dict when the file is missing, unreadable, or does
        not contain a JSON object.
        """
        try:
            with open(self.presets_path, 'r', encoding='utf-8') as f:
                presets = json.load(f)
        except FileNotFoundError:
            # No presets saved yet: not an error.
            return {}
        except (OSError, ValueError) as e:
            # ValueError covers both invalid JSON and undecodable bytes.
            logger.error(f"Failed to load presets: {e}")
            return {}

        if not isinstance(presets, dict):
            # Callers index presets by name; anything else is unusable.
            logger.error(f"Failed to load presets: expected a JSON object in '{self.presets_path}'")
            return {}
        return presets

    def _write_presets(self, presets: Dict) -> None:
        """Serialize ``presets`` and write them to ``self.presets_path``."""
        # Serialize before opening the file: opening with 'w' truncates it, so
        # a serialization error must not be able to wipe existing presets.
        payload = json.dumps(presets, indent=2)
        with open(self.presets_path, 'w', encoding='utf-8') as f:
            f.write(payload)

    def _get_timestamp(self) -> str:
        """Get current timestamp as ISO format string."""
        from datetime import datetime
        return datetime.now().isoformat()

    def get_unmapped_fields(self, source_fields: List[str], mapping: Dict[str, str]) -> List[str]:
        """
        Get list of source fields that are not mapped.

        Args:
            source_fields: All source field names
            mapping: Current mapping dictionary

        Returns:
            List of unmapped source fields
        """
        return [field for field in source_fields if field not in mapping]

    def get_mapping_coverage(self, source_fields: List[str], mapping: Dict[str, str]) -> Dict:
        """
        Calculate mapping coverage statistics.

        Args:
            source_fields: All source field names
            mapping: Current mapping dictionary

        Returns:
            Statistics dictionary with coverage info. 'mapped_fields' counts
            only source fields that have a mapping entry, so mapped plus
            unmapped always equals the total and coverage never exceeds 100.
            'unique_targets_used' counts distinct values across the whole
            mapping.
        """
        total_fields = len(source_fields)
        unmapped = self.get_unmapped_fields(source_fields, mapping)
        # Derive from the source list rather than len(mapping): the mapping
        # may contain entries for fields that are not in source_fields.
        mapped_fields = total_fields - len(unmapped)

        # Count unique target fields used
        unique_targets = len(set(mapping.values()))

        return {
            'total_source_fields': total_fields,
            'mapped_fields': mapped_fields,
            'unmapped_fields': len(unmapped),
            'coverage_percent': (mapped_fields / total_fields * 100) if total_fields > 0 else 0,
            'unique_targets_used': unique_targets,
            'unmapped_field_list': unmapped
        }
|