amazon-transcreation/backend/app/pipeline/modules/date_format_validator.py
DJP 98fa16bfc3 feat: complete Phase 1-2 scaffold — backend, frontend, pipeline skeleton
Full-stack Amazon AI Transcreation Platform with:
- FastAPI backend (async, PostgreSQL, Redis, Celery) with 11 DB tables
- JWT auth (SSO-ready abstract provider pattern)
- 6-agent pipeline orchestrator with deterministic modules
- Next.js 14 frontend with Amazon branding (Ember fonts, orange/dark theme)
- Job wizard, monitoring HUD, output review, admin screens
- 154 TM/reference files imported, 12 locales configured
- Docker Compose for all services

Agents 2-5 (TM retrieval, ranker, transcreator, compliance) are stubs
pending Phase 3 LLM integration.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-10 12:31:43 -04:00

121 lines
3.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Validate date and percentage format strings against approved locale formats.
Checks that dates and percentages in transcreated text conform to the
locale-specific format rules defined in the date/percentage format file.
"""
import re
from dataclasses import dataclass
@dataclass
class FormatViolation:
    """A detected date/percentage format violation.

    Attributes:
        found: The offending substring exactly as it was matched in the text.
        expected_format: Comma-separated examples of approved formats; empty
            when none of the approved formats supplies an example.
        description: Human-readable explanation of the violation.
    """

    found: str
    expected_format: str
    description: str
def validate_date_formats(
    text: str,
    approved_formats: list[dict],
) -> list[FormatViolation]:
    """Validate date strings in text against approved formats.

    Date-like substrings are located with a fixed set of detection regexes
    (numeric day/month/year, numeric year-first, and English month-name
    forms). A candidate is accepted only if at least one approved pattern
    matches it in its entirety (case-insensitively); otherwise a violation
    is recorded.

    Args:
        text: The text containing dates to validate.
        approved_formats: List of dicts with keys:
            - pattern: str (regex pattern for valid format)
            - example: str (example of the correct format)
            - description: str

    Returns:
        List of FormatViolation instances, grouped by detection pattern and
        in text order within each group. Empty when ``text`` or
        ``approved_formats`` is empty.

    Raises:
        re.error: If an approved ``pattern`` that gets evaluated is not a
            valid regular expression.
    """
    if not text or not approved_formats:
        return []

    # Common date-like patterns to detect.
    date_patterns = (
        # DD/MM/YYYY, MM/DD/YYYY (also with "-" or "." separators)
        r"\b\d{1,2}[/\-.]\d{1,2}[/\-.]\d{2,4}\b",
        # YYYY/MM/DD -- needs its own detector: the pattern above cannot
        # start on a four-digit year.
        r"\b\d{4}[/\-.]\d{1,2}[/\-.]\d{1,2}\b",
        # Month DD, YYYY
        r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{1,2},?\s+\d{4}\b",
        # DD Month YYYY
        r"\b\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{4}\b",
    )

    # Entries without a (non-empty) pattern can never approve anything.
    approved_patterns = [
        fmt.get("pattern", "") for fmt in approved_formats if fmt.get("pattern")
    ]
    # The hint shown to the user is the same for every violation.
    examples = [
        fmt.get("example", "") for fmt in approved_formats if fmt.get("example")
    ]
    expected_format = ", ".join(examples[:3])

    violations: list[FormatViolation] = []
    for date_pattern in date_patterns:
        for match in re.finditer(date_pattern, text, re.IGNORECASE):
            found_date = match.group()
            # fullmatch, not match: a prefix-only match would let e.g. a
            # two-digit-year pattern approve a four-digit-year date.
            if any(
                re.fullmatch(pattern, found_date, re.IGNORECASE)
                for pattern in approved_patterns
            ):
                continue
            violations.append(
                FormatViolation(
                    found=found_date,
                    expected_format=expected_format,
                    description=f"Date format '{found_date}' does not match approved formats",
                )
            )
    return violations
def validate_percentage_formats(
    text: str,
    approved_formats: list[dict],
) -> list[FormatViolation]:
    """Validate percentage strings in text against approved formats.

    Every number followed (optionally after whitespace) by ``%`` is treated
    as a percentage candidate. A candidate is accepted only if at least one
    approved pattern matches it in its entirety; otherwise a violation is
    recorded.

    Args:
        text: The text containing percentages to validate.
        approved_formats: List of dicts with keys:
            - pattern: str (regex)
            - example: str
            - description: str

    Returns:
        List of FormatViolation instances in text order. Empty when ``text``
        or ``approved_formats`` is empty.

    Raises:
        re.error: If an approved ``pattern`` that gets evaluated is not a
            valid regular expression.
    """
    if not text or not approved_formats:
        return []

    # Find percentage-like patterns: digits, optional "."/"," separated
    # groups (so "1,000.50%" is captured whole), optional whitespace, "%".
    # There is deliberately no trailing \b: "%" is a non-word character, so
    # a word boundary after it exists only when a letter, digit or underscore
    # follows immediately -- "50%" before a space, punctuation or the end of
    # the text would never be detected.
    pct_pattern = r"\b\d+(?:[.,]\d+)*[.,]?\s*%"

    # Entries without a (non-empty) pattern can never approve anything.
    approved_patterns = [
        fmt.get("pattern", "") for fmt in approved_formats if fmt.get("pattern")
    ]
    # The hint shown to the user is the same for every violation.
    examples = [
        fmt.get("example", "") for fmt in approved_formats if fmt.get("example")
    ]
    expected_format = ", ".join(examples[:3])

    violations: list[FormatViolation] = []
    for match in re.finditer(pct_pattern, text):
        found_pct = match.group()
        # fullmatch, not match: the whole candidate must conform, not just
        # a leading portion of it.
        if any(re.fullmatch(pattern, found_pct) for pattern in approved_patterns):
            continue
        violations.append(
            FormatViolation(
                found=found_pct,
                expected_format=expected_format,
                description=f"Percentage format '{found_pct}' does not match approved formats",
            )
        )
    return violations