# amazon-transcreation/backend/app/pipeline/modules/date_format_validator.py

"""Validate date and percentage format strings against approved locale formats.

Checks that dates and percentages in transcreated text conform to the
locale-specific format rules defined in the date/percentage format file.
"""

import re
from dataclasses import dataclass


@dataclass
class FormatViolation:
    """One date or percentage string that failed format validation.

    Attributes:
        found: The offending substring exactly as it was matched in the text.
        expected_format: Comma-separated examples of approved formats.
        description: Human-readable message explaining the violation.
    """

    found: str
    expected_format: str
    description: str


def validate_date_formats(
    text: str,
    approved_formats: list[dict],
) -> list[FormatViolation]:
    """Validate date strings in text against approved formats.

    Date-like substrings are located with a fixed set of detection patterns
    (numeric with the year last, numeric with the year first, and English
    month-name forms). A detected date is accepted only when at least one
    approved pattern matches it *in full*, case-insensitively; otherwise a
    violation is recorded.

    Args:
        text: The text containing dates to validate.
        approved_formats: List of dicts with keys:
            - pattern: str (regex pattern for valid format)
            - example: str (example of the correct format)
            - description: str

    Returns:
        List of FormatViolation instances, grouped by detection pattern and
        in text order within each group. Empty when ``text`` or
        ``approved_formats`` is empty.

    Raises:
        re.error: If an approved pattern is not a valid regular expression
            and a date candidate is checked against it.
    """
    if not text or not approved_formats:
        return []

    # Common date-like patterns to detect
    date_patterns = [
        # DD/MM/YYYY, MM/DD/YYYY (also with '-' or '.' separators)
        r"\b\d{1,2}[/\-.]\d{1,2}[/\-.]\d{2,4}\b",
        # YYYY/MM/DD -- needs its own pattern: the one above caps the first
        # component at two digits, so a leading four-digit year never matches.
        r"\b\d{4}[/\-.]\d{1,2}[/\-.]\d{1,2}\b",
        # Month DD, YYYY
        r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{1,2},?\s+\d{4}\b",
        # DD Month YYYY
        r"\b\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{4}\b",
    ]

    # Loop-invariant: the approved regexes and the hint shown in violations.
    approved_patterns = [
        fmt["pattern"] for fmt in approved_formats if fmt.get("pattern")
    ]
    examples = [fmt["example"] for fmt in approved_formats if fmt.get("example")]
    expected_format = ", ".join(examples[:3])

    violations: list[FormatViolation] = []

    for date_pattern in date_patterns:
        for match in re.finditer(date_pattern, text, re.IGNORECASE):
            found_date = match.group()

            # fullmatch, not match: re.match only anchors at the start, so a
            # short approved pattern (e.g. a two-digit-year format) would
            # otherwise approve a longer date by matching just its prefix.
            is_valid = any(
                re.fullmatch(pattern, found_date, re.IGNORECASE)
                for pattern in approved_patterns
            )
            if is_valid:
                continue

            violations.append(
                FormatViolation(
                    found=found_date,
                    expected_format=expected_format,
                    description=f"Date format '{found_date}' does not match approved formats",
                )
            )

    return violations


def validate_percentage_formats(
    text: str,
    approved_formats: list[dict],
) -> list[FormatViolation]:
    """Validate percentage strings in text against approved formats.

    Percentages written as a number followed by ``%`` or the full-width
    ``％`` (optionally separated by whitespace) are located in the text. A
    detected percentage is accepted only when at least one approved pattern
    matches it *in full*; otherwise a violation is recorded.

    Args:
        text: The text containing percentages to validate.
        approved_formats: List of dicts with keys:
            - pattern: str (regex)
            - example: str
            - description: str

    Returns:
        List of FormatViolation instances in text order. Empty when ``text``
        or ``approved_formats`` is empty.

    Raises:
        re.error: If an approved pattern is not a valid regular expression
            and a percentage candidate is checked against it.
    """
    if not text or not approved_formats:
        return []

    # Find percentage-like patterns.
    # No trailing \b: the percent sign is a non-word character, so a word
    # boundary after it exists only when a letter/digit/underscore follows
    # immediately. Requiring one would skip "50%" before a space,
    # punctuation, or the end of the text -- i.e. nearly every real case.
    pct_pattern = r"\b\d+[\.,]?\d*\s*[%％]"

    # Loop-invariant: the approved regexes and the hint shown in violations.
    approved_patterns = [
        fmt["pattern"] for fmt in approved_formats if fmt.get("pattern")
    ]
    examples = [fmt["example"] for fmt in approved_formats if fmt.get("example")]
    expected_format = ", ".join(examples[:3])

    violations: list[FormatViolation] = []

    for match in re.finditer(pct_pattern, text):
        found_pct = match.group()

        # The whole detected string must conform, not just a prefix of it.
        is_valid = any(
            re.fullmatch(pattern, found_pct) for pattern in approved_patterns
        )
        if is_valid:
            continue

        violations.append(
            FormatViolation(
                found=found_pct,
                expected_format=expected_format,
                description=f"Percentage format '{found_pct}' does not match approved formats",
            )
        )

    return violations