"""Validate date and percentage format strings against approved locale formats. Checks that dates and percentages in transcreated text conform to the locale-specific format rules defined in the date/percentage format file. """ import re from dataclasses import dataclass @dataclass class FormatViolation: """A detected date/percentage format violation.""" found: str expected_format: str description: str def validate_date_formats( text: str, approved_formats: list[dict], ) -> list[FormatViolation]: """Validate date strings in text against approved formats. Args: text: The text containing dates to validate. approved_formats: List of dicts with keys: - pattern: str (regex pattern for valid format) - example: str (example of the correct format) - description: str Returns: List of FormatViolation instances. """ if not text or not approved_formats: return [] violations: list[FormatViolation] = [] # Common date-like patterns to detect date_patterns = [ # DD/MM/YYYY, MM/DD/YYYY, YYYY/MM/DD r"\b\d{1,2}[/\-.]\d{1,2}[/\-.]\d{2,4}\b", # Month DD, YYYY r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{1,2},?\s+\d{4}\b", # DD Month YYYY r"\b\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{4}\b", ] for date_pattern in date_patterns: for match in re.finditer(date_pattern, text, re.IGNORECASE): found_date = match.group() is_valid = False for fmt in approved_formats: pattern = fmt.get("pattern", "") if pattern and re.match(pattern, found_date, re.IGNORECASE): is_valid = True break if not is_valid and approved_formats: examples = [ fmt.get("example", "") for fmt in approved_formats if fmt.get("example") ] violations.append( FormatViolation( found=found_date, expected_format=", ".join(examples[:3]), description=f"Date format '{found_date}' does not match approved formats", ) ) return violations def validate_percentage_formats( text: str, approved_formats: list[dict], ) -> list[FormatViolation]: """Validate percentage strings in text against approved formats. Args: text: The text containing percentages to validate. approved_formats: List of dicts with keys: - pattern: str (regex) - example: str - description: str Returns: List of FormatViolation instances. """ if not text or not approved_formats: return [] violations: list[FormatViolation] = [] # Find percentage-like patterns pct_pattern = r"\b\d+[\.,]?\d*\s*[%%]\b" for match in re.finditer(pct_pattern, text): found_pct = match.group() is_valid = False for fmt in approved_formats: pattern = fmt.get("pattern", "") if pattern and re.match(pattern, found_pct): is_valid = True break if not is_valid and approved_formats: examples = [ fmt.get("example", "") for fmt in approved_formats if fmt.get("example") ] violations.append( FormatViolation( found=found_pct, expected_format=", ".join(examples[:3]), description=f"Percentage format '{found_pct}' does not match approved formats", ) ) return violations