# amazon-transcreation/backend/app/pipeline/modules/line_break_normaliser.py

"""Line break normalisation utilities.

Three modes:
- normalise_for_query: Replace line breaks with spaces, collapse whitespace
  runs to a single space, and trim the ends.
  Used when building search queries against TM.
- normalise_for_excel: Convert \\r\\n and \\r to \\n, the in-cell line break
  openpyxl uses. Used when writing output cells.
- preserve_raw: Return text as-is (identity function for pipeline clarity).
"""

import re


def normalise_for_query(text: str) -> str:
    """Flatten text onto one line with single-space separation.

    Intended for building TM lookup queries, where line breaks and
    irregular spacing must not affect matching.

    Args:
        text: Raw text that may contain \\r\\n, \\r or \\n line breaks.
            Any falsy value (empty string, None) is accepted.

    Returns:
        The text with every line break turned into a space, each run of
        whitespace (spaces, tabs, ...) reduced to one space, and leading
        and trailing whitespace removed. Falsy input yields "".
    """
    if text:
        flattened = text
        # \r\n must be handled first so it becomes one space, not two.
        for line_break in ("\r\n", "\r", "\n"):
            flattened = flattened.replace(line_break, " ")

        # \s+ matches any whitespace run, not only literal spaces.
        return re.sub(r"\s+", " ", flattened).strip()

    return ""


def normalise_for_excel(text: str) -> str:
    """Standardise every line break in the text to \\n.

    openpyxl uses \\n for in-cell line breaks when wrap_text is enabled,
    so Windows (\\r\\n) and bare carriage-return (\\r) breaks are rewritten
    to that single form. All other characters are left untouched.

    Args:
        text: Text that may mix \\r\\n, \\r and \\n line breaks. Any falsy
            value (empty string, None) is accepted.

    Returns:
        The text with each \\r\\n or lone \\r replaced by \\n. Falsy input
        yields "".
    """
    if not text:
        return ""

    # A carriage return, optionally followed by a line feed, is one break.
    return re.sub(r"\r\n?", "\n", text)


def preserve_raw(text: str) -> str:
    """Pass the text through without any normalisation.

    Exists so a pipeline step can state explicitly that line breaks and
    whitespace are deliberately left untouched.

    Args:
        text: Any text.

    Returns:
        The very same object that was passed in.
    """
    untouched = text
    return untouched