"""Tests for content classification regex patterns. Patterns are duplicated here to avoid importing the full service module, which has heavy transitive dependencies (google.genai, etc.). These regexes must stay in sync with services/content_intelligence_service.py. """ import re import pytest # --- Duplicated from services/content_intelligence_service.py --- _METRIC_RE = re.compile( r""" (?: [\$€£¥]\s?\d[\d,.]*[KMBTkmbt%]? | \d[\d,.]*\s?% | \d[\d,.]*\s?[KMBTkmbt]\b ) | (?: (?:grew|growth|increased?|decreased?|rose|fell|dropped|declined|revenue|profit|margin|roi|cagr|arpu) .{0,30}? [\$€£¥]?\d[\d,.]*[KMBTkmbt%]? ) """, re.IGNORECASE | re.VERBOSE, ) _QUOTE_RE = re.compile( r'["\u201c\u201d].{15,300}?["\u201c\u201d]' r"(?:\s*[-\u2014\u2013]\s*.{2,60})?", re.DOTALL, ) _TABLE_RE = re.compile(r"^\|.+\|$", re.MULTILINE) _TIMELINE_RE = re.compile( r"(?:(?:19|20)\d{2}|Q[1-4]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s+\d{4})", re.IGNORECASE, ) _COMPARISON_RE = re.compile( r"\b(?:vs\.?|versus|compared?\s+to|in\s+contrast|on\s+the\s+other\s+hand|whereas|alternatively)\b", re.IGNORECASE, ) _LIST_RE = re.compile(r"^[\s]*[-*•]\s+.+", re.MULTILINE) _IMAGE_REF_RE = re.compile( r"(?:!\[|see\s+(?:figure|image|diagram|chart|photo)|attached\s+image|\.(?:png|jpg|jpeg|gif|webp|svg)\b)", re.IGNORECASE, ) class TestMetricRegex: @pytest.mark.parametrize("text", [ "$2.3M revenue", "45% growth", "1,200K units", "revenue grew 45%", "profit increased by $2M", "ROI of 340%", "CAGR 12%", ]) def test_detects_metrics(self, text): assert _METRIC_RE.search(text), f"Failed to detect metric: {text}" @pytest.mark.parametrize("text", [ "The cat sat on the mat", "We had a meeting yesterday", ]) def test_rejects_non_metrics(self, text): assert not _METRIC_RE.search(text) class TestQuoteRegex: def test_detects_quoted_text(self): text = '"Innovation is the ability to see change as an opportunity" — John Doe' assert _QUOTE_RE.search(text) def test_detects_smart_quotes(self): text = '\u201cThis is a quoted statement\u201d' assert _QUOTE_RE.search(text) def test_rejects_short_quotes(self): text = '"Hi"' assert not _QUOTE_RE.search(text) class TestTableRegex: def test_detects_markdown_table(self): text = "| Name | Value |\n| --- | --- |\n| A | 1 |" assert _TABLE_RE.search(text) def test_rejects_non_table(self): text = "This is just normal text" assert not _TABLE_RE.search(text) class TestTimelineRegex: @pytest.mark.parametrize("text", [ "In 2023, we launched the product", "Q1 results were strong", "January 2024 earnings", ]) def test_detects_timeline(self, text): assert _TIMELINE_RE.search(text) class TestComparisonRegex: @pytest.mark.parametrize("text", [ "Plan A vs. Plan B", "compared to last year", "in contrast to competitors", "on the other hand, they chose", ]) def test_detects_comparison(self, text): assert _COMPARISON_RE.search(text) class TestListRegex: def test_detects_bullet_list(self): text = "- Item one\n- Item two\n- Item three" matches = _LIST_RE.findall(text) assert len(matches) == 3 def test_detects_asterisk_list(self): text = "* First\n* Second" assert _LIST_RE.search(text) class TestImageRefRegex: @pytest.mark.parametrize("text", [ "See figure 1 below", "see diagram for details", "image.png", "![alt text](photo.jpg)", "attached image shows", ]) def test_detects_image_references(self, text): assert _IMAGE_REF_RE.search(text)