- Fix UV index strategy: mark PyTorch CPU index as explicit with name
- Add --index-strategy unsafe-best-match to Dockerfile uv pip install
- Fix redis version constraint (>=5.0,<6) for ARQ compatibility
- Fix Anthropic model name (claude-sonnet-4-5-20250929)
- Fix IMAGE_PROVIDER enum value (gemini_flash, not google)
- Resolve middlewares.py vs middlewares/ package conflict
- Fix worker import paths (models.sql.presentation, models.sql.slide, utils split)
- Fix seed script FK resolution by importing all related models
- Fix test suite: async fixture scoping, greenlet dep, regex patterns, fixture params
- Fix frontend TypeScript error (Boolean cast for layout.react_code)
- Regenerate package-lock.json with i18n packages
- Add initial Alembic migration (autogenerated from all models)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
142 lines
3.9 KiB
Python
142 lines
3.9 KiB
Python
"""Tests for content classification regex patterns.

Patterns are duplicated here to avoid importing the full service module,
which has heavy transitive dependencies (google.genai, etc.).
These regexes must stay in sync with services/content_intelligence_service.py.
"""
|
|
import re
|
|
|
|
import pytest
|
|
|
|
# --- Duplicated from services/content_intelligence_service.py ---
|
|
|
|
# Numeric "metric" detector (verbose, case-insensitive). Two top-level branches:
#   1. a bare figure: a currency-prefixed number with an optional K/M/B/T/%
#      suffix, a percentage, or a number followed by a K/M/B/T magnitude letter;
#   2. a business keyword (grew, revenue, roi, ...) followed within 30
#      characters by a number with optional currency prefix and suffix.
# The pattern text must contain only these alternations: any extra empty
# alternative would make the regex match the empty string in every input.
_METRIC_RE = re.compile(
    r"""
    (?:
        [\$€£¥]\s?\d[\d,.]*[KMBTkmbt%]? |
        \d[\d,.]*\s?% |
        \d[\d,.]*\s?[KMBTkmbt]\b
    )
    |
    (?:
        (?:grew|growth|increased?|decreased?|rose|fell|dropped|declined|revenue|profit|margin|roi|cagr|arpu)
        .{0,30}?
        [\$€£¥]?\d[\d,.]*[KMBTkmbt%]?
    )
    """,
    re.IGNORECASE | re.VERBOSE,
)
|
|
|
|
# Quoted passage: an opening straight or curly double quote, 15-300 characters
# matched lazily (DOTALL lets the quote span newlines), a closing quote, and
# optionally a dash-introduced attribution of 2-60 characters.
_QUOTE_RE = re.compile(
    r'["\u201c\u201d].{15,300}?["\u201c\u201d]'
    r"(?:\s*[-\u2014\u2013]\s*.{2,60})?",
    re.DOTALL,
)
|
|
|
|
# Markdown table row: a line that both starts and ends with a pipe character,
# with at least one character in between (MULTILINE anchors per line).
_TABLE_RE = re.compile(
    r"^\|.+\|$",
    flags=re.MULTILINE,
)
|
|
|
|
# Date-like tokens (case-insensitive): a four-digit year starting with 19 or
# 20, a quarter label Q1-Q4, or a month name (three-letter prefix plus any
# further word characters) followed by whitespace and four digits.
_TIMELINE_RE = re.compile(
    r"(?:(?:19|20)\d{2}|Q[1-4]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s+\d{4})",
    re.IGNORECASE,
)
|
|
|
|
# Comparison phrasing (case-insensitive, on word boundaries): "vs"/"vs.",
# "versus", "compare(d) to", "in contrast", "on the other hand", "whereas",
# or "alternatively".
_COMPARISON_RE = re.compile(
    r"\b(?:vs\.?|versus|compared?\s+to|in\s+contrast|on\s+the\s+other\s+hand|whereas|alternatively)\b",
    re.IGNORECASE,
)
|
|
|
|
# Bullet item: optional leading whitespace, one of "-", "*" or "•", at least
# one whitespace character, then the item text (MULTILINE anchors per line).
_LIST_RE = re.compile(
    r"^[\s]*[-*•]\s+.+",
    flags=re.MULTILINE,
)
|
|
|
|
# Image references (case-insensitive): a markdown image opener "![", a phrase
# such as "see figure/image/diagram/chart/photo", the phrase "attached image",
# or a common image file extension (.png, .jpg, .jpeg, .gif, .webp, .svg).
_IMAGE_REF_RE = re.compile(
    r"(?:!\[|see\s+(?:figure|image|diagram|chart|photo)|attached\s+image|\.(?:png|jpg|jpeg|gif|webp|svg)\b)",
    re.IGNORECASE,
)
|
|
|
|
|
|
class TestMetricRegex:
    """Positive and negative samples for ``_METRIC_RE``."""

    @pytest.mark.parametrize(
        "text",
        [
            "$2.3M revenue",
            "45% growth",
            "1,200K units",
            "revenue grew 45%",
            "profit increased by $2M",
            "ROI of 340%",
            "CAGR 12%",
        ],
    )
    def test_detects_metrics(self, text):
        """Every sample carrying a figure or keyword-plus-number must match."""
        found = _METRIC_RE.search(text)
        assert found is not None, f"Failed to detect metric: {text}"

    @pytest.mark.parametrize(
        "text",
        [
            "The cat sat on the mat",
            "We had a meeting yesterday",
        ],
    )
    def test_rejects_non_metrics(self, text):
        """Plain sentences without any number must not match."""
        found = _METRIC_RE.search(text)
        assert found is None
|
|
|
|
|
|
class TestQuoteRegex:
    """Samples for ``_QUOTE_RE``: straight quotes, curly quotes, too-short quotes."""

    def test_detects_quoted_text(self):
        """A long straight-quoted sentence with an em-dash attribution matches."""
        sample = '"Innovation is the ability to see change as an opportunity" — John Doe'
        assert _QUOTE_RE.search(sample) is not None

    def test_detects_smart_quotes(self):
        """Curly opening/closing quotes are accepted as delimiters."""
        sample = '\u201cThis is a quoted statement\u201d'
        assert _QUOTE_RE.search(sample) is not None

    def test_rejects_short_quotes(self):
        """A quote shorter than the pattern's minimum length does not match."""
        sample = '"Hi"'
        assert _QUOTE_RE.search(sample) is None
|
|
|
|
|
|
class TestTableRegex:
    """Samples for ``_TABLE_RE``: pipe-delimited rows versus ordinary text."""

    def test_detects_markdown_table(self):
        """A three-row markdown table contains at least one matching row."""
        table = "| Name | Value |\n| --- | --- |\n| A | 1 |"
        assert _TABLE_RE.search(table) is not None

    def test_rejects_non_table(self):
        """Text without pipe-delimited lines does not match."""
        prose = "This is just normal text"
        assert _TABLE_RE.search(prose) is None
|
|
|
|
|
|
class TestTimelineRegex:
    """Positive samples for ``_TIMELINE_RE``: a year, a quarter, a month plus year."""

    @pytest.mark.parametrize(
        "text",
        [
            "In 2023, we launched the product",
            "Q1 results were strong",
            "January 2024 earnings",
        ],
    )
    def test_detects_timeline(self, text):
        """Each sample contains a date-like token that must be found."""
        found = _TIMELINE_RE.search(text)
        assert found is not None
|
|
|
|
|
|
class TestComparisonRegex:
    """Positive samples for ``_COMPARISON_RE`` covering several comparison phrases."""

    @pytest.mark.parametrize(
        "text",
        [
            "Plan A vs. Plan B",
            "compared to last year",
            "in contrast to competitors",
            "on the other hand, they chose",
        ],
    )
    def test_detects_comparison(self, text):
        """Each sample contains a comparison phrase that must be found."""
        found = _COMPARISON_RE.search(text)
        assert found is not None
|
|
|
|
|
|
class TestListRegex:
    """Samples for ``_LIST_RE``: dash bullets and asterisk bullets."""

    def test_detects_bullet_list(self):
        """Three dash-prefixed lines yield exactly three matches."""
        bullets = "- Item one\n- Item two\n- Item three"
        assert len(_LIST_RE.findall(bullets)) == 3

    def test_detects_asterisk_list(self):
        """Asterisk-prefixed lines are recognised as list items."""
        bullets = "* First\n* Second"
        assert _LIST_RE.search(bullets) is not None
|
|
|
|
|
|
class TestImageRefRegex:
    """Positive samples for ``_IMAGE_REF_RE``, one per alternative of the pattern."""

    @pytest.mark.parametrize(
        "text",
        [
            "See figure 1 below",
            "see diagram for details",
            "image.png",
            # Markdown image syntax. The URL deliberately has no image file
            # extension so that only the "![" alternative can match. An empty
            # string here could never match and would fail unconditionally.
            "![architecture](https://example.com/assets/overview)",
            "attached image shows",
        ],
    )
    def test_detects_image_references(self, text):
        """Each sample references an image in a form the pattern must detect."""
        assert _IMAGE_REF_RE.search(text)
|