pahvalentines/backend/app/services/profanity.py
michael 854fc33246 fix(safety): use substring matching for banned words filter
Remove word-boundary anchors so banned words are caught even when
embedded in larger strings (e.g. "xBatmanx" now matches "Batman").

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-06 15:42:02 -06:00

67 lines
2 KiB
Python

"""Profanity detection service using better-profanity library and custom banned words list."""
import re
from pathlib import Path
from better_profanity import profanity
# Initialize the better-profanity filter once, at module import, so that every
# later call to profanity.contains_profanity() reuses the loaded censor words
# instead of loading them again per check.
profanity.load_censor_words()
# Load custom banned words list
_banned_words_path = Path(__file__).resolve().parent.parent.parent / "banned_words.txt"
_banned_words: frozenset[str] = frozenset()
_banned_pattern: re.Pattern | None = None
if _banned_words_path.exists():
_raw = _banned_words_path.read_text(encoding="utf-8")
_banned_words = frozenset(
line.lower()
for line in (l.strip() for l in _raw.splitlines())
if line
)
if _banned_words:
_escaped = sorted((re.escape(w) for w in _banned_words), key=len, reverse=True)
_banned_pattern = re.compile(
"(?:" + "|".join(_escaped) + ")",
re.IGNORECASE,
)
def contains_banned_word(text: str) -> bool:
    """Report whether any entry of the custom banned words list occurs in text.

    The check is a search with the module-level ``_banned_pattern``, so an
    entry is found even when it is embedded inside a longer string.

    Args:
        text: The text to scan.

    Returns:
        True if the pattern matches somewhere in ``text``. False when ``text``
        is empty or no banned-words pattern has been compiled.
    """
    # ``text`` is tested first so an empty input returns without touching
    # the pattern at all.
    if text and _banned_pattern is not None:
        return _banned_pattern.search(text) is not None
    return False
def contains_profanity(text: str) -> bool:
    """Decide whether text is flagged by either profanity check.

    The better-profanity library is consulted first; the custom banned words
    list is only consulted when the library does not flag the text.

    Args:
        text: The text to check for profanity.

    Returns:
        True if profanity is detected, False otherwise (including for empty
        text).
    """
    if text:
        library_hit = profanity.contains_profanity(text)
        return library_hit or contains_banned_word(text)
    return False
def check_fields(fields: dict[str, str | None]) -> dict[str, bool]:
"""Check multiple fields for profanity.
Args:
fields: Dictionary of field_name -> text to check.
None values are skipped.
Returns:
Dictionary of field_name -> has_profanity (bool).
Only includes fields that were checked (non-None values).
"""
results = {}
for field_name, text in fields.items():
if text is not None:
results[field_name] = contains_profanity(text)
return results