presenton/servers/fastapi/utils/ocr_language.py
sudipnext 3d06644914 feat: update placeholder image references and improve asset handling
- Replaced all instances of the placeholder image path from "/static/images/placeholder.jpg" to "/static/images/replaceable_template_image.png".
- Added a new Nginx location block for serving app data with a long cache expiration.
- Enhanced the image generation service to return the new template image when generation fails.
- Updated various services and endpoints to ensure consistent handling of asset paths, including resolving backend asset URLs.
- Removed Electron-specific checks from several components to streamline API calls and improve compatibility with web deployments.
- Improved error handling and logging in the PDF export process.
- Adjusted Next.js configuration for API routing to ensure proper asset serving in Docker environments.
2026-04-18 20:56:37 +05:45

126 lines
4.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Map presentation UI language strings (LanguageType enum values from Next.js) to
Tesseract / LiteParse OCR language codes (ISO 639-3 where applicable).
Keep keys in sync with:
servers/nextjs/app/(presentation-generator)/upload/type.ts → LanguageType
"""
from __future__ import annotations

import re
import unicodedata
from typing import Optional
# UI language label -> OCR language code.
# Keys are the `LanguageType` string values used by the upload UI and must match
# those literals exactly, including the native-script part in parentheses; the
# resolver in this module looks labels up by these strings.
# NOTE(review): not every code below is guaranteed to have traineddata installed
# in a given Tesseract bundle (the Tahitian entry already falls back to "eng" for
# that reason) — confirm the remaining codes against the deployment image.
PRESENTATION_LANGUAGE_TO_TESSERACT: dict[str, str] = {
    "English": "eng",
    "Spanish (Español)": "spa",
    "French (Français)": "fra",
    "German (Deutsch)": "deu",
    "Portuguese (Português)": "por",
    "Italian (Italiano)": "ita",
    "Dutch (Nederlands)": "nld",
    "Russian (Русский)": "rus",
    "Chinese (Simplified - 中文, 汉语)": "chi_sim",
    "Chinese (Traditional - 中文, 漢語)": "chi_tra",
    "Japanese (日本語)": "jpn",
    "Korean (한국어)": "kor",
    "Arabic (العربية)": "ara",
    "Hindi (हिन्दी)": "hin",
    "Bengali (বাংলা)": "ben",
    "Polish (Polski)": "pol",
    "Czech (Čeština)": "ces",
    "Slovak (Slovenčina)": "slk",
    "Hungarian (Magyar)": "hun",
    "Romanian (Română)": "ron",
    "Bulgarian (Български)": "bul",
    "Greek (Ελληνικά)": "ell",
    "Serbian (Српски / Srpski)": "srp",
    "Croatian (Hrvatski)": "hrv",
    "Bosnian (Bosanski)": "bos",
    "Slovenian (Slovenščina)": "slv",
    "Finnish (Suomi)": "fin",
    "Swedish (Svenska)": "swe",
    "Danish (Dansk)": "dan",
    "Norwegian (Norsk)": "nor",
    "Icelandic (Íslenska)": "isl",
    "Lithuanian (Lietuvių)": "lit",
    "Latvian (Latviešu)": "lav",
    "Estonian (Eesti)": "est",
    "Maltese (Malti)": "mlt",
    "Welsh (Cymraeg)": "cym",
    "Irish (Gaeilge)": "gle",
    "Scottish Gaelic (Gàidhlig)": "gla",
    "Ukrainian (Українська)": "ukr",
    "Hebrew (עברית)": "heb",
    "Persian/Farsi (فارسی)": "fas",
    "Turkish (Türkçe)": "tur",
    "Kurdish (Kurdî / کوردی)": "kmr",
    "Pashto (پښتو)": "pus",
    "Dari (دری)": "prs",
    "Uzbek (Oʻzbek)": "uzb",
    "Kazakh (Қазақша)": "kaz",
    "Tajik (Тоҷикӣ)": "tgk",
    "Turkmen (Türkmençe)": "tuk",
    "Azerbaijani (Azərbaycan dili)": "aze",
    "Urdu (اردو)": "urd",
    "Tamil (தமிழ்)": "tam",
    "Telugu (తెలుగు)": "tel",
    "Marathi (मराठी)": "mar",
    "Punjabi (ਪੰਜਾਬੀ / پنجابی)": "pan",
    "Gujarati (ગુજરાતી)": "guj",
    "Malayalam (മലയാളം)": "mal",
    "Kannada (ಕನ್ನಡ)": "kan",
    "Odia (ଓଡ଼ିଆ)": "ori",
    "Sinhala (සිංහල)": "sin",
    "Nepali (नेपाली)": "nep",
    "Thai (ไทย)": "tha",
    "Vietnamese (Tiếng Việt)": "vie",
    "Lao (ລາວ)": "lao",
    "Khmer (ភាសាខ្មែរ)": "khm",
    "Burmese (မြန်မာစာ)": "mya",
    "Tagalog/Filipino (Tagalog/Filipino)": "tgl",
    "Javanese (Basa Jawa)": "jav",
    "Sundanese (Basa Sunda)": "sun",
    "Malay (Bahasa Melayu)": "msa",
    "Mongolian (Монгол)": "mon",
    "Swahili (Kiswahili)": "swa",
    "Hausa (Hausa)": "hau",
    "Yoruba (Yorùbá)": "yor",
    "Igbo (Igbo)": "ibo",
    "Amharic (አማርኛ)": "amh",
    "Zulu (isiZulu)": "zul",
    "Xhosa (isiXhosa)": "xho",
    "Shona (ChiShona)": "sna",
    "Somali (Soomaaliga)": "som",
    "Basque (Euskara)": "eus",
    "Catalan (Català)": "cat",
    "Galician (Galego)": "glg",
    "Quechua (Runasimi)": "que",
    "Nahuatl (Nāhuatl)": "nah",
    "Hawaiian (ʻŌlelo Hawaiʻi)": "haw",
    "Maori (Te Reo Māori)": "mri",
    # No dedicated Tahitian traineddata in default Tesseract bundles, so this
    # label deliberately maps to English.
    "Tahitian (Reo Tahiti)": "eng",
    "Samoan (Gagana Samoa)": "smo",
}
def _normalize_label(label: str) -> str:
    """Return a canonical form of *label* for tolerant lookups.

    Applies Unicode NFKC normalization and case folding, then collapses runs of
    whitespace to single spaces, so that labels differing only in composed vs.
    decomposed accents, compatibility characters (e.g. a non-breaking space),
    letter case, or spacing compare equal.
    """
    folded = unicodedata.normalize("NFKC", label).casefold()
    # Case folding can produce text that is no longer normalized, so normalize
    # once more before collapsing whitespace.
    return " ".join(unicodedata.normalize("NFKC", folded).split())


# Lower-cased labels -> code, for case-insensitive matching.
_LOWER_MAP = {k.lower(): v for k, v in PRESENTATION_LANGUAGE_TO_TESSERACT.items()}
# Fully normalized labels -> code; last-resort tier for labels that arrive in a
# different Unicode normalization form or with irregular whitespace.
_NORMALIZED_MAP = {
    _normalize_label(k): v for k, v in PRESENTATION_LANGUAGE_TO_TESSERACT.items()
}
# Shape a resolved code must have: letters, digits, "_", "," and "+" only.
_OCR_CODE_RE = re.compile(r"^[a-zA-Z0-9_,+]+$")


def presentation_language_to_ocr_code(language: Optional[str]) -> str:
    """Resolve a UI language label to a Tesseract language code.

    Lookup is attempted in three tiers, stopping at the first hit:

    1. exact match against ``PRESENTATION_LANGUAGE_TO_TESSERACT``;
    2. case-insensitive match (``str.lower``);
    3. match after Unicode normalization, case folding and whitespace
       collapsing (see ``_normalize_label``).

    Args:
        language: The label to resolve. ``None`` is accepted; any other value
            is converted with ``str()`` and stripped of surrounding whitespace.

    Returns:
        The mapped code, or ``"eng"`` when *language* is ``None``, blank,
        unknown, or maps to a value that does not look like an OCR code.
    """
    if language is None:
        return "eng"
    s = str(language).strip()
    if not s:
        return "eng"

    code = PRESENTATION_LANGUAGE_TO_TESSERACT.get(s)
    if code is None:
        code = _LOWER_MAP.get(s.lower())
    if code is None:
        # Without this tier, a label sent in decomposed (NFD) form or with a
        # non-breaking space would silently fall back to English.
        code = _NORMALIZED_MAP.get(_normalize_label(s), "eng")

    # Defensive guard: never hand a malformed code to the OCR backend, even if
    # the mapping table is edited incorrectly.
    if not _OCR_CODE_RE.fullmatch(code):
        return "eng"
    return code