- Replaced all instances of the placeholder image path "/static/images/placeholder.jpg" with "/static/images/replaceable_template_image.png".
- Added a new Nginx location block for serving app data with a long cache expiration.
- Enhanced the image generation service to return the new template image when generation fails.
- Updated various services and endpoints to ensure consistent handling of asset paths, including resolving backend asset URLs.
- Removed Electron-specific checks from several components to streamline API calls and improve compatibility with web deployments.
- Improved error handling and logging in the PDF export process.
- Adjusted Next.js configuration for API routing to ensure proper asset serving in Docker environments.
126 lines
4.3 KiB
Python
126 lines
4.3 KiB
Python
"""
|
||
Map presentation UI language strings (LanguageType enum values from Next.js) to
|
||
Tesseract / LiteParse OCR language codes (ISO 639-3 where applicable).
|
||
|
||
Keep keys in sync with:
|
||
servers/nextjs/app/(presentation-generator)/upload/type.ts → LanguageType
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import re
|
||
from typing import Optional
|
||
|
||
# Lookup table: UI label -> OCR language code.
# Every key has to be spelled exactly like the corresponding `LanguageType`
# string literal in the upload UI, otherwise the exact-match lookup misses.
# NOTE(review): some codes here look like plain ISO 639-3 identifiers; whether
# matching traineddata is installed for each of them is not visible from this
# module — TODO confirm against the deployed OCR bundle.
PRESENTATION_LANGUAGE_TO_TESSERACT: dict[str, str] = {
    "English": "eng",
    "Spanish (Español)": "spa",
    "French (Français)": "fra",
    "German (Deutsch)": "deu",
    "Portuguese (Português)": "por",
    "Italian (Italiano)": "ita",
    "Dutch (Nederlands)": "nld",
    "Russian (Русский)": "rus",
    "Chinese (Simplified - 中文, 汉语)": "chi_sim",
    "Chinese (Traditional - 中文, 漢語)": "chi_tra",
    "Japanese (日本語)": "jpn",
    "Korean (한국어)": "kor",
    "Arabic (العربية)": "ara",
    "Hindi (हिन्दी)": "hin",
    "Bengali (বাংলা)": "ben",
    "Polish (Polski)": "pol",
    "Czech (Čeština)": "ces",
    "Slovak (Slovenčina)": "slk",
    "Hungarian (Magyar)": "hun",
    "Romanian (Română)": "ron",
    "Bulgarian (Български)": "bul",
    "Greek (Ελληνικά)": "ell",
    "Serbian (Српски / Srpski)": "srp",
    "Croatian (Hrvatski)": "hrv",
    "Bosnian (Bosanski)": "bos",
    "Slovenian (Slovenščina)": "slv",
    "Finnish (Suomi)": "fin",
    "Swedish (Svenska)": "swe",
    "Danish (Dansk)": "dan",
    "Norwegian (Norsk)": "nor",
    "Icelandic (Íslenska)": "isl",
    "Lithuanian (Lietuvių)": "lit",
    "Latvian (Latviešu)": "lav",
    "Estonian (Eesti)": "est",
    "Maltese (Malti)": "mlt",
    "Welsh (Cymraeg)": "cym",
    "Irish (Gaeilge)": "gle",
    "Scottish Gaelic (Gàidhlig)": "gla",
    "Ukrainian (Українська)": "ukr",
    "Hebrew (עברית)": "heb",
    "Persian/Farsi (فارسی)": "fas",
    "Turkish (Türkçe)": "tur",
    "Kurdish (Kurdî / کوردی)": "kmr",
    "Pashto (پښتو)": "pus",
    "Dari (دری)": "prs",
    "Uzbek (Oʻzbek)": "uzb",
    "Kazakh (Қазақша)": "kaz",
    "Tajik (Тоҷикӣ)": "tgk",
    "Turkmen (Türkmençe)": "tuk",
    "Azerbaijani (Azərbaycan dili)": "aze",
    "Urdu (اردو)": "urd",
    "Tamil (தமிழ்)": "tam",
    "Telugu (తెలుగు)": "tel",
    "Marathi (मराठी)": "mar",
    "Punjabi (ਪੰਜਾਬੀ / پنجابی)": "pan",
    "Gujarati (ગુજરાતી)": "guj",
    "Malayalam (മലയാളം)": "mal",
    "Kannada (ಕನ್ನಡ)": "kan",
    "Odia (ଓଡ଼ିଆ)": "ori",
    "Sinhala (සිංහල)": "sin",
    "Nepali (नेपाली)": "nep",
    "Thai (ไทย)": "tha",
    "Vietnamese (Tiếng Việt)": "vie",
    "Lao (ລາວ)": "lao",
    "Khmer (ភាសាខ្មែរ)": "khm",
    "Burmese (မြန်မာစာ)": "mya",
    "Tagalog/Filipino (Tagalog/Filipino)": "tgl",
    "Javanese (Basa Jawa)": "jav",
    "Sundanese (Basa Sunda)": "sun",
    "Malay (Bahasa Melayu)": "msa",
    "Mongolian (Монгол)": "mon",
    "Swahili (Kiswahili)": "swa",
    "Hausa (Hausa)": "hau",
    "Yoruba (Yorùbá)": "yor",
    "Igbo (Igbo)": "ibo",
    "Amharic (አማርኛ)": "amh",
    "Zulu (isiZulu)": "zul",
    "Xhosa (isiXhosa)": "xho",
    "Shona (ChiShona)": "sna",
    "Somali (Soomaaliga)": "som",
    "Basque (Euskara)": "eus",
    "Catalan (Català)": "cat",
    "Galician (Galego)": "glg",
    "Quechua (Runasimi)": "que",
    "Nahuatl (Nāhuatl)": "nah",
    "Hawaiian (ʻŌlelo Hawaiʻi)": "haw",
    "Maori (Te Reo Māori)": "mri",
    # Default Tesseract bundles ship no dedicated Tahitian traineddata, so
    # this label deliberately falls back to English.
    "Tahitian (Reo Tahiti)": "eng",
    "Samoan (Gagana Samoa)": "smo",
}

# Same table keyed by the lower-cased label; backs the case-insensitive
# fallback lookup.
_LOWER_MAP = dict(
    (label.lower(), ocr_code)
    for label, ocr_code in PRESENTATION_LANGUAGE_TO_TESSERACT.items()
)

# Shape a language code must have before it is returned to callers: ASCII
# letters, digits, "_", "," and "+" only.
_OCR_CODE_RE = re.compile(r"^[a-zA-Z0-9_,+]+$")
|
||
|
||
|
||
def presentation_language_to_ocr_code(language: Optional[str]) -> str:
    """Translate a presentation UI language label into an OCR language code.

    Args:
        language: Label as sent by the upload UI (for example
            ``"German (Deutsch)"``). ``None`` is accepted; any other value is
            converted with ``str()`` and stripped of surrounding whitespace.

    Returns:
        The code mapped in ``PRESENTATION_LANGUAGE_TO_TESSERACT``. An exact
        label match wins; otherwise the lower-cased label is looked up in
        ``_LOWER_MAP``. ``"eng"`` is returned for ``None``, blank input,
        unknown labels, and any mapped value that does not fully match
        ``_OCR_CODE_RE``.
    """
    fallback = "eng"

    # Treat None exactly like an empty label so one guard covers both cases.
    label = "" if language is None else str(language).strip()
    if not label:
        return fallback

    try:
        candidate = PRESENTATION_LANGUAGE_TO_TESSERACT[label]
    except KeyError:
        # No exact hit: retry ignoring case, defaulting to English.
        candidate = _LOWER_MAP.get(label.lower(), fallback)

    # Final sanity check so only well-formed codes ever reach the caller.
    return candidate if _OCR_CODE_RE.fullmatch(candidate) else fallback
|