fix(tts): replace pydub MP3 export with lameenc (bundled LAME encoder, no system ffmpeg)

Gemini TTS _pcm_to_mp3 used pydub.AudioSegment.export(format='mp3'), which requires a system ffmpeg binary. Worker containers don't have ffmpeg installed (video ops run on Cloud Run). Switch to lameenc, a Python binding that ships the LAME encoder in its prebuilt wheels, so PCM→MP3 encoding no longer depends on any system binary. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-30 18:24:15 +01:00 · 2026-04-30 18:24:15 +01:00 · a53cf960ae
commit a53cf960ae
parent b0a90777ed
2 changed files with 11 additions and 19 deletions
--- a/backend/app/services/gemini_tts.py
+++ b/backend/app/services/gemini_tts.py
@ -1,5 +1,4 @@
 import io
-import wave

 from google import genai
 from google.genai import types
@ -354,25 +353,17 @@ class GeminiTTSService:
    def _pcm_to_mp3(self, pcm_data: bytes) -> bytes:
        """
        Convert raw PCM audio (24kHz, 16-bit, mono) to MP3.
-        Gemini TTS outputs PCM at 24000 Hz sample rate.
+        Uses lameenc (LAME bindings shipped as prebuilt wheels) — no system ffmpeg required.
        """
-        # Create WAV from PCM data
-        wav_buffer = io.BytesIO()
-        with wave.open(wav_buffer, "wb") as wf:
-            wf.setnchannels(1)  # Mono
-            wf.setsampwidth(2)  # 16-bit (2 bytes)
-            wf.setframerate(24000)  # 24kHz
-            wf.writeframes(pcm_data)
-
-        # Convert WAV to MP3 using pydub
-        wav_buffer.seek(0)
-        audio_segment = AudioSegment.from_wav(wav_buffer)
-
-        # Export as MP3
-        mp3_buffer = io.BytesIO()
-        audio_segment.export(mp3_buffer, format="mp3", bitrate="128k")
-
-        return mp3_buffer.getvalue()
+        import lameenc
+        encoder = lameenc.Encoder()
+        encoder.set_bit_rate(128)
+        encoder.set_in_sample_rate(24000)
+        encoder.set_channels(1)
+        encoder.set_quality(2)  # 2 = high quality
+        mp3_data = encoder.encode(pcm_data)
+        mp3_data += encoder.flush()
+        return mp3_data

    def _parse_ad_cues(self, vtt_content: str) -> list[dict]:
        """Parse audio description VTT and extract timing + text"""
--- a/backend/pyproject.toml
+++ b/backend/pyproject.toml
@ -36,6 +36,7 @@ prometheus-client = "^0.19.0"
 sentry-sdk = {extras = ["fastapi"], version = "^1.38.0"}
 ffmpeg-python = "^0.2.0"
 pydub = "^0.25.1"
+lameenc = "^1.7.0"
 faster-whisper = "^1.2.0"
 python-magic = "^0.4.27"
 aiohttp = "^3.12.15"