From a53cf960aebc1bb324de27a388956072b57a64d0 Mon Sep 17 00:00:00 2001 From: Vadym Samoilenko Date: Thu, 30 Apr 2026 18:24:15 +0100 Subject: [PATCH] fix(tts): replace pydub MP3 export with lameenc (bundled LAME, no system ffmpeg) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gemini TTS _pcm_to_mp3 used pydub.AudioSegment.export(format='mp3') which requires a system ffmpeg binary. Worker containers don't have ffmpeg installed (video ops run on Cloud Run). Switch to lameenc, a compiled binding to the LAME encoder that ships as prebuilt wheels with LAME bundled, so it encodes PCM→MP3 without any system binary. Co-Authored-By: Claude Sonnet 4.6 --- backend/app/services/gemini_tts.py | 29 ++++++++++------------------- backend/pyproject.toml | 1 + 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/backend/app/services/gemini_tts.py b/backend/app/services/gemini_tts.py index c559866..f14adfd 100644 --- a/backend/app/services/gemini_tts.py +++ b/backend/app/services/gemini_tts.py @@ -1,5 +1,4 @@ import io -import wave from google import genai from google.genai import types @@ -354,25 +353,17 @@ class GeminiTTSService: def _pcm_to_mp3(self, pcm_data: bytes) -> bytes: """ Convert raw PCM audio (24kHz, 16-bit, mono) to MP3. - Gemini TTS outputs PCM at 24000 Hz sample rate. + Uses lameenc (bundled LAME binding) — no system ffmpeg required.
""" - # Create WAV from PCM data - wav_buffer = io.BytesIO() - with wave.open(wav_buffer, "wb") as wf: - wf.setnchannels(1) # Mono - wf.setsampwidth(2) # 16-bit (2 bytes) - wf.setframerate(24000) # 24kHz - wf.writeframes(pcm_data) - - # Convert WAV to MP3 using pydub - wav_buffer.seek(0) - audio_segment = AudioSegment.from_wav(wav_buffer) - - # Export as MP3 - mp3_buffer = io.BytesIO() - audio_segment.export(mp3_buffer, format="mp3", bitrate="128k") - - return mp3_buffer.getvalue() + import lameenc + encoder = lameenc.Encoder() + encoder.set_bit_rate(128) + encoder.set_in_sample_rate(24000) + encoder.set_channels(1) + encoder.set_quality(2) # 2 = high quality + mp3_data = encoder.encode(pcm_data) + mp3_data += encoder.flush() + return mp3_data def _parse_ad_cues(self, vtt_content: str) -> list[dict]: """Parse audio description VTT and extract timing + text""" diff --git a/backend/pyproject.toml b/backend/pyproject.toml index 490bf76..d472f91 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -36,6 +36,7 @@ prometheus-client = "^0.19.0" sentry-sdk = {extras = ["fastapi"], version = "^1.38.0"} ffmpeg-python = "^0.2.0" pydub = "^0.25.1" +lameenc = "^1.7.0" faster-whisper = "^1.2.0" python-magic = "^0.4.27" aiohttp = "^3.12.15"