From a53cf960aebc1bb324de27a388956072b57a64d0 Mon Sep 17 00:00:00 2001
From: Vadym Samoilenko <vadymsamoilenko@oliver.agency>
Date: Thu, 30 Apr 2026 18:24:15 +0100
Subject: [PATCH] fix(tts): replace pydub MP3 export with lameenc (LAME binding,
 no system ffmpeg)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Gemini TTS _pcm_to_mp3 used pydub.AudioSegment.export(format='mp3'), which
requires a system ffmpeg binary. Worker containers don't have ffmpeg installed
(video ops run on Cloud Run). Switch to lameenc, a Python binding that bundles
the LAME encoder in its wheel and encodes PCM→MP3 without any system binary.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 backend/app/services/gemini_tts.py | 29 ++++++++++-------------------
 backend/pyproject.toml             |  1 +
 2 files changed, 11 insertions(+), 19 deletions(-)

diff --git a/backend/app/services/gemini_tts.py b/backend/app/services/gemini_tts.py
index c559866..f14adfd 100644
--- a/backend/app/services/gemini_tts.py
+++ b/backend/app/services/gemini_tts.py
@@ -1,5 +1,4 @@
 import io
-import wave
 
 from google import genai
 from google.genai import types
@@ -354,25 +353,17 @@ class GeminiTTSService:
     def _pcm_to_mp3(self, pcm_data: bytes) -> bytes:
         """
         Convert raw PCM audio (24kHz, 16-bit, mono) to MP3.
-        Gemini TTS outputs PCM at 24000 Hz sample rate.
+        Uses lameenc (bundled LAME encoder) — no system ffmpeg required.
         """
-        # Create WAV from PCM data
-        wav_buffer = io.BytesIO()
-        with wave.open(wav_buffer, "wb") as wf:
-            wf.setnchannels(1)  # Mono
-            wf.setsampwidth(2)  # 16-bit (2 bytes)
-            wf.setframerate(24000)  # 24kHz
-            wf.writeframes(pcm_data)
-
-        # Convert WAV to MP3 using pydub
-        wav_buffer.seek(0)
-        audio_segment = AudioSegment.from_wav(wav_buffer)
-
-        # Export as MP3
-        mp3_buffer = io.BytesIO()
-        audio_segment.export(mp3_buffer, format="mp3", bitrate="128k")
-
-        return mp3_buffer.getvalue()
+        import lameenc
+        encoder = lameenc.Encoder()
+        encoder.set_bit_rate(128)
+        encoder.set_in_sample_rate(24000)
+        encoder.set_channels(1)
+        encoder.set_quality(2)  # 2 = high quality
+        mp3_data = encoder.encode(pcm_data)
+        mp3_data += encoder.flush()
+        return mp3_data
 
     def _parse_ad_cues(self, vtt_content: str) -> list[dict]:
         """Parse audio description VTT and extract timing + text"""
diff --git a/backend/pyproject.toml b/backend/pyproject.toml
index 490bf76..d472f91 100644
--- a/backend/pyproject.toml
+++ b/backend/pyproject.toml
@@ -36,6 +36,7 @@ prometheus-client = "^0.19.0"
 sentry-sdk = {extras = ["fastapi"], version = "^1.38.0"}
 ffmpeg-python = "^0.2.0"
 pydub = "^0.25.1"
+lameenc = "^1.7.0"
 faster-whisper = "^1.2.0"
 python-magic = "^0.4.27"
 aiohttp = "^3.12.15"