fix(tts): replace pydub MP3 export with lameenc (bundled LAME encoder, no system ffmpeg)

Gemini TTS _pcm_to_mp3 used pydub.AudioSegment.export(format='mp3') which
requires a system ffmpeg binary. Worker containers don't have ffmpeg installed
(video ops run on Cloud Run). Switch to lameenc, a Python binding that ships the
LAME encoder inside its wheel, so PCM→MP3 encoding needs no system binary.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Vadym Samoilenko 2026-04-30 18:24:15 +01:00
parent b0a90777ed
commit a53cf960ae
2 changed files with 11 additions and 19 deletions

View file

@ -1,5 +1,4 @@
import io
import wave
from google import genai
from google.genai import types
@ -354,25 +353,17 @@ class GeminiTTSService:
def _pcm_to_mp3(self, pcm_data: bytes) -> bytes:
    """
    Convert raw PCM audio (24kHz, 16-bit, mono) to MP3.

    Gemini TTS outputs PCM at 24000 Hz sample rate. Encoding happens
    in-process through lameenc (Python bindings that bundle the LAME
    encoder), so no system ffmpeg binary is required.

    Args:
        pcm_data: Raw PCM sample bytes. Assumed to be interleaved signed
            16-bit mono at 24 kHz -- TODO confirm against the code that
            extracts audio from the Gemini response.

    Returns:
        The encoded MP3 stream (128 kbit/s) as bytes.
    """
    # Function-local import: only this conversion path needs the encoder.
    import lameenc

    # 16-bit samples are two bytes wide; drop a dangling odd byte so the
    # encoder is always handed whole samples.
    whole_samples_len = len(pcm_data) - (len(pcm_data) % 2)
    if whole_samples_len != len(pcm_data):
        pcm_data = pcm_data[:whole_samples_len]

    encoder = lameenc.Encoder()
    encoder.set_bit_rate(128)  # kbit/s
    encoder.set_in_sample_rate(24000)  # must match the PCM sample rate
    encoder.set_channels(1)  # mono
    encoder.set_quality(2)  # 2 = high quality

    # flush() returns the frames the encoder is still buffering, so it has
    # to be appended after the final encode() call.
    mp3_data = encoder.encode(pcm_data) + encoder.flush()

    # Normalise to an immutable bytes object to honour the return annotation
    # regardless of the buffer type the encoder hands back.
    return bytes(mp3_data)
def _parse_ad_cues(self, vtt_content: str) -> list[dict]:
"""Parse audio description VTT and extract timing + text"""

View file

@ -36,6 +36,7 @@ prometheus-client = "^0.19.0"
sentry-sdk = {extras = ["fastapi"], version = "^1.38.0"}
ffmpeg-python = "^0.2.0"
pydub = "^0.25.1"
lameenc = "^1.7.0"
faster-whisper = "^1.2.0"
python-magic = "^0.4.27"
aiohttp = "^3.12.15"