diff --git a/backend/app/services/gemini_tts.py b/backend/app/services/gemini_tts.py index c559866..f14adfd 100644 --- a/backend/app/services/gemini_tts.py +++ b/backend/app/services/gemini_tts.py @@ -1,5 +1,4 @@ import io -import wave from google import genai from google.genai import types @@ -354,25 +353,17 @@ class GeminiTTSService: def _pcm_to_mp3(self, pcm_data: bytes) -> bytes: """ Convert raw PCM audio (24kHz, 16-bit, mono) to MP3. - Gemini TTS outputs PCM at 24000 Hz sample rate. + Uses lameenc (Python bindings to the LAME encoder) — no system ffmpeg required. """ - # Create WAV from PCM data - wav_buffer = io.BytesIO() - with wave.open(wav_buffer, "wb") as wf: - wf.setnchannels(1) # Mono - wf.setsampwidth(2) # 16-bit (2 bytes) - wf.setframerate(24000) # 24kHz - wf.writeframes(pcm_data) - - # Convert WAV to MP3 using pydub - wav_buffer.seek(0) - audio_segment = AudioSegment.from_wav(wav_buffer) - - # Export as MP3 - mp3_buffer = io.BytesIO() - audio_segment.export(mp3_buffer, format="mp3", bitrate="128k") - - return mp3_buffer.getvalue() + import lameenc + encoder = lameenc.Encoder() + encoder.set_bit_rate(128) + encoder.set_in_sample_rate(24000) + encoder.set_channels(1) + encoder.set_quality(2) # 2 = high quality + mp3_data = encoder.encode(pcm_data) + mp3_data += encoder.flush() + return mp3_data def _parse_ad_cues(self, vtt_content: str) -> list[dict]: """Parse audio description VTT and extract timing + text""" diff --git a/backend/pyproject.toml b/backend/pyproject.toml index 490bf76..d472f91 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -36,6 +36,7 @@ prometheus-client = "^0.19.0" sentry-sdk = {extras = ["fastapi"], version = "^1.38.0"} ffmpeg-python = "^0.2.0" pydub = "^0.25.1" +lameenc = "^1.7.0" faster-whisper = "^1.2.0" python-magic = "^0.4.27" aiohttp = "^3.12.15"