# forge/backend/app/services/voice_to_text.py

"""Voice to Text Service - Whisper + DeepL"""
import os
from uuid import uuid4
from datetime import datetime, timedelta

from app.database import SessionLocal
from app.models.job import Job
from app.models.asset import Asset
from app.config import settings


async def transcribe(job_id: str):
    """Transcribe audio to text using Whisper with optional translation.

    Loads the job identified by ``job_id``, runs Whisper's "base" model on the
    job's first input asset, writes the transcript to the documents storage
    directory and records it as an ``Asset``. When ``translate`` is set in the
    job's input data, the rendered transcript is also sent to DeepL and saved
    as a second asset.

    Options read from ``job.input_data``:
        output_format: "txt" (default), "vtt" or "srt"; any other value is
            rendered as plain text.
        translate: whether to produce a translated copy (default False).
        target_language: DeepL target language code (default "EN-US").

    Args:
        job_id: Identifier of the ``Job`` row to process.

    Returns:
        None. Progress and results are written to the job row. Exceptions
        raised while processing are not propagated; they are recorded on the
        job as ``status = "failed"`` plus ``error_message``. If no job matches
        ``job_id`` the function returns without doing anything.
    """
    db = SessionLocal()
    # Bound before the try block so the except handler can tell whether the
    # job was ever loaded (the initial query itself may raise).
    job = None
    try:
        job = db.query(Job).filter(Job.id == job_id).first()
        if not job:
            return

        # Tolerate jobs stored without any options.
        input_data = job.input_data or {}
        input_asset_ids = job.input_asset_ids

        if not input_asset_ids:
            raise ValueError("No input asset provided")

        input_asset = db.query(Asset).filter(Asset.id == input_asset_ids[0]).first()
        if not input_asset:
            raise ValueError("Input asset not found")

        job.progress = 10
        job.api_provider = "whisper"
        db.commit()

        output_format = input_data.get("output_format", "txt")
        translate = input_data.get("translate", False)
        target_language = input_data.get("target_language", "EN-US")

        # Transcribe with Whisper (imported lazily, as in the rest of this module).
        # NOTE(review): load_model/transcribe are synchronous calls inside an
        # async function, so they block the event loop while they run —
        # confirm how this coroutine is scheduled.
        import whisper

        model = whisper.load_model("base")
        result = model.transcribe(input_asset.file_path, verbose=False)

        job.progress = 60
        db.commit()

        segments = result.get("segments", [])
        text = result.get("text", "")
        language = result.get("language")

        # Generate output based on format
        if output_format == "vtt":
            content, extension, mime_type = _generate_vtt(segments), "vtt", "text/vtt"
        elif output_format == "srt":
            content, extension, mime_type = _generate_srt(segments), "srt", "text/plain"
        else:
            # "txt" and any unrecognized format are written as plain text.
            content, extension, mime_type = text, "txt", "text/plain"

        output_assets = []

        # Save original transcription
        asset = _store_text_asset(
            db,
            job,
            input_asset,
            content=content,
            filename_prefix="transcription",
            extension=extension,
            mime_type=mime_type,
            metadata={
                "language": language,
                "format": output_format,
                "type": "original"
            },
        )
        output_assets.append(asset.id)

        job.progress = 75
        db.commit()

        # Translate if requested. An empty transcript has nothing to
        # translate, so it is not sent to the translation service.
        translated_content = None
        if translate and content:
            job.api_provider = "whisper+deepl"
            import deepl

            translator = deepl.Translator(settings.deepl_api_key)
            # NOTE(review): for "vtt"/"srt" the whole rendered file, cue
            # timings included, is passed to DeepL as one string — confirm
            # the timings survive translation unchanged.
            translated_content = translator.translate_text(
                content,
                target_lang=target_language
            ).text

            trans_asset = _store_text_asset(
                db,
                job,
                input_asset,
                content=translated_content,
                filename_prefix="transcription_translated",
                extension=extension,
                mime_type=mime_type,
                metadata={
                    "language": target_language,
                    "format": output_format,
                    "type": "translated"
                },
            )
            output_assets.append(trans_asset.id)

        job.output_asset_ids = output_assets
        job.output_data = {
            "text": text,
            "translated_text": translated_content,
            "language": language,
            "asset_ids": [str(a) for a in output_assets]
        }
        job.progress = 100
        job.status = "completed"
        job.completed_at = datetime.utcnow()
        db.commit()

    except Exception as e:
        # Discard any half-applied transaction first; after a failed
        # flush/commit the session cannot commit again until rolled back.
        db.rollback()
        if job is not None:
            job.status = "failed"
            job.error_message = str(e)
            db.commit()
    finally:
        db.close()


def _store_text_asset(db, job, input_asset, *, content, filename_prefix,
                      extension, mime_type, metadata):
    """Write ``content`` to the documents storage directory and persist an Asset for it.

    The file is named ``<filename_prefix>_<uuid4>.<extension>`` and written as
    UTF-8. The new ``Asset`` is linked to ``job`` and to ``input_asset`` (as
    its parent), committed, refreshed, and returned.
    """
    filename = f"{filename_prefix}_{uuid4()}.{extension}"
    file_path = os.path.join(settings.storage_path, "documents", filename)
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    with open(file_path, "w", encoding="utf-8") as f:
        f.write(content)

    asset = Asset(
        user_id=job.user_id,
        project_id=job.project_id,
        original_filename=filename,
        stored_filename=filename,
        file_path=file_path,
        file_type="document",
        mime_type=mime_type,
        file_size_bytes=len(content.encode()),
        source_module="voice_to_text",
        source_job_id=job.id,
        parent_asset_id=input_asset.id,
        asset_metadata=metadata,
    )
    db.add(asset)
    db.commit()
    db.refresh(asset)
    return asset


def _generate_srt(segments: list) -> str:
    """Render Whisper segments as SRT subtitle text.

    Each segment must provide ``'start'``, ``'end'`` and ``'text'`` keys.
    Cues are numbered from 1 in input order; the cue text is stripped of
    surrounding whitespace. Cues are separated by a blank line, and an empty
    segment list yields an empty string.
    """
    def render_cue(number, seg):
        # One cue: index line, timing line, text line.
        begin = _format_timestamp_srt(seg['start'])
        finish = _format_timestamp_srt(seg['end'])
        caption = seg['text'].strip()
        return f"{number}\n{begin} --> {finish}\n{caption}\n"

    return "\n".join(
        render_cue(number, seg) for number, seg in enumerate(segments, 1)
    )


def _generate_vtt(segments: list) -> str:
    """Render Whisper segments as WebVTT subtitle text.

    The output starts with the ``WEBVTT`` header followed by one unnumbered
    cue per segment, in input order. Each segment must provide ``'start'``,
    ``'end'`` and ``'text'`` keys; the cue text is stripped of surrounding
    whitespace. An empty segment list yields just the header line.
    """
    blocks = ["WEBVTT\n"]
    blocks.extend(
        "{} --> {}\n{}\n".format(
            _format_timestamp_vtt(seg['start']),
            _format_timestamp_vtt(seg['end']),
            seg['text'].strip(),
        )
        for seg in segments
    )
    return "\n".join(blocks)


def _format_timestamp_srt(seconds: float) -> str:
    """Convert seconds to SRT timestamp format (HH:MM:SS,mmm)"""
    td = timedelta(seconds=seconds)
    hours = td.seconds // 3600
    minutes = (td.seconds % 3600) // 60
    secs = td.seconds % 60
    millis = td.microseconds // 1000
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def _format_timestamp_vtt(seconds: float) -> str:
    """Convert seconds to VTT timestamp format (HH:MM:SS.mmm)"""
    td = timedelta(seconds=seconds)
    hours = td.seconds // 3600
    minutes = (td.seconds % 3600) // 60
    secs = td.seconds % 60
    millis = td.microseconds // 1000
    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"