# AI-VideoAssistant/engine/services/tts.py

"""TTS (Text-to-Speech) Service implementations.

Provides multiple TTS backend options including edge-tts (free)
and placeholder for cloud services.
"""

import os
import io
import asyncio
import struct
from typing import AsyncIterator, Optional
from loguru import logger

from services.base import BaseTTSService, TTSChunk, ServiceState

# Try to import edge-tts
try:
    import edge_tts
    EDGE_TTS_AVAILABLE = True
except ImportError:
    EDGE_TTS_AVAILABLE = False
    logger.warning("edge-tts not available - EdgeTTS service will be disabled")


class EdgeTTSService(BaseTTSService):
    """
    Microsoft Edge TTS service.

    Uses the edge-tts library for free speech synthesis. edge-tts delivers
    MP3 audio, so the complete utterance is buffered, decoded to 16-bit mono
    PCM at ``self.sample_rate`` and then re-emitted in ~100ms chunks.
    """

    # Voice mapping for common languages
    VOICE_MAP = {
        "en": "en-US-JennyNeural",
        "en-US": "en-US-JennyNeural",
        "en-GB": "en-GB-SoniaNeural",
        "zh": "zh-CN-XiaoxiaoNeural",
        "zh-CN": "zh-CN-XiaoxiaoNeural",
        "zh-TW": "zh-TW-HsiaoChenNeural",
        "ja": "ja-JP-NanamiNeural",
        "ko": "ko-KR-SunHiNeural",
        "fr": "fr-FR-DeniseNeural",
        "de": "de-DE-KatjaNeural",
        "es": "es-ES-ElviraNeural",
    }

    def __init__(
        self,
        voice: str = "en-US-JennyNeural",
        sample_rate: int = 16000,
        speed: float = 1.0
    ):
        """
        Initialize Edge TTS service.

        Args:
            voice: Voice name (e.g., "en-US-JennyNeural") or language code (e.g., "en")
            sample_rate: Target sample rate (decoded audio is resampled to it)
            speed: Speech speed multiplier (1.0 = normal speed)
        """
        # Resolve voice from language code if needed; unknown values are
        # passed through unchanged and treated as a full voice name.
        voice = self.VOICE_MAP.get(voice, voice)

        super().__init__(voice=voice, sample_rate=sample_rate, speed=speed)
        # Only polled via is_set(); never awaited.
        self._cancel_event = asyncio.Event()

    async def connect(self) -> None:
        """Mark the service as connected.

        Edge TTS doesn't require an explicit connection.

        Raises:
            RuntimeError: If the edge-tts package is not installed.
        """
        if not EDGE_TTS_AVAILABLE:
            raise RuntimeError("edge-tts package not installed")
        self.state = ServiceState.CONNECTED
        logger.info(f"Edge TTS service ready: voice={self.voice}")

    async def disconnect(self) -> None:
        """Mark the service as disconnected (no real connection to close)."""
        self.state = ServiceState.DISCONNECTED
        logger.info("Edge TTS service disconnected")

    def _get_rate_string(self) -> str:
        """Convert the speed multiplier to an edge-tts rate string.

        Returns:
            A signed percentage such as "+0%", "-10%" or "+20%".
        """
        # round() rather than int(): (1.2 - 1.0) * 100 evaluates to
        # 19.999999999999996, which truncation would turn into "+19%".
        percentage = int(round((self.speed - 1.0) * 100))
        return f"{percentage:+d}%"

    async def synthesize(self, text: str) -> bytes:
        """
        Synthesize complete audio for text.

        Args:
            text: Text to synthesize

        Returns:
            PCM audio data (16-bit, mono, at ``self.sample_rate``)

        Raises:
            RuntimeError: If the edge-tts package is not installed.
        """
        if not EDGE_TTS_AVAILABLE:
            raise RuntimeError("edge-tts not available")

        # Collect the chunks and join once; repeated bytes += is quadratic.
        parts = []
        async for chunk in self.synthesize_stream(text):
            parts.append(chunk.audio)

        return b"".join(parts)

    async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
        """
        Synthesize audio and yield it as PCM chunks.

        The MP3 stream from edge-tts is fully received and decoded before the
        first chunk is yielded. Calling ``cancel()`` stops the generator at
        the next check point without yielding a final chunk.

        Args:
            text: Text to synthesize

        Yields:
            TTSChunk objects with PCM audio; the last one has is_final=True

        Raises:
            RuntimeError: If the edge-tts package is not installed.
        """
        if not EDGE_TTS_AVAILABLE:
            raise RuntimeError("edge-tts not available")

        self._cancel_event.clear()

        try:
            communicate = edge_tts.Communicate(
                text,
                voice=self.voice,
                rate=self._get_rate_string()
            )

            # edge-tts outputs MP3, we need to decode to PCM.
            # A bytearray grows in place, avoiding a copy per network chunk.
            mp3_buffer = bytearray()

            async for chunk in communicate.stream():
                # Check for cancellation
                if self._cancel_event.is_set():
                    logger.info("TTS synthesis cancelled")
                    return

                if chunk["type"] == "audio":
                    mp3_buffer += chunk["data"]

            if not mp3_buffer:
                return

            # Convert MP3 to PCM
            pcm_data = await self._convert_mp3_to_pcm(bytes(mp3_buffer))
            if not pcm_data:
                return

            # ~100ms chunks, always a whole number of 16-bit samples so a
            # sample is never split across two chunks.
            chunk_size = max(self.sample_rate // 10, 1) * 2
            total = len(pcm_data)
            for i in range(0, total, chunk_size):
                if self._cancel_event.is_set():
                    return

                yield TTSChunk(
                    audio=pcm_data[i:i + chunk_size],
                    sample_rate=self.sample_rate,
                    is_final=(i + chunk_size >= total)
                )

        except asyncio.CancelledError:
            logger.info("TTS synthesis cancelled via asyncio")
            raise
        except Exception as e:
            logger.error(f"TTS synthesis error: {e}")
            raise

    async def _convert_mp3_to_pcm(self, mp3_data: bytes) -> bytes:
        """
        Convert MP3 audio to raw PCM (16-bit, mono, ``self.sample_rate``).

        Uses pydub when it is importable, otherwise falls back to calling
        ffmpeg directly.

        Args:
            mp3_data: Complete MP3 byte stream

        Returns:
            Raw PCM bytes, or b"" if the conversion failed.
        """
        try:
            # Try using pydub (requires ffmpeg)
            from pydub import AudioSegment

            def _decode() -> bytes:
                # Load MP3 from bytes and convert to the target format
                audio = AudioSegment.from_mp3(io.BytesIO(mp3_data))
                audio = audio.set_frame_rate(self.sample_rate)
                audio = audio.set_channels(1)
                audio = audio.set_sample_width(2)  # 16-bit
                return audio.raw_data

            # Decoding is blocking work; run it in the default executor so
            # the event loop stays responsive.
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(None, _decode)

        except ImportError:
            logger.warning("pydub not available, trying fallback")
            # Fallback: Use subprocess to call ffmpeg directly
            return await self._ffmpeg_convert(mp3_data)
        except asyncio.CancelledError:
            # Never swallow task cancellation (it is an Exception subclass
            # on Python 3.7).
            raise
        except Exception as e:
            logger.error(f"Audio conversion error: {e}")
            return b""

    async def _ffmpeg_convert(self, mp3_data: bytes) -> bytes:
        """Convert MP3 to PCM using an ffmpeg subprocess.

        Args:
            mp3_data: Complete MP3 byte stream, fed to ffmpeg on stdin

        Returns:
            Raw PCM bytes read from ffmpeg's stdout, or b"" if ffmpeg could
            not be started or the conversion raised.
        """
        process = None
        try:
            process = await asyncio.create_subprocess_exec(
                "ffmpeg",
                "-i", "pipe:0",
                "-f", "s16le",
                "-acodec", "pcm_s16le",
                "-ar", str(self.sample_rate),
                "-ac", "1",
                "pipe:1",
                stdin=asyncio.subprocess.PIPE,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.DEVNULL
            )

            stdout, _ = await process.communicate(input=mp3_data)
            if process.returncode:
                logger.warning(f"ffmpeg exited with code {process.returncode}")
            return stdout

        except asyncio.CancelledError:
            # Don't leave an orphaned ffmpeg running if the task is cancelled.
            if process is not None and process.returncode is None:
                try:
                    process.kill()
                except ProcessLookupError:
                    pass
            raise
        except Exception as e:
            logger.error(f"ffmpeg conversion error: {e}")
            return b""

    async def cancel(self) -> None:
        """Cancel ongoing synthesis.

        Sets a flag that ``synthesize_stream`` polls between chunks.
        """
        self._cancel_event.set()


class MockTTSService(BaseTTSService):
    """
    Stand-in TTS backend for tests; performs no real synthesis.

    Produces zero-filled (silent) 16-bit PCM whose length scales with the
    number of whitespace-separated words in the input text.
    """

    def __init__(
        self,
        voice: str = "mock",
        sample_rate: int = 16000,
        speed: float = 1.0
    ):
        """Forward the configuration to the base service unchanged."""
        super().__init__(voice=voice, sample_rate=sample_rate, speed=speed)

    async def connect(self) -> None:
        """Flag the service as connected; nothing is actually opened."""
        self.state = ServiceState.CONNECTED
        logger.info("Mock TTS service connected")

    async def disconnect(self) -> None:
        """Flag the service as disconnected; nothing is actually closed."""
        self.state = ServiceState.DISCONNECTED
        logger.info("Mock TTS service disconnected")

    async def synthesize(self, text: str) -> bytes:
        """Return silent PCM sized from the text's word count.

        Args:
            text: Text whose word count determines the duration

        Returns:
            Zero-filled bytes: 100ms of audio per word, 2 bytes per sample.
        """
        n_words = len(text.split())
        silence_ms = n_words * 100  # 100ms of audio for each word
        n_samples = int(self.sample_rate * silence_ms / 1000)

        # bytes(n) yields n zero bytes; each 16-bit sample takes two.
        return bytes(n_samples * 2)

    async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
        """Yield the silent audio from ``synthesize`` in 100ms pieces.

        Args:
            text: Text whose word count determines the duration

        Yields:
            TTSChunk objects; the one reaching the end of the buffer has
            is_final=True. Nothing is yielded for text without words.
        """
        pcm = await self.synthesize(text)
        total = len(pcm)

        # Bytes per 100ms slice at 2 bytes per sample
        step = self.sample_rate * 2 // 10
        for start in range(0, total, step):
            end = start + step
            yield TTSChunk(
                audio=pcm[start:end],
                sample_rate=self.sample_rate,
                is_final=(end >= total)
            )
            # Simulate processing time
            await asyncio.sleep(0.05)