"""TTS (Text-to-Speech) Service implementations. Provides multiple TTS backend options including edge-tts (free) and placeholder for cloud services. """ import os import io import asyncio import struct from typing import AsyncIterator, Optional from loguru import logger from services.base import BaseTTSService, TTSChunk, ServiceState # Try to import edge-tts try: import edge_tts EDGE_TTS_AVAILABLE = True except ImportError: EDGE_TTS_AVAILABLE = False logger.warning("edge-tts not available - EdgeTTS service will be disabled") class EdgeTTSService(BaseTTSService): """ Microsoft Edge TTS service. Uses edge-tts library for free, high-quality speech synthesis. Supports streaming for low-latency playback. """ # Voice mapping for common languages VOICE_MAP = { "en": "en-US-JennyNeural", "en-US": "en-US-JennyNeural", "en-GB": "en-GB-SoniaNeural", "zh": "zh-CN-XiaoxiaoNeural", "zh-CN": "zh-CN-XiaoxiaoNeural", "zh-TW": "zh-TW-HsiaoChenNeural", "ja": "ja-JP-NanamiNeural", "ko": "ko-KR-SunHiNeural", "fr": "fr-FR-DeniseNeural", "de": "de-DE-KatjaNeural", "es": "es-ES-ElviraNeural", } def __init__( self, voice: str = "en-US-JennyNeural", sample_rate: int = 16000, speed: float = 1.0 ): """ Initialize Edge TTS service. Args: voice: Voice name (e.g., "en-US-JennyNeural") or language code (e.g., "en") sample_rate: Target sample rate (will be resampled) speed: Speech speed multiplier """ # Resolve voice from language code if needed if voice in self.VOICE_MAP: voice = self.VOICE_MAP[voice] super().__init__(voice=voice, sample_rate=sample_rate, speed=speed) self._cancel_event = asyncio.Event() async def connect(self) -> None: """Edge TTS doesn't require explicit connection.""" if not EDGE_TTS_AVAILABLE: raise RuntimeError("edge-tts package not installed") self.state = ServiceState.CONNECTED logger.info(f"Edge TTS service ready: voice={self.voice}") async def disconnect(self) -> None: """Edge TTS doesn't require explicit disconnection.""" self.state = ServiceState.DISCONNECTED logger.info("Edge TTS service disconnected") def _get_rate_string(self) -> str: """Convert speed to rate string for edge-tts.""" # edge-tts uses percentage format: "+0%", "-10%", "+20%" percentage = int((self.speed - 1.0) * 100) if percentage >= 0: return f"+{percentage}%" return f"{percentage}%" async def synthesize(self, text: str) -> bytes: """ Synthesize complete audio for text. Args: text: Text to synthesize Returns: PCM audio data (16-bit, mono, 16kHz) """ if not EDGE_TTS_AVAILABLE: raise RuntimeError("edge-tts not available") # Collect all chunks audio_data = b"" async for chunk in self.synthesize_stream(text): audio_data += chunk.audio return audio_data async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]: """ Synthesize audio in streaming mode. 
        Args:
            text: Text to synthesize

        Yields:
            TTSChunk objects with PCM audio
        """
        if not EDGE_TTS_AVAILABLE:
            raise RuntimeError("edge-tts not available")

        self._cancel_event.clear()

        try:
            communicate = edge_tts.Communicate(
                text,
                voice=self.voice,
                rate=self._get_rate_string()
            )

            # edge-tts outputs MP3, we need to decode to PCM.
            # For now, collect MP3 chunks and yield after conversion.
            mp3_data = b""
            async for chunk in communicate.stream():
                # Check for cancellation
                if self._cancel_event.is_set():
                    logger.info("TTS synthesis cancelled")
                    return

                if chunk["type"] == "audio":
                    mp3_data += chunk["data"]

            # Convert MP3 to PCM
            if mp3_data:
                pcm_data = await self._convert_mp3_to_pcm(mp3_data)

                if pcm_data:
                    # Yield in chunks for streaming playback
                    chunk_size = self.sample_rate * 2 // 10  # 100ms chunks
                    for i in range(0, len(pcm_data), chunk_size):
                        if self._cancel_event.is_set():
                            return

                        chunk_data = pcm_data[i:i + chunk_size]
                        yield TTSChunk(
                            audio=chunk_data,
                            sample_rate=self.sample_rate,
                            is_final=(i + chunk_size >= len(pcm_data))
                        )

        except asyncio.CancelledError:
            logger.info("TTS synthesis cancelled via asyncio")
            raise
        except Exception as e:
            logger.error(f"TTS synthesis error: {e}")
            raise

    async def _convert_mp3_to_pcm(self, mp3_data: bytes) -> bytes:
        """
        Convert MP3 audio to PCM.

        Uses pydub or ffmpeg for conversion.
        """
        try:
            # Try using pydub (requires ffmpeg)
            from pydub import AudioSegment

            # Load MP3 from bytes
            audio = AudioSegment.from_mp3(io.BytesIO(mp3_data))

            # Convert to target format
            audio = audio.set_frame_rate(self.sample_rate)
            audio = audio.set_channels(1)
            audio = audio.set_sample_width(2)  # 16-bit

            # Export as raw PCM
            return audio.raw_data

        except ImportError:
            logger.warning("pydub not available, trying fallback")
            # Fallback: use subprocess to call ffmpeg directly
            return await self._ffmpeg_convert(mp3_data)
        except Exception as e:
            logger.error(f"Audio conversion error: {e}")
            return b""

    async def _ffmpeg_convert(self, mp3_data: bytes) -> bytes:
        """Convert MP3 to PCM using an ffmpeg subprocess."""
        try:
            process = await asyncio.create_subprocess_exec(
                "ffmpeg",
                "-i", "pipe:0",
                "-f", "s16le",
                "-acodec", "pcm_s16le",
                "-ar", str(self.sample_rate),
                "-ac", "1",
                "pipe:1",
                stdin=asyncio.subprocess.PIPE,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.DEVNULL
            )

            stdout, _ = await process.communicate(input=mp3_data)
            return stdout

        except Exception as e:
            logger.error(f"ffmpeg conversion error: {e}")
            return b""

    async def cancel(self) -> None:
        """Cancel ongoing synthesis."""
        self._cancel_event.set()


class MockTTSService(BaseTTSService):
    """
    Mock TTS service for testing without actual synthesis.

    Generates silence or simple tones.
""" def __init__( self, voice: str = "mock", sample_rate: int = 16000, speed: float = 1.0 ): super().__init__(voice=voice, sample_rate=sample_rate, speed=speed) async def connect(self) -> None: self.state = ServiceState.CONNECTED logger.info("Mock TTS service connected") async def disconnect(self) -> None: self.state = ServiceState.DISCONNECTED logger.info("Mock TTS service disconnected") async def synthesize(self, text: str) -> bytes: """Generate silence based on text length.""" # Approximate: 100ms per word word_count = len(text.split()) duration_ms = word_count * 100 samples = int(self.sample_rate * duration_ms / 1000) # Generate silence (zeros) return bytes(samples * 2) # 16-bit = 2 bytes per sample async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]: """Generate silence chunks.""" audio = await self.synthesize(text) # Yield in 100ms chunks chunk_size = self.sample_rate * 2 // 10 for i in range(0, len(audio), chunk_size): chunk_data = audio[i:i + chunk_size] yield TTSChunk( audio=chunk_data, sample_rate=self.sample_rate, is_final=(i + chunk_size >= len(audio)) ) await asyncio.sleep(0.05) # Simulate processing time