I can use text to get audio response and barge in

2026-01-29 16:25:53 +08:00
parent cd90b4fb37
commit ac0c76e6e8
16 changed files with 3394 additions and 119 deletions
--- a/services/tts.py
+++ b/services/tts.py
@@ -0,0 +1,271 @@
+"""TTS (Text-to-Speech) Service implementations.
+
+Provides multiple TTS backend options including edge-tts (free)
+and placeholder for cloud services.
+"""
+
+import os
+import io
+import asyncio
+import struct
+from typing import AsyncIterator, Optional
+from loguru import logger
+
+from services.base import BaseTTSService, TTSChunk, ServiceState
+
+# Try to import edge-tts
+try:
+    import edge_tts
+    EDGE_TTS_AVAILABLE = True
+except ImportError:
+    EDGE_TTS_AVAILABLE = False
+    logger.warning("edge-tts not available - EdgeTTS service will be disabled")
+
+
+class EdgeTTSService(BaseTTSService):
+    """
+    Microsoft Edge TTS service.
+
+    Uses the edge-tts library for free speech synthesis. edge-tts delivers
+    MP3 audio, so the whole utterance is downloaded, decoded to 16-bit mono
+    PCM at ``self.sample_rate`` and then re-emitted in ~100 ms chunks.
+    Synthesis can be interrupted at any point with ``cancel()``.
+    """
+
+    # Voice mapping for common languages (language code -> edge-tts voice)
+    VOICE_MAP = {
+        "en": "en-US-JennyNeural",
+        "en-US": "en-US-JennyNeural",
+        "en-GB": "en-GB-SoniaNeural",
+        "zh": "zh-CN-XiaoxiaoNeural",
+        "zh-CN": "zh-CN-XiaoxiaoNeural",
+        "zh-TW": "zh-TW-HsiaoChenNeural",
+        "ja": "ja-JP-NanamiNeural",
+        "ko": "ko-KR-SunHiNeural",
+        "fr": "fr-FR-DeniseNeural",
+        "de": "de-DE-KatjaNeural",
+        "es": "es-ES-ElviraNeural",
+    }
+
+    def __init__(
+        self,
+        voice: str = "en-US-JennyNeural",
+        sample_rate: int = 16000,
+        speed: float = 1.0
+    ):
+        """
+        Initialize Edge TTS service.
+
+        Args:
+            voice: Voice name (e.g., "en-US-JennyNeural") or language code
+                (e.g., "en"); language codes are resolved via VOICE_MAP
+            sample_rate: Sample rate of the PCM output (decoded audio is
+                resampled to this rate)
+            speed: Speech speed multiplier (1.0 = normal)
+        """
+        # Resolve voice from language code if needed
+        if voice in self.VOICE_MAP:
+            voice = self.VOICE_MAP[voice]
+
+        super().__init__(voice=voice, sample_rate=sample_rate, speed=speed)
+        # Only set/clear/is_set are used on this event; it is polled, never awaited.
+        self._cancel_event = asyncio.Event()
+
+    async def connect(self) -> None:
+        """
+        Mark the service as connected.
+
+        Edge TTS doesn't require an explicit connection; this only checks
+        that the edge-tts package could be imported.
+
+        Raises:
+            RuntimeError: If the edge-tts package is not installed
+        """
+        if not EDGE_TTS_AVAILABLE:
+            raise RuntimeError("edge-tts package not installed")
+        self.state = ServiceState.CONNECTED
+        logger.info(f"Edge TTS service ready: voice={self.voice}")
+
+    async def disconnect(self) -> None:
+        """Mark the service as disconnected (no real connection to close)."""
+        self.state = ServiceState.DISCONNECTED
+        logger.info("Edge TTS service disconnected")
+
+    def _get_rate_string(self) -> str:
+        """
+        Convert the speed multiplier to an edge-tts rate string.
+
+        Returns:
+            Signed percentage such as "+0%", "-10%" or "+20%"
+        """
+        # round() instead of int(): (0.9 - 1.0) * 100 is -9.999..., which
+        # int() would truncate to -9 and silently produce the wrong rate.
+        percentage = round((self.speed - 1.0) * 100)
+        return f"{percentage:+d}%"
+
+    async def synthesize(self, text: str) -> bytes:
+        """
+        Synthesize complete audio for text.
+
+        Args:
+            text: Text to synthesize
+
+        Returns:
+            PCM audio data (16-bit, mono, at ``self.sample_rate``); empty if
+            synthesis was cancelled or produced no audio
+
+        Raises:
+            RuntimeError: If the edge-tts package is not installed
+        """
+        if not EDGE_TTS_AVAILABLE:
+            raise RuntimeError("edge-tts not available")
+
+        # Gather the chunks and join once; repeated bytes += is quadratic.
+        pieces = [chunk.audio async for chunk in self.synthesize_stream(text)]
+        return b"".join(pieces)
+
+    async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
+        """
+        Synthesize audio and yield it as PCM chunks.
+
+        The MP3 stream from edge-tts is collected completely before it is
+        decoded, so the first chunk is only available once the whole
+        utterance has been downloaded and converted.
+
+        Args:
+            text: Text to synthesize
+
+        Yields:
+            TTSChunk objects with ~100 ms of PCM audio each; the last one
+            has ``is_final=True``
+
+        Raises:
+            RuntimeError: If the edge-tts package is not installed
+        """
+        if not EDGE_TTS_AVAILABLE:
+            raise RuntimeError("edge-tts not available")
+
+        self._cancel_event.clear()
+
+        try:
+            communicate = edge_tts.Communicate(
+                text,
+                voice=self.voice,
+                rate=self._get_rate_string()
+            )
+
+            # edge-tts outputs MP3; collect everything, then decode to PCM.
+            mp3_parts = []
+            async for chunk in communicate.stream():
+                # Check for cancellation
+                if self._cancel_event.is_set():
+                    logger.info("TTS synthesis cancelled")
+                    return
+
+                if chunk["type"] == "audio":
+                    mp3_parts.append(chunk["data"])
+
+            mp3_data = b"".join(mp3_parts)
+            if not mp3_data:
+                return
+
+            pcm_data = await self._convert_mp3_to_pcm(mp3_data)
+            if not pcm_data:
+                return
+
+            # ~100 ms per chunk, aligned to whole 16-bit samples so a sample
+            # is never split across two chunks (e.g. at 11025 Hz).
+            chunk_size = max(1, self.sample_rate // 10) * 2
+            total = len(pcm_data)
+            for start in range(0, total, chunk_size):
+                if self._cancel_event.is_set():
+                    return
+
+                end = start + chunk_size
+                yield TTSChunk(
+                    audio=pcm_data[start:end],
+                    sample_rate=self.sample_rate,
+                    is_final=(end >= total)
+                )
+
+        except asyncio.CancelledError:
+            logger.info("TTS synthesis cancelled via asyncio")
+            raise
+        except Exception as e:
+            logger.error(f"TTS synthesis error: {e}")
+            raise
+
+    async def _convert_mp3_to_pcm(self, mp3_data: bytes) -> bytes:
+        """
+        Convert MP3 audio to raw PCM (16-bit, mono, ``self.sample_rate``).
+
+        Uses pydub when it is importable, otherwise falls back to calling
+        ffmpeg directly.
+
+        Args:
+            mp3_data: Complete MP3 byte stream
+
+        Returns:
+            Raw PCM bytes, or b"" if the conversion failed
+        """
+        try:
+            # Try using pydub (requires ffmpeg)
+            from pydub import AudioSegment
+        except ImportError:
+            logger.warning("pydub not available, trying fallback")
+            # Fallback: Use subprocess to call ffmpeg directly
+            return await self._ffmpeg_convert(mp3_data)
+
+        def _decode() -> bytes:
+            # Load MP3 from bytes and convert to the target format
+            audio = AudioSegment.from_mp3(io.BytesIO(mp3_data))
+            audio = audio.set_frame_rate(self.sample_rate)
+            audio = audio.set_channels(1)
+            audio = audio.set_sample_width(2)  # 16-bit
+            return audio.raw_data
+
+        try:
+            # Decoding is blocking work; run it in a worker thread so the
+            # event loop stays responsive (e.g. to cancel()) meanwhile.
+            loop = asyncio.get_running_loop()
+            return await loop.run_in_executor(None, _decode)
+        except Exception as e:
+            logger.error(f"Audio conversion error: {e}")
+            return b""
+
+    async def _ffmpeg_convert(self, mp3_data: bytes) -> bytes:
+        """
+        Convert MP3 to PCM using an ffmpeg subprocess.
+
+        Args:
+            mp3_data: Complete MP3 byte stream, fed to ffmpeg via stdin
+
+        Returns:
+            Raw PCM bytes (s16le, mono, ``self.sample_rate``), or b"" if
+            ffmpeg could not be started or exited with an error
+        """
+        process = None
+        try:
+            process = await asyncio.create_subprocess_exec(
+                "ffmpeg",
+                "-i", "pipe:0",
+                "-f", "s16le",
+                "-acodec", "pcm_s16le",
+                "-ar", str(self.sample_rate),
+                "-ac", "1",
+                "pipe:1",
+                stdin=asyncio.subprocess.PIPE,
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.DEVNULL
+            )
+
+            stdout, _ = await process.communicate(input=mp3_data)
+            if process.returncode != 0:
+                # Output of a failed run is not trustworthy audio.
+                logger.error(f"ffmpeg conversion error: exit code {process.returncode}")
+                return b""
+            return stdout
+
+        except Exception as e:
+            logger.error(f"ffmpeg conversion error: {e}")
+            return b""
+        finally:
+            # If we were cancelled mid-conversion the child is still running;
+            # make sure it does not outlive the request.
+            if process is not None and process.returncode is None:
+                try:
+                    process.kill()
+                except ProcessLookupError:
+                    pass
+
+    async def cancel(self) -> None:
+        """Cancel ongoing synthesis; the running stream stops at its next check."""
+        self._cancel_event.set()
+
+
+class MockTTSService(BaseTTSService):
+    """
+    Mock TTS service for testing without actual synthesis.
+
+    Generates silence whose duration is derived from the word count of
+    the input text (100 ms per word).
+    """
+
+    def __init__(
+        self,
+        voice: str = "mock",
+        sample_rate: int = 16000,
+        speed: float = 1.0
+    ):
+        """
+        Initialize the mock service.
+
+        Args:
+            voice: Voice name passed through to the base class
+            sample_rate: Sample rate of the generated PCM
+            speed: Speed multiplier passed through to the base class
+                (not used when generating silence)
+        """
+        super().__init__(voice=voice, sample_rate=sample_rate, speed=speed)
+
+    async def connect(self) -> None:
+        """Mark the service as connected."""
+        self.state = ServiceState.CONNECTED
+        logger.info("Mock TTS service connected")
+
+    async def disconnect(self) -> None:
+        """Mark the service as disconnected."""
+        self.state = ServiceState.DISCONNECTED
+        logger.info("Mock TTS service disconnected")
+
+    async def synthesize(self, text: str) -> bytes:
+        """
+        Generate silence based on text length.
+
+        Args:
+            text: Text whose whitespace-separated word count sets the duration
+
+        Returns:
+            Zero-filled 16-bit PCM, 100 ms per word (empty for empty text)
+        """
+        # Approximate: 100ms per word
+        word_count = len(text.split())
+        duration_ms = word_count * 100
+        samples = int(self.sample_rate * duration_ms / 1000)
+
+        # Generate silence (zeros)
+        return bytes(samples * 2)  # 16-bit = 2 bytes per sample
+
+    async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
+        """
+        Generate silence in ~100 ms chunks.
+
+        Args:
+            text: Text to "synthesize"
+
+        Yields:
+            TTSChunk objects of silence; the last one has ``is_final=True``.
+            Nothing is yielded for text without words.
+        """
+        audio = await self.synthesize(text)
+
+        # ~100 ms per chunk, aligned to whole 16-bit samples so a sample is
+        # never split across chunks, and never zero (range() needs step > 0).
+        chunk_size = max(1, self.sample_rate // 10) * 2
+        total = len(audio)
+        for start in range(0, total, chunk_size):
+            end = start + chunk_size
+            yield TTSChunk(
+                audio=audio[start:end],
+                sample_rate=self.sample_rate,
+                is_final=(end >= total)
+            )
+            await asyncio.sleep(0.05)  # Simulate processing time