"""SiliconFlow TTS Service with streaming support. Uses SiliconFlow's CosyVoice2 or MOSS-TTSD models for low-latency text-to-speech synthesis with streaming. API Docs: https://docs.siliconflow.cn/cn/api-reference/audio/create-speech """ import os import asyncio import aiohttp from typing import AsyncIterator, Optional from loguru import logger from services.base import BaseTTSService, TTSChunk, ServiceState class SiliconFlowTTSService(BaseTTSService): """ SiliconFlow TTS service with streaming support. Supports CosyVoice2-0.5B and MOSS-TTSD-v0.5 models. """ # Available voices VOICES = { "alex": "FunAudioLLM/CosyVoice2-0.5B:alex", "anna": "FunAudioLLM/CosyVoice2-0.5B:anna", "bella": "FunAudioLLM/CosyVoice2-0.5B:bella", "benjamin": "FunAudioLLM/CosyVoice2-0.5B:benjamin", "charles": "FunAudioLLM/CosyVoice2-0.5B:charles", "claire": "FunAudioLLM/CosyVoice2-0.5B:claire", "david": "FunAudioLLM/CosyVoice2-0.5B:david", "diana": "FunAudioLLM/CosyVoice2-0.5B:diana", } def __init__( self, api_key: Optional[str] = None, voice: str = "anna", model: str = "FunAudioLLM/CosyVoice2-0.5B", sample_rate: int = 16000, speed: float = 1.0 ): """ Initialize SiliconFlow TTS service. Args: api_key: SiliconFlow API key (defaults to SILICONFLOW_API_KEY env var) voice: Voice name (alex, anna, bella, benjamin, charles, claire, david, diana) model: Model name sample_rate: Output sample rate (8000, 16000, 24000, 32000, 44100) speed: Speech speed (0.25 to 4.0) """ # Resolve voice name if voice in self.VOICES: full_voice = self.VOICES[voice] else: full_voice = voice super().__init__(voice=full_voice, sample_rate=sample_rate, speed=speed) self.api_key = api_key or os.getenv("SILICONFLOW_API_KEY") self.model = model self.api_url = "https://api.siliconflow.cn/v1/audio/speech" self._session: Optional[aiohttp.ClientSession] = None self._cancel_event = asyncio.Event() async def connect(self) -> None: """Initialize HTTP session.""" if not self.api_key: raise ValueError("SiliconFlow API key not provided. Set SILICONFLOW_API_KEY env var.") self._session = aiohttp.ClientSession( headers={ "Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json" } ) self.state = ServiceState.CONNECTED logger.info(f"SiliconFlow TTS service ready: voice={self.voice}, model={self.model}") async def disconnect(self) -> None: """Close HTTP session.""" if self._session: await self._session.close() self._session = None self.state = ServiceState.DISCONNECTED logger.info("SiliconFlow TTS service disconnected") async def synthesize(self, text: str) -> bytes: """Synthesize complete audio for text.""" audio_data = b"" async for chunk in self.synthesize_stream(text): audio_data += chunk.audio return audio_data async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]: """ Synthesize audio in streaming mode. 
Args: text: Text to synthesize Yields: TTSChunk objects with PCM audio """ if not self._session: raise RuntimeError("TTS service not connected") if not text.strip(): return self._cancel_event.clear() payload = { "model": self.model, "input": text, "voice": self.voice, "response_format": "pcm", "sample_rate": self.sample_rate, "stream": True, "speed": self.speed } try: async with self._session.post(self.api_url, json=payload) as response: if response.status != 200: error_text = await response.text() logger.error(f"SiliconFlow TTS error: {response.status} - {error_text}") return # Stream audio chunks chunk_size = self.sample_rate * 2 // 10 # 100ms chunks buffer = b"" async for chunk in response.content.iter_any(): if self._cancel_event.is_set(): logger.info("TTS synthesis cancelled") return buffer += chunk # Yield complete chunks while len(buffer) >= chunk_size: audio_chunk = buffer[:chunk_size] buffer = buffer[chunk_size:] yield TTSChunk( audio=audio_chunk, sample_rate=self.sample_rate, is_final=False ) # Yield remaining buffer if buffer: yield TTSChunk( audio=buffer, sample_rate=self.sample_rate, is_final=True ) except asyncio.CancelledError: logger.info("TTS synthesis cancelled via asyncio") raise except Exception as e: logger.error(f"TTS synthesis error: {e}") raise async def cancel(self) -> None: """Cancel ongoing synthesis.""" self._cancel_event.set() class StreamingTTSAdapter: """ Adapter for streaming LLM text to TTS with sentence-level chunking. This reduces latency by starting TTS as soon as a complete sentence is received from the LLM, rather than waiting for the full response. """ # Sentence delimiters SENTENCE_ENDS = {'.', '!', '?', '。', '!', '?', ';', '\n'} def __init__(self, tts_service: BaseTTSService, transport, session_id: str): self.tts_service = tts_service self.transport = transport self.session_id = session_id self._buffer = "" self._cancel_event = asyncio.Event() self._is_speaking = False async def process_text_chunk(self, text_chunk: str) -> None: """ Process a text chunk from LLM and trigger TTS when sentence is complete. Args: text_chunk: Text chunk from LLM streaming """ if self._cancel_event.is_set(): return self._buffer += text_chunk # Check for sentence completion for i, char in enumerate(self._buffer): if char in self.SENTENCE_ENDS: # Found sentence end, synthesize up to this point sentence = self._buffer[:i+1].strip() self._buffer = self._buffer[i+1:] if sentence: await self._speak_sentence(sentence) break async def flush(self) -> None: """Flush remaining buffer.""" if self._buffer.strip() and not self._cancel_event.is_set(): await self._speak_sentence(self._buffer.strip()) self._buffer = "" async def _speak_sentence(self, text: str) -> None: """Synthesize and send a sentence.""" if not text or self._cancel_event.is_set(): return self._is_speaking = True try: async for chunk in self.tts_service.synthesize_stream(text): if self._cancel_event.is_set(): break await self.transport.send_audio(chunk.audio) await asyncio.sleep(0.01) # Prevent flooding except Exception as e: logger.error(f"TTS speak error: {e}") finally: self._is_speaking = False def cancel(self) -> None: """Cancel ongoing speech.""" self._cancel_event.set() self._buffer = "" def reset(self) -> None: """Reset for new turn.""" self._cancel_event.clear() self._buffer = "" self._is_speaking = False @property def is_speaking(self) -> bool: return self._is_speaking
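

# ---------------------------------------------------------------------------
# Usage sketch (not part of the service API): shows how SiliconFlowTTSService
# and StreamingTTSAdapter are wired together. `FakeTransport` and the token
# list below are hypothetical stand-ins for a real transport (anything with an
# async `send_audio(bytes)` method) and a real LLM token stream. Running this
# requires the SILICONFLOW_API_KEY env var, network access, and services.base.
# ---------------------------------------------------------------------------
if __name__ == "__main__":

    class FakeTransport:
        """Minimal stand-in transport that only counts the PCM bytes it receives."""

        def __init__(self) -> None:
            self.bytes_sent = 0

        async def send_audio(self, audio: bytes) -> None:
            self.bytes_sent += len(audio)

    async def _demo() -> None:
        tts = SiliconFlowTTSService(voice="anna", sample_rate=16000)
        await tts.connect()  # raises ValueError if no API key is configured

        transport = FakeTransport()
        adapter = StreamingTTSAdapter(tts, transport, session_id="demo")

        # Feed the adapter as if an LLM were streaming tokens; synthesis starts
        # as soon as a sentence delimiter is seen.
        for token in ["Hello", " there", ".", " How can I help", "?"]:
            await adapter.process_text_chunk(token)
        await adapter.flush()

        logger.info(f"Sent {transport.bytes_sent} bytes of PCM audio")
        await tts.disconnect()

    asyncio.run(_demo())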