# AI-VideoAssistant/engine/services/siliconflow_tts.py

"""SiliconFlow TTS Service with streaming support.

Uses SiliconFlow's CosyVoice2 or MOSS-TTSD models for low-latency
text-to-speech synthesis with streaming.

API Docs: https://docs.siliconflow.cn/cn/api-reference/audio/create-speech
"""

import os
import asyncio
import aiohttp
from typing import AsyncIterator, Optional
from loguru import logger

from services.base import BaseTTSService, TTSChunk, ServiceState
from services.streaming_tts_adapter import StreamingTTSAdapter  # backward-compatible re-export


class SiliconFlowTTSService(BaseTTSService):
    """
    SiliconFlow TTS service with streaming support.

    Supports CosyVoice2-0.5B and MOSS-TTSD-v0.5 models. Audio is requested
    from the HTTP API as raw PCM and re-chunked into fixed-size TTSChunk
    objects as the response body streams in.
    """

    # Short voice aliases mapped to the "<model>:<voice>" identifiers sent to the API
    VOICES = {
        "alex": "FunAudioLLM/CosyVoice2-0.5B:alex",
        "anna": "FunAudioLLM/CosyVoice2-0.5B:anna",
        "bella": "FunAudioLLM/CosyVoice2-0.5B:bella",
        "benjamin": "FunAudioLLM/CosyVoice2-0.5B:benjamin",
        "charles": "FunAudioLLM/CosyVoice2-0.5B:charles",
        "claire": "FunAudioLLM/CosyVoice2-0.5B:claire",
        "david": "FunAudioLLM/CosyVoice2-0.5B:david",
        "diana": "FunAudioLLM/CosyVoice2-0.5B:diana",
    }

    def __init__(
        self,
        api_key: Optional[str] = None,
        voice: str = "anna",
        model: str = "FunAudioLLM/CosyVoice2-0.5B",
        sample_rate: int = 16000,
        speed: float = 1.0
    ):
        """
        Initialize SiliconFlow TTS service.

        Args:
            api_key: SiliconFlow API key (defaults to SILICONFLOW_API_KEY env var)
            voice: Voice name (alex, anna, bella, benjamin, charles, claire, david, diana);
                any other value is passed to the API unchanged
            model: Model name
            sample_rate: Output sample rate (8000, 16000, 24000, 32000, 44100)
            speed: Speech speed (0.25 to 4.0)
        """
        # Resolve a short alias to its full identifier; unknown names pass through as-is
        full_voice = self.VOICES.get(voice, voice)

        super().__init__(voice=full_voice, sample_rate=sample_rate, speed=speed)

        self.api_key = api_key or os.getenv("SILICONFLOW_API_KEY")
        self.model = model
        self.api_url = "https://api.siliconflow.cn/v1/audio/speech"

        self._session: Optional[aiohttp.ClientSession] = None
        self._cancel_event = asyncio.Event()

    async def connect(self) -> None:
        """
        Initialize HTTP session.

        Raises:
            ValueError: If no API key was passed in or found in the environment
        """
        if not self.api_key:
            raise ValueError("SiliconFlow API key not provided. Set SILICONFLOW_API_KEY env var.")

        # Close a session left over from an earlier connect() so that
        # reconnecting does not leak the previous connection pool.
        if self._session is not None:
            await self._session.close()

        self._session = aiohttp.ClientSession(
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json"
            }
        )
        self.state = ServiceState.CONNECTED
        logger.info(f"SiliconFlow TTS service ready: voice={self.voice}, model={self.model}")

    async def disconnect(self) -> None:
        """Close HTTP session."""
        if self._session:
            await self._session.close()
            self._session = None
        self.state = ServiceState.DISCONNECTED
        logger.info("SiliconFlow TTS service disconnected")

    async def synthesize(self, text: str) -> bytes:
        """
        Synthesize complete audio for text.

        Args:
            text: Text to synthesize

        Returns:
            All streamed audio concatenated (empty if nothing was produced)
        """
        # Collect the pieces and join once instead of growing a bytes object
        pieces = [chunk.audio async for chunk in self.synthesize_stream(text)]
        return b"".join(pieces)

    async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
        """
        Synthesize audio in streaming mode.

        A non-200 response is logged and ends the stream without yielding
        anything; setting the cancel event via cancel() ends it early.

        Args:
            text: Text to synthesize

        Yields:
            TTSChunk objects with PCM audio; the last one has is_final=True

        Raises:
            RuntimeError: If connect() has not been called
        """
        if not self._session:
            raise RuntimeError("TTS service not connected")

        if not text.strip():
            return

        self._cancel_event.clear()

        payload = {
            "model": self.model,
            "input": text,
            "voice": self.voice,
            "response_format": "pcm",
            "sample_rate": self.sample_rate,
            "stream": True,
            "speed": self.speed
        }

        try:
            async with self._session.post(self.api_url, json=payload) as response:
                if response.status != 200:
                    error_text = await response.text()
                    logger.error(f"SiliconFlow TTS error: {response.status} - {error_text}")
                    return

                # ~100ms chunks, assuming 2 bytes per sample (16-bit mono PCM).
                # Computed per-sample so a chunk never splits a sample, with a
                # floor of one sample so the slicing loop below always advances.
                chunk_size = max(2, (self.sample_rate // 10) * 2)
                buffer = bytearray()
                pending_chunk: Optional[bytes] = None

                async for data in response.content.iter_any():
                    if self._cancel_event.is_set():
                        logger.info("TTS synthesis cancelled")
                        return

                    buffer += data

                    # Yield complete chunks
                    while len(buffer) >= chunk_size:
                        audio_chunk = bytes(buffer[:chunk_size])
                        del buffer[:chunk_size]

                        # Keep one full chunk buffered so we can always tag the true
                        # last full chunk as final when stream length is an exact multiple.
                        if pending_chunk is not None:
                            yield TTSChunk(
                                audio=pending_chunk,
                                sample_rate=self.sample_rate,
                                is_final=False
                            )
                        pending_chunk = audio_chunk

                # Flush the held-back chunk; it is the final one only when
                # no partial tail remains after it.
                if pending_chunk is not None:
                    yield TTSChunk(
                        audio=pending_chunk,
                        sample_rate=self.sample_rate,
                        is_final=not buffer
                    )

                # Remaining tail shorter than one full chunk
                if buffer:
                    yield TTSChunk(
                        audio=bytes(buffer),
                        sample_rate=self.sample_rate,
                        is_final=True
                    )

        except asyncio.CancelledError:
            logger.info("TTS synthesis cancelled via asyncio")
            raise
        except Exception as e:
            logger.error(f"TTS synthesis error: {e}")
            raise

    async def cancel(self) -> None:
        """Cancel ongoing synthesis (checked between received network chunks)."""
        self._cancel_event.set()


class StreamingTTSAdapter:
    """
    Adapter for streaming LLM text to TTS with sentence-level chunking.

    This reduces latency by starting TTS as soon as a complete sentence
    is received from the LLM, rather than waiting for the full response.
    """

    # NOTE(review): this definition shadows the StreamingTTSAdapter imported
    # from services.streaming_tts_adapter at the top of this file - confirm
    # which implementation is meant to be exported.

    # Sentence delimiters
    SENTENCE_ENDS = {'，', '。', '！', '？', '.', '!', '?', '\n'}

    def __init__(self, tts_service: BaseTTSService, transport, session_id: str):
        """
        Initialize the adapter.

        Args:
            tts_service: TTS service whose synthesize_stream() produces audio chunks
            transport: Object with an async send_audio(bytes) method
            session_id: Identifier of the session this adapter belongs to
        """
        self.tts_service = tts_service
        self.transport = transport
        self.session_id = session_id
        self._buffer = ""
        self._cancel_event = asyncio.Event()
        self._is_speaking = False

    def _is_non_sentence_period(self, text: str, idx: int) -> bool:
        """Check whether '.' should NOT be treated as a sentence delimiter."""
        if text[idx] != ".":
            return False

        # Decimal/version segment: 1.2, v1.2.3
        if 0 < idx < len(text) - 1 and text[idx - 1].isdigit() and text[idx + 1].isdigit():
            return True

        # Number abbreviations: No.1 / No. 1
        left_start = idx - 1
        while left_start >= 0 and text[left_start].isalpha():
            left_start -= 1
        left_token = text[left_start + 1:idx].lower()
        if left_token == "no":
            j = idx + 1
            while j < len(text) and text[j].isspace():
                j += 1
            if j < len(text) and text[j].isdigit():
                return True

        return False

    def _is_undecided_period(self, text: str, idx: int) -> bool:
        """
        Check whether '.' cannot be classified until more text arrives.

        LLM chunks can end in the middle of "3.14" or "No. 1"; deciding at
        that point would cut the number in half. Such a period is left in
        the buffer and re-examined on the next chunk (or spoken by flush()).
        """
        if text[idx] != ".":
            return False

        # "3." at the very end of the buffer: the next chunk may start with a digit
        if idx == len(text) - 1 and idx > 0 and text[idx - 1].isdigit():
            return True

        # "No." followed by nothing but whitespace so far: a digit may still follow
        left_start = idx - 1
        while left_start >= 0 and text[left_start].isalpha():
            left_start -= 1
        if text[left_start + 1:idx].lower() == "no":
            return all(ch.isspace() for ch in text[idx + 1:])

        return False

    async def process_text_chunk(self, text_chunk: str) -> None:
        """
        Process a text chunk from LLM and trigger TTS when sentence is complete.

        Args:
            text_chunk: Text chunk from LLM streaming
        """
        if self._cancel_event.is_set():
            return

        self._buffer += text_chunk

        # Speak every complete sentence currently in the buffer
        while True:
            split_idx = -1
            for i, char in enumerate(self._buffer):
                if char not in self.SENTENCE_ENDS:
                    continue
                # Skip periods that are part of a number/abbreviation, and
                # periods whose role is not knowable yet (chunk boundary).
                if char == "." and (
                    self._is_non_sentence_period(self._buffer, i)
                    or self._is_undecided_period(self._buffer, i)
                ):
                    continue
                split_idx = i
                break
            if split_idx < 0:
                break

            # Swallow a run of consecutive delimiters ("?!", ".\n") into this sentence
            end_idx = split_idx + 1
            while end_idx < len(self._buffer) and self._buffer[end_idx] in self.SENTENCE_ENDS:
                end_idx += 1

            sentence = self._buffer[:end_idx].strip()
            self._buffer = self._buffer[end_idx:]

            # Punctuation-only fragments carry nothing to pronounce
            if sentence and any(ch.isalnum() for ch in sentence):
                await self._speak_sentence(sentence)

    async def flush(self) -> None:
        """Speak whatever text is still buffered, then clear the buffer."""
        remaining = self._buffer.strip()
        # Same filter as process_text_chunk: skip punctuation-only leftovers
        if (
            remaining
            and any(ch.isalnum() for ch in remaining)
            and not self._cancel_event.is_set()
        ):
            await self._speak_sentence(remaining)
        self._buffer = ""

    async def _speak_sentence(self, text: str) -> None:
        """
        Synthesize and send a sentence.

        Errors from the TTS service or transport are logged, not raised.
        """
        if not text or self._cancel_event.is_set():
            return

        self._is_speaking = True

        try:
            async for chunk in self.tts_service.synthesize_stream(text):
                if self._cancel_event.is_set():
                    break
                await self.transport.send_audio(chunk.audio)
                await asyncio.sleep(0.01)  # Prevent flooding
        except Exception as e:
            logger.error(f"TTS speak error: {e}")
        finally:
            self._is_speaking = False

    def cancel(self) -> None:
        """Cancel ongoing speech and drop buffered text."""
        self._cancel_event.set()
        self._buffer = ""

    def reset(self) -> None:
        """Reset for new turn."""
        self._cancel_event.clear()
        self._buffer = ""
        self._is_speaking = False

    @property
    def is_speaking(self) -> bool:
        """Whether a sentence is currently being synthesized and sent."""
        return self._is_speaking