Use openai compatible as vendor

2026-02-12 18:44:55 +08:00
parent 260ff621bf
commit ff3a03b1ad
23 changed files with 822 additions and 905 deletions
--- a/engine/services/init.py
+++ b/engine/services/init.py
@@ -15,8 +15,8 @@ from services.base import (
 from services.llm import OpenAILLMService, MockLLMService
 from services.tts import EdgeTTSService, MockTTSService
 from services.asr import BufferedASRService, MockASRService
-from services.siliconflow_asr import SiliconFlowASRService
-from services.siliconflow_tts import SiliconFlowTTSService
+from services.openai_compatible_asr import OpenAICompatibleASRService, SiliconFlowASRService
+from services.openai_compatible_tts import OpenAICompatibleTTSService, SiliconFlowTTSService
 from services.streaming_tts_adapter import StreamingTTSAdapter
 from services.realtime import RealtimeService, RealtimeConfig, RealtimePipeline

@@ -38,8 +38,10 @@ __all__ = [
    # ASR
    "BufferedASRService",
    "MockASRService",
+    "OpenAICompatibleASRService",
    "SiliconFlowASRService",
    # TTS (SiliconFlow)
+    "OpenAICompatibleTTSService",
    "SiliconFlowTTSService",
    "StreamingTTSAdapter",
    # Realtime
--- a/engine/services/openai_compatible_asr.py
+++ b/engine/services/openai_compatible_asr.py
@@ -0,0 +1,321 @@
+"""OpenAI-compatible ASR (Automatic Speech Recognition) Service.
+
+Uses the SiliconFlow API for speech-to-text transcription.
+API: https://docs.siliconflow.cn/cn/api-reference/audio/create-audio-transcriptions
+"""
+
+import asyncio
+import io
+import wave
+from typing import AsyncIterator, Optional, Callable, Awaitable
+from loguru import logger
+
+try:
+    import aiohttp
+    AIOHTTP_AVAILABLE = True
+except ImportError:
+    AIOHTTP_AVAILABLE = False
+    logger.warning("aiohttp not available - OpenAICompatibleASRService will not work")
+
+from services.base import BaseASRService, ASRResult, ServiceState
+
+
+class OpenAICompatibleASRService(BaseASRService):
+    """
+    Buffered speech-to-text client for an OpenAI-style transcription endpoint.
+    
+    Features:
+    - Accumulates incoming PCM chunks in an in-memory buffer
+    - Optional background task re-transcribes the buffer for interim results
+    - Final transcription of the whole buffer on EOU
+    
+    API Details:
+    - Endpoint: POST API_URL (hard-coded to the SiliconFlow host below)
+    - Models: FunAudioLLM/SenseVoiceSmall (default), TeleAI/TeleSpeechASR
+    - Input: WAV file built from the buffer (multipart/form-data)
+    - Output: JSON object whose "text" field holds the transcript
+    """
+    
+    # Short aliases accepted by __init__ -> full provider model ids
+    MODELS = {
+        "sensevoice": "FunAudioLLM/SenseVoiceSmall",
+        "telespeech": "TeleAI/TeleSpeechASR",
+    }
+    
+    API_URL = "https://api.siliconflow.cn/v1/audio/transcriptions"
+    
+    def __init__(
+        self,
+        api_key: str,
+        model: str = "FunAudioLLM/SenseVoiceSmall",
+        sample_rate: int = 16000,
+        language: str = "auto",
+        interim_interval_ms: int = 500,  # How often to send interim results
+        min_audio_for_interim_ms: int = 300,  # Min audio before first interim
+        on_transcript: Optional[Callable[[str, bool], Awaitable[None]]] = None
+    ):
+        """
+        Store settings and create empty buffers; raises RuntimeError without aiohttp.
+        
+        Args:
+            api_key: Provider API key, sent as a Bearer token once connected
+            model: Full model id, or a MODELS alias (matched case-insensitively)
+            sample_rate: PCM sample rate in Hz, used for WAV headers and durations
+            language: Passed to the base class; not added to the request form here
+            interim_interval_ms: Sleep between interim transcription attempts
+            min_audio_for_interim_ms: Buffered audio required before an interim request
+            on_transcript: Optional async callback awaited with (text, is_final)
+        """
+        super().__init__(sample_rate=sample_rate, language=language)
+        
+        if not AIOHTTP_AVAILABLE:
+            raise RuntimeError("aiohttp is required for OpenAICompatibleASRService")
+        
+        self.api_key = api_key
+        self.model = self.MODELS.get(model.lower(), model)
+        self.interim_interval_ms = interim_interval_ms
+        self.min_audio_for_interim_ms = min_audio_for_interim_ms
+        self.on_transcript = on_transcript
+        
+        # HTTP session, created in connect() and closed in disconnect()
+        self._session: Optional[aiohttp.ClientSession] = None
+        
+        # Raw PCM accumulated by send_audio(), plus the latest transcript text
+        self._audio_buffer: bytes = b""
+        self._current_text: str = ""
+        self._last_interim_time: float = 0
+        
+        # Results published by transcribe_buffer(), drained by receive_transcripts()
+        self._transcript_queue: asyncio.Queue[ASRResult] = asyncio.Queue()
+        
+        # Background task running _interim_loop(), and the flag that keeps it alive
+        self._interim_task: Optional[asyncio.Task] = None
+        self._running = False
+        
+        logger.info(f"OpenAICompatibleASRService initialized with model: {self.model}")
+    
+    async def connect(self) -> None:
+        """Open the authenticated HTTP session and mark the service as running."""
+        self._session = aiohttp.ClientSession(
+            headers={
+                "Authorization": f"Bearer {self.api_key}"
+            }
+        )
+        self._running = True
+        self.state = ServiceState.CONNECTED
+        logger.info("OpenAICompatibleASRService connected")
+    
+    async def disconnect(self) -> None:
+        """Cancel the interim task, close the session, and drop buffered audio/text."""
+        self._running = False
+        
+        if self._interim_task:
+            self._interim_task.cancel()
+            try:
+                await self._interim_task
+            except asyncio.CancelledError:
+                pass
+            self._interim_task = None
+        
+        if self._session:
+            await self._session.close()
+            self._session = None
+        
+        self._audio_buffer = b""
+        self._current_text = ""
+        self.state = ServiceState.DISCONNECTED
+        logger.info("OpenAICompatibleASRService disconnected")
+    
+    async def send_audio(self, audio: bytes) -> None:
+        """
+        Append a PCM chunk to the in-memory buffer; nothing is sent to the API here.
+        
+        Args:
+            audio: PCM audio data (16-bit, mono)
+        """
+        self._audio_buffer += audio
+    
+    async def transcribe_buffer(self, is_final: bool = False) -> Optional[str]:
+        """
+        Wrap the whole buffer as WAV, POST it to API_URL, and publish the text.
+        
+        Args:
+            is_final: Flag forwarded to the callback and the queued ASRResult
+            
+        Returns:
+            Stripped text, or None when skipped, empty, or on any error (logged)
+        """
+        if not self._session:
+            logger.warning("ASR session not connected")
+            return None
+        
+        # Duration from byte count: 2 bytes per 16-bit mono sample
+        audio_duration_ms = len(self._audio_buffer) / (self.sample_rate * 2) * 1000
+        
+        if not is_final and audio_duration_ms < self.min_audio_for_interim_ms:
+            return None
+        
+        if audio_duration_ms < 100:  # Less than 100ms - too short
+            return None
+        
+        try:
+            # Convert PCM to WAV in memory
+            wav_buffer = io.BytesIO()
+            with wave.open(wav_buffer, 'wb') as wav_file:
+                wav_file.setnchannels(1)
+                wav_file.setsampwidth(2)  # 16-bit
+                wav_file.setframerate(self.sample_rate)
+                wav_file.writeframes(self._audio_buffer)
+            
+            wav_buffer.seek(0)
+            wav_data = wav_buffer.read()
+            
+            # Multipart upload: the WAV file plus the model id
+            form_data = aiohttp.FormData()
+            form_data.add_field(
+                'file',
+                wav_data,
+                filename='audio.wav',
+                content_type='audio/wav'
+            )
+            form_data.add_field('model', self.model)
+            
+            async with self._session.post(self.API_URL, data=form_data) as response:
+                if response.status == 200:
+                    result = await response.json()
+                    text = result.get("text", "").strip()
+                    
+                    if text:
+                        self._current_text = text
+                        
+                        # Notify via callback before the result is queued
+                        if self.on_transcript:
+                            await self.on_transcript(text, is_final)
+                        
+                        # Queue result for receive_transcripts()
+                        await self._transcript_queue.put(
+                            ASRResult(text=text, is_final=is_final)
+                        )
+                        
+                        logger.debug(f"ASR {'final' if is_final else 'interim'}: {text[:50]}...")
+                        return text
+                else:
+                    error_text = await response.text()
+                    logger.error(f"ASR API error {response.status}: {error_text}")
+                    return None
+                    
+        except Exception as e:
+            logger.error(f"ASR transcription error: {e}")
+            return None
+    
+    async def get_final_transcription(self) -> str:
+        """
+        Transcribe the buffer as final, then reset audio and text state.
+        
+        Call this when EOU is detected.
+        
+        Returns:
+            New transcript, else the last stored text (may be an empty string)
+        """
+        # Transcribe full buffer as final
+        text = await self.transcribe_buffer(is_final=True)
+        
+        # Fall back to the last stored text when the final request yields nothing
+        result = text or self._current_text
+        self._audio_buffer = b""
+        self._current_text = ""
+        
+        return result
+    
+    def get_and_clear_text(self) -> str:
+        """
+        Return the last stored transcript and reset both text and audio buffers.
+        
+        Compatible with BufferedASRService interface.
+        """
+        text = self._current_text
+        self._current_text = ""
+        self._audio_buffer = b""
+        return text
+    
+    def get_audio_buffer(self) -> bytes:
+        """Return the raw PCM bytes buffered so far (the buffer is left intact)."""
+        return self._audio_buffer
+    
+    def get_audio_duration_ms(self) -> float:
+        """Return the buffered duration in ms, assuming 2 bytes per sample."""
+        return len(self._audio_buffer) / (self.sample_rate * 2) * 1000
+    
+    def clear_buffer(self) -> None:
+        """Discard buffered audio and the last stored transcript."""
+        self._audio_buffer = b""
+        self._current_text = ""
+    
+    async def receive_transcripts(self) -> AsyncIterator[ASRResult]:
+        """
+        Yield queued results while the service is running (polls every 0.1 s).
+        
+        Yields:
+            ASRResult with text and is_final flag
+        """
+        while self._running:
+            try:
+                result = await asyncio.wait_for(
+                    self._transcript_queue.get(),
+                    timeout=0.1
+                )
+                yield result
+            except asyncio.TimeoutError:
+                continue
+            except asyncio.CancelledError:
+                break
+    
+    async def start_interim_transcription(self) -> None:
+        """
+        Launch the interim loop as a background task unless one is still active.
+        
+        The loop periodically re-transcribes the buffered audio so callers
+        can surface partial text before the final result.
+        """
+        if self._interim_task and not self._interim_task.done():
+            return
+        
+        self._interim_task = asyncio.create_task(self._interim_loop())
+    
+    async def stop_interim_transcription(self) -> None:
+        """Cancel the interim task, if any, and wait for it to finish."""
+        if self._interim_task:
+            self._interim_task.cancel()
+            try:
+                await self._interim_task
+            except asyncio.CancelledError:
+                pass
+            self._interim_task = None
+    
+    async def _interim_loop(self) -> None:
+        """Sleep, then request an interim transcript once enough audio is buffered."""
+        import time
+        
+        while self._running:
+            try:
+                await asyncio.sleep(self.interim_interval_ms / 1000)
+                
+                # Throttle on wall-clock time since the last interim request
+                current_time = time.time()
+                time_since_last = (current_time - self._last_interim_time) * 1000
+                
+                if time_since_last >= self.interim_interval_ms:
+                    audio_duration = self.get_audio_duration_ms()
+                    
+                    if audio_duration >= self.min_audio_for_interim_ms:
+                        await self.transcribe_buffer(is_final=False)
+                        self._last_interim_time = current_time
+                        
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                logger.error(f"Interim transcription error: {e}")
+
+
+# Backward-compatible alias
+SiliconFlowASRService = OpenAICompatibleASRService
--- a/engine/services/openai_compatible_tts.py
+++ b/engine/services/openai_compatible_tts.py
@@ -0,0 +1,315 @@
+"""OpenAI-compatible TTS Service with streaming support.
+
+Uses SiliconFlow's CosyVoice2 or MOSS-TTSD models for low-latency
+text-to-speech synthesis with streaming.
+
+API Docs: https://docs.siliconflow.cn/cn/api-reference/audio/create-speech
+"""
+
+import os
+import asyncio
+import aiohttp
+from typing import AsyncIterator, Optional
+from loguru import logger
+
+from services.base import BaseTTSService, TTSChunk, ServiceState
+from services.streaming_tts_adapter import StreamingTTSAdapter  # backward-compatible re-export
+
+
+class OpenAICompatibleTTSService(BaseTTSService):
+    """
+    Streaming text-to-speech client that posts JSON to a speech endpoint.
+    
+    Supports CosyVoice2-0.5B and MOSS-TTSD-v0.5 models.
+    """
+    
+    # Short voice names -> full "model:voice" identifiers
+    VOICES = {
+        "alex": "FunAudioLLM/CosyVoice2-0.5B:alex",
+        "anna": "FunAudioLLM/CosyVoice2-0.5B:anna",
+        "bella": "FunAudioLLM/CosyVoice2-0.5B:bella",
+        "benjamin": "FunAudioLLM/CosyVoice2-0.5B:benjamin",
+        "charles": "FunAudioLLM/CosyVoice2-0.5B:charles",
+        "claire": "FunAudioLLM/CosyVoice2-0.5B:claire",
+        "david": "FunAudioLLM/CosyVoice2-0.5B:david",
+        "diana": "FunAudioLLM/CosyVoice2-0.5B:diana",
+    }
+    
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        voice: str = "anna",
+        model: str = "FunAudioLLM/CosyVoice2-0.5B",
+        sample_rate: int = 16000,
+        speed: float = 1.0
+    ):
+        """
+        Resolve the voice and store settings; the HTTP session is opened in connect().
+        
+        Args:
+            api_key: Provider API key (defaults to SILICONFLOW_API_KEY env var)
+            voice: Key of VOICES, or any other string, which is used unchanged
+            model: Model name sent in the request payload
+            sample_rate: Output sample rate (8000, 16000, 24000, 32000, 44100)
+            speed: Speech speed (0.25 to 4.0)
+        """
+        # Expand a short voice name; unknown names pass through as-is
+        if voice in self.VOICES:
+            full_voice = self.VOICES[voice]
+        else:
+            full_voice = voice
+            
+        super().__init__(voice=full_voice, sample_rate=sample_rate, speed=speed)
+        
+        self.api_key = api_key or os.getenv("SILICONFLOW_API_KEY")
+        self.model = model
+        self.api_url = "https://api.siliconflow.cn/v1/audio/speech"
+        
+        self._session: Optional[aiohttp.ClientSession] = None
+        self._cancel_event = asyncio.Event()
+    
+    async def connect(self) -> None:
+        """Open the authenticated session; raises ValueError when no API key is set."""
+        if not self.api_key:
+            raise ValueError("SiliconFlow API key not provided. Set SILICONFLOW_API_KEY env var.")
+        
+        self._session = aiohttp.ClientSession(
+            headers={
+                "Authorization": f"Bearer {self.api_key}",
+                "Content-Type": "application/json"
+            }
+        )
+        self.state = ServiceState.CONNECTED
+        logger.info(f"SiliconFlow TTS service ready: voice={self.voice}, model={self.model}")
+    
+    async def disconnect(self) -> None:
+        """Close the HTTP session if one is open and mark the service disconnected."""
+        if self._session:
+            await self._session.close()
+            self._session = None
+        self.state = ServiceState.DISCONNECTED
+        logger.info("SiliconFlow TTS service disconnected")
+    
+    async def synthesize(self, text: str) -> bytes:
+        """Collect every streamed chunk for text and return the joined audio bytes."""
+        audio_data = b""
+        async for chunk in self.synthesize_stream(text):
+            audio_data += chunk.audio
+        return audio_data
+    
+    async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
+        """
+        Request PCM audio for text and yield it in roughly 100 ms chunks.
+        
+        Args:
+            text: Text to synthesize; blank text yields nothing
+            
+        Yields:
+            TTSChunk objects with PCM audio; only the last has is_final=True
+        """
+        if not self._session:
+            raise RuntimeError("TTS service not connected")
+        
+        if not text.strip():
+            return
+        
+        self._cancel_event.clear()
+        
+        payload = {
+            "model": self.model,
+            "input": text,
+            "voice": self.voice,
+            "response_format": "pcm",
+            "sample_rate": self.sample_rate,
+            "stream": True,
+            "speed": self.speed
+        }
+        
+        try:
+            async with self._session.post(self.api_url, json=payload) as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    logger.error(f"SiliconFlow TTS error: {response.status} - {error_text}")
+                    return
+                
+                # Re-slice the network stream into fixed-size audio chunks
+                chunk_size = self.sample_rate * 2 // 10  # 100ms chunks
+                buffer = b""
+                pending_chunk = None
+                
+                async for chunk in response.content.iter_any():
+                    if self._cancel_event.is_set():
+                        logger.info("TTS synthesis cancelled")
+                        return
+                    
+                    buffer += chunk
+                    
+                    # Yield complete chunks
+                    while len(buffer) >= chunk_size:
+                        audio_chunk = buffer[:chunk_size]
+                        buffer = buffer[chunk_size:]
+
+                        # Keep one full chunk buffered so we can always tag the true
+                        # last full chunk as final when stream length is an exact multiple.
+                        if pending_chunk is not None:
+                            yield TTSChunk(
+                                audio=pending_chunk,
+                                sample_rate=self.sample_rate,
+                                is_final=False
+                            )
+                        pending_chunk = audio_chunk
+                
+                # Flush pending chunk(s) and remaining tail.
+                if pending_chunk is not None:
+                    if buffer:
+                        yield TTSChunk(
+                            audio=pending_chunk,
+                            sample_rate=self.sample_rate,
+                            is_final=False
+                        )
+                        pending_chunk = None
+                    else:
+                        yield TTSChunk(
+                            audio=pending_chunk,
+                            sample_rate=self.sample_rate,
+                            is_final=True
+                        )
+                        pending_chunk = None
+
+                if buffer:
+                    yield TTSChunk(
+                        audio=buffer,
+                        sample_rate=self.sample_rate,
+                        is_final=True
+                    )
+                    
+        except asyncio.CancelledError:
+            logger.info("TTS synthesis cancelled via asyncio")
+            raise
+        except Exception as e:
+            logger.error(f"TTS synthesis error: {e}")
+            raise
+    
+    async def cancel(self) -> None:
+        """Set the cancel flag that synthesize_stream checks between network reads."""
+        self._cancel_event.set()
+
+
+class StreamingTTSAdapter:
+    """
+    Adapter for streaming LLM text to TTS with sentence-level chunking.
+    
+    Speech starts as soon as a delimiter-terminated segment is buffered.
+    NOTE(review): rebinds the StreamingTTSAdapter name imported above - confirm.
+    """
+    
+    # Characters that end a segment (includes the full-width comma)
+    SENTENCE_ENDS = {'，', '。', '！', '？', '.', '!', '?', '\n'}
+    
+    def __init__(self, tts_service: BaseTTSService, transport, session_id: str):
+        self.tts_service = tts_service
+        self.transport = transport
+        self.session_id = session_id
+        self._buffer = ""
+        self._cancel_event = asyncio.Event()
+        self._is_speaking = False
+
+    def _is_non_sentence_period(self, text: str, idx: int) -> bool:
+        """Return True when the '.' at idx belongs to a number or a 'No.' prefix."""
+        if text[idx] != ".":
+            return False
+
+        # Decimal/version segment: 1.2, v1.2.3
+        if idx > 0 and idx < len(text) - 1 and text[idx - 1].isdigit() and text[idx + 1].isdigit():
+            return True
+
+        # Number abbreviations: No.1 / No. 1
+        left_start = idx - 1
+        while left_start >= 0 and text[left_start].isalpha():
+            left_start -= 1
+        left_token = text[left_start + 1:idx].lower()
+        if left_token == "no":
+            j = idx + 1
+            while j < len(text) and text[j].isspace():
+                j += 1
+            if j < len(text) and text[j].isdigit():
+                return True
+
+        return False
+    
+    async def process_text_chunk(self, text_chunk: str) -> None:
+        """
+        Buffer LLM text and speak each delimiter-terminated segment in order.
+        
+        Args:
+            text_chunk: Text chunk from LLM streaming
+        """
+        if self._cancel_event.is_set():
+            return
+        
+        self._buffer += text_chunk
+        
+        # Repeatedly cut the buffer at the first real delimiter
+        while True:
+            split_idx = -1
+            for i, char in enumerate(self._buffer):
+                if char == "." and self._is_non_sentence_period(self._buffer, i):
+                    continue
+                if char in self.SENTENCE_ENDS:
+                    split_idx = i
+                    break
+            if split_idx < 0:
+                break
+
+            end_idx = split_idx + 1
+            while end_idx < len(self._buffer) and self._buffer[end_idx] in self.SENTENCE_ENDS:
+                end_idx += 1
+
+            sentence = self._buffer[:end_idx].strip()
+            self._buffer = self._buffer[end_idx:]
+
+            if sentence and any(ch.isalnum() for ch in sentence):
+                await self._speak_sentence(sentence)
+    
+    async def flush(self) -> None:
+        """Speak whatever text is still buffered, then empty the buffer."""
+        if self._buffer.strip() and not self._cancel_event.is_set():
+            await self._speak_sentence(self._buffer.strip())
+        self._buffer = ""
+    
+    async def _speak_sentence(self, text: str) -> None:
+        """Stream TTS audio for text to the transport until done or cancelled."""
+        if not text or self._cancel_event.is_set():
+            return
+        
+        self._is_speaking = True
+        
+        try:
+            async for chunk in self.tts_service.synthesize_stream(text):
+                if self._cancel_event.is_set():
+                    break
+                await self.transport.send_audio(chunk.audio)
+                await asyncio.sleep(0.01)  # Prevent flooding
+        except Exception as e:
+            logger.error(f"TTS speak error: {e}")
+        finally:
+            self._is_speaking = False
+    
+    def cancel(self) -> None:
+        """Set the cancel flag and discard buffered text."""
+        self._cancel_event.set()
+        self._buffer = ""
+    
+    def reset(self) -> None:
+        """Clear the cancel flag, buffer, and speaking state for a new turn."""
+        self._cancel_event.clear()
+        self._buffer = ""
+        self._is_speaking = False
+    
+    @property
+    def is_speaking(self) -> bool:
+        return self._is_speaking
+
+
+# Backward-compatible alias
+SiliconFlowTTSService = OpenAICompatibleTTSService
--- a/engine/services/siliconflow_asr.py
+++ b/engine/services/siliconflow_asr.py
@@ -1,317 +1,8 @@
-"""SiliconFlow ASR (Automatic Speech Recognition) Service.
+"""Backward-compatible imports for legacy siliconflow_asr module."""

-Uses the SiliconFlow API for speech-to-text transcription.
-API: https://docs.siliconflow.cn/cn/api-reference/audio/create-audio-transcriptions
-"""
+from services.openai_compatible_asr import OpenAICompatibleASRService

-import asyncio
-import io
-import wave
-from typing import AsyncIterator, Optional, Callable, Awaitable
-from loguru import logger
+# Backward-compatible alias
+SiliconFlowASRService = OpenAICompatibleASRService

-try:
-    import aiohttp
-    AIOHTTP_AVAILABLE = True
-except ImportError:
-    AIOHTTP_AVAILABLE = False
-    logger.warning("aiohttp not available - SiliconFlowASRService will not work")
-
-from services.base import BaseASRService, ASRResult, ServiceState
-
-
-class SiliconFlowASRService(BaseASRService):
-    """
-    SiliconFlow ASR service for speech-to-text transcription.
-    
-    Features:
-    - Buffers incoming audio chunks
-    - Provides interim transcriptions periodically (for streaming to client)
-    - Final transcription on EOU
-    
-    API Details:
-    - Endpoint: POST https://api.siliconflow.cn/v1/audio/transcriptions
-    - Models: FunAudioLLM/SenseVoiceSmall (default), TeleAI/TeleSpeechASR
-    - Input: Audio file (multipart/form-data)
-    - Output: {"text": "transcribed text"}
-    """
-    
-    # Supported models
-    MODELS = {
-        "sensevoice": "FunAudioLLM/SenseVoiceSmall",
-        "telespeech": "TeleAI/TeleSpeechASR",
-    }
-    
-    API_URL = "https://api.siliconflow.cn/v1/audio/transcriptions"
-    
-    def __init__(
-        self,
-        api_key: str,
-        model: str = "FunAudioLLM/SenseVoiceSmall",
-        sample_rate: int = 16000,
-        language: str = "auto",
-        interim_interval_ms: int = 500,  # How often to send interim results
-        min_audio_for_interim_ms: int = 300,  # Min audio before first interim
-        on_transcript: Optional[Callable[[str, bool], Awaitable[None]]] = None
-    ):
-        """
-        Initialize SiliconFlow ASR service.
-        
-        Args:
-            api_key: SiliconFlow API key
-            model: ASR model name or alias
-            sample_rate: Audio sample rate (16000 recommended)
-            language: Language code (auto for automatic detection)
-            interim_interval_ms: How often to generate interim transcriptions
-            min_audio_for_interim_ms: Minimum audio duration before first interim
-            on_transcript: Callback for transcription results (text, is_final)
-        """
-        super().__init__(sample_rate=sample_rate, language=language)
-        
-        if not AIOHTTP_AVAILABLE:
-            raise RuntimeError("aiohttp is required for SiliconFlowASRService")
-        
-        self.api_key = api_key
-        self.model = self.MODELS.get(model.lower(), model)
-        self.interim_interval_ms = interim_interval_ms
-        self.min_audio_for_interim_ms = min_audio_for_interim_ms
-        self.on_transcript = on_transcript
-        
-        # Session
-        self._session: Optional[aiohttp.ClientSession] = None
-        
-        # Audio buffer
-        self._audio_buffer: bytes = b""
-        self._current_text: str = ""
-        self._last_interim_time: float = 0
-        
-        # Transcript queue for async iteration
-        self._transcript_queue: asyncio.Queue[ASRResult] = asyncio.Queue()
-        
-        # Background task for interim results
-        self._interim_task: Optional[asyncio.Task] = None
-        self._running = False
-        
-        logger.info(f"SiliconFlowASRService initialized with model: {self.model}")
-    
-    async def connect(self) -> None:
-        """Connect to the service."""
-        self._session = aiohttp.ClientSession(
-            headers={
-                "Authorization": f"Bearer {self.api_key}"
-            }
-        )
-        self._running = True
-        self.state = ServiceState.CONNECTED
-        logger.info("SiliconFlowASRService connected")
-    
-    async def disconnect(self) -> None:
-        """Disconnect and cleanup."""
-        self._running = False
-        
-        if self._interim_task:
-            self._interim_task.cancel()
-            try:
-                await self._interim_task
-            except asyncio.CancelledError:
-                pass
-            self._interim_task = None
-        
-        if self._session:
-            await self._session.close()
-            self._session = None
-        
-        self._audio_buffer = b""
-        self._current_text = ""
-        self.state = ServiceState.DISCONNECTED
-        logger.info("SiliconFlowASRService disconnected")
-    
-    async def send_audio(self, audio: bytes) -> None:
-        """
-        Buffer incoming audio data.
-        
-        Args:
-            audio: PCM audio data (16-bit, mono)
-        """
-        self._audio_buffer += audio
-    
-    async def transcribe_buffer(self, is_final: bool = False) -> Optional[str]:
-        """
-        Transcribe current audio buffer.
-        
-        Args:
-            is_final: Whether this is the final transcription
-            
-        Returns:
-            Transcribed text or None if not enough audio
-        """
-        if not self._session:
-            logger.warning("ASR session not connected")
-            return None
-        
-        # Check minimum audio duration
-        audio_duration_ms = len(self._audio_buffer) / (self.sample_rate * 2) * 1000
-        
-        if not is_final and audio_duration_ms < self.min_audio_for_interim_ms:
-            return None
-        
-        if audio_duration_ms < 100:  # Less than 100ms - too short
-            return None
-        
-        try:
-            # Convert PCM to WAV in memory
-            wav_buffer = io.BytesIO()
-            with wave.open(wav_buffer, 'wb') as wav_file:
-                wav_file.setnchannels(1)
-                wav_file.setsampwidth(2)  # 16-bit
-                wav_file.setframerate(self.sample_rate)
-                wav_file.writeframes(self._audio_buffer)
-            
-            wav_buffer.seek(0)
-            wav_data = wav_buffer.read()
-            
-            # Send to API
-            form_data = aiohttp.FormData()
-            form_data.add_field(
-                'file',
-                wav_data,
-                filename='audio.wav',
-                content_type='audio/wav'
-            )
-            form_data.add_field('model', self.model)
-            
-            async with self._session.post(self.API_URL, data=form_data) as response:
-                if response.status == 200:
-                    result = await response.json()
-                    text = result.get("text", "").strip()
-                    
-                    if text:
-                        self._current_text = text
-                        
-                        # Notify via callback
-                        if self.on_transcript:
-                            await self.on_transcript(text, is_final)
-                        
-                        # Queue result
-                        await self._transcript_queue.put(
-                            ASRResult(text=text, is_final=is_final)
-                        )
-                        
-                        logger.debug(f"ASR {'final' if is_final else 'interim'}: {text[:50]}...")
-                        return text
-                else:
-                    error_text = await response.text()
-                    logger.error(f"ASR API error {response.status}: {error_text}")
-                    return None
-                    
-        except Exception as e:
-            logger.error(f"ASR transcription error: {e}")
-            return None
-    
-    async def get_final_transcription(self) -> str:
-        """
-        Get final transcription and clear buffer.
-        
-        Call this when EOU is detected.
-        
-        Returns:
-            Final transcribed text
-        """
-        # Transcribe full buffer as final
-        text = await self.transcribe_buffer(is_final=True)
-        
-        # Clear buffer
-        result = text or self._current_text
-        self._audio_buffer = b""
-        self._current_text = ""
-        
-        return result
-    
-    def get_and_clear_text(self) -> str:
-        """
-        Get accumulated text and clear buffer.
-        
-        Compatible with BufferedASRService interface.
-        """
-        text = self._current_text
-        self._current_text = ""
-        self._audio_buffer = b""
-        return text
-    
-    def get_audio_buffer(self) -> bytes:
-        """Get current audio buffer."""
-        return self._audio_buffer
-    
-    def get_audio_duration_ms(self) -> float:
-        """Get current audio buffer duration in milliseconds."""
-        return len(self._audio_buffer) / (self.sample_rate * 2) * 1000
-    
-    def clear_buffer(self) -> None:
-        """Clear audio and text buffers."""
-        self._audio_buffer = b""
-        self._current_text = ""
-    
-    async def receive_transcripts(self) -> AsyncIterator[ASRResult]:
-        """
-        Async iterator for transcription results.
-        
-        Yields:
-            ASRResult with text and is_final flag
-        """
-        while self._running:
-            try:
-                result = await asyncio.wait_for(
-                    self._transcript_queue.get(),
-                    timeout=0.1
-                )
-                yield result
-            except asyncio.TimeoutError:
-                continue
-            except asyncio.CancelledError:
-                break
-    
-    async def start_interim_transcription(self) -> None:
-        """
-        Start background task for interim transcriptions.
-        
-        This periodically transcribes buffered audio for
-        real-time feedback to the user.
-        """
-        if self._interim_task and not self._interim_task.done():
-            return
-        
-        self._interim_task = asyncio.create_task(self._interim_loop())
-    
-    async def stop_interim_transcription(self) -> None:
-        """Stop interim transcription task."""
-        if self._interim_task:
-            self._interim_task.cancel()
-            try:
-                await self._interim_task
-            except asyncio.CancelledError:
-                pass
-            self._interim_task = None
-    
-    async def _interim_loop(self) -> None:
-        """Background loop for interim transcriptions."""
-        import time
-        
-        while self._running:
-            try:
-                await asyncio.sleep(self.interim_interval_ms / 1000)
-                
-                # Check if we have enough new audio
-                current_time = time.time()
-                time_since_last = (current_time - self._last_interim_time) * 1000
-                
-                if time_since_last >= self.interim_interval_ms:
-                    audio_duration = self.get_audio_duration_ms()
-                    
-                    if audio_duration >= self.min_audio_for_interim_ms:
-                        await self.transcribe_buffer(is_final=False)
-                        self._last_interim_time = current_time
-                        
-            except asyncio.CancelledError:
-                break
-            except Exception as e:
-                logger.error(f"Interim transcription error: {e}")
+__all__ = ["OpenAICompatibleASRService", "SiliconFlowASRService"]
--- a/engine/services/siliconflow_tts.py
+++ b/engine/services/siliconflow_tts.py
@@ -1,311 +1,8 @@
-"""SiliconFlow TTS Service with streaming support.
+"""Backward-compatible imports for legacy siliconflow_tts module."""

-Uses SiliconFlow's CosyVoice2 or MOSS-TTSD models for low-latency
-text-to-speech synthesis with streaming.
+from services.openai_compatible_tts import OpenAICompatibleTTSService, StreamingTTSAdapter

-API Docs: https://docs.siliconflow.cn/cn/api-reference/audio/create-speech
-"""
+# Backward-compatible alias: keeps existing imports of SiliconFlowTTSService working.
+SiliconFlowTTSService = OpenAICompatibleTTSService

-import os
-import asyncio
-import aiohttp
-from typing import AsyncIterator, Optional
-from loguru import logger
-
-from services.base import BaseTTSService, TTSChunk, ServiceState
-from services.streaming_tts_adapter import StreamingTTSAdapter  # backward-compatible re-export
-
-
-class SiliconFlowTTSService(BaseTTSService):
-    """
-    SiliconFlow TTS service with streaming support.
-    
-    Supports CosyVoice2-0.5B and MOSS-TTSD-v0.5 models.
-    """
-    
-    # Available voices
-    VOICES = {
-        "alex": "FunAudioLLM/CosyVoice2-0.5B:alex",
-        "anna": "FunAudioLLM/CosyVoice2-0.5B:anna",
-        "bella": "FunAudioLLM/CosyVoice2-0.5B:bella",
-        "benjamin": "FunAudioLLM/CosyVoice2-0.5B:benjamin",
-        "charles": "FunAudioLLM/CosyVoice2-0.5B:charles",
-        "claire": "FunAudioLLM/CosyVoice2-0.5B:claire",
-        "david": "FunAudioLLM/CosyVoice2-0.5B:david",
-        "diana": "FunAudioLLM/CosyVoice2-0.5B:diana",
-    }
-    
-    def __init__(
-        self,
-        api_key: Optional[str] = None,
-        voice: str = "anna",
-        model: str = "FunAudioLLM/CosyVoice2-0.5B",
-        sample_rate: int = 16000,
-        speed: float = 1.0
-    ):
-        """
-        Initialize SiliconFlow TTS service.
-        
-        Args:
-            api_key: SiliconFlow API key (defaults to SILICONFLOW_API_KEY env var)
-            voice: Voice name (alex, anna, bella, benjamin, charles, claire, david, diana)
-            model: Model name
-            sample_rate: Output sample rate (8000, 16000, 24000, 32000, 44100)
-            speed: Speech speed (0.25 to 4.0)
-        """
-        # Resolve voice name
-        if voice in self.VOICES:
-            full_voice = self.VOICES[voice]
-        else:
-            full_voice = voice
-            
-        super().__init__(voice=full_voice, sample_rate=sample_rate, speed=speed)
-        
-        self.api_key = api_key or os.getenv("SILICONFLOW_API_KEY")
-        self.model = model
-        self.api_url = "https://api.siliconflow.cn/v1/audio/speech"
-        
-        self._session: Optional[aiohttp.ClientSession] = None
-        self._cancel_event = asyncio.Event()
-    
-    async def connect(self) -> None:
-        """Initialize HTTP session."""
-        if not self.api_key:
-            raise ValueError("SiliconFlow API key not provided. Set SILICONFLOW_API_KEY env var.")
-        
-        self._session = aiohttp.ClientSession(
-            headers={
-                "Authorization": f"Bearer {self.api_key}",
-                "Content-Type": "application/json"
-            }
-        )
-        self.state = ServiceState.CONNECTED
-        logger.info(f"SiliconFlow TTS service ready: voice={self.voice}, model={self.model}")
-    
-    async def disconnect(self) -> None:
-        """Close HTTP session."""
-        if self._session:
-            await self._session.close()
-            self._session = None
-        self.state = ServiceState.DISCONNECTED
-        logger.info("SiliconFlow TTS service disconnected")
-    
-    async def synthesize(self, text: str) -> bytes:
-        """Synthesize complete audio for text."""
-        audio_data = b""
-        async for chunk in self.synthesize_stream(text):
-            audio_data += chunk.audio
-        return audio_data
-    
-    async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
-        """
-        Synthesize audio in streaming mode.
-        
-        Args:
-            text: Text to synthesize
-            
-        Yields:
-            TTSChunk objects with PCM audio
-        """
-        if not self._session:
-            raise RuntimeError("TTS service not connected")
-        
-        if not text.strip():
-            return
-        
-        self._cancel_event.clear()
-        
-        payload = {
-            "model": self.model,
-            "input": text,
-            "voice": self.voice,
-            "response_format": "pcm",
-            "sample_rate": self.sample_rate,
-            "stream": True,
-            "speed": self.speed
-        }
-        
-        try:
-            async with self._session.post(self.api_url, json=payload) as response:
-                if response.status != 200:
-                    error_text = await response.text()
-                    logger.error(f"SiliconFlow TTS error: {response.status} - {error_text}")
-                    return
-                
-                # Stream audio chunks
-                chunk_size = self.sample_rate * 2 // 10  # 100ms chunks
-                buffer = b""
-                pending_chunk = None
-                
-                async for chunk in response.content.iter_any():
-                    if self._cancel_event.is_set():
-                        logger.info("TTS synthesis cancelled")
-                        return
-                    
-                    buffer += chunk
-                    
-                    # Yield complete chunks
-                    while len(buffer) >= chunk_size:
-                        audio_chunk = buffer[:chunk_size]
-                        buffer = buffer[chunk_size:]
-
-                        # Keep one full chunk buffered so we can always tag the true
-                        # last full chunk as final when stream length is an exact multiple.
-                        if pending_chunk is not None:
-                            yield TTSChunk(
-                                audio=pending_chunk,
-                                sample_rate=self.sample_rate,
-                                is_final=False
-                            )
-                        pending_chunk = audio_chunk
-                
-                # Flush pending chunk(s) and remaining tail.
-                if pending_chunk is not None:
-                    if buffer:
-                        yield TTSChunk(
-                            audio=pending_chunk,
-                            sample_rate=self.sample_rate,
-                            is_final=False
-                        )
-                        pending_chunk = None
-                    else:
-                        yield TTSChunk(
-                            audio=pending_chunk,
-                            sample_rate=self.sample_rate,
-                            is_final=True
-                        )
-                        pending_chunk = None
-
-                if buffer:
-                    yield TTSChunk(
-                        audio=buffer,
-                        sample_rate=self.sample_rate,
-                        is_final=True
-                    )
-                    
-        except asyncio.CancelledError:
-            logger.info("TTS synthesis cancelled via asyncio")
-            raise
-        except Exception as e:
-            logger.error(f"TTS synthesis error: {e}")
-            raise
-    
-    async def cancel(self) -> None:
-        """Cancel ongoing synthesis."""
-        self._cancel_event.set()
-
-
-class StreamingTTSAdapter:
-    """
-    Adapter for streaming LLM text to TTS with sentence-level chunking.
-    
-    This reduces latency by starting TTS as soon as a complete sentence
-    is received from the LLM, rather than waiting for the full response.
-    """
-    
-    # Sentence delimiters
-    SENTENCE_ENDS = {'，', '。', '！', '？', '.', '!', '?', '\n'}
-    
-    def __init__(self, tts_service: BaseTTSService, transport, session_id: str):
-        self.tts_service = tts_service
-        self.transport = transport
-        self.session_id = session_id
-        self._buffer = ""
-        self._cancel_event = asyncio.Event()
-        self._is_speaking = False
-
-    def _is_non_sentence_period(self, text: str, idx: int) -> bool:
-        """Check whether '.' should NOT be treated as a sentence delimiter."""
-        if text[idx] != ".":
-            return False
-
-        # Decimal/version segment: 1.2, v1.2.3
-        if idx > 0 and idx < len(text) - 1 and text[idx - 1].isdigit() and text[idx + 1].isdigit():
-            return True
-
-        # Number abbreviations: No.1 / No. 1
-        left_start = idx - 1
-        while left_start >= 0 and text[left_start].isalpha():
-            left_start -= 1
-        left_token = text[left_start + 1:idx].lower()
-        if left_token == "no":
-            j = idx + 1
-            while j < len(text) and text[j].isspace():
-                j += 1
-            if j < len(text) and text[j].isdigit():
-                return True
-
-        return False
-    
-    async def process_text_chunk(self, text_chunk: str) -> None:
-        """
-        Process a text chunk from LLM and trigger TTS when sentence is complete.
-        
-        Args:
-            text_chunk: Text chunk from LLM streaming
-        """
-        if self._cancel_event.is_set():
-            return
-        
-        self._buffer += text_chunk
-        
-        # Check for sentence completion
-        while True:
-            split_idx = -1
-            for i, char in enumerate(self._buffer):
-                if char == "." and self._is_non_sentence_period(self._buffer, i):
-                    continue
-                if char in self.SENTENCE_ENDS:
-                    split_idx = i
-                    break
-            if split_idx < 0:
-                break
-
-            end_idx = split_idx + 1
-            while end_idx < len(self._buffer) and self._buffer[end_idx] in self.SENTENCE_ENDS:
-                end_idx += 1
-
-            sentence = self._buffer[:end_idx].strip()
-            self._buffer = self._buffer[end_idx:]
-
-            if sentence and any(ch.isalnum() for ch in sentence):
-                await self._speak_sentence(sentence)
-    
-    async def flush(self) -> None:
-        """Flush remaining buffer."""
-        if self._buffer.strip() and not self._cancel_event.is_set():
-            await self._speak_sentence(self._buffer.strip())
-        self._buffer = ""
-    
-    async def _speak_sentence(self, text: str) -> None:
-        """Synthesize and send a sentence."""
-        if not text or self._cancel_event.is_set():
-            return
-        
-        self._is_speaking = True
-        
-        try:
-            async for chunk in self.tts_service.synthesize_stream(text):
-                if self._cancel_event.is_set():
-                    break
-                await self.transport.send_audio(chunk.audio)
-                await asyncio.sleep(0.01)  # Prevent flooding
-        except Exception as e:
-            logger.error(f"TTS speak error: {e}")
-        finally:
-            self._is_speaking = False
-    
-    def cancel(self) -> None:
-        """Cancel ongoing speech."""
-        self._cancel_event.set()
-        self._buffer = ""
-    
-    def reset(self) -> None:
-        """Reset for new turn."""
-        self._cancel_event.clear()
-        self._buffer = ""
-        self._is_speaking = False
-    
-    @property
-    def is_speaking(self) -> bool:
-        return self._is_speaking
+__all__ = ["OpenAICompatibleTTSService", "SiliconFlowTTSService", "StreamingTTSAdapter"]