Refactor project structure and enhance backend integration

- Expanded package inclusion in `pyproject.toml` to support new modules.
- Introduced new `adapters` and `protocol` packages for better organization.
- Added backend adapter implementations for control plane integration.
- Updated main application imports to reflect new package structure.
- Removed deprecated core components and adjusted documentation accordingly.
- Enhanced architecture documentation to clarify the new runtime and integration layers.
2026-03-06 09:51:56 +08:00
parent 4e2450e800
commit 7e0b777923
75 changed files with 274 additions and 688 deletions
--- a/engine/providers/asr/init.py
+++ b/engine/providers/asr/init.py
@@ -0,0 +1 @@
+"""ASR providers."""
--- a/engine/providers/asr/buffered.py
+++ b/engine/providers/asr/buffered.py
@@ -0,0 +1,147 @@
+"""ASR (Automatic Speech Recognition) Service implementations.
+
+Provides speech-to-text capabilities with streaming support.
+"""
+
+import os
+import asyncio
+import json
+from typing import AsyncIterator, Optional
+from loguru import logger
+
+from providers.common.base import BaseASRService, ASRResult, ServiceState
+
+# Try to import websockets for streaming ASR
+try:
+    import websockets
+    WEBSOCKETS_AVAILABLE = True
+except ImportError:
+    WEBSOCKETS_AVAILABLE = False
+
+
+class BufferedASRService(BaseASRService):
+    """
+    Buffered ASR service that accumulates audio and provides
+    a simple text accumulator for use with EOU detection.
+
+    No recognition happens in this class: ``send_audio`` only appends to
+    an in-memory buffer, and transcript text is supplied from outside
+    through ``set_text``.
+    """
+
+    def __init__(
+        self,
+        sample_rate: int = 16000,
+        language: str = "en"
+    ):
+        """
+        Args:
+            sample_rate: Audio sample rate, forwarded to the base class
+            language: Language code, forwarded to the base class
+        """
+        super().__init__(sample_rate=sample_rate, language=language)
+
+        # Raw audio received so far (appended by send_audio)
+        self._audio_buffer: bytes = b""
+        # Most recent text passed to set_text
+        self._current_text: str = ""
+        # Unbounded queue feeding receive_transcripts
+        self._transcript_queue: asyncio.Queue[ASRResult] = asyncio.Queue()
+
+    async def connect(self) -> None:
+        """No connection needed for buffered ASR; only marks the state."""
+        self.state = ServiceState.CONNECTED
+        logger.info("Buffered ASR service connected")
+
+    async def disconnect(self) -> None:
+        """Clear audio and text buffers and mark the service disconnected."""
+        self._audio_buffer = b""
+        self._current_text = ""
+        self.state = ServiceState.DISCONNECTED
+        logger.info("Buffered ASR service disconnected")
+
+    async def send_audio(self, audio: bytes) -> None:
+        """Append ``audio`` to the buffer for later processing."""
+        self._audio_buffer += audio
+
+    async def receive_transcripts(self) -> AsyncIterator[ASRResult]:
+        """
+        Yield transcription results as they are queued.
+
+        The iterator never finishes on its own; it ends quietly when a
+        CancelledError is raised while it is waiting.
+        """
+        # Block directly on the queue: this class has no stop flag to
+        # re-check, so a timeout/poll loop would only add idle wake-ups.
+        while True:
+            try:
+                result = await self._transcript_queue.get()
+                yield result
+            except asyncio.CancelledError:
+                break
+
+    def set_text(self, text: str) -> None:
+        """
+        Set the current transcript text directly.
+
+        This allows external integration (e.g., Whisper, other ASR)
+        to provide transcripts. The text is also queued as a non-final
+        result for ``receive_transcripts``.
+        """
+        self._current_text = text
+        # The queue is unbounded, so put_nowait() cannot raise QueueFull.
+        # Enqueueing synchronously avoids a fire-and-forget task, which
+        # would require a running event loop and an extra reference to
+        # keep the task alive until it runs.
+        self._transcript_queue.put_nowait(ASRResult(text=text, is_final=False))
+
+    def get_and_clear_text(self) -> str:
+        """Return the current text, then clear both text and audio buffers."""
+        text = self._current_text
+        self._current_text = ""
+        self._audio_buffer = b""
+        return text
+
+    def get_audio_buffer(self) -> bytes:
+        """Get accumulated audio buffer."""
+        return self._audio_buffer
+
+    def clear_audio_buffer(self) -> None:
+        """Clear audio buffer (the current text is left untouched)."""
+        self._audio_buffer = b""
+
+
+class MockASRService(BaseASRService):
+    """
+    Mock ASR service for testing without actual recognition.
+
+    Incoming audio is ignored; canned transcripts are emitted only when
+    ``trigger_transcript`` is called.
+    """
+
+    def __init__(self, sample_rate: int = 16000, language: str = "en"):
+        super().__init__(sample_rate=sample_rate, language=language)
+        # Unbounded queue feeding receive_transcripts
+        self._transcript_queue: asyncio.Queue[ASRResult] = asyncio.Queue()
+        # Canned phrases, handed out round-robin by trigger_transcript
+        self._mock_texts = [
+            "Hello, how are you?",
+            "That's interesting.",
+            "Tell me more about that.",
+            "I understand.",
+        ]
+        self._text_index = 0
+
+    async def connect(self) -> None:
+        """Mark the mock service as connected."""
+        self.state = ServiceState.CONNECTED
+        logger.info("Mock ASR service connected")
+
+    async def disconnect(self) -> None:
+        """Mark the mock service as disconnected."""
+        self.state = ServiceState.DISCONNECTED
+        logger.info("Mock ASR service disconnected")
+
+    async def send_audio(self, audio: bytes) -> None:
+        """Accept and discard audio; use trigger_transcript() to emit results."""
+        pass
+
+    def trigger_transcript(self) -> None:
+        """Manually queue the next canned transcript (for testing)."""
+        text = self._mock_texts[self._text_index % len(self._mock_texts)]
+        self._text_index += 1
+
+        # The queue is unbounded, so put_nowait() cannot raise QueueFull.
+        # Enqueueing synchronously avoids a fire-and-forget task, which
+        # would require a running event loop and an extra reference to
+        # keep the task alive until it runs.
+        self._transcript_queue.put_nowait(
+            ASRResult(text=text, is_final=True, confidence=0.95)
+        )
+
+    async def receive_transcripts(self) -> AsyncIterator[ASRResult]:
+        """
+        Yield transcription results as they are queued.
+
+        The iterator never finishes on its own; it ends quietly when a
+        CancelledError is raised while it is waiting.
+        """
+        # Block directly on the queue: this class has no stop flag to
+        # re-check, so a timeout/poll loop would only add idle wake-ups.
+        while True:
+            try:
+                result = await self._transcript_queue.get()
+                yield result
+            except asyncio.CancelledError:
+                break
--- a/engine/providers/asr/openai_compatible.py
+++ b/engine/providers/asr/openai_compatible.py
@@ -0,0 +1,353 @@
+"""OpenAI-compatible ASR (Automatic Speech Recognition) Service.
+
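+Defaults to the SiliconFlow API for speech-to-text transcription; a
+different endpoint can be selected with the ``api_url`` argument or the
+``ASR_API_URL`` environment variable.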
+API: https://docs.siliconflow.cn/cn/api-reference/audio/create-audio-transcriptions
+"""
+
+import asyncio
+import io
+import os
+import wave
+from typing import AsyncIterator, Optional, Callable, Awaitable
+from urllib.parse import urlparse, urlunparse
+from loguru import logger
+
+try:
+    import aiohttp
+    AIOHTTP_AVAILABLE = True
+except ImportError:
+    AIOHTTP_AVAILABLE = False
+    logger.warning("aiohttp not available - OpenAICompatibleASRService will not work")
+
+from providers.common.base import BaseASRService, ASRResult, ServiceState
+
+
+class OpenAICompatibleASRService(BaseASRService):
+    """
+    OpenAI-compatible ASR service for speech-to-text transcription.
+
+    Features:
+    - Buffers incoming audio chunks
+    - Provides interim transcriptions periodically (for streaming to client)
+    - Final transcription on EOU
+
+    API Details:
+    - Endpoint: POST https://api.siliconflow.cn/v1/audio/transcriptions
+      (default; override with ``api_url`` or the ``ASR_API_URL`` env var)
+    - Models: FunAudioLLM/SenseVoiceSmall (default), TeleAI/TeleSpeechASR
+    - Input: Audio file (multipart/form-data)
+    - Output: {"text": "transcribed text"}
+    """
+
+    # Short aliases accepted by the ``model`` argument -> provider model ids
+    MODELS = {
+        "sensevoice": "FunAudioLLM/SenseVoiceSmall",
+        "telespeech": "TeleAI/TeleSpeechASR",
+    }
+
+    # Endpoint used when neither ``api_url`` nor ASR_API_URL is set
+    API_URL = "https://api.siliconflow.cn/v1/audio/transcriptions"
+
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        api_url: Optional[str] = None,
+        model: str = "FunAudioLLM/SenseVoiceSmall",
+        sample_rate: int = 16000,
+        language: str = "auto",
+        interim_interval_ms: int = 500,  # How often to send interim results
+        min_audio_for_interim_ms: int = 300,  # Min audio before first interim
+        on_transcript: Optional[Callable[[str, bool], Awaitable[None]]] = None
+    ):
+        """
+        Initialize OpenAI-compatible ASR service.
+
+        Args:
+            api_key: Provider API key
+            api_url: Provider API URL (defaults to SiliconFlow endpoint)
+            model: ASR model name or alias
+            sample_rate: Audio sample rate (16000 recommended)
+            language: Language code (auto for automatic detection)
+            interim_interval_ms: How often to generate interim transcriptions
+            min_audio_for_interim_ms: Minimum audio duration before first interim
+            on_transcript: Callback for transcription results (text, is_final)
+
+        Raises:
+            RuntimeError: If aiohttp could not be imported
+        """
+        super().__init__(sample_rate=sample_rate, language=language)
+
+        if not AIOHTTP_AVAILABLE:
+            raise RuntimeError("aiohttp is required for OpenAICompatibleASRService")
+
+        self.api_key = api_key
+        # Precedence: explicit argument, then environment, then class default
+        raw_api_url = api_url or os.getenv("ASR_API_URL") or self.API_URL
+        self.api_url = self._resolve_transcriptions_endpoint(raw_api_url)
+        # Map a known alias to its model id; anything else is used verbatim
+        self.model = self.MODELS.get(model.lower(), model)
+        self.interim_interval_ms = interim_interval_ms
+        self.min_audio_for_interim_ms = min_audio_for_interim_ms
+        self.on_transcript = on_transcript
+
+        # Session
+        self._session: Optional[aiohttp.ClientSession] = None
+
+        # Audio buffer
+        self._audio_buffer: bytes = b""
+        self._current_text: str = ""
+        # time.monotonic() reading taken at the last interim transcription
+        self._last_interim_time: float = 0
+
+        # Transcript queue for async iteration
+        self._transcript_queue: asyncio.Queue[ASRResult] = asyncio.Queue()
+
+        # Background task for interim results
+        self._interim_task: Optional[asyncio.Task] = None
+        self._running = False
+
+        logger.info(f"OpenAICompatibleASRService initialized with model: {self.model}")
+
+    @staticmethod
+    def _resolve_transcriptions_endpoint(api_url: str) -> str:
+        """
+        Accept either:
+        - base URL: https://host/v1
+        - full endpoint: https://host/v1/audio/transcriptions
+        and always return the final transcriptions endpoint URL.
+
+        An empty or blank value yields the class default ``API_URL``.
+        """
+        raw = str(api_url or "").strip()
+        if not raw:
+            return OpenAICompatibleASRService.API_URL
+
+        parsed = urlparse(raw)
+        path = (parsed.path or "").rstrip("/")
+        if path.endswith("/audio/transcriptions"):
+            # Already a full endpoint: hand it back untouched
+            return raw
+
+        # With an empty path this produces "/audio/transcriptions"
+        return urlunparse(parsed._replace(path=f"{path}/audio/transcriptions"))
+
+    async def connect(self) -> None:
+        """
+        Open the HTTP session used for transcription requests.
+
+        Raises:
+            ValueError: If no API key was configured
+        """
+        if not self.api_key:
+            raise ValueError("ASR API key not provided. Configure agent.asr.api_key in YAML.")
+
+        # Close a session left over from an earlier connect() so that a
+        # reconnect does not leak it.
+        if self._session is not None:
+            await self._session.close()
+
+        self._session = aiohttp.ClientSession(
+            headers={
+                "Authorization": f"Bearer {self.api_key}"
+            }
+        )
+        self._running = True
+        self.state = ServiceState.CONNECTED
+        logger.info("OpenAICompatibleASRService connected")
+
+    async def disconnect(self) -> None:
+        """Stop background work, close the session and clear all buffers."""
+        self._running = False
+
+        await self.stop_interim_transcription()
+
+        if self._session:
+            await self._session.close()
+            self._session = None
+
+        self._audio_buffer = b""
+        self._current_text = ""
+        self.state = ServiceState.DISCONNECTED
+        logger.info("OpenAICompatibleASRService disconnected")
+
+    async def send_audio(self, audio: bytes) -> None:
+        """
+        Buffer incoming audio data.
+
+        Args:
+            audio: PCM audio data (16-bit, mono)
+        """
+        self._audio_buffer += audio
+
+    def _pcm_to_wav(self, pcm: bytes) -> bytes:
+        """Wrap 16-bit mono PCM in an in-memory WAV container."""
+        wav_buffer = io.BytesIO()
+        with wave.open(wav_buffer, 'wb') as wav_file:
+            wav_file.setnchannels(1)
+            wav_file.setsampwidth(2)  # 16-bit
+            wav_file.setframerate(self.sample_rate)
+            wav_file.writeframes(pcm)
+        return wav_buffer.getvalue()
+
+    async def transcribe_buffer(self, is_final: bool = False) -> Optional[str]:
+        """
+        Transcribe current audio buffer.
+
+        On success the text is stored as the current text, passed to the
+        ``on_transcript`` callback (if any) and queued for
+        ``receive_transcripts``. The audio buffer itself is not cleared.
+
+        Args:
+            is_final: Whether this is the final transcription
+
+        Returns:
+            Transcribed text, or None if there is no session, not enough
+            audio, an empty result, or the request failed
+        """
+        if not self._session:
+            logger.warning("ASR session not connected")
+            return None
+
+        # Work on one snapshot of the buffer for both the duration check
+        # and the upload.
+        pcm = self._audio_buffer
+        # 2 bytes per sample (16-bit mono)
+        audio_duration_ms = len(pcm) / (self.sample_rate * 2) * 1000
+
+        if not is_final and audio_duration_ms < self.min_audio_for_interim_ms:
+            return None
+
+        if audio_duration_ms < 100:  # Less than 100ms - too short
+            return None
+
+        try:
+            # Send to API
+            form_data = aiohttp.FormData()
+            form_data.add_field(
+                'file',
+                self._pcm_to_wav(pcm),
+                filename='audio.wav',
+                content_type='audio/wav'
+            )
+            form_data.add_field('model', self.model)
+
+            async with self._session.post(self.api_url, data=form_data) as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    logger.error(f"ASR API error {response.status}: {error_text}")
+                    return None
+
+                result = await response.json()
+                # Tolerate a missing or null "text" field
+                text = (result.get("text") or "").strip()
+                if not text:
+                    return None
+
+                self._current_text = text
+
+                # Notify via callback
+                if self.on_transcript:
+                    await self.on_transcript(text, is_final)
+
+                # Queue result
+                await self._transcript_queue.put(
+                    ASRResult(text=text, is_final=is_final)
+                )
+
+                logger.debug(f"ASR {'final' if is_final else 'interim'}: {text[:50]}...")
+                return text
+
+        except Exception as e:
+            # Best effort: any failure is logged and reported as "no result"
+            logger.error(f"ASR transcription error: {e}")
+            return None
+
+    async def get_final_transcription(self) -> str:
+        """
+        Get final transcription and clear buffer.
+
+        Call this when EOU is detected.
+
+        Returns:
+            Final transcribed text; falls back to the last stored text
+            when the final request yields nothing
+        """
+        # Transcribe full buffer as final
+        text = await self.transcribe_buffer(is_final=True)
+
+        # Clear buffer
+        result = text or self._current_text
+        self._audio_buffer = b""
+        self._current_text = ""
+
+        return result
+
+    def get_and_clear_text(self) -> str:
+        """
+        Get accumulated text and clear buffer.
+
+        Compatible with BufferedASRService interface.
+        """
+        text = self._current_text
+        self._current_text = ""
+        self._audio_buffer = b""
+        return text
+
+    def get_audio_buffer(self) -> bytes:
+        """Get current audio buffer."""
+        return self._audio_buffer
+
+    def get_audio_duration_ms(self) -> float:
+        """Get current audio buffer duration in milliseconds."""
+        # 2 bytes per sample (16-bit mono)
+        return len(self._audio_buffer) / (self.sample_rate * 2) * 1000
+
+    def clear_buffer(self) -> None:
+        """Clear audio and text buffers."""
+        self._audio_buffer = b""
+        self._current_text = ""
+
+    async def receive_transcripts(self) -> AsyncIterator[ASRResult]:
+        """
+        Async iterator for transcription results.
+
+        Runs until the service stops (``disconnect``) or a CancelledError
+        is raised while waiting.
+
+        Yields:
+            ASRResult with text and is_final flag
+        """
+        while self._running:
+            try:
+                # Short timeout so the _running flag is re-checked regularly
+                result = await asyncio.wait_for(
+                    self._transcript_queue.get(),
+                    timeout=0.1
+                )
+                yield result
+            except asyncio.TimeoutError:
+                continue
+            except asyncio.CancelledError:
+                break
+
+    async def start_interim_transcription(self) -> None:
+        """
+        Start background task for interim transcriptions.
+
+        This periodically transcribes buffered audio for
+        real-time feedback to the user. Does nothing if the task is
+        already running.
+        """
+        if self._interim_task and not self._interim_task.done():
+            return
+
+        self._interim_task = asyncio.create_task(self._interim_loop())
+
+    async def stop_interim_transcription(self) -> None:
+        """Cancel the interim transcription task, if any, and wait for it."""
+        if self._interim_task:
+            self._interim_task.cancel()
+            try:
+                await self._interim_task
+            except asyncio.CancelledError:
+                pass
+            self._interim_task = None
+
+    async def _interim_loop(self) -> None:
+        """Background loop for interim transcriptions."""
+        import time
+
+        while self._running:
+            try:
+                await asyncio.sleep(self.interim_interval_ms / 1000)
+
+                # Monotonic clock: interval measurement must not be
+                # affected by wall-clock adjustments.
+                current_time = time.monotonic()
+                time_since_last = (current_time - self._last_interim_time) * 1000
+
+                if time_since_last >= self.interim_interval_ms:
+                    audio_duration = self.get_audio_duration_ms()
+
+                    if audio_duration >= self.min_audio_for_interim_ms:
+                        await self.transcribe_buffer(is_final=False)
+                        self._last_interim_time = current_time
+
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                # Keep the loop alive; a single failed pass is only logged
+                logger.error(f"Interim transcription error: {e}")
+
+
+# Backward-compatible alias: keeps the older SiliconFlow-specific class name
+# importable; both names refer to the same class object.
+SiliconFlowASRService = OpenAICompatibleASRService
--- a/engine/providers/asr/siliconflow.py
+++ b/engine/providers/asr/siliconflow.py
@@ -0,0 +1,8 @@
+"""Backward-compatible imports for legacy siliconflow_asr module."""
+
+from providers.asr.openai_compatible import OpenAICompatibleASRService
+
+# Backward-compatible alias: keeps the older SiliconFlow-specific class name
+# resolvable from this module; both names refer to the same class object.
+SiliconFlowASRService = OpenAICompatibleASRService
+
+# Names exported by this compatibility module.
+__all__ = ["OpenAICompatibleASRService", "SiliconFlowASRService"]