Build a basic duplex agent

Xin Wang
2026-01-29 16:36:46 +08:00
parent ac0c76e6e8
commit d6d0ade33e
6 changed files with 432 additions and 15 deletions


@@ -25,6 +25,7 @@ from services.llm import OpenAILLMService, MockLLMService
 from services.tts import EdgeTTSService, MockTTSService
 from services.asr import BufferedASRService
 from services.siliconflow_tts import SiliconFlowTTSService
+from services.siliconflow_asr import SiliconFlowASRService
 from app.config import settings
@@ -90,7 +91,10 @@ class DuplexPipeline:
         # Initialize services
         self.llm_service = llm_service
         self.tts_service = tts_service
-        self.asr_service = asr_service or BufferedASRService()
+        self.asr_service = asr_service  # Will be initialized in start()
+        # Track last sent transcript to avoid duplicates
+        self._last_sent_transcript = ""
         # Conversation manager
         self.conversation = ConversationManager(
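
Note: constructing the ASR service is now deferred from __init__ to start(), so the backend can be chosen from runtime settings and its async setup can be awaited. A standalone sketch of this lazy-init pattern, using hypothetical stand-in names rather than the repo's actual classes:

# Standalone sketch of the lazy-init pattern above (names hypothetical).
import asyncio

class StubASR:
    async def connect(self) -> None:
        print("ASR connected")

class Pipeline:
    def __init__(self, asr_service=None):
        # Accept an injected service (e.g. for tests); otherwise defer
        # construction to start(), where awaiting and config reads are possible.
        self.asr_service = asr_service

    async def start(self) -> None:
        if self.asr_service is None:
            self.asr_service = StubASR()
        await self.asr_service.connect()

asyncio.run(Pipeline().start())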
@@ -148,6 +152,23 @@ class DuplexPipeline:
         await self.tts_service.connect()
+        # Connect ASR service
+        if not self.asr_service:
+            if settings.asr_provider == "siliconflow" and settings.siliconflow_api_key:
+                self.asr_service = SiliconFlowASRService(
+                    api_key=settings.siliconflow_api_key,
+                    model=settings.siliconflow_asr_model,
+                    sample_rate=settings.sample_rate,
+                    interim_interval_ms=settings.asr_interim_interval_ms,
+                    min_audio_for_interim_ms=settings.asr_min_audio_ms,
+                    on_transcript=self._on_transcript_callback
+                )
+                logger.info("Using SiliconFlow ASR service")
+            else:
+                self.asr_service = BufferedASRService(
+                    sample_rate=settings.sample_rate
+                )
+                logger.info("Using Buffered ASR service (no real transcription)")
         await self.asr_service.connect()
         logger.info("DuplexPipeline services connected")
@@ -204,8 +225,11 @@ class DuplexPipeline:
         if vad_status == "Speech" or self.conversation.state == ConversationState.LISTENING:
             self._audio_buffer += pcm_bytes
             await self.asr_service.send_audio(pcm_bytes)
+            # For SiliconFlow ASR, trigger interim transcription periodically
+            # The service handles timing internally via start_interim_transcription()
 
-        # 4. Check for End of Utterance
+        # 4. Check for End of Utterance - this triggers LLM response
         if self.eou_detector.process(vad_status):
             await self._on_end_of_utterance()
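
This step leans on self.eou_detector.process(vad_status) returning True exactly once per utterance. The detector's internals are not part of this diff; a hypothetical sketch of the usual approach, counting a run of silent VAD frames after speech has been observed:

# Hypothetical EOU (end-of-utterance) detector of the kind process() implies:
# fire once VAD has reported silence long enough after speech. The repo's
# actual EOUDetector is not shown in this commit.
class SilenceRunEOUDetector:
    def __init__(self, frame_ms: int = 30, silence_threshold_ms: int = 800):
        self.frames_needed = silence_threshold_ms // frame_ms
        self.reset()

    def reset(self) -> None:
        self.silence_frames = 0
        self.saw_speech = False

    def process(self, vad_status: str) -> bool:
        if vad_status == "Speech":
            self.saw_speech = True
            self.silence_frames = 0
            return False
        self.silence_frames += 1
        if self.saw_speech and self.silence_frames >= self.frames_needed:
            self.reset()
            return True  # end of utterance: trigger transcription + LLM turn
        return False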
@@ -237,12 +261,47 @@ class DuplexPipeline:
         """Interrupt current bot speech (manual interrupt command)."""
         await self._handle_barge_in()
 
+    async def _on_transcript_callback(self, text: str, is_final: bool) -> None:
+        """
+        Callback for ASR transcription results.
+        Streams transcription to client for display.
+
+        Args:
+            text: Transcribed text
+            is_final: Whether this is the final transcription
+        """
+        # Avoid sending duplicate transcripts
+        if text == self._last_sent_transcript and not is_final:
+            return
+        self._last_sent_transcript = text
+
+        # Send transcript event to client
+        await self.transport.send_event({
+            "event": "transcript",
+            "trackId": self.session_id,
+            "text": text,
+            "isFinal": is_final,
+            "timestamp": self._get_timestamp_ms()
+        })
+        logger.debug(f"Sent transcript ({'final' if is_final else 'interim'}): {text[:50]}...")
+
     async def _on_speech_start(self) -> None:
         """Handle user starting to speak."""
         if self.conversation.state == ConversationState.IDLE:
             await self.conversation.start_user_turn()
             self._audio_buffer = b""
+            self._last_sent_transcript = ""
             self.eou_detector.reset()
+
+            # Clear ASR buffer and start interim transcriptions
+            if hasattr(self.asr_service, 'clear_buffer'):
+                self.asr_service.clear_buffer()
+            if hasattr(self.asr_service, 'start_interim_transcription'):
+                await self.asr_service.start_interim_transcription()
+
             logger.debug("User speech started")
 
     async def _on_end_of_utterance(self) -> None:
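
The hasattr() probes in _on_speech_start (and in _on_end_of_utterance below) define an implicit, duck-typed ASR interface. A sketch of that contract as typing.Protocol classes; the method set is taken from this diff, while the protocol names are invented:

# Sketch of the implicit ASR interface the hasattr() probes rely on.
# Only connect()/send_audio() are required; the rest are optional extensions
# that SiliconFlowASRService or BufferedASRService may provide. Hypothetical.
from typing import Protocol, runtime_checkable

@runtime_checkable
class StreamingASR(Protocol):
    async def connect(self) -> None: ...
    async def send_audio(self, pcm_bytes: bytes) -> None: ...

@runtime_checkable
class InterimCapableASR(StreamingASR, Protocol):
    def clear_buffer(self) -> None: ...
    async def start_interim_transcription(self) -> None: ...
    async def stop_interim_transcription(self) -> None: ...
    async def get_final_transcription(self) -> str: ...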
@@ -250,25 +309,36 @@ class DuplexPipeline:
         if self.conversation.state != ConversationState.LISTENING:
             return
 
-        # Get transcribed text (if using ASR that provides it)
+        # Stop interim transcriptions
+        if hasattr(self.asr_service, 'stop_interim_transcription'):
+            await self.asr_service.stop_interim_transcription()
+
+        # Get final transcription from ASR service
         user_text = ""
-        if hasattr(self.asr_service, 'get_and_clear_text'):
+        if hasattr(self.asr_service, 'get_final_transcription'):
+            # SiliconFlow ASR - get final transcription
+            user_text = await self.asr_service.get_final_transcription()
+        elif hasattr(self.asr_service, 'get_and_clear_text'):
+            # Buffered ASR - get accumulated text
             user_text = self.asr_service.get_and_clear_text()
 
-        # If no ASR text, we could use the audio buffer for external ASR
-        # For now, just use placeholder if no ASR text
-        if not user_text:
-            # In a real implementation, you'd send audio_buffer to ASR here
-            # For demo purposes, use mock text
-            user_text = "[User speech detected]"
-            logger.warning("No ASR text available - using placeholder")
+        # Skip if no meaningful text
+        if not user_text or not user_text.strip():
+            logger.debug("EOU detected but no transcription - skipping")
+            # Reset for next utterance
+            self._audio_buffer = b""
+            self._last_sent_transcript = ""
+            await self.conversation.start_user_turn()
+            return
 
-        logger.info(f"EOU detected - user said: {user_text[:50]}...")
+        logger.info(f"EOU detected - user said: {user_text[:100]}...")
 
+        # Clear buffers
         self._audio_buffer = b""
+        self._last_sent_transcript = ""
 
-        # Process the turn
+        # Process the turn - trigger LLM response
         await self.conversation.end_user_turn(user_text)
         self._current_turn_task = asyncio.create_task(self._handle_turn(user_text))
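
On the wire, _on_transcript_callback produces events shaped like {"event": "transcript", "trackId": ..., "text": ..., "isFinal": ..., "timestamp": ...}. A hypothetical client-side consumer, assuming the transport delivers these as JSON text frames over a WebSocket (the endpoint URL here is made up):

# Hypothetical client sketch; assumes a JSON-over-WebSocket transport.
import asyncio
import json

import websockets  # pip install websockets

async def listen(url: str = "ws://localhost:8000/ws") -> None:
    async with websockets.connect(url) as ws:
        async for raw in ws:
            if isinstance(raw, bytes):
                continue  # skip binary frames (e.g. audio), if any
            msg = json.loads(raw)
            if msg.get("event") == "transcript":
                # Interim results update the current line; finals commit it.
                marker = "FINAL " if msg["isFinal"] else "... "
                print(f"{marker}{msg['text']}")

asyncio.run(listen())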