a basic duplex agent is built
This commit is contained in:
@@ -25,6 +25,7 @@ from services.llm import OpenAILLMService, MockLLMService
|
||||
from services.tts import EdgeTTSService, MockTTSService
|
||||
from services.asr import BufferedASRService
|
||||
from services.siliconflow_tts import SiliconFlowTTSService
|
||||
from services.siliconflow_asr import SiliconFlowASRService
|
||||
from app.config import settings
|
||||
|
||||
|
||||
@@ -90,7 +91,10 @@ class DuplexPipeline:
|
||||
# Initialize services
|
||||
self.llm_service = llm_service
|
||||
self.tts_service = tts_service
|
||||
self.asr_service = asr_service or BufferedASRService()
|
||||
self.asr_service = asr_service # Will be initialized in start()
|
||||
|
||||
# Track last sent transcript to avoid duplicates
|
||||
self._last_sent_transcript = ""
|
||||
|
||||
# Conversation manager
|
||||
self.conversation = ConversationManager(
|
||||
@@ -148,6 +152,23 @@ class DuplexPipeline:
|
||||
await self.tts_service.connect()
|
||||
|
||||
# Connect ASR service
|
||||
if not self.asr_service:
|
||||
if settings.asr_provider == "siliconflow" and settings.siliconflow_api_key:
|
||||
self.asr_service = SiliconFlowASRService(
|
||||
api_key=settings.siliconflow_api_key,
|
||||
model=settings.siliconflow_asr_model,
|
||||
sample_rate=settings.sample_rate,
|
||||
interim_interval_ms=settings.asr_interim_interval_ms,
|
||||
min_audio_for_interim_ms=settings.asr_min_audio_ms,
|
||||
on_transcript=self._on_transcript_callback
|
||||
)
|
||||
logger.info("Using SiliconFlow ASR service")
|
||||
else:
|
||||
self.asr_service = BufferedASRService(
|
||||
sample_rate=settings.sample_rate
|
||||
)
|
||||
logger.info("Using Buffered ASR service (no real transcription)")
|
||||
|
||||
await self.asr_service.connect()
|
||||
|
||||
logger.info("DuplexPipeline services connected")
|
||||
@@ -204,8 +225,11 @@ class DuplexPipeline:
|
||||
if vad_status == "Speech" or self.conversation.state == ConversationState.LISTENING:
|
||||
self._audio_buffer += pcm_bytes
|
||||
await self.asr_service.send_audio(pcm_bytes)
|
||||
|
||||
# For SiliconFlow ASR, trigger interim transcription periodically
|
||||
# The service handles timing internally via start_interim_transcription()
|
||||
|
||||
# 4. Check for End of Utterance
|
||||
# 4. Check for End of Utterance - this triggers LLM response
|
||||
if self.eou_detector.process(vad_status):
|
||||
await self._on_end_of_utterance()
|
||||
|
||||
@@ -237,12 +261,47 @@ class DuplexPipeline:
|
||||
"""Interrupt current bot speech (manual interrupt command)."""
|
||||
await self._handle_barge_in()
|
||||
|
||||
async def _on_transcript_callback(self, text: str, is_final: bool) -> None:
    """
    Forward an ASR transcription result to the client.

    An interim result whose text equals the last transcript that was sent
    is dropped. Final results are always forwarded, even when the text is
    unchanged.

    Args:
        text: Transcribed text
        is_final: Whether this is the final transcription
    """
    # Suppress repeated interim results; a final result always goes out.
    unchanged = text == self._last_sent_transcript
    if unchanged and not is_final:
        return

    # Remember what is about to be sent so the next interim can be compared.
    self._last_sent_transcript = text

    # Build the transcript event and push it over the transport.
    send_event = self.transport.send_event
    payload = {
        "event": "transcript",
        "trackId": self.session_id,
        "text": text,
        "isFinal": is_final,
        "timestamp": self._get_timestamp_ms()
    }
    await send_event(payload)

    logger.debug(f"Sent transcript ({'final' if is_final else 'interim'}): {text[:50]}...")
|
||||
|
||||
async def _on_speech_start(self) -> None:
|
||||
"""
Handle user starting to speak.

When the conversation state is IDLE, a user turn is started. The method
also empties the audio buffer, clears the last-sent transcript, resets
the end-of-utterance detector, and calls the ASR service's optional
clear_buffer() and start_interim_transcription() hooks when present.

NOTE(review): indentation is not preserved in this view, so which of
the statements after start_user_turn() are nested under the IDLE check
cannot be determined here - confirm against the real file.
"""
|
||||
if self.conversation.state == ConversationState.IDLE:
|
||||
await self.conversation.start_user_turn()
|
||||
self._audio_buffer = b""
|
||||
# Forget the previous transcript so the duplicate check in
# _on_transcript_callback starts fresh for this utterance.
self._last_sent_transcript = ""
|
||||
self.eou_detector.reset()
|
||||

|
||||
# Clear ASR buffer and start interim transcriptions
# Both hooks are optional: the hasattr guards let ASR services that do
# not define them be used unchanged.
|
||||
if hasattr(self.asr_service, 'clear_buffer'):
|
||||
self.asr_service.clear_buffer()
|
||||
if hasattr(self.asr_service, 'start_interim_transcription'):
|
||||
await self.asr_service.start_interim_transcription()
|
||||

|
||||
logger.debug("User speech started")
|
||||
|
||||
async def _on_end_of_utterance(self) -> None:
|
||||
@@ -250,25 +309,36 @@ class DuplexPipeline:
|
||||
if self.conversation.state != ConversationState.LISTENING:
|
||||
return
|
||||
|
||||
# Get transcribed text (if using ASR that provides it)
|
||||
# Stop interim transcriptions
|
||||
if hasattr(self.asr_service, 'stop_interim_transcription'):
|
||||
await self.asr_service.stop_interim_transcription()
|
||||
|
||||
# Get final transcription from ASR service
|
||||
user_text = ""
|
||||
if hasattr(self.asr_service, 'get_and_clear_text'):
|
||||
|
||||
if hasattr(self.asr_service, 'get_final_transcription'):
|
||||
# SiliconFlow ASR - get final transcription
|
||||
user_text = await self.asr_service.get_final_transcription()
|
||||
elif hasattr(self.asr_service, 'get_and_clear_text'):
|
||||
# Buffered ASR - get accumulated text
|
||||
user_text = self.asr_service.get_and_clear_text()
|
||||
|
||||
# If no ASR text, we could use the audio buffer for external ASR
|
||||
# For now, just use placeholder if no ASR text
|
||||
if not user_text:
|
||||
# In a real implementation, you'd send audio_buffer to ASR here
|
||||
# For demo purposes, use mock text
|
||||
user_text = "[User speech detected]"
|
||||
logger.warning("No ASR text available - using placeholder")
|
||||
# Skip if no meaningful text
|
||||
if not user_text or not user_text.strip():
|
||||
logger.debug("EOU detected but no transcription - skipping")
|
||||
# Reset for next utterance
|
||||
self._audio_buffer = b""
|
||||
self._last_sent_transcript = ""
|
||||
await self.conversation.start_user_turn()
|
||||
return
|
||||
|
||||
logger.info(f"EOU detected - user said: {user_text[:50]}...")
|
||||
logger.info(f"EOU detected - user said: {user_text[:100]}...")
|
||||
|
||||
# Clear buffers
|
||||
self._audio_buffer = b""
|
||||
self._last_sent_transcript = ""
|
||||
|
||||
# Process the turn
|
||||
# Process the turn - trigger LLM response
|
||||
await self.conversation.end_user_turn(user_text)
|
||||
self._current_turn_task = asyncio.create_task(self._handle_turn(user_text))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user