Implement DashScope ASR provider and enhance ASR service architecture

- Added DashScope ASR service implementation for real-time streaming.
- Updated ASR provider logic to support DashScope alongside existing providers.
- Enhanced runtime metadata resolution to include DashScope as a valid ASR provider.
- Modified configuration files and documentation to reflect the addition of DashScope.
- Introduced tests to validate DashScope integration and ASR service behavior.
- Refactored ASR service factory to accommodate new provider options and modes.
This commit is contained in:
Xin Wang
2026-03-06 11:44:39 +08:00
parent 7e0b777923
commit e11c3abb9e
19 changed files with 940 additions and 44 deletions

View File

@@ -30,11 +30,14 @@ from providers.factory.default import DefaultRealtimeServiceFactory
from runtime.conversation import ConversationManager, ConversationState
from runtime.events import get_event_bus
from runtime.ports import (
ASRMode,
ASRPort,
ASRServiceSpec,
LLMPort,
LLMServiceSpec,
OfflineASRPort,
RealtimeServiceFactory,
StreamingASRPort,
TTSPort,
TTSServiceSpec,
)
@@ -77,6 +80,7 @@ class DuplexPipeline:
_ASR_DELTA_THROTTLE_MS = 500
_LLM_DELTA_THROTTLE_MS = 80
_ASR_CAPTURE_MAX_MS = 15000
_ASR_STREAM_FINAL_TIMEOUT_MS = 800
_OPENER_PRE_ROLL_MS = 180
_DEFAULT_TOOL_SCHEMAS: Dict[str, Dict[str, Any]] = {
"current_time": {
@@ -317,6 +321,10 @@ class DuplexPipeline:
self.llm_service = llm_service
self.tts_service = tts_service
self.asr_service = asr_service # Will be initialized in start()
self._asr_mode: ASRMode = self._resolve_asr_mode(
settings.asr_provider,
getattr(asr_service, "mode", None),
)
self._service_factory = service_factory or DefaultRealtimeServiceFactory()
self._knowledge_searcher = knowledge_searcher
self._tool_resource_resolver = tool_resource_resolver
@@ -324,6 +332,7 @@ class DuplexPipeline:
# Track last sent transcript to avoid duplicates
self._last_sent_transcript = ""
self._latest_asr_interim_text = ""
self._pending_transcript_delta: str = ""
self._last_transcript_delta_emit_ms: float = 0.0
@@ -588,6 +597,7 @@ class DuplexPipeline:
},
"asr": {
"provider": asr_provider,
"mode": self._resolve_asr_mode(asr_provider, self._runtime_asr.get("mode")),
"model": str(self._runtime_asr.get("model") or settings.asr_model or ""),
"interimIntervalMs": int(self._runtime_asr.get("interimIntervalMs") or settings.asr_interim_interval_ms),
"minAudioMs": int(self._runtime_asr.get("minAudioMs") or settings.asr_min_audio_ms),
@@ -787,6 +797,22 @@ class DuplexPipeline:
normalized = str(provider or "").strip().lower()
return normalized == "dashscope"
@staticmethod
def _resolve_asr_mode(provider: Any, raw_mode: Any = None) -> ASRMode:
    """Decide which ASR mode to run for *provider*.

    An explicitly supplied *raw_mode* of ``"offline"`` or ``"streaming"``
    always wins. Without one, the DashScope provider defaults to
    ``"streaming"``; every other provider falls back to ``"offline"``.
    """
    requested = str(raw_mode or "").strip().lower()
    if requested in ("offline", "streaming"):
        return requested  # type: ignore[return-value]
    # No usable explicit mode — derive the default from the provider name.
    if str(provider or "").strip().lower() == "dashscope":
        return "streaming"
    return "offline"
def _offline_asr(self) -> OfflineASRPort:
    """View the current ASR service through the offline (buffered) port."""
    service = self.asr_service
    return service  # type: ignore[return-value]
def _streaming_asr(self) -> StreamingASRPort:
    """View the current ASR service through the streaming port."""
    service = self.asr_service
    return service  # type: ignore[return-value]
@staticmethod
def _default_llm_base_url(provider: Any) -> Optional[str]:
normalized = str(provider or "").strip().lower()
@@ -967,11 +993,13 @@ class DuplexPipeline:
asr_model = self._runtime_asr.get("model") or settings.asr_model
asr_interim_interval = int(self._runtime_asr.get("interimIntervalMs") or settings.asr_interim_interval_ms)
asr_min_audio_ms = int(self._runtime_asr.get("minAudioMs") or settings.asr_min_audio_ms)
asr_mode = self._resolve_asr_mode(asr_provider, self._runtime_asr.get("mode"))
self.asr_service = self._service_factory.create_asr_service(
ASRServiceSpec(
provider=asr_provider,
sample_rate=settings.sample_rate,
mode=asr_mode,
language="auto",
api_key=str(asr_api_key).strip() if asr_api_key else None,
api_url=str(asr_api_url).strip() if asr_api_url else None,
@@ -981,10 +1009,14 @@ class DuplexPipeline:
on_transcript=self._on_transcript_callback,
)
)
self._asr_mode = self._resolve_asr_mode(
self._runtime_asr.get("provider") or settings.asr_provider,
getattr(self.asr_service, "mode", self._runtime_asr.get("mode")),
)
await self.asr_service.connect()
logger.info("DuplexPipeline services connected")
logger.info("DuplexPipeline services connected (asr_mode={})", self._asr_mode)
if not self._outbound_task or self._outbound_task.done():
self._outbound_task = asyncio.create_task(self._outbound_loop())
@@ -1457,6 +1489,7 @@ class DuplexPipeline:
self._last_sent_transcript = text
if is_final:
self._latest_asr_interim_text = ""
self._pending_transcript_delta = ""
self._last_transcript_delta_emit_ms = 0.0
await self._send_event(
@@ -1472,6 +1505,7 @@ class DuplexPipeline:
logger.debug(f"Sent transcript (final): {text[:50]}...")
return
self._latest_asr_interim_text = text
self._pending_transcript_delta = text
should_emit = (
self._last_transcript_delta_emit_ms <= 0.0
@@ -1495,14 +1529,16 @@ class DuplexPipeline:
await self.conversation.start_user_turn()
self._audio_buffer = b""
self._last_sent_transcript = ""
self._latest_asr_interim_text = ""
self.eou_detector.reset()
self._asr_capture_active = False
self._asr_capture_started_ms = 0.0
self._pending_speech_audio = b""
# Clear ASR buffer. Interim starts only after ASR capture is activated.
if hasattr(self.asr_service, 'clear_buffer'):
self.asr_service.clear_buffer()
if self._asr_mode == "streaming":
self._streaming_asr().clear_utterance()
else:
self._offline_asr().clear_buffer()
logger.debug("User speech started")
@@ -1511,8 +1547,10 @@ class DuplexPipeline:
if self._asr_capture_active:
return
if hasattr(self.asr_service, 'start_interim_transcription'):
await self.asr_service.start_interim_transcription()
if self._asr_mode == "streaming":
await self._streaming_asr().begin_utterance()
else:
await self._offline_asr().start_interim_transcription()
# Prime ASR with a short pre-speech context window so the utterance
# start isn't lost while waiting for VAD to transition to Speech.
@@ -1545,24 +1583,22 @@ class DuplexPipeline:
self._pending_speech_audio = b""
return
# Add a tiny trailing silence tail to stabilize final-token decoding.
if self._asr_final_tail_bytes > 0:
final_tail = b"\x00" * self._asr_final_tail_bytes
await self.asr_service.send_audio(final_tail)
# Stop interim transcriptions
if hasattr(self.asr_service, 'stop_interim_transcription'):
await self.asr_service.stop_interim_transcription()
# Get final transcription from ASR service
user_text = ""
if hasattr(self.asr_service, 'get_final_transcription'):
# SiliconFlow ASR - get final transcription
user_text = await self.asr_service.get_final_transcription()
elif hasattr(self.asr_service, 'get_and_clear_text'):
# Buffered ASR - get accumulated text
user_text = self.asr_service.get_and_clear_text()
if self._asr_mode == "streaming":
streaming_asr = self._streaming_asr()
await streaming_asr.end_utterance()
user_text = await streaming_asr.wait_for_final_transcription(
timeout_ms=self._ASR_STREAM_FINAL_TIMEOUT_MS
)
if not user_text.strip():
user_text = self._latest_asr_interim_text
else:
# Add a tiny trailing silence tail to stabilize final-token decoding.
if self._asr_final_tail_bytes > 0:
final_tail = b"\x00" * self._asr_final_tail_bytes
await self.asr_service.send_audio(final_tail)
await self._offline_asr().stop_interim_transcription()
user_text = await self._offline_asr().get_final_transcription()
# Skip if no meaningful text
if not user_text or not user_text.strip():
@@ -1570,6 +1606,7 @@ class DuplexPipeline:
# Reset for next utterance
self._audio_buffer = b""
self._last_sent_transcript = ""
self._latest_asr_interim_text = ""
self._asr_capture_active = False
self._asr_capture_started_ms = 0.0
self._pending_speech_audio = b""
@@ -1594,6 +1631,7 @@ class DuplexPipeline:
# Clear buffers
self._audio_buffer = b""
self._last_sent_transcript = ""
self._latest_asr_interim_text = ""
self._pending_transcript_delta = ""
self._last_transcript_delta_emit_ms = 0.0
self._asr_capture_active = False