Implement DashScope ASR provider and enhance ASR service architecture
- Added DashScope ASR service implementation for real-time streaming.
- Updated ASR provider logic to support DashScope alongside existing providers.
- Enhanced runtime metadata resolution to include DashScope as a valid ASR provider.
- Modified configuration files and documentation to reflect the addition of DashScope.
- Introduced tests to validate DashScope integration and ASR service behavior.
- Refactored ASR service factory to accommodate new provider options and modes.
This commit is contained in:
@@ -30,11 +30,14 @@ from providers.factory.default import DefaultRealtimeServiceFactory
|
||||
from runtime.conversation import ConversationManager, ConversationState
|
||||
from runtime.events import get_event_bus
|
||||
from runtime.ports import (
|
||||
ASRMode,
|
||||
ASRPort,
|
||||
ASRServiceSpec,
|
||||
LLMPort,
|
||||
LLMServiceSpec,
|
||||
OfflineASRPort,
|
||||
RealtimeServiceFactory,
|
||||
StreamingASRPort,
|
||||
TTSPort,
|
||||
TTSServiceSpec,
|
||||
)
|
||||
@@ -77,6 +80,7 @@ class DuplexPipeline:
|
||||
_ASR_DELTA_THROTTLE_MS = 500
|
||||
_LLM_DELTA_THROTTLE_MS = 80
|
||||
_ASR_CAPTURE_MAX_MS = 15000
|
||||
_ASR_STREAM_FINAL_TIMEOUT_MS = 800
|
||||
_OPENER_PRE_ROLL_MS = 180
|
||||
_DEFAULT_TOOL_SCHEMAS: Dict[str, Dict[str, Any]] = {
|
||||
"current_time": {
|
||||
@@ -317,6 +321,10 @@ class DuplexPipeline:
|
||||
self.llm_service = llm_service
|
||||
self.tts_service = tts_service
|
||||
self.asr_service = asr_service # Will be initialized in start()
|
||||
self._asr_mode: ASRMode = self._resolve_asr_mode(
|
||||
settings.asr_provider,
|
||||
getattr(asr_service, "mode", None),
|
||||
)
|
||||
self._service_factory = service_factory or DefaultRealtimeServiceFactory()
|
||||
self._knowledge_searcher = knowledge_searcher
|
||||
self._tool_resource_resolver = tool_resource_resolver
|
||||
@@ -324,6 +332,7 @@ class DuplexPipeline:
|
||||
|
||||
# Track last sent transcript to avoid duplicates
|
||||
self._last_sent_transcript = ""
|
||||
self._latest_asr_interim_text = ""
|
||||
self._pending_transcript_delta: str = ""
|
||||
self._last_transcript_delta_emit_ms: float = 0.0
|
||||
|
||||
@@ -588,6 +597,7 @@ class DuplexPipeline:
|
||||
},
|
||||
"asr": {
|
||||
"provider": asr_provider,
|
||||
"mode": self._resolve_asr_mode(asr_provider, self._runtime_asr.get("mode")),
|
||||
"model": str(self._runtime_asr.get("model") or settings.asr_model or ""),
|
||||
"interimIntervalMs": int(self._runtime_asr.get("interimIntervalMs") or settings.asr_interim_interval_ms),
|
||||
"minAudioMs": int(self._runtime_asr.get("minAudioMs") or settings.asr_min_audio_ms),
|
||||
@@ -787,6 +797,22 @@ class DuplexPipeline:
|
||||
normalized = str(provider or "").strip().lower()
|
||||
return normalized == "dashscope"
|
||||
|
||||
@staticmethod
|
||||
def _resolve_asr_mode(provider: Any, raw_mode: Any = None) -> ASRMode:
|
||||
normalized_mode = str(raw_mode or "").strip().lower()
|
||||
if normalized_mode in {"offline", "streaming"}:
|
||||
return normalized_mode # type: ignore[return-value]
|
||||
normalized_provider = str(provider or "").strip().lower()
|
||||
if normalized_provider == "dashscope":
|
||||
return "streaming"
|
||||
return "offline"
|
||||
|
||||
def _offline_asr(self) -> OfflineASRPort:
|
||||
return self.asr_service # type: ignore[return-value]
|
||||
|
||||
def _streaming_asr(self) -> StreamingASRPort:
|
||||
return self.asr_service # type: ignore[return-value]
|
||||
|
||||
@staticmethod
|
||||
def _default_llm_base_url(provider: Any) -> Optional[str]:
|
||||
normalized = str(provider or "").strip().lower()
|
||||
@@ -967,11 +993,13 @@ class DuplexPipeline:
|
||||
asr_model = self._runtime_asr.get("model") or settings.asr_model
|
||||
asr_interim_interval = int(self._runtime_asr.get("interimIntervalMs") or settings.asr_interim_interval_ms)
|
||||
asr_min_audio_ms = int(self._runtime_asr.get("minAudioMs") or settings.asr_min_audio_ms)
|
||||
asr_mode = self._resolve_asr_mode(asr_provider, self._runtime_asr.get("mode"))
|
||||
|
||||
self.asr_service = self._service_factory.create_asr_service(
|
||||
ASRServiceSpec(
|
||||
provider=asr_provider,
|
||||
sample_rate=settings.sample_rate,
|
||||
mode=asr_mode,
|
||||
language="auto",
|
||||
api_key=str(asr_api_key).strip() if asr_api_key else None,
|
||||
api_url=str(asr_api_url).strip() if asr_api_url else None,
|
||||
@@ -981,10 +1009,14 @@ class DuplexPipeline:
|
||||
on_transcript=self._on_transcript_callback,
|
||||
)
|
||||
)
|
||||
self._asr_mode = self._resolve_asr_mode(
|
||||
self._runtime_asr.get("provider") or settings.asr_provider,
|
||||
getattr(self.asr_service, "mode", self._runtime_asr.get("mode")),
|
||||
)
|
||||
|
||||
await self.asr_service.connect()
|
||||
|
||||
logger.info("DuplexPipeline services connected")
|
||||
logger.info("DuplexPipeline services connected (asr_mode={})", self._asr_mode)
|
||||
if not self._outbound_task or self._outbound_task.done():
|
||||
self._outbound_task = asyncio.create_task(self._outbound_loop())
|
||||
|
||||
@@ -1457,6 +1489,7 @@ class DuplexPipeline:
|
||||
self._last_sent_transcript = text
|
||||
|
||||
if is_final:
|
||||
self._latest_asr_interim_text = ""
|
||||
self._pending_transcript_delta = ""
|
||||
self._last_transcript_delta_emit_ms = 0.0
|
||||
await self._send_event(
|
||||
@@ -1472,6 +1505,7 @@ class DuplexPipeline:
|
||||
logger.debug(f"Sent transcript (final): {text[:50]}...")
|
||||
return
|
||||
|
||||
self._latest_asr_interim_text = text
|
||||
self._pending_transcript_delta = text
|
||||
should_emit = (
|
||||
self._last_transcript_delta_emit_ms <= 0.0
|
||||
@@ -1495,14 +1529,16 @@ class DuplexPipeline:
|
||||
await self.conversation.start_user_turn()
|
||||
self._audio_buffer = b""
|
||||
self._last_sent_transcript = ""
|
||||
self._latest_asr_interim_text = ""
|
||||
self.eou_detector.reset()
|
||||
self._asr_capture_active = False
|
||||
self._asr_capture_started_ms = 0.0
|
||||
self._pending_speech_audio = b""
|
||||
|
||||
# Clear ASR buffer. Interim starts only after ASR capture is activated.
|
||||
if hasattr(self.asr_service, 'clear_buffer'):
|
||||
self.asr_service.clear_buffer()
|
||||
if self._asr_mode == "streaming":
|
||||
self._streaming_asr().clear_utterance()
|
||||
else:
|
||||
self._offline_asr().clear_buffer()
|
||||
|
||||
logger.debug("User speech started")
|
||||
|
||||
@@ -1511,8 +1547,10 @@ class DuplexPipeline:
|
||||
if self._asr_capture_active:
|
||||
return
|
||||
|
||||
if hasattr(self.asr_service, 'start_interim_transcription'):
|
||||
await self.asr_service.start_interim_transcription()
|
||||
if self._asr_mode == "streaming":
|
||||
await self._streaming_asr().begin_utterance()
|
||||
else:
|
||||
await self._offline_asr().start_interim_transcription()
|
||||
|
||||
# Prime ASR with a short pre-speech context window so the utterance
|
||||
# start isn't lost while waiting for VAD to transition to Speech.
|
||||
@@ -1545,24 +1583,22 @@ class DuplexPipeline:
|
||||
self._pending_speech_audio = b""
|
||||
return
|
||||
|
||||
# Add a tiny trailing silence tail to stabilize final-token decoding.
|
||||
if self._asr_final_tail_bytes > 0:
|
||||
final_tail = b"\x00" * self._asr_final_tail_bytes
|
||||
await self.asr_service.send_audio(final_tail)
|
||||
|
||||
# Stop interim transcriptions
|
||||
if hasattr(self.asr_service, 'stop_interim_transcription'):
|
||||
await self.asr_service.stop_interim_transcription()
|
||||
|
||||
# Get final transcription from ASR service
|
||||
user_text = ""
|
||||
|
||||
if hasattr(self.asr_service, 'get_final_transcription'):
|
||||
# SiliconFlow ASR - get final transcription
|
||||
user_text = await self.asr_service.get_final_transcription()
|
||||
elif hasattr(self.asr_service, 'get_and_clear_text'):
|
||||
# Buffered ASR - get accumulated text
|
||||
user_text = self.asr_service.get_and_clear_text()
|
||||
if self._asr_mode == "streaming":
|
||||
streaming_asr = self._streaming_asr()
|
||||
await streaming_asr.end_utterance()
|
||||
user_text = await streaming_asr.wait_for_final_transcription(
|
||||
timeout_ms=self._ASR_STREAM_FINAL_TIMEOUT_MS
|
||||
)
|
||||
if not user_text.strip():
|
||||
user_text = self._latest_asr_interim_text
|
||||
else:
|
||||
# Add a tiny trailing silence tail to stabilize final-token decoding.
|
||||
if self._asr_final_tail_bytes > 0:
|
||||
final_tail = b"\x00" * self._asr_final_tail_bytes
|
||||
await self.asr_service.send_audio(final_tail)
|
||||
await self._offline_asr().stop_interim_transcription()
|
||||
user_text = await self._offline_asr().get_final_transcription()
|
||||
|
||||
# Skip if no meaningful text
|
||||
if not user_text or not user_text.strip():
|
||||
@@ -1570,6 +1606,7 @@ class DuplexPipeline:
|
||||
# Reset for next utterance
|
||||
self._audio_buffer = b""
|
||||
self._last_sent_transcript = ""
|
||||
self._latest_asr_interim_text = ""
|
||||
self._asr_capture_active = False
|
||||
self._asr_capture_started_ms = 0.0
|
||||
self._pending_speech_audio = b""
|
||||
@@ -1594,6 +1631,7 @@ class DuplexPipeline:
|
||||
# Clear buffers
|
||||
self._audio_buffer = b""
|
||||
self._last_sent_transcript = ""
|
||||
self._latest_asr_interim_text = ""
|
||||
self._pending_transcript_delta = ""
|
||||
self._last_transcript_delta_emit_ms = 0.0
|
||||
self._asr_capture_active = False
|
||||
|
||||
Reference in New Issue
Block a user