Add ASR interim results support in Assistant model and API

- Introduced `asr_interim_enabled` field in the Assistant model to control interim ASR results.
- Updated AssistantBase and AssistantUpdate schemas to include the new field.
- Modified the database schema to add the `asr_interim_enabled` column.
- Enhanced runtime metadata to reflect interim ASR settings.
- Updated API endpoints and tests to validate the new functionality.
- Adjusted documentation to include details about interim ASR results configuration.
This commit is contained in:
Xin Wang
2026-03-06 12:58:54 +08:00
parent e11c3abb9e
commit da38157638
19 changed files with 183 additions and 5 deletions

View File

@@ -599,6 +599,7 @@ class DuplexPipeline:
"provider": asr_provider,
"mode": self._resolve_asr_mode(asr_provider, self._runtime_asr.get("mode")),
"model": str(self._runtime_asr.get("model") or settings.asr_model or ""),
"enableInterim": self._asr_interim_enabled(),
"interimIntervalMs": int(self._runtime_asr.get("interimIntervalMs") or settings.asr_interim_interval_ms),
"minAudioMs": int(self._runtime_asr.get("minAudioMs") or settings.asr_min_audio_ms),
},
@@ -865,6 +866,20 @@ class DuplexPipeline:
return self._runtime_barge_in_min_duration_ms
return self._barge_in_min_duration_ms
def _asr_interim_enabled(self) -> bool:
    """Return whether interim (non-final) ASR results should be produced.

    The active ASR mode is taken from ``self._asr_mode``; when no ASR
    service is set yet, it is resolved from the runtime ASR config
    (falling back to ``settings.asr_provider`` for the provider).

    Any mode other than ``"offline"`` always reports interim results as
    enabled. In offline mode the runtime ``enableInterim`` value wins when
    ``_coerce_bool`` yields something other than ``None``; otherwise the
    ``settings.asr_enable_interim`` default is used.
    """
    mode = self._asr_mode
    if not self.asr_service:
        # No service yet: derive the mode the same way it would be
        # resolved from runtime overrides and global settings.
        runtime_cfg = self._runtime_asr
        mode = self._resolve_asr_mode(
            runtime_cfg.get("provider") or settings.asr_provider,
            runtime_cfg.get("mode"),
        )

    if mode == "offline":
        # Only offline mode lets the interim toggle be switched off.
        override = self._coerce_bool(self._runtime_asr.get("enableInterim"))
        if override is None:
            return bool(settings.asr_enable_interim)
        return override

    return True
def _barge_in_silence_tolerance_frames(self) -> int:
"""Convert silence tolerance from ms to frame count using current chunk size."""
chunk_ms = max(1, settings.chunk_size_ms)
@@ -991,6 +1006,9 @@ class DuplexPipeline:
asr_api_key = self._runtime_asr.get("apiKey")
asr_api_url = self._runtime_asr.get("baseUrl") or settings.asr_api_url
asr_model = self._runtime_asr.get("model") or settings.asr_model
asr_enable_interim = self._coerce_bool(self._runtime_asr.get("enableInterim"))
if asr_enable_interim is None:
asr_enable_interim = bool(settings.asr_enable_interim)
asr_interim_interval = int(self._runtime_asr.get("interimIntervalMs") or settings.asr_interim_interval_ms)
asr_min_audio_ms = int(self._runtime_asr.get("minAudioMs") or settings.asr_min_audio_ms)
asr_mode = self._resolve_asr_mode(asr_provider, self._runtime_asr.get("mode"))
@@ -1004,6 +1022,7 @@ class DuplexPipeline:
api_key=str(asr_api_key).strip() if asr_api_key else None,
api_url=str(asr_api_url).strip() if asr_api_url else None,
model=str(asr_model).strip() if asr_model else None,
enable_interim=asr_enable_interim,
interim_interval_ms=asr_interim_interval,
min_audio_for_interim_ms=asr_min_audio_ms,
on_transcript=self._on_transcript_callback,
@@ -1481,6 +1500,9 @@ class DuplexPipeline:
text: Transcribed text
is_final: Whether this is the final transcription
"""
if not is_final and not self._asr_interim_enabled():
return
# Avoid sending duplicate transcripts
if text == self._last_sent_transcript and not is_final:
return
@@ -1550,7 +1572,8 @@ class DuplexPipeline:
if self._asr_mode == "streaming":
await self._streaming_asr().begin_utterance()
else:
await self._offline_asr().start_interim_transcription()
if self._asr_interim_enabled():
await self._offline_asr().start_interim_transcription()
# Prime ASR with a short pre-speech context window so the utterance
# start isn't lost while waiting for VAD to transition to Speech.

View File

@@ -22,6 +22,7 @@ class ASRServiceSpec:
api_key: Optional[str] = None
api_url: Optional[str] = None
model: Optional[str] = None
enable_interim: bool = False
interim_interval_ms: int = 500
min_audio_for_interim_ms: int = 300
on_transcript: Optional[TranscriptCallback] = None