Add ASR interim results support in Assistant model and API
- Introduced `asr_interim_enabled` field in the Assistant model to control interim ASR results.
- Updated AssistantBase and AssistantUpdate schemas to include the new field.
- Modified the database schema to add the `asr_interim_enabled` column.
- Enhanced runtime metadata to reflect interim ASR settings.
- Updated API endpoints and tests to validate the new functionality.
- Adjusted documentation to include details about interim ASR results configuration.
This commit is contained in:
@@ -127,6 +127,7 @@ class Assistant(Base):
|
|||||||
speed: Mapped[float] = mapped_column(Float, default=1.0)
|
speed: Mapped[float] = mapped_column(Float, default=1.0)
|
||||||
hotwords: Mapped[dict] = mapped_column(JSON, default=list)
|
hotwords: Mapped[dict] = mapped_column(JSON, default=list)
|
||||||
tools: Mapped[dict] = mapped_column(JSON, default=list)
|
tools: Mapped[dict] = mapped_column(JSON, default=list)
|
||||||
|
asr_interim_enabled: Mapped[bool] = mapped_column(default=False)
|
||||||
bot_cannot_be_interrupted: Mapped[bool] = mapped_column(default=False)
|
bot_cannot_be_interrupted: Mapped[bool] = mapped_column(default=False)
|
||||||
interruption_sensitivity: Mapped[int] = mapped_column(Integer, default=500)
|
interruption_sensitivity: Mapped[int] = mapped_column(Integer, default=500)
|
||||||
config_mode: Mapped[str] = mapped_column(String(32), default="platform")
|
config_mode: Mapped[str] = mapped_column(String(32), default="platform")
|
||||||
|
|||||||
@@ -126,6 +126,9 @@ def _ensure_assistant_schema(db: Session) -> None:
|
|||||||
if "manual_opener_tool_calls" not in columns:
|
if "manual_opener_tool_calls" not in columns:
|
||||||
db.execute(text("ALTER TABLE assistants ADD COLUMN manual_opener_tool_calls JSON"))
|
db.execute(text("ALTER TABLE assistants ADD COLUMN manual_opener_tool_calls JSON"))
|
||||||
altered = True
|
altered = True
|
||||||
|
if "asr_interim_enabled" not in columns:
|
||||||
|
db.execute(text("ALTER TABLE assistants ADD COLUMN asr_interim_enabled BOOLEAN DEFAULT 0"))
|
||||||
|
altered = True
|
||||||
|
|
||||||
if altered:
|
if altered:
|
||||||
db.commit()
|
db.commit()
|
||||||
@@ -317,6 +320,9 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> tuple[Dict[s
|
|||||||
else:
|
else:
|
||||||
warnings.append(f"LLM model not found: {assistant.llm_model_id}")
|
warnings.append(f"LLM model not found: {assistant.llm_model_id}")
|
||||||
|
|
||||||
|
asr_runtime: Dict[str, Any] = {
|
||||||
|
"enableInterim": bool(assistant.asr_interim_enabled),
|
||||||
|
}
|
||||||
if assistant.asr_model_id:
|
if assistant.asr_model_id:
|
||||||
asr = db.query(ASRModel).filter(ASRModel.id == assistant.asr_model_id).first()
|
asr = db.query(ASRModel).filter(ASRModel.id == assistant.asr_model_id).first()
|
||||||
if asr:
|
if asr:
|
||||||
@@ -326,14 +332,15 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> tuple[Dict[s
|
|||||||
asr_provider = "openai_compatible"
|
asr_provider = "openai_compatible"
|
||||||
else:
|
else:
|
||||||
asr_provider = "buffered"
|
asr_provider = "buffered"
|
||||||
metadata["services"]["asr"] = {
|
asr_runtime.update({
|
||||||
"provider": asr_provider,
|
"provider": asr_provider,
|
||||||
"model": asr.model_name or asr.name,
|
"model": asr.model_name or asr.name,
|
||||||
"apiKey": asr.api_key if asr_provider in {"openai_compatible", "dashscope"} else None,
|
"apiKey": asr.api_key if asr_provider in {"openai_compatible", "dashscope"} else None,
|
||||||
"baseUrl": asr.base_url if asr_provider in {"openai_compatible", "dashscope"} else None,
|
"baseUrl": asr.base_url if asr_provider in {"openai_compatible", "dashscope"} else None,
|
||||||
}
|
})
|
||||||
else:
|
else:
|
||||||
warnings.append(f"ASR model not found: {assistant.asr_model_id}")
|
warnings.append(f"ASR model not found: {assistant.asr_model_id}")
|
||||||
|
metadata["services"]["asr"] = asr_runtime
|
||||||
|
|
||||||
if not assistant.voice_output_enabled:
|
if not assistant.voice_output_enabled:
|
||||||
metadata["services"]["tts"] = {"enabled": False}
|
metadata["services"]["tts"] = {"enabled": False}
|
||||||
@@ -437,6 +444,7 @@ def assistant_to_dict(assistant: Assistant) -> dict:
|
|||||||
"speed": assistant.speed,
|
"speed": assistant.speed,
|
||||||
"hotwords": assistant.hotwords or [],
|
"hotwords": assistant.hotwords or [],
|
||||||
"tools": _normalize_assistant_tool_ids(assistant.tools),
|
"tools": _normalize_assistant_tool_ids(assistant.tools),
|
||||||
|
"asrInterimEnabled": bool(assistant.asr_interim_enabled),
|
||||||
"botCannotBeInterrupted": bool(assistant.bot_cannot_be_interrupted),
|
"botCannotBeInterrupted": bool(assistant.bot_cannot_be_interrupted),
|
||||||
"interruptionSensitivity": assistant.interruption_sensitivity,
|
"interruptionSensitivity": assistant.interruption_sensitivity,
|
||||||
"configMode": assistant.config_mode,
|
"configMode": assistant.config_mode,
|
||||||
@@ -457,6 +465,7 @@ def _apply_assistant_update(assistant: Assistant, update_data: dict) -> None:
|
|||||||
"firstTurnMode": "first_turn_mode",
|
"firstTurnMode": "first_turn_mode",
|
||||||
"manualOpenerToolCalls": "manual_opener_tool_calls",
|
"manualOpenerToolCalls": "manual_opener_tool_calls",
|
||||||
"interruptionSensitivity": "interruption_sensitivity",
|
"interruptionSensitivity": "interruption_sensitivity",
|
||||||
|
"asrInterimEnabled": "asr_interim_enabled",
|
||||||
"botCannotBeInterrupted": "bot_cannot_be_interrupted",
|
"botCannotBeInterrupted": "bot_cannot_be_interrupted",
|
||||||
"configMode": "config_mode",
|
"configMode": "config_mode",
|
||||||
"voiceOutputEnabled": "voice_output_enabled",
|
"voiceOutputEnabled": "voice_output_enabled",
|
||||||
@@ -651,6 +660,7 @@ def create_assistant(data: AssistantCreate, db: Session = Depends(get_db)):
|
|||||||
speed=data.speed,
|
speed=data.speed,
|
||||||
hotwords=data.hotwords,
|
hotwords=data.hotwords,
|
||||||
tools=_normalize_assistant_tool_ids(data.tools),
|
tools=_normalize_assistant_tool_ids(data.tools),
|
||||||
|
asr_interim_enabled=data.asrInterimEnabled,
|
||||||
bot_cannot_be_interrupted=data.botCannotBeInterrupted,
|
bot_cannot_be_interrupted=data.botCannotBeInterrupted,
|
||||||
interruption_sensitivity=data.interruptionSensitivity,
|
interruption_sensitivity=data.interruptionSensitivity,
|
||||||
config_mode=data.configMode,
|
config_mode=data.configMode,
|
||||||
|
|||||||
@@ -291,6 +291,7 @@ class AssistantBase(BaseModel):
|
|||||||
speed: float = 1.0
|
speed: float = 1.0
|
||||||
hotwords: List[str] = []
|
hotwords: List[str] = []
|
||||||
tools: List[str] = []
|
tools: List[str] = []
|
||||||
|
asrInterimEnabled: bool = False
|
||||||
botCannotBeInterrupted: bool = False
|
botCannotBeInterrupted: bool = False
|
||||||
interruptionSensitivity: int = 500
|
interruptionSensitivity: int = 500
|
||||||
configMode: str = "platform"
|
configMode: str = "platform"
|
||||||
@@ -322,6 +323,7 @@ class AssistantUpdate(BaseModel):
|
|||||||
speed: Optional[float] = None
|
speed: Optional[float] = None
|
||||||
hotwords: Optional[List[str]] = None
|
hotwords: Optional[List[str]] = None
|
||||||
tools: Optional[List[str]] = None
|
tools: Optional[List[str]] = None
|
||||||
|
asrInterimEnabled: Optional[bool] = None
|
||||||
botCannotBeInterrupted: Optional[bool] = None
|
botCannotBeInterrupted: Optional[bool] = None
|
||||||
interruptionSensitivity: Optional[int] = None
|
interruptionSensitivity: Optional[int] = None
|
||||||
configMode: Optional[str] = None
|
configMode: Optional[str] = None
|
||||||
|
|||||||
@@ -27,6 +27,7 @@ class TestAssistantAPI:
|
|||||||
assert data["voiceOutputEnabled"] is True
|
assert data["voiceOutputEnabled"] is True
|
||||||
assert data["firstTurnMode"] == "bot_first"
|
assert data["firstTurnMode"] == "bot_first"
|
||||||
assert data["generatedOpenerEnabled"] is False
|
assert data["generatedOpenerEnabled"] is False
|
||||||
|
assert data["asrInterimEnabled"] is False
|
||||||
assert data["botCannotBeInterrupted"] is False
|
assert data["botCannotBeInterrupted"] is False
|
||||||
assert "id" in data
|
assert "id" in data
|
||||||
assert data["callCount"] == 0
|
assert data["callCount"] == 0
|
||||||
@@ -37,6 +38,7 @@ class TestAssistantAPI:
|
|||||||
response = client.post("/api/assistants", json=data)
|
response = client.post("/api/assistants", json=data)
|
||||||
assert response.status_code == 200
|
assert response.status_code == 200
|
||||||
assert response.json()["name"] == "Minimal Assistant"
|
assert response.json()["name"] == "Minimal Assistant"
|
||||||
|
assert response.json()["asrInterimEnabled"] is False
|
||||||
|
|
||||||
def test_get_assistant_by_id(self, client, sample_assistant_data):
|
def test_get_assistant_by_id(self, client, sample_assistant_data):
|
||||||
"""Test getting a specific assistant by ID"""
|
"""Test getting a specific assistant by ID"""
|
||||||
@@ -68,6 +70,7 @@ class TestAssistantAPI:
|
|||||||
"prompt": "You are an updated assistant.",
|
"prompt": "You are an updated assistant.",
|
||||||
"speed": 1.5,
|
"speed": 1.5,
|
||||||
"voiceOutputEnabled": False,
|
"voiceOutputEnabled": False,
|
||||||
|
"asrInterimEnabled": True,
|
||||||
"manualOpenerToolCalls": [
|
"manualOpenerToolCalls": [
|
||||||
{"toolName": "text_msg_prompt", "arguments": {"msg": "请选择服务类型"}}
|
{"toolName": "text_msg_prompt", "arguments": {"msg": "请选择服务类型"}}
|
||||||
],
|
],
|
||||||
@@ -79,6 +82,7 @@ class TestAssistantAPI:
|
|||||||
assert data["prompt"] == "You are an updated assistant."
|
assert data["prompt"] == "You are an updated assistant."
|
||||||
assert data["speed"] == 1.5
|
assert data["speed"] == 1.5
|
||||||
assert data["voiceOutputEnabled"] is False
|
assert data["voiceOutputEnabled"] is False
|
||||||
|
assert data["asrInterimEnabled"] is True
|
||||||
assert data["manualOpenerToolCalls"] == [
|
assert data["manualOpenerToolCalls"] == [
|
||||||
{"toolName": "text_msg_prompt", "arguments": {"msg": "请选择服务类型"}}
|
{"toolName": "text_msg_prompt", "arguments": {"msg": "请选择服务类型"}}
|
||||||
]
|
]
|
||||||
@@ -213,6 +217,7 @@ class TestAssistantAPI:
|
|||||||
"prompt": "runtime prompt",
|
"prompt": "runtime prompt",
|
||||||
"opener": "runtime opener",
|
"opener": "runtime opener",
|
||||||
"manualOpenerToolCalls": [{"toolName": "text_msg_prompt", "arguments": {"msg": "欢迎"}}],
|
"manualOpenerToolCalls": [{"toolName": "text_msg_prompt", "arguments": {"msg": "欢迎"}}],
|
||||||
|
"asrInterimEnabled": True,
|
||||||
"speed": 1.1,
|
"speed": 1.1,
|
||||||
})
|
})
|
||||||
assistant_resp = client.post("/api/assistants", json=sample_assistant_data)
|
assistant_resp = client.post("/api/assistants", json=sample_assistant_data)
|
||||||
@@ -232,6 +237,7 @@ class TestAssistantAPI:
|
|||||||
assert metadata["services"]["llm"]["model"] == sample_llm_model_data["model_name"]
|
assert metadata["services"]["llm"]["model"] == sample_llm_model_data["model_name"]
|
||||||
assert metadata["services"]["asr"]["model"] == sample_asr_model_data["model_name"]
|
assert metadata["services"]["asr"]["model"] == sample_asr_model_data["model_name"]
|
||||||
assert metadata["services"]["asr"]["baseUrl"] == sample_asr_model_data["base_url"]
|
assert metadata["services"]["asr"]["baseUrl"] == sample_asr_model_data["base_url"]
|
||||||
|
assert metadata["services"]["asr"]["enableInterim"] is True
|
||||||
expected_tts_voice = f"{sample_voice_data['model']}:{sample_voice_data['voice_key']}"
|
expected_tts_voice = f"{sample_voice_data['model']}:{sample_voice_data['voice_key']}"
|
||||||
assert metadata["services"]["tts"]["voice"] == expected_tts_voice
|
assert metadata["services"]["tts"]["voice"] == expected_tts_voice
|
||||||
assert metadata["services"]["tts"]["baseUrl"] == sample_voice_data["base_url"]
|
assert metadata["services"]["tts"]["baseUrl"] == sample_voice_data["base_url"]
|
||||||
@@ -309,6 +315,7 @@ class TestAssistantAPI:
|
|||||||
assert runtime_resp.status_code == 200
|
assert runtime_resp.status_code == 200
|
||||||
metadata = runtime_resp.json()["sessionStartMetadata"]
|
metadata = runtime_resp.json()["sessionStartMetadata"]
|
||||||
assert metadata["output"]["mode"] == "text"
|
assert metadata["output"]["mode"] == "text"
|
||||||
|
assert metadata["services"]["asr"]["enableInterim"] is False
|
||||||
assert metadata["services"]["tts"]["enabled"] is False
|
assert metadata["services"]["tts"]["enabled"] is False
|
||||||
|
|
||||||
def test_runtime_config_dashscope_voice_provider(self, client, sample_assistant_data):
|
def test_runtime_config_dashscope_voice_provider(self, client, sample_assistant_data):
|
||||||
@@ -373,6 +380,17 @@ class TestAssistantAPI:
|
|||||||
asr = metadata["services"]["asr"]
|
asr = metadata["services"]["asr"]
|
||||||
assert asr["provider"] == "dashscope"
|
assert asr["provider"] == "dashscope"
|
||||||
assert asr["baseUrl"] == "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
|
assert asr["baseUrl"] == "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
|
||||||
|
assert asr["enableInterim"] is False
|
||||||
|
|
||||||
|
def test_runtime_config_defaults_asr_interim_disabled_without_asr_model(self, client, sample_assistant_data):
    """Runtime config exposes ``enableInterim`` as False for a freshly created assistant.

    NOTE(review): per the test name, ``sample_assistant_data`` is presumably
    created without an ASR model attached -- confirm against the fixture.
    """
    created = client.post("/api/assistants", json=sample_assistant_data)
    assert created.status_code == 200
    new_id = created.json()["id"]

    runtime = client.get(f"/api/assistants/{new_id}/runtime-config")
    assert runtime.status_code == 200

    # The ASR service entry must be present and carry the interim flag.
    session_metadata = runtime.json()["sessionStartMetadata"]
    asr_entry = session_metadata["services"]["asr"]
    assert asr_entry["enableInterim"] is False
|
||||||
|
|
||||||
def test_assistant_interrupt_and_generated_opener_flags(self, client, sample_assistant_data):
|
def test_assistant_interrupt_and_generated_opener_flags(self, client, sample_assistant_data):
|
||||||
sample_assistant_data.update({
|
sample_assistant_data.update({
|
||||||
|
|||||||
@@ -13,6 +13,7 @@
|
|||||||
|---|---|
|
|---|---|
|
||||||
| ASR 引擎 | 选择语音识别服务提供商 |
|
| ASR 引擎 | 选择语音识别服务提供商 |
|
||||||
| 模型 | 识别模型名称 |
|
| 模型 | 识别模型名称 |
|
||||||
|
| `enable_interim` | 是否开启离线 ASR 中间结果(默认 `false`,仅离线模式生效) |
|
||||||
| 语言 | 中文/英文/多语言 |
|
| 语言 | 中文/英文/多语言 |
|
||||||
| 热词 | 提升特定词汇识别准确率 |
|
| 热词 | 提升特定词汇识别准确率 |
|
||||||
| 标点与规范化 | 是否自动补全标点、文本规范化 |
|
| 标点与规范化 | 是否自动补全标点、文本规范化 |
|
||||||
|
|||||||
@@ -249,6 +249,8 @@ class LocalYamlAssistantConfigAdapter(NullBackendAdapter):
|
|||||||
asr_runtime["apiKey"] = cls._as_str(asr.get("api_key"))
|
asr_runtime["apiKey"] = cls._as_str(asr.get("api_key"))
|
||||||
if cls._as_str(asr.get("api_url")):
|
if cls._as_str(asr.get("api_url")):
|
||||||
asr_runtime["baseUrl"] = cls._as_str(asr.get("api_url"))
|
asr_runtime["baseUrl"] = cls._as_str(asr.get("api_url"))
|
||||||
|
if asr.get("enable_interim") is not None:
|
||||||
|
asr_runtime["enableInterim"] = asr.get("enable_interim")
|
||||||
if asr.get("interim_interval_ms") is not None:
|
if asr.get("interim_interval_ms") is not None:
|
||||||
asr_runtime["interimIntervalMs"] = asr.get("interim_interval_ms")
|
asr_runtime["interimIntervalMs"] = asr.get("interim_interval_ms")
|
||||||
if asr.get("min_audio_ms") is not None:
|
if asr.get("min_audio_ms") is not None:
|
||||||
|
|||||||
@@ -89,6 +89,7 @@ class Settings(BaseSettings):
|
|||||||
)
|
)
|
||||||
asr_api_url: Optional[str] = Field(default=None, description="ASR provider API URL")
|
asr_api_url: Optional[str] = Field(default=None, description="ASR provider API URL")
|
||||||
asr_model: Optional[str] = Field(default=None, description="ASR model name")
|
asr_model: Optional[str] = Field(default=None, description="ASR model name")
|
||||||
|
asr_enable_interim: bool = Field(default=False, description="Enable interim transcripts for offline ASR")
|
||||||
asr_interim_interval_ms: int = Field(default=500, description="Interval for interim ASR results in ms")
|
asr_interim_interval_ms: int = Field(default=500, description="Interval for interim ASR results in ms")
|
||||||
asr_min_audio_ms: int = Field(default=300, description="Minimum audio duration before first ASR result")
|
asr_min_audio_ms: int = Field(default=300, description="Minimum audio duration before first ASR result")
|
||||||
asr_start_min_speech_ms: int = Field(
|
asr_start_min_speech_ms: int = Field(
|
||||||
|
|||||||
@@ -44,6 +44,7 @@ agent:
|
|||||||
api_key: you_asr_api_key
|
api_key: you_asr_api_key
|
||||||
api_url: https://api.siliconflow.cn/v1/audio/transcriptions
|
api_url: https://api.siliconflow.cn/v1/audio/transcriptions
|
||||||
model: FunAudioLLM/SenseVoiceSmall
|
model: FunAudioLLM/SenseVoiceSmall
|
||||||
|
enable_interim: false
|
||||||
interim_interval_ms: 500
|
interim_interval_ms: 500
|
||||||
min_audio_ms: 300
|
min_audio_ms: 300
|
||||||
start_min_speech_ms: 160
|
start_min_speech_ms: 160
|
||||||
|
|||||||
@@ -41,6 +41,7 @@ agent:
|
|||||||
api_key: your_asr_api_key
|
api_key: your_asr_api_key
|
||||||
api_url: https://api.siliconflow.cn/v1/audio/transcriptions
|
api_url: https://api.siliconflow.cn/v1/audio/transcriptions
|
||||||
model: FunAudioLLM/SenseVoiceSmall
|
model: FunAudioLLM/SenseVoiceSmall
|
||||||
|
enable_interim: false
|
||||||
interim_interval_ms: 500
|
interim_interval_ms: 500
|
||||||
min_audio_ms: 300
|
min_audio_ms: 300
|
||||||
start_min_speech_ms: 160
|
start_min_speech_ms: 160
|
||||||
|
|||||||
@@ -53,6 +53,7 @@ class OpenAICompatibleASRService(BaseASRService):
|
|||||||
model: str = "FunAudioLLM/SenseVoiceSmall",
|
model: str = "FunAudioLLM/SenseVoiceSmall",
|
||||||
sample_rate: int = 16000,
|
sample_rate: int = 16000,
|
||||||
language: str = "auto",
|
language: str = "auto",
|
||||||
|
enable_interim: bool = False,
|
||||||
interim_interval_ms: int = 500, # How often to send interim results
|
interim_interval_ms: int = 500, # How often to send interim results
|
||||||
min_audio_for_interim_ms: int = 300, # Min audio before first interim
|
min_audio_for_interim_ms: int = 300, # Min audio before first interim
|
||||||
on_transcript: Optional[Callable[[str, bool], Awaitable[None]]] = None
|
on_transcript: Optional[Callable[[str, bool], Awaitable[None]]] = None
|
||||||
@@ -66,6 +67,7 @@ class OpenAICompatibleASRService(BaseASRService):
|
|||||||
model: ASR model name or alias
|
model: ASR model name or alias
|
||||||
sample_rate: Audio sample rate (16000 recommended)
|
sample_rate: Audio sample rate (16000 recommended)
|
||||||
language: Language code (auto for automatic detection)
|
language: Language code (auto for automatic detection)
|
||||||
|
enable_interim: Whether to generate interim transcriptions in offline mode
|
||||||
interim_interval_ms: How often to generate interim transcriptions
|
interim_interval_ms: How often to generate interim transcriptions
|
||||||
min_audio_for_interim_ms: Minimum audio duration before first interim
|
min_audio_for_interim_ms: Minimum audio duration before first interim
|
||||||
on_transcript: Callback for transcription results (text, is_final)
|
on_transcript: Callback for transcription results (text, is_final)
|
||||||
@@ -80,6 +82,7 @@ class OpenAICompatibleASRService(BaseASRService):
|
|||||||
raw_api_url = api_url or os.getenv("ASR_API_URL") or self.API_URL
|
raw_api_url = api_url or os.getenv("ASR_API_URL") or self.API_URL
|
||||||
self.api_url = self._resolve_transcriptions_endpoint(raw_api_url)
|
self.api_url = self._resolve_transcriptions_endpoint(raw_api_url)
|
||||||
self.model = self.MODELS.get(model.lower(), model)
|
self.model = self.MODELS.get(model.lower(), model)
|
||||||
|
self.enable_interim = bool(enable_interim)
|
||||||
self.interim_interval_ms = interim_interval_ms
|
self.interim_interval_ms = interim_interval_ms
|
||||||
self.min_audio_for_interim_ms = min_audio_for_interim_ms
|
self.min_audio_for_interim_ms = min_audio_for_interim_ms
|
||||||
self.on_transcript = on_transcript
|
self.on_transcript = on_transcript
|
||||||
@@ -181,6 +184,9 @@ class OpenAICompatibleASRService(BaseASRService):
|
|||||||
if not self._session:
|
if not self._session:
|
||||||
logger.warning("ASR session not connected")
|
logger.warning("ASR session not connected")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
if not is_final and not self.enable_interim:
|
||||||
|
return None
|
||||||
|
|
||||||
# Check minimum audio duration
|
# Check minimum audio duration
|
||||||
audio_duration_ms = len(self._audio_buffer) / (self.sample_rate * 2) * 1000
|
audio_duration_ms = len(self._audio_buffer) / (self.sample_rate * 2) * 1000
|
||||||
@@ -310,6 +316,9 @@ class OpenAICompatibleASRService(BaseASRService):
|
|||||||
This periodically transcribes buffered audio for
|
This periodically transcribes buffered audio for
|
||||||
real-time feedback to the user.
|
real-time feedback to the user.
|
||||||
"""
|
"""
|
||||||
|
if not self.enable_interim:
|
||||||
|
return
|
||||||
|
|
||||||
if self._interim_task and not self._interim_task.done():
|
if self._interim_task and not self._interim_task.done():
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|||||||
@@ -117,6 +117,7 @@ class DefaultRealtimeServiceFactory(RealtimeServiceFactory):
|
|||||||
model=spec.model or self._DEFAULT_OPENAI_COMPATIBLE_ASR_MODEL,
|
model=spec.model or self._DEFAULT_OPENAI_COMPATIBLE_ASR_MODEL,
|
||||||
sample_rate=spec.sample_rate,
|
sample_rate=spec.sample_rate,
|
||||||
language=spec.language,
|
language=spec.language,
|
||||||
|
enable_interim=spec.enable_interim,
|
||||||
interim_interval_ms=spec.interim_interval_ms,
|
interim_interval_ms=spec.interim_interval_ms,
|
||||||
min_audio_for_interim_ms=spec.min_audio_for_interim_ms,
|
min_audio_for_interim_ms=spec.min_audio_for_interim_ms,
|
||||||
on_transcript=spec.on_transcript,
|
on_transcript=spec.on_transcript,
|
||||||
|
|||||||
@@ -599,6 +599,7 @@ class DuplexPipeline:
|
|||||||
"provider": asr_provider,
|
"provider": asr_provider,
|
||||||
"mode": self._resolve_asr_mode(asr_provider, self._runtime_asr.get("mode")),
|
"mode": self._resolve_asr_mode(asr_provider, self._runtime_asr.get("mode")),
|
||||||
"model": str(self._runtime_asr.get("model") or settings.asr_model or ""),
|
"model": str(self._runtime_asr.get("model") or settings.asr_model or ""),
|
||||||
|
"enableInterim": self._asr_interim_enabled(),
|
||||||
"interimIntervalMs": int(self._runtime_asr.get("interimIntervalMs") or settings.asr_interim_interval_ms),
|
"interimIntervalMs": int(self._runtime_asr.get("interimIntervalMs") or settings.asr_interim_interval_ms),
|
||||||
"minAudioMs": int(self._runtime_asr.get("minAudioMs") or settings.asr_min_audio_ms),
|
"minAudioMs": int(self._runtime_asr.get("minAudioMs") or settings.asr_min_audio_ms),
|
||||||
},
|
},
|
||||||
@@ -865,6 +866,20 @@ class DuplexPipeline:
|
|||||||
return self._runtime_barge_in_min_duration_ms
|
return self._runtime_barge_in_min_duration_ms
|
||||||
return self._barge_in_min_duration_ms
|
return self._barge_in_min_duration_ms
|
||||||
|
|
||||||
|
def _asr_interim_enabled(self) -> bool:
    """Return whether interim (non-final) ASR transcripts are enabled.

    Any ASR mode other than ``"offline"`` always yields True. In offline
    mode the runtime ``enableInterim`` value is used when it coerces to a
    non-None value; otherwise ``settings.asr_enable_interim`` decides.
    """
    mode = self._asr_mode
    if not self.asr_service:
        # No ASR service instance: resolve the mode from the runtime
        # config, falling back to the configured provider.
        mode = self._resolve_asr_mode(
            self._runtime_asr.get("provider") or settings.asr_provider,
            self._runtime_asr.get("mode"),
        )

    if mode != "offline":
        return True

    runtime_flag = self._coerce_bool(self._runtime_asr.get("enableInterim"))
    if runtime_flag is None:
        return bool(settings.asr_enable_interim)
    return runtime_flag
|
||||||
|
|
||||||
def _barge_in_silence_tolerance_frames(self) -> int:
|
def _barge_in_silence_tolerance_frames(self) -> int:
|
||||||
"""Convert silence tolerance from ms to frame count using current chunk size."""
|
"""Convert silence tolerance from ms to frame count using current chunk size."""
|
||||||
chunk_ms = max(1, settings.chunk_size_ms)
|
chunk_ms = max(1, settings.chunk_size_ms)
|
||||||
@@ -991,6 +1006,9 @@ class DuplexPipeline:
|
|||||||
asr_api_key = self._runtime_asr.get("apiKey")
|
asr_api_key = self._runtime_asr.get("apiKey")
|
||||||
asr_api_url = self._runtime_asr.get("baseUrl") or settings.asr_api_url
|
asr_api_url = self._runtime_asr.get("baseUrl") or settings.asr_api_url
|
||||||
asr_model = self._runtime_asr.get("model") or settings.asr_model
|
asr_model = self._runtime_asr.get("model") or settings.asr_model
|
||||||
|
asr_enable_interim = self._coerce_bool(self._runtime_asr.get("enableInterim"))
|
||||||
|
if asr_enable_interim is None:
|
||||||
|
asr_enable_interim = bool(settings.asr_enable_interim)
|
||||||
asr_interim_interval = int(self._runtime_asr.get("interimIntervalMs") or settings.asr_interim_interval_ms)
|
asr_interim_interval = int(self._runtime_asr.get("interimIntervalMs") or settings.asr_interim_interval_ms)
|
||||||
asr_min_audio_ms = int(self._runtime_asr.get("minAudioMs") or settings.asr_min_audio_ms)
|
asr_min_audio_ms = int(self._runtime_asr.get("minAudioMs") or settings.asr_min_audio_ms)
|
||||||
asr_mode = self._resolve_asr_mode(asr_provider, self._runtime_asr.get("mode"))
|
asr_mode = self._resolve_asr_mode(asr_provider, self._runtime_asr.get("mode"))
|
||||||
@@ -1004,6 +1022,7 @@ class DuplexPipeline:
|
|||||||
api_key=str(asr_api_key).strip() if asr_api_key else None,
|
api_key=str(asr_api_key).strip() if asr_api_key else None,
|
||||||
api_url=str(asr_api_url).strip() if asr_api_url else None,
|
api_url=str(asr_api_url).strip() if asr_api_url else None,
|
||||||
model=str(asr_model).strip() if asr_model else None,
|
model=str(asr_model).strip() if asr_model else None,
|
||||||
|
enable_interim=asr_enable_interim,
|
||||||
interim_interval_ms=asr_interim_interval,
|
interim_interval_ms=asr_interim_interval,
|
||||||
min_audio_for_interim_ms=asr_min_audio_ms,
|
min_audio_for_interim_ms=asr_min_audio_ms,
|
||||||
on_transcript=self._on_transcript_callback,
|
on_transcript=self._on_transcript_callback,
|
||||||
@@ -1481,6 +1500,9 @@ class DuplexPipeline:
|
|||||||
text: Transcribed text
|
text: Transcribed text
|
||||||
is_final: Whether this is the final transcription
|
is_final: Whether this is the final transcription
|
||||||
"""
|
"""
|
||||||
|
if not is_final and not self._asr_interim_enabled():
|
||||||
|
return
|
||||||
|
|
||||||
# Avoid sending duplicate transcripts
|
# Avoid sending duplicate transcripts
|
||||||
if text == self._last_sent_transcript and not is_final:
|
if text == self._last_sent_transcript and not is_final:
|
||||||
return
|
return
|
||||||
@@ -1550,7 +1572,8 @@ class DuplexPipeline:
|
|||||||
if self._asr_mode == "streaming":
|
if self._asr_mode == "streaming":
|
||||||
await self._streaming_asr().begin_utterance()
|
await self._streaming_asr().begin_utterance()
|
||||||
else:
|
else:
|
||||||
await self._offline_asr().start_interim_transcription()
|
if self._asr_interim_enabled():
|
||||||
|
await self._offline_asr().start_interim_transcription()
|
||||||
|
|
||||||
# Prime ASR with a short pre-speech context window so the utterance
|
# Prime ASR with a short pre-speech context window so the utterance
|
||||||
# start isn't lost while waiting for VAD to transition to Speech.
|
# start isn't lost while waiting for VAD to transition to Speech.
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ class ASRServiceSpec:
|
|||||||
api_key: Optional[str] = None
|
api_key: Optional[str] = None
|
||||||
api_url: Optional[str] = None
|
api_url: Optional[str] = None
|
||||||
model: Optional[str] = None
|
model: Optional[str] = None
|
||||||
|
enable_interim: bool = False
|
||||||
interim_interval_ms: int = 500
|
interim_interval_ms: int = 500
|
||||||
min_audio_for_interim_ms: int = 300
|
min_audio_for_interim_ms: int = 300
|
||||||
on_transcript: Optional[TranscriptCallback] = None
|
on_transcript: Optional[TranscriptCallback] = None
|
||||||
|
|||||||
@@ -32,6 +32,7 @@ def test_create_asr_service_openai_compatible_returns_offline_provider():
|
|||||||
)
|
)
|
||||||
assert isinstance(service, OpenAICompatibleASRService)
|
assert isinstance(service, OpenAICompatibleASRService)
|
||||||
assert service.mode == "offline"
|
assert service.mode == "offline"
|
||||||
|
assert service.enable_interim is False
|
||||||
|
|
||||||
|
|
||||||
def test_create_asr_service_fallback_buffered_for_unsupported_provider():
|
def test_create_asr_service_fallback_buffered_for_unsupported_provider():
|
||||||
|
|||||||
@@ -282,7 +282,7 @@ async def test_local_yaml_adapter_rejects_path_traversal_like_assistant_id(tmp_p
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_local_yaml_translates_agent_schema_to_runtime_services(tmp_path):
|
async def test_local_yaml_translates_agent_schema_with_asr_interim_flag(tmp_path):
|
||||||
config_dir = tmp_path / "assistants"
|
config_dir = tmp_path / "assistants"
|
||||||
config_dir.mkdir(parents=True, exist_ok=True)
|
config_dir.mkdir(parents=True, exist_ok=True)
|
||||||
(config_dir / "default.yaml").write_text(
|
(config_dir / "default.yaml").write_text(
|
||||||
@@ -305,6 +305,7 @@ async def test_local_yaml_translates_agent_schema_to_runtime_services(tmp_path):
|
|||||||
" model: asr-model",
|
" model: asr-model",
|
||||||
" api_key: sk-asr",
|
" api_key: sk-asr",
|
||||||
" api_url: https://asr.example.com/v1/audio/transcriptions",
|
" api_url: https://asr.example.com/v1/audio/transcriptions",
|
||||||
|
" enable_interim: false",
|
||||||
" duplex:",
|
" duplex:",
|
||||||
" system_prompt: You are test assistant",
|
" system_prompt: You are test assistant",
|
||||||
]
|
]
|
||||||
@@ -321,4 +322,5 @@ async def test_local_yaml_translates_agent_schema_to_runtime_services(tmp_path):
|
|||||||
assert services.get("llm", {}).get("apiKey") == "sk-llm"
|
assert services.get("llm", {}).get("apiKey") == "sk-llm"
|
||||||
assert services.get("tts", {}).get("apiKey") == "sk-tts"
|
assert services.get("tts", {}).get("apiKey") == "sk-tts"
|
||||||
assert services.get("asr", {}).get("apiKey") == "sk-asr"
|
assert services.get("asr", {}).get("apiKey") == "sk-asr"
|
||||||
|
assert services.get("asr", {}).get("enableInterim") is False
|
||||||
assert assistant.get("systemPrompt") == "You are test assistant"
|
assert assistant.get("systemPrompt") == "You are test assistant"
|
||||||
|
|||||||
@@ -145,10 +145,11 @@ async def test_start_asr_capture_uses_streaming_begin(monkeypatch):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_start_asr_capture_uses_offline_interim_control(monkeypatch):
|
async def test_start_asr_capture_uses_offline_interim_control_when_enabled(monkeypatch):
|
||||||
asr = _FakeOfflineASR()
|
asr = _FakeOfflineASR()
|
||||||
pipeline = _build_pipeline(monkeypatch, asr)
|
pipeline = _build_pipeline(monkeypatch, asr)
|
||||||
pipeline._asr_mode = "offline"
|
pipeline._asr_mode = "offline"
|
||||||
|
pipeline._runtime_asr["enableInterim"] = True
|
||||||
pipeline._pending_speech_audio = b"\x00" * 320
|
pipeline._pending_speech_audio = b"\x00" * 320
|
||||||
pipeline._pre_speech_buffer = b"\x00" * 640
|
pipeline._pre_speech_buffer = b"\x00" * 640
|
||||||
|
|
||||||
@@ -159,6 +160,69 @@ async def test_start_asr_capture_uses_offline_interim_control(monkeypatch):
|
|||||||
assert pipeline._asr_capture_active is True
|
assert pipeline._asr_capture_active is True
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_start_asr_capture_skips_offline_interim_control_when_disabled(monkeypatch):
    """With interim disabled in offline mode, capture starts without interim transcription."""
    fake_asr = _FakeOfflineASR()
    duplex = _build_pipeline(monkeypatch, fake_asr)

    # Offline mode with the runtime interim flag explicitly switched off.
    duplex._asr_mode = "offline"
    duplex._runtime_asr["enableInterim"] = False
    duplex._pending_speech_audio = b"\x00" * 320
    duplex._pre_speech_buffer = b"\x00" * 640

    await duplex._start_asr_capture()

    # Interim control is never started, yet audio is sent and capture is active.
    assert fake_asr.start_interim_calls == 0
    assert fake_asr.sent_audio
    assert duplex._asr_capture_active is True
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_offline_interim_callback_ignored_when_disabled(monkeypatch):
    """A non-final transcript is dropped when offline interim results are disabled."""
    fake_asr = _FakeOfflineASR()
    duplex = _build_pipeline(monkeypatch, fake_asr)
    duplex._asr_mode = "offline"
    duplex._runtime_asr["enableInterim"] = False

    seen_events = []
    seen_deltas = []

    async def _record_event(event: Dict[str, Any], priority: int = 20):
        # ``priority`` is accepted only to mirror the patched method's signature.
        del priority
        seen_events.append(event)

    async def _record_delta(text: str):
        seen_deltas.append(text)

    monkeypatch.setattr(duplex, "_send_event", _record_event)
    monkeypatch.setattr(duplex, "_emit_transcript_delta", _record_delta)

    await duplex._on_transcript_callback("ignored interim", is_final=False)

    # Nothing is emitted and no interim text is retained.
    assert seen_events == []
    assert seen_deltas == []
    assert duplex._latest_asr_interim_text == ""
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_offline_final_callback_emits_when_interim_disabled(monkeypatch):
    """A final transcript is still emitted in offline mode with interim disabled.

    After a final transcript callback, at least one captured event must have
    ``type == "transcript.final"``.
    """
    fake_asr = _FakeOfflineASR()
    pipe = _build_pipeline(monkeypatch, fake_asr)
    pipe._asr_mode = "offline"
    pipe._runtime_asr["enableInterim"] = False

    sent_events = []

    async def _record_event(event: Dict[str, Any], priority: int = 20):
        # ``priority`` is accepted but deliberately unused.
        del priority
        sent_events.append(event)

    # Capture everything the pipeline tries to send.
    monkeypatch.setattr(pipe, "_send_event", _record_event)

    await pipe._on_transcript_callback("final only", is_final=True)

    saw_final = any(evt.get("type") == "transcript.final" for evt in sent_events)
    assert saw_final
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_streaming_eou_falls_back_to_latest_interim(monkeypatch):
|
async def test_streaming_eou_falls_back_to_latest_interim(monkeypatch):
|
||||||
asr = _FakeStreamingASR()
|
asr = _FakeStreamingASR()
|
||||||
|
|||||||
@@ -259,6 +259,7 @@ export const AssistantsPage: React.FC = () => {
|
|||||||
speed: 1,
|
speed: 1,
|
||||||
hotwords: [],
|
hotwords: [],
|
||||||
tools: [],
|
tools: [],
|
||||||
|
asrInterimEnabled: false,
|
||||||
botCannotBeInterrupted: false,
|
botCannotBeInterrupted: false,
|
||||||
interruptionSensitivity: 180,
|
interruptionSensitivity: 180,
|
||||||
configMode: 'platform',
|
configMode: 'platform',
|
||||||
@@ -1358,6 +1359,41 @@ export const AssistantsPage: React.FC = () => {
|
|||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<div className="space-y-3">
|
||||||
|
<div className="flex items-center justify-between gap-3">
|
||||||
|
<label className="text-sm font-medium text-white flex items-center">
|
||||||
|
<Mic className="w-4 h-4 mr-2 text-primary"/> 离线 ASR 中间结果
|
||||||
|
</label>
|
||||||
|
<div className="inline-flex rounded-lg border border-white/10 bg-white/5 p-1">
|
||||||
|
<button
|
||||||
|
type="button"
|
||||||
|
onClick={() => updateAssistant('asrInterimEnabled', false)}
|
||||||
|
className={`px-3 py-1 text-xs rounded-md transition-colors ${
|
||||||
|
selectedAssistant.asrInterimEnabled === true
|
||||||
|
? 'text-muted-foreground hover:text-foreground'
|
||||||
|
: 'bg-primary text-primary-foreground shadow-sm'
|
||||||
|
}`}
|
||||||
|
>
|
||||||
|
关闭
|
||||||
|
</button>
|
||||||
|
<button
|
||||||
|
type="button"
|
||||||
|
onClick={() => updateAssistant('asrInterimEnabled', true)}
|
||||||
|
className={`px-3 py-1 text-xs rounded-md transition-colors ${
|
||||||
|
selectedAssistant.asrInterimEnabled === true
|
||||||
|
? 'bg-primary text-primary-foreground shadow-sm'
|
||||||
|
: 'text-muted-foreground hover:text-foreground'
|
||||||
|
}`}
|
||||||
|
>
|
||||||
|
开启
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<p className="text-xs text-muted-foreground">
|
||||||
|
仅影响离线 ASR 模式(OpenAI Compatible / buffered)。默认关闭。
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div className="space-y-3">
|
<div className="space-y-3">
|
||||||
<div className="flex items-center justify-between gap-3">
|
<div className="flex items-center justify-between gap-3">
|
||||||
<label className="text-sm font-medium text-white flex items-center">
|
<label className="text-sm font-medium text-white flex items-center">
|
||||||
|
|||||||
@@ -87,6 +87,7 @@ const mapAssistant = (raw: AnyRecord): Assistant => ({
|
|||||||
speed: Number(readField(raw, ['speed'], 1)),
|
speed: Number(readField(raw, ['speed'], 1)),
|
||||||
hotwords: readField(raw, ['hotwords'], []),
|
hotwords: readField(raw, ['hotwords'], []),
|
||||||
tools: normalizeToolIdList(readField(raw, ['tools'], [])),
|
tools: normalizeToolIdList(readField(raw, ['tools'], [])),
|
||||||
|
asrInterimEnabled: Boolean(readField(raw, ['asrInterimEnabled', 'asr_interim_enabled'], false)),
|
||||||
botCannotBeInterrupted: Boolean(readField(raw, ['botCannotBeInterrupted', 'bot_cannot_be_interrupted'], false)),
|
botCannotBeInterrupted: Boolean(readField(raw, ['botCannotBeInterrupted', 'bot_cannot_be_interrupted'], false)),
|
||||||
interruptionSensitivity: Number(readField(raw, ['interruptionSensitivity', 'interruption_sensitivity'], 500)),
|
interruptionSensitivity: Number(readField(raw, ['interruptionSensitivity', 'interruption_sensitivity'], 500)),
|
||||||
configMode: readField(raw, ['configMode', 'config_mode'], 'platform') as 'platform' | 'dify' | 'fastgpt' | 'none',
|
configMode: readField(raw, ['configMode', 'config_mode'], 'platform') as 'platform' | 'dify' | 'fastgpt' | 'none',
|
||||||
@@ -284,6 +285,7 @@ export const createAssistant = async (data: Partial<Assistant>): Promise<Assista
|
|||||||
speed: data.speed ?? 1,
|
speed: data.speed ?? 1,
|
||||||
hotwords: data.hotwords || [],
|
hotwords: data.hotwords || [],
|
||||||
tools: normalizeToolIdList(data.tools || []),
|
tools: normalizeToolIdList(data.tools || []),
|
||||||
|
asrInterimEnabled: data.asrInterimEnabled ?? false,
|
||||||
botCannotBeInterrupted: data.botCannotBeInterrupted ?? false,
|
botCannotBeInterrupted: data.botCannotBeInterrupted ?? false,
|
||||||
interruptionSensitivity: data.interruptionSensitivity ?? 500,
|
interruptionSensitivity: data.interruptionSensitivity ?? 500,
|
||||||
configMode: data.configMode || 'platform',
|
configMode: data.configMode || 'platform',
|
||||||
@@ -316,6 +318,7 @@ export const updateAssistant = async (id: string, data: Partial<Assistant>): Pro
|
|||||||
speed: data.speed,
|
speed: data.speed,
|
||||||
hotwords: data.hotwords,
|
hotwords: data.hotwords,
|
||||||
tools: data.tools === undefined ? undefined : normalizeToolIdList(data.tools),
|
tools: data.tools === undefined ? undefined : normalizeToolIdList(data.tools),
|
||||||
|
asrInterimEnabled: data.asrInterimEnabled,
|
||||||
botCannotBeInterrupted: data.botCannotBeInterrupted,
|
botCannotBeInterrupted: data.botCannotBeInterrupted,
|
||||||
interruptionSensitivity: data.interruptionSensitivity,
|
interruptionSensitivity: data.interruptionSensitivity,
|
||||||
configMode: data.configMode,
|
configMode: data.configMode,
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ export interface Assistant {
|
|||||||
speed: number;
|
speed: number;
|
||||||
hotwords: string[];
|
hotwords: string[];
|
||||||
tools?: string[]; // IDs of enabled tools
|
tools?: string[]; // IDs of enabled tools
|
||||||
|
asrInterimEnabled?: boolean;
|
||||||
botCannotBeInterrupted?: boolean;
|
botCannotBeInterrupted?: boolean;
|
||||||
interruptionSensitivity?: number; // In ms
|
interruptionSensitivity?: number; // In ms
|
||||||
configMode?: 'platform' | 'dify' | 'fastgpt' | 'none';
|
configMode?: 'platform' | 'dify' | 'fastgpt' | 'none';
|
||||||
|
|||||||
Reference in New Issue
Block a user