From d41db6418ccb89ffe959aa0c4ead7f659dd76043 Mon Sep 17 00:00:00 2001
From: Xin Wang
Date: Thu, 12 Feb 2026 13:51:27 +0800
Subject: [PATCH] Add bot not interrupt and generated opener

---
 api/app/main.py                | 27 +++++++++++
 api/app/models.py              |  2 +
 api/app/routers/assistants.py  | 11 +++++
 api/app/schemas.py             |  4 ++
 api/tests/test_assistants.py   | 26 ++++++++++
 engine/core/duplex_pipeline.py | 89 ++++++++++++++++++++++++++++++++--
 web/pages/Assistants.tsx       | 60 ++++++++++++++++++++---
 web/services/backendApi.ts     |  6 +++
 web/types.ts                   |  2 +
 9 files changed, 215 insertions(+), 12 deletions(-)

diff --git a/api/app/main.py b/api/app/main.py
index a193ff9..36b2f18 100644
--- a/api/app/main.py
+++ b/api/app/main.py
@@ -2,15 +2,42 @@ from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from contextlib import asynccontextmanager
 import os
+from sqlalchemy import inspect, text
 
 from .db import Base, engine
 from .routers import assistants, voices, workflows, history, knowledge, llm, asr, tools
 
 
+def _ensure_assistant_columns() -> None:
+    """Add assistant flag columns missing from an older SQLite schema; no-op otherwise."""
+    inspector = inspect(engine)
+    if "assistants" not in inspector.get_table_names():
+        return
+
+    existing = {col["name"] for col in inspector.get_columns("assistants")}
+    # NOT NULL keeps nullability in line with the non-Optional Mapped[bool]
+    # model attributes; SQLite permits it on ADD COLUMN because a non-NULL
+    # default is supplied.
+    missing = [
+        name
+        for name in ("generated_opener_enabled", "bot_cannot_be_interrupted")
+        if name not in existing
+    ]
+    if not missing:
+        return
+
+    with engine.begin() as conn:
+        for name in missing:
+            conn.execute(
+                text(f"ALTER TABLE assistants ADD COLUMN {name} BOOLEAN NOT NULL DEFAULT 0")
+            )
+
+
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     # 启动时创建表
     Base.metadata.create_all(bind=engine)
+    _ensure_assistant_columns()
     yield
 
 
diff --git a/api/app/models.py b/api/app/models.py
index e32d6b1..d1669f7 100644
---
a/api/app/models.py +++ b/api/app/models.py @@ -113,6 +113,7 @@ class Assistant(Base): name: Mapped[str] = mapped_column(String(255), nullable=False) call_count: Mapped[int] = mapped_column(Integer, default=0) opener: Mapped[str] = mapped_column(Text, default="") + generated_opener_enabled: Mapped[bool] = mapped_column(default=False) prompt: Mapped[str] = mapped_column(Text, default="") knowledge_base_id: Mapped[Optional[str]] = mapped_column(String(64), nullable=True) language: Mapped[str] = mapped_column(String(16), default="zh") @@ -121,6 +122,7 @@ class Assistant(Base): speed: Mapped[float] = mapped_column(Float, default=1.0) hotwords: Mapped[dict] = mapped_column(JSON, default=list) tools: Mapped[dict] = mapped_column(JSON, default=list) + bot_cannot_be_interrupted: Mapped[bool] = mapped_column(default=False) interruption_sensitivity: Mapped[int] = mapped_column(Integer, default=500) config_mode: Mapped[str] = mapped_column(String(32), default="platform") api_url: Mapped[Optional[str]] = mapped_column(String(255), nullable=True) diff --git a/api/app/routers/assistants.py b/api/app/routers/assistants.py index 2368cee..6a8d02e 100644 --- a/api/app/routers/assistants.py +++ b/api/app/routers/assistants.py @@ -21,7 +21,12 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> dict: metadata = { "systemPrompt": assistant.prompt or "", "greeting": assistant.opener or "", + "generatedOpenerEnabled": bool(assistant.generated_opener_enabled), "output": {"mode": "audio" if assistant.voice_output_enabled else "text"}, + "bargeIn": { + "enabled": not bool(assistant.bot_cannot_be_interrupted), + "minDurationMs": int(assistant.interruption_sensitivity or 500), + }, "services": {}, } warnings = [] @@ -100,6 +105,7 @@ def assistant_to_dict(assistant: Assistant) -> dict: "name": assistant.name, "callCount": assistant.call_count, "opener": assistant.opener or "", + "generatedOpenerEnabled": bool(assistant.generated_opener_enabled), "prompt": assistant.prompt or 
"", "knowledgeBaseId": assistant.knowledge_base_id, "language": assistant.language, @@ -108,6 +114,7 @@ def assistant_to_dict(assistant: Assistant) -> dict: "speed": assistant.speed, "hotwords": assistant.hotwords or [], "tools": assistant.tools or [], + "botCannotBeInterrupted": bool(assistant.bot_cannot_be_interrupted), "interruptionSensitivity": assistant.interruption_sensitivity, "configMode": assistant.config_mode, "apiUrl": assistant.api_url, @@ -125,8 +132,10 @@ def _apply_assistant_update(assistant: Assistant, update_data: dict) -> None: field_map = { "knowledgeBaseId": "knowledge_base_id", "interruptionSensitivity": "interruption_sensitivity", + "botCannotBeInterrupted": "bot_cannot_be_interrupted", "configMode": "config_mode", "voiceOutputEnabled": "voice_output_enabled", + "generatedOpenerEnabled": "generated_opener_enabled", "apiUrl": "api_url", "apiKey": "api_key", "llmModelId": "llm_model_id", @@ -184,6 +193,7 @@ def create_assistant(data: AssistantCreate, db: Session = Depends(get_db)): user_id=1, # 默认用户,后续添加认证 name=data.name, opener=data.opener, + generated_opener_enabled=data.generatedOpenerEnabled, prompt=data.prompt, knowledge_base_id=data.knowledgeBaseId, language=data.language, @@ -192,6 +202,7 @@ def create_assistant(data: AssistantCreate, db: Session = Depends(get_db)): speed=data.speed, hotwords=data.hotwords, tools=data.tools, + bot_cannot_be_interrupted=data.botCannotBeInterrupted, interruption_sensitivity=data.interruptionSensitivity, config_mode=data.configMode, api_url=data.apiUrl, diff --git a/api/app/schemas.py b/api/app/schemas.py index 73b3c69..a974089 100644 --- a/api/app/schemas.py +++ b/api/app/schemas.py @@ -273,6 +273,7 @@ class ToolResourceOut(ToolResourceBase): class AssistantBase(BaseModel): name: str opener: str = "" + generatedOpenerEnabled: bool = False prompt: str = "" knowledgeBaseId: Optional[str] = None language: str = "zh" @@ -281,6 +282,7 @@ class AssistantBase(BaseModel): speed: float = 1.0 hotwords: List[str] = [] 
tools: List[str] = [] + botCannotBeInterrupted: bool = False interruptionSensitivity: int = 500 configMode: str = "platform" apiUrl: Optional[str] = None @@ -299,6 +301,7 @@ class AssistantCreate(AssistantBase): class AssistantUpdate(BaseModel): name: Optional[str] = None opener: Optional[str] = None + generatedOpenerEnabled: Optional[bool] = None prompt: Optional[str] = None knowledgeBaseId: Optional[str] = None language: Optional[str] = None @@ -307,6 +310,7 @@ class AssistantUpdate(BaseModel): speed: Optional[float] = None hotwords: Optional[List[str]] = None tools: Optional[List[str]] = None + botCannotBeInterrupted: Optional[bool] = None interruptionSensitivity: Optional[int] = None configMode: Optional[str] = None apiUrl: Optional[str] = None diff --git a/api/tests/test_assistants.py b/api/tests/test_assistants.py index a0140d1..7bec31d 100644 --- a/api/tests/test_assistants.py +++ b/api/tests/test_assistants.py @@ -24,6 +24,8 @@ class TestAssistantAPI: assert data["prompt"] == sample_assistant_data["prompt"] assert data["language"] == sample_assistant_data["language"] assert data["voiceOutputEnabled"] is True + assert data["generatedOpenerEnabled"] is False + assert data["botCannotBeInterrupted"] is False assert "id" in data assert data["callCount"] == 0 @@ -225,3 +227,27 @@ class TestAssistantAPI: metadata = runtime_resp.json()["sessionStartMetadata"] assert metadata["output"]["mode"] == "text" assert metadata["services"]["tts"]["enabled"] is False + + def test_assistant_interrupt_and_generated_opener_flags(self, client, sample_assistant_data): + sample_assistant_data.update({ + "generatedOpenerEnabled": True, + "botCannotBeInterrupted": True, + "interruptionSensitivity": 900, + }) + assistant_resp = client.post("/api/assistants", json=sample_assistant_data) + assert assistant_resp.status_code == 200 + assistant_id = assistant_resp.json()["id"] + + get_resp = client.get(f"/api/assistants/{assistant_id}") + assert get_resp.status_code == 200 + payload = 
get_resp.json() + assert payload["generatedOpenerEnabled"] is True + assert payload["botCannotBeInterrupted"] is True + assert payload["interruptionSensitivity"] == 900 + + runtime_resp = client.get(f"/api/assistants/{assistant_id}/runtime-config") + assert runtime_resp.status_code == 200 + metadata = runtime_resp.json()["sessionStartMetadata"] + assert metadata["generatedOpenerEnabled"] is True + assert metadata["bargeIn"]["enabled"] is False + assert metadata["bargeIn"]["minDurationMs"] == 900 diff --git a/engine/core/duplex_pipeline.py b/engine/core/duplex_pipeline.py index 5684401..2bff90a 100644 --- a/engine/core/duplex_pipeline.py +++ b/engine/core/duplex_pipeline.py @@ -268,6 +268,9 @@ class DuplexPipeline: self._runtime_output: Dict[str, Any] = {} self._runtime_system_prompt: Optional[str] = None self._runtime_greeting: Optional[str] = None + self._runtime_generated_opener_enabled: Optional[bool] = None + self._runtime_barge_in_enabled: Optional[bool] = None + self._runtime_barge_in_min_duration_ms: Optional[int] = None self._runtime_knowledge: Dict[str, Any] = {} self._runtime_knowledge_base_id: Optional[str] = None self._runtime_tools: List[Any] = [] @@ -301,8 +304,18 @@ class DuplexPipeline: if self._runtime_system_prompt: self.conversation.system_prompt = self._runtime_system_prompt if "greeting" in metadata: - self._runtime_greeting = str(metadata.get("greeting") or "") + greeting_payload = metadata.get("greeting") + if isinstance(greeting_payload, dict): + self._runtime_greeting = str(greeting_payload.get("text") or "") + generated_flag = self._coerce_bool(greeting_payload.get("generated")) + if generated_flag is not None: + self._runtime_generated_opener_enabled = generated_flag + else: + self._runtime_greeting = str(greeting_payload or "") self.conversation.greeting = self._runtime_greeting or None + generated_opener_flag = self._coerce_bool(metadata.get("generatedOpenerEnabled")) + if generated_opener_flag is not None: + 
self._runtime_generated_opener_enabled = generated_opener_flag
 
         services = metadata.get("services") or {}
         if isinstance(services, dict):
@@ -315,6 +328,17 @@ class DuplexPipeline:
         output = metadata.get("output") or {}
         if isinstance(output, dict):
             self._runtime_output = output
+        barge_in = metadata.get("bargeIn")
+        if isinstance(barge_in, dict):
+            barge_in_enabled = self._coerce_bool(barge_in.get("enabled"))
+            if barge_in_enabled is not None:
+                self._runtime_barge_in_enabled = barge_in_enabled
+            min_duration = barge_in.get("minDurationMs")
+            if isinstance(min_duration, (int, float, str)):
+                try:
+                    self._runtime_barge_in_min_duration_ms = max(0, int(min_duration))
+                except (TypeError, ValueError, OverflowError):  # OverflowError: int(float("inf"))
+                    self._runtime_barge_in_min_duration_ms = None
 
         knowledge_base_id = metadata.get("knowledgeBaseId")
         if knowledge_base_id is not None:
@@ -366,6 +390,50 @@ class DuplexPipeline:
 
         return True
 
+    def _generated_opener_enabled(self) -> bool:
+        """Return True only if the runtime metadata explicitly enabled a generated opener."""
+        return self._runtime_generated_opener_enabled is True
+
+    def _barge_in_enabled(self) -> bool:
+        """Return the runtime barge-in switch; barge-in stays enabled when it was never set."""
+        flag = self._runtime_barge_in_enabled
+        return True if flag is None else flag
+
+    def _resolved_barge_in_min_duration_ms(self) -> int:
+        """Return the runtime barge-in threshold, else self._barge_in_min_duration_ms."""
+        override = self._runtime_barge_in_min_duration_ms
+        return self._barge_in_min_duration_ms if override is None else override
+
+    async def _generate_runtime_greeting(self) -> Optional[str]:
+        """Ask the LLM for a one-sentence opener; None when unavailable, failed or empty."""
+        if not self.llm_service:
+            return None
+
+        prompt_hint = (self._runtime_greeting or "").strip()
+        system_prompt = (
+            "You generate one concise opener for a live voice call assistant. "
+            "Return plain text only, no quotes, no markdown, one sentence."
+        )
+        user_prompt = "Generate a friendly opening line (max 25 words)."
+        if prompt_hint:
+            # The configured greeting doubles as a style hint for the model.
+            user_prompt += f" Style hint: {prompt_hint}"
+
+        try:
+            messages = [
+                LLMMessage(role="system", content=system_prompt),
+                LLMMessage(role="user", content=user_prompt),
+            ]
+            generated = await self.llm_service.generate(messages, temperature=0.7, max_tokens=64)
+        except Exception as exc:
+            logger.warning(f"Failed to generate runtime greeting: {exc}")
+            return None
+
+        # The model may quote its answer despite the instructions; unwrap it and
+        # report a reply that is empty after unwrapping as None rather than "".
+        text = (generated or "").strip().strip("\"'").strip()
+        return text or None
+
     async def start(self) -> None:
         """Start the pipeline and connect services."""
         try:
@@ -464,8 +532,15 @@ class DuplexPipeline:
             self._outbound_task = asyncio.create_task(self._outbound_loop())
 
             # Speak greeting if configured
-            if self.conversation.greeting and tts_output_enabled:
-                await self._speak(self.conversation.greeting)
+            if tts_output_enabled:
+                greeting_to_speak = self.conversation.greeting
+                if self._generated_opener_enabled():
+                    generated_greeting = await self._generate_runtime_greeting()
+                    if generated_greeting:
+                        greeting_to_speak = generated_greeting
+                        self.conversation.greeting = generated_greeting
+                if greeting_to_speak:
+                    await self._speak(greeting_to_speak)
 
         except Exception as e:
             logger.error(f"Failed to start pipeline: {e}")
@@ -552,7 +627,7 @@ class DuplexPipeline:
 
             # 2.
Check for barge-in (user speaking while bot speaking) # Filter false interruptions by requiring minimum speech duration - if self._is_bot_speaking: + if self._is_bot_speaking and self._barge_in_enabled(): if vad_status == "Speech": # User is speaking while bot is speaking self._barge_in_silence_frames = 0 # Reset silence counter @@ -566,7 +641,7 @@ class DuplexPipeline: self._barge_in_speech_frames += 1 # Check if speech duration exceeds threshold speech_duration_ms = (time.time() - self._barge_in_speech_start_time) * 1000 - if speech_duration_ms >= self._barge_in_min_duration_ms: + if speech_duration_ms >= self._resolved_barge_in_min_duration_ms(): logger.info(f"Barge-in confirmed after {speech_duration_ms:.0f}ms of speech ({self._barge_in_speech_frames} frames)") await self._handle_barge_in() else: @@ -580,6 +655,10 @@ class DuplexPipeline: self._barge_in_speech_start_time = None self._barge_in_speech_frames = 0 self._barge_in_silence_frames = 0 + elif self._is_bot_speaking and not self._barge_in_enabled(): + self._barge_in_speech_start_time = None + self._barge_in_speech_frames = 0 + self._barge_in_silence_frames = 0 # 3. 
Buffer audio for ASR if vad_status == "Speech" or self.conversation.state == ConversationState.LISTENING: diff --git a/web/pages/Assistants.tsx b/web/pages/Assistants.tsx index 3b3eb3b..24b2c44 100644 --- a/web/pages/Assistants.tsx +++ b/web/pages/Assistants.tsx @@ -118,6 +118,7 @@ export const AssistantsPage: React.FC = () => { const newAssistantPayload: Partial = { name: 'New Assistant', opener: '', + generatedOpenerEnabled: false, prompt: '', knowledgeBaseId: '', language: 'zh', @@ -126,6 +127,7 @@ export const AssistantsPage: React.FC = () => { speed: 1, hotwords: [], tools: [], + botCannotBeInterrupted: false, interruptionSensitivity: 500, configMode: 'platform', }; @@ -244,6 +246,7 @@ export const AssistantsPage: React.FC = () => { const isExternalConfig = selectedAssistant?.configMode === 'dify' || selectedAssistant?.configMode === 'fastgpt'; const isNoneConfig = selectedAssistant?.configMode === 'none' || !selectedAssistant?.configMode; + const canAdjustInterruptionSensitivity = selectedAssistant?.botCannotBeInterrupted !== true; return (
@@ -524,11 +527,30 @@ export const AssistantsPage: React.FC = () => { value={selectedAssistant.opener} onChange={(e) => updateAssistant('opener', e.target.value)} placeholder="例如:您好,我是您的专属AI助手..." - className="bg-white/5 border-white/10 focus:border-primary/50" + disabled={selectedAssistant.generatedOpenerEnabled === true} + className="bg-white/5 border-white/10 focus:border-primary/50 disabled:opacity-50 disabled:cursor-not-allowed" />

接通通话后的第一句话。

+
+ + +

+ 开启后,系统将自动生成开场白,手动开场白输入框会暂时禁用。 +

+
+
-
+
+ +
+ +
+
@@ -689,7 +726,8 @@ export const AssistantsPage: React.FC = () => { type="number" value={selectedAssistant.interruptionSensitivity || 500} onChange={(e) => updateAssistant('interruptionSensitivity', parseInt(e.target.value) || 0)} - className="w-20 h-8 text-right pr-7 text-xs font-mono bg-black/40 border-white/5" + disabled={!canAdjustInterruptionSensitivity} + className="w-20 h-8 text-right pr-7 text-xs font-mono bg-black/40 border-white/5 disabled:opacity-40 disabled:cursor-not-allowed" /> ms
@@ -703,16 +741,19 @@ export const AssistantsPage: React.FC = () => { step="50" value={selectedAssistant.interruptionSensitivity || 500} onChange={(e) => updateAssistant('interruptionSensitivity', parseInt(e.target.value))} - className="flex-1 h-1.5 bg-secondary rounded-lg appearance-none cursor-pointer accent-primary" + disabled={!canAdjustInterruptionSensitivity} + className="flex-1 h-1.5 bg-secondary rounded-lg appearance-none cursor-pointer accent-primary disabled:opacity-40 disabled:cursor-not-allowed" />
-
+
0ms (Extreme) 1000ms 2000ms (Lazy)
-

- * 定义用户说话多长时间后 AI 应当停止当前的发言并响应。数值越小响应越快,但也更容易被噪音误导打断。 +

+ {canAdjustInterruptionSensitivity + ? '* 定义用户说话多长时间后 AI 应当停止当前的发言并响应。数值越小响应越快,但也更容易被噪音误导打断。' + : '* 当前已开启“机器人不可打断”,VAD 打断灵敏度已禁用。'}

@@ -1760,6 +1801,11 @@ export const DebugDrawer: React.FC<{ }, systemPrompt: assistant.prompt || '', greeting: assistant.opener || '', + generatedOpenerEnabled: assistant.generatedOpenerEnabled === true, + bargeIn: { + enabled: assistant.botCannotBeInterrupted !== true, + minDurationMs: assistant.interruptionSensitivity || 500, + }, knowledgeBaseId, knowledge, tools: selectedToolSchemas, diff --git a/web/services/backendApi.ts b/web/services/backendApi.ts index c8b4e49..8028473 100644 --- a/web/services/backendApi.ts +++ b/web/services/backendApi.ts @@ -30,6 +30,7 @@ const mapAssistant = (raw: AnyRecord): Assistant => ({ name: readField(raw, ['name'], ''), callCount: Number(readField(raw, ['callCount', 'call_count'], 0)), opener: readField(raw, ['opener'], ''), + generatedOpenerEnabled: Boolean(readField(raw, ['generatedOpenerEnabled', 'generated_opener_enabled'], false)), prompt: readField(raw, ['prompt'], ''), knowledgeBaseId: readField(raw, ['knowledgeBaseId', 'knowledge_base_id'], ''), language: readField(raw, ['language'], 'zh') as 'zh' | 'en', @@ -38,6 +39,7 @@ const mapAssistant = (raw: AnyRecord): Assistant => ({ speed: Number(readField(raw, ['speed'], 1)), hotwords: readField(raw, ['hotwords'], []), tools: readField(raw, ['tools'], []), + botCannotBeInterrupted: Boolean(readField(raw, ['botCannotBeInterrupted', 'bot_cannot_be_interrupted'], false)), interruptionSensitivity: Number(readField(raw, ['interruptionSensitivity', 'interruption_sensitivity'], 500)), configMode: readField(raw, ['configMode', 'config_mode'], 'platform') as 'platform' | 'dify' | 'fastgpt' | 'none', apiUrl: readField(raw, ['apiUrl', 'api_url'], ''), @@ -212,6 +214,7 @@ export const createAssistant = async (data: Partial): Promise): Promise): Pro const payload = { name: data.name, opener: data.opener, + generatedOpenerEnabled: data.generatedOpenerEnabled, prompt: data.prompt, knowledgeBaseId: data.knowledgeBaseId, language: data.language, @@ -245,6 +250,7 @@ export const 
updateAssistant = async (id: string, data: Partial): Pro speed: data.speed, hotwords: data.hotwords, tools: data.tools, + botCannotBeInterrupted: data.botCannotBeInterrupted, interruptionSensitivity: data.interruptionSensitivity, configMode: data.configMode, apiUrl: data.apiUrl, diff --git a/web/types.ts b/web/types.ts index 5bbf011..34cf2df 100644 --- a/web/types.ts +++ b/web/types.ts @@ -4,6 +4,7 @@ export interface Assistant { name: string; callCount: number; opener: string; + generatedOpenerEnabled?: boolean; prompt: string; knowledgeBaseId: string; language: 'zh' | 'en'; @@ -12,6 +13,7 @@ export interface Assistant { speed: number; hotwords: string[]; tools?: string[]; // IDs of enabled tools + botCannotBeInterrupted?: boolean; interruptionSensitivity?: number; // In ms configMode?: 'platform' | 'dify' | 'fastgpt' | 'none'; apiUrl?: string;