diff --git a/api/app/routers/tools.py b/api/app/routers/tools.py index 5c9efa3..00c9bf3 100644 --- a/api/app/routers/tools.py +++ b/api/app/routers/tools.py @@ -109,6 +109,67 @@ TOOL_REGISTRY = { "required": ["msg"] } }, + "voice_choice_prompt": { + "name": "语音选项提示", + "description": "播报问题并展示可选项,等待用户选择后回传结果", + "parameters": { + "type": "object", + "properties": { + "question": {"type": "string", "description": "向用户展示的问题文本"}, + "options": { + "type": "array", + "description": "可选项(字符串或含 id/label/value 的对象)", + "minItems": 2, + "items": { + "anyOf": [ + {"type": "string"}, + { + "type": "object", + "properties": { + "id": {"type": "string"}, + "label": {"type": "string"}, + "value": {"type": "string"} + }, + "required": ["label"] + } + ] + } + }, + "voice_text": {"type": "string", "description": "可选,单独指定播报文本;为空则播报 question"} + }, + "required": ["question", "options"] + } + }, + "text_choice_prompt": { + "name": "文本选项提示", + "description": "显示文本选项弹窗并等待用户选择后回传结果", + "parameters": { + "type": "object", + "properties": { + "question": {"type": "string", "description": "向用户展示的问题文本"}, + "options": { + "type": "array", + "description": "可选项(字符串或含 id/label/value 的对象)", + "minItems": 2, + "items": { + "anyOf": [ + {"type": "string"}, + { + "type": "object", + "properties": { + "id": {"type": "string"}, + "label": {"type": "string"}, + "value": {"type": "string"} + }, + "required": ["label"] + } + ] + } + } + }, + "required": ["question", "options"] + } + }, } TOOL_CATEGORY_MAP = { @@ -121,6 +182,8 @@ TOOL_CATEGORY_MAP = { "decrease_volume": "system", "voice_message_prompt": "system", "text_msg_prompt": "system", + "voice_choice_prompt": "system", + "text_choice_prompt": "system", } TOOL_ICON_MAP = { @@ -133,6 +196,8 @@ TOOL_ICON_MAP = { "decrease_volume": "Volume2", "voice_message_prompt": "Volume2", "text_msg_prompt": "Terminal", + "voice_choice_prompt": "Volume2", + "text_choice_prompt": "Terminal", } TOOL_HTTP_DEFAULTS = { @@ -145,6 +210,8 @@ TOOL_PARAMETER_DEFAULTS = { TOOL_WAIT_FOR_RESPONSE_DEFAULTS = { "text_msg_prompt": True, + "voice_choice_prompt": True, + "text_choice_prompt": True, } diff --git a/engine/core/duplex_pipeline.py b/engine/core/duplex_pipeline.py index 60bdc45..037274a 100644 --- a/engine/core/duplex_pipeline.py +++ b/engine/core/duplex_pipeline.py @@ -168,6 +168,70 @@ class DuplexPipeline: "required": ["msg"], }, }, + "voice_choice_prompt": { + "name": "voice_choice_prompt", + "description": "Speak a question and show options on client side, then wait for selection", + "parameters": { + "type": "object", + "properties": { + "question": {"type": "string", "description": "Question text to show"}, + "options": { + "type": "array", + "description": "Selectable options (string or object with id/label/value)", + "minItems": 2, + "items": { + "anyOf": [ + {"type": "string"}, + { + "type": "object", + "properties": { + "id": {"type": "string"}, + "label": {"type": "string"}, + "value": {"type": "string"}, + }, + "required": ["label"], + }, + ] + }, + }, + "voice_text": { + "type": "string", + "description": "Optional voice text. Falls back to question when omitted.", + }, + }, + "required": ["question", "options"], + }, + }, + "text_choice_prompt": { + "name": "text_choice_prompt", + "description": "Show a text-only choice prompt on client side and wait for selection", + "parameters": { + "type": "object", + "properties": { + "question": {"type": "string", "description": "Question text to show"}, + "options": { + "type": "array", + "description": "Selectable options (string or object with id/label/value)", + "minItems": 2, + "items": { + "anyOf": [ + {"type": "string"}, + { + "type": "object", + "properties": { + "id": {"type": "string"}, + "label": {"type": "string"}, + "value": {"type": "string"}, + }, + "required": ["label"], + }, + ] + }, + }, + }, + "required": ["question", "options"], + }, + }, } _DEFAULT_CLIENT_EXECUTORS = frozenset({ "turn_on_camera", @@ -176,6 +240,8 @@ class DuplexPipeline: "decrease_volume", "voice_message_prompt", "text_msg_prompt", + "voice_choice_prompt", + "text_choice_prompt", }) def __init__( diff --git a/web/pages/Assistants.tsx b/web/pages/Assistants.tsx index ffa4c41..d83e7eb 100644 --- a/web/pages/Assistants.tsx +++ b/web/pages/Assistants.tsx @@ -4,7 +4,7 @@ import { createPortal } from 'react-dom'; import { Plus, Search, Play, Square, Copy, Trash2, Mic, MessageSquare, Save, Video, PhoneOff, Camera, ArrowLeftRight, Send, Phone, Rocket, AlertTriangle, PhoneCall, CameraOff, Image, Images, CloudSun, Calendar, TrendingUp, Coins, Wrench, Globe, Terminal, X, ClipboardCheck, Sparkles, Volume2, Timer, ChevronDown, Database, Server, Zap, ExternalLink, Key, BrainCircuit, Ear, Book, Filter } from 'lucide-react'; import { Button, Input, Badge, Drawer, Dialog, Switch } from '../components/UI'; import { ASRModel, Assistant, KnowledgeBase, LLMModel, TabValue, Tool, Voice } from '../types'; -import { createAssistant, deleteAssistant, fetchASRModels, fetchAssistantOpenerAudioPcmBuffer, fetchAssistants, fetchKnowledgeBases, fetchLLMModels, fetchTools, fetchVoices, generateAssistantOpenerAudio, updateAssistant as updateAssistantApi } from '../services/backendApi'; +import { createAssistant, deleteAssistant, fetchASRModels, fetchAssistantOpenerAudioPcmBuffer, fetchAssistants, fetchKnowledgeBases, fetchLLMModels, fetchTools, fetchVoices, generateAssistantOpenerAudio, previewVoice, updateAssistant as updateAssistantApi } from '../services/backendApi'; const isOpenAICompatibleVendor = (vendor?: string) => { const normalized = String(vendor || '').trim().toLowerCase(); @@ -1696,7 +1696,34 @@ const TOOL_PARAMETER_HINTS: Record = { }, required: ['msg'], }, - choice_prompt: { + voice_choice_prompt: { + type: 'object', + properties: { + question: { type: 'string', description: 'Question text to ask the user' }, + options: { + type: 'array', + description: 'Selectable options (string or object with id/label/value)', + minItems: 2, + items: { + anyOf: [ + { type: 'string' }, + { + type: 'object', + properties: { + id: { type: 'string' }, + label: { type: 'string' }, + value: { type: 'string' }, + }, + required: ['label'], + }, + ], + }, + }, + voice_text: { type: 'string', description: 'Optional custom voice text, defaults to question' }, + }, + required: ['question', 'options'], + }, + text_choice_prompt: { type: 'object', properties: { question: { type: 'string', description: 'Question text to ask the user' }, @@ -1741,12 +1768,14 @@ const DEBUG_CLIENT_TOOLS = [ { id: 'decrease_volume', name: 'decrease_volume', description: '调低音量' }, { id: 'voice_message_prompt', name: 'voice_message_prompt', description: '语音消息提示' }, { id: 'text_msg_prompt', name: 'text_msg_prompt', description: '文本消息提示' }, - { id: 'choice_prompt', name: 'choice_prompt', description: '选项问题提示' }, + { id: 'voice_choice_prompt', name: 'voice_choice_prompt', description: '语音选项提示(原子)' }, + { id: 'text_choice_prompt', name: 'text_choice_prompt', description: '文本选项提示(等待选择)' }, ] as const; const DEBUG_CLIENT_TOOL_ID_SET = new Set(DEBUG_CLIENT_TOOLS.map((item) => item.id)); const DEBUG_CLIENT_TOOL_WAIT_DEFAULTS: Record = { text_msg_prompt: true, - choice_prompt: true, + voice_choice_prompt: true, + text_choice_prompt: true, }; type DynamicVariableEntry = { @@ -1936,6 +1965,25 @@ type DebugChoicePromptOption = { value: string; }; +type DebugTextPromptDialogState = { + open: boolean; + message: string; + pendingResult?: DebugPromptPendingResult; +}; + +type DebugChoicePromptDialogState = { + open: boolean; + question: string; + options: DebugChoicePromptOption[]; + pendingResult?: DebugPromptPendingResult; + requireSelection?: boolean; + voiceText?: string; +}; + +type DebugPromptQueueItem = + | { kind: 'text'; payload: Omit } + | { kind: 'choice'; payload: Omit }; + const normalizeChoicePromptOptions = (rawOptions: unknown[]): DebugChoicePromptOption[] => { const usedIds = new Set(); const resolved: DebugChoicePromptOption[] = []; @@ -2055,19 +2103,12 @@ export const DebugDrawer: React.FC<{ const [inputText, setInputText] = useState(''); const [isLoading, setIsLoading] = useState(false); const [callStatus, setCallStatus] = useState<'idle' | 'calling' | 'active'>('idle'); - const [textPromptDialog, setTextPromptDialog] = useState<{ - open: boolean; - message: string; - pendingResult?: DebugPromptPendingResult; - }>({ open: false, message: '' }); - const [choicePromptDialog, setChoicePromptDialog] = useState<{ - open: boolean; - question: string; - options: DebugChoicePromptOption[]; - pendingResult?: DebugPromptPendingResult; - }>({ open: false, question: '', options: [] }); + const [textPromptDialog, setTextPromptDialog] = useState({ open: false, message: '' }); + const [choicePromptDialog, setChoicePromptDialog] = useState({ open: false, question: '', options: [] }); const textPromptDialogRef = useRef(textPromptDialog); const choicePromptDialogRef = useRef(choicePromptDialog); + const promptDialogQueueRef = useRef([]); + const promptAudioRef = useRef(null); const [textSessionStarted, setTextSessionStarted] = useState(false); const [wsStatus, setWsStatus] = useState<'disconnected' | 'connecting' | 'ready' | 'error'>('disconnected'); const [wsError, setWsError] = useState(''); @@ -2245,9 +2286,17 @@ export const DebugDrawer: React.FC<{ } } else { setMode('text'); + if (textPromptDialogRef.current.open) { + closeTextPromptDialog('dismiss', { force: true, skipQueueAdvance: true }); + } + if (choicePromptDialogRef.current.open) { + closeChoicePromptDialog('dismiss', undefined, { force: true, skipQueueAdvance: true }); + } stopVoiceCapture(); stopMedia(); closeWs(); + stopPromptVoicePlayback(); + promptDialogQueueRef.current = []; setTextPromptDialog({ open: false, message: '' }); setChoicePromptDialog({ open: false, question: '', options: [] }); if (audioCtxRef.current) { @@ -2514,8 +2563,102 @@ export const DebugDrawer: React.FC<{ ]); }; - const closeTextPromptDialog = (action: 'confirm' | 'dismiss') => { + const stopPromptVoicePlayback = () => { + if (promptAudioRef.current) { + try { + promptAudioRef.current.pause(); + } catch { + // no-op + } + promptAudioRef.current = null; + } + if (typeof window !== 'undefined' && 'speechSynthesis' in window) { + window.speechSynthesis.cancel(); + } + }; + + const playPromptVoice = async (text: string) => { + const phrase = String(text || '').trim(); + if (!phrase) return; + stopPromptVoicePlayback(); + + const canUseAssistantTts = assistant.voiceOutputEnabled !== false && Boolean(assistant.voice); + if (canUseAssistantTts) { + const selectedVoice = voices.find((item) => item.id === assistant.voice); + if (selectedVoice) { + try { + const audioUrl = await previewVoice(selectedVoice.id, phrase, assistant.speed); + const audio = new Audio(audioUrl); + promptAudioRef.current = audio; + audio.onended = () => { + if (promptAudioRef.current === audio) { + promptAudioRef.current = null; + } + }; + audio.onerror = () => { + if (promptAudioRef.current === audio) { + promptAudioRef.current = null; + } + }; + await audio.play(); + return; + } catch (err) { + console.warn('Assistant TTS preview failed, falling back to speechSynthesis', err); + } + } + } + + if (typeof window !== 'undefined' && 'speechSynthesis' in window) { + const utterance = new SpeechSynthesisUtterance(phrase); + utterance.lang = assistant.language === 'en' ? 'en-US' : 'zh-CN'; + window.speechSynthesis.cancel(); + window.speechSynthesis.speak(utterance); + } + }; + + const hasActivePromptDialog = () => textPromptDialogRef.current.open || choicePromptDialogRef.current.open; + + const activatePromptDialog = (item: DebugPromptQueueItem) => { + if (item.kind === 'text') { + setTextPromptDialog({ + open: true, + message: item.payload.message, + pendingResult: item.payload.pendingResult, + }); + return; + } + const nextVoiceText = String(item.payload.voiceText || '').trim(); + setChoicePromptDialog({ + open: true, + question: item.payload.question, + options: item.payload.options, + pendingResult: item.payload.pendingResult, + requireSelection: item.payload.requireSelection === true, + voiceText: nextVoiceText || undefined, + }); + if (nextVoiceText) { + void playPromptVoice(nextVoiceText); + } + }; + + const enqueuePromptDialog = (item: DebugPromptQueueItem) => { + if (hasActivePromptDialog()) { + promptDialogQueueRef.current.push(item); + return; + } + activatePromptDialog(item); + }; + + const openNextPromptDialog = (force = false) => { + if (!force && hasActivePromptDialog()) return; + const next = promptDialogQueueRef.current.shift(); + if (!next) return; + activatePromptDialog(next); + }; + + const closeTextPromptDialog = (action: 'confirm' | 'dismiss', opts?: { force?: boolean; skipQueueAdvance?: boolean }) => { const snapshot = textPromptDialogRef.current; + if (!snapshot.open && !opts?.force) return; const pending = snapshot?.pendingResult; const message = snapshot?.message || ''; setTextPromptDialog({ open: false, message: '' }); @@ -2534,16 +2677,25 @@ export const DebugDrawer: React.FC<{ pending.toolDisplayName ); } + if (!opts?.skipQueueAdvance) { + openNextPromptDialog(true); + } }; const closeChoicePromptDialog = ( action: 'select' | 'dismiss', - selectedOption?: DebugChoicePromptOption + selectedOption?: DebugChoicePromptOption, + opts?: { force?: boolean; skipQueueAdvance?: boolean } ) => { const snapshot = choicePromptDialogRef.current; + if (!snapshot.open && !opts?.force) return; + if (snapshot.requireSelection && action !== 'select' && !opts?.force) { + return; + } const pending = snapshot?.pendingResult; const question = snapshot?.question || ''; const options = snapshot?.options || []; + stopPromptVoicePlayback(); setChoicePromptDialog({ open: false, question: '', options: [] }); if (pending?.waitForResponse) { emitClientToolResult( @@ -2568,6 +2720,9 @@ export const DebugDrawer: React.FC<{ pending.toolDisplayName ); } + if (!opts?.skipQueueAdvance) { + openNextPromptDialog(true); + } }; const scheduleQueuedPlayback = (ctx: AudioContext) => { @@ -2699,11 +2854,13 @@ export const DebugDrawer: React.FC<{ const handleHangup = () => { if (textPromptDialog.open) { - closeTextPromptDialog('dismiss'); + closeTextPromptDialog('dismiss', { force: true, skipQueueAdvance: true }); } if (choicePromptDialog.open) { - closeChoicePromptDialog('dismiss'); + closeChoicePromptDialog('dismiss', undefined, { force: true, skipQueueAdvance: true }); } + stopPromptVoicePlayback(); + promptDialogQueueRef.current = []; stopVoiceCapture(); stopMedia(); closeWs(); @@ -3059,6 +3216,10 @@ export const DebugDrawer: React.FC<{ userDraftIndexRef.current = null; lastUserFinalRef.current = ''; micFrameBufferRef.current = new Uint8Array(0); + stopPromptVoicePlayback(); + promptDialogQueueRef.current = []; + setTextPromptDialog({ open: false, message: '' }); + setChoicePromptDialog({ open: false, question: '', options: [] }); setTextSessionStarted(false); stopPlaybackImmediately(); if (isOpen) setWsStatus('disconnected'); @@ -3220,9 +3381,11 @@ export const DebugDrawer: React.FC<{ parsedArgs = {}; } } - const waitForResponse = Boolean( + const waitForResponseRaw = Boolean( payload?.wait_for_response ?? toolCall?.wait_for_response ?? toolCall?.waitForResponse ?? false ); + const waitForResponse = + toolName === 'voice_choice_prompt' || toolName === 'text_choice_prompt' ? true : waitForResponseRaw; const resultPayload: any = { tool_call_id: toolCallId, name: toolName, @@ -3316,46 +3479,14 @@ export const DebugDrawer: React.FC<{ if (!msg) { resultPayload.output = { message: "Missing required argument 'msg'" }; resultPayload.status = { code: 422, message: 'invalid_arguments' }; - } else if (typeof window !== 'undefined' && 'speechSynthesis' in window) { - const utterance = new SpeechSynthesisUtterance(msg); - utterance.lang = 'zh-CN'; - window.speechSynthesis.cancel(); + } else { + void playPromptVoice(msg); if (waitForResponse) { - utterance.onend = () => { - emitClientToolResult( - { - tool_call_id: toolCallId, - name: toolName, - output: { message: 'voice_prompt_completed', msg }, - status: { code: 200, message: 'ok' }, - }, - toolDisplayName - ); - }; - utterance.onerror = (event) => { - emitClientToolResult( - { - tool_call_id: toolCallId, - name: toolName, - output: { - message: 'voice_prompt_failed', - msg, - error: String(event.error || 'speech_error'), - }, - status: { code: 500, message: 'client_tool_failed' }, - }, - toolDisplayName - ); - }; - window.speechSynthesis.speak(utterance); - return; + // Voice prompt playback is fire-and-forget; keep previous wait behavior stable. + // Client ack is returned immediately after dispatch. } - window.speechSynthesis.speak(utterance); resultPayload.output = { message: 'voice_prompt_sent', msg }; resultPayload.status = { code: 200, message: 'ok' }; - } else { - resultPayload.output = { message: 'speech_synthesis_unavailable', msg }; - resultPayload.status = { code: 503, message: 'speech_output_unavailable' }; } } else if (toolName === 'text_msg_prompt') { const msg = String(parsedArgs?.msg || '').trim(); @@ -3363,15 +3494,16 @@ export const DebugDrawer: React.FC<{ resultPayload.output = { message: "Missing required argument 'msg'" }; resultPayload.status = { code: 422, message: 'invalid_arguments' }; } else { - setChoicePromptDialog({ open: false, question: '', options: [] }); - setTextPromptDialog({ - open: true, - message: msg, - pendingResult: { - toolCallId: toolCallId, - toolName, - toolDisplayName, - waitForResponse, + enqueuePromptDialog({ + kind: 'text', + payload: { + message: msg, + pendingResult: { + toolCallId: toolCallId, + toolName, + toolDisplayName, + waitForResponse, + }, }, }); if (!waitForResponse) { @@ -3381,10 +3513,15 @@ export const DebugDrawer: React.FC<{ return; } } - } else if (toolName === 'choice_prompt') { + } else if (toolName === 'text_choice_prompt' || toolName === 'voice_choice_prompt') { const question = String(parsedArgs?.question || '').trim(); const rawOptions = Array.isArray(parsedArgs?.options) ? parsedArgs.options : []; const options = normalizeChoicePromptOptions(rawOptions); + const isVoiceChoicePrompt = toolName === 'voice_choice_prompt'; + const voiceText = isVoiceChoicePrompt + ? String(parsedArgs?.voice_text || parsedArgs?.voiceText || parsedArgs?.msg || question || '').trim() + : ''; + const requireSelection = toolName === 'voice_choice_prompt' || toolName === 'text_choice_prompt'; if (!question) { resultPayload.output = { message: "Missing required argument 'question'" }; resultPayload.status = { code: 422, message: 'invalid_arguments' }; @@ -3392,21 +3529,24 @@ export const DebugDrawer: React.FC<{ resultPayload.output = { message: "Argument 'options' requires at least 2 valid entries" }; resultPayload.status = { code: 422, message: 'invalid_arguments' }; } else { - setTextPromptDialog({ open: false, message: '' }); - setChoicePromptDialog({ - open: true, - question, - options, - pendingResult: { - toolCallId: toolCallId, - toolName, - toolDisplayName, - waitForResponse, + enqueuePromptDialog({ + kind: 'choice', + payload: { + question, + options, + pendingResult: { + toolCallId: toolCallId, + toolName, + toolDisplayName, + waitForResponse, + }, + requireSelection, + voiceText, }, }); - if (!waitForResponse) { + if (!waitForResponse && !requireSelection) { resultPayload.output = { - message: 'choice_prompt_shown', + message: `${toolName}_shown`, question, options, }; @@ -4149,17 +4289,26 @@ export const DebugDrawer: React.FC<{ {choicePromptDialog.open && (
- + {!choicePromptDialog.requireSelection && ( + + )}
-
选项问题提示
+
+ {choicePromptDialog.requireSelection + ? (choicePromptDialog.voiceText ? '语音选项提示' : '文本选项提示') + : '选项问题提示'} +

{choicePromptDialog.question}

+ {choicePromptDialog.requireSelection && ( +

请点击一个选项继续。

+ )}
{choicePromptDialog.options.map((option) => ( @@ -4173,11 +4322,13 @@ export const DebugDrawer: React.FC<{ ))}
-
- -
+ {!choicePromptDialog.requireSelection && ( +
+ +
+ )}
)}