diff --git a/web/pages/Assistants.tsx b/web/pages/Assistants.tsx index b1dea9e..9c9be45 100644 --- a/web/pages/Assistants.tsx +++ b/web/pages/Assistants.tsx @@ -1065,6 +1065,7 @@ export const DebugDrawer: React.FC<{ const assistantDraftIndexRef = useRef(null); const audioCtxRef = useRef(null); const playbackTimeRef = useRef(0); + const activeAudioSourcesRef = useRef<Set<AudioBufferSourceNode>>(new Set()); const [devices, setDevices] = useState([]); const [selectedCamera, setSelectedCamera] = useState(''); @@ -1151,6 +1152,19 @@ export const DebugDrawer: React.FC<{ playbackTimeRef.current = ctx.currentTime; }; + const stopPlaybackImmediately = () => { + activeAudioSourcesRef.current.forEach((source) => { + try { + source.stop(); + } catch { + // no-op + } + source.disconnect(); + }); + activeAudioSourcesRef.current.clear(); + clearPlaybackQueue(); + }; + const playPcm16Chunk = async (pcmBuffer: ArrayBuffer) => { if (!textTtsEnabled) return; if (mode !== 'text') return; @@ -1170,6 +1184,15 @@ export const DebugDrawer: React.FC<{ const source = ctx.createBufferSource(); source.buffer = audioBuffer; source.connect(ctx.destination); + activeAudioSourcesRef.current.add(source); + source.onended = () => { + activeAudioSourcesRef.current.delete(source); + try { + source.disconnect(); + } catch { + // no-op + } + }; const startAt = Math.max(ctx.currentTime + 0.02, playbackTimeRef.current); source.start(startAt); @@ -1228,6 +1251,11 @@ export const DebugDrawer: React.FC<{ if (mode === 'text') { if (textTtsEnabled) await ensureAudioContext(); await ensureWsSession(); + // Interrupt any in-flight response/audio before sending new user utterance.
+ if (wsRef.current?.readyState === WebSocket.OPEN) { + wsRef.current.send(JSON.stringify({ type: 'response.cancel', graceful: false })); + } + stopPlaybackImmediately(); wsRef.current?.send(JSON.stringify({ type: 'input.text', text: userMsg })); } else { setTimeout(() => { @@ -1391,7 +1419,7 @@ export const DebugDrawer: React.FC<{ pendingRejectRef.current = null; assistantDraftIndexRef.current = null; setTextSessionStarted(false); - clearPlaybackQueue(); + stopPlaybackImmediately(); if (isOpen) setWsStatus('disconnected'); }; @@ -1456,7 +1484,15 @@ export const DebugDrawer: React.FC<{ return; } if (type === 'output.audio.start') { - clearPlaybackQueue(); + // New utterance audio starts: cancel old queued/playing audio to avoid overlap. + stopPlaybackImmediately(); + return; + } + + if (type === 'response.interrupted') { + assistantDraftIndexRef.current = null; + setIsLoading(false); + stopPlaybackImmediately(); return; } @@ -1541,7 +1577,7 @@ export const DebugDrawer: React.FC<{ ws.onclose = () => { wsReadyRef.current = false; setTextSessionStarted(false); - clearPlaybackQueue(); + stopPlaybackImmediately(); if (wsStatus !== 'error') setWsStatus('disconnected'); }; }); @@ -1549,7 +1585,7 @@ export const DebugDrawer: React.FC<{ useEffect(() => { if (!textTtsEnabled) { - clearPlaybackQueue(); + stopPlaybackImmediately(); } }, [textTtsEnabled]); @@ -1660,13 +1696,10 @@ export const DebugDrawer: React.FC<{
{mode === 'text' ? ( textSessionStarted ? ( -
-
+
+
-
) : wsStatus === 'connecting' ? (
@@ -1769,6 +1802,11 @@ export const DebugDrawer: React.FC<{
+ {mode === 'text' && textSessionStarted && ( + + )} setInputText(e.target.value)}