Update text mode debug

This commit is contained in:
Xin Wang
2026-02-09 11:25:25 +08:00
parent f4aa432f0e
commit 17bd4a78f4

View File

@@ -1065,6 +1065,7 @@ export const DebugDrawer: React.FC<{
const assistantDraftIndexRef = useRef<number | null>(null); const assistantDraftIndexRef = useRef<number | null>(null);
const audioCtxRef = useRef<AudioContext | null>(null); const audioCtxRef = useRef<AudioContext | null>(null);
const playbackTimeRef = useRef<number>(0); const playbackTimeRef = useRef<number>(0);
const activeAudioSourcesRef = useRef<Set<AudioBufferSourceNode>>(new Set());
const [devices, setDevices] = useState<MediaDeviceInfo[]>([]); const [devices, setDevices] = useState<MediaDeviceInfo[]>([]);
const [selectedCamera, setSelectedCamera] = useState<string>(''); const [selectedCamera, setSelectedCamera] = useState<string>('');
@@ -1151,6 +1152,19 @@ export const DebugDrawer: React.FC<{
playbackTimeRef.current = ctx.currentTime; playbackTimeRef.current = ctx.currentTime;
}; };
/**
 * Stops every audio source currently tracked in `activeAudioSourcesRef`,
 * detaches each one from the audio graph, empties the tracking set, and then
 * calls `clearPlaybackQueue()`.
 *
 * Cleanup is best-effort per source: a failure on one node is ignored so the
 * remaining nodes are still stopped and the set/queue are always reset.
 */
const stopPlaybackImmediately = () => {
  activeAudioSourcesRef.current.forEach((source) => {
    try {
      source.stop();
    } catch {
      // Best-effort: a node that cannot be stopped is simply skipped.
    }
    try {
      // Guarded the same way as the `onended` cleanup so that one failing
      // node cannot abort the loop and leave other sources playing.
      source.disconnect();
    } catch {
      // no-op
    }
  });
  activeAudioSourcesRef.current.clear();
  clearPlaybackQueue();
};
const playPcm16Chunk = async (pcmBuffer: ArrayBuffer) => { const playPcm16Chunk = async (pcmBuffer: ArrayBuffer) => {
if (!textTtsEnabled) return; if (!textTtsEnabled) return;
if (mode !== 'text') return; if (mode !== 'text') return;
@@ -1170,6 +1184,15 @@ export const DebugDrawer: React.FC<{
const source = ctx.createBufferSource(); const source = ctx.createBufferSource();
source.buffer = audioBuffer; source.buffer = audioBuffer;
source.connect(ctx.destination); source.connect(ctx.destination);
activeAudioSourcesRef.current.add(source);
source.onended = () => {
activeAudioSourcesRef.current.delete(source);
try {
source.disconnect();
} catch {
// no-op
}
};
const startAt = Math.max(ctx.currentTime + 0.02, playbackTimeRef.current); const startAt = Math.max(ctx.currentTime + 0.02, playbackTimeRef.current);
source.start(startAt); source.start(startAt);
@@ -1228,6 +1251,11 @@ export const DebugDrawer: React.FC<{
if (mode === 'text') { if (mode === 'text') {
if (textTtsEnabled) await ensureAudioContext(); if (textTtsEnabled) await ensureAudioContext();
await ensureWsSession(); await ensureWsSession();
// Interrupt any in-flight response/audio before sending new user utterance.
if (wsRef.current?.readyState === WebSocket.OPEN) {
wsRef.current.send(JSON.stringify({ type: 'response.cancel', graceful: false }));
}
stopPlaybackImmediately();
wsRef.current?.send(JSON.stringify({ type: 'input.text', text: userMsg })); wsRef.current?.send(JSON.stringify({ type: 'input.text', text: userMsg }));
} else { } else {
setTimeout(() => { setTimeout(() => {
@@ -1391,7 +1419,7 @@ export const DebugDrawer: React.FC<{
pendingRejectRef.current = null; pendingRejectRef.current = null;
assistantDraftIndexRef.current = null; assistantDraftIndexRef.current = null;
setTextSessionStarted(false); setTextSessionStarted(false);
clearPlaybackQueue(); stopPlaybackImmediately();
if (isOpen) setWsStatus('disconnected'); if (isOpen) setWsStatus('disconnected');
}; };
@@ -1456,7 +1484,15 @@ export const DebugDrawer: React.FC<{
return; return;
} }
if (type === 'output.audio.start') { if (type === 'output.audio.start') {
clearPlaybackQueue(); // New utterance audio starts: cancel old queued/playing audio to avoid overlap.
stopPlaybackImmediately();
return;
}
if (type === 'response.interrupted') {
assistantDraftIndexRef.current = null;
setIsLoading(false);
stopPlaybackImmediately();
return; return;
} }
@@ -1541,7 +1577,7 @@ export const DebugDrawer: React.FC<{
ws.onclose = () => { ws.onclose = () => {
wsReadyRef.current = false; wsReadyRef.current = false;
setTextSessionStarted(false); setTextSessionStarted(false);
clearPlaybackQueue(); stopPlaybackImmediately();
if (wsStatus !== 'error') setWsStatus('disconnected'); if (wsStatus !== 'error') setWsStatus('disconnected');
}; };
}); });
@@ -1549,7 +1585,7 @@ export const DebugDrawer: React.FC<{
useEffect(() => { useEffect(() => {
if (!textTtsEnabled) { if (!textTtsEnabled) {
clearPlaybackQueue(); stopPlaybackImmediately();
} }
}, [textTtsEnabled]); }, [textTtsEnabled]);
@@ -1660,13 +1696,10 @@ export const DebugDrawer: React.FC<{
<div className="flex-1 overflow-hidden flex flex-col min-h-0 mb-4"> <div className="flex-1 overflow-hidden flex flex-col min-h-0 mb-4">
{mode === 'text' ? ( {mode === 'text' ? (
textSessionStarted ? ( textSessionStarted ? (
<div className="flex-1 flex flex-col min-h-0 space-y-2"> <div className="flex-1 flex flex-col min-h-0 animate-in fade-in">
<div className="flex flex-col h-full animate-in fade-in"> <div className="h-[52vh] min-h-[260px] max-h-[52vh]">
<TranscriptionLog /> <TranscriptionLog />
</div> </div>
<Button variant="destructive" size="sm" className="w-full h-10 font-bold" onClick={closeWs}>
<PhoneOff className="mr-2 h-4 w-4" />
</Button>
</div> </div>
) : wsStatus === 'connecting' ? ( ) : wsStatus === 'connecting' ? (
<div className="flex-1 flex flex-col items-center justify-center space-y-6"> <div className="flex-1 flex flex-col items-center justify-center space-y-6">
@@ -1769,6 +1802,11 @@ export const DebugDrawer: React.FC<{
<div className="shrink-0 space-y-2"> <div className="shrink-0 space-y-2">
<div className="flex space-x-2"> <div className="flex space-x-2">
{mode === 'text' && textSessionStarted && (
<Button variant="destructive" size="sm" className="h-9 font-bold shrink-0" onClick={closeWs}>
<PhoneOff className="mr-2 h-4 w-4" />
</Button>
)}
<Input <Input
value={inputText} value={inputText}
onChange={e => setInputText(e.target.value)} onChange={e => setInputText(e.target.value)}