diff --git a/web/pages/Assistants.tsx b/web/pages/Assistants.tsx
index 2865c82..3d697f2 100644
--- a/web/pages/Assistants.tsx
+++ b/web/pages/Assistants.tsx
@@ -1061,11 +1061,14 @@ export const DebugDrawer: React.FC<{
   const pendingResolveRef = useRef<(() => void) | null>(null);
   const pendingRejectRef = useRef<((e: Error) => void) | null>(null);
   const assistantDraftIndexRef = useRef(null);
+  const audioCtxRef = useRef(null);
+  const playbackTimeRef = useRef(0);
 
   const [devices, setDevices] = useState([]);
   const [selectedCamera, setSelectedCamera] = useState('');
   const [selectedMic, setSelectedMic] = useState('');
   const [isSwapped, setIsSwapped] = useState(false);
+  const [textTtsEnabled, setTextTtsEnabled] = useState(true);
 
   // Initialize
   useEffect(() => {
@@ -1080,6 +1083,10 @@ export const DebugDrawer: React.FC<{
       setMode('text');
       stopMedia();
       closeWs();
+      if (audioCtxRef.current) {
+        void audioCtxRef.current.close();
+        audioCtxRef.current = null;
+      }
       setIsSwapped(false);
       setCallStatus('idle');
     }
@@ -1123,6 +1130,48 @@ export const DebugDrawer: React.FC<{
     }
   };
 
+  const ensureAudioContext = async () => {
+    if (!audioCtxRef.current) {
+      audioCtxRef.current = new AudioContext();
+      playbackTimeRef.current = audioCtxRef.current.currentTime;
+    }
+    if (audioCtxRef.current.state === 'suspended') {
+      await audioCtxRef.current.resume();
+    }
+    return audioCtxRef.current;
+  };
+
+  const clearPlaybackQueue = () => {
+    const ctx = audioCtxRef.current;
+    if (!ctx) return;
+    playbackTimeRef.current = ctx.currentTime;
+  };
+
+  const playPcm16Chunk = async (pcmBuffer: ArrayBuffer) => {
+    if (!textTtsEnabled) return;
+    if (mode !== 'text') return;
+    const ctx = await ensureAudioContext();
+    const int16 = new Int16Array(pcmBuffer);
+    if (int16.length === 0) return;
+
+    const float32 = new Float32Array(int16.length);
+    for (let i = 0; i < int16.length; i += 1) {
+      float32[i] = int16[i] / 32768;
+    }
+
+    const sampleRate = 16000;
+    const audioBuffer = ctx.createBuffer(1, float32.length,
+      sampleRate);
+    audioBuffer.copyToChannel(float32, 0);
+
+    const source = ctx.createBufferSource();
+    source.buffer = audioBuffer;
+    source.connect(ctx.destination);
+
+    const startAt = Math.max(ctx.currentTime + 0.02, playbackTimeRef.current);
+    source.start(startAt);
+    playbackTimeRef.current = startAt + audioBuffer.duration;
+  };
+
   useEffect(() => {
     const handleStream = async () => {
       if (isOpen && mode === 'video' && callStatus === 'active') {
@@ -1173,6 +1222,7 @@ export const DebugDrawer: React.FC<{
 
     try {
       if (mode === 'text') {
+        if (textTtsEnabled) await ensureAudioContext();
         await ensureWsSession();
         wsRef.current?.send(JSON.stringify({ type: 'input.text', text: userMsg }));
       } else {
@@ -1321,6 +1371,7 @@ export const DebugDrawer: React.FC<{
     pendingResolveRef.current = null;
     pendingRejectRef.current = null;
     assistantDraftIndexRef.current = null;
+    clearPlaybackQueue();
     if (isOpen) setWsStatus('disconnected');
   };
 
@@ -1357,6 +1408,14 @@ export const DebugDrawer: React.FC<{
       };
 
       ws.onmessage = (event) => {
+        if (event.data instanceof ArrayBuffer) {
+          void playPcm16Chunk(event.data);
+          return;
+        }
+        if (event.data instanceof Blob) {
+          void event.data.arrayBuffer().then((buf) => playPcm16Chunk(buf));
+          return;
+        }
         if (typeof event.data !== 'string') return;
         let payload: any;
         try {
@@ -1376,6 +1435,10 @@ export const DebugDrawer: React.FC<{
           );
           return;
         }
+        if (type === 'output.audio.start') {
+          clearPlaybackQueue();
+          return;
+        }
 
         if (type === 'session.started') {
           wsReadyRef.current = true;
@@ -1457,11 +1520,18 @@ export const DebugDrawer: React.FC<{
 
       ws.onclose = () => {
         wsReadyRef.current = false;
+        clearPlaybackQueue();
        if (wsStatus !== 'error') setWsStatus('disconnected');
       };
     });
   };
 
+  useEffect(() => {
+    if (!textTtsEnabled) {
+      clearPlaybackQueue();
+    }
+  }, [textTtsEnabled]);
+
   useEffect(() => {
     if (!isOpen) return;
     const localResolved = buildLocalResolvedRuntime();
@@ -1525,6 +1595,15 @@ export const DebugDrawer: React.FC<{
           WS: {wsStatus}
+