diff --git a/web/pages/Assistants.tsx b/web/pages/Assistants.tsx index 0311057..0de0761 100644 --- a/web/pages/Assistants.tsx +++ b/web/pages/Assistants.tsx @@ -992,6 +992,37 @@ export const DebugDrawer: React.FC<{ llmModels: LLMModel[]; asrModels: ASRModel[]; }> = ({ isOpen, onClose, assistant, voices, llmModels, asrModels }) => { + const TARGET_SAMPLE_RATE = 16000; + const downsampleTo16k = (input: Float32Array, inputSampleRate: number): Float32Array => { + if (inputSampleRate === TARGET_SAMPLE_RATE) return input; + if (inputSampleRate < TARGET_SAMPLE_RATE) return input; + const ratio = inputSampleRate / TARGET_SAMPLE_RATE; + const outputLength = Math.max(1, Math.round(input.length / ratio)); + const output = new Float32Array(outputLength); + let offsetInput = 0; + for (let i = 0; i < outputLength; i += 1) { + const nextOffsetInput = Math.min(input.length, Math.round((i + 1) * ratio)); + let accum = 0; + let count = 0; + for (let j = offsetInput; j < nextOffsetInput; j += 1) { + accum += input[j]; + count += 1; + } + output[i] = count > 0 ? accum / count : input[Math.min(offsetInput, input.length - 1)] || 0; + offsetInput = nextOffsetInput; + } + return output; + }; + + const float32ToPcm16 = (input: Float32Array): Int16Array => { + const output = new Int16Array(input.length); + for (let i = 0; i < input.length; i += 1) { + const s = Math.max(-1, Math.min(1, input[i])); + output[i] = s < 0 ? Math.round(s * 0x8000) : Math.round(s * 0x7fff); + } + return output; + }; + const [mode, setMode] = useState<'text' | 'voice' | 'video'>('text'); const [messages, setMessages] = useState<{role: 'user' | 'model', text: string}[]>([]); const [inputText, setInputText] = useState(''); @@ -1002,6 +1033,8 @@ export const DebugDrawer: React.FC<{ const [wsError, setWsError] = useState(''); const [resolvedConfigOpen, setResolvedConfigOpen] = useState(false); const [resolvedConfigView, setResolvedConfigView] = useState(''); + const [captureConfigOpen, setCaptureConfigOpen] = useState(false); + const [captureConfigView, setCaptureConfigView] = useState(''); const [settingsDrawerOpen, setSettingsDrawerOpen] = useState(false); const [wsUrl, setWsUrl] = useState(() => { const fromStorage = localStorage.getItem('debug_ws_url'); @@ -1034,6 +1067,15 @@ export const DebugDrawer: React.FC<{ const [selectedMic, setSelectedMic] = useState(''); const [isSwapped, setIsSwapped] = useState(false); const [textTtsEnabled, setTextTtsEnabled] = useState(true); + const [aecEnabled, setAecEnabled] = useState(() => localStorage.getItem('debug_audio_aec') !== '0'); + const [nsEnabled, setNsEnabled] = useState(() => localStorage.getItem('debug_audio_ns') !== '0'); + const [agcEnabled, setAgcEnabled] = useState(() => localStorage.getItem('debug_audio_agc') !== '0'); + + const micAudioCtxRef = useRef(null); + const micSourceRef = useRef(null); + const micProcessorRef = useRef(null); + const micGainRef = useRef(null); + const userDraftIndexRef = useRef(null); // Initialize useEffect(() => { @@ -1047,6 +1089,7 @@ export const DebugDrawer: React.FC<{ } } else { setMode('text'); + stopVoiceCapture(); stopMedia(); closeWs(); if (audioCtxRef.current) { @@ -1063,6 +1106,18 @@ export const DebugDrawer: React.FC<{ localStorage.setItem('debug_ws_url', wsUrl); }, [wsUrl]); + useEffect(() => { + localStorage.setItem('debug_audio_aec', aecEnabled ? '1' : '0'); + }, [aecEnabled]); + + useEffect(() => { + localStorage.setItem('debug_audio_ns', nsEnabled ? '1' : '0'); + }, [nsEnabled]); + + useEffect(() => { + localStorage.setItem('debug_audio_agc', agcEnabled ? '1' : '0'); + }, [agcEnabled]); + // Auto-scroll logic useEffect(() => { if (scrollRef.current) { @@ -1072,10 +1127,11 @@ export const DebugDrawer: React.FC<{ // Fetch Devices useEffect(() => { - if (isOpen && mode === 'video') { + if (isOpen && (mode === 'video' || mode === 'voice')) { const getDevices = async () => { + let permissionStream: MediaStream | null = null; try { - await navigator.mediaDevices.getUserMedia({ audio: true, video: true }); + permissionStream = await navigator.mediaDevices.getUserMedia({ audio: true, video: mode === 'video' }); const dev = await navigator.mediaDevices.enumerateDevices(); setDevices(dev); const cams = dev.filter(d => d.kind === 'videoinput'); @@ -1084,6 +1140,8 @@ export const DebugDrawer: React.FC<{ if (mics.length > 0 && !selectedMic) setSelectedMic(mics[0].deviceId); } catch (e) { console.error("Error enumerating devices", e); + } finally { + permissionStream?.getTracks().forEach((track) => track.stop()); } }; getDevices(); @@ -1097,6 +1155,100 @@ export const DebugDrawer: React.FC<{ } }; + const stopVoiceCapture = () => { + if (micProcessorRef.current) { + micProcessorRef.current.onaudioprocess = null; + try { + micProcessorRef.current.disconnect(); + } catch { + // no-op + } + micProcessorRef.current = null; + } + if (micSourceRef.current) { + try { + micSourceRef.current.disconnect(); + } catch { + // no-op + } + micSourceRef.current = null; + } + if (micGainRef.current) { + try { + micGainRef.current.disconnect(); + } catch { + // no-op + } + micGainRef.current = null; + } + if (micAudioCtxRef.current) { + void micAudioCtxRef.current.close(); + micAudioCtxRef.current = null; + } + setCaptureConfigView(''); + stopMedia(); + }; + + const buildMicConstraints = (): MediaTrackConstraints => ({ + deviceId: selectedMic ? { exact: selectedMic } : undefined, + echoCancellation: aecEnabled, + noiseSuppression: nsEnabled, + autoGainControl: agcEnabled, + channelCount: 1, + sampleRate: TARGET_SAMPLE_RATE, + }); + + const startVoiceCapture = async () => { + stopVoiceCapture(); + const requestedConstraints = buildMicConstraints(); + const stream = await navigator.mediaDevices.getUserMedia({ + audio: requestedConstraints, + video: false, + }); + streamRef.current = stream; + const track = stream.getAudioTracks()[0]; + if (track) { + console.log('Voice capture settings', track.getSettings()); + setCaptureConfigView( + JSON.stringify( + { + requested: requestedConstraints, + applied: track.getSettings(), + capabilities: typeof track.getCapabilities === 'function' ? track.getCapabilities() : undefined, + }, + null, + 2 + ) + ); + } + + const ctx = new AudioContext(); + if (ctx.state === 'suspended') { + await ctx.resume(); + } + micAudioCtxRef.current = ctx; + const source = ctx.createMediaStreamSource(stream); + const processor = ctx.createScriptProcessor(4096, 1, 1); + const silentGain = ctx.createGain(); + silentGain.gain.value = 0; + + source.connect(processor); + processor.connect(silentGain); + silentGain.connect(ctx.destination); + + processor.onaudioprocess = (event) => { + if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN || !wsReadyRef.current) return; + const inChannel = event.inputBuffer.getChannelData(0); + const downsampled = downsampleTo16k(inChannel, event.inputBuffer.sampleRate); + const pcm16 = float32ToPcm16(downsampled); + wsRef.current.send(pcm16.buffer); + }; + + micSourceRef.current = source; + micProcessorRef.current = processor; + micGainRef.current = silentGain; + }; + const ensureAudioContext = async () => { if (!audioCtxRef.current) { audioCtxRef.current = new AudioContext(); @@ -1174,7 +1326,6 @@ export const DebugDrawer: React.FC<{ const playPcm16Chunk = async (pcmBuffer: ArrayBuffer) => { if (!textTtsEnabled) return; - if (mode !== 'text') return; const ctx = await ensureAudioContext(); const int16 = new Int16Array(pcmBuffer); if (int16.length === 0) return; @@ -1219,17 +1370,43 @@ export const DebugDrawer: React.FC<{ }, [mode, isOpen, selectedCamera, selectedMic, callStatus]); const handleCall = () => { - setCallStatus('calling'); - setTimeout(() => { + if (mode !== 'voice') { + setCallStatus('calling'); + setTimeout(() => { + setCallStatus('active'); + setMessages([{ role: 'model', text: assistant.opener || "Hello!" }]); + }, 1500); + return; + } + const launchVoice = async () => { + try { + setCallStatus('calling'); + setMessages([]); + setWsError(''); + closeWs(); + if (textTtsEnabled) await ensureAudioContext(); + await ensureWsSession(); + await startVoiceCapture(); setCallStatus('active'); - setMessages([{ role: 'model', text: assistant.opener || "Hello!" }]); - }, 1500); + setMessages([{ role: 'model', text: assistant.opener || 'Hello!' }]); + } catch (e) { + console.error(e); + stopVoiceCapture(); + setCallStatus('idle'); + setWsStatus('error'); + setWsError((e as Error)?.message || 'Failed to start voice call'); + } + }; + void launchVoice(); }; const handleHangup = () => { + stopVoiceCapture(); stopMedia(); + closeWs(); setCallStatus('idle'); setMessages([]); + setIsLoading(false); }; const handleSend = async () => { @@ -1250,6 +1427,9 @@ export const DebugDrawer: React.FC<{ } stopPlaybackImmediately(); wsRef.current?.send(JSON.stringify({ type: 'input.text', text: userMsg })); + } else if (mode === 'voice') { + await ensureWsSession(); + wsRef.current?.send(JSON.stringify({ type: 'input.text', text: userMsg })); } else { setTimeout(() => { setMessages(prev => [...prev, { role: 'model', text: `[Mock Response]: Received "${userMsg}"` }]); @@ -1385,6 +1565,7 @@ export const DebugDrawer: React.FC<{ pendingResolveRef.current = null; pendingRejectRef.current = null; assistantDraftIndexRef.current = null; + userDraftIndexRef.current = null; setTextSessionStarted(false); stopPlaybackImmediately(); if (isOpen) setWsStatus('disconnected'); @@ -1468,6 +1649,49 @@ export const DebugDrawer: React.FC<{ return; } + if (type === 'input.speech_started') { + setIsLoading(true); + return; + } + + if (type === 'input.speech_stopped') { + setIsLoading(false); + return; + } + + if (type === 'transcript.delta') { + const delta = String(payload.text || ''); + if (!delta) return; + setMessages((prev) => { + const idx = userDraftIndexRef.current; + if (idx === null || !prev[idx] || prev[idx].role !== 'user') { + const next = [...prev, { role: 'user' as const, text: delta }]; + userDraftIndexRef.current = next.length - 1; + return next; + } + const next = [...prev]; + next[idx] = { ...next[idx], text: next[idx].text + delta }; + return next; + }); + return; + } + + if (type === 'transcript.final') { + const finalText = String(payload.text || ''); + setMessages((prev) => { + const idx = userDraftIndexRef.current; + userDraftIndexRef.current = null; + if (idx !== null && prev[idx] && prev[idx].role === 'user') { + const next = [...prev]; + next[idx] = { ...next[idx], text: finalText || next[idx].text }; + return next; + } + if (!finalText) return prev; + return [...prev, { role: 'user', text: finalText }]; + }); + return; + } + if (type === 'assistant.response.delta') { const delta = String(payload.text || ''); if (!delta) return; @@ -1540,6 +1764,7 @@ export const DebugDrawer: React.FC<{ ws.onclose = () => { wsReadyRef.current = false; setTextSessionStarted(false); + userDraftIndexRef.current = null; stopPlaybackImmediately(); if (wsStatus !== 'error') setWsStatus('disconnected'); }; @@ -1560,6 +1785,18 @@ export const DebugDrawer: React.FC<{ } }, [textTtsEnabled]); + useEffect(() => { + if (!isOpen || mode !== 'voice' || callStatus !== 'active') return; + const restartCapture = async () => { + try { + await startVoiceCapture(); + } catch (e) { + console.error('Failed to restart voice capture with new 3A settings', e); + } + }; + void restartCapture(); + }, [aecEnabled, nsEnabled, agcEnabled, selectedMic, mode, callStatus, isOpen]); + useEffect(() => { if (!isOpen) return; const localResolved = buildLocalResolvedRuntime(); @@ -1611,6 +1848,35 @@ export const DebugDrawer: React.FC<{ TTS +
+

Audio 3A

+ + + +
+
+ + {captureConfigOpen && ( +
+              {captureConfigView || 'Voice call not started yet.'}
+            
+ )} +
{wsError &&

{wsError}

}