Add voice debug drawer

This commit is contained in:
Xin Wang
2026-02-09 16:38:00 +08:00
parent 29d8361ca9
commit a26e3f4026

View File

@@ -992,6 +992,37 @@ export const DebugDrawer: React.FC<{
llmModels: LLMModel[]; llmModels: LLMModel[];
asrModels: ASRModel[]; asrModels: ASRModel[];
}> = ({ isOpen, onClose, assistant, voices, llmModels, asrModels }) => { }> = ({ isOpen, onClose, assistant, voices, llmModels, asrModels }) => {
const TARGET_SAMPLE_RATE = 16000;
/**
 * Downsample a Float32 PCM buffer to 16 kHz by averaging each window of
 * source samples (a simple box filter).
 *
 * Returns the input untouched when the source rate is already 16 kHz or
 * below — this helper never upsamples.
 *
 * @param input            mono PCM samples in [-1, 1]
 * @param inputSampleRate  sample rate of `input` in Hz
 * @returns a new Float32Array at ~16 kHz, or `input` itself when no
 *          downsampling is needed
 */
const downsampleTo16k = (input: Float32Array, inputSampleRate: number): Float32Array => {
  // Collapsed the original redundant `===` and `<` early-returns into one
  // check: both cases pass the buffer through unchanged.
  if (inputSampleRate <= TARGET_SAMPLE_RATE) return input;
  const ratio = inputSampleRate / TARGET_SAMPLE_RATE;
  const outputLength = Math.max(1, Math.round(input.length / ratio));
  const output = new Float32Array(outputLength);
  let offsetInput = 0;
  for (let i = 0; i < outputLength; i += 1) {
    // End of the source window for this output sample, clamped to the buffer.
    const nextOffsetInput = Math.min(input.length, Math.round((i + 1) * ratio));
    let accum = 0;
    let count = 0;
    for (let j = offsetInput; j < nextOffsetInput; j += 1) {
      accum += input[j];
      count += 1;
    }
    // Empty window (rounding at the tail, or empty input): fall back to the
    // nearest sample, or 0 when the buffer has no samples at all.
    output[i] = count > 0 ? accum / count : input[Math.min(offsetInput, input.length - 1)] || 0;
    offsetInput = nextOffsetInput;
  }
  return output;
};
/**
 * Convert normalized Float32 samples ([-1, 1]) into signed 16-bit PCM.
 * Out-of-range values are clamped; negative samples scale by 0x8000 and
 * non-negative ones by 0x7fff so both extremes land on the int16 limits.
 */
const float32ToPcm16 = (input: Float32Array): Int16Array => {
  const pcm = new Int16Array(input.length);
  input.forEach((sample, index) => {
    const clamped = Math.min(1, Math.max(-1, sample));
    const scale = clamped < 0 ? 0x8000 : 0x7fff;
    pcm[index] = Math.round(clamped * scale);
  });
  return pcm;
};
const [mode, setMode] = useState<'text' | 'voice' | 'video'>('text'); const [mode, setMode] = useState<'text' | 'voice' | 'video'>('text');
const [messages, setMessages] = useState<{role: 'user' | 'model', text: string}[]>([]); const [messages, setMessages] = useState<{role: 'user' | 'model', text: string}[]>([]);
const [inputText, setInputText] = useState(''); const [inputText, setInputText] = useState('');
@@ -1002,6 +1033,8 @@ export const DebugDrawer: React.FC<{
const [wsError, setWsError] = useState(''); const [wsError, setWsError] = useState('');
const [resolvedConfigOpen, setResolvedConfigOpen] = useState(false); const [resolvedConfigOpen, setResolvedConfigOpen] = useState(false);
const [resolvedConfigView, setResolvedConfigView] = useState<string>(''); const [resolvedConfigView, setResolvedConfigView] = useState<string>('');
const [captureConfigOpen, setCaptureConfigOpen] = useState(false);
const [captureConfigView, setCaptureConfigView] = useState<string>('');
const [settingsDrawerOpen, setSettingsDrawerOpen] = useState(false); const [settingsDrawerOpen, setSettingsDrawerOpen] = useState(false);
const [wsUrl, setWsUrl] = useState<string>(() => { const [wsUrl, setWsUrl] = useState<string>(() => {
const fromStorage = localStorage.getItem('debug_ws_url'); const fromStorage = localStorage.getItem('debug_ws_url');
@@ -1034,6 +1067,15 @@ export const DebugDrawer: React.FC<{
const [selectedMic, setSelectedMic] = useState<string>(''); const [selectedMic, setSelectedMic] = useState<string>('');
const [isSwapped, setIsSwapped] = useState(false); const [isSwapped, setIsSwapped] = useState(false);
const [textTtsEnabled, setTextTtsEnabled] = useState(true); const [textTtsEnabled, setTextTtsEnabled] = useState(true);
const [aecEnabled, setAecEnabled] = useState<boolean>(() => localStorage.getItem('debug_audio_aec') !== '0');
const [nsEnabled, setNsEnabled] = useState<boolean>(() => localStorage.getItem('debug_audio_ns') !== '0');
const [agcEnabled, setAgcEnabled] = useState<boolean>(() => localStorage.getItem('debug_audio_agc') !== '0');
const micAudioCtxRef = useRef<AudioContext | null>(null);
const micSourceRef = useRef<MediaStreamAudioSourceNode | null>(null);
const micProcessorRef = useRef<ScriptProcessorNode | null>(null);
const micGainRef = useRef<GainNode | null>(null);
const userDraftIndexRef = useRef<number | null>(null);
// Initialize // Initialize
useEffect(() => { useEffect(() => {
@@ -1047,6 +1089,7 @@ export const DebugDrawer: React.FC<{
} }
} else { } else {
setMode('text'); setMode('text');
stopVoiceCapture();
stopMedia(); stopMedia();
closeWs(); closeWs();
if (audioCtxRef.current) { if (audioCtxRef.current) {
@@ -1063,6 +1106,18 @@ export const DebugDrawer: React.FC<{
localStorage.setItem('debug_ws_url', wsUrl); localStorage.setItem('debug_ws_url', wsUrl);
}, [wsUrl]); }, [wsUrl]);
// Persist the 3A (AEC/NS/AGC) toggles so the debug drawer remembers them
// across reloads; '1'/'0' matches the initial-state reads from localStorage.
useEffect(() => {
localStorage.setItem('debug_audio_aec', aecEnabled ? '1' : '0');
}, [aecEnabled]);
useEffect(() => {
localStorage.setItem('debug_audio_ns', nsEnabled ? '1' : '0');
}, [nsEnabled]);
useEffect(() => {
localStorage.setItem('debug_audio_agc', agcEnabled ? '1' : '0');
}, [agcEnabled]);
// Auto-scroll logic // Auto-scroll logic
useEffect(() => { useEffect(() => {
if (scrollRef.current) { if (scrollRef.current) {
@@ -1072,10 +1127,11 @@ export const DebugDrawer: React.FC<{
// Fetch Devices // Fetch Devices
useEffect(() => { useEffect(() => {
if (isOpen && mode === 'video') { if (isOpen && (mode === 'video' || mode === 'voice')) {
const getDevices = async () => { const getDevices = async () => {
let permissionStream: MediaStream | null = null;
try { try {
await navigator.mediaDevices.getUserMedia({ audio: true, video: true }); permissionStream = await navigator.mediaDevices.getUserMedia({ audio: true, video: mode === 'video' });
const dev = await navigator.mediaDevices.enumerateDevices(); const dev = await navigator.mediaDevices.enumerateDevices();
setDevices(dev); setDevices(dev);
const cams = dev.filter(d => d.kind === 'videoinput'); const cams = dev.filter(d => d.kind === 'videoinput');
@@ -1084,6 +1140,8 @@ export const DebugDrawer: React.FC<{
if (mics.length > 0 && !selectedMic) setSelectedMic(mics[0].deviceId); if (mics.length > 0 && !selectedMic) setSelectedMic(mics[0].deviceId);
} catch (e) { } catch (e) {
console.error("Error enumerating devices", e); console.error("Error enumerating devices", e);
} finally {
permissionStream?.getTracks().forEach((track) => track.stop());
} }
}; };
getDevices(); getDevices();
@@ -1097,6 +1155,100 @@ export const DebugDrawer: React.FC<{
} }
}; };
/**
 * Tear down the microphone capture graph: detach the ScriptProcessor
 * callback, disconnect every node, close the capture AudioContext, clear the
 * echoed capture config, and release the underlying media stream.
 * Safe to call repeatedly or before capture ever started.
 */
const stopVoiceCapture = () => {
  // Disconnect a node, swallowing "already disconnected" errors.
  const detach = (node: AudioNode | null) => {
    if (!node) return;
    try {
      node.disconnect();
    } catch {
      // no-op
    }
  };
  if (micProcessorRef.current) {
    // Drop the callback first so no audio frame fires mid-teardown.
    micProcessorRef.current.onaudioprocess = null;
  }
  detach(micProcessorRef.current);
  micProcessorRef.current = null;
  detach(micSourceRef.current);
  micSourceRef.current = null;
  detach(micGainRef.current);
  micGainRef.current = null;
  if (micAudioCtxRef.current) {
    // Fire-and-forget: nothing depends on close() resolving.
    void micAudioCtxRef.current.close();
    micAudioCtxRef.current = null;
  }
  setCaptureConfigView('');
  stopMedia();
};
/**
 * Assemble getUserMedia audio constraints from the current device selection
 * and the 3A (AEC/NS/AGC) toggles: mono capture at the 16 kHz target rate.
 * No deviceId constraint is sent when no mic has been picked yet.
 */
const buildMicConstraints = (): MediaTrackConstraints => {
  const deviceId = selectedMic ? { exact: selectedMic } : undefined;
  return {
    deviceId,
    echoCancellation: aecEnabled,
    noiseSuppression: nsEnabled,
    autoGainControl: agcEnabled,
    channelCount: 1,
    sampleRate: TARGET_SAMPLE_RATE,
  };
};
/**
 * Start streaming microphone audio to the debug WebSocket as 16 kHz PCM16.
 *
 * Tears down any previous capture, acquires a fresh audio-only stream with
 * the current 3A constraints, echoes requested/applied/capability settings
 * into the capture-config panel, then wires:
 *   mic stream -> ScriptProcessor (4096 frames, mono) -> zero-gain node -> destination
 * The zero-gain sink keeps the processor pulled by the graph without any
 * audible playback.
 *
 * NOTE(review): ScriptProcessorNode is deprecated in favor of AudioWorklet;
 * acceptable for a debug drawer, but worth migrating eventually.
 */
const startVoiceCapture = async () => {
  stopVoiceCapture();
  const requestedConstraints = buildMicConstraints();
  const stream = await navigator.mediaDevices.getUserMedia({
    audio: requestedConstraints,
    video: false,
  });
  // Stored so stopMedia()/stopVoiceCapture() can release the tracks later.
  streamRef.current = stream;
  const track = stream.getAudioTracks()[0];
  if (track) {
    console.log('Voice capture settings', track.getSettings());
    // Echo what we asked for vs. what the browser actually applied.
    setCaptureConfigView(
      JSON.stringify(
        {
          requested: requestedConstraints,
          applied: track.getSettings(),
          // getCapabilities is not implemented in every browser.
          capabilities: typeof track.getCapabilities === 'function' ? track.getCapabilities() : undefined,
        },
        null,
        2
      )
    );
  }
  const ctx = new AudioContext();
  // Contexts created outside a user gesture may start suspended.
  if (ctx.state === 'suspended') {
    await ctx.resume();
  }
  micAudioCtxRef.current = ctx;
  const source = ctx.createMediaStreamSource(stream);
  const processor = ctx.createScriptProcessor(4096, 1, 1);
  const silentGain = ctx.createGain();
  silentGain.gain.value = 0;
  source.connect(processor);
  processor.connect(silentGain);
  silentGain.connect(ctx.destination);
  processor.onaudioprocess = (event) => {
    // Only forward audio once the socket is open and the session handshake
    // has completed (wsReadyRef).
    if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN || !wsReadyRef.current) return;
    const inChannel = event.inputBuffer.getChannelData(0);
    // The context runs at the hardware rate; resample to 16 kHz before send.
    const downsampled = downsampleTo16k(inChannel, event.inputBuffer.sampleRate);
    const pcm16 = float32ToPcm16(downsampled);
    wsRef.current.send(pcm16.buffer);
  };
  // Publish the nodes last so stopVoiceCapture never sees a half-built graph.
  micSourceRef.current = source;
  micProcessorRef.current = processor;
  micGainRef.current = silentGain;
};
const ensureAudioContext = async () => { const ensureAudioContext = async () => {
if (!audioCtxRef.current) { if (!audioCtxRef.current) {
audioCtxRef.current = new AudioContext(); audioCtxRef.current = new AudioContext();
@@ -1174,7 +1326,6 @@ export const DebugDrawer: React.FC<{
const playPcm16Chunk = async (pcmBuffer: ArrayBuffer) => { const playPcm16Chunk = async (pcmBuffer: ArrayBuffer) => {
if (!textTtsEnabled) return; if (!textTtsEnabled) return;
if (mode !== 'text') return;
const ctx = await ensureAudioContext(); const ctx = await ensureAudioContext();
const int16 = new Int16Array(pcmBuffer); const int16 = new Int16Array(pcmBuffer);
if (int16.length === 0) return; if (int16.length === 0) return;
@@ -1219,17 +1370,43 @@ export const DebugDrawer: React.FC<{
}, [mode, isOpen, selectedCamera, selectedMic, callStatus]); }, [mode, isOpen, selectedCamera, selectedMic, callStatus]);
const handleCall = () => { const handleCall = () => {
if (mode !== 'voice') {
setCallStatus('calling'); setCallStatus('calling');
setTimeout(() => { setTimeout(() => {
setCallStatus('active'); setCallStatus('active');
setMessages([{ role: 'model', text: assistant.opener || "Hello!" }]); setMessages([{ role: 'model', text: assistant.opener || "Hello!" }]);
}, 1500); }, 1500);
return;
}
const launchVoice = async () => {
try {
setCallStatus('calling');
setMessages([]);
setWsError('');
closeWs();
if (textTtsEnabled) await ensureAudioContext();
await ensureWsSession();
await startVoiceCapture();
setCallStatus('active');
setMessages([{ role: 'model', text: assistant.opener || 'Hello!' }]);
} catch (e) {
console.error(e);
stopVoiceCapture();
setCallStatus('idle');
setWsStatus('error');
setWsError((e as Error)?.message || 'Failed to start voice call');
}
};
void launchVoice();
}; };
const handleHangup = () => { const handleHangup = () => {
stopVoiceCapture();
stopMedia(); stopMedia();
closeWs();
setCallStatus('idle'); setCallStatus('idle');
setMessages([]); setMessages([]);
setIsLoading(false);
}; };
const handleSend = async () => { const handleSend = async () => {
@@ -1250,6 +1427,9 @@ export const DebugDrawer: React.FC<{
} }
stopPlaybackImmediately(); stopPlaybackImmediately();
wsRef.current?.send(JSON.stringify({ type: 'input.text', text: userMsg })); wsRef.current?.send(JSON.stringify({ type: 'input.text', text: userMsg }));
} else if (mode === 'voice') {
await ensureWsSession();
wsRef.current?.send(JSON.stringify({ type: 'input.text', text: userMsg }));
} else { } else {
setTimeout(() => { setTimeout(() => {
setMessages(prev => [...prev, { role: 'model', text: `[Mock Response]: Received "${userMsg}"` }]); setMessages(prev => [...prev, { role: 'model', text: `[Mock Response]: Received "${userMsg}"` }]);
@@ -1385,6 +1565,7 @@ export const DebugDrawer: React.FC<{
pendingResolveRef.current = null; pendingResolveRef.current = null;
pendingRejectRef.current = null; pendingRejectRef.current = null;
assistantDraftIndexRef.current = null; assistantDraftIndexRef.current = null;
userDraftIndexRef.current = null;
setTextSessionStarted(false); setTextSessionStarted(false);
stopPlaybackImmediately(); stopPlaybackImmediately();
if (isOpen) setWsStatus('disconnected'); if (isOpen) setWsStatus('disconnected');
@@ -1468,6 +1649,49 @@ export const DebugDrawer: React.FC<{
return; return;
} }
if (type === 'input.speech_started') {
setIsLoading(true);
return;
}
if (type === 'input.speech_stopped') {
setIsLoading(false);
return;
}
if (type === 'transcript.delta') {
const delta = String(payload.text || '');
if (!delta) return;
setMessages((prev) => {
const idx = userDraftIndexRef.current;
if (idx === null || !prev[idx] || prev[idx].role !== 'user') {
const next = [...prev, { role: 'user' as const, text: delta }];
userDraftIndexRef.current = next.length - 1;
return next;
}
const next = [...prev];
next[idx] = { ...next[idx], text: next[idx].text + delta };
return next;
});
return;
}
if (type === 'transcript.final') {
const finalText = String(payload.text || '');
setMessages((prev) => {
const idx = userDraftIndexRef.current;
userDraftIndexRef.current = null;
if (idx !== null && prev[idx] && prev[idx].role === 'user') {
const next = [...prev];
next[idx] = { ...next[idx], text: finalText || next[idx].text };
return next;
}
if (!finalText) return prev;
return [...prev, { role: 'user', text: finalText }];
});
return;
}
if (type === 'assistant.response.delta') { if (type === 'assistant.response.delta') {
const delta = String(payload.text || ''); const delta = String(payload.text || '');
if (!delta) return; if (!delta) return;
@@ -1540,6 +1764,7 @@ export const DebugDrawer: React.FC<{
ws.onclose = () => { ws.onclose = () => {
wsReadyRef.current = false; wsReadyRef.current = false;
setTextSessionStarted(false); setTextSessionStarted(false);
userDraftIndexRef.current = null;
stopPlaybackImmediately(); stopPlaybackImmediately();
if (wsStatus !== 'error') setWsStatus('disconnected'); if (wsStatus !== 'error') setWsStatus('disconnected');
}; };
@@ -1560,6 +1785,18 @@ export const DebugDrawer: React.FC<{
} }
}, [textTtsEnabled]); }, [textTtsEnabled]);
// Re-acquire the microphone whenever a 3A toggle or the selected mic changes
// mid-call so the new constraints take effect immediately.
// NOTE(review): this also fires when callStatus first flips to 'active',
// immediately restarting the capture that handleCall just started — a
// redundant getUserMedia round-trip; consider guarding on the previous
// settings. startVoiceCapture is omitted from the dep array (it is
// re-created every render) — presumably intentional, TODO confirm.
useEffect(() => {
if (!isOpen || mode !== 'voice' || callStatus !== 'active') return;
const restartCapture = async () => {
try {
await startVoiceCapture();
} catch (e) {
console.error('Failed to restart voice capture with new 3A settings', e);
}
};
void restartCapture();
}, [aecEnabled, nsEnabled, agcEnabled, selectedMic, mode, callStatus, isOpen]);
useEffect(() => { useEffect(() => {
if (!isOpen) return; if (!isOpen) return;
const localResolved = buildLocalResolvedRuntime(); const localResolved = buildLocalResolvedRuntime();
@@ -1611,6 +1848,35 @@ export const DebugDrawer: React.FC<{
TTS TTS
</label> </label>
</div> </div>
<div className="rounded-md border border-white/10 bg-black/20 p-2 space-y-2">
<p className="text-[10px] uppercase tracking-widest text-muted-foreground">Audio 3A</p>
<label className="inline-flex items-center gap-2 text-xs text-muted-foreground">
<input type="checkbox" checked={aecEnabled} onChange={(e) => setAecEnabled(e.target.checked)} className="accent-primary" />
Echo Cancellation (AEC)
</label>
<label className="inline-flex items-center gap-2 text-xs text-muted-foreground">
<input type="checkbox" checked={nsEnabled} onChange={(e) => setNsEnabled(e.target.checked)} className="accent-primary" />
Noise Suppression (NS)
</label>
<label className="inline-flex items-center gap-2 text-xs text-muted-foreground">
<input type="checkbox" checked={agcEnabled} onChange={(e) => setAgcEnabled(e.target.checked)} className="accent-primary" />
Auto Gain Control (AGC)
</label>
</div>
<div className="rounded-md border border-white/10 bg-black/30">
<button
className="w-full px-3 py-2 text-left text-xs text-muted-foreground hover:text-foreground flex items-center justify-between"
onClick={() => setCaptureConfigOpen((v) => !v)}
>
<span>Capture Config Echo</span>
<ChevronDown className={`h-3.5 w-3.5 transition-transform ${captureConfigOpen ? 'rotate-180' : ''}`} />
</button>
{captureConfigOpen && (
<pre className="px-3 pb-3 text-[11px] leading-5 text-cyan-100/90 whitespace-pre-wrap break-all max-h-64 overflow-auto">
{captureConfigView || 'Voice call not started yet.'}
</pre>
)}
</div>
{wsError && <p className="text-xs text-red-400">{wsError}</p>} {wsError && <p className="text-xs text-red-400">{wsError}</p>}
<div className="rounded-md border border-white/10 bg-black/30"> <div className="rounded-md border border-white/10 bg-black/30">