Add voice debug drawer
This commit is contained in:
@@ -992,6 +992,37 @@ export const DebugDrawer: React.FC<{
|
||||
llmModels: LLMModel[];
|
||||
asrModels: ASRModel[];
|
||||
}> = ({ isOpen, onClose, assistant, voices, llmModels, asrModels }) => {
|
||||
const TARGET_SAMPLE_RATE = 16000;
|
||||
const downsampleTo16k = (input: Float32Array, inputSampleRate: number): Float32Array => {
|
||||
if (inputSampleRate === TARGET_SAMPLE_RATE) return input;
|
||||
if (inputSampleRate < TARGET_SAMPLE_RATE) return input;
|
||||
const ratio = inputSampleRate / TARGET_SAMPLE_RATE;
|
||||
const outputLength = Math.max(1, Math.round(input.length / ratio));
|
||||
const output = new Float32Array(outputLength);
|
||||
let offsetInput = 0;
|
||||
for (let i = 0; i < outputLength; i += 1) {
|
||||
const nextOffsetInput = Math.min(input.length, Math.round((i + 1) * ratio));
|
||||
let accum = 0;
|
||||
let count = 0;
|
||||
for (let j = offsetInput; j < nextOffsetInput; j += 1) {
|
||||
accum += input[j];
|
||||
count += 1;
|
||||
}
|
||||
output[i] = count > 0 ? accum / count : input[Math.min(offsetInput, input.length - 1)] || 0;
|
||||
offsetInput = nextOffsetInput;
|
||||
}
|
||||
return output;
|
||||
};
|
||||
|
||||
const float32ToPcm16 = (input: Float32Array): Int16Array => {
|
||||
const output = new Int16Array(input.length);
|
||||
for (let i = 0; i < input.length; i += 1) {
|
||||
const s = Math.max(-1, Math.min(1, input[i]));
|
||||
output[i] = s < 0 ? Math.round(s * 0x8000) : Math.round(s * 0x7fff);
|
||||
}
|
||||
return output;
|
||||
};
|
||||
|
||||
const [mode, setMode] = useState<'text' | 'voice' | 'video'>('text');
|
||||
const [messages, setMessages] = useState<{role: 'user' | 'model', text: string}[]>([]);
|
||||
const [inputText, setInputText] = useState('');
|
||||
@@ -1002,6 +1033,8 @@ export const DebugDrawer: React.FC<{
|
||||
const [wsError, setWsError] = useState('');
|
||||
const [resolvedConfigOpen, setResolvedConfigOpen] = useState(false);
|
||||
const [resolvedConfigView, setResolvedConfigView] = useState<string>('');
|
||||
const [captureConfigOpen, setCaptureConfigOpen] = useState(false);
|
||||
const [captureConfigView, setCaptureConfigView] = useState<string>('');
|
||||
const [settingsDrawerOpen, setSettingsDrawerOpen] = useState(false);
|
||||
const [wsUrl, setWsUrl] = useState<string>(() => {
|
||||
const fromStorage = localStorage.getItem('debug_ws_url');
|
||||
@@ -1034,6 +1067,15 @@ export const DebugDrawer: React.FC<{
|
||||
const [selectedMic, setSelectedMic] = useState<string>('');
|
||||
const [isSwapped, setIsSwapped] = useState(false);
|
||||
const [textTtsEnabled, setTextTtsEnabled] = useState(true);
|
||||
const [aecEnabled, setAecEnabled] = useState<boolean>(() => localStorage.getItem('debug_audio_aec') !== '0');
|
||||
const [nsEnabled, setNsEnabled] = useState<boolean>(() => localStorage.getItem('debug_audio_ns') !== '0');
|
||||
const [agcEnabled, setAgcEnabled] = useState<boolean>(() => localStorage.getItem('debug_audio_agc') !== '0');
|
||||
|
||||
const micAudioCtxRef = useRef<AudioContext | null>(null);
|
||||
const micSourceRef = useRef<MediaStreamAudioSourceNode | null>(null);
|
||||
const micProcessorRef = useRef<ScriptProcessorNode | null>(null);
|
||||
const micGainRef = useRef<GainNode | null>(null);
|
||||
const userDraftIndexRef = useRef<number | null>(null);
|
||||
|
||||
// Initialize
|
||||
useEffect(() => {
|
||||
@@ -1047,6 +1089,7 @@ export const DebugDrawer: React.FC<{
|
||||
}
|
||||
} else {
|
||||
setMode('text');
|
||||
stopVoiceCapture();
|
||||
stopMedia();
|
||||
closeWs();
|
||||
if (audioCtxRef.current) {
|
||||
@@ -1063,6 +1106,18 @@ export const DebugDrawer: React.FC<{
|
||||
localStorage.setItem('debug_ws_url', wsUrl);
|
||||
}, [wsUrl]);
|
||||
|
||||
// Persist the 3A (AEC / NS / AGC) toggles to localStorage so the debug
// audio settings survive page reloads. Stored as '1'/'0' to match the
// `!== '0'` default-on reads used by the corresponding useState initializers.
useEffect(() => {
  localStorage.setItem('debug_audio_aec', aecEnabled ? '1' : '0');
}, [aecEnabled]);

useEffect(() => {
  localStorage.setItem('debug_audio_ns', nsEnabled ? '1' : '0');
}, [nsEnabled]);

useEffect(() => {
  localStorage.setItem('debug_audio_agc', agcEnabled ? '1' : '0');
}, [agcEnabled]);
|
||||
|
||||
// Auto-scroll logic
|
||||
useEffect(() => {
|
||||
if (scrollRef.current) {
|
||||
@@ -1072,10 +1127,11 @@ export const DebugDrawer: React.FC<{
|
||||
|
||||
// Fetch Devices
|
||||
useEffect(() => {
|
||||
if (isOpen && mode === 'video') {
|
||||
if (isOpen && (mode === 'video' || mode === 'voice')) {
|
||||
const getDevices = async () => {
|
||||
let permissionStream: MediaStream | null = null;
|
||||
try {
|
||||
await navigator.mediaDevices.getUserMedia({ audio: true, video: true });
|
||||
permissionStream = await navigator.mediaDevices.getUserMedia({ audio: true, video: mode === 'video' });
|
||||
const dev = await navigator.mediaDevices.enumerateDevices();
|
||||
setDevices(dev);
|
||||
const cams = dev.filter(d => d.kind === 'videoinput');
|
||||
@@ -1084,6 +1140,8 @@ export const DebugDrawer: React.FC<{
|
||||
if (mics.length > 0 && !selectedMic) setSelectedMic(mics[0].deviceId);
|
||||
} catch (e) {
|
||||
console.error("Error enumerating devices", e);
|
||||
} finally {
|
||||
permissionStream?.getTracks().forEach((track) => track.stop());
|
||||
}
|
||||
};
|
||||
getDevices();
|
||||
@@ -1097,6 +1155,100 @@ export const DebugDrawer: React.FC<{
|
||||
}
|
||||
};
|
||||
|
||||
// Tears down the microphone capture pipeline: ScriptProcessor, media
// source node, muted gain node, and the capture AudioContext. Safe to
// call when nothing is active — every branch null-checks its ref first.
const stopVoiceCapture = () => {
  if (micProcessorRef.current) {
    // Clear the callback before disconnecting so no further PCM frames
    // are pushed to the websocket mid-teardown.
    micProcessorRef.current.onaudioprocess = null;
    try {
      micProcessorRef.current.disconnect();
    } catch {
      // no-op
    }
    micProcessorRef.current = null;
  }
  if (micSourceRef.current) {
    try {
      micSourceRef.current.disconnect();
    } catch {
      // no-op
    }
    micSourceRef.current = null;
  }
  if (micGainRef.current) {
    try {
      micGainRef.current.disconnect();
    } catch {
      // no-op
    }
    micGainRef.current = null;
  }
  if (micAudioCtxRef.current) {
    // close() returns a promise; fire-and-forget is intentional here.
    void micAudioCtxRef.current.close();
    micAudioCtxRef.current = null;
  }
  // Reset the capture-config echo panel and release the media stream.
  setCaptureConfigView('');
  stopMedia();
};
||||
|
||||
const buildMicConstraints = (): MediaTrackConstraints => ({
|
||||
deviceId: selectedMic ? { exact: selectedMic } : undefined,
|
||||
echoCancellation: aecEnabled,
|
||||
noiseSuppression: nsEnabled,
|
||||
autoGainControl: agcEnabled,
|
||||
channelCount: 1,
|
||||
sampleRate: TARGET_SAMPLE_RATE,
|
||||
});
|
||||
|
||||
// Starts microphone capture for a voice call: requests a mic stream with
// the current 3A constraints, echoes requested/applied/capability track
// settings into the debug panel, then wires an audio graph
// (source -> ScriptProcessor -> muted gain -> destination) that streams
// 16 kHz PCM16 frames over the websocket.
// NOTE(review): ScriptProcessorNode is deprecated in favor of
// AudioWorklet; acceptable for a debug drawer, but worth migrating.
const startVoiceCapture = async () => {
  // Idempotent: tear down any previous pipeline before building a new one.
  stopVoiceCapture();
  const requestedConstraints = buildMicConstraints();
  const stream = await navigator.mediaDevices.getUserMedia({
    audio: requestedConstraints,
    video: false,
  });
  streamRef.current = stream;
  const track = stream.getAudioTracks()[0];
  if (track) {
    console.log('Voice capture settings', track.getSettings());
    // Echo requested vs applied settings so constraint mismatches
    // (e.g. browser ignoring sampleRate) are visible in the drawer.
    setCaptureConfigView(
      JSON.stringify(
        {
          requested: requestedConstraints,
          applied: track.getSettings(),
          // getCapabilities is not implemented in every browser.
          capabilities: typeof track.getCapabilities === 'function' ? track.getCapabilities() : undefined,
        },
        null,
        2
      )
    );
  }

  const ctx = new AudioContext();
  // Autoplay policies may create the context suspended; resume before use.
  if (ctx.state === 'suspended') {
    await ctx.resume();
  }
  micAudioCtxRef.current = ctx;
  const source = ctx.createMediaStreamSource(stream);
  const processor = ctx.createScriptProcessor(4096, 1, 1);
  // Zero-gain sink: the processor must reach the destination for
  // onaudioprocess to fire, but the mic must not be played back locally.
  const silentGain = ctx.createGain();
  silentGain.gain.value = 0;

  source.connect(processor);
  processor.connect(silentGain);
  silentGain.connect(ctx.destination);

  processor.onaudioprocess = (event) => {
    // Drop frames until the websocket session is fully established.
    if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN || !wsReadyRef.current) return;
    const inChannel = event.inputBuffer.getChannelData(0);
    const downsampled = downsampleTo16k(inChannel, event.inputBuffer.sampleRate);
    const pcm16 = float32ToPcm16(downsampled);
    wsRef.current.send(pcm16.buffer);
  };

  // Stash nodes so stopVoiceCapture can disconnect them later.
  micSourceRef.current = source;
  micProcessorRef.current = processor;
  micGainRef.current = silentGain;
};
||||
|
||||
const ensureAudioContext = async () => {
|
||||
if (!audioCtxRef.current) {
|
||||
audioCtxRef.current = new AudioContext();
|
||||
@@ -1174,7 +1326,6 @@ export const DebugDrawer: React.FC<{
|
||||
|
||||
const playPcm16Chunk = async (pcmBuffer: ArrayBuffer) => {
|
||||
if (!textTtsEnabled) return;
|
||||
if (mode !== 'text') return;
|
||||
const ctx = await ensureAudioContext();
|
||||
const int16 = new Int16Array(pcmBuffer);
|
||||
if (int16.length === 0) return;
|
||||
@@ -1219,17 +1370,43 @@ export const DebugDrawer: React.FC<{
|
||||
}, [mode, isOpen, selectedCamera, selectedMic, callStatus]);
|
||||
|
||||
// Starts a debug call. Non-voice modes get a mocked connect sequence;
// voice mode opens a real websocket session and begins mic capture.
const handleCall = () => {
  if (mode !== 'voice') {
    // Mock flow: pretend to dial for 1.5s, then show the opener.
    setCallStatus('calling');
    setTimeout(() => {
      setCallStatus('active');
      setMessages([{ role: 'model', text: assistant.opener || "Hello!" }]);
    }, 1500);
    return;
  }
  const launchVoice = async () => {
    try {
      setCallStatus('calling');
      setMessages([]);
      setWsError('');
      // Always start from a clean socket; a stale session could keep
      // receiving/sending alongside the new one.
      closeWs();
      // Create the playback context while still inside the user gesture,
      // so autoplay policy does not block later TTS playback.
      if (textTtsEnabled) await ensureAudioContext();
      await ensureWsSession();
      await startVoiceCapture();
      setCallStatus('active');
      setMessages([{ role: 'model', text: assistant.opener || 'Hello!' }]);
    } catch (e) {
      console.error(e);
      // Roll back to idle on any failure (mic permission denied, ws error, ...).
      stopVoiceCapture();
      setCallStatus('idle');
      setWsStatus('error');
      setWsError((e as Error)?.message || 'Failed to start voice call');
    }
  };
  void launchVoice();
};
||||
|
||||
// Ends the current call: stops mic capture and media, closes the
// websocket, and resets call/chat/loading state back to idle.
const handleHangup = () => {
  stopVoiceCapture();
  stopMedia();
  closeWs();
  setCallStatus('idle');
  setMessages([]);
  setIsLoading(false);
};
||||
|
||||
const handleSend = async () => {
|
||||
@@ -1250,6 +1427,9 @@ export const DebugDrawer: React.FC<{
|
||||
}
|
||||
stopPlaybackImmediately();
|
||||
wsRef.current?.send(JSON.stringify({ type: 'input.text', text: userMsg }));
|
||||
} else if (mode === 'voice') {
|
||||
await ensureWsSession();
|
||||
wsRef.current?.send(JSON.stringify({ type: 'input.text', text: userMsg }));
|
||||
} else {
|
||||
setTimeout(() => {
|
||||
setMessages(prev => [...prev, { role: 'model', text: `[Mock Response]: Received "${userMsg}"` }]);
|
||||
@@ -1385,6 +1565,7 @@ export const DebugDrawer: React.FC<{
|
||||
pendingResolveRef.current = null;
|
||||
pendingRejectRef.current = null;
|
||||
assistantDraftIndexRef.current = null;
|
||||
userDraftIndexRef.current = null;
|
||||
setTextSessionStarted(false);
|
||||
stopPlaybackImmediately();
|
||||
if (isOpen) setWsStatus('disconnected');
|
||||
@@ -1468,6 +1649,49 @@ export const DebugDrawer: React.FC<{
|
||||
return;
|
||||
}
|
||||
|
||||
if (type === 'input.speech_started') {
|
||||
setIsLoading(true);
|
||||
return;
|
||||
}
|
||||
|
||||
if (type === 'input.speech_stopped') {
|
||||
setIsLoading(false);
|
||||
return;
|
||||
}
|
||||
|
||||
if (type === 'transcript.delta') {
|
||||
const delta = String(payload.text || '');
|
||||
if (!delta) return;
|
||||
setMessages((prev) => {
|
||||
const idx = userDraftIndexRef.current;
|
||||
if (idx === null || !prev[idx] || prev[idx].role !== 'user') {
|
||||
const next = [...prev, { role: 'user' as const, text: delta }];
|
||||
userDraftIndexRef.current = next.length - 1;
|
||||
return next;
|
||||
}
|
||||
const next = [...prev];
|
||||
next[idx] = { ...next[idx], text: next[idx].text + delta };
|
||||
return next;
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
if (type === 'transcript.final') {
|
||||
const finalText = String(payload.text || '');
|
||||
setMessages((prev) => {
|
||||
const idx = userDraftIndexRef.current;
|
||||
userDraftIndexRef.current = null;
|
||||
if (idx !== null && prev[idx] && prev[idx].role === 'user') {
|
||||
const next = [...prev];
|
||||
next[idx] = { ...next[idx], text: finalText || next[idx].text };
|
||||
return next;
|
||||
}
|
||||
if (!finalText) return prev;
|
||||
return [...prev, { role: 'user', text: finalText }];
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
if (type === 'assistant.response.delta') {
|
||||
const delta = String(payload.text || '');
|
||||
if (!delta) return;
|
||||
@@ -1540,6 +1764,7 @@ export const DebugDrawer: React.FC<{
|
||||
ws.onclose = () => {
|
||||
wsReadyRef.current = false;
|
||||
setTextSessionStarted(false);
|
||||
userDraftIndexRef.current = null;
|
||||
stopPlaybackImmediately();
|
||||
if (wsStatus !== 'error') setWsStatus('disconnected');
|
||||
};
|
||||
@@ -1560,6 +1785,18 @@ export const DebugDrawer: React.FC<{
|
||||
}
|
||||
}, [textTtsEnabled]);
|
||||
|
||||
// Restart mic capture whenever a 3A toggle or the selected mic changes
// during an active voice call, so the new constraints take effect
// immediately (startVoiceCapture tears down the old pipeline itself).
// NOTE(review): startVoiceCapture is not listed in the deps — presumably
// to avoid re-running on every render since it is re-created each time;
// confirm this is intentional.
useEffect(() => {
  if (!isOpen || mode !== 'voice' || callStatus !== 'active') return;
  const restartCapture = async () => {
    try {
      await startVoiceCapture();
    } catch (e) {
      console.error('Failed to restart voice capture with new 3A settings', e);
    }
  };
  void restartCapture();
}, [aecEnabled, nsEnabled, agcEnabled, selectedMic, mode, callStatus, isOpen]);
|
||||
|
||||
useEffect(() => {
|
||||
if (!isOpen) return;
|
||||
const localResolved = buildLocalResolvedRuntime();
|
||||
@@ -1611,6 +1848,35 @@ export const DebugDrawer: React.FC<{
|
||||
TTS
|
||||
</label>
|
||||
</div>
|
||||
<div className="rounded-md border border-white/10 bg-black/20 p-2 space-y-2">
|
||||
<p className="text-[10px] uppercase tracking-widest text-muted-foreground">Audio 3A</p>
|
||||
<label className="inline-flex items-center gap-2 text-xs text-muted-foreground">
|
||||
<input type="checkbox" checked={aecEnabled} onChange={(e) => setAecEnabled(e.target.checked)} className="accent-primary" />
|
||||
Echo Cancellation (AEC)
|
||||
</label>
|
||||
<label className="inline-flex items-center gap-2 text-xs text-muted-foreground">
|
||||
<input type="checkbox" checked={nsEnabled} onChange={(e) => setNsEnabled(e.target.checked)} className="accent-primary" />
|
||||
Noise Suppression (NS)
|
||||
</label>
|
||||
<label className="inline-flex items-center gap-2 text-xs text-muted-foreground">
|
||||
<input type="checkbox" checked={agcEnabled} onChange={(e) => setAgcEnabled(e.target.checked)} className="accent-primary" />
|
||||
Auto Gain Control (AGC)
|
||||
</label>
|
||||
</div>
|
||||
<div className="rounded-md border border-white/10 bg-black/30">
|
||||
<button
|
||||
className="w-full px-3 py-2 text-left text-xs text-muted-foreground hover:text-foreground flex items-center justify-between"
|
||||
onClick={() => setCaptureConfigOpen((v) => !v)}
|
||||
>
|
||||
<span>Capture Config Echo</span>
|
||||
<ChevronDown className={`h-3.5 w-3.5 transition-transform ${captureConfigOpen ? 'rotate-180' : ''}`} />
|
||||
</button>
|
||||
{captureConfigOpen && (
|
||||
<pre className="px-3 pb-3 text-[11px] leading-5 text-cyan-100/90 whitespace-pre-wrap break-all max-h-64 overflow-auto">
|
||||
{captureConfigView || 'Voice call not started yet.'}
|
||||
</pre>
|
||||
)}
|
||||
</div>
|
||||
{wsError && <p className="text-xs text-red-400">{wsError}</p>}
|
||||
|
||||
<div className="rounded-md border border-white/10 bg-black/30">
|
||||
|
||||
Reference in New Issue
Block a user