Add voice debug drawer
This commit is contained in:
@@ -992,6 +992,37 @@ export const DebugDrawer: React.FC<{
|
||||
llmModels: LLMModel[];
|
||||
asrModels: ASRModel[];
|
||||
}> = ({ isOpen, onClose, assistant, voices, llmModels, asrModels }) => {
|
||||
const TARGET_SAMPLE_RATE = 16000;
|
||||
const downsampleTo16k = (input: Float32Array, inputSampleRate: number): Float32Array => {
|
||||
if (inputSampleRate === TARGET_SAMPLE_RATE) return input;
|
||||
if (inputSampleRate < TARGET_SAMPLE_RATE) return input;
|
||||
const ratio = inputSampleRate / TARGET_SAMPLE_RATE;
|
||||
const outputLength = Math.max(1, Math.round(input.length / ratio));
|
||||
const output = new Float32Array(outputLength);
|
||||
let offsetInput = 0;
|
||||
for (let i = 0; i < outputLength; i += 1) {
|
||||
const nextOffsetInput = Math.min(input.length, Math.round((i + 1) * ratio));
|
||||
let accum = 0;
|
||||
let count = 0;
|
||||
for (let j = offsetInput; j < nextOffsetInput; j += 1) {
|
||||
accum += input[j];
|
||||
count += 1;
|
||||
}
|
||||
output[i] = count > 0 ? accum / count : input[Math.min(offsetInput, input.length - 1)] || 0;
|
||||
offsetInput = nextOffsetInput;
|
||||
}
|
||||
return output;
|
||||
};
|
||||
|
||||
const float32ToPcm16 = (input: Float32Array): Int16Array => {
|
||||
const output = new Int16Array(input.length);
|
||||
for (let i = 0; i < input.length; i += 1) {
|
||||
const s = Math.max(-1, Math.min(1, input[i]));
|
||||
output[i] = s < 0 ? Math.round(s * 0x8000) : Math.round(s * 0x7fff);
|
||||
}
|
||||
return output;
|
||||
};
|
||||
|
||||
const [mode, setMode] = useState<'text' | 'voice' | 'video'>('text');
|
||||
const [messages, setMessages] = useState<{role: 'user' | 'model', text: string}[]>([]);
|
||||
const [inputText, setInputText] = useState('');
|
||||
@@ -1002,6 +1033,8 @@ export const DebugDrawer: React.FC<{
|
||||
const [wsError, setWsError] = useState('');
|
||||
const [resolvedConfigOpen, setResolvedConfigOpen] = useState(false);
|
||||
const [resolvedConfigView, setResolvedConfigView] = useState<string>('');
|
||||
const [captureConfigOpen, setCaptureConfigOpen] = useState(false);
|
||||
const [captureConfigView, setCaptureConfigView] = useState<string>('');
|
||||
const [settingsDrawerOpen, setSettingsDrawerOpen] = useState(false);
|
||||
const [wsUrl, setWsUrl] = useState<string>(() => {
|
||||
const fromStorage = localStorage.getItem('debug_ws_url');
|
||||
@@ -1034,6 +1067,15 @@ export const DebugDrawer: React.FC<{
|
||||
const [selectedMic, setSelectedMic] = useState<string>('');
|
||||
const [isSwapped, setIsSwapped] = useState(false);
|
||||
const [textTtsEnabled, setTextTtsEnabled] = useState(true);
|
||||
const [aecEnabled, setAecEnabled] = useState<boolean>(() => localStorage.getItem('debug_audio_aec') !== '0');
|
||||
const [nsEnabled, setNsEnabled] = useState<boolean>(() => localStorage.getItem('debug_audio_ns') !== '0');
|
||||
const [agcEnabled, setAgcEnabled] = useState<boolean>(() => localStorage.getItem('debug_audio_agc') !== '0');
|
||||
|
||||
const micAudioCtxRef = useRef<AudioContext | null>(null);
|
||||
const micSourceRef = useRef<MediaStreamAudioSourceNode | null>(null);
|
||||
const micProcessorRef = useRef<ScriptProcessorNode | null>(null);
|
||||
const micGainRef = useRef<GainNode | null>(null);
|
||||
const userDraftIndexRef = useRef<number | null>(null);
|
||||
|
||||
// Initialize
|
||||
useEffect(() => {
|
||||
@@ -1047,6 +1089,7 @@ export const DebugDrawer: React.FC<{
|
||||
}
|
||||
} else {
|
||||
setMode('text');
|
||||
stopVoiceCapture();
|
||||
stopMedia();
|
||||
closeWs();
|
||||
if (audioCtxRef.current) {
|
||||
@@ -1063,6 +1106,18 @@ export const DebugDrawer: React.FC<{
|
||||
localStorage.setItem('debug_ws_url', wsUrl);
|
||||
}, [wsUrl]);
|
||||
|
||||
// Persist the 3A (AEC / NS / AGC) toggles to localStorage so the debug
// audio settings survive page reloads. Stored as '1'/'0' to match the
// `!== '0'` default-on reads used by the corresponding useState initializers.
useEffect(() => {
  localStorage.setItem('debug_audio_aec', aecEnabled ? '1' : '0');
}, [aecEnabled]);

useEffect(() => {
  localStorage.setItem('debug_audio_ns', nsEnabled ? '1' : '0');
}, [nsEnabled]);

useEffect(() => {
  localStorage.setItem('debug_audio_agc', agcEnabled ? '1' : '0');
}, [agcEnabled]);
|
||||
|
||||
// Auto-scroll logic
|
||||
useEffect(() => {
|
||||
if (scrollRef.current) {
|
||||
@@ -1072,10 +1127,11 @@ export const DebugDrawer: React.FC<{
|
||||
|
||||
// Fetch Devices
|
||||
useEffect(() => {
|
||||
if (isOpen && mode === 'video') {
|
||||
if (isOpen && (mode === 'video' || mode === 'voice')) {
|
||||
const getDevices = async () => {
|
||||
let permissionStream: MediaStream | null = null;
|
||||
try {
|
||||
await navigator.mediaDevices.getUserMedia({ audio: true, video: true });
|
||||
permissionStream = await navigator.mediaDevices.getUserMedia({ audio: true, video: mode === 'video' });
|
||||
const dev = await navigator.mediaDevices.enumerateDevices();
|
||||
setDevices(dev);
|
||||
const cams = dev.filter(d => d.kind === 'videoinput');
|
||||
@@ -1084,6 +1140,8 @@ export const DebugDrawer: React.FC<{
|
||||
if (mics.length > 0 && !selectedMic) setSelectedMic(mics[0].deviceId);
|
||||
} catch (e) {
|
||||
console.error("Error enumerating devices", e);
|
||||
} finally {
|
||||
permissionStream?.getTracks().forEach((track) => track.stop());
|
||||
}
|
||||
};
|
||||
getDevices();
|
||||
@@ -1097,6 +1155,100 @@ export const DebugDrawer: React.FC<{
|
||||
}
|
||||
};
|
||||
|
||||
// Tears down the microphone capture pipeline: ScriptProcessor, media
// source node, muted gain node, and the capture AudioContext. Safe to
// call when nothing is active — every branch null-checks its ref first.
const stopVoiceCapture = () => {
  if (micProcessorRef.current) {
    // Clear the callback before disconnecting so no further PCM frames
    // are pushed to the websocket mid-teardown.
    micProcessorRef.current.onaudioprocess = null;
    try {
      micProcessorRef.current.disconnect();
    } catch {
      // no-op
    }
    micProcessorRef.current = null;
  }
  if (micSourceRef.current) {
    try {
      micSourceRef.current.disconnect();
    } catch {
      // no-op
    }
    micSourceRef.current = null;
  }
  if (micGainRef.current) {
    try {
      micGainRef.current.disconnect();
    } catch {
      // no-op
    }
    micGainRef.current = null;
  }
  if (micAudioCtxRef.current) {
    // close() returns a promise; fire-and-forget is intentional here.
    void micAudioCtxRef.current.close();
    micAudioCtxRef.current = null;
  }
  // Reset the capture-config echo panel and release the media stream.
  setCaptureConfigView('');
  stopMedia();
};
||||
|
||||
const buildMicConstraints = (): MediaTrackConstraints => ({
|
||||
deviceId: selectedMic ? { exact: selectedMic } : undefined,
|
||||
echoCancellation: aecEnabled,
|
||||
noiseSuppression: nsEnabled,
|
||||
autoGainControl: agcEnabled,
|
||||
channelCount: 1,
|
||||
sampleRate: TARGET_SAMPLE_RATE,
|
||||
});
|
||||
|
||||
// Starts microphone capture for a voice call: requests a mic stream with
// the current 3A constraints, echoes requested/applied/capability track
// settings into the debug panel, then wires an audio graph
// (source -> ScriptProcessor -> muted gain -> destination) that streams
// 16 kHz PCM16 frames over the websocket.
// NOTE(review): ScriptProcessorNode is deprecated in favor of
// AudioWorklet; acceptable for a debug drawer, but worth migrating.
const startVoiceCapture = async () => {
  // Idempotent: tear down any previous pipeline before building a new one.
  stopVoiceCapture();
  const requestedConstraints = buildMicConstraints();
  const stream = await navigator.mediaDevices.getUserMedia({
    audio: requestedConstraints,
    video: false,
  });
  streamRef.current = stream;
  const track = stream.getAudioTracks()[0];
  if (track) {
    console.log('Voice capture settings', track.getSettings());
    // Echo requested vs applied settings so constraint mismatches
    // (e.g. browser ignoring sampleRate) are visible in the drawer.
    setCaptureConfigView(
      JSON.stringify(
        {
          requested: requestedConstraints,
          applied: track.getSettings(),
          // getCapabilities is not implemented in every browser.
          capabilities: typeof track.getCapabilities === 'function' ? track.getCapabilities() : undefined,
        },
        null,
        2
      )
    );
  }

  const ctx = new AudioContext();
  // Autoplay policies may create the context suspended; resume before use.
  if (ctx.state === 'suspended') {
    await ctx.resume();
  }
  micAudioCtxRef.current = ctx;
  const source = ctx.createMediaStreamSource(stream);
  const processor = ctx.createScriptProcessor(4096, 1, 1);
  // Zero-gain sink: the processor must reach the destination for
  // onaudioprocess to fire, but the mic must not be played back locally.
  const silentGain = ctx.createGain();
  silentGain.gain.value = 0;

  source.connect(processor);
  processor.connect(silentGain);
  silentGain.connect(ctx.destination);

  processor.onaudioprocess = (event) => {
    // Drop frames until the websocket session is fully established.
    if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN || !wsReadyRef.current) return;
    const inChannel = event.inputBuffer.getChannelData(0);
    const downsampled = downsampleTo16k(inChannel, event.inputBuffer.sampleRate);
    const pcm16 = float32ToPcm16(downsampled);
    wsRef.current.send(pcm16.buffer);
  };

  // Stash nodes so stopVoiceCapture can disconnect them later.
  micSourceRef.current = source;
  micProcessorRef.current = processor;
  micGainRef.current = silentGain;
};
||||
|
||||
const ensureAudioContext = async () => {
|
||||
if (!audioCtxRef.current) {
|
||||
audioCtxRef.current = new AudioContext();
|
||||
@@ -1174,7 +1326,6 @@ export const DebugDrawer: React.FC<{
|
||||
|
||||
const playPcm16Chunk = async (pcmBuffer: ArrayBuffer) => {
|
||||
if (!textTtsEnabled) return;
|
||||
if (mode !== 'text') return;
|
||||
const ctx = await ensureAudioContext();
|
||||
const int16 = new Int16Array(pcmBuffer);
|
||||
if (int16.length === 0) return;
|
||||
@@ -1219,17 +1370,43 @@ export const DebugDrawer: React.FC<{
|
||||
}, [mode, isOpen, selectedCamera, selectedMic, callStatus]);
|
||||
|
||||
// Starts a debug call. Non-voice modes get a mocked connect sequence;
// voice mode opens a real websocket session and begins mic capture.
const handleCall = () => {
  if (mode !== 'voice') {
    // Mock flow: pretend to dial for 1.5s, then show the opener.
    setCallStatus('calling');
    setTimeout(() => {
      setCallStatus('active');
      setMessages([{ role: 'model', text: assistant.opener || "Hello!" }]);
    }, 1500);
    return;
  }
  const launchVoice = async () => {
    try {
      setCallStatus('calling');
      setMessages([]);
      setWsError('');
      // Always start from a clean socket; a stale session could keep
      // receiving/sending alongside the new one.
      closeWs();
      // Create the playback context while still inside the user gesture,
      // so autoplay policy does not block later TTS playback.
      if (textTtsEnabled) await ensureAudioContext();
      await ensureWsSession();
      await startVoiceCapture();
      setCallStatus('active');
      setMessages([{ role: 'model', text: assistant.opener || 'Hello!' }]);
    } catch (e) {
      console.error(e);
      // Roll back to idle on any failure (mic permission denied, ws error, ...).
      stopVoiceCapture();
      setCallStatus('idle');
      setWsStatus('error');
      setWsError((e as Error)?.message || 'Failed to start voice call');
    }
  };
  void launchVoice();
};
||||
|
||||
// Ends the current call: stops mic capture and media, closes the
// websocket, and resets call/chat/loading state back to idle.
const handleHangup = () => {
  stopVoiceCapture();
  stopMedia();
  closeWs();
  setCallStatus('idle');
  setMessages([]);
  setIsLoading(false);
};
||||
|
||||
const handleSend = async () => {
|
||||
@@ -1250,6 +1427,9 @@ export const DebugDrawer: React.FC<{
|
||||
}
|
||||
stopPlaybackImmediately();
|
||||
wsRef.current?.send(JSON.stringify({ type: 'input.text', text: userMsg }));
|
||||
} else if (mode === 'voice') {
|
||||
await ensureWsSession();
|
||||
wsRef.current?.send(JSON.stringify({ type: 'input.text', text: userMsg }));
|
||||
} else {
|
||||
setTimeout(() => {
|
||||
setMessages(prev => [...prev, { role: 'model', text: `[Mock Response]: Received "${userMsg}"` }]);
|
||||
@@ -1385,6 +1565,7 @@ export const DebugDrawer: React.FC<{
|
||||
pendingResolveRef.current = null;
|
||||
pendingRejectRef.current = null;
|
||||
assistantDraftIndexRef.current = null;
|
||||
userDraftIndexRef.current = null;
|
||||
setTextSessionStarted(false);
|
||||
stopPlaybackImmediately();
|
||||
if (isOpen) setWsStatus('disconnected');
|
||||
@@ -1468,6 +1649,49 @@ export const DebugDrawer: React.FC<{
|
||||
return;
|
||||
}
|
||||
|
||||
if (type === 'input.speech_started') {
|
||||
setIsLoading(true);
|
||||
return;
|
||||
}
|
||||
|
||||
if (type === 'input.speech_stopped') {
|
||||
setIsLoading(false);
|
||||
return;
|
||||
}
|
||||
|
||||
if (type === 'transcript.delta') {
|
||||
const delta = String(payload.text || '');
|
||||
if (!delta) return;
|
||||
setMessages((prev) => {
|
||||
const idx = userDraftIndexRef.current;
|
||||
if (idx === null || !prev[idx] || prev[idx].role !== 'user') {
|
||||
const next = [...prev, { role: 'user' as const, text: delta }];
|
||||
userDraftIndexRef.current = next.length - 1;
|
||||
return next;
|
||||
}
|
||||
const next = [...prev];
|
||||
next[idx] = { ...next[idx], text: next[idx].text + delta };
|
||||
return next;
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
if (type === 'transcript.final') {
|
||||
const finalText = String(payload.text || '');
|
||||
setMessages((prev) => {
|
||||
const idx = userDraftIndexRef.current;
|
||||
userDraftIndexRef.current = null;
|
||||
if (idx !== null && prev[idx] && prev[idx].role === 'user') {
|
||||
const next = [...prev];
|
||||
next[idx] = { ...next[idx], text: finalText || next[idx].text };
|
||||
return next;
|
||||
}
|
||||
if (!finalText) return prev;
|
||||
return [...prev, { role: 'user', text: finalText }];
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
if (type === 'assistant.response.delta') {
|
||||
const delta = String(payload.text || '');
|
||||
if (!delta) return;
|
||||
@@ -1540,6 +1764,7 @@ export const DebugDrawer: React.FC<{
|
||||
ws.onclose = () => {
|
||||
wsReadyRef.current = false;
|
||||
setTextSessionStarted(false);
|
||||
userDraftIndexRef.current = null;
|
||||
stopPlaybackImmediately();
|
||||
if (wsStatus !== 'error') setWsStatus('disconnected');
|
||||
};
|
||||
@@ -1560,6 +1785,18 @@ export const DebugDrawer: React.FC<{
|
||||
}
|
||||
}, [textTtsEnabled]);
|
||||
|
||||
// Restart mic capture whenever a 3A toggle or the selected mic changes
// during an active voice call, so the new constraints take effect
// immediately (startVoiceCapture tears down the old pipeline itself).
// NOTE(review): startVoiceCapture is not listed in the deps — presumably
// to avoid re-running on every render since it is re-created each time;
// confirm this is intentional.
useEffect(() => {
  if (!isOpen || mode !== 'voice' || callStatus !== 'active') return;
  const restartCapture = async () => {
    try {
      await startVoiceCapture();
    } catch (e) {
      console.error('Failed to restart voice capture with new 3A settings', e);
    }
  };
  void restartCapture();
}, [aecEnabled, nsEnabled, agcEnabled, selectedMic, mode, callStatus, isOpen]);
|
||||
|
||||
useEffect(() => {
|
||||
if (!isOpen) return;
|
||||
const localResolved = buildLocalResolvedRuntime();
|
||||
@@ -1611,6 +1848,35 @@ export const DebugDrawer: React.FC<{
|
||||
TTS
|
||||
</label>
|
||||
</div>
|
||||
<div className="rounded-md border border-white/10 bg-black/20 p-2 space-y-2">
|
||||
<p className="text-[10px] uppercase tracking-widest text-muted-foreground">Audio 3A</p>
|
||||
<label className="inline-flex items-center gap-2 text-xs text-muted-foreground">
|
||||
<input type="checkbox" checked={aecEnabled} onChange={(e) => setAecEnabled(e.target.checked)} className="accent-primary" />
|
||||
Echo Cancellation (AEC)
|
||||
</label>
|
||||
<label className="inline-flex items-center gap-2 text-xs text-muted-foreground">
|
||||
<input type="checkbox" checked={nsEnabled} onChange={(e) => setNsEnabled(e.target.checked)} className="accent-primary" />
|
||||
Noise Suppression (NS)
|
||||
</label>
|
||||
<label className="inline-flex items-center gap-2 text-xs text-muted-foreground">
|
||||
<input type="checkbox" checked={agcEnabled} onChange={(e) => setAgcEnabled(e.target.checked)} className="accent-primary" />
|
||||
Auto Gain Control (AGC)
|
||||
</label>
|
||||
</div>
|
||||
<div className="rounded-md border border-white/10 bg-black/30">
|
||||
<button
|
||||
className="w-full px-3 py-2 text-left text-xs text-muted-foreground hover:text-foreground flex items-center justify-between"
|
||||
onClick={() => setCaptureConfigOpen((v) => !v)}
|
||||
>
|
||||
<span>Capture Config Echo</span>
|
||||
<ChevronDown className={`h-3.5 w-3.5 transition-transform ${captureConfigOpen ? 'rotate-180' : ''}`} />
|
||||
</button>
|
||||
{captureConfigOpen && (
|
||||
<pre className="px-3 pb-3 text-[11px] leading-5 text-cyan-100/90 whitespace-pre-wrap break-all max-h-64 overflow-auto">
|
||||
{captureConfigView || 'Voice call not started yet.'}
|
||||
</pre>
|
||||
)}
|
||||
</div>
|
||||
{wsError && <p className="text-xs text-red-400">{wsError}</p>}
|
||||
|
||||
<div className="rounded-md border border-white/10 bg-black/30">
|
||||
|
||||
Reference in New Issue
Block a user