Add voice debug drawer

This commit is contained in:
Xin Wang
2026-02-09 16:38:00 +08:00
parent 29d8361ca9
commit a26e3f4026

View File

@@ -992,6 +992,37 @@ export const DebugDrawer: React.FC<{
llmModels: LLMModel[]; llmModels: LLMModel[];
asrModels: ASRModel[]; asrModels: ASRModel[];
}> = ({ isOpen, onClose, assistant, voices, llmModels, asrModels }) => { }> = ({ isOpen, onClose, assistant, voices, llmModels, asrModels }) => {
const TARGET_SAMPLE_RATE = 16000;
/**
 * Downsample a Float32 PCM buffer to 16 kHz by averaging each window of
 * source samples (a simple box filter).
 *
 * Returns the input untouched when the source rate is already 16 kHz or
 * below — this helper never upsamples.
 *
 * @param input            mono PCM samples in [-1, 1]
 * @param inputSampleRate  sample rate of `input` in Hz
 * @returns a new Float32Array at ~16 kHz, or `input` itself when no
 *          downsampling is needed
 */
const downsampleTo16k = (input: Float32Array, inputSampleRate: number): Float32Array => {
  // Collapsed the original redundant `===` and `<` early-returns into one
  // check: both cases pass the buffer through unchanged.
  if (inputSampleRate <= TARGET_SAMPLE_RATE) return input;
  const ratio = inputSampleRate / TARGET_SAMPLE_RATE;
  const outputLength = Math.max(1, Math.round(input.length / ratio));
  const output = new Float32Array(outputLength);
  let offsetInput = 0;
  for (let i = 0; i < outputLength; i += 1) {
    // End of the source window for this output sample, clamped to the buffer.
    const nextOffsetInput = Math.min(input.length, Math.round((i + 1) * ratio));
    let accum = 0;
    let count = 0;
    for (let j = offsetInput; j < nextOffsetInput; j += 1) {
      accum += input[j];
      count += 1;
    }
    // Empty window (rounding at the tail, or empty input): fall back to the
    // nearest sample, or 0 when the buffer has no samples at all.
    output[i] = count > 0 ? accum / count : input[Math.min(offsetInput, input.length - 1)] || 0;
    offsetInput = nextOffsetInput;
  }
  return output;
};
/**
 * Convert normalized Float32 samples ([-1, 1]) into signed 16-bit PCM.
 * Out-of-range values are clamped; negative samples scale by 0x8000 and
 * non-negative ones by 0x7fff so both extremes land on the int16 limits.
 */
const float32ToPcm16 = (input: Float32Array): Int16Array => {
  const pcm = new Int16Array(input.length);
  input.forEach((sample, index) => {
    const clamped = Math.min(1, Math.max(-1, sample));
    const scale = clamped < 0 ? 0x8000 : 0x7fff;
    pcm[index] = Math.round(clamped * scale);
  });
  return pcm;
};
const [mode, setMode] = useState<'text' | 'voice' | 'video'>('text'); const [mode, setMode] = useState<'text' | 'voice' | 'video'>('text');
const [messages, setMessages] = useState<{role: 'user' | 'model', text: string}[]>([]); const [messages, setMessages] = useState<{role: 'user' | 'model', text: string}[]>([]);
const [inputText, setInputText] = useState(''); const [inputText, setInputText] = useState('');
@@ -1002,6 +1033,8 @@ export const DebugDrawer: React.FC<{
const [wsError, setWsError] = useState(''); const [wsError, setWsError] = useState('');
const [resolvedConfigOpen, setResolvedConfigOpen] = useState(false); const [resolvedConfigOpen, setResolvedConfigOpen] = useState(false);
const [resolvedConfigView, setResolvedConfigView] = useState<string>(''); const [resolvedConfigView, setResolvedConfigView] = useState<string>('');
const [captureConfigOpen, setCaptureConfigOpen] = useState(false);
const [captureConfigView, setCaptureConfigView] = useState<string>('');
const [settingsDrawerOpen, setSettingsDrawerOpen] = useState(false); const [settingsDrawerOpen, setSettingsDrawerOpen] = useState(false);
const [wsUrl, setWsUrl] = useState<string>(() => { const [wsUrl, setWsUrl] = useState<string>(() => {
const fromStorage = localStorage.getItem('debug_ws_url'); const fromStorage = localStorage.getItem('debug_ws_url');
@@ -1034,6 +1067,15 @@ export const DebugDrawer: React.FC<{
const [selectedMic, setSelectedMic] = useState<string>(''); const [selectedMic, setSelectedMic] = useState<string>('');
const [isSwapped, setIsSwapped] = useState(false); const [isSwapped, setIsSwapped] = useState(false);
const [textTtsEnabled, setTextTtsEnabled] = useState(true); const [textTtsEnabled, setTextTtsEnabled] = useState(true);
const [aecEnabled, setAecEnabled] = useState<boolean>(() => localStorage.getItem('debug_audio_aec') !== '0');
const [nsEnabled, setNsEnabled] = useState<boolean>(() => localStorage.getItem('debug_audio_ns') !== '0');
const [agcEnabled, setAgcEnabled] = useState<boolean>(() => localStorage.getItem('debug_audio_agc') !== '0');
const micAudioCtxRef = useRef<AudioContext | null>(null);
const micSourceRef = useRef<MediaStreamAudioSourceNode | null>(null);
const micProcessorRef = useRef<ScriptProcessorNode | null>(null);
const micGainRef = useRef<GainNode | null>(null);
const userDraftIndexRef = useRef<number | null>(null);
// Initialize // Initialize
useEffect(() => { useEffect(() => {
@@ -1047,6 +1089,7 @@ export const DebugDrawer: React.FC<{
} }
} else { } else {
setMode('text'); setMode('text');
stopVoiceCapture();
stopMedia(); stopMedia();
closeWs(); closeWs();
if (audioCtxRef.current) { if (audioCtxRef.current) {
@@ -1063,6 +1106,18 @@ export const DebugDrawer: React.FC<{
localStorage.setItem('debug_ws_url', wsUrl); localStorage.setItem('debug_ws_url', wsUrl);
}, [wsUrl]); }, [wsUrl]);
// Persist the 3A (AEC/NS/AGC) toggles so the debug drawer remembers them
// across reloads; '1'/'0' matches the initial-state reads from localStorage.
useEffect(() => {
localStorage.setItem('debug_audio_aec', aecEnabled ? '1' : '0');
}, [aecEnabled]);
useEffect(() => {
localStorage.setItem('debug_audio_ns', nsEnabled ? '1' : '0');
}, [nsEnabled]);
useEffect(() => {
localStorage.setItem('debug_audio_agc', agcEnabled ? '1' : '0');
}, [agcEnabled]);
// Auto-scroll logic // Auto-scroll logic
useEffect(() => { useEffect(() => {
if (scrollRef.current) { if (scrollRef.current) {
@@ -1072,10 +1127,11 @@ export const DebugDrawer: React.FC<{
// Fetch Devices // Fetch Devices
useEffect(() => { useEffect(() => {
if (isOpen && mode === 'video') { if (isOpen && (mode === 'video' || mode === 'voice')) {
const getDevices = async () => { const getDevices = async () => {
let permissionStream: MediaStream | null = null;
try { try {
await navigator.mediaDevices.getUserMedia({ audio: true, video: true }); permissionStream = await navigator.mediaDevices.getUserMedia({ audio: true, video: mode === 'video' });
const dev = await navigator.mediaDevices.enumerateDevices(); const dev = await navigator.mediaDevices.enumerateDevices();
setDevices(dev); setDevices(dev);
const cams = dev.filter(d => d.kind === 'videoinput'); const cams = dev.filter(d => d.kind === 'videoinput');
@@ -1084,6 +1140,8 @@ export const DebugDrawer: React.FC<{
if (mics.length > 0 && !selectedMic) setSelectedMic(mics[0].deviceId); if (mics.length > 0 && !selectedMic) setSelectedMic(mics[0].deviceId);
} catch (e) { } catch (e) {
console.error("Error enumerating devices", e); console.error("Error enumerating devices", e);
} finally {
permissionStream?.getTracks().forEach((track) => track.stop());
} }
}; };
getDevices(); getDevices();
@@ -1097,6 +1155,100 @@ export const DebugDrawer: React.FC<{
} }
}; };
/**
 * Tear down the microphone capture graph: detach the ScriptProcessor
 * callback, disconnect every node, close the capture AudioContext, clear the
 * echoed capture config, and release the underlying media stream.
 * Safe to call repeatedly or before capture ever started.
 */
const stopVoiceCapture = () => {
  // Disconnect a node, swallowing "already disconnected" errors.
  const detach = (node: AudioNode | null) => {
    if (!node) return;
    try {
      node.disconnect();
    } catch {
      // no-op
    }
  };
  if (micProcessorRef.current) {
    // Drop the callback first so no audio frame fires mid-teardown.
    micProcessorRef.current.onaudioprocess = null;
  }
  detach(micProcessorRef.current);
  micProcessorRef.current = null;
  detach(micSourceRef.current);
  micSourceRef.current = null;
  detach(micGainRef.current);
  micGainRef.current = null;
  if (micAudioCtxRef.current) {
    // Fire-and-forget: nothing depends on close() resolving.
    void micAudioCtxRef.current.close();
    micAudioCtxRef.current = null;
  }
  setCaptureConfigView('');
  stopMedia();
};
/**
 * Assemble getUserMedia audio constraints from the current device selection
 * and the 3A (AEC/NS/AGC) toggles: mono capture at the 16 kHz target rate.
 * No deviceId constraint is sent when no mic has been picked yet.
 */
const buildMicConstraints = (): MediaTrackConstraints => {
  const deviceId = selectedMic ? { exact: selectedMic } : undefined;
  return {
    deviceId,
    echoCancellation: aecEnabled,
    noiseSuppression: nsEnabled,
    autoGainControl: agcEnabled,
    channelCount: 1,
    sampleRate: TARGET_SAMPLE_RATE,
  };
};
/**
 * Start streaming microphone audio to the debug WebSocket as 16 kHz PCM16.
 *
 * Tears down any previous capture, acquires a fresh audio-only stream with
 * the current 3A constraints, echoes requested/applied/capability settings
 * into the capture-config panel, then wires:
 *   mic stream -> ScriptProcessor (4096 frames, mono) -> zero-gain node -> destination
 * The zero-gain sink keeps the processor pulled by the graph without any
 * audible playback.
 *
 * NOTE(review): ScriptProcessorNode is deprecated in favor of AudioWorklet;
 * acceptable for a debug drawer, but worth migrating eventually.
 */
const startVoiceCapture = async () => {
  stopVoiceCapture();
  const requestedConstraints = buildMicConstraints();
  const stream = await navigator.mediaDevices.getUserMedia({
    audio: requestedConstraints,
    video: false,
  });
  // Stored so stopMedia()/stopVoiceCapture() can release the tracks later.
  streamRef.current = stream;
  const track = stream.getAudioTracks()[0];
  if (track) {
    console.log('Voice capture settings', track.getSettings());
    // Echo what we asked for vs. what the browser actually applied.
    setCaptureConfigView(
      JSON.stringify(
        {
          requested: requestedConstraints,
          applied: track.getSettings(),
          // getCapabilities is not implemented in every browser.
          capabilities: typeof track.getCapabilities === 'function' ? track.getCapabilities() : undefined,
        },
        null,
        2
      )
    );
  }
  const ctx = new AudioContext();
  // Contexts created outside a user gesture may start suspended.
  if (ctx.state === 'suspended') {
    await ctx.resume();
  }
  micAudioCtxRef.current = ctx;
  const source = ctx.createMediaStreamSource(stream);
  const processor = ctx.createScriptProcessor(4096, 1, 1);
  const silentGain = ctx.createGain();
  silentGain.gain.value = 0;
  source.connect(processor);
  processor.connect(silentGain);
  silentGain.connect(ctx.destination);
  processor.onaudioprocess = (event) => {
    // Only forward audio once the socket is open and the session handshake
    // has completed (wsReadyRef).
    if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN || !wsReadyRef.current) return;
    const inChannel = event.inputBuffer.getChannelData(0);
    // The context runs at the hardware rate; resample to 16 kHz before send.
    const downsampled = downsampleTo16k(inChannel, event.inputBuffer.sampleRate);
    const pcm16 = float32ToPcm16(downsampled);
    wsRef.current.send(pcm16.buffer);
  };
  // Publish the nodes last so stopVoiceCapture never sees a half-built graph.
  micSourceRef.current = source;
  micProcessorRef.current = processor;
  micGainRef.current = silentGain;
};
const ensureAudioContext = async () => { const ensureAudioContext = async () => {
if (!audioCtxRef.current) { if (!audioCtxRef.current) {
audioCtxRef.current = new AudioContext(); audioCtxRef.current = new AudioContext();
@@ -1174,7 +1326,6 @@ export const DebugDrawer: React.FC<{
const playPcm16Chunk = async (pcmBuffer: ArrayBuffer) => { const playPcm16Chunk = async (pcmBuffer: ArrayBuffer) => {
if (!textTtsEnabled) return; if (!textTtsEnabled) return;
if (mode !== 'text') return;
const ctx = await ensureAudioContext(); const ctx = await ensureAudioContext();
const int16 = new Int16Array(pcmBuffer); const int16 = new Int16Array(pcmBuffer);
if (int16.length === 0) return; if (int16.length === 0) return;
@@ -1219,17 +1370,43 @@ export const DebugDrawer: React.FC<{
}, [mode, isOpen, selectedCamera, selectedMic, callStatus]); }, [mode, isOpen, selectedCamera, selectedMic, callStatus]);
const handleCall = () => { const handleCall = () => {
if (mode !== 'voice') {
setCallStatus('calling'); setCallStatus('calling');
setTimeout(() => { setTimeout(() => {
setCallStatus('active'); setCallStatus('active');
setMessages([{ role: 'model', text: assistant.opener || "Hello!" }]); setMessages([{ role: 'model', text: assistant.opener || "Hello!" }]);
}, 1500); }, 1500);
return;
}
const launchVoice = async () => {
try {
setCallStatus('calling');
setMessages([]);
setWsError('');
closeWs();
if (textTtsEnabled) await ensureAudioContext();
await ensureWsSession();
await startVoiceCapture();
setCallStatus('active');
setMessages([{ role: 'model', text: assistant.opener || 'Hello!' }]);
} catch (e) {
console.error(e);
stopVoiceCapture();
setCallStatus('idle');
setWsStatus('error');
setWsError((e as Error)?.message || 'Failed to start voice call');
}
};
void launchVoice();
}; };
const handleHangup = () => { const handleHangup = () => {
stopVoiceCapture();
stopMedia(); stopMedia();
closeWs();
setCallStatus('idle'); setCallStatus('idle');
setMessages([]); setMessages([]);
setIsLoading(false);
}; };
const handleSend = async () => { const handleSend = async () => {
@@ -1250,6 +1427,9 @@ export const DebugDrawer: React.FC<{
} }
stopPlaybackImmediately(); stopPlaybackImmediately();
wsRef.current?.send(JSON.stringify({ type: 'input.text', text: userMsg })); wsRef.current?.send(JSON.stringify({ type: 'input.text', text: userMsg }));
} else if (mode === 'voice') {
await ensureWsSession();
wsRef.current?.send(JSON.stringify({ type: 'input.text', text: userMsg }));
} else { } else {
setTimeout(() => { setTimeout(() => {
setMessages(prev => [...prev, { role: 'model', text: `[Mock Response]: Received "${userMsg}"` }]); setMessages(prev => [...prev, { role: 'model', text: `[Mock Response]: Received "${userMsg}"` }]);
@@ -1385,6 +1565,7 @@ export const DebugDrawer: React.FC<{
pendingResolveRef.current = null; pendingResolveRef.current = null;
pendingRejectRef.current = null; pendingRejectRef.current = null;
assistantDraftIndexRef.current = null; assistantDraftIndexRef.current = null;
userDraftIndexRef.current = null;
setTextSessionStarted(false); setTextSessionStarted(false);
stopPlaybackImmediately(); stopPlaybackImmediately();
if (isOpen) setWsStatus('disconnected'); if (isOpen) setWsStatus('disconnected');
@@ -1468,6 +1649,49 @@ export const DebugDrawer: React.FC<{
return; return;
} }
if (type === 'input.speech_started') {
setIsLoading(true);
return;
}
if (type === 'input.speech_stopped') {
setIsLoading(false);
return;
}
if (type === 'transcript.delta') {
const delta = String(payload.text || '');
if (!delta) return;
setMessages((prev) => {
const idx = userDraftIndexRef.current;
if (idx === null || !prev[idx] || prev[idx].role !== 'user') {
const next = [...prev, { role: 'user' as const, text: delta }];
userDraftIndexRef.current = next.length - 1;
return next;
}
const next = [...prev];
next[idx] = { ...next[idx], text: next[idx].text + delta };
return next;
});
return;
}
if (type === 'transcript.final') {
const finalText = String(payload.text || '');
setMessages((prev) => {
const idx = userDraftIndexRef.current;
userDraftIndexRef.current = null;
if (idx !== null && prev[idx] && prev[idx].role === 'user') {
const next = [...prev];
next[idx] = { ...next[idx], text: finalText || next[idx].text };
return next;
}
if (!finalText) return prev;
return [...prev, { role: 'user', text: finalText }];
});
return;
}
if (type === 'assistant.response.delta') { if (type === 'assistant.response.delta') {
const delta = String(payload.text || ''); const delta = String(payload.text || '');
if (!delta) return; if (!delta) return;
@@ -1540,6 +1764,7 @@ export const DebugDrawer: React.FC<{
ws.onclose = () => { ws.onclose = () => {
wsReadyRef.current = false; wsReadyRef.current = false;
setTextSessionStarted(false); setTextSessionStarted(false);
userDraftIndexRef.current = null;
stopPlaybackImmediately(); stopPlaybackImmediately();
if (wsStatus !== 'error') setWsStatus('disconnected'); if (wsStatus !== 'error') setWsStatus('disconnected');
}; };
@@ -1560,6 +1785,18 @@ export const DebugDrawer: React.FC<{
} }
}, [textTtsEnabled]); }, [textTtsEnabled]);
// Re-acquire the microphone whenever a 3A toggle or the selected mic changes
// mid-call so the new constraints take effect immediately.
// NOTE(review): this also fires when callStatus first flips to 'active',
// immediately restarting the capture that handleCall just started — a
// redundant getUserMedia round-trip; consider guarding on the previous
// settings. startVoiceCapture is omitted from the dep array (it is
// re-created every render) — presumably intentional, TODO confirm.
useEffect(() => {
if (!isOpen || mode !== 'voice' || callStatus !== 'active') return;
const restartCapture = async () => {
try {
await startVoiceCapture();
} catch (e) {
console.error('Failed to restart voice capture with new 3A settings', e);
}
};
void restartCapture();
}, [aecEnabled, nsEnabled, agcEnabled, selectedMic, mode, callStatus, isOpen]);
useEffect(() => { useEffect(() => {
if (!isOpen) return; if (!isOpen) return;
const localResolved = buildLocalResolvedRuntime(); const localResolved = buildLocalResolvedRuntime();
@@ -1611,6 +1848,35 @@ export const DebugDrawer: React.FC<{
TTS TTS
</label> </label>
</div> </div>
<div className="rounded-md border border-white/10 bg-black/20 p-2 space-y-2">
<p className="text-[10px] uppercase tracking-widest text-muted-foreground">Audio 3A</p>
<label className="inline-flex items-center gap-2 text-xs text-muted-foreground">
<input type="checkbox" checked={aecEnabled} onChange={(e) => setAecEnabled(e.target.checked)} className="accent-primary" />
Echo Cancellation (AEC)
</label>
<label className="inline-flex items-center gap-2 text-xs text-muted-foreground">
<input type="checkbox" checked={nsEnabled} onChange={(e) => setNsEnabled(e.target.checked)} className="accent-primary" />
Noise Suppression (NS)
</label>
<label className="inline-flex items-center gap-2 text-xs text-muted-foreground">
<input type="checkbox" checked={agcEnabled} onChange={(e) => setAgcEnabled(e.target.checked)} className="accent-primary" />
Auto Gain Control (AGC)
</label>
</div>
<div className="rounded-md border border-white/10 bg-black/30">
<button
className="w-full px-3 py-2 text-left text-xs text-muted-foreground hover:text-foreground flex items-center justify-between"
onClick={() => setCaptureConfigOpen((v) => !v)}
>
<span>Capture Config Echo</span>
<ChevronDown className={`h-3.5 w-3.5 transition-transform ${captureConfigOpen ? 'rotate-180' : ''}`} />
</button>
{captureConfigOpen && (
<pre className="px-3 pb-3 text-[11px] leading-5 text-cyan-100/90 whitespace-pre-wrap break-all max-h-64 overflow-auto">
{captureConfigView || 'Voice call not started yet.'}
</pre>
)}
</div>
{wsError && <p className="text-xs text-red-400">{wsError}</p>} {wsError && <p className="text-xs text-red-400">{wsError}</p>}
<div className="rounded-md border border-white/10 bg-black/30"> <div className="rounded-md border border-white/10 bg-black/30">