Enhance voice interaction features and introduce voice preview functionality

- Update README to reflect the integration of the DebugVoicePanel with WebSocket support for voice interactions.
- Refactor voice_webrtc.py to improve error handling during WebRTC signaling and include assistant_id in the offer payload.
- Add the useVoicePreview hook to manage microphone access and WebRTC connections for real-time voice previews.
- Modify AssistantPage to incorporate new visualizer options and pass assistantId to DebugVoicePanel, enhancing the user experience during audio interactions.
- Update the CredentialUpsert API model to include new fields for voice, speed, and language, supporting TTS and ASR configurations.
2026-06-10 10:17:46 +08:00
parent c839779d87
commit ac3f4dd806
5 changed files with 419 additions and 71 deletions
--- a/backend/README.md
+++ b/backend/README.md
@@ -100,5 +100,5 @@ docker compose --profile remote up -d
 - [ ] `pip install` 后跑通,核对 pipecat 版本的服务/transport 构造参数(代码内有注释)
 - [ ] 起本地 SenseVoice / CosyVoice 的 OpenAI 兼容服务
 - [ ] `realtime` 模式(目前只 `pipeline` 级联)
- [ ] 前端 `DebugVoicePanel` 接 `/ws/voice`(抄 dograh `useWebSocketRTC.tsx`)
+- [x] 前端 `DebugVoicePanel` 接 `/ws/voice`(参考 dograh `useWebSocketRTC.tsx`)
 - [ ] 加 DB 后:助手配置入库(目前随请求内联)
--- a/backend/routes/voice_webrtc.py
+++ b/backend/routes/voice_webrtc.py
@@ -2,9 +2,10 @@

 参考 dograh 的 webrtc_signaling.py,砍掉鉴权/配额/DB/org/ICE 过滤策略/TURN。
 握手消息:
-    client → {type:"offer",         payload:{pc_id, sdp, type, config}}
+    client → {type:"offer",         payload:{pc_id, sdp, type, assistant_id}}
    server → {type:"answer",        payload:{pc_id, sdp, type}}
    both   → {type:"ice-candidate", payload:{pc_id, candidate:{...}}}
+    server → {type:"error",         payload:{message}}
 """

 import asyncio
@@ -36,10 +37,22 @@ async def voice_signaling(websocket: WebSocket):
    try:
        while True:
            message = await websocket.receive_json()
-            if message.get("type") == "offer":
-                await _handle_offer(websocket, message.get("payload", {}), peers)
-            elif message.get("type") == "ice-candidate":
-                await _handle_ice(message.get("payload", {}), peers)
+            try:
+                if message.get("type") == "offer":
+                    await _handle_offer(websocket, message.get("payload", {}), peers)
+                elif message.get("type") == "ice-candidate":
+                    await _handle_ice(message.get("payload", {}), peers)
+            except Exception as e:
+                logger.exception(f"处理 WebRTC 信令消息失败: {e}")
+                if websocket.application_state == WebSocketState.CONNECTED:
+                    await websocket.send_json(
+                        {
+                            "type": "error",
+                            "payload": {
+                                "message": f"语音会话启动失败: {type(e).__name__}"
+                            },
+                        }
+                    )
    except WebSocketDisconnect:
        logger.info("WebRTC 信令断开")
    except Exception as e:
--- a/frontend/src/components/pages/AssistantPage.tsx
+++ b/frontend/src/components/pages/AssistantPage.tsx
@@ -57,6 +57,7 @@ import {
  PopoverTrigger,
 } from "@/components/ui/popover";
 import { AuraVisualizer } from "@/components/ui/aura-visualizer";
+import { NebulaVisualizer } from "@/components/ui/nebula-visualizer";
 import { SpectrumVisualizer } from "@/components/ui/spectrum-visualizer";
 import { WaveVisualizer } from "@/components/ui/wave-visualizer";
 import {
@@ -76,6 +77,7 @@ import {
  type Credential,
  type KnowledgeBase,
 } from "@/lib/api";
+import { useVoicePreview } from "@/hooks/use-voice-preview";

 type RuntimeMode = "pipeline" | "realtime";

@@ -425,7 +427,6 @@ export function AssistantPage() {
        appId: "",
        apiUrl: "",
        apiKey: "",
-        model: "",
        asr: "",
        voice: "",
        enableInterrupt: true,
@@ -455,6 +456,7 @@ export function AssistantPage() {
        prompt: "",
        apiUrl: "",
        apiKey: "",
+        model: "",
        asr: "",
        voice: "",
        enableInterrupt: true,
@@ -549,7 +551,6 @@ export function AssistantPage() {
      apiUrl: a.apiUrl,
      // 编辑时不把打码占位符放入输入框；空值写回后端表示保留旧 key
      apiKey: "",
-      model: a.llmCredentialId ?? "",
      asr: a.asrCredentialId ?? "",
      voice: a.ttsCredentialId ?? "",
      enableInterrupt: a.enableInterrupt,
@@ -607,6 +608,7 @@ export function AssistantPage() {
      apiUrl: a.apiUrl,
      // 编辑时不把打码占位符放入输入框；空值写回后端表示保留旧 key
      apiKey: "",
+      model: a.llmCredentialId ?? "",
      asr: a.asrCredentialId ?? "",
      voice: a.ttsCredentialId ?? "",
      enableInterrupt: a.enableInterrupt,
@@ -1229,7 +1231,7 @@ export function AssistantPage() {
            </SectionCard>
          </div>

-          <DebugDrawer />
+          <DebugDrawer assistantId={editingId} />
        </div>
      </div>
    );
@@ -1334,7 +1336,7 @@ export function AssistantPage() {
            </SectionCard>
          </div>

-          <DebugDrawer />
+          <DebugDrawer assistantId={editingId} />
        </div>
      </div>
    );
@@ -1453,7 +1455,7 @@ export function AssistantPage() {
            </SectionCard>
          </div>

-          <DebugDrawer />
+          <DebugDrawer assistantId={editingId} />
        </div>
      </div>
    );
@@ -1664,71 +1666,117 @@ export function AssistantPage() {
          </SectionCard>
        </div>

-        <DebugDrawer />
+        <DebugDrawer assistantId={editingId} />
      </div>
    </div>
  );
 }

-type VizStyle = "aura" | "bars" | "wave";
+type VizStyle = "aura" | "nebula" | "bars" | "wave";

-const VIZ_ORDER: VizStyle[] = ["aura", "bars", "wave"];
-const VIZ_LABEL: Record<VizStyle, string> = {
-  aura: "光环",
-  bars: "频谱",
-  wave: "波形",
-};
+const VIZ_OPTIONS: { style: VizStyle; label: string; icon: React.ReactNode }[] =
+  [
+    { style: "aura", label: "光环", icon: <Orbit size={14} /> },
+    { style: "nebula", label: "星云", icon: <Sparkles size={14} /> },
+    { style: "bars", label: "频谱", icon: <AudioLines size={14} /> },
+    { style: "wave", label: "波形", icon: <Waves size={14} /> },
+  ];

-function DebugDrawer() {
+function SegmentedIconGroup({
+  children,
+  label,
+}: {
+  children: React.ReactNode;
+  label: string;
+}) {
+  return (
+    <div
+      role="group"
+      aria-label={label}
+      className="flex items-center gap-0.5 rounded-full border border-hairline bg-canvas-soft p-0.5"
+    >
+      {children}
+    </div>
+  );
+}
+
+function SegmentedIconButton({
+  selected,
+  label,
+  onClick,
+  children,
+}: {
+  selected: boolean;
+  label: string;
+  onClick: () => void;
+  children: React.ReactNode;
+}) {
+  return (
+    <button
+      type="button"
+      onClick={onClick}
+      aria-label={label}
+      aria-pressed={selected}
+      title={label}
+      className={[
+        "flex h-7 w-7 items-center justify-center rounded-full transition-colors",
+        selected
+          ? "bg-surface-strong text-foreground shadow-sm"
+          : "text-muted-soft hover:text-foreground",
+      ].join(" ")}
+    >
+      {children}
+    </button>
+  );
+}
+
+function DebugDrawer({ assistantId }: { assistantId: string | null }) {
  const [showTranscript, setShowTranscript] = useState(false);
-  const [vizStyle, setVizStyle] = useState<VizStyle>("wave");
+  const [vizStyle, setVizStyle] = useState<VizStyle>("aura");

  return (
    <aside className="hidden min-w-0 flex-1 flex-col overflow-hidden rounded-2xl border border-hairline bg-card shadow-sm lg:flex">
-      <div className="flex shrink-0 items-center justify-between gap-3 border-b border-hairline px-5 py-4">
+      <div className="flex shrink-0 items-center justify-between gap-3 border-b border-hairline px-5 py-3">
        <div className="text-sm font-medium text-foreground">调试与预览</div>
        <div className="flex items-center gap-2">
          {!showTranscript && (
-            <Button
-              type="button"
-              variant="outline"
-              size="icon"
-              className="h-8 w-8 rounded-full"
-              onClick={() =>
-                setVizStyle(
-                  (value) =>
-                    VIZ_ORDER[
-                      (VIZ_ORDER.indexOf(value) + 1) % VIZ_ORDER.length
-                    ],
-                )
-              }
-              aria-label={`切换可视化样式（当前：${VIZ_LABEL[vizStyle]}）`}
-              title={`可视化：${VIZ_LABEL[vizStyle]}`}
-            >
-              {vizStyle === "aura" ? (
-                <Orbit size={16} />
-              ) : vizStyle === "bars" ? (
-                <AudioLines size={16} />
-              ) : (
-                <Waves size={16} />
-              )}
-            </Button>
+            <SegmentedIconGroup label="可视化样式">
+              {VIZ_OPTIONS.map((option) => (
+                <SegmentedIconButton
+                  key={option.style}
+                  selected={vizStyle === option.style}
+                  label={`可视化样式：${option.label}`}
+                  onClick={() => setVizStyle(option.style)}
+                >
+                  {option.icon}
+                </SegmentedIconButton>
+              ))}
+            </SegmentedIconGroup>
          )}
-          <Button
-            type="button"
-            variant={showTranscript ? "default" : "outline"}
-            size="icon"
-            className="h-8 w-8 rounded-full text-xs font-medium"
-            onClick={() => setShowTranscript((value) => !value)}
-            aria-label={showTranscript ? "显示音频可视化" : "显示文字聊天记录"}
-            aria-pressed={showTranscript}
-          >
-            文
-          </Button>
+          <SegmentedIconGroup label="预览视图">
+            <SegmentedIconButton
+              selected={!showTranscript}
+              label="语音可视化视图"
+              onClick={() => setShowTranscript(false)}
+            >
+              <Mic size={14} />
+            </SegmentedIconButton>
+            <SegmentedIconButton
+              selected={showTranscript}
+              label="文字聊天记录视图"
+              onClick={() => setShowTranscript(true)}
+            >
+              <MessageSquareText size={14} />
+            </SegmentedIconButton>
+          </SegmentedIconGroup>
        </div>
      </div>

-      <DebugVoicePanel showTranscript={showTranscript} vizStyle={vizStyle} />
+      <DebugVoicePanel
+        showTranscript={showTranscript}
+        vizStyle={vizStyle}
+        assistantId={assistantId}
+      />
    </aside>
  );
 }
@@ -1736,15 +1784,22 @@ function DebugDrawer() {
 function DebugVoicePanel({
  showTranscript,
  vizStyle,
+  assistantId,
 }: {
  showTranscript: boolean;
  vizStyle: VizStyle;
+  assistantId: string | null;
 }) {
-  const [recording, setRecording] = useState(false);
  const [micError, setMicError] = useState(false);
+  const { status, error, localStream, connect, disconnect, audioRef } =
+    useVoicePreview(assistantId, { onMicError: () => setMicError(true) });
+  // 连接中或已连通都视作"会话进行中"
+  const recording = status === "connecting" || status === "connected";

  return (
    <div className="flex min-h-0 flex-1 flex-col">
+      {/* 后端 TTS 音频经 WebRTC 媒体流过来,挂这里播放 */}
+      <audio ref={audioRef} autoPlay playsInline className="hidden" />
      {showTranscript ? (
        <DebugTranscriptPanel />
      ) : (
@@ -1774,40 +1829,55 @@ function DebugVoicePanel({
            {(() => {
              const onVizError = () => {
                setMicError(true);
-                setRecording(false);
+                disconnect();
              };
              const shared = {
-                active: recording,
+                active: Boolean(localStream),
+                stream: localStream,
                className: "relative shrink-0",
                onError: onVizError,
              } as const;
              if (vizStyle === "aura")
                return <AuraVisualizer {...shared} size={200} />;
+              if (vizStyle === "nebula")
+                return <NebulaVisualizer {...shared} size={200} />;
              if (vizStyle === "bars")
-                return (
-                  <SpectrumVisualizer {...shared} size={200} barCount={64} />
-                );
+                return <SpectrumVisualizer {...shared} size={200} />;
              return <WaveVisualizer {...shared} size={200} />;
            })()}
          </div>

          <div className="relative max-w-xs space-y-1.5">
            <div className="font-display display-sm text-foreground">
-              {recording ? "我在聆听" : "开始一次语音对话"}
+              {status === "connecting"
+                ? "连接中…"
+                : status === "connected"
+                  ? "我在聆听"
+                  : "开始一次语音对话"}
            </div>
            <p className="mx-auto text-xs leading-5 text-muted-foreground">
              {micError
                ? "无法访问麦克风，请检查浏览器权限后重试。"
-                : recording
-                  ? "直接说话即可。助手会在您停顿后自然回应。"
-                  : "测试语音识别、响应速度与助手的播报效果。"}
+                : status === "failed"
+                  ? error ||
+                    "连接失败，请确认后端已启动且助手已保存后重试。"
+                  : !assistantId
+                    ? "请先保存助手，再开始语音预览。"
+                    : recording
+                      ? "直接说话即可。助手会在您停顿后自然回应。"
+                      : "测试语音识别、响应速度与助手的播报效果。"}
            </p>
          </div>

          <Button
+            disabled={!assistantId || status === "connecting"}
            onClick={() => {
              setMicError(false);
-              setRecording((value) => !value);
+              if (recording) {
+                disconnect();
+              } else {
+                void connect();
+              }
            }}
            className={[
              "relative h-11 gap-2 rounded-full px-6 text-sm font-medium shadow-sm transition-transform hover:scale-[1.03]",
@@ -1817,7 +1887,13 @@ function DebugVoicePanel({
            ].join(" ")}
            aria-label={recording ? "结束语音测试" : "开始语音测试"}
          >
-            {recording ? <PhoneOff size={18} /> : <Mic size={18} />}
+            {status === "connecting" ? (
+              <Loader2 size={18} className="animate-spin" />
+            ) : recording ? (
+              <PhoneOff size={18} />
+            ) : (
+              <Mic size={18} />
+            )}
            {recording ? "结束对话" : "开始对话"}
          </Button>
        </div>
--- a/frontend/src/hooks/use-voice-preview.ts
+++ b/frontend/src/hooks/use-voice-preview.ts
@@ -0,0 +1,256 @@
+"use client";
+
+/**
+ * 语音预览:把麦克风接到后端 /ws/voice(WebRTC 信令),听到助手实时回应。
+ *
+ * 走原生 RTCPeerConnection + 一条 ws 信令通道,与后端 voice_webrtc.py 的约定对齐:
+ *   client → {type:"offer",         payload:{pc_id, sdp, type, assistant_id}}
+ *   server → {type:"answer",        payload:{pc_id, sdp, type}}
+ *   client → {type:"ice-candidate", payload:{pc_id, candidate:{...}}}
+ * 音频本身走 WebRTC 媒体流(Opus),不经 ws;后端 TTS 帧从 ontrack 拿到直接播放。
+ *
+ * 纯本机(localhost)即可跑:localhost 是 secure context,麦克风可用,ws 用明文。
+ * 局域网/别的设备要 https+wss,见 deploy/README.md。
+ */
+
+import { useCallback, useEffect, useRef, useState } from "react";
+
+import { API_BASE } from "@/lib/api";
+
+export type VoicePreviewStatus = "idle" | "connecting" | "connected" | "failed";
+
+// http→ws、https→wss,自动跟随 API 基址(同源反代时也对)
+function wsBaseUrl(): string {
+  const url = new URL(API_BASE, window.location.origin);
+  url.protocol = url.protocol === "https:" ? "wss:" : "ws:";
+  return url.toString().replace(/\/$/, "");
+}
+
+function generatePcId(): string {
+  const bytes = new Uint8Array(16);
+  crypto.getRandomValues(bytes);
+  return (
+    "PC-" +
+    Array.from(bytes)
+      .map((b) => b.toString(16).padStart(2, "0"))
+      .join("")
+  );
+}
+
+type UseVoicePreviewOptions = {
+  /** 取麦克风失败(权限/无设备)时回调,供 UI 提示。 */
+  onMicError?: () => void;
+};
+
+function errorMessage(error: unknown, fallback: string): string {
+  if (error instanceof Error && error.message) return error.message;
+  return fallback;
+}
+
+export function useVoicePreview(
+  assistantId: string | null,
+  { onMicError }: UseVoicePreviewOptions = {},
+) {
+  const [status, setStatus] = useState<VoicePreviewStatus>("idle");
+  const [error, setError] = useState<string | null>(null);
+  const [localStream, setLocalStream] = useState<MediaStream | null>(null);
+  const audioRef = useRef<HTMLAudioElement | null>(null);
+  const pcRef = useRef<RTCPeerConnection | null>(null);
+  const wsRef = useRef<WebSocket | null>(null);
+  const localStreamRef = useRef<MediaStream | null>(null);
+  const startingRef = useRef(false);
+
+  const releaseResources = useCallback(() => {
+    const ws = wsRef.current;
+    wsRef.current = null;
+    if (ws) {
+      ws.onclose = null;
+      ws.onerror = null;
+      ws.onmessage = null;
+      ws.close();
+    }
+
+    const pc = pcRef.current;
+    pcRef.current = null;
+    if (pc) {
+      pc.onconnectionstatechange = null;
+      pc.onicecandidate = null;
+      pc.oniceconnectionstatechange = null;
+      pc.ontrack = null;
+      pc.close();
+    }
+
+    localStreamRef.current?.getTracks().forEach((track) => track.stop());
+    localStreamRef.current = null;
+    if (audioRef.current) audioRef.current.srcObject = null;
+    startingRef.current = false;
+  }, []);
+
+  const disconnect = useCallback(() => {
+    releaseResources();
+    setLocalStream(null);
+    setError(null);
+    setStatus("idle");
+  }, [releaseResources]);
+
+  const fail = useCallback(
+    (message: string) => {
+      releaseResources();
+      setLocalStream(null);
+      setError(message);
+      setStatus("failed");
+    },
+    [releaseResources],
+  );
+
+  const connect = useCallback(async () => {
+    if (startingRef.current || pcRef.current || wsRef.current) return;
+    if (!assistantId) {
+      setError("请先保存助手，再开始语音预览。");
+      setStatus("failed");
+      return;
+    }
+
+    startingRef.current = true;
+    setError(null);
+    setStatus("connecting");
+
+    const pcId = generatePcId();
+    const ws = new WebSocket(`${wsBaseUrl()}/ws/voice`);
+    wsRef.current = ws;
+
+    ws.onmessage = async (event) => {
+      try {
+        const msg = JSON.parse(event.data);
+        if (msg.type === "answer") {
+          await pcRef.current?.setRemoteDescription({
+            type: "answer",
+            sdp: msg.payload.sdp,
+          });
+        } else if (msg.type === "ice-candidate" && msg.payload?.candidate) {
+          // 后端当前不主动 trickle,留兼容
+          try {
+            await pcRef.current?.addIceCandidate(msg.payload.candidate);
+          } catch {
+            /* 忽略迟到/重复 candidate */
+          }
+        } else if (msg.type === "error") {
+          fail(msg.payload?.message || "后端无法启动语音会话。");
+        }
+      } catch {
+        /* 非 JSON / 未知消息,忽略 */
+      }
+    };
+
+    try {
+      // 1) 等 ws 连上
+      await new Promise<void>((resolve, reject) => {
+        ws.onopen = () => resolve();
+        ws.onerror = (e) => reject(e);
+        ws.onclose = () => reject(new Error("语音信令连接已关闭。"));
+      });
+      // 连上后,信令异常或关闭都结束当前会话并保留失败状态。
+      ws.onerror = () => {
+        if (wsRef.current === ws) fail("语音信令连接失败。");
+      };
+      ws.onclose = () => {
+        if (wsRef.current === ws) fail("语音信令连接已断开。");
+      };
+
+      // 2) 建 PeerConnection(纯 STUN,本机/局域网够用)
+      const pc = new RTCPeerConnection({
+        iceServers: [{ urls: "stun:stun.l.google.com:19302" }],
+      });
+      pcRef.current = pc;
+
+      pc.onicecandidate = (e) => {
+        if (ws.readyState !== WebSocket.OPEN) return;
+        ws.send(
+          JSON.stringify({
+            type: "ice-candidate",
+            payload: {
+              pc_id: pcId,
+              candidate: e.candidate
+                ? {
+                    candidate: e.candidate.candidate,
+                    sdpMid: e.candidate.sdpMid,
+                    sdpMLineIndex: e.candidate.sdpMLineIndex,
+                  }
+                : null,
+            },
+          }),
+        );
+      };
+
+      pc.ontrack = (e) => {
+        if (e.track.kind === "audio" && audioRef.current) {
+          audioRef.current.srcObject =
+            e.streams[0] ?? new MediaStream([e.track]);
+          void audioRef.current.play().catch(() => {});
+        }
+      };
+
+      pc.onconnectionstatechange = () => {
+        if (pcRef.current !== pc) return;
+        if (pc.connectionState === "connected") setStatus("connected");
+        else if (pc.connectionState === "failed")
+          fail("WebRTC 音频连接失败。");
+      };
+
+      pc.oniceconnectionstatechange = () => {
+        if (pcRef.current !== pc) return;
+        const st = pc.iceConnectionState;
+        if (st === "connected" || st === "completed") setStatus("connected");
+        else if (st === "failed") fail("WebRTC 音频连接失败。");
+        else if (st === "disconnected") fail("WebRTC 音频连接已断开。");
+      };
+
+      // 3) 取麦克风 → 加入连接
+      let stream: MediaStream;
+      try {
+        stream = await navigator.mediaDevices.getUserMedia({
+          audio: {
+            echoCancellation: true,
+            noiseSuppression: true,
+            autoGainControl: true,
+          },
+        });
+      } catch (mediaError) {
+        onMicError?.();
+        fail(errorMessage(mediaError, "无法访问麦克风。"));
+        return;
+      }
+      localStreamRef.current = stream;
+      setLocalStream(stream);
+      stream.getTracks().forEach((track) => pc.addTrack(track, stream));
+
+      // 4) 生成 offer 并发给后端(assistant_id 在 payload 顶层)
+      const offer = await pc.createOffer();
+      await pc.setLocalDescription(offer);
+      const localDescription = pc.localDescription;
+      if (!localDescription?.sdp) {
+        throw new Error("浏览器无法创建 WebRTC offer。");
+      }
+      ws.send(
+        JSON.stringify({
+          type: "offer",
+          payload: {
+            pc_id: pcId,
+            sdp: localDescription.sdp,
+            type: localDescription.type,
+            assistant_id: assistantId,
+          },
+        }),
+      );
+    } catch (connectionError) {
+      fail(errorMessage(connectionError, "无法连接语音服务。"));
+    } finally {
+      startingRef.current = false;
+    }
+  }, [assistantId, fail, onMicError]);
+
+  // 卸载时收尾
+  useEffect(() => releaseResources, [releaseResources]);
+
+  return { status, error, localStream, connect, disconnect, audioRef };
+}
--- a/frontend/src/lib/api.ts
+++ b/frontend/src/lib/api.ts
@@ -5,7 +5,7 @@
 * 注意:api_key 读取时后端永远打码,写回打码占位符表示"不改 key"(写时哨兵)。
 */

-const API_BASE =
+export const API_BASE =
  process.env.NEXT_PUBLIC_API_BASE_URL ?? "http://localhost:8000";

 export type ModelType = "LLM" | "ASR" | "TTS" | "Realtime" | "Embedding";
@@ -34,6 +34,9 @@ export type CredentialUpsert = {
  interfaceType: InterfaceType;
  apiUrl: string;
  apiKey: string;
+  voice: string;
+  speed: number;
+  language: string;
  isDefault: boolean;
 };