Enhance voice interaction features and introduce voice preview functionality

- Update README to reflect the integration of the DebugVoicePanel with WebSocket support for voice interactions.
- Refactor voice_webrtc.py to improve error handling during WebRTC signaling and include assistant_id in the offer payload.
- Add useVoicePreview hook to manage microphone access and WebRTC connections for real-time voice previews.
- Modify AssistantPage to incorporate new visualizer options and pass assistantId to DebugVoicePanel, enhancing user experience during audio interactions.
- Update API model to include new fields for voice, speed, and language, supporting TTS and ASR configurations.
This commit is contained in:
Xin Wang
2026-06-10 10:17:46 +08:00
parent c839779d87
commit ac3f4dd806
5 changed files with 419 additions and 71 deletions

View File

@@ -100,5 +100,5 @@ docker compose --profile remote up -d
- [ ] `pip install` 后跑通,核对 pipecat 版本的服务/transport 构造参数(代码内有注释)
- [ ] 起本地 SenseVoice / CosyVoice 的 OpenAI 兼容服务
- [ ] `realtime` 模式(目前只 `pipeline` 级联)
- [ ] 前端 `DebugVoicePanel` → `/ws/voice`(参考 dograh `useWebSocketRTC.tsx`)
- [x] 前端 `DebugVoicePanel` → `/ws/voice`(参考 dograh `useWebSocketRTC.tsx`)
- [ ] 加 DB 后:助手配置入库(目前随请求内联)

View File

@@ -2,9 +2,10 @@
参考 dograh 的 webrtc_signaling.py,砍掉鉴权/配额/DB/org/ICE 过滤策略/TURN。
握手消息:
client → {type:"offer", payload:{pc_id, sdp, type, config}}
client → {type:"offer", payload:{pc_id, sdp, type, assistant_id}}
server → {type:"answer", payload:{pc_id, sdp, type}}
both → {type:"ice-candidate", payload:{pc_id, candidate:{...}}}
server → {type:"error", payload:{message}}
"""
import asyncio
@@ -36,10 +37,22 @@ async def voice_signaling(websocket: WebSocket):
try:
while True:
message = await websocket.receive_json()
if message.get("type") == "offer":
await _handle_offer(websocket, message.get("payload", {}), peers)
elif message.get("type") == "ice-candidate":
await _handle_ice(message.get("payload", {}), peers)
try:
if message.get("type") == "offer":
await _handle_offer(websocket, message.get("payload", {}), peers)
elif message.get("type") == "ice-candidate":
await _handle_ice(message.get("payload", {}), peers)
except Exception as e:
logger.exception(f"处理 WebRTC 信令消息失败: {e}")
if websocket.application_state == WebSocketState.CONNECTED:
await websocket.send_json(
{
"type": "error",
"payload": {
"message": f"语音会话启动失败: {type(e).__name__}"
},
}
)
except WebSocketDisconnect:
logger.info("WebRTC 信令断开")
except Exception as e:

View File

@@ -57,6 +57,7 @@ import {
PopoverTrigger,
} from "@/components/ui/popover";
import { AuraVisualizer } from "@/components/ui/aura-visualizer";
import { NebulaVisualizer } from "@/components/ui/nebula-visualizer";
import { SpectrumVisualizer } from "@/components/ui/spectrum-visualizer";
import { WaveVisualizer } from "@/components/ui/wave-visualizer";
import {
@@ -76,6 +77,7 @@ import {
type Credential,
type KnowledgeBase,
} from "@/lib/api";
import { useVoicePreview } from "@/hooks/use-voice-preview";
type RuntimeMode = "pipeline" | "realtime";
@@ -425,7 +427,6 @@ export function AssistantPage() {
appId: "",
apiUrl: "",
apiKey: "",
model: "",
asr: "",
voice: "",
enableInterrupt: true,
@@ -455,6 +456,7 @@ export function AssistantPage() {
prompt: "",
apiUrl: "",
apiKey: "",
model: "",
asr: "",
voice: "",
enableInterrupt: true,
@@ -549,7 +551,6 @@ export function AssistantPage() {
apiUrl: a.apiUrl,
// 编辑时不把打码占位符放入输入框;空值写回后端表示保留旧 key
apiKey: "",
model: a.llmCredentialId ?? "",
asr: a.asrCredentialId ?? "",
voice: a.ttsCredentialId ?? "",
enableInterrupt: a.enableInterrupt,
@@ -607,6 +608,7 @@ export function AssistantPage() {
apiUrl: a.apiUrl,
// 编辑时不把打码占位符放入输入框;空值写回后端表示保留旧 key
apiKey: "",
model: a.llmCredentialId ?? "",
asr: a.asrCredentialId ?? "",
voice: a.ttsCredentialId ?? "",
enableInterrupt: a.enableInterrupt,
@@ -1229,7 +1231,7 @@ export function AssistantPage() {
</SectionCard>
</div>
<DebugDrawer />
<DebugDrawer assistantId={editingId} />
</div>
</div>
);
@@ -1334,7 +1336,7 @@ export function AssistantPage() {
</SectionCard>
</div>
<DebugDrawer />
<DebugDrawer assistantId={editingId} />
</div>
</div>
);
@@ -1453,7 +1455,7 @@ export function AssistantPage() {
</SectionCard>
</div>
<DebugDrawer />
<DebugDrawer assistantId={editingId} />
</div>
</div>
);
@@ -1664,71 +1666,117 @@ export function AssistantPage() {
</SectionCard>
</div>
<DebugDrawer />
<DebugDrawer assistantId={editingId} />
</div>
</div>
);
}
type VizStyle = "aura" | "bars" | "wave";
type VizStyle = "aura" | "nebula" | "bars" | "wave";
const VIZ_ORDER: VizStyle[] = ["aura", "bars", "wave"];
const VIZ_LABEL: Record<VizStyle, string> = {
aura: "光环",
bars: "频谱",
wave: "波形",
};
const VIZ_OPTIONS: { style: VizStyle; label: string; icon: React.ReactNode }[] =
[
{ style: "aura", label: "光环", icon: <Orbit size={14} /> },
{ style: "nebula", label: "星云", icon: <Sparkles size={14} /> },
{ style: "bars", label: "频谱", icon: <AudioLines size={14} /> },
{ style: "wave", label: "波形", icon: <Waves size={14} /> },
];
function DebugDrawer() {
/**
 * Pill-shaped wrapper that groups a row of segmented icon buttons.
 *
 * Renders a `role="group"` container whose accessible name is `label`;
 * `children` are rendered inside it unchanged.
 */
function SegmentedIconGroup(props: {
  children: React.ReactNode;
  label: string;
}) {
  const { children, label } = props;
  const groupClassName =
    "flex items-center gap-0.5 rounded-full border border-hairline bg-canvas-soft p-0.5";

  return (
    <div role="group" aria-label={label} className={groupClassName}>
      {children}
    </div>
  );
}
/**
 * One round icon button inside a segmented group.
 *
 * `label` is used for both `aria-label` and the tooltip (`title`);
 * `selected` drives `aria-pressed` and the highlighted appearance;
 * `onClick` fires when the button is activated.
 */
function SegmentedIconButton(props: {
  selected: boolean;
  label: string;
  onClick: () => void;
  children: React.ReactNode;
}) {
  const { selected, label, onClick, children } = props;

  const baseClasses =
    "flex h-7 w-7 items-center justify-center rounded-full transition-colors";
  // Selected buttons get a raised surface; the rest stay muted until hovered.
  let stateClasses = "text-muted-soft hover:text-foreground";
  if (selected) {
    stateClasses = "bg-surface-strong text-foreground shadow-sm";
  }

  return (
    <button
      type="button"
      title={label}
      aria-label={label}
      aria-pressed={selected}
      className={`${baseClasses} ${stateClasses}`}
      onClick={onClick}
    >
      {children}
    </button>
  );
}
function DebugDrawer({ assistantId }: { assistantId: string | null }) {
const [showTranscript, setShowTranscript] = useState(false);
const [vizStyle, setVizStyle] = useState<VizStyle>("wave");
const [vizStyle, setVizStyle] = useState<VizStyle>("aura");
return (
<aside className="hidden min-w-0 flex-1 flex-col overflow-hidden rounded-2xl border border-hairline bg-card shadow-sm lg:flex">
<div className="flex shrink-0 items-center justify-between gap-3 border-b border-hairline px-5 py-4">
<div className="flex shrink-0 items-center justify-between gap-3 border-b border-hairline px-5 py-3">
<div className="text-sm font-medium text-foreground"></div>
<div className="flex items-center gap-2">
{!showTranscript && (
<Button
type="button"
variant="outline"
size="icon"
className="h-8 w-8 rounded-full"
onClick={() =>
setVizStyle(
(value) =>
VIZ_ORDER[
(VIZ_ORDER.indexOf(value) + 1) % VIZ_ORDER.length
],
)
}
aria-label={`切换可视化样式(当前:${VIZ_LABEL[vizStyle]}`}
title={`可视化:${VIZ_LABEL[vizStyle]}`}
>
{vizStyle === "aura" ? (
<Orbit size={16} />
) : vizStyle === "bars" ? (
<AudioLines size={16} />
) : (
<Waves size={16} />
)}
</Button>
<SegmentedIconGroup label="可视化样式">
{VIZ_OPTIONS.map((option) => (
<SegmentedIconButton
key={option.style}
selected={vizStyle === option.style}
label={`可视化样式:${option.label}`}
onClick={() => setVizStyle(option.style)}
>
{option.icon}
</SegmentedIconButton>
))}
</SegmentedIconGroup>
)}
<Button
type="button"
variant={showTranscript ? "default" : "outline"}
size="icon"
className="h-8 w-8 rounded-full text-xs font-medium"
onClick={() => setShowTranscript((value) => !value)}
aria-label={showTranscript ? "显示音频可视化" : "显示文字聊天记录"}
aria-pressed={showTranscript}
>
</Button>
<SegmentedIconGroup label="预览视图">
<SegmentedIconButton
selected={!showTranscript}
label="语音可视化视图"
onClick={() => setShowTranscript(false)}
>
<Mic size={14} />
</SegmentedIconButton>
<SegmentedIconButton
selected={showTranscript}
label="文字聊天记录视图"
onClick={() => setShowTranscript(true)}
>
<MessageSquareText size={14} />
</SegmentedIconButton>
</SegmentedIconGroup>
</div>
</div>
<DebugVoicePanel showTranscript={showTranscript} vizStyle={vizStyle} />
<DebugVoicePanel
showTranscript={showTranscript}
vizStyle={vizStyle}
assistantId={assistantId}
/>
</aside>
);
}
@@ -1736,15 +1784,22 @@ function DebugDrawer() {
function DebugVoicePanel({
showTranscript,
vizStyle,
assistantId,
}: {
showTranscript: boolean;
vizStyle: VizStyle;
assistantId: string | null;
}) {
const [recording, setRecording] = useState(false);
const [micError, setMicError] = useState(false);
const { status, error, localStream, connect, disconnect, audioRef } =
useVoicePreview(assistantId, { onMicError: () => setMicError(true) });
// 连接中或已连通都视作"会话进行中"
const recording = status === "connecting" || status === "connected";
return (
<div className="flex min-h-0 flex-1 flex-col">
{/* 后端 TTS 音频经 WebRTC 媒体流过来,挂这里播放 */}
<audio ref={audioRef} autoPlay playsInline className="hidden" />
{showTranscript ? (
<DebugTranscriptPanel />
) : (
@@ -1774,40 +1829,55 @@ function DebugVoicePanel({
{(() => {
const onVizError = () => {
setMicError(true);
setRecording(false);
disconnect();
};
const shared = {
active: recording,
active: Boolean(localStream),
stream: localStream,
className: "relative shrink-0",
onError: onVizError,
} as const;
if (vizStyle === "aura")
return <AuraVisualizer {...shared} size={200} />;
if (vizStyle === "nebula")
return <NebulaVisualizer {...shared} size={200} />;
if (vizStyle === "bars")
return (
<SpectrumVisualizer {...shared} size={200} barCount={64} />
);
return <SpectrumVisualizer {...shared} size={200} />;
return <WaveVisualizer {...shared} size={200} />;
})()}
</div>
<div className="relative max-w-xs space-y-1.5">
<div className="font-display display-sm text-foreground">
{recording ? "我在聆听" : "开始一次语音对话"}
{status === "connecting"
? "连接中…"
: status === "connected"
? "我在聆听"
: "开始一次语音对话"}
</div>
<p className="mx-auto text-xs leading-5 text-muted-foreground">
{micError
? "无法访问麦克风,请检查浏览器权限后重试。"
: recording
? "直接说话即可。助手会在您停顿后自然回应。"
: "测试语音识别、响应速度与助手的播报效果。"}
: status === "failed"
? error ||
"连接失败,请确认后端已启动且助手已保存后重试。"
: !assistantId
? "请先保存助手,再开始语音预览。"
: recording
? "直接说话即可。助手会在您停顿后自然回应。"
: "测试语音识别、响应速度与助手的播报效果。"}
</p>
</div>
<Button
disabled={!assistantId || status === "connecting"}
onClick={() => {
setMicError(false);
setRecording((value) => !value);
if (recording) {
disconnect();
} else {
void connect();
}
}}
className={[
"relative h-11 gap-2 rounded-full px-6 text-sm font-medium shadow-sm transition-transform hover:scale-[1.03]",
@@ -1817,7 +1887,13 @@ function DebugVoicePanel({
].join(" ")}
aria-label={recording ? "结束语音测试" : "开始语音测试"}
>
{recording ? <PhoneOff size={18} /> : <Mic size={18} />}
{status === "connecting" ? (
<Loader2 size={18} className="animate-spin" />
) : recording ? (
<PhoneOff size={18} />
) : (
<Mic size={18} />
)}
{recording ? "结束对话" : "开始对话"}
</Button>
</div>

View File

@@ -0,0 +1,256 @@
"use client";
/**
* 语音预览:把麦克风接到后端 /ws/voice(WebRTC 信令),听到助手实时回应。
*
* 走原生 RTCPeerConnection + 一条 ws 信令通道,与后端 voice_webrtc.py 的约定对齐:
* client → {type:"offer", payload:{pc_id, sdp, type, assistant_id}}
* server → {type:"answer", payload:{pc_id, sdp, type}}
* client → {type:"ice-candidate", payload:{pc_id, candidate:{...}}}
* 音频本身走 WebRTC 媒体流(Opus),不经 ws;后端 TTS 帧从 ontrack 拿到直接播放。
*
* 纯本机(localhost)即可跑:localhost 是 secure context,麦克风可用,ws 用明文。
* 局域网/别的设备要 https+wss,见 deploy/README.md。
*/
import { useCallback, useEffect, useRef, useState } from "react";
import { API_BASE } from "@/lib/api";
/** Connection status values exposed by the `useVoicePreview` hook. */
export type VoicePreviewStatus = "idle" | "connecting" | "connected" | "failed";
/**
 * Derives the WebSocket base URL from the API base URL.
 *
 * `API_BASE` is resolved against the current page origin, then the scheme is
 * switched: `https:` becomes `wss:`, anything else becomes `ws:`. A single
 * trailing slash is dropped so callers can append a path directly.
 */
function wsBaseUrl(): string {
  const base = new URL(API_BASE, window.location.origin);
  const isSecure = base.protocol === "https:";
  base.protocol = isSecure ? "wss:" : "ws:";

  const href = base.toString();
  return href.endsWith("/") ? href.slice(0, -1) : href;
}
/**
 * Generates a random peer-connection id: the prefix `PC-` followed by
 * 16 random bytes rendered as 32 lowercase hex characters.
 */
function generatePcId(): string {
  const randomBytes = crypto.getRandomValues(new Uint8Array(16));
  let hex = "";
  for (const byte of randomBytes) {
    // Two hex digits per byte, zero-padded so the id has a fixed length.
    hex += byte.toString(16).padStart(2, "0");
  }
  return `PC-${hex}`;
}
/** Optional callbacks accepted by `useVoicePreview`. */
type UseVoicePreviewOptions = {
/** Called when acquiring the microphone fails (permission denied / no device) so the UI can show a hint. */
onMicError?: () => void;
};
/**
 * Picks a human-readable message for a caught value.
 *
 * @param error - The caught value; only `Error` instances are inspected.
 * @param fallback - Text returned when `error` is not an `Error` or its
 *   message is empty.
 * @returns The error's own message when present, otherwise `fallback`.
 */
function errorMessage(error: unknown, fallback: string): string {
  if (!(error instanceof Error)) {
    return fallback;
  }
  return error.message ? error.message : fallback;
}
/**
 * React hook that runs a live voice preview against the backend `/ws/voice`
 * signaling endpoint.
 *
 * `connect()` opens the signaling WebSocket, creates an `RTCPeerConnection`,
 * captures the microphone, and sends an SDP offer carrying `assistant_id`.
 * Remote audio tracks are attached to the element behind `audioRef`.
 * `disconnect()` tears everything down and returns to `"idle"`.
 *
 * Every step that resumes after an `await` first checks that the session it
 * belongs to is still the active one. A session torn down in the meantime
 * (unmount, `disconnect()`, or an earlier failure) is abandoned quietly, so it
 * cannot overwrite the reported error or disturb a newer session.
 *
 * @param assistantId - Id of the saved assistant to talk to; `connect()`
 *   reports a failure when it is `null`.
 * @param options - Optional callbacks; `onMicError` fires when the microphone
 *   cannot be acquired.
 * @returns `status`, `error` (message for the `"failed"` state),
 *   `localStream` (the captured microphone stream while a session is up),
 *   `connect`, `disconnect`, and `audioRef` for the playback `<audio>` element.
 */
export function useVoicePreview(
  assistantId: string | null,
  { onMicError }: UseVoicePreviewOptions = {},
) {
  const [status, setStatus] = useState<VoicePreviewStatus>("idle");
  const [error, setError] = useState<string | null>(null);
  const [localStream, setLocalStream] = useState<MediaStream | null>(null);
  const audioRef = useRef<HTMLAudioElement | null>(null);
  const pcRef = useRef<RTCPeerConnection | null>(null);
  const wsRef = useRef<WebSocket | null>(null);
  const localStreamRef = useRef<MediaStream | null>(null);
  const startingRef = useRef(false);

  // Closes the socket, the peer connection and the microphone, detaching all
  // handlers first so that closing does not re-enter the failure path.
  const releaseResources = useCallback(() => {
    const ws = wsRef.current;
    wsRef.current = null;
    if (ws) {
      ws.onclose = null;
      ws.onerror = null;
      ws.onmessage = null;
      ws.close();
    }
    const pc = pcRef.current;
    pcRef.current = null;
    if (pc) {
      pc.onconnectionstatechange = null;
      pc.onicecandidate = null;
      pc.oniceconnectionstatechange = null;
      pc.ontrack = null;
      pc.close();
    }
    localStreamRef.current?.getTracks().forEach((track) => track.stop());
    localStreamRef.current = null;
    if (audioRef.current) audioRef.current.srcObject = null;
    startingRef.current = false;
  }, []);

  // User-initiated stop: release everything and go back to "idle".
  const disconnect = useCallback(() => {
    releaseResources();
    setLocalStream(null);
    setError(null);
    setStatus("idle");
  }, [releaseResources]);

  // Abort the session and keep `message` visible in the "failed" state.
  const fail = useCallback(
    (message: string) => {
      releaseResources();
      setLocalStream(null);
      setError(message);
      setStatus("failed");
    },
    [releaseResources],
  );

  const connect = useCallback(async () => {
    // Only one session at a time.
    if (startingRef.current || pcRef.current || wsRef.current) return;
    if (!assistantId) {
      setError("请先保存助手,再开始语音预览。");
      setStatus("failed");
      return;
    }
    startingRef.current = true;
    setError(null);
    setStatus("connecting");
    const pcId = generatePcId();

    // Building the URL or the socket can throw (e.g. an invalid API base);
    // report that as a failure instead of leaving the hook stuck in "connecting".
    let ws: WebSocket;
    try {
      ws = new WebSocket(`${wsBaseUrl()}/ws/voice`);
    } catch (socketError) {
      fail(errorMessage(socketError, "无法连接语音服务。"));
      return;
    }
    wsRef.current = ws;
    // True while this call's socket is still the active session.
    const isCurrent = () => wsRef.current === ws;

    ws.onmessage = async (event) => {
      if (!isCurrent()) return;
      let parsed: unknown;
      try {
        parsed = JSON.parse(event.data);
      } catch {
        // Not JSON: ignore.
        return;
      }
      if (typeof parsed !== "object" || parsed === null) return;
      const { type, payload } = parsed as {
        type?: unknown;
        payload?: Record<string, unknown> | null;
      };

      if (type === "answer") {
        const sdp = payload?.sdp;
        if (typeof sdp !== "string") {
          fail("语音服务返回了无效的应答。");
          return;
        }
        try {
          await pcRef.current?.setRemoteDescription({ type: "answer", sdp });
        } catch (answerError) {
          // Without an applied answer the session can never connect, so
          // surface the failure rather than waiting forever.
          if (isCurrent()) {
            fail(errorMessage(answerError, "无法应用语音服务的应答。"));
          }
        }
      } else if (type === "ice-candidate" && payload?.candidate) {
        // The backend does not currently trickle candidates; kept for compatibility.
        try {
          await pcRef.current?.addIceCandidate(
            payload.candidate as RTCIceCandidateInit,
          );
        } catch {
          /* Late or duplicate candidates are harmless; ignore. */
        }
      } else if (type === "error") {
        const message = payload?.message;
        fail(
          typeof message === "string" && message
            ? message
            : "后端无法启动语音会话。",
        );
      }
      // Unknown message types are ignored.
    };

    try {
      // 1) Wait for the signaling socket to open.
      await new Promise<void>((resolve, reject) => {
        ws.onopen = () => resolve();
        ws.onerror = (e) => reject(e);
        ws.onclose = () => reject(new Error("语音信令连接已关闭。"));
      });
      if (!isCurrent()) return;

      // Once open, any signaling error or close ends the session as failed.
      ws.onerror = () => {
        if (wsRef.current === ws) fail("语音信令连接失败。");
      };
      ws.onclose = () => {
        if (wsRef.current === ws) fail("语音信令连接已断开。");
      };

      // 2) Create the peer connection (STUN only).
      const pc = new RTCPeerConnection({
        iceServers: [{ urls: "stun:stun.l.google.com:19302" }],
      });
      pcRef.current = pc;

      pc.onicecandidate = (e) => {
        if (ws.readyState !== WebSocket.OPEN) return;
        ws.send(
          JSON.stringify({
            type: "ice-candidate",
            payload: {
              pc_id: pcId,
              // A null candidate marks the end of gathering.
              candidate: e.candidate
                ? {
                    candidate: e.candidate.candidate,
                    sdpMid: e.candidate.sdpMid,
                    sdpMLineIndex: e.candidate.sdpMLineIndex,
                  }
                : null,
            },
          }),
        );
      };

      pc.ontrack = (e) => {
        if (e.track.kind === "audio" && audioRef.current) {
          audioRef.current.srcObject =
            e.streams[0] ?? new MediaStream([e.track]);
          // Autoplay may be rejected by the browser; playback is best-effort.
          void audioRef.current.play().catch(() => {});
        }
      };

      pc.onconnectionstatechange = () => {
        if (pcRef.current !== pc) return;
        if (pc.connectionState === "connected") setStatus("connected");
        else if (pc.connectionState === "failed")
          fail("WebRTC 音频连接失败。");
      };

      pc.oniceconnectionstatechange = () => {
        if (pcRef.current !== pc) return;
        const st = pc.iceConnectionState;
        if (st === "connected" || st === "completed") setStatus("connected");
        else if (st === "failed") fail("WebRTC 音频连接失败。");
        // NOTE(review): "disconnected" can be transient; it is treated as
        // fatal here, matching the existing behavior. Confirm this is intended.
        else if (st === "disconnected") fail("WebRTC 音频连接已断开。");
      };

      // 3) Capture the microphone and add its tracks to the connection.
      let stream: MediaStream;
      try {
        stream = await navigator.mediaDevices.getUserMedia({
          audio: {
            echoCancellation: true,
            noiseSuppression: true,
            autoGainControl: true,
          },
        });
      } catch (mediaError) {
        if (isCurrent()) {
          onMicError?.();
          fail(errorMessage(mediaError, "无法访问麦克风。"));
        }
        return;
      }
      if (!isCurrent()) {
        // The session ended while the permission prompt was pending; release
        // the freshly acquired microphone instead of attaching it.
        stream.getTracks().forEach((track) => track.stop());
        return;
      }
      localStreamRef.current = stream;
      setLocalStream(stream);
      stream.getTracks().forEach((track) => pc.addTrack(track, stream));

      // 4) Create the offer and send it (assistant_id sits at payload top level).
      const offer = await pc.createOffer();
      if (!isCurrent()) return;
      await pc.setLocalDescription(offer);
      if (!isCurrent()) return;
      const localDescription = pc.localDescription;
      if (!localDescription?.sdp) {
        throw new Error("浏览器无法创建 WebRTC offer。");
      }
      ws.send(
        JSON.stringify({
          type: "offer",
          payload: {
            pc_id: pcId,
            sdp: localDescription.sdp,
            type: localDescription.type,
            assistant_id: assistantId,
          },
        }),
      );
    } catch (connectionError) {
      // Errors from an abandoned session (e.g. operating on a closed peer
      // connection) must not replace the state of the current one.
      if (isCurrent()) {
        fail(errorMessage(connectionError, "无法连接语音服务。"));
      }
    } finally {
      // After a teardown the flag was already reset and may now belong to a
      // newer session, so only clear it for the active session.
      if (isCurrent()) startingRef.current = false;
    }
  }, [assistantId, fail, onMicError]);

  // Release everything on unmount.
  useEffect(() => releaseResources, [releaseResources]);

  return { status, error, localStream, connect, disconnect, audioRef };
}

View File

@@ -5,7 +5,7 @@
* 注意:api_key 读取时后端永远打码,写回打码占位符表示"不改 key"(写时哨兵)。
*/
const API_BASE =
export const API_BASE =
process.env.NEXT_PUBLIC_API_BASE_URL ?? "http://localhost:8000";
export type ModelType = "LLM" | "ASR" | "TTS" | "Realtime" | "Embedding";
@@ -34,6 +34,9 @@ export type CredentialUpsert = {
interfaceType: InterfaceType;
apiUrl: string;
apiKey: string;
voice: string;
speed: number;
language: string;
isDefault: boolean;
};