Enhance voice interaction and transcript handling in the assistant

- Add a new Docker configuration for the UI in launch.json to facilitate development.
- Refactor pipeline.py to integrate a TranscriptProcessor for managing user and assistant transcripts, including event handlers for real-time updates and message handling.
- Update use-voice-preview.ts (the useVoicePreview hook) to establish a data channel for sending and receiving text messages, improving interaction flow.
- Modify AssistantPage.tsx to support displaying chat messages and sending user input, enhancing the user experience during voice interactions.
- Revise DebugTranscriptPanel to dynamically render chat messages with timestamps, improving the visual representation of conversation history.
This commit is contained in:
Xin Wang
2026-06-10 15:11:34 +08:00
parent b711350c0c
commit 2c2af1f2cd
4 changed files with 223 additions and 28 deletions

View File

@@ -8,6 +8,13 @@
"cwd": "frontend",
"port": 3001,
"autoPort": false
},
{
"name": "ui-docker",
"runtimeExecutable": "docker",
"runtimeArgs": ["compose", "up", "ui"],
"port": 3030,
"autoPort": false
}
]
}

View File

@@ -10,11 +10,19 @@ from loguru import logger
from models import AssistantConfig
from services.pipecat.service_factory import create_services
from pipecat.frames.frames import EndFrame, TTSSpeakFrame
from pipecat.frames.frames import (
EndFrame,
InterruptionTaskFrame,
TranscriptionFrame,
TransportMessageUrgentFrame,
TTSSpeakFrame,
)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.transcript_processor import TranscriptProcessor
from pipecat.utils.time import time_now_iso8601
async def run_pipeline(transport, cfg: AssistantConfig) -> None:
@@ -32,14 +40,20 @@ async def run_pipeline(transport, cfg: AssistantConfig) -> None:
context = OpenAILLMContext(messages=[{"role": "system", "content": cfg.prompt}])
context_aggregator = llm.create_context_aggregator(context)
# 转写收集:user 侧收 ASR 最终转写,assistant 侧聚合 TTS 实际播报的文本,
# 统一通过 data channel 推给前端聊天记录面板。
transcript = TranscriptProcessor()
pipeline = Pipeline(
[
transport.input(),
stt,
transcript.user(),
context_aggregator.user(),
llm,
tts,
transport.output(),
transcript.assistant(),
context_aggregator.assistant(),
]
)
@@ -52,6 +66,39 @@ async def run_pipeline(transport, cfg: AssistantConfig) -> None:
),
)
@transcript.event_handler("on_transcript_update")
async def on_transcript_update(_processor, frame):
    """Push every transcript message in the update to the frontend.

    Each entry of ``frame.messages`` (user or assistant) is wrapped in a
    ``TransportMessageUrgentFrame`` and queued on the pipeline task, in order,
    so the frontend can render it in the chat history.
    """
    for entry in frame.messages:
        payload = {
            "type": "transcript",
            "role": entry.role,
            "content": entry.content,
            "timestamp": entry.timestamp,
        }
        await task.queue_frame(TransportMessageUrgentFrame(message=payload))
@transport.event_handler("on_app_message")
async def on_app_message(_transport, message, _sender):
    """Handle text typed in the frontend.

    Only dict messages with ``type == "user-text"`` and non-blank text are
    acted on. The current playback is interrupted first, then the text is
    injected as a final user transcription so that it follows the same
    transcription -> context -> LLM -> TTS path as spoken input.
    """
    is_user_text = isinstance(message, dict) and message.get("type") == "user-text"
    if not is_user_text:
        return

    typed = str(message.get("text") or "").strip()
    if typed:
        # Build the frames in the same order they are queued.
        interruption = InterruptionTaskFrame()
        injected = TranscriptionFrame(
            text=typed, user_id="debug", timestamp=time_now_iso8601()
        )
        await task.queue_frames([interruption, injected])
@transport.event_handler("on_client_connected")
async def on_client_connected(_transport, _client):
if cfg.greeting:

View File

@@ -78,7 +78,7 @@ import {
type Credential,
type KnowledgeBase,
} from "@/lib/api";
import { useVoicePreview } from "@/hooks/use-voice-preview";
import { useVoicePreview, type ChatMessage } from "@/hooks/use-voice-preview";
type RuntimeMode = "pipeline" | "realtime";
@@ -1856,19 +1856,28 @@ function DebugVoicePanel({
error,
micWarning,
localStream,
messages,
sendText,
connect,
disconnect,
audioRef,
} = useVoicePreview(assistantId);
// 连接中或已连通都视作"会话进行中"
const recording = status === "connecting" || status === "connected";
const [textDraft, setTextDraft] = useState("");
// Send the current draft; the input is cleared only when sendText reports success.
function handleSendText() {
  const sent = sendText(textDraft);
  if (!sent) return;
  setTextDraft("");
}
return (
<div className="flex min-h-0 flex-1 flex-col">
{/* 后端 TTS 音频经 WebRTC 媒体流过来,挂这里播放 */}
<audio ref={audioRef} autoPlay playsInline className="hidden" />
{showTranscript ? (
<DebugTranscriptPanel />
<DebugTranscriptPanel messages={messages} recording={recording} />
) : (
<div className="relative flex min-h-0 flex-1 flex-col items-center justify-center gap-3 overflow-y-auto px-6 py-3 text-center">
<div
@@ -1966,10 +1975,29 @@ function DebugVoicePanel({
<div className="flex items-end gap-2">
<Textarea
rows={1}
placeholder="输入文字以模拟用户消息…"
value={textDraft}
disabled={status !== "connected"}
onChange={(event) => setTextDraft(event.target.value)}
onKeyDown={(event) => {
if (event.key === "Enter" && !event.shiftKey && !event.nativeEvent.isComposing) {
event.preventDefault();
handleSendText();
}
}}
placeholder={
status === "connected"
? "输入文字发送给助手,将打断当前播报…"
: "开始对话后可输入文字…"
}
className="max-h-24 min-h-10 flex-1 resize-none border-hairline-strong bg-background text-sm text-foreground placeholder:text-muted-soft"
/>
<Button size="icon" className="shrink-0" aria-label="发送调试消息">
<Button
size="icon"
className="shrink-0"
aria-label="发送调试消息"
disabled={status !== "connected" || !textDraft.trim()}
onClick={handleSendText}
>
<Send size={16} />
</Button>
</div>
@@ -1978,31 +2006,79 @@ function DebugVoicePanel({
);
}
function DebugTranscriptPanel() {
/**
 * Format an ISO timestamp as `HH:MM` in the local time zone.
 *
 * @param iso - Timestamp string to parse.
 * @returns The zero-padded hour and minute, or an empty string when the
 *   input cannot be parsed as a date.
 */
function formatMessageTime(iso: string): string {
  const parsed = new Date(iso);
  if (Number.isNaN(parsed.getTime())) {
    return "";
  }
  const hours = parsed.getHours().toString().padStart(2, "0");
  const minutes = parsed.getMinutes().toString().padStart(2, "0");
  return `${hours}:${minutes}`;
}
function DebugTranscriptPanel({
messages,
recording,
}: {
messages: ChatMessage[];
recording: boolean;
}) {
const scrollRef = useRef<HTMLDivElement>(null);
// 新消息时滚到底部
useEffect(() => {
const el = scrollRef.current;
if (el) el.scrollTop = el.scrollHeight;
}, [messages]);
if (messages.length === 0) {
return (
<div className="flex min-h-0 flex-1 flex-col items-center justify-center gap-2 px-6 text-center">
<MessageSquareText size={28} className="text-muted-soft" />
<div className="text-sm font-medium text-foreground">
{recording ? "暂无聊天记录" : "尚未开始对话"}
</div>
<p className="max-w-xs text-xs leading-5 text-muted-foreground">
{recording
? "开口说话或在下方输入文字,对话内容会实时显示在这里。"
: "点击「开始对话」后,语音与文字消息会实时显示在这里。"}
</p>
</div>
);
}
return (
<div className="flex min-h-0 flex-1 flex-col overflow-y-auto px-5 py-4">
<div
ref={scrollRef}
className="flex min-h-0 flex-1 flex-col overflow-y-auto px-5 py-4"
>
<div className="flex flex-col gap-4">
<div className="flex max-w-[88%] flex-col gap-1 self-start">
<span className="px-1 text-[11px] text-muted-soft"> · 10:24</span>
<div className="rounded-2xl rounded-tl-sm bg-surface-strong px-4 py-2.5 text-sm leading-6 text-foreground">
AI
</div>
</div>
<div className="flex max-w-[88%] flex-col items-end gap-1 self-end">
<span className="px-1 text-[11px] text-muted-soft"> · 10:25</span>
<div className="rounded-2xl rounded-tr-sm bg-primary px-4 py-2.5 text-sm leading-6 text-primary-foreground">
</div>
</div>
<div className="flex max-w-[88%] flex-col gap-1 self-start">
<span className="px-1 text-[11px] text-muted-soft"> · 10:25</span>
<div className="rounded-2xl rounded-tl-sm bg-surface-strong px-4 py-2.5 text-sm leading-6 text-foreground">
线线线 App
线
</div>
</div>
{messages.map((message) => {
const time = formatMessageTime(message.timestamp);
return message.role === "assistant" ? (
<div
key={message.id}
className="flex max-w-[88%] flex-col gap-1 self-start"
>
<span className="px-1 text-[11px] text-muted-soft">
{time ? ` · ${time}` : ""}
</span>
<div className="whitespace-pre-wrap rounded-2xl rounded-tl-sm bg-surface-strong px-4 py-2.5 text-sm leading-6 text-foreground">
{message.content}
</div>
</div>
) : (
<div
key={message.id}
className="flex max-w-[88%] flex-col items-end gap-1 self-end"
>
<span className="px-1 text-[11px] text-muted-soft">
{time ? ` · ${time}` : ""}
</span>
<div className="whitespace-pre-wrap rounded-2xl rounded-tr-sm bg-primary px-4 py-2.5 text-sm leading-6 text-primary-foreground">
{message.content}
</div>
</div>
);
})}
</div>
</div>
);

View File

@@ -9,6 +9,10 @@
* client → {type:"ice-candidate", payload:{pc_id, candidate:{...}}}
* 音频本身走 WebRTC 媒体流(Opus),不经 ws;后端 TTS 帧从 ontrack 拿到直接播放。
*
* 另开一条 data channel 与后端管线(pipeline.py)互通应用消息:
* client → {type:"user-text", text} 文字输入(打断并触发新回复)
* server → {type:"transcript", role, content, timestamp} 用户/助手最终转写(聊天记录)
*
* 纯本机(localhost)即可跑:localhost 是 secure context,麦克风可用,ws 用明文。
* 局域网/别的设备要 https+wss,见 deploy/README.md。
*/
@@ -19,6 +23,14 @@ import { API_BASE } from "@/lib/api";
export type VoicePreviewStatus = "idle" | "connecting" | "connected" | "failed";
/**
 * One entry of the chat history rendered by the debug transcript panel.
 */
export type ChatMessage = {
/** Client-side identifier; the hook builds it as `msg-<sequence number>`. */
id: string;
/** Who produced the message: the user or the assistant. */
role: "user" | "assistant";
/** Message text. */
content: string;
/**
 * ISO timestamp supplied by the backend; the hook substitutes the local
 * receive time when the incoming message carries no string timestamp.
 */
timestamp: string;
};
// http→ws、https→wss,自动跟随 API 基址(同源反代时也对)
function wsBaseUrl(): string {
const url = new URL(API_BASE, window.location.origin);
@@ -62,11 +74,14 @@ export function useVoicePreview(assistantId: string | null) {
const [error, setError] = useState<string | null>(null);
const [micWarning, setMicWarning] = useState<string | null>(null);
const [localStream, setLocalStream] = useState<MediaStream | null>(null);
const [messages, setMessages] = useState<ChatMessage[]>([]);
const audioRef = useRef<HTMLAudioElement | null>(null);
const pcRef = useRef<RTCPeerConnection | null>(null);
const wsRef = useRef<WebSocket | null>(null);
const dataChannelRef = useRef<RTCDataChannel | null>(null);
const localStreamRef = useRef<MediaStream | null>(null);
const startingRef = useRef(false);
const messageSeqRef = useRef(0);
const releaseResources = useCallback(() => {
const ws = wsRef.current;
@@ -78,6 +93,13 @@ export function useVoicePreview(assistantId: string | null) {
ws.close();
}
const channel = dataChannelRef.current;
dataChannelRef.current = null;
if (channel) {
channel.onmessage = null;
channel.close();
}
const pc = pcRef.current;
pcRef.current = null;
if (pc) {
@@ -122,6 +144,7 @@ export function useVoicePreview(assistantId: string | null) {
startingRef.current = true;
setError(null);
setMicWarning(null);
setMessages([]); // 新会话清空上一轮聊天记录
setStatus("connecting");
// 麦克风是可选的:获取失败时继续建立仅接收后端音频的 WebRTC 会话。
@@ -213,6 +236,36 @@ export function useVoicePreview(assistantId: string | null) {
);
};
// 应用消息通道:收后端转写(聊天记录),发文字输入。
// 由浏览器侧主动创建,后端 SmallWebRTCConnection 的 on("datachannel") 会接住。
const channel = pc.createDataChannel("chat");
dataChannelRef.current = channel;
channel.onmessage = (event) => {
  // Only well-formed transcript messages are appended to the chat history;
  // non-JSON payloads, unknown message types and blank content are dropped.
  try {
    const payload = JSON.parse(event.data);
    if (payload?.type !== "transcript") return;
    if (payload.role !== "user" && payload.role !== "assistant") return;
    if (typeof payload.content !== "string" || !payload.content.trim()) return;

    messageSeqRef.current += 1;
    const hasTimestamp = typeof payload.timestamp === "string";
    const entry: ChatMessage = {
      id: `msg-${messageSeqRef.current}`,
      role: payload.role,
      content: payload.content,
      // Fall back to the local receive time when no string timestamp is given.
      timestamp: hasTimestamp ? payload.timestamp : new Date().toISOString(),
    };
    setMessages((history) => [...history, entry]);
  } catch {
    /* Not JSON or otherwise unusable: ignore. */
  }
};
pc.ontrack = (e) => {
if (e.track.kind === "audio" && audioRef.current) {
audioRef.current.srcObject =
@@ -268,6 +321,16 @@ export function useVoicePreview(assistantId: string | null) {
}
}, [assistantId, fail]);
// Send a text message: the backend interrupts the current playback first, then
// uses the text as user input to trigger a new reply.
// Returns true when the message was handed to the data channel. Returns false
// when the text is blank, the channel is not ready (conversation not started or
// still connecting), or the send itself fails.
const sendText = useCallback((text: string): boolean => {
  const trimmed = text.trim();
  const channel = dataChannelRef.current;
  if (!trimmed || !channel || channel.readyState !== "open") return false;
  try {
    channel.send(JSON.stringify({ type: "user-text", text: trimmed }));
  } catch {
    // send() can still throw after the readyState check (for example when the
    // channel closes in between or the message cannot be queued). Report the
    // failure through the return value so the caller keeps the draft.
    return false;
  }
  return true;
}, []);
// 卸载时收尾
useEffect(() => releaseResources, [releaseResources]);
@@ -276,6 +339,8 @@ export function useVoicePreview(assistantId: string | null) {
error,
micWarning,
localStream,
messages,
sendText,
connect,
disconnect,
audioRef,