Enhance voice interaction and transcript handling in the assistant
- Add a new Docker configuration for the UI in launch.json to facilitate development.
- Refactor pipeline.py to integrate a TranscriptProcessor for managing user and assistant transcripts, including event handlers for real-time updates and message handling.
- Update useVoicePreview.ts to establish a data channel for sending and receiving text messages, improving interaction flow.
- Modify AssistantPage.tsx to support displaying chat messages and sending user input, enhancing the user experience during voice interactions.
- Revise DebugTranscriptPanel to dynamically render chat messages with timestamps, improving the visual representation of conversation history.
This commit is contained in:
@@ -8,6 +8,13 @@
|
||||
"cwd": "frontend",
|
||||
"port": 3001,
|
||||
"autoPort": false
|
||||
},
|
||||
{
|
||||
"name": "ui-docker",
|
||||
"runtimeExecutable": "docker",
|
||||
"runtimeArgs": ["compose", "up", "ui"],
|
||||
"port": 3030,
|
||||
"autoPort": false
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@@ -10,11 +10,19 @@ from loguru import logger
|
||||
from models import AssistantConfig
|
||||
from services.pipecat.service_factory import create_services
|
||||
|
||||
from pipecat.frames.frames import EndFrame, TTSSpeakFrame
|
||||
from pipecat.frames.frames import (
|
||||
EndFrame,
|
||||
InterruptionTaskFrame,
|
||||
TranscriptionFrame,
|
||||
TransportMessageUrgentFrame,
|
||||
TTSSpeakFrame,
|
||||
)
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.transcript_processor import TranscriptProcessor
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
|
||||
|
||||
async def run_pipeline(transport, cfg: AssistantConfig) -> None:
|
||||
@@ -32,14 +40,20 @@ async def run_pipeline(transport, cfg: AssistantConfig) -> None:
|
||||
context = OpenAILLMContext(messages=[{"role": "system", "content": cfg.prompt}])
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
|
||||
# 转写收集:user 侧收 ASR 最终转写,assistant 侧聚合 TTS 实际播报的文本,
|
||||
# 统一通过 data channel 推给前端聊天记录面板。
|
||||
transcript = TranscriptProcessor()
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
transcript.user(),
|
||||
context_aggregator.user(),
|
||||
llm,
|
||||
tts,
|
||||
transport.output(),
|
||||
transcript.assistant(),
|
||||
context_aggregator.assistant(),
|
||||
]
|
||||
)
|
||||
@@ -52,6 +66,39 @@ async def run_pipeline(transport, cfg: AssistantConfig) -> None:
|
||||
),
|
||||
)
|
||||
|
||||
@transcript.event_handler("on_transcript_update")
async def on_transcript_update(_processor, frame):
    """Forward every finalized transcript entry (user or assistant) to the frontend.

    Each message carried by ``frame`` is wrapped in a transport message so the
    frontend can render it in its chat-history panel.
    """
    for entry in frame.messages:
        payload = {
            "type": "transcript",
            "role": entry.role,
            "content": entry.content,
            "timestamp": entry.timestamp,
        }
        await task.queue_frame(TransportMessageUrgentFrame(message=payload))
|
||||
|
||||
@transport.event_handler("on_app_message")
async def on_app_message(_transport, message, _sender):
    """Handle text typed in the frontend.

    The current playback is interrupted first, then the text is injected as a
    final user transcription so it follows the same
    transcript -> context -> LLM -> TTS path as spoken input.
    Anything that is not a ``user-text`` dict, or whose text is blank, is ignored.
    """
    is_user_text = isinstance(message, dict) and message.get("type") == "user-text"
    if not is_user_text:
        return

    text = str(message.get("text") or "").strip()
    if text == "":
        return

    interruption = InterruptionTaskFrame()
    typed_input = TranscriptionFrame(
        text=text, user_id="debug", timestamp=time_now_iso8601()
    )
    await task.queue_frames([interruption, typed_input])
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(_transport, _client):
|
||||
if cfg.greeting:
|
||||
|
||||
@@ -78,7 +78,7 @@ import {
|
||||
type Credential,
|
||||
type KnowledgeBase,
|
||||
} from "@/lib/api";
|
||||
import { useVoicePreview } from "@/hooks/use-voice-preview";
|
||||
import { useVoicePreview, type ChatMessage } from "@/hooks/use-voice-preview";
|
||||
|
||||
type RuntimeMode = "pipeline" | "realtime";
|
||||
|
||||
@@ -1856,19 +1856,28 @@ function DebugVoicePanel({
|
||||
error,
|
||||
micWarning,
|
||||
localStream,
|
||||
messages,
|
||||
sendText,
|
||||
connect,
|
||||
disconnect,
|
||||
audioRef,
|
||||
} = useVoicePreview(assistantId);
|
||||
// 连接中或已连通都视作"会话进行中"
|
||||
const recording = status === "connecting" || status === "connected";
|
||||
const [textDraft, setTextDraft] = useState("");
|
||||
|
||||
// Submit the current draft; the draft is cleared only when sendText reports success.
function handleSendText() {
  const delivered = sendText(textDraft);
  if (!delivered) return;
  setTextDraft("");
}
|
||||
|
||||
return (
|
||||
<div className="flex min-h-0 flex-1 flex-col">
|
||||
{/* 后端 TTS 音频经 WebRTC 媒体流过来,挂这里播放 */}
|
||||
<audio ref={audioRef} autoPlay playsInline className="hidden" />
|
||||
{showTranscript ? (
|
||||
<DebugTranscriptPanel />
|
||||
<DebugTranscriptPanel messages={messages} recording={recording} />
|
||||
) : (
|
||||
<div className="relative flex min-h-0 flex-1 flex-col items-center justify-center gap-3 overflow-y-auto px-6 py-3 text-center">
|
||||
<div
|
||||
@@ -1966,10 +1975,29 @@ function DebugVoicePanel({
|
||||
<div className="flex items-end gap-2">
|
||||
<Textarea
|
||||
rows={1}
|
||||
placeholder="输入文字以模拟用户消息…"
|
||||
value={textDraft}
|
||||
disabled={status !== "connected"}
|
||||
onChange={(event) => setTextDraft(event.target.value)}
|
||||
onKeyDown={(event) => {
|
||||
if (event.key === "Enter" && !event.shiftKey && !event.nativeEvent.isComposing) {
|
||||
event.preventDefault();
|
||||
handleSendText();
|
||||
}
|
||||
}}
|
||||
placeholder={
|
||||
status === "connected"
|
||||
? "输入文字发送给助手,将打断当前播报…"
|
||||
: "开始对话后可输入文字…"
|
||||
}
|
||||
className="max-h-24 min-h-10 flex-1 resize-none border-hairline-strong bg-background text-sm text-foreground placeholder:text-muted-soft"
|
||||
/>
|
||||
<Button size="icon" className="shrink-0" aria-label="发送调试消息">
|
||||
<Button
|
||||
size="icon"
|
||||
className="shrink-0"
|
||||
aria-label="发送调试消息"
|
||||
disabled={status !== "connected" || !textDraft.trim()}
|
||||
onClick={handleSendText}
|
||||
>
|
||||
<Send size={16} />
|
||||
</Button>
|
||||
</div>
|
||||
@@ -1978,31 +2006,79 @@ function DebugVoicePanel({
|
||||
);
|
||||
}
|
||||
|
||||
function DebugTranscriptPanel() {
|
||||
/**
 * Formats an ISO timestamp as `HH:MM` in the local time zone.
 *
 * @param iso - Timestamp string to parse with `Date`.
 * @returns The zero-padded local time, or an empty string when parsing fails.
 */
function formatMessageTime(iso: string): string {
  const parsed = new Date(iso);
  if (Number.isNaN(parsed.getTime())) {
    return "";
  }
  const twoDigits = (value: number): string => value.toString().padStart(2, "0");
  return [parsed.getHours(), parsed.getMinutes()].map((part) => twoDigits(part)).join(":");
}
|
||||
|
||||
function DebugTranscriptPanel({
|
||||
messages,
|
||||
recording,
|
||||
}: {
|
||||
messages: ChatMessage[];
|
||||
recording: boolean;
|
||||
}) {
|
||||
const scrollRef = useRef<HTMLDivElement>(null);
|
||||
|
||||
// 新消息时滚到底部
|
||||
useEffect(() => {
|
||||
const el = scrollRef.current;
|
||||
if (el) el.scrollTop = el.scrollHeight;
|
||||
}, [messages]);
|
||||
|
||||
if (messages.length === 0) {
|
||||
return (
|
||||
<div className="flex min-h-0 flex-1 flex-col items-center justify-center gap-2 px-6 text-center">
|
||||
<MessageSquareText size={28} className="text-muted-soft" />
|
||||
<div className="text-sm font-medium text-foreground">
|
||||
{recording ? "暂无聊天记录" : "尚未开始对话"}
|
||||
</div>
|
||||
<p className="max-w-xs text-xs leading-5 text-muted-foreground">
|
||||
{recording
|
||||
? "开口说话或在下方输入文字,对话内容会实时显示在这里。"
|
||||
: "点击「开始对话」后,语音与文字消息会实时显示在这里。"}
|
||||
</p>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
return (
|
||||
<div className="flex min-h-0 flex-1 flex-col overflow-y-auto px-5 py-4">
|
||||
<div
|
||||
ref={scrollRef}
|
||||
className="flex min-h-0 flex-1 flex-col overflow-y-auto px-5 py-4"
|
||||
>
|
||||
<div className="flex flex-col gap-4">
|
||||
<div className="flex max-w-[88%] flex-col gap-1 self-start">
|
||||
<span className="px-1 text-[11px] text-muted-soft">助手 · 10:24</span>
|
||||
<div className="rounded-2xl rounded-tl-sm bg-surface-strong px-4 py-2.5 text-sm leading-6 text-foreground">
|
||||
您好,我是 AI 视频助手,请问有什么可以帮您?
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div className="flex max-w-[88%] flex-col items-end gap-1 self-end">
|
||||
<span className="px-1 text-[11px] text-muted-soft">我 · 10:25</span>
|
||||
<div className="rounded-2xl rounded-tr-sm bg-primary px-4 py-2.5 text-sm leading-6 text-primary-foreground">
|
||||
我想了解一下社保卡的办理流程。
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div className="flex max-w-[88%] flex-col gap-1 self-start">
|
||||
<span className="px-1 text-[11px] text-muted-soft">助手 · 10:25</span>
|
||||
<div className="rounded-2xl rounded-tl-sm bg-surface-strong px-4 py-2.5 text-sm leading-6 text-foreground">
|
||||
社保卡可通过线上或线下渠道办理。线上可在政务服务 App
|
||||
提交申请,线下可前往社保经办网点。
|
||||
</div>
|
||||
</div>
|
||||
{messages.map((message) => {
|
||||
const time = formatMessageTime(message.timestamp);
|
||||
return message.role === "assistant" ? (
|
||||
<div
|
||||
key={message.id}
|
||||
className="flex max-w-[88%] flex-col gap-1 self-start"
|
||||
>
|
||||
<span className="px-1 text-[11px] text-muted-soft">
|
||||
助手{time ? ` · ${time}` : ""}
|
||||
</span>
|
||||
<div className="whitespace-pre-wrap rounded-2xl rounded-tl-sm bg-surface-strong px-4 py-2.5 text-sm leading-6 text-foreground">
|
||||
{message.content}
|
||||
</div>
|
||||
</div>
|
||||
) : (
|
||||
<div
|
||||
key={message.id}
|
||||
className="flex max-w-[88%] flex-col items-end gap-1 self-end"
|
||||
>
|
||||
<span className="px-1 text-[11px] text-muted-soft">
|
||||
我{time ? ` · ${time}` : ""}
|
||||
</span>
|
||||
<div className="whitespace-pre-wrap rounded-2xl rounded-tr-sm bg-primary px-4 py-2.5 text-sm leading-6 text-primary-foreground">
|
||||
{message.content}
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
})}
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
|
||||
@@ -9,6 +9,10 @@
|
||||
* client → {type:"ice-candidate", payload:{pc_id, candidate:{...}}}
|
||||
* 音频本身走 WebRTC 媒体流(Opus),不经 ws;后端 TTS 帧从 ontrack 拿到直接播放。
|
||||
*
|
||||
* 另开一条 data channel 与后端管线(pipeline.py)互通应用消息:
|
||||
* client → {type:"user-text", text} 文字输入(打断并触发新回复)
|
||||
* server → {type:"transcript", role, content, timestamp} 用户/助手最终转写(聊天记录)
|
||||
*
|
||||
* 纯本机(localhost)即可跑:localhost 是 secure context,麦克风可用,ws 用明文。
|
||||
* 局域网/别的设备要 https+wss,见 deploy/README.md。
|
||||
*/
|
||||
@@ -19,6 +23,14 @@ import { API_BASE } from "@/lib/api";
|
||||
|
||||
export type VoicePreviewStatus = "idle" | "connecting" | "connected" | "failed";
|
||||
|
||||
/** A single chat-log entry rendered in the transcript panel. */
export type ChatMessage = {
  /** Unique identifier; used as the React list key when rendering. */
  id: string;
  /** Which side of the conversation produced the message. */
  role: "assistant" | "user";
  /** Message text. */
  content: string;
  /** ISO timestamp, as provided by the backend. */
  timestamp: string;
};
|
||||
|
||||
// http→ws、https→wss,自动跟随 API 基址(同源反代时也对)
|
||||
function wsBaseUrl(): string {
|
||||
const url = new URL(API_BASE, window.location.origin);
|
||||
@@ -62,11 +74,14 @@ export function useVoicePreview(assistantId: string | null) {
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
const [micWarning, setMicWarning] = useState<string | null>(null);
|
||||
const [localStream, setLocalStream] = useState<MediaStream | null>(null);
|
||||
const [messages, setMessages] = useState<ChatMessage[]>([]);
|
||||
const audioRef = useRef<HTMLAudioElement | null>(null);
|
||||
const pcRef = useRef<RTCPeerConnection | null>(null);
|
||||
const wsRef = useRef<WebSocket | null>(null);
|
||||
const dataChannelRef = useRef<RTCDataChannel | null>(null);
|
||||
const localStreamRef = useRef<MediaStream | null>(null);
|
||||
const startingRef = useRef(false);
|
||||
const messageSeqRef = useRef(0);
|
||||
|
||||
const releaseResources = useCallback(() => {
|
||||
const ws = wsRef.current;
|
||||
@@ -78,6 +93,13 @@ export function useVoicePreview(assistantId: string | null) {
|
||||
ws.close();
|
||||
}
|
||||
|
||||
const channel = dataChannelRef.current;
|
||||
dataChannelRef.current = null;
|
||||
if (channel) {
|
||||
channel.onmessage = null;
|
||||
channel.close();
|
||||
}
|
||||
|
||||
const pc = pcRef.current;
|
||||
pcRef.current = null;
|
||||
if (pc) {
|
||||
@@ -122,6 +144,7 @@ export function useVoicePreview(assistantId: string | null) {
|
||||
startingRef.current = true;
|
||||
setError(null);
|
||||
setMicWarning(null);
|
||||
setMessages([]); // 新会话清空上一轮聊天记录
|
||||
setStatus("connecting");
|
||||
|
||||
// 麦克风是可选的:获取失败时继续建立仅接收后端音频的 WebRTC 会话。
|
||||
@@ -213,6 +236,36 @@ export function useVoicePreview(assistantId: string | null) {
|
||||
);
|
||||
};
|
||||
|
||||
// 应用消息通道:收后端转写(聊天记录),发文字输入。
|
||||
// 由浏览器侧主动创建,后端 SmallWebRTCConnection 的 on("datachannel") 会接住。
|
||||
const channel = pc.createDataChannel("chat");
|
||||
dataChannelRef.current = channel;
|
||||
channel.onmessage = (event) => {
|
||||
try {
|
||||
const msg = JSON.parse(event.data);
|
||||
if (
|
||||
msg?.type === "transcript" &&
|
||||
(msg.role === "user" || msg.role === "assistant") &&
|
||||
typeof msg.content === "string" &&
|
||||
msg.content.trim()
|
||||
) {
|
||||
messageSeqRef.current += 1;
|
||||
const next: ChatMessage = {
|
||||
id: `msg-${messageSeqRef.current}`,
|
||||
role: msg.role,
|
||||
content: msg.content,
|
||||
timestamp:
|
||||
typeof msg.timestamp === "string"
|
||||
? msg.timestamp
|
||||
: new Date().toISOString(),
|
||||
};
|
||||
setMessages((prev) => [...prev, next]);
|
||||
}
|
||||
} catch {
|
||||
/* 非 JSON / 未知消息,忽略 */
|
||||
}
|
||||
};
|
||||
|
||||
pc.ontrack = (e) => {
|
||||
if (e.track.kind === "audio" && audioRef.current) {
|
||||
audioRef.current.srcObject =
|
||||
@@ -268,6 +321,16 @@ export function useVoicePreview(assistantId: string | null) {
|
||||
}
|
||||
}, [assistantId, fail]);
|
||||
|
||||
// Send a text message over the data channel. The backend interrupts the
// current playback first, then answers the typed input.
// Returns true once the message has been handed to the channel; returns false
// when the text is blank, the channel is not open yet (conversation not
// started / still connecting), or the send itself fails.
const sendText = useCallback((text: string): boolean => {
  const trimmed = text.trim();
  const channel = dataChannelRef.current;
  if (!trimmed || !channel || channel.readyState !== "open") return false;
  try {
    channel.send(JSON.stringify({ type: "user-text", text: trimmed }));
  } catch (err) {
    // send() can still throw after the readyState check (e.g. the channel
    // starts closing underneath us, or the payload is too large). Report the
    // failure through the return value so the caller keeps the user's draft
    // instead of letting the exception escape into the UI event handler.
    console.warn("voice preview: failed to send text over data channel", err);
    return false;
  }
  return true;
}, []);
|
||||
|
||||
// 卸载时收尾
|
||||
useEffect(() => releaseResources, [releaseResources]);
|
||||
|
||||
@@ -276,6 +339,8 @@ export function useVoicePreview(assistantId: string | null) {
|
||||
error,
|
||||
micWarning,
|
||||
localStream,
|
||||
messages,
|
||||
sendText,
|
||||
connect,
|
||||
disconnect,
|
||||
audioRef,
|
||||
|
||||
Reference in New Issue
Block a user