Enhance voice interaction and transcript handling in the assistant

- Add a new Docker configuration for the UI in launch.json to facilitate development.
- Refactor pipeline.py to integrate a TranscriptProcessor for managing user and assistant transcripts, including event handlers for real-time updates and message handling.
- Update use-voice-preview.ts (the useVoicePreview hook) to establish a data channel for sending and receiving text messages, improving interaction flow.
- Modify AssistantPage.tsx to support displaying chat messages and sending user input, enhancing the user experience during voice interactions.
- Revise DebugTranscriptPanel to dynamically render chat messages with timestamps, improving the visual representation of conversation history.
2026-06-10 15:11:34 +08:00
parent b711350c0c
commit 2c2af1f2cd
4 changed files with 223 additions and 28 deletions
--- a/.claude/launch.json
+++ b/.claude/launch.json
@@ -8,6 +8,13 @@
      "cwd": "frontend",
      "port": 3001,
      "autoPort": false
+    },
+    {
+      "name": "ui-docker",
+      "runtimeExecutable": "docker",
+      "runtimeArgs": ["compose", "up", "ui"],
+      "port": 3030,
+      "autoPort": false
    }
  ]
 }
--- a/backend/services/pipecat/pipeline.py
+++ b/backend/services/pipecat/pipeline.py
@@ -10,11 +10,19 @@ from loguru import logger
 from models import AssistantConfig
 from services.pipecat.service_factory import create_services

-from pipecat.frames.frames import EndFrame, TTSSpeakFrame
+from pipecat.frames.frames import (
+    EndFrame,
+    InterruptionTaskFrame,
+    TranscriptionFrame,
+    TransportMessageUrgentFrame,
+    TTSSpeakFrame,
+)
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
+from pipecat.processors.transcript_processor import TranscriptProcessor
+from pipecat.utils.time import time_now_iso8601


 async def run_pipeline(transport, cfg: AssistantConfig) -> None:
@@ -32,14 +40,20 @@ async def run_pipeline(transport, cfg: AssistantConfig) -> None:
    context = OpenAILLMContext(messages=[{"role": "system", "content": cfg.prompt}])
    context_aggregator = llm.create_context_aggregator(context)

+    # 转写收集:user 侧收 ASR 最终转写,assistant 侧聚合 TTS 实际播报的文本,
+    # 统一通过 data channel 推给前端聊天记录面板。
+    transcript = TranscriptProcessor()
+
    pipeline = Pipeline(
        [
            transport.input(),
            stt,
+            transcript.user(),
            context_aggregator.user(),
            llm,
            tts,
            transport.output(),
+            transcript.assistant(),
            context_aggregator.assistant(),
        ]
    )
@@ -52,6 +66,39 @@ async def run_pipeline(transport, cfg: AssistantConfig) -> None:
        ),
    )

+    @transcript.event_handler("on_transcript_update")
+    async def on_transcript_update(_processor, frame):
+        # Push each final transcript message (user/assistant) to the frontend, which renders the chat history from these messages
+        for msg in frame.messages:
+            await task.queue_frame(
+                TransportMessageUrgentFrame(
+                    message={
+                        "type": "transcript",
+                        "role": msg.role,
+                        "content": msg.content,
+                        "timestamp": msg.timestamp,
+                    }
+                )
+            )
+
+    @transport.event_handler("on_app_message")
+    async def on_app_message(_transport, message, _sender):
+        # Text input from the frontend: interrupt the current playback first, then inject the text as a final user transcription,
+        # so it follows exactly the same transcription → context → LLM → TTS path as speech
+        if not isinstance(message, dict) or message.get("type") != "user-text":
+            return
+        text = str(message.get("text") or "").strip()
+        if not text:
+            return
+        await task.queue_frames(
+            [
+                InterruptionTaskFrame(),
+                TranscriptionFrame(
+                    text=text, user_id="debug", timestamp=time_now_iso8601()
+                ),
+            ]
+        )
+
    @transport.event_handler("on_client_connected")
    async def on_client_connected(_transport, _client):
        if cfg.greeting:
--- a/frontend/src/components/pages/AssistantPage.tsx
+++ b/frontend/src/components/pages/AssistantPage.tsx
@@ -78,7 +78,7 @@ import {
  type Credential,
  type KnowledgeBase,
 } from "@/lib/api";
-import { useVoicePreview } from "@/hooks/use-voice-preview";
+import { useVoicePreview, type ChatMessage } from "@/hooks/use-voice-preview";

 type RuntimeMode = "pipeline" | "realtime";

@@ -1856,19 +1856,28 @@ function DebugVoicePanel({
    error,
    micWarning,
    localStream,
+    messages,
+    sendText,
    connect,
    disconnect,
    audioRef,
  } = useVoicePreview(assistantId);
  // 连接中或已连通都视作"会话进行中"
  const recording = status === "connecting" || status === "connected";
+  const [textDraft, setTextDraft] = useState("");
+
+  function handleSendText() {
+    if (sendText(textDraft)) {
+      setTextDraft("");
+    }
+  }

  return (
    <div className="flex min-h-0 flex-1 flex-col">
      {/* 后端 TTS 音频经 WebRTC 媒体流过来,挂这里播放 */}
      <audio ref={audioRef} autoPlay playsInline className="hidden" />
      {showTranscript ? (
-        <DebugTranscriptPanel />
+        <DebugTranscriptPanel messages={messages} recording={recording} />
      ) : (
        <div className="relative flex min-h-0 flex-1 flex-col items-center justify-center gap-3 overflow-y-auto px-6 py-3 text-center">
          <div
@@ -1966,10 +1975,29 @@ function DebugVoicePanel({
        <div className="flex items-end gap-2">
          <Textarea
            rows={1}
-            placeholder="输入文字以模拟用户消息…"
+            value={textDraft}
+            disabled={status !== "connected"}
+            onChange={(event) => setTextDraft(event.target.value)}
+            onKeyDown={(event) => {
+              if (event.key === "Enter" && !event.shiftKey && !event.nativeEvent.isComposing) {
+                event.preventDefault();
+                handleSendText();
+              }
+            }}
+            placeholder={
+              status === "connected"
+                ? "输入文字发送给助手，将打断当前播报…"
+                : "开始对话后可输入文字…"
+            }
            className="max-h-24 min-h-10 flex-1 resize-none border-hairline-strong bg-background text-sm text-foreground placeholder:text-muted-soft"
          />
-          <Button size="icon" className="shrink-0" aria-label="发送调试消息">
+          <Button
+            size="icon"
+            className="shrink-0"
+            aria-label="发送调试消息"
+            disabled={status !== "connected" || !textDraft.trim()}
+            onClick={handleSendText}
+          >
            <Send size={16} />
          </Button>
        </div>
@@ -1978,31 +2006,79 @@ function DebugVoicePanel({
  );
 }

-function DebugTranscriptPanel() {
+// ISO timestamp → HH:MM (local time zone); returns an empty string when the timestamp cannot be parsed
+function formatMessageTime(iso: string): string {
+  const d = new Date(iso);
+  if (Number.isNaN(d.getTime())) return "";
+  const pad = (n: number) => String(n).padStart(2, "0");
+  return `${pad(d.getHours())}:${pad(d.getMinutes())}`;
+}
+
+function DebugTranscriptPanel({
+  messages,
+  recording,
+}: {
+  messages: ChatMessage[];
+  recording: boolean;
+}) {
+  const scrollRef = useRef<HTMLDivElement>(null);
+
+  // 新消息时滚到底部
+  useEffect(() => {
+    const el = scrollRef.current;
+    if (el) el.scrollTop = el.scrollHeight;
+  }, [messages]);
+
+  if (messages.length === 0) {
+    return (
+      <div className="flex min-h-0 flex-1 flex-col items-center justify-center gap-2 px-6 text-center">
+        <MessageSquareText size={28} className="text-muted-soft" />
+        <div className="text-sm font-medium text-foreground">
+          {recording ? "暂无聊天记录" : "尚未开始对话"}
+        </div>
+        <p className="max-w-xs text-xs leading-5 text-muted-foreground">
+          {recording
+            ? "开口说话或在下方输入文字，对话内容会实时显示在这里。"
+            : "点击「开始对话」后，语音与文字消息会实时显示在这里。"}
+        </p>
+      </div>
+    );
+  }
+
  return (
-    <div className="flex min-h-0 flex-1 flex-col overflow-y-auto px-5 py-4">
+    <div
+      ref={scrollRef}
+      className="flex min-h-0 flex-1 flex-col overflow-y-auto px-5 py-4"
+    >
      <div className="flex flex-col gap-4">
-        <div className="flex max-w-[88%] flex-col gap-1 self-start">
-          <span className="px-1 text-[11px] text-muted-soft">助手 · 10:24</span>
-          <div className="rounded-2xl rounded-tl-sm bg-surface-strong px-4 py-2.5 text-sm leading-6 text-foreground">
-            您好，我是 AI 视频助手，请问有什么可以帮您？
-          </div>
-        </div>
-
-        <div className="flex max-w-[88%] flex-col items-end gap-1 self-end">
-          <span className="px-1 text-[11px] text-muted-soft">我 · 10:25</span>
-          <div className="rounded-2xl rounded-tr-sm bg-primary px-4 py-2.5 text-sm leading-6 text-primary-foreground">
-            我想了解一下社保卡的办理流程。
-          </div>
-        </div>
-
-        <div className="flex max-w-[88%] flex-col gap-1 self-start">
-          <span className="px-1 text-[11px] text-muted-soft">助手 · 10:25</span>
-          <div className="rounded-2xl rounded-tl-sm bg-surface-strong px-4 py-2.5 text-sm leading-6 text-foreground">
-            社保卡可通过线上或线下渠道办理。线上可在政务服务 App
-            提交申请，线下可前往社保经办网点。
-          </div>
-        </div>
+        {messages.map((message) => {
+          const time = formatMessageTime(message.timestamp);
+          return message.role === "assistant" ? (
+            <div
+              key={message.id}
+              className="flex max-w-[88%] flex-col gap-1 self-start"
+            >
+              <span className="px-1 text-[11px] text-muted-soft">
+                助手{time ? ` · ${time}` : ""}
+              </span>
+              <div className="whitespace-pre-wrap rounded-2xl rounded-tl-sm bg-surface-strong px-4 py-2.5 text-sm leading-6 text-foreground">
+                {message.content}
+              </div>
+            </div>
+          ) : (
+            <div
+              key={message.id}
+              className="flex max-w-[88%] flex-col items-end gap-1 self-end"
+            >
+              <span className="px-1 text-[11px] text-muted-soft">
+                我{time ? ` · ${time}` : ""}
+              </span>
+              <div className="whitespace-pre-wrap rounded-2xl rounded-tr-sm bg-primary px-4 py-2.5 text-sm leading-6 text-primary-foreground">
+                {message.content}
+              </div>
+            </div>
+          );
+        })}
      </div>
    </div>
  );
--- a/frontend/src/hooks/use-voice-preview.ts
+++ b/frontend/src/hooks/use-voice-preview.ts
@@ -9,6 +9,10 @@
 *   client → {type:"ice-candidate", payload:{pc_id, candidate:{...}}}
 * 音频本身走 WebRTC 媒体流(Opus),不经 ws;后端 TTS 帧从 ontrack 拿到直接播放。
 *
+ * 另开一条 data channel 与后端管线(pipeline.py)互通应用消息:
+ *   client → {type:"user-text",  text}                          文字输入(打断并触发新回复)
+ *   server → {type:"transcript", role, content, timestamp}      用户/助手最终转写(聊天记录)
+ *
 * 纯本机(localhost)即可跑:localhost 是 secure context,麦克风可用,ws 用明文。
 * 局域网/别的设备要 https+wss,见 deploy/README.md。
 */
@@ -19,6 +23,14 @@ import { API_BASE } from "@/lib/api";

 export type VoicePreviewStatus = "idle" | "connecting" | "connected" | "failed";

+export type ChatMessage = {
+  id: string;
+  role: "user" | "assistant";
+  content: string;
+  /** ISO timestamp supplied by the backend (the hook substitutes the client's current time when the message carries none) */
+  timestamp: string;
+};
+
 // http→ws、https→wss,自动跟随 API 基址(同源反代时也对)
 function wsBaseUrl(): string {
  const url = new URL(API_BASE, window.location.origin);
@@ -62,11 +74,14 @@ export function useVoicePreview(assistantId: string | null) {
  const [error, setError] = useState<string | null>(null);
  const [micWarning, setMicWarning] = useState<string | null>(null);
  const [localStream, setLocalStream] = useState<MediaStream | null>(null);
+  const [messages, setMessages] = useState<ChatMessage[]>([]);
  const audioRef = useRef<HTMLAudioElement | null>(null);
  const pcRef = useRef<RTCPeerConnection | null>(null);
  const wsRef = useRef<WebSocket | null>(null);
+  const dataChannelRef = useRef<RTCDataChannel | null>(null);
  const localStreamRef = useRef<MediaStream | null>(null);
  const startingRef = useRef(false);
+  const messageSeqRef = useRef(0);

  const releaseResources = useCallback(() => {
    const ws = wsRef.current;
@@ -78,6 +93,13 @@ export function useVoicePreview(assistantId: string | null) {
      ws.close();
    }

+    const channel = dataChannelRef.current;
+    dataChannelRef.current = null;
+    if (channel) {
+      channel.onmessage = null;
+      channel.close();
+    }
+
    const pc = pcRef.current;
    pcRef.current = null;
    if (pc) {
@@ -122,6 +144,7 @@ export function useVoicePreview(assistantId: string | null) {
    startingRef.current = true;
    setError(null);
    setMicWarning(null);
+    setMessages([]); // 新会话清空上一轮聊天记录
    setStatus("connecting");

    // 麦克风是可选的:获取失败时继续建立仅接收后端音频的 WebRTC 会话。
@@ -213,6 +236,36 @@ export function useVoicePreview(assistantId: string | null) {
        );
      };

+      // Application message channel: receives backend transcripts (chat history) and sends text input.
+      // Created proactively by the browser side; the backend SmallWebRTCConnection's on("datachannel") handler picks it up.
+      const channel = pc.createDataChannel("chat");
+      dataChannelRef.current = channel;
+      channel.onmessage = (event) => {
+        try {
+          const msg = JSON.parse(event.data);
+          if (
+            msg?.type === "transcript" &&
+            (msg.role === "user" || msg.role === "assistant") &&
+            typeof msg.content === "string" &&
+            msg.content.trim()
+          ) {
+            messageSeqRef.current += 1;
+            const next: ChatMessage = {
+              id: `msg-${messageSeqRef.current}`,
+              role: msg.role,
+              content: msg.content,
+              timestamp:
+                typeof msg.timestamp === "string"
+                  ? msg.timestamp
+                  : new Date().toISOString(),
+            };
+            setMessages((prev) => [...prev, next]);
+          }
+        } catch {
+          /* Not JSON / unknown message: ignore */
+        }
+      };
+
      pc.ontrack = (e) => {
        if (e.track.kind === "audio" && audioRef.current) {
          audioRef.current.srcObject =
@@ -268,6 +321,16 @@ export function useVoicePreview(assistantId: string | null) {
    }
  }, [assistantId, fail]);

+  // Send a text message: the backend interrupts the current playback first, then triggers a new reply from the user's input.
+  // Returns true on success; returns false when the text is blank or the channel is not ready (conversation not started / still connecting).
+  const sendText = useCallback((text: string): boolean => {
+    const trimmed = text.trim();
+    const channel = dataChannelRef.current;
+    if (!trimmed || !channel || channel.readyState !== "open") return false;
+    channel.send(JSON.stringify({ type: "user-text", text: trimmed }));
+    return true;
+  }, []);
+
  // 卸载时收尾
  useEffect(() => releaseResources, [releaseResources]);

@@ -276,6 +339,8 @@ export function useVoicePreview(assistantId: string | null) {
    error,
    micWarning,
    localStream,
+    messages,
+    sendText,
    connect,
    disconnect,
    audioRef,