From 6a59df3dbdfdcffa8f20523a6e5f650b2aba4955 Mon Sep 17 00:00:00 2001
From: Xin Wang <wangx@XindeMac-mini-2.local>
Date: Thu, 21 May 2026 17:05:20 +0800
Subject: [PATCH] xfyun asr update one bubble

---
 AGENTS.md               |  1 +
 engine/xfyun_asr.py     | 16 +++++++++++++++-
 examples/webpage/app.js | 15 +++++++++++++--
 3 files changed, 29 insertions(+), 3 deletions(-)
 create mode 100644 AGENTS.md

diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..f6307a9
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1 @@
+Write readable, maintainable, extensible code for a voice agent endpoint with pipecat as engine
\ No newline at end of file
diff --git a/engine/xfyun_asr.py b/engine/xfyun_asr.py
index 6b5b2d8..bcee840 100644
--- a/engine/xfyun_asr.py
+++ b/engine/xfyun_asr.py
@@ -78,6 +78,12 @@ class XfyunASRService(STTService):
         self._sent_final_frame = False
         self._partials: list[str] = []
         self._last_text = ""
+        # Text already finalized by xfyun within the current VAD turn. xfyun
+        # may emit several status=2 segments within one turn (e.g. when the
+        # user pauses briefly); each segment resets `_partials`/`_last_text`,
+        # but interim frames pushed to clients should still grow
+        # monotonically across segments. Reset on VADUserStartedSpeakingFrame.
+        self._turn_committed_text = ""
 
     async def cleanup(self) -> None:
         await self._close_utterance()
@@ -95,6 +101,7 @@ class XfyunASRService(STTService):
         await super().process_frame(frame, direction)
 
         if isinstance(frame, VADUserStartedSpeakingFrame):
+            self._turn_committed_text = ""
             await self._start_utterance()
         elif isinstance(frame, VADUserStoppedSpeakingFrame):
             await self._finish_utterance()
@@ -261,7 +268,7 @@ class XfyunASRService(STTService):
                 self._last_text = text
                 await self.push_frame(
                     InterimTranscriptionFrame(
-                        text,
+                        self._turn_committed_text + text,
                         self._user_id,
                         time_now_iso8601(),
                         _language_or_none(self._language),
@@ -273,6 +280,10 @@ class XfyunASRService(STTService):
             final_text = self._last_text
             if final_text:
                 self.confirm_finalize()
+                # Emit just this segment's text. The pipecat user aggregator
+                # concatenates TranscriptionFrames within a VAD turn, so we
+                # must NOT prepend `_turn_committed_text` here or the
+                # aggregated turn text would double-count earlier segments.
                 await self.push_frame(
                     TranscriptionFrame(
                         final_text,
@@ -282,6 +293,9 @@ class XfyunASRService(STTService):
                         result=payload,
                     )
                 )
+                # Accumulate so the next sub-session's interim frames carry
+                # the full turn so far (used for client UI display only).
+                self._turn_committed_text += final_text
             await self._close_utterance()
 
     def _apply_recognition_result(self, recognition: dict[str, Any]) -> str:
diff --git a/examples/webpage/app.js b/examples/webpage/app.js
index ec618f4..4b0fcc0 100644
--- a/examples/webpage/app.js
+++ b/examples/webpage/app.js
@@ -538,9 +538,14 @@ function handleUserTranscript(text) {
     body.textContent = text;
     state.currentUserBubble.classList.remove("bubble--interim");
   } else {
-    addBubble("user", text);
+    state.currentUserBubble = addBubble("user", text);
   }
-  state.currentUserBubble = null;
+  // Intentionally keep `state.currentUserBubble` set. Streaming ASRs (e.g.
+  // xfyun) can emit multiple interim/final cycles within a single dialog
+  // turn — for example when the user pauses mid-sentence or the upstream
+  // service segments the utterance. Keeping the bubble open until the
+  // assistant starts replying (see `handleAssistantStarted` /
+  // `response.audio.started`) collapses those cycles into one bubble.
 }
 
 function handleUserTranscriptInterim(text) {
@@ -590,6 +595,11 @@ function handleAssistantDelta(text) {
 
 function handleAssistantStarted() {
   state.currentAssistantBubble = null;
+  // Close the in-flight user bubble so the next user turn starts a fresh
+  // one. We do this here (and on `response.audio.started`) rather than on
+  // every `input.transcript.final`, because streaming ASRs may emit
+  // several finals within a single dialog turn.
+  state.currentUserBubble = null;
 }
 
 function handleAssistantFinal(text, interrupted) {
@@ -633,6 +643,7 @@ function handleEvent(event) {
       break;
     case "response.audio.started":
       setBotIndicator(true);
+      state.currentUserBubble = null;
       break;
     case "response.audio.stopped":
       finalizeAssistantBubble();