From 2ecb3ef506da0d0392b1b424e3733d80f67cab2b Mon Sep 17 00:00:00 2001
From: Xin Wang <wangx@XindeMac-mini-2.local>
Date: Thu, 21 May 2026 20:48:24 +0800
Subject: [PATCH] update xfyun asr

---
 engine/xfyun_asr.py | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/engine/xfyun_asr.py b/engine/xfyun_asr.py
index bcee840..07d9260 100644
--- a/engine/xfyun_asr.py
+++ b/engine/xfyun_asr.py
@@ -21,6 +21,7 @@ from pipecat.frames.frames import (
     Frame,
     InterimTranscriptionFrame,
     TranscriptionFrame,
+    UserStoppedSpeakingFrame,
     VADUserStartedSpeakingFrame,
     VADUserStoppedSpeakingFrame,
 )
@@ -78,11 +79,17 @@ class XfyunASRService(STTService):
         self._sent_final_frame = False
         self._partials: list[str] = []
         self._last_text = ""
-        # Text already finalized by xfyun within the current VAD turn. xfyun
-        # may emit several status=2 segments within one turn (e.g. when the
-        # user pauses briefly); each segment resets `_partials`/`_last_text`,
+        # Text already finalized by xfyun within the current aggregator turn.
+        # xfyun may emit several status=2 segments within one turn (brief
+        # user pauses, or the engine's VAD stopping/restarting before the
+        # turn timeout fires); each segment resets `_partials`/`_last_text`,
         # but interim frames pushed to clients should still grow
-        # monotonically across segments. Reset on VADUserStartedSpeakingFrame.
+        # monotonically across segments. Reset on the aggregator-level
+        # `UserStoppedSpeakingFrame` (broadcast by the user aggregator once
+        # per turn at turn end) — NOT on `VADUserStartedSpeakingFrame`,
+        # which fires on every brief resume within the same turn and would
+        # clobber the accumulator mid-utterance, making the bubble appear
+        # to "un-stream" backwards.
         self._turn_committed_text = ""
 
     async def cleanup(self) -> None:
@@ -100,8 +107,18 @@ class XfyunASRService(STTService):
     async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
         await super().process_frame(frame, direction)
 
-        if isinstance(frame, VADUserStartedSpeakingFrame):
+        if isinstance(frame, UserStoppedSpeakingFrame):
+            # Aggregator-level turn end (broadcast by the user aggregator
+            # once per turn). At this point the xfyun websocket has already
+            # been closed via `_finish_utterance` on VAD stop, so it's safe
+            # to clear the cross-segment text accumulator here. Doing it on
+            # turn end rather than turn start avoids a frame-ordering race
+            # where the first interim of a new turn could be prefixed with
+            # the previous turn's committed text (the aggregator broadcasts
+            # `UserStartedSpeakingFrame` only after `VADUserStartedSpeakingFrame`
+            # has already propagated through this processor).
             self._turn_committed_text = ""
+        elif isinstance(frame, VADUserStartedSpeakingFrame):
             await self._start_utterance()
         elif isinstance(frame, VADUserStoppedSpeakingFrame):
             await self._finish_utterance()