update xfyun asr

2026-05-21 20:48:24 +08:00
parent 872b8ac64a
commit 2ecb3ef506
1 changed files with 22 additions and 5 deletions
--- a/engine/xfyun_asr.py
+++ b/engine/xfyun_asr.py
@@ -21,6 +21,7 @@ from pipecat.frames.frames import (
    Frame,
    InterimTranscriptionFrame,
    TranscriptionFrame,
+    UserStoppedSpeakingFrame,
    VADUserStartedSpeakingFrame,
    VADUserStoppedSpeakingFrame,
 )
@@ -78,11 +79,17 @@ class XfyunASRService(STTService):
        self._sent_final_frame = False
        self._partials: list[str] = []
        self._last_text = ""
-        # Text already finalized by xfyun within the current VAD turn. xfyun
-        # may emit several status=2 segments within one turn (e.g. when the
-        # user pauses briefly); each segment resets `_partials`/`_last_text`,
+        # Text already finalized by xfyun within the current aggregator turn.
+        # xfyun may emit several status=2 segments within one turn (brief
+        # user pauses, or the engine's VAD stopping/restarting before the
+        # turn timeout fires); each segment resets `_partials`/`_last_text`,
        # but interim frames pushed to clients should still grow
-        # monotonically across segments. Reset on VADUserStartedSpeakingFrame.
+        # monotonically across segments. Reset on the aggregator-level
+        # `UserStoppedSpeakingFrame` (broadcast by the user aggregator once
+        # per turn at turn end) — NOT on `VADUserStartedSpeakingFrame`,
+        # which fires on every brief resume within the same turn and would
+        # clobber the accumulator mid-utterance, making the bubble appear
+        # to "un-stream" backwards.
        self._turn_committed_text = ""

    async def cleanup(self) -> None:
@@ -100,8 +107,18 @@ class XfyunASRService(STTService):
    async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
        await super().process_frame(frame, direction)

-        if isinstance(frame, VADUserStartedSpeakingFrame):
+        if isinstance(frame, UserStoppedSpeakingFrame):
+            # Aggregator-level turn end (broadcast by the user aggregator
+            # once per turn). At this point the xfyun websocket has already
+            # been closed via `_finish_utterance` on VAD stop, so it's safe
+            # to clear the cross-segment text accumulator here. Doing it on
+            # turn end rather than turn start avoids a frame-ordering race
+            # where the first interim of a new turn could be prefixed with
+            # the previous turn's committed text (the aggregator broadcasts
+            # `UserStartedSpeakingFrame` only after `VADUserStartedSpeakingFrame`
+            # has already propagated through this processor).
            self._turn_committed_text = ""
+        elif isinstance(frame, VADUserStartedSpeakingFrame):
            await self._start_utterance()
        elif isinstance(frame, VADUserStoppedSpeakingFrame):
            await self._finish_utterance()