From 2ecb3ef506da0d0392b1b424e3733d80f67cab2b Mon Sep 17 00:00:00 2001 From: Xin Wang Date: Thu, 21 May 2026 20:48:24 +0800 Subject: [PATCH] update xfyun asr --- engine/xfyun_asr.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/engine/xfyun_asr.py b/engine/xfyun_asr.py index bcee840..07d9260 100644 --- a/engine/xfyun_asr.py +++ b/engine/xfyun_asr.py @@ -21,6 +21,7 @@ from pipecat.frames.frames import ( Frame, InterimTranscriptionFrame, TranscriptionFrame, + UserStoppedSpeakingFrame, VADUserStartedSpeakingFrame, VADUserStoppedSpeakingFrame, ) @@ -78,11 +79,17 @@ class XfyunASRService(STTService): self._sent_final_frame = False self._partials: list[str] = [] self._last_text = "" - # Text already finalized by xfyun within the current VAD turn. xfyun - # may emit several status=2 segments within one turn (e.g. when the - # user pauses briefly); each segment resets `_partials`/`_last_text`, + # Text already finalized by xfyun within the current aggregator turn. + # xfyun may emit several status=2 segments within one turn (brief + # user pauses, or the engine's VAD stopping/restarting before the + # turn timeout fires); each segment resets `_partials`/`_last_text`, # but interim frames pushed to clients should still grow - # monotonically across segments. Reset on VADUserStartedSpeakingFrame. + # monotonically across segments. Reset on the aggregator-level + # `UserStoppedSpeakingFrame` (broadcast by the user aggregator once + # per turn at turn end) — NOT on `VADUserStartedSpeakingFrame`, + # which fires on every brief resume within the same turn and would + # clobber the accumulator mid-utterance, making the bubble appear + # to "un-stream" backwards. self._turn_committed_text = "" async def cleanup(self) -> None: @@ -100,8 +107,18 @@ class XfyunASRService(STTService): async def process_frame(self, frame: Frame, direction: FrameDirection) -> None: await super().process_frame(frame, direction) - if isinstance(frame, VADUserStartedSpeakingFrame): + if isinstance(frame, UserStoppedSpeakingFrame): + # Aggregator-level turn end (broadcast by the user aggregator + # once per turn). At this point the xfyun websocket has already + # been closed via `_finish_utterance` on VAD stop, so it's safe + # to clear the cross-segment text accumulator here. Doing it on + # turn end rather than turn start avoids a frame-ordering race + # where the first interim of a new turn could be prefixed with + # the previous turn's committed text (the aggregator broadcasts + # `UserStartedSpeakingFrame` only after `VADUserStartedSpeakingFrame` + # has already propagated through this processor). self._turn_committed_text = "" + elif isinstance(frame, VADUserStartedSpeakingFrame): await self._start_utterance() elif isinstance(frame, VADUserStoppedSpeakingFrame): await self._finish_utterance()