update xfyun asr

This commit is contained in:
Xin Wang
2026-05-21 20:48:24 +08:00
parent 872b8ac64a
commit 2ecb3ef506

View File

@@ -21,6 +21,7 @@ from pipecat.frames.frames import (
Frame,
InterimTranscriptionFrame,
TranscriptionFrame,
UserStoppedSpeakingFrame,
VADUserStartedSpeakingFrame,
VADUserStoppedSpeakingFrame,
)
@@ -78,11 +79,17 @@ class XfyunASRService(STTService):
self._sent_final_frame = False
self._partials: list[str] = []
self._last_text = ""
# Text already finalized by xfyun within the current VAD turn. xfyun
# may emit several status=2 segments within one turn (e.g. when the
# user pauses briefly); each segment resets `_partials`/`_last_text`,
# Text already finalized by xfyun within the current aggregator turn.
# xfyun may emit several status=2 segments within one turn (brief
# user pauses, or the engine's VAD stopping/restarting before the
# turn timeout fires); each segment resets `_partials`/`_last_text`,
# but interim frames pushed to clients should still grow
# monotonically across segments. Reset on VADUserStartedSpeakingFrame.
# monotonically across segments. Reset on the aggregator-level
# `UserStoppedSpeakingFrame` (broadcast by the user aggregator once
# per turn at turn end) — NOT on `VADUserStartedSpeakingFrame`,
# which fires on every brief resume within the same turn and would
# clobber the accumulator mid-utterance, making the bubble appear
# to "un-stream" backwards.
self._turn_committed_text = ""
async def cleanup(self) -> None:
@@ -100,8 +107,18 @@ class XfyunASRService(STTService):
async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
await super().process_frame(frame, direction)
if isinstance(frame, VADUserStartedSpeakingFrame):
if isinstance(frame, UserStoppedSpeakingFrame):
# Aggregator-level turn end (broadcast by the user aggregator
# once per turn). At this point the xfyun websocket has already
# been closed via `_finish_utterance` on VAD stop, so it's safe
# to clear the cross-segment text accumulator here. Doing it on
# turn end rather than turn start avoids a frame-ordering race
# where the first interim of a new turn could be prefixed with
# the previous turn's committed text (the aggregator broadcasts
# `UserStartedSpeakingFrame` only after `VADUserStartedSpeakingFrame`
# has already propagated through this processor).
self._turn_committed_text = ""
elif isinstance(frame, VADUserStartedSpeakingFrame):
await self._start_utterance()
elif isinstance(frame, VADUserStoppedSpeakingFrame):
await self._finish_utterance()