update xfyun asr
This commit is contained in:
@@ -21,6 +21,7 @@ from pipecat.frames.frames import (
|
||||
Frame,
|
||||
InterimTranscriptionFrame,
|
||||
TranscriptionFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
VADUserStartedSpeakingFrame,
|
||||
VADUserStoppedSpeakingFrame,
|
||||
)
|
||||
@@ -78,11 +79,17 @@ class XfyunASRService(STTService):
|
||||
self._sent_final_frame = False
|
||||
self._partials: list[str] = []
|
||||
self._last_text = ""
|
||||
# Text already finalized by xfyun within the current VAD turn. xfyun
|
||||
# may emit several status=2 segments within one turn (e.g. when the
|
||||
# user pauses briefly); each segment resets `_partials`/`_last_text`,
|
||||
# Text already finalized by xfyun within the current aggregator turn.
|
||||
# xfyun may emit several status=2 segments within one turn (brief
|
||||
# user pauses, or the engine's VAD stopping/restarting before the
|
||||
# turn timeout fires); each segment resets `_partials`/`_last_text`,
|
||||
# but interim frames pushed to clients should still grow
|
||||
# monotonically across segments. Reset on VADUserStartedSpeakingFrame.
|
||||
# monotonically across segments. Reset on the aggregator-level
|
||||
# `UserStoppedSpeakingFrame` (broadcast by the user aggregator once
|
||||
# per turn at turn end) — NOT on `VADUserStartedSpeakingFrame`,
|
||||
# which fires on every brief resume within the same turn and would
|
||||
# clobber the accumulator mid-utterance, making the bubble appear
|
||||
# to "un-stream" backwards.
|
||||
self._turn_committed_text = ""
|
||||
|
||||
async def cleanup(self) -> None:
|
||||
@@ -100,8 +107,18 @@ class XfyunASRService(STTService):
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, VADUserStartedSpeakingFrame):
|
||||
if isinstance(frame, UserStoppedSpeakingFrame):
|
||||
# Aggregator-level turn end (broadcast by the user aggregator
|
||||
# once per turn). At this point the xfyun websocket has already
|
||||
# been closed via `_finish_utterance` on VAD stop, so it's safe
|
||||
# to clear the cross-segment text accumulator here. Doing it on
|
||||
# turn end rather than turn start avoids a frame-ordering race
|
||||
# where the first interim of a new turn could be prefixed with
|
||||
# the previous turn's committed text (the aggregator broadcasts
|
||||
# `UserStartedSpeakingFrame` only after `VADUserStartedSpeakingFrame`
|
||||
# has already propagated through this processor).
|
||||
self._turn_committed_text = ""
|
||||
elif isinstance(frame, VADUserStartedSpeakingFrame):
|
||||
await self._start_utterance()
|
||||
elif isinstance(frame, VADUserStoppedSpeakingFrame):
|
||||
await self._finish_utterance()
|
||||
|
||||
Reference in New Issue
Block a user