From 6a59df3dbdfdcffa8f20523a6e5f650b2aba4955 Mon Sep 17 00:00:00 2001 From: Xin Wang Date: Thu, 21 May 2026 17:05:20 +0800 Subject: [PATCH] xfyun asr update one bubble --- AGENTS.md | 1 + engine/xfyun_asr.py | 16 +++++++++++++++- examples/webpage/app.js | 15 +++++++++++++-- 3 files changed, 29 insertions(+), 3 deletions(-) create mode 100644 AGENTS.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..f6307a9 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1 @@ +Write readable, maintainable, extensible code for a voice agent endpoint with pipecat as engine \ No newline at end of file diff --git a/engine/xfyun_asr.py b/engine/xfyun_asr.py index 6b5b2d8..bcee840 100644 --- a/engine/xfyun_asr.py +++ b/engine/xfyun_asr.py @@ -78,6 +78,12 @@ class XfyunASRService(STTService): self._sent_final_frame = False self._partials: list[str] = [] self._last_text = "" + # Text already finalized by xfyun within the current VAD turn. xfyun + # may emit several status=2 segments within one turn (e.g. when the + # user pauses briefly); each segment resets `_partials`/`_last_text`, + # but interim frames pushed to clients should still grow + # monotonically across segments. Reset on VADUserStartedSpeakingFrame. + self._turn_committed_text = "" async def cleanup(self) -> None: await self._close_utterance() @@ -95,6 +101,7 @@ class XfyunASRService(STTService): await super().process_frame(frame, direction) if isinstance(frame, VADUserStartedSpeakingFrame): + self._turn_committed_text = "" await self._start_utterance() elif isinstance(frame, VADUserStoppedSpeakingFrame): await self._finish_utterance() @@ -261,7 +268,7 @@ class XfyunASRService(STTService): self._last_text = text await self.push_frame( InterimTranscriptionFrame( - text, + self._turn_committed_text + text, self._user_id, time_now_iso8601(), _language_or_none(self._language), @@ -273,6 +280,10 @@ class XfyunASRService(STTService): final_text = self._last_text if final_text: self.confirm_finalize() + # Emit just this segment's text. The pipecat user aggregator + # concatenates TranscriptionFrames within a VAD turn, so we + # must NOT prepend `_turn_committed_text` here or the + # aggregated turn text would double-count earlier segments. await self.push_frame( TranscriptionFrame( final_text, @@ -282,6 +293,9 @@ class XfyunASRService(STTService): result=payload, ) ) + # Accumulate so the next sub-session's interim frames carry + # the full turn so far (used for client UI display only). + self._turn_committed_text += final_text await self._close_utterance() def _apply_recognition_result(self, recognition: dict[str, Any]) -> str: diff --git a/examples/webpage/app.js b/examples/webpage/app.js index ec618f4..4b0fcc0 100644 --- a/examples/webpage/app.js +++ b/examples/webpage/app.js @@ -538,9 +538,14 @@ function handleUserTranscript(text) { body.textContent = text; state.currentUserBubble.classList.remove("bubble--interim"); } else { - addBubble("user", text); + state.currentUserBubble = addBubble("user", text); } - state.currentUserBubble = null; + // Intentionally keep `state.currentUserBubble` set. Streaming ASRs (e.g. + // xfyun) can emit multiple interim/final cycles within a single dialog + // turn — for example when the user pauses mid-sentence or the upstream + // service segments the utterance. Keeping the bubble open until the + // assistant starts replying (see `handleAssistantStarted` / + // `response.audio.started`) collapses those cycles into one bubble. } function handleUserTranscriptInterim(text) { @@ -590,6 +595,11 @@ function handleAssistantDelta(text) { function handleAssistantStarted() { state.currentAssistantBubble = null; + // Close the in-flight user bubble so the next user turn starts a fresh + // one. We do this here (and on `response.audio.started`) rather than on + // every `input.transcript.final`, because streaming ASRs may emit + // several finals within a single dialog turn. + state.currentUserBubble = null; } function handleAssistantFinal(text, interrupted) { @@ -633,6 +643,7 @@ function handleEvent(event) { break; case "response.audio.started": setBotIndicator(true); + state.currentUserBubble = null; break; case "response.audio.stopped": finalizeAssistantBubble();