xfyun asr update one bubble

This commit is contained in:
Xin Wang
2026-05-21 17:05:20 +08:00
parent b09ca18cb9
commit 6a59df3dbd
3 changed files with 29 additions and 3 deletions

1
AGENTS.md Normal file
View File

@@ -0,0 +1 @@
Write readable, maintainable, extensible code for a voice agent endpoint with pipecat as engine

View File

@@ -78,6 +78,12 @@ class XfyunASRService(STTService):
self._sent_final_frame = False
self._partials: list[str] = []
self._last_text = ""
# Text already finalized by xfyun within the current VAD turn. xfyun
# may emit several status=2 segments within one turn (e.g. when the
# user pauses briefly); each segment resets `_partials`/`_last_text`,
# but interim frames pushed to clients should still grow
# monotonically across segments. Reset on VADUserStartedSpeakingFrame.
self._turn_committed_text = ""
async def cleanup(self) -> None:
await self._close_utterance()
@@ -95,6 +101,7 @@ class XfyunASRService(STTService):
await super().process_frame(frame, direction)
if isinstance(frame, VADUserStartedSpeakingFrame):
self._turn_committed_text = ""
await self._start_utterance()
elif isinstance(frame, VADUserStoppedSpeakingFrame):
await self._finish_utterance()
@@ -261,7 +268,7 @@ class XfyunASRService(STTService):
self._last_text = text
await self.push_frame(
InterimTranscriptionFrame(
text,
self._turn_committed_text + text,
self._user_id,
time_now_iso8601(),
_language_or_none(self._language),
@@ -273,6 +280,10 @@ class XfyunASRService(STTService):
final_text = self._last_text
if final_text:
self.confirm_finalize()
# Emit just this segment's text. The pipecat user aggregator
# concatenates TranscriptionFrames within a VAD turn, so we
# must NOT prepend `_turn_committed_text` here or the
# aggregated turn text would double-count earlier segments.
await self.push_frame(
TranscriptionFrame(
final_text,
@@ -282,6 +293,9 @@ class XfyunASRService(STTService):
result=payload,
)
)
# Accumulate so the next sub-session's interim frames carry
# the full turn so far (used for client UI display only).
self._turn_committed_text += final_text
await self._close_utterance()
def _apply_recognition_result(self, recognition: dict[str, Any]) -> str:

View File

@@ -538,9 +538,14 @@ function handleUserTranscript(text) {
body.textContent = text;
state.currentUserBubble.classList.remove("bubble--interim");
} else {
addBubble("user", text);
state.currentUserBubble = addBubble("user", text);
}
state.currentUserBubble = null;
// Intentionally keep `state.currentUserBubble` set. Streaming ASRs (e.g.
// xfyun) can emit multiple interim/final cycles within a single dialog
// turn — for example when the user pauses mid-sentence or the upstream
// service segments the utterance. Keeping the bubble open until the
// assistant starts replying (see `handleAssistantStarted` /
// `response.audio.started`) collapses those cycles into one bubble.
}
function handleUserTranscriptInterim(text) {
@@ -590,6 +595,11 @@ function handleAssistantDelta(text) {
function handleAssistantStarted() {
state.currentAssistantBubble = null;
// Close the in-flight user bubble so the next user turn starts a fresh
// one. We do this here (and on `response.audio.started`) rather than on
// every `input.transcript.final`, because streaming ASRs may emit
// several finals within a single dialog turn.
state.currentUserBubble = null;
}
function handleAssistantFinal(text, interrupted) {
@@ -633,6 +643,7 @@ function handleEvent(event) {
break;
case "response.audio.started":
setBotIndicator(true);
state.currentUserBubble = null;
break;
case "response.audio.stopped":
finalizeAssistantBubble();