Update xfyun asr service

This commit is contained in:
Xin Wang
2026-05-22 08:37:31 +08:00
parent c0fb0c51a2
commit dab2d0593e
3 changed files with 27 additions and 48 deletions

View File

@@ -91,6 +91,7 @@ class STTConfig:
encoding: str = "raw"
frame_size: int = 1280
timeout_sec: float = 10.0
dynamic_correction: bool = False
@dataclass(frozen=True)

View File

@@ -32,6 +32,7 @@ def create_stt_service(config: STTConfig, audio: AudioConfig | None = None):
encoding=config.encoding,
frame_size=config.frame_size,
open_timeout=config.timeout_sec,
dynamic_correction=config.dynamic_correction,
)
_require_provider(config.provider, "openai", "stt")

View File

@@ -17,7 +17,6 @@ from loguru import logger
from pipecat.frames.frames import (
CancelFrame,
EndFrame,
ErrorFrame,
Frame,
InterimTranscriptionFrame,
TranscriptionFrame,
@@ -53,6 +52,7 @@ class XfyunASRService(STTService):
encoding: str = "raw",
frame_size: int = 1280,
open_timeout: float = 10.0,
dynamic_correction: bool = False,
**kwargs,
) -> None:
super().__init__(
@@ -70,6 +70,7 @@ class XfyunASRService(STTService):
self._encoding = encoding
self._frame_size = frame_size
self._open_timeout = open_timeout
self._dynamic_correction = dynamic_correction
self._websocket = None
self._receive_task = None
@@ -79,11 +80,6 @@ class XfyunASRService(STTService):
self._finalizing_turn = False
self._partials: list[str] = []
self._last_text = ""
# Text already emitted as TranscriptionFrame deltas for Pipecat's
# user-turn strategy and context aggregator. The xfyun websocket now
# spans a full logical user turn, so UI interim frames can use
# `_last_text` directly while this cursor prevents duplicate context.
self._turn_transcription_text = ""
async def cleanup(self) -> None:
await self._close_utterance()
@@ -135,7 +131,6 @@ class XfyunASRService(STTService):
self._audio_buffer.clear()
self._partials = []
self._last_text = ""
self._turn_transcription_text = ""
self._sent_first_frame = False
self._sent_final_frame = False
@@ -206,13 +201,17 @@ class XfyunASRService(STTService):
return
if not self._sent_first_frame:
business = {
"language": self._language,
"domain": self._domain,
"accent": self._accent,
}
if self._dynamic_correction:
business["dwa"] = "wpgs"
payload = {
"common": {"app_id": self._app_id},
"business": {
"language": self._language,
"domain": self._domain,
"accent": self._accent,
},
"business": business,
"data": {
"status": 0,
"format": f"audio/L16;rate={self.sample_rate}",
@@ -266,12 +265,13 @@ class XfyunASRService(STTService):
if not isinstance(data, dict):
return
is_final_response = data.get("status") == 2
recognition = data.get("result")
if isinstance(recognition, dict):
text = self._apply_recognition_result(recognition)
if text and text != self._last_text:
self._last_text = text
if not self._finalizing_turn:
if not self._finalizing_turn and not is_final_response:
await self.push_frame(
InterimTranscriptionFrame(
text,
@@ -281,51 +281,28 @@ class XfyunASRService(STTService):
result=payload,
)
)
await self._push_transcription_delta(text, result=payload, finalized=False)
if data.get("status") == 2:
if is_final_response:
final_text = self._last_text
if final_text and not self._finalizing_turn:
if final_text:
self.confirm_finalize()
await self._push_transcription_delta(final_text, result=payload, finalized=True)
await self.push_frame(
TranscriptionFrame(
final_text,
self._user_id,
time_now_iso8601(),
_language_or_none(self._language),
result=payload,
)
)
await self._close_utterance()
async def _push_transcription_delta(
self,
text: str,
*,
result: dict[str, Any],
finalized: bool,
) -> None:
if text.startswith(self._turn_transcription_text):
delta = text[len(self._turn_transcription_text) :]
else:
logger.debug(
"Xfyun transcript replacement is not append-only; "
"continuing with the new suffix for turn aggregation"
)
delta = text
if not delta and not finalized:
return
self._turn_transcription_text = text
await self.push_frame(
TranscriptionFrame(
delta,
self._user_id,
time_now_iso8601(),
_language_or_none(self._language),
result=result,
)
)
def _apply_recognition_result(self, recognition: dict[str, Any]) -> str:
partial = _extract_text_from_result(recognition)
if not partial:
return self._last_text
if recognition.get("pgs") == "rpl" and recognition.get("rg"):
if self._dynamic_correction and recognition.get("pgs") == "rpl" and recognition.get("rg"):
start, end = recognition["rg"]
if 1 <= start <= len(self._partials):
self._partials[start - 1 : end] = [partial]