Refactor duplicate stream tts adapter

2026-02-09 16:28:59 +08:00
parent 11016c04da
commit 29d8361ca9
5 changed files with 196 additions and 197 deletions
--- a/engine/core/duplex_pipeline.py
+++ b/engine/core/duplex_pipeline.py
@@ -30,6 +30,7 @@ from services.base import BaseASRService, BaseLLMService, BaseTTSService
 from services.llm import MockLLMService, OpenAILLMService
 from services.siliconflow_asr import SiliconFlowASRService
 from services.siliconflow_tts import SiliconFlowTTSService
+from services.streaming_text import extract_tts_sentence, has_spoken_content
 from services.tts import EdgeTTSService, MockTTSService


@@ -529,7 +530,15 @@ class DuplexPipeline:

                # Check for sentence completion - synthesize immediately for low latency
                while True:
-                    split_result = self._extract_tts_sentence(sentence_buffer, force=False)
+                    split_result = extract_tts_sentence(
+                        sentence_buffer,
+                        end_chars=self._SENTENCE_END_CHARS,
+                        trailing_chars=self._SENTENCE_TRAILING_CHARS,
+                        closers=self._SENTENCE_CLOSERS,
+                        min_split_spoken_chars=self._MIN_SPLIT_SPOKEN_CHARS,
+                        hold_trailing_at_buffer_end=True,
+                        force=False,
+                    )
                    if not split_result:
                        break
                    sentence, sentence_buffer = split_result
@@ -542,7 +551,7 @@ class DuplexPipeline:
                        continue

                    # Avoid synthesizing punctuation-only fragments (e.g. standalone "!")
-                    if not self._has_spoken_content(sentence):
+                    if not has_spoken_content(sentence):
                        pending_punctuation = sentence
                        continue

@@ -576,7 +585,7 @@ class DuplexPipeline:

            # Speak any remaining text
            remaining_text = f"{pending_punctuation}{sentence_buffer}".strip()
-            if remaining_text and self._has_spoken_content(remaining_text) and not self._interrupt_event.is_set():
+            if remaining_text and has_spoken_content(remaining_text) and not self._interrupt_event.is_set():
                if not first_audio_sent:
                    await self.transport.send_event({
                        **ev(
@@ -618,84 +627,6 @@ class DuplexPipeline:
            self._barge_in_speech_frames = 0
            self._barge_in_silence_frames = 0

-    def _extract_tts_sentence(self, text_buffer: str, force: bool = False) -> Optional[tuple[str, str]]:
-        """
-        Extract one TTS sentence from the buffer.
-
-        Consecutive sentence terminators are grouped together to avoid creating
-        punctuation-only fragments such as a standalone "!" after "?". By
-        default, trailing terminator at buffer end is held for more context.
-        """
-        if not text_buffer:
-            return None
-
-        search_start = 0
-        while True:
-            split_idx = -1
-            for idx in range(search_start, len(text_buffer)):
-                char = text_buffer[idx]
-                if char == "." and self._is_non_sentence_period(text_buffer, idx):
-                    continue
-                if char in self._SENTENCE_END_CHARS:
-                    split_idx = idx
-                    break
-
-            if split_idx == -1:
-                return None
-
-            end_idx = split_idx + 1
-            while end_idx < len(text_buffer) and text_buffer[end_idx] in self._SENTENCE_TRAILING_CHARS:
-                end_idx += 1
-
-            # Include trailing quote/bracket closers in the same segment.
-            while end_idx < len(text_buffer) and text_buffer[end_idx] in self._SENTENCE_CLOSERS:
-                end_idx += 1
-
-            if not force and end_idx >= len(text_buffer):
-                return None
-
-            sentence = text_buffer[:end_idx].strip()
-            spoken_chars = sum(1 for ch in sentence if ch.isalnum())
-
-            # Keep short utterances (e.g. "好。", "OK.") merged with following text.
-            if (
-                not force
-                and 0 < spoken_chars < self._MIN_SPLIT_SPOKEN_CHARS
-                and end_idx < len(text_buffer)
-            ):
-                search_start = end_idx
-                continue
-
-            remainder = text_buffer[end_idx:]
-            return sentence, remainder
-
-    def _has_spoken_content(self, text: str) -> bool:
-        """Check whether text contains pronounceable content (not punctuation-only)."""
-        return any(char.isalnum() for char in text)
-
-    def _is_non_sentence_period(self, text: str, idx: int) -> bool:
-        """Check whether '.' should NOT be treated as a sentence delimiter."""
-        if text[idx] != ".":
-            return False
-
-        # Decimal/version segment: 1.2, v1.2.3
-        if idx > 0 and idx < len(text) - 1 and text[idx - 1].isdigit() and text[idx + 1].isdigit():
-            return True
-
-        # Number abbreviations: No.1 / No. 1
-        left_start = idx - 1
-        while left_start >= 0 and text[left_start].isalpha():
-            left_start -= 1
-        left_token = text[left_start + 1:idx].lower()
-        if left_token == "no":
-            j = idx + 1
-            while j < len(text) and text[j].isspace():
-                j += 1
-            if j < len(text) and text[j].isdigit():
-                return True
-
-        return False
-
    async def _speak_sentence(self, text: str, fade_in_ms: int = 0, fade_out_ms: int = 8) -> None:
        """
        Synthesize and send a single sentence.