Refactor duplicate stream tts adapter
This commit is contained in:
@@ -30,6 +30,7 @@ from services.base import BaseASRService, BaseLLMService, BaseTTSService
|
||||
from services.llm import MockLLMService, OpenAILLMService
|
||||
from services.siliconflow_asr import SiliconFlowASRService
|
||||
from services.siliconflow_tts import SiliconFlowTTSService
|
||||
from services.streaming_text import extract_tts_sentence, has_spoken_content
|
||||
from services.tts import EdgeTTSService, MockTTSService
|
||||
|
||||
|
||||
@@ -529,7 +530,15 @@ class DuplexPipeline:
|
||||
|
||||
# Check for sentence completion - synthesize immediately for low latency
|
||||
while True:
|
||||
split_result = self._extract_tts_sentence(sentence_buffer, force=False)
|
||||
split_result = extract_tts_sentence(
|
||||
sentence_buffer,
|
||||
end_chars=self._SENTENCE_END_CHARS,
|
||||
trailing_chars=self._SENTENCE_TRAILING_CHARS,
|
||||
closers=self._SENTENCE_CLOSERS,
|
||||
min_split_spoken_chars=self._MIN_SPLIT_SPOKEN_CHARS,
|
||||
hold_trailing_at_buffer_end=True,
|
||||
force=False,
|
||||
)
|
||||
if not split_result:
|
||||
break
|
||||
sentence, sentence_buffer = split_result
|
||||
@@ -542,7 +551,7 @@ class DuplexPipeline:
|
||||
continue
|
||||
|
||||
# Avoid synthesizing punctuation-only fragments (e.g. standalone "!")
|
||||
if not self._has_spoken_content(sentence):
|
||||
if not has_spoken_content(sentence):
|
||||
pending_punctuation = sentence
|
||||
continue
|
||||
|
||||
@@ -576,7 +585,7 @@ class DuplexPipeline:
|
||||
|
||||
# Speak any remaining text
|
||||
remaining_text = f"{pending_punctuation}{sentence_buffer}".strip()
|
||||
if remaining_text and self._has_spoken_content(remaining_text) and not self._interrupt_event.is_set():
|
||||
if remaining_text and has_spoken_content(remaining_text) and not self._interrupt_event.is_set():
|
||||
if not first_audio_sent:
|
||||
await self.transport.send_event({
|
||||
**ev(
|
||||
@@ -618,84 +627,6 @@ class DuplexPipeline:
|
||||
self._barge_in_speech_frames = 0
|
||||
self._barge_in_silence_frames = 0
|
||||
|
||||
def _extract_tts_sentence(self, text_buffer: str, force: bool = False) -> Optional[tuple[str, str]]:
|
||||
"""
|
||||
Extract one TTS sentence from the buffer.
|
||||
|
||||
Consecutive sentence terminators are grouped together to avoid creating
|
||||
punctuation-only fragments such as a standalone "!" after "?". By
|
||||
default, trailing terminator at buffer end is held for more context.
|
||||
"""
|
||||
if not text_buffer:
|
||||
return None
|
||||
|
||||
search_start = 0
|
||||
while True:
|
||||
split_idx = -1
|
||||
for idx in range(search_start, len(text_buffer)):
|
||||
char = text_buffer[idx]
|
||||
if char == "." and self._is_non_sentence_period(text_buffer, idx):
|
||||
continue
|
||||
if char in self._SENTENCE_END_CHARS:
|
||||
split_idx = idx
|
||||
break
|
||||
|
||||
if split_idx == -1:
|
||||
return None
|
||||
|
||||
end_idx = split_idx + 1
|
||||
while end_idx < len(text_buffer) and text_buffer[end_idx] in self._SENTENCE_TRAILING_CHARS:
|
||||
end_idx += 1
|
||||
|
||||
# Include trailing quote/bracket closers in the same segment.
|
||||
while end_idx < len(text_buffer) and text_buffer[end_idx] in self._SENTENCE_CLOSERS:
|
||||
end_idx += 1
|
||||
|
||||
if not force and end_idx >= len(text_buffer):
|
||||
return None
|
||||
|
||||
sentence = text_buffer[:end_idx].strip()
|
||||
spoken_chars = sum(1 for ch in sentence if ch.isalnum())
|
||||
|
||||
# Keep short utterances (e.g. "好。", "OK.") merged with following text.
|
||||
if (
|
||||
not force
|
||||
and 0 < spoken_chars < self._MIN_SPLIT_SPOKEN_CHARS
|
||||
and end_idx < len(text_buffer)
|
||||
):
|
||||
search_start = end_idx
|
||||
continue
|
||||
|
||||
remainder = text_buffer[end_idx:]
|
||||
return sentence, remainder
|
||||
|
||||
def _has_spoken_content(self, text: str) -> bool:
|
||||
"""Check whether text contains pronounceable content (not punctuation-only)."""
|
||||
return any(char.isalnum() for char in text)
|
||||
|
||||
def _is_non_sentence_period(self, text: str, idx: int) -> bool:
|
||||
"""Check whether '.' should NOT be treated as a sentence delimiter."""
|
||||
if text[idx] != ".":
|
||||
return False
|
||||
|
||||
# Decimal/version segment: 1.2, v1.2.3
|
||||
if idx > 0 and idx < len(text) - 1 and text[idx - 1].isdigit() and text[idx + 1].isdigit():
|
||||
return True
|
||||
|
||||
# Number abbreviations: No.1 / No. 1
|
||||
left_start = idx - 1
|
||||
while left_start >= 0 and text[left_start].isalpha():
|
||||
left_start -= 1
|
||||
left_token = text[left_start + 1:idx].lower()
|
||||
if left_token == "no":
|
||||
j = idx + 1
|
||||
while j < len(text) and text[j].isspace():
|
||||
j += 1
|
||||
if j < len(text) and text[j].isdigit():
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
async def _speak_sentence(self, text: str, fade_in_ms: int = 0, fade_out_ms: int = 8) -> None:
|
||||
"""
|
||||
Synthesize and send a single sentence.
|
||||
|
||||
Reference in New Issue
Block a user