Improve tts with codex extra high

2026-02-09 15:04:34 +08:00
parent a42dd4c712
commit cd68ebe306
3 changed files with 164 additions and 56 deletions
--- a/engine/services/siliconflow_tts.py
+++ b/engine/services/siliconflow_tts.py
@@ -134,6 +134,7 @@ class SiliconFlowTTSService(BaseTTSService):
                # Stream audio chunks
                chunk_size = self.sample_rate * 2 // 10  # 100ms chunks
                buffer = b""
+                pending_chunk = None
                
                async for chunk in response.content.iter_any():
                    if self._cancel_event.is_set():
@@ -146,14 +147,34 @@ class SiliconFlowTTSService(BaseTTSService):
                    while len(buffer) >= chunk_size:
                        audio_chunk = buffer[:chunk_size]
                        buffer = buffer[chunk_size:]
-                        
+
+                        # Hold back one full chunk so that, when the stream length is an exact
+                        # multiple of chunk_size, the last full chunk can be tagged is_final=True.
+                        if pending_chunk is not None:
+                            yield TTSChunk(
+                                audio=pending_chunk,
+                                sample_rate=self.sample_rate,
+                                is_final=False
+                            )
+                        pending_chunk = audio_chunk
+                
+                # Flush the single held-back chunk (if any); the partial tail follows below.
+                if pending_chunk is not None:
+                    if buffer:
                        yield TTSChunk(
-                            audio=audio_chunk,
+                            audio=pending_chunk,
                            sample_rate=self.sample_rate,
                            is_final=False
                        )
-                
-                # Yield remaining buffer
+                        pending_chunk = None
+                    else:
+                        yield TTSChunk(
+                            audio=pending_chunk,
+                            sample_rate=self.sample_rate,
+                            is_final=True
+                        )
+                        pending_chunk = None
+
                if buffer:
                    yield TTSChunk(
                        audio=buffer,
@@ -182,7 +203,7 @@ class StreamingTTSAdapter:
    """
    
    # Sentence delimiters
-    SENTENCE_ENDS = {'。', '！', '？', '\n'}
+    SENTENCE_ENDS = {'。', '！', '？', '.', '!', '?', '\n'}
    
    def __init__(self, tts_service: BaseTTSService, transport, session_id: str):
        self.tts_service = tts_service
@@ -205,15 +226,24 @@ class StreamingTTSAdapter:
        self._buffer += text_chunk
        
        # Check for sentence completion
-        for i, char in enumerate(self._buffer):
-            if char in self.SENTENCE_ENDS:
-                # Found sentence end, synthesize up to this point
-                sentence = self._buffer[:i+1].strip()
-                self._buffer = self._buffer[i+1:]
-                
-                if sentence:
-                    await self._speak_sentence(sentence)
+        while True:
+            split_idx = -1
+            for i, char in enumerate(self._buffer):
+                if char in self.SENTENCE_ENDS:
+                    split_idx = i
+                    break
+            if split_idx < 0:
                break
+
+            end_idx = split_idx + 1
+            while end_idx < len(self._buffer) and self._buffer[end_idx] in self.SENTENCE_ENDS:
+                end_idx += 1
+
+            sentence = self._buffer[:end_idx].strip()
+            self._buffer = self._buffer[end_idx:]
+
+            if sentence and any(ch.isalnum() for ch in sentence):
+                await self._speak_sentence(sentence)
    
    async def flush(self) -> None:
        """Flush remaining buffer."""