Improve tts with codex extra high

This commit is contained in:
Xin Wang
2026-02-09 15:04:34 +08:00
parent a42dd4c712
commit cd68ebe306
3 changed files with 164 additions and 56 deletions

View File

@@ -49,6 +49,10 @@ class DuplexPipeline:
Barge-in Detection → Interrupt Barge-in Detection → Interrupt
""" """
_SENTENCE_END_CHARS = frozenset({"。", "！", "？", ".", "!", "?", "\n"})
_SENTENCE_TRAILING_CHARS = frozenset({"。", "！", "？", ".", "!", "?", "；", "~", "…", "\n"})
_SENTENCE_CLOSERS = frozenset({'"', "'", "“", "”", ")", "]", "}", "）", "】", "」", "』", "’"})
def __init__( def __init__(
self, self,
transport: BaseTransport, transport: BaseTransport,
@@ -499,8 +503,9 @@ class DuplexPipeline:
# Sentence buffer for streaming TTS # Sentence buffer for streaming TTS
sentence_buffer = "" sentence_buffer = ""
sentence_ends = {'。', '！', '？', '；', '\n'} pending_punctuation = ""
first_audio_sent = False first_audio_sent = False
spoken_sentence_count = 0
# Stream LLM response and TTS sentence by sentence # Stream LLM response and TTS sentence by sentence
async for text_chunk in self.llm_service.generate_stream(messages): async for text_chunk in self.llm_service.generate_stream(messages):
@@ -521,19 +526,25 @@ class DuplexPipeline:
}) })
# Check for sentence completion - synthesize immediately for low latency # Check for sentence completion - synthesize immediately for low latency
while any(end in sentence_buffer for end in sentence_ends): while True:
# Find first sentence end split_result = self._extract_tts_sentence(sentence_buffer, force=False)
min_idx = len(sentence_buffer) if not split_result:
for end in sentence_ends: break
idx = sentence_buffer.find(end) sentence, sentence_buffer = split_result
if idx != -1 and idx < min_idx: if not sentence:
min_idx = idx continue
if min_idx < len(sentence_buffer): sentence = f"{pending_punctuation}{sentence}".strip()
sentence = sentence_buffer[:min_idx + 1].strip() pending_punctuation = ""
sentence_buffer = sentence_buffer[min_idx + 1:] if not sentence:
continue
if sentence and not self._interrupt_event.is_set(): # Avoid synthesizing punctuation-only fragments (e.g. standalone "!")
if not self._has_spoken_content(sentence):
pending_punctuation = sentence
continue
if not self._interrupt_event.is_set():
# Send track start on first audio # Send track start on first audio
if not first_audio_sent: if not first_audio_sent:
await self.transport.send_event({ await self.transport.send_event({
@@ -544,10 +555,14 @@ class DuplexPipeline:
}) })
first_audio_sent = True first_audio_sent = True
# Synthesize and send this sentence immediately # Keep very short fade-in for non-first sentence to preserve consonant attack.
await self._speak_sentence(sentence) fade_in_ms = 2 if spoken_sentence_count == 0 else 1
else: await self._speak_sentence(
break sentence,
fade_in_ms=fade_in_ms,
fade_out_ms=8,
)
spoken_sentence_count += 1
# Send final LLM response event # Send final LLM response event
if full_response and not self._interrupt_event.is_set(): if full_response and not self._interrupt_event.is_set():
@@ -560,7 +575,8 @@ class DuplexPipeline:
}) })
# Speak any remaining text # Speak any remaining text
if sentence_buffer.strip() and not self._interrupt_event.is_set(): remaining_text = f"{pending_punctuation}{sentence_buffer}".strip()
if remaining_text and self._has_spoken_content(remaining_text) and not self._interrupt_event.is_set():
if not first_audio_sent: if not first_audio_sent:
await self.transport.send_event({ await self.transport.send_event({
**ev( **ev(
@@ -569,7 +585,12 @@ class DuplexPipeline:
) )
}) })
first_audio_sent = True first_audio_sent = True
await self._speak_sentence(sentence_buffer.strip()) fade_in_ms = 2 if spoken_sentence_count == 0 else 1
await self._speak_sentence(
remaining_text,
fade_in_ms=fade_in_ms,
fade_out_ms=8,
)
# Send track end # Send track end
if first_audio_sent: if first_audio_sent:
@@ -598,12 +619,53 @@ class DuplexPipeline:
self._barge_in_speech_frames = 0 self._barge_in_speech_frames = 0
self._barge_in_silence_frames = 0 self._barge_in_silence_frames = 0
async def _speak_sentence(self, text: str) -> None: def _extract_tts_sentence(self, text_buffer: str, force: bool = False) -> Optional[tuple[str, str]]:
"""
Extract one TTS sentence from the buffer.
Consecutive sentence terminators are grouped together to avoid creating
punctuation-only fragments such as a standalone "!" after "?". By
default, trailing terminator at buffer end is held for more context.
"""
if not text_buffer:
return None
split_idx = -1
for idx, char in enumerate(text_buffer):
if char in self._SENTENCE_END_CHARS:
split_idx = idx
break
if split_idx == -1:
return None
end_idx = split_idx + 1
while end_idx < len(text_buffer) and text_buffer[end_idx] in self._SENTENCE_TRAILING_CHARS:
end_idx += 1
# Include trailing quote/bracket closers in the same segment.
while end_idx < len(text_buffer) and text_buffer[end_idx] in self._SENTENCE_CLOSERS:
end_idx += 1
if not force and end_idx >= len(text_buffer):
return None
sentence = text_buffer[:end_idx].strip()
remainder = text_buffer[end_idx:]
return sentence, remainder
def _has_spoken_content(self, text: str) -> bool:
"""Check whether text contains pronounceable content (not punctuation-only)."""
return any(char.isalnum() for char in text)
async def _speak_sentence(self, text: str, fade_in_ms: int = 2, fade_out_ms: int = 8) -> None:
""" """
Synthesize and send a single sentence. Synthesize and send a single sentence.
Args: Args:
text: Sentence to speak text: Sentence to speak
fade_in_ms: Fade-in duration for sentence start chunks
fade_out_ms: Fade-out duration for sentence end chunks
""" """
if not text.strip() or self._interrupt_event.is_set(): if not text.strip() or self._interrupt_event.is_set():
return return
@@ -640,7 +702,8 @@ class DuplexPipeline:
sample_rate=chunk.sample_rate, sample_rate=chunk.sample_rate,
fade_in=is_first_chunk, fade_in=is_first_chunk,
fade_out=bool(chunk.is_final), fade_out=bool(chunk.is_final),
fade_ms=8, fade_in_ms=fade_in_ms,
fade_out_ms=fade_out_ms,
) )
is_first_chunk = False is_first_chunk = False
@@ -656,7 +719,8 @@ class DuplexPipeline:
sample_rate: int, sample_rate: int,
fade_in: bool = False, fade_in: bool = False,
fade_out: bool = False, fade_out: bool = False,
fade_ms: int = 8, fade_in_ms: int = 2,
fade_out_ms: int = 8,
) -> bytes: ) -> bytes:
"""Apply short edge fades to reduce click/pop at sentence boundaries.""" """Apply short edge fades to reduce click/pop at sentence boundaries."""
if not pcm_bytes or (not fade_in and not fade_out): if not pcm_bytes or (not fade_in and not fade_out):
@@ -667,13 +731,14 @@ class DuplexPipeline:
if samples.size == 0: if samples.size == 0:
return pcm_bytes return pcm_bytes
fade_samples = int(sample_rate * (fade_ms / 1000.0))
fade_samples = max(1, min(fade_samples, samples.size))
if fade_in: if fade_in:
samples[:fade_samples] *= np.linspace(0.0, 1.0, fade_samples, endpoint=True) fade_in_samples = int(sample_rate * (fade_in_ms / 1000.0))
fade_in_samples = max(1, min(fade_in_samples, samples.size))
samples[:fade_in_samples] *= np.linspace(0.0, 1.0, fade_in_samples, endpoint=True)
if fade_out: if fade_out:
samples[-fade_samples:] *= np.linspace(1.0, 0.0, fade_samples, endpoint=True) fade_out_samples = int(sample_rate * (fade_out_ms / 1000.0))
fade_out_samples = max(1, min(fade_out_samples, samples.size))
samples[-fade_out_samples:] *= np.linspace(1.0, 0.0, fade_out_samples, endpoint=True)
return np.clip(samples, -32768, 32767).astype("<i2").tobytes() return np.clip(samples, -32768, 32767).astype("<i2").tobytes()
except Exception: except Exception:

View File

@@ -400,6 +400,7 @@
let interimAiText = ""; let interimAiText = "";
const targetSampleRate = 16000; const targetSampleRate = 16000;
const playbackStopRampSec = 0.008;
function logLine(type, text, data) { function logLine(type, text, data) {
const time = new Date().toLocaleTimeString(); const time = new Date().toLocaleTimeString();
@@ -456,10 +457,18 @@
function stopPlayback() { function stopPlayback() {
discardAudio = true; discardAudio = true;
playbackTime = audioCtx ? audioCtx.currentTime : 0; const now = audioCtx ? audioCtx.currentTime : 0;
playbackSources.forEach((s) => { playbackTime = now;
playbackSources.forEach((node) => {
try { try {
s.stop(); if (audioCtx && node.gainNode && node.source) {
node.gainNode.gain.cancelScheduledValues(now);
node.gainNode.gain.setValueAtTime(node.gainNode.gain.value || 1, now);
node.gainNode.gain.linearRampToValueAtTime(0, now + playbackStopRampSec);
node.source.stop(now + playbackStopRampSec + 0.002);
} else if (node.source) {
node.source.stop();
}
} catch (err) {} } catch (err) {}
}); });
playbackSources = []; playbackSources = [];
@@ -527,14 +536,18 @@
const buffer = audioCtx.createBuffer(1, float32.length, targetSampleRate); const buffer = audioCtx.createBuffer(1, float32.length, targetSampleRate);
buffer.copyToChannel(float32, 0); buffer.copyToChannel(float32, 0);
const source = audioCtx.createBufferSource(); const source = audioCtx.createBufferSource();
const gainNode = audioCtx.createGain();
source.buffer = buffer; source.buffer = buffer;
source.connect(playbackDest); source.connect(gainNode);
gainNode.connect(playbackDest);
const startTime = Math.max(audioCtx.currentTime + 0.02, playbackTime); const startTime = Math.max(audioCtx.currentTime + 0.02, playbackTime);
gainNode.gain.setValueAtTime(1, startTime);
source.start(startTime); source.start(startTime);
playbackTime = startTime + buffer.duration; playbackTime = startTime + buffer.duration;
playbackSources.push(source); const playbackNode = { source, gainNode };
playbackSources.push(playbackNode);
source.onended = () => { source.onended = () => {
playbackSources = playbackSources.filter((s) => s !== source); playbackSources = playbackSources.filter((s) => s !== playbackNode);
}; };
} }

View File

@@ -134,6 +134,7 @@ class SiliconFlowTTSService(BaseTTSService):
# Stream audio chunks # Stream audio chunks
chunk_size = self.sample_rate * 2 // 10 # 100ms chunks chunk_size = self.sample_rate * 2 // 10 # 100ms chunks
buffer = b"" buffer = b""
pending_chunk = None
async for chunk in response.content.iter_any(): async for chunk in response.content.iter_any():
if self._cancel_event.is_set(): if self._cancel_event.is_set():
@@ -147,13 +148,33 @@ class SiliconFlowTTSService(BaseTTSService):
audio_chunk = buffer[:chunk_size] audio_chunk = buffer[:chunk_size]
buffer = buffer[chunk_size:] buffer = buffer[chunk_size:]
# Keep one full chunk buffered so we can always tag the true
# last full chunk as final when stream length is an exact multiple.
if pending_chunk is not None:
yield TTSChunk( yield TTSChunk(
audio=audio_chunk, audio=pending_chunk,
sample_rate=self.sample_rate, sample_rate=self.sample_rate,
is_final=False is_final=False
) )
pending_chunk = audio_chunk
# Flush pending chunk(s) and remaining tail.
if pending_chunk is not None:
if buffer:
yield TTSChunk(
audio=pending_chunk,
sample_rate=self.sample_rate,
is_final=False
)
pending_chunk = None
else:
yield TTSChunk(
audio=pending_chunk,
sample_rate=self.sample_rate,
is_final=True
)
pending_chunk = None
# Yield remaining buffer
if buffer: if buffer:
yield TTSChunk( yield TTSChunk(
audio=buffer, audio=buffer,
@@ -182,7 +203,7 @@ class StreamingTTSAdapter:
""" """
# Sentence delimiters # Sentence delimiters
SENTENCE_ENDS = {'。', '！', '？', '\n'} SENTENCE_ENDS = {'。', '！', '？', '.', '!', '?', '\n'}
def __init__(self, tts_service: BaseTTSService, transport, session_id: str): def __init__(self, tts_service: BaseTTSService, transport, session_id: str):
self.tts_service = tts_service self.tts_service = tts_service
@@ -205,15 +226,24 @@ class StreamingTTSAdapter:
self._buffer += text_chunk self._buffer += text_chunk
# Check for sentence completion # Check for sentence completion
while True:
split_idx = -1
for i, char in enumerate(self._buffer): for i, char in enumerate(self._buffer):
if char in self.SENTENCE_ENDS: if char in self.SENTENCE_ENDS:
# Found sentence end, synthesize up to this point split_idx = i
sentence = self._buffer[:i+1].strip()
self._buffer = self._buffer[i+1:]
if sentence:
await self._speak_sentence(sentence)
break break
if split_idx < 0:
break
end_idx = split_idx + 1
while end_idx < len(self._buffer) and self._buffer[end_idx] in self.SENTENCE_ENDS:
end_idx += 1
sentence = self._buffer[:end_idx].strip()
self._buffer = self._buffer[end_idx:]
if sentence and any(ch.isalnum() for ch in sentence):
await self._speak_sentence(sentence)
async def flush(self) -> None: async def flush(self) -> None:
"""Flush remaining buffer.""" """Flush remaining buffer."""