From 5349ed88e7adab0387bc6924f9abbca3e6f3360e Mon Sep 17 00:00:00 2001 From: Xin Wang Date: Mon, 9 Feb 2026 13:39:55 +0800 Subject: [PATCH] Improve tts stream --- engine/core/duplex_pipeline.py | 1 - web/pages/Assistants.tsx | 75 +++++++++++++++++++++++++--------- 2 files changed, 56 insertions(+), 20 deletions(-) diff --git a/engine/core/duplex_pipeline.py b/engine/core/duplex_pipeline.py index be31239..f01dd63 100644 --- a/engine/core/duplex_pipeline.py +++ b/engine/core/duplex_pipeline.py @@ -634,7 +634,6 @@ class DuplexPipeline: break await self.transport.send_audio(chunk.audio) - await asyncio.sleep(0.005) # Small delay to prevent flooding except asyncio.CancelledError: logger.debug("TTS sentence cancelled") except Exception as e: diff --git a/web/pages/Assistants.tsx b/web/pages/Assistants.tsx index d5d048d..f273c1b 100644 --- a/web/pages/Assistants.tsx +++ b/web/pages/Assistants.tsx @@ -1087,6 +1087,12 @@ export const DebugDrawer: React.FC<{ const audioCtxRef = useRef(null); const playbackTimeRef = useRef(0); const activeAudioSourcesRef = useRef>(new Set()); + const queuedAudioBuffersRef = useRef([]); + const queuedAudioDurationRef = useRef(0); + + const PLAYBACK_INITIAL_BUFFER_SECONDS = 0.25; + const PLAYBACK_MAX_AHEAD_SECONDS = 0.8; + const PLAYBACK_SCHEDULE_LEAD_SECONDS = 0.04; const [devices, setDevices] = useState([]); const [selectedCamera, setSelectedCamera] = useState(''); @@ -1169,8 +1175,10 @@ export const DebugDrawer: React.FC<{ const clearPlaybackQueue = () => { const ctx = audioCtxRef.current; - if (!ctx) return; - playbackTimeRef.current = ctx.currentTime; + const now = ctx ? ctx.currentTime : 0; + playbackTimeRef.current = now; + queuedAudioBuffersRef.current = []; + queuedAudioDurationRef.current = 0; }; const stopPlaybackImmediately = () => { @@ -1186,6 +1194,49 @@ export const DebugDrawer: React.FC<{ clearPlaybackQueue(); }; + const scheduleQueuedPlayback = (ctx: AudioContext) => { + const queue = queuedAudioBuffersRef.current; + if (queue.length === 0) return; + + const now = ctx.currentTime; + if (playbackTimeRef.current < now) { + playbackTimeRef.current = now; + } + + const hasActivePlayback = activeAudioSourcesRef.current.size > 0; + const minBufferSeconds = hasActivePlayback + ? 0 + : PLAYBACK_INITIAL_BUFFER_SECONDS; + + if (queuedAudioDurationRef.current < minBufferSeconds) { + return; + } + + while (queue.length > 0 && (playbackTimeRef.current - now) < PLAYBACK_MAX_AHEAD_SECONDS) { + const audioBuffer = queue.shift(); + if (!audioBuffer) break; + queuedAudioDurationRef.current = Math.max(0, queuedAudioDurationRef.current - audioBuffer.duration); + + const source = ctx.createBufferSource(); + source.buffer = audioBuffer; + source.connect(ctx.destination); + activeAudioSourcesRef.current.add(source); + source.onended = () => { + activeAudioSourcesRef.current.delete(source); + try { + source.disconnect(); + } catch { + // no-op + } + scheduleQueuedPlayback(ctx); + }; + + const startAt = Math.max(ctx.currentTime + PLAYBACK_SCHEDULE_LEAD_SECONDS, playbackTimeRef.current); + source.start(startAt); + playbackTimeRef.current = startAt + audioBuffer.duration; + } + }; + const playPcm16Chunk = async (pcmBuffer: ArrayBuffer) => { if (!textTtsEnabled) return; if (mode !== 'text') return; @@ -1201,23 +1252,9 @@ export const DebugDrawer: React.FC<{ const sampleRate = 16000; const audioBuffer = ctx.createBuffer(1, float32.length, sampleRate); audioBuffer.copyToChannel(float32, 0); - - const source = ctx.createBufferSource(); - source.buffer = audioBuffer; - source.connect(ctx.destination); - activeAudioSourcesRef.current.add(source); - source.onended = () => { - activeAudioSourcesRef.current.delete(source); - try { - source.disconnect(); - } catch { - // no-op - } - }; - - const startAt = Math.max(ctx.currentTime + 0.02, playbackTimeRef.current); - source.start(startAt); - playbackTimeRef.current = startAt + audioBuffer.duration; + queuedAudioBuffersRef.current.push(audioBuffer); + queuedAudioDurationRef.current += audioBuffer.duration; + scheduleQueuedPlayback(ctx); }; useEffect(() => {