Add edge fade for TTS

2026-02-09 13:51:52 +08:00
parent 5349ed88e7
commit 210301dc6b
1 changed files with 43 additions and 2 deletions
--- a/engine/core/duplex_pipeline.py
+++ b/engine/core/duplex_pipeline.py
@@ -14,6 +14,7 @@ event-driven design.
 import asyncio
 import time
 from typing import Optional, Callable, Awaitable, Dict, Any
 import numpy as np
 from loguru import logger
 from core.transports import BaseTransport
@@ -608,6 +609,7 @@ class DuplexPipeline:
            return
        try:
            is_first_chunk = True
            async for chunk in self.tts_service.synthesize_stream(text):
                # Check interrupt at the start of each iteration
                if self._interrupt_event.is_set():
@@ -633,12 +635,51 @@ class DuplexPipeline:
                if self._interrupt_event.is_set():
                    break
-                await self.transport.send_audio(chunk.audio)
+                smoothed_audio = self._apply_edge_fade(
                    pcm_bytes=chunk.audio,
                    sample_rate=chunk.sample_rate,
                    fade_in=is_first_chunk,
                    fade_out=bool(chunk.is_final),
                    fade_ms=8,
                )
                is_first_chunk = False
                await self.transport.send_audio(smoothed_audio)
        except asyncio.CancelledError:
            logger.debug("TTS sentence cancelled")
        except Exception as e:
            logger.error(f"TTS sentence error: {e}")
    def _apply_edge_fade(
        self,
        pcm_bytes: bytes,
        sample_rate: int,
        fade_in: bool = False,
        fade_out: bool = False,
        fade_ms: int = 8,
    ) -> bytes:
        """Apply short edge fades to reduce click/pop at sentence boundaries."""
        if not pcm_bytes or (not fade_in and not fade_out):
            return pcm_bytes
        try:
            samples = np.frombuffer(pcm_bytes, dtype="<i2").astype(np.float32)
            if samples.size == 0:
                return pcm_bytes
            fade_samples = int(sample_rate * (fade_ms / 1000.0))
            fade_samples = max(1, min(fade_samples, samples.size))
            if fade_in:
                samples[:fade_samples] *= np.linspace(0.0, 1.0, fade_samples, endpoint=True)
            if fade_out:
                samples[-fade_samples:] *= np.linspace(1.0, 0.0, fade_samples, endpoint=True)
            return np.clip(samples, -32768, 32767).astype("<i2").tobytes()
        except Exception:
            # Fallback: never block audio delivery on smoothing failure.
            return pcm_bytes
    async def _speak(self, text: str) -> None:
        """
        Synthesize and send speech.