Add edge fade for tts

2026-02-09 13:51:52 +08:00
parent 5349ed88e7
commit 210301dc6b
1 changed files with 43 additions and 2 deletions
--- a/engine/core/duplex_pipeline.py
+++ b/engine/core/duplex_pipeline.py
@@ -14,6 +14,7 @@ event-driven design.
 import asyncio
 import time
 from typing import Optional, Callable, Awaitable, Dict, Any
+import numpy as np
 from loguru import logger

 from core.transports import BaseTransport
@@ -608,6 +609,7 @@ class DuplexPipeline:
            return
        
        try:
+            is_first_chunk = True
            async for chunk in self.tts_service.synthesize_stream(text):
                # Check interrupt at the start of each iteration
                if self._interrupt_event.is_set():
@@ -633,12 +635,51 @@ class DuplexPipeline:
                if self._interrupt_event.is_set():
                    break

-                await self.transport.send_audio(chunk.audio)
+                smoothed_audio = self._apply_edge_fade(
+                    pcm_bytes=chunk.audio,
+                    sample_rate=chunk.sample_rate,
+                    fade_in=is_first_chunk,
+                    fade_out=bool(chunk.is_final),
+                    fade_ms=8,
+                )
+                is_first_chunk = False
+
+                await self.transport.send_audio(smoothed_audio)
        except asyncio.CancelledError:
            logger.debug("TTS sentence cancelled")
        except Exception as e:
            logger.error(f"TTS sentence error: {e}")

+    def _apply_edge_fade(
+        self,
+        pcm_bytes: bytes,  # raw PCM; decoded below as little-endian int16
+        sample_rate: int,  # samples per second; converts fade_ms into a sample count
+        fade_in: bool = False,  # ramp the start of the chunk up from zero
+        fade_out: bool = False,  # ramp the end of the chunk down to zero
+        fade_ms: int = 8,  # requested fade length in milliseconds
+    ) -> bytes:
+        """Apply short linear edge fades to int16 PCM to reduce click/pop at sentence boundaries."""
+        if not pcm_bytes or (not fade_in and not fade_out):  # empty audio or no fade requested
+            return pcm_bytes
+
+        try:
+            samples = np.frombuffer(pcm_bytes, dtype="<i2").astype(np.float32)  # float32 working copy
+            if samples.size == 0:
+                return pcm_bytes
+
+            fade_samples = int(sample_rate * (fade_ms / 1000.0))  # ms -> samples (assumes mono — TODO confirm)
+            fade_samples = max(1, min(fade_samples, samples.size))  # at least 1, at most the whole chunk
+
+            if fade_in:
+                samples[:fade_samples] *= np.linspace(0.0, 1.0, fade_samples, endpoint=True)  # gain 0 -> 1 on head
+            if fade_out:
+                samples[-fade_samples:] *= np.linspace(1.0, 0.0, fade_samples, endpoint=True)  # gain 1 -> 0 on tail
+
+            return np.clip(samples, -32768, 32767).astype("<i2").tobytes()  # clamp to int16 range, re-encode
+        except Exception:  # deliberately broad: smoothing is best-effort
+            # Fallback: never block audio delivery on smoothing failure; return the input untouched.
+            return pcm_bytes
+    
    async def _speak(self, text: str) -> None:
        """
        Synthesize and send speech.