diff --git a/engine/core/duplex_pipeline.py b/engine/core/duplex_pipeline.py index f01dd63..31e118d 100644 --- a/engine/core/duplex_pipeline.py +++ b/engine/core/duplex_pipeline.py @@ -14,6 +14,7 @@ event-driven design. import asyncio import time from typing import Optional, Callable, Awaitable, Dict, Any +import numpy as np from loguru import logger from core.transports import BaseTransport @@ -608,6 +609,7 @@ class DuplexPipeline: return try: + is_first_chunk = True async for chunk in self.tts_service.synthesize_stream(text): # Check interrupt at the start of each iteration if self._interrupt_event.is_set(): @@ -632,12 +634,51 @@ class DuplexPipeline: # Double-check interrupt right before sending audio if self._interrupt_event.is_set(): break - - await self.transport.send_audio(chunk.audio) + + smoothed_audio = self._apply_edge_fade( + pcm_bytes=chunk.audio, + sample_rate=chunk.sample_rate, + fade_in=is_first_chunk, + fade_out=bool(chunk.is_final), + fade_ms=8, + ) + is_first_chunk = False + + await self.transport.send_audio(smoothed_audio) except asyncio.CancelledError: logger.debug("TTS sentence cancelled") except Exception as e: logger.error(f"TTS sentence error: {e}") + + def _apply_edge_fade( + self, + pcm_bytes: bytes, + sample_rate: int, + fade_in: bool = False, + fade_out: bool = False, + fade_ms: int = 8, + ) -> bytes: + """Apply short edge fades to reduce click/pop at sentence boundaries.""" + if not pcm_bytes or (not fade_in and not fade_out): + return pcm_bytes + + try: + samples = np.frombuffer(pcm_bytes, dtype=" None: """