Add edge fade for TTS

This commit is contained in:
Xin Wang
2026-02-09 13:51:52 +08:00
parent 5349ed88e7
commit 210301dc6b

View File

@@ -14,6 +14,7 @@ event-driven design.
import asyncio import asyncio
import time import time
from typing import Optional, Callable, Awaitable, Dict, Any from typing import Optional, Callable, Awaitable, Dict, Any
import numpy as np
from loguru import logger from loguru import logger
from core.transports import BaseTransport from core.transports import BaseTransport
@@ -608,6 +609,7 @@ class DuplexPipeline:
return return
try: try:
is_first_chunk = True
async for chunk in self.tts_service.synthesize_stream(text): async for chunk in self.tts_service.synthesize_stream(text):
# Check interrupt at the start of each iteration # Check interrupt at the start of each iteration
if self._interrupt_event.is_set(): if self._interrupt_event.is_set():
@@ -633,12 +635,51 @@ class DuplexPipeline:
if self._interrupt_event.is_set(): if self._interrupt_event.is_set():
break break
await self.transport.send_audio(chunk.audio) smoothed_audio = self._apply_edge_fade(
pcm_bytes=chunk.audio,
sample_rate=chunk.sample_rate,
fade_in=is_first_chunk,
fade_out=bool(chunk.is_final),
fade_ms=8,
)
is_first_chunk = False
await self.transport.send_audio(smoothed_audio)
except asyncio.CancelledError: except asyncio.CancelledError:
logger.debug("TTS sentence cancelled") logger.debug("TTS sentence cancelled")
except Exception as e: except Exception as e:
logger.error(f"TTS sentence error: {e}") logger.error(f"TTS sentence error: {e}")
def _apply_edge_fade(
self,
pcm_bytes: bytes,
sample_rate: int,
fade_in: bool = False,
fade_out: bool = False,
fade_ms: int = 8,
) -> bytes:
"""Apply short edge fades to reduce click/pop at sentence boundaries."""
if not pcm_bytes or (not fade_in and not fade_out):
return pcm_bytes
try:
samples = np.frombuffer(pcm_bytes, dtype="<i2").astype(np.float32)
if samples.size == 0:
return pcm_bytes
fade_samples = int(sample_rate * (fade_ms / 1000.0))
fade_samples = max(1, min(fade_samples, samples.size))
if fade_in:
samples[:fade_samples] *= np.linspace(0.0, 1.0, fade_samples, endpoint=True)
if fade_out:
samples[-fade_samples:] *= np.linspace(1.0, 0.0, fade_samples, endpoint=True)
return np.clip(samples, -32768, 32767).astype("<i2").tobytes()
except Exception:
# Fallback: never block audio delivery on smoothing failure.
return pcm_bytes
async def _speak(self, text: str) -> None: async def _speak(self, text: str) -> None:
""" """
Synthesize and send speech. Synthesize and send speech.