Add edge fade for TTS

This commit is contained in:
Xin Wang
2026-02-09 13:51:52 +08:00
parent 5349ed88e7
commit 210301dc6b

View File

@@ -14,6 +14,7 @@ event-driven design.
import asyncio
import time
from typing import Optional, Callable, Awaitable, Dict, Any
import numpy as np
from loguru import logger
from core.transports import BaseTransport
@@ -608,6 +609,7 @@ class DuplexPipeline:
return
try:
is_first_chunk = True
async for chunk in self.tts_service.synthesize_stream(text):
# Check interrupt at the start of each iteration
if self._interrupt_event.is_set():
@@ -633,12 +635,51 @@ class DuplexPipeline:
if self._interrupt_event.is_set():
break
await self.transport.send_audio(chunk.audio)
smoothed_audio = self._apply_edge_fade(
pcm_bytes=chunk.audio,
sample_rate=chunk.sample_rate,
fade_in=is_first_chunk,
fade_out=bool(chunk.is_final),
fade_ms=8,
)
is_first_chunk = False
await self.transport.send_audio(smoothed_audio)
except asyncio.CancelledError:
logger.debug("TTS sentence cancelled")
except Exception as e:
logger.error(f"TTS sentence error: {e}")
def _apply_edge_fade(
self,
pcm_bytes: bytes,
sample_rate: int,
fade_in: bool = False,
fade_out: bool = False,
fade_ms: int = 8,
) -> bytes:
"""Apply short edge fades to reduce click/pop at sentence boundaries."""
if not pcm_bytes or (not fade_in and not fade_out):
return pcm_bytes
try:
samples = np.frombuffer(pcm_bytes, dtype="<i2").astype(np.float32)
if samples.size == 0:
return pcm_bytes
fade_samples = int(sample_rate * (fade_ms / 1000.0))
fade_samples = max(1, min(fade_samples, samples.size))
if fade_in:
samples[:fade_samples] *= np.linspace(0.0, 1.0, fade_samples, endpoint=True)
if fade_out:
samples[-fade_samples:] *= np.linspace(1.0, 0.0, fade_samples, endpoint=True)
return np.clip(samples, -32768, 32767).astype("<i2").tobytes()
except Exception:
# Fallback: never block audio delivery on smoothing failure.
return pcm_bytes
async def _speak(self, text: str) -> None:
"""
Synthesize and send speech.