Add edge fade for TTS
This commit is contained in:
@@ -14,6 +14,7 @@ event-driven design.
|
|||||||
import asyncio
|
import asyncio
|
||||||
import time
|
import time
|
||||||
from typing import Optional, Callable, Awaitable, Dict, Any
|
from typing import Optional, Callable, Awaitable, Dict, Any
|
||||||
|
import numpy as np
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from core.transports import BaseTransport
|
from core.transports import BaseTransport
|
||||||
@@ -608,6 +609,7 @@ class DuplexPipeline:
|
|||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
is_first_chunk = True
|
||||||
async for chunk in self.tts_service.synthesize_stream(text):
|
async for chunk in self.tts_service.synthesize_stream(text):
|
||||||
# Check interrupt at the start of each iteration
|
# Check interrupt at the start of each iteration
|
||||||
if self._interrupt_event.is_set():
|
if self._interrupt_event.is_set():
|
||||||
@@ -633,12 +635,51 @@ class DuplexPipeline:
|
|||||||
if self._interrupt_event.is_set():
|
if self._interrupt_event.is_set():
|
||||||
break
|
break
|
||||||
|
|
||||||
await self.transport.send_audio(chunk.audio)
|
smoothed_audio = self._apply_edge_fade(
|
||||||
|
pcm_bytes=chunk.audio,
|
||||||
|
sample_rate=chunk.sample_rate,
|
||||||
|
fade_in=is_first_chunk,
|
||||||
|
fade_out=bool(chunk.is_final),
|
||||||
|
fade_ms=8,
|
||||||
|
)
|
||||||
|
is_first_chunk = False
|
||||||
|
|
||||||
|
await self.transport.send_audio(smoothed_audio)
|
||||||
except asyncio.CancelledError:
|
except asyncio.CancelledError:
|
||||||
logger.debug("TTS sentence cancelled")
|
logger.debug("TTS sentence cancelled")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"TTS sentence error: {e}")
|
logger.error(f"TTS sentence error: {e}")
|
||||||
|
|
||||||
|
def _apply_edge_fade(
    self,
    pcm_bytes: bytes,
    sample_rate: int,
    fade_in: bool = False,
    fade_out: bool = False,
    fade_ms: int = 8,
) -> bytes:
    """Apply short linear edge fades to reduce click/pop at sentence boundaries.

    The buffer is interpreted as little-endian signed 16-bit PCM samples
    (``dtype="<i2"``). A linear ramp of ``fade_ms`` milliseconds is
    multiplied onto the start (``fade_in``) and/or the end (``fade_out``)
    of the buffer. The ramp is capped at the buffer length.

    Args:
        pcm_bytes: Raw PCM audio to smooth.
        sample_rate: Samples per second, used to convert ``fade_ms`` into
            a sample count.
        fade_in: Ramp the first samples up from zero.
        fade_out: Ramp the last samples down to zero.
        fade_ms: Ramp duration in milliseconds. A non-positive value
            disables fading.

    Returns:
        The faded audio as bytes of the same length, or ``pcm_bytes``
        unchanged when there is nothing to fade, the buffer is not a
        whole number of 16-bit samples, or smoothing fails.
    """
    if not pcm_bytes or (not fade_in and not fade_out):
        return pcm_bytes

    # A trailing odd byte cannot be decoded as int16; pass the audio through
    # untouched rather than relying on np.frombuffer raising.
    if len(pcm_bytes) % 2:
        return pcm_bytes

    try:
        # A zero/negative duration or rate means "no fade": do not zero a
        # sample in that case.
        if fade_ms <= 0 or sample_rate <= 0:
            return pcm_bytes

        # astype() copies, so the read-only frombuffer view is never mutated.
        samples = np.frombuffer(pcm_bytes, dtype="<i2").astype(np.float32)
        if samples.size == 0:
            return pcm_bytes

        # At least one sample, at most the whole buffer.
        fade_samples = int(sample_rate * (fade_ms / 1000.0))
        fade_samples = max(1, min(fade_samples, samples.size))

        if fade_in:
            samples[:fade_samples] *= np.linspace(0.0, 1.0, fade_samples, endpoint=True)
        if fade_out:
            samples[-fade_samples:] *= np.linspace(1.0, 0.0, fade_samples, endpoint=True)

        # Round to nearest before the integer cast; a bare astype() truncates
        # toward zero and biases the ramp by up to one LSB.
        return np.clip(np.rint(samples), -32768, 32767).astype("<i2").tobytes()
    except Exception:
        # Fallback: never block audio delivery on smoothing failure.
        return pcm_bytes
|
||||||
|
|
||||||
async def _speak(self, text: str) -> None:
|
async def _speak(self, text: str) -> None:
|
||||||
"""
|
"""
|
||||||
Synthesize and send speech.
|
Synthesize and send speech.
|
||||||
|
|||||||
Reference in New Issue
Block a user