Preventing injecting silence while we are receiving audio from TTS.

This commit is contained in:
filipi87
2026-05-13 13:44:05 -03:00
parent 6887ac394a
commit 3954f96a6b

View File

@@ -35,6 +35,8 @@ from pipecat.frames.frames import (
OutputTransportMessageUrgentFrame,
SpriteFrame,
StartFrame,
TTSAudioRawFrame,
TTSStoppedFrame,
UserImageRawFrame,
UserImageRequestFrame,
)
@@ -97,6 +99,7 @@ class RawAudioTrack(AudioStreamTrack):
self._start = time.time()
# Queue of (bytes, future), broken into 10ms sub chunks as needed
self._chunk_queue = deque()
self._is_bot_speaking = False
def add_audio_bytes(self, audio_bytes: bytes):
"""Add audio bytes to the buffer for transmission.
@@ -123,6 +126,14 @@ class RawAudioTrack(AudioStreamTrack):
return future
def set_is_bot_speaking(self, value: bool):
"""Set whether the bot is currently speaking.
Args:
value: True if the bot has started speaking, False when it has stopped.
"""
self._is_bot_speaking = value
async def recv(self):
"""Return the next audio frame for WebRTC transmission.
@@ -137,7 +148,12 @@ class RawAudioTrack(AudioStreamTrack):
await asyncio.sleep(wait)
if not self._chunk_queue:
if self._auto_silence:
# Injecting silence while the bot is speaking can cause audible glitches:
# TTS audio arrives in bursts, and a silence frame inserted between two
# consecutive TTS chunks will produce a brief gap or pop in the output.
if self._auto_silence and not self._is_bot_speaking:
#if self._is_bot_speaking:
# logger.warning("Injecting silence while bot is speaking can cause glitches in the audio.")
chunk = bytes(self._bytes_per_10ms)
else:
while not self._chunk_queue:
@@ -426,6 +442,15 @@ class SmallWebRTCClient:
return True
return False
def set_is_bot_speaking(self, value: bool):
"""Propagate bot speaking state to the audio output track.
Args:
value: True if the bot has started speaking, False when it has stopped.
"""
if self._audio_output_track:
self._audio_output_track.set_is_bot_speaking(value)
async def write_video_frame(self, frame: OutputImageRawFrame) -> bool:
"""Write a video frame to the WebRTC connection.
@@ -861,6 +886,13 @@ class SmallWebRTCOutputTransport(BaseOutputTransport):
Returns:
True if the audio frame was written successfully, False otherwise.
"""
# Track when the bot is speaking so the audio track can avoid injecting
# silence between TTS chunks, which would cause audible glitches.
# Using the TTSAudioRawFrame as reference since we can receive
# TTSStartedFrame a few hundred milliseconds before actually start
# receiving the audio
if isinstance(frame, TTSAudioRawFrame):
self._client.set_is_bot_speaking(True)
return await self._client.write_audio_frame(frame)
async def write_video_frame(self, frame: OutputImageRawFrame) -> bool:
@@ -874,6 +906,20 @@ class SmallWebRTCOutputTransport(BaseOutputTransport):
"""
return await self._client.write_video_frame(frame)
async def process_frame(self, frame: Frame, direction: FrameDirection):
"""Process incoming frames and handle transport-specific logic.
Args:
frame: The frame to process.
direction: The direction of frame flow in the pipeline.
"""
await super().process_frame(frame, direction)
# Track when the bot is speaking so the audio track can avoid injecting
# silence between TTS chunks, which would cause audible glitches.
if isinstance(frame, TTSStoppedFrame):
self._client.set_is_bot_speaking(False)
class SmallWebRTCTransport(BaseTransport):
"""WebRTC transport implementation for real-time communication.