diff --git a/src/pipecat/transports/smallwebrtc/transport.py b/src/pipecat/transports/smallwebrtc/transport.py index 080f2383d..33cc2ee77 100644 --- a/src/pipecat/transports/smallwebrtc/transport.py +++ b/src/pipecat/transports/smallwebrtc/transport.py @@ -35,6 +35,8 @@ from pipecat.frames.frames import ( OutputTransportMessageUrgentFrame, SpriteFrame, StartFrame, + TTSAudioRawFrame, + TTSStoppedFrame, UserImageRawFrame, UserImageRequestFrame, ) @@ -97,6 +99,7 @@ class RawAudioTrack(AudioStreamTrack): self._start = time.time() # Queue of (bytes, future), broken into 10ms sub chunks as needed self._chunk_queue = deque() + self._is_bot_speaking = False def add_audio_bytes(self, audio_bytes: bytes): """Add audio bytes to the buffer for transmission. @@ -123,6 +126,14 @@ class RawAudioTrack(AudioStreamTrack): return future + def set_is_bot_speaking(self, value: bool): + """Set whether the bot is currently speaking. + + Args: + value: True if the bot has started speaking, False when it has stopped. + """ + self._is_bot_speaking = value + async def recv(self): """Return the next audio frame for WebRTC transmission. @@ -137,7 +148,12 @@ class RawAudioTrack(AudioStreamTrack): await asyncio.sleep(wait) if not self._chunk_queue: - if self._auto_silence: + # Injecting silence while the bot is speaking can cause audible glitches: + # TTS audio arrives in bursts, and a silence frame inserted between two + # consecutive TTS chunks will produce a brief gap or pop in the output. + if self._auto_silence and not self._is_bot_speaking: + #if self._is_bot_speaking: + # logger.warning("Injecting silence while bot is speaking can cause glitches in the audio.") chunk = bytes(self._bytes_per_10ms) else: while not self._chunk_queue: @@ -426,6 +442,15 @@ class SmallWebRTCClient: return True return False + def set_is_bot_speaking(self, value: bool): + """Propagate bot speaking state to the audio output track. + + Args: + value: True if the bot has started speaking, False when it has stopped. 
+ """ + if self._audio_output_track: + self._audio_output_track.set_is_bot_speaking(value) + async def write_video_frame(self, frame: OutputImageRawFrame) -> bool: """Write a video frame to the WebRTC connection. @@ -861,6 +886,13 @@ class SmallWebRTCOutputTransport(BaseOutputTransport): Returns: True if the audio frame was written successfully, False otherwise. """ + # Track when the bot is speaking so the audio track can avoid injecting + # silence between TTS chunks, which would cause audible glitches. + # Using the TTSAudioRawFrame as reference since we can receive + # TTSStartedFrame a few hundred milliseconds before the audio + # actually starts arriving. + if isinstance(frame, TTSAudioRawFrame): + self._client.set_is_bot_speaking(True) return await self._client.write_audio_frame(frame) async def write_video_frame(self, frame: OutputImageRawFrame) -> bool: @@ -874,6 +906,20 @@ class SmallWebRTCOutputTransport(BaseOutputTransport): """ return await self._client.write_video_frame(frame) + async def process_frame(self, frame: Frame, direction: FrameDirection): + """Process incoming frames and handle transport-specific logic. + + Args: + frame: The frame to process. + direction: The direction of frame flow in the pipeline. + """ + await super().process_frame(frame, direction) + + # Track when the bot is speaking so the audio track can avoid injecting + # silence between TTS chunks, which would cause audible glitches. + if isinstance(frame, TTSStoppedFrame): + self._client.set_is_bot_speaking(False) + class SmallWebRTCTransport(BaseTransport): """WebRTC transport implementation for real-time communication.