From eeaa9f67a1b4ff8bc7f288a3b4359d25741df2e1 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Fri, 2 May 2025 16:26:23 -0400 Subject: [PATCH] Fix: SimliVideoService was continuously emitting audio, preventing BotStoppedSpeakingFrame from being sent --- CHANGELOG.md | 3 +++ examples/foundational/27-simli-layer.py | 1 + src/pipecat/services/simli/video.py | 17 ++++++++++------- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5954e6baf..baa07ba6f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -136,6 +136,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed +- Fixed an issue with `SimliVideoService` where the bot was continuously outputting + audio, which prevents the `BotStoppedSpeakingFrame` from being emitted. + - Fixed an issue where `OpenAIRealtimeBetaLLMService` would add two assistant messages to the context. diff --git a/examples/foundational/27-simli-layer.py b/examples/foundational/27-simli-layer.py index 513bbc239..38721e50e 100644 --- a/examples/foundational/27-simli-layer.py +++ b/examples/foundational/27-simli-layer.py @@ -36,6 +36,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac audio_in_enabled=True, audio_out_enabled=True, video_out_enabled=True, + video_out_is_live=True, video_out_width=512, video_out_height=512, vad_analyzer=SileroVADAnalyzer(), diff --git a/src/pipecat/services/simli/video.py b/src/pipecat/services/simli/video.py index 0fb490914..121801e4b 100644 --- a/src/pipecat/services/simli/video.py +++ b/src/pipecat/services/simli/video.py @@ -64,13 +64,16 @@ class SimliVideoService(FrameProcessor): async for audio_frame in self._simli_client.getAudioStreamIterator(): resampled_frames = self._pipecat_resampler.resample(audio_frame) for resampled_frame in resampled_frames: - await self.push_frame( - TTSAudioRawFrame( - audio=resampled_frame.to_ndarray().tobytes(), - sample_rate=self._pipecat_resampler.rate, - 
num_channels=1, - ), - ) + audio_array = resampled_frame.to_ndarray() + # Only push frame if there is audio (e.g. not silence) + if audio_array.any(): + await self.push_frame( + TTSAudioRawFrame( + audio=audio_array.tobytes(), + sample_rate=self._pipecat_resampler.rate, + num_channels=1, + ), + ) async def _consume_and_process_video(self): await self._pipecat_resampler_event.wait()