From eeaa9f67a1b4ff8bc7f288a3b4359d25741df2e1 Mon Sep 17 00:00:00 2001
From: Mark Backman <mark@daily.co>
Date: Fri, 2 May 2025 16:26:23 -0400
Subject: [PATCH] Fix: SimliVideoService was continuously emitting audio,
 preventing BotStoppedSpeakingFrame from being sent

---
 CHANGELOG.md                            |  3 +++
 examples/foundational/27-simli-layer.py |  1 +
 src/pipecat/services/simli/video.py     | 17 ++++++++++-------
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5954e6baf..baa07ba6f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -136,6 +136,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+- Fixed an issue with `SimliVideoService` where the bot was continuously outputting
+  audio, which prevented the `BotStoppedSpeakingFrame` from being emitted.
+
 - Fixed an issue where `OpenAIRealtimeBetaLLMService` would add two assistant
   messages to the context.
 
diff --git a/examples/foundational/27-simli-layer.py b/examples/foundational/27-simli-layer.py
index 513bbc239..38721e50e 100644
--- a/examples/foundational/27-simli-layer.py
+++ b/examples/foundational/27-simli-layer.py
@@ -36,6 +36,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac
             audio_in_enabled=True,
             audio_out_enabled=True,
             video_out_enabled=True,
+            video_out_is_live=True,
             video_out_width=512,
             video_out_height=512,
             vad_analyzer=SileroVADAnalyzer(),
diff --git a/src/pipecat/services/simli/video.py b/src/pipecat/services/simli/video.py
index 0fb490914..121801e4b 100644
--- a/src/pipecat/services/simli/video.py
+++ b/src/pipecat/services/simli/video.py
@@ -64,13 +64,16 @@ class SimliVideoService(FrameProcessor):
         async for audio_frame in self._simli_client.getAudioStreamIterator():
             resampled_frames = self._pipecat_resampler.resample(audio_frame)
             for resampled_frame in resampled_frames:
-                await self.push_frame(
-                    TTSAudioRawFrame(
-                        audio=resampled_frame.to_ndarray().tobytes(),
-                        sample_rate=self._pipecat_resampler.rate,
-                        num_channels=1,
-                    ),
-                )
+                audio_array = resampled_frame.to_ndarray()
+                # Only push the frame if there is audio (i.e. not silence)
+                if audio_array.any():
+                    await self.push_frame(
+                        TTSAudioRawFrame(
+                            audio=audio_array.tobytes(),
+                            sample_rate=self._pipecat_resampler.rate,
+                            num_channels=1,
+                        ),
+                    )
 
     async def _consume_and_process_video(self):
         await self._pipecat_resampler_event.wait()