From 64592b274b9fbdc9ca7d057685be7a09f65a17ad Mon Sep 17 00:00:00 2001
From: Filipi Fuchter <filipi87@gmail.com>
Date: Tue, 5 Aug 2025 17:11:34 -0300
Subject: [PATCH] Fixed an issue where `BotStartedSpeakingFrame` and
 `BotStoppedSpeakingFrame`   were not emitted when using `TavusVideoService`
 or `HeyGenVideoService`.

---
 CHANGELOG.md                          |  3 +++
 src/pipecat/audio/utils.py            | 34 +++++++++++++++++++++++++++
 src/pipecat/frames/frames.py          | 12 ++++++++++
 src/pipecat/services/heygen/video.py  |  3 ++-
 src/pipecat/services/tavus/video.py   |  3 ++-
 src/pipecat/transports/base_output.py | 17 +++++++++++++-
 6 files changed, 69 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4da3cf32d..e18ca157d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -47,6 +47,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+- Fixed an issue where `BotStartedSpeakingFrame` and `BotStoppedSpeakingFrame`
+  were not emitted when using `TavusVideoService` or `HeyGenVideoService`.
+
 - Fixed an issue in `LiveKitTransport` where empty `AudioRawFrame`s were pushed
   down the pipeline. This resulted in warnings by the STT processor.
 - Fixed `PiperTTSService` to send text as a JSON object in the request body,
diff --git a/src/pipecat/audio/utils.py b/src/pipecat/audio/utils.py
index 6d5a12929..b2f76ec60 100644
--- a/src/pipecat/audio/utils.py
+++ b/src/pipecat/audio/utils.py
@@ -20,6 +20,10 @@ from pipecat.audio.resamplers.base_audio_resampler import BaseAudioResampler
 from pipecat.audio.resamplers.soxr_resampler import SOXRAudioResampler
 from pipecat.audio.resamplers.soxr_stream_resampler import SOXRStreamAudioResampler
 
+# Normal speech usually produces many samples between ±500 and ±5000, depending on
+# loudness and mic gain, so we use a threshold well below what real speech produces.
+SPEAKING_THRESHOLD = 20
+
 
 def create_default_resampler(**kwargs) -> BaseAudioResampler:
     """Create a default audio resampler instance.
@@ -275,3 +279,33 @@ async def pcm_to_alaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler:
     out_alaw_bytes = audioop.lin2alaw(in_pcm_bytes, 2)
 
     return out_alaw_bytes
+
+
+def is_silence(pcm_bytes: bytes) -> bool:
+    """Determine if an audio sample contains silence by checking amplitude levels.
+
+    This function analyzes raw PCM audio data to detect silence by comparing
+    the maximum absolute amplitude against a predefined threshold. The audio
+    is expected to be clean speech or complete silence without background noise.
+
+    Args:
+        pcm_bytes: Raw PCM audio data as bytes (16-bit signed integers).
+
+    Returns:
+        bool: True if the peak amplitude is at or below SPEAKING_THRESHOLD
+              (an empty buffer counts as silence), False otherwise.
+
+    Note:
+        Normal speech typically produces amplitude values between ±500 and ±5000,
+        depending on factors like loudness and microphone gain. The threshold
+        (SPEAKING_THRESHOLD) is set well below typical speech levels to
+        reliably detect silence vs. speech.
+    """
+    # Widen to int32 first: abs(-32768) wraps back to -32768 in int16, which
+    # would make a full-scale negative sample look like silence.
+    audio_data = np.frombuffer(pcm_bytes, dtype=np.int16).astype(np.int32)
+
+    # An empty buffer has no peak; max() would raise on a zero-size array.
+    if audio_data.size == 0:
+        return True
+    return bool(np.abs(audio_data).max() <= SPEAKING_THRESHOLD)
diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py
index bc9e8a06d..94e799aa6 100644
--- a/src/pipecat/frames/frames.py
+++ b/src/pipecat/frames/frames.py
@@ -238,6 +238,18 @@ class TTSAudioRawFrame(OutputAudioRawFrame):
     pass
 
 
+@dataclass
+class SpeechOutputAudioRawFrame(OutputAudioRawFrame):
+    """An audio frame that is part of a speech audio stream.
+
+    This frame is part of a continuous stream of audio frames containing speech.
+    The audio stream might also contain silence frames, so a process to distinguish
+    between speech and silence might be needed.
+    """
+
+    pass
+
+
 @dataclass
 class URLImageRawFrame(OutputImageRawFrame):
     """Image frame with an associated URL.
diff --git a/src/pipecat/services/heygen/video.py b/src/pipecat/services/heygen/video.py
index 011151fdf..237dfc36f 100644
--- a/src/pipecat/services/heygen/video.py
+++ b/src/pipecat/services/heygen/video.py
@@ -27,6 +27,7 @@ from pipecat.frames.frames import (
     OutputAudioRawFrame,
     OutputImageRawFrame,
     OutputTransportReadyFrame,
+    SpeechOutputAudioRawFrame,
     StartFrame,
     TTSAudioRawFrame,
     UserStartedSpeakingFrame,
@@ -157,7 +158,7 @@ class HeyGenVideoService(AIService):
 
     async def _on_participant_audio_data(self, audio_frame: AudioRawFrame):
         """Handle incoming audio data from participants."""
-        frame = OutputAudioRawFrame(
+        frame = SpeechOutputAudioRawFrame(
             audio=audio_frame.audio,
             sample_rate=audio_frame.sample_rate,
             num_channels=audio_frame.num_channels,
diff --git a/src/pipecat/services/tavus/video.py b/src/pipecat/services/tavus/video.py
index 37b21257f..684bd5659 100644
--- a/src/pipecat/services/tavus/video.py
+++ b/src/pipecat/services/tavus/video.py
@@ -25,6 +25,7 @@ from pipecat.frames.frames import (
     OutputAudioRawFrame,
     OutputImageRawFrame,
     OutputTransportReadyFrame,
+    SpeechOutputAudioRawFrame,
     StartFrame,
     StartInterruptionFrame,
     TTSAudioRawFrame,
@@ -154,7 +155,7 @@ class TavusVideoService(AIService):
         self, participant_id: str, audio: AudioData, audio_source: str
     ):
         """Handle incoming audio data from participants."""
-        frame = OutputAudioRawFrame(
+        frame = SpeechOutputAudioRawFrame(
             audio=audio.audio_frames,
             sample_rate=audio.sample_rate,
             num_channels=audio.num_channels,
diff --git a/src/pipecat/transports/base_output.py b/src/pipecat/transports/base_output.py
index 4c60f7490..313c36fc0 100644
--- a/src/pipecat/transports/base_output.py
+++ b/src/pipecat/transports/base_output.py
@@ -21,7 +21,7 @@ from loguru import logger
 from PIL import Image
 
 from pipecat.audio.mixers.base_audio_mixer import BaseAudioMixer
-from pipecat.audio.utils import create_stream_resampler
+from pipecat.audio.utils import create_stream_resampler, is_silence
 from pipecat.frames.frames import (
     BotSpeakingFrame,
     BotStartedSpeakingFrame,
@@ -35,6 +35,7 @@ from pipecat.frames.frames import (
     OutputDTMFUrgentFrame,
     OutputImageRawFrame,
     OutputTransportReadyFrame,
+    SpeechOutputAudioRawFrame,
     SpriteFrame,
     StartFrame,
     StartInterruptionFrame,
@@ -671,10 +672,24 @@ class BaseOutputTransport(FrameProcessor):
             TOTAL_CHUNK_MS = self._params.audio_out_10ms_chunks * 10
             BOT_SPEAKING_CHUNK_PERIOD = max(int(200 / TOTAL_CHUNK_MS), 1)
             bot_speaking_counter = 0
+            speech_last_speaking_time = 0
+
             async for frame in self._next_frame():
                 # Notify the bot started speaking upstream if necessary and that
                 # it's actually speaking.
+                is_speaking = False
                 if isinstance(frame, TTSAudioRawFrame):
+                    is_speaking = True
+                elif isinstance(frame, SpeechOutputAudioRawFrame):
+                    if not is_silence(frame.audio):
+                        is_speaking = True
+                        speech_last_speaking_time = time.time()
+                    else:
+                        silence_duration = time.time() - speech_last_speaking_time
+                        if silence_duration > BOT_VAD_STOP_SECS:
+                            await self._bot_stopped_speaking()
+
+                if is_speaking:
                     await self._bot_started_speaking()
                     if bot_speaking_counter % BOT_SPEAKING_CHUNK_PERIOD == 0:
                         await self._transport.push_frame(BotSpeakingFrame())