Merge pull request #2352 from pipecat-ai/filipi/webrtc_audio_frame

Detect whether the bot is speaking based on SpeechOutputAudioRawFrame
2025-08-05 17:26:44 -03:00
parent 95c661bdaa 64592b274b
commit 17e7f8a2cd
6 changed files with 69 additions and 3 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -47,6 +47,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### Fixed

+- Fixed an issue where `BotStartedSpeakingFrame` and `BotStoppedSpeakingFrame` 
+  were not emitted when using `TavusVideoService` or `HeyGenVideoService`.
+
 - Fixed an issue in `LiveKitTransport` where empty `AudioRawFrame`s were pushed
  down the pipeline. This resulted in warnings by the STT processor.
 - Fixed `PiperTTSService` to send text as a JSON object in the request body,
--- a/src/pipecat/audio/utils.py
+++ b/src/pipecat/audio/utils.py
@@ -20,6 +20,10 @@ from pipecat.audio.resamplers.base_audio_resampler import BaseAudioResampler
 from pipecat.audio.resamplers.soxr_resampler import SOXRAudioResampler
 from pipecat.audio.resamplers.soxr_stream_resampler import SOXRStreamAudioResampler

+# Normal speech usually results in many samples between ±500 and ±5000, depending on loudness and mic gain.
+# So we use a threshold that is well below what real speech produces.
+SPEAKING_THRESHOLD = 20
+

 def create_default_resampler(**kwargs) -> BaseAudioResampler:
    """Create a default audio resampler instance.
@@ -275,3 +279,33 @@ async def pcm_to_alaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler:
    out_alaw_bytes = audioop.lin2alaw(in_pcm_bytes, 2)

    return out_alaw_bytes
+
+
+def is_silence(pcm_bytes: bytes) -> bool:
+    """Determine if an audio sample contains silence by checking amplitude levels.
+
+    The peak absolute amplitude of the raw PCM data is compared against
+    SPEAKING_THRESHOLD. The audio is expected to be clean speech or complete
+    silence without background noise.
+
+    Args:
+        pcm_bytes: Raw PCM audio data as bytes (16-bit signed integers).
+
+    Returns:
+        bool: True if the peak amplitude is at or below SPEAKING_THRESHOLD (an
+              empty buffer counts as silence), False otherwise.
+
+    Note:
+        Normal speech typically produces amplitude values between ±500 and
+        ±5000, depending on loudness and microphone gain, so the threshold
+        sits well below typical speech levels.
+    """
+    # Convert raw audio bytes to a NumPy array of int16 samples
+    audio_data = np.frombuffer(pcm_bytes, dtype=np.int16)
+
+    # No samples means nothing audible; .max() would raise on an empty array
+    if audio_data.size == 0:
+        return True
+
+    # Widen to int32 first: abs(-32768) wraps back to -32768 in int16
+    return bool(np.abs(audio_data.astype(np.int32)).max() <= SPEAKING_THRESHOLD)
--- a/src/pipecat/frames/frames.py
+++ b/src/pipecat/frames/frames.py
@@ -238,6 +238,18 @@ class TTSAudioRawFrame(OutputAudioRawFrame):
    pass


+@dataclass
+class SpeechOutputAudioRawFrame(OutputAudioRawFrame):
+    """An audio frame that is part of a speech audio stream.
+
+    Frames of this type belong to a continuous audio stream that carries speech
+    but may also carry silence, so a consumer cannot assume every frame is
+    spoken audio and may need to tell speech and silence apart itself.
+    """
+
+    pass
+
+
 @dataclass
 class URLImageRawFrame(OutputImageRawFrame):
    """Image frame with an associated URL.
--- a/src/pipecat/services/heygen/video.py
+++ b/src/pipecat/services/heygen/video.py
@@ -27,6 +27,7 @@ from pipecat.frames.frames import (
    OutputAudioRawFrame,
    OutputImageRawFrame,
    OutputTransportReadyFrame,
+    SpeechOutputAudioRawFrame,
    StartFrame,
    TTSAudioRawFrame,
    UserStartedSpeakingFrame,
@@ -157,7 +158,7 @@ class HeyGenVideoService(AIService):

    async def _on_participant_audio_data(self, audio_frame: AudioRawFrame):
        """Handle incoming audio data from participants."""
-        frame = OutputAudioRawFrame(
+        frame = SpeechOutputAudioRawFrame(
            audio=audio_frame.audio,
            sample_rate=audio_frame.sample_rate,
            num_channels=audio_frame.num_channels,
--- a/src/pipecat/services/tavus/video.py
+++ b/src/pipecat/services/tavus/video.py
@@ -25,6 +25,7 @@ from pipecat.frames.frames import (
    OutputAudioRawFrame,
    OutputImageRawFrame,
    OutputTransportReadyFrame,
+    SpeechOutputAudioRawFrame,
    StartFrame,
    StartInterruptionFrame,
    TTSAudioRawFrame,
@@ -154,7 +155,7 @@ class TavusVideoService(AIService):
        self, participant_id: str, audio: AudioData, audio_source: str
    ):
        """Handle incoming audio data from participants."""
-        frame = OutputAudioRawFrame(
+        frame = SpeechOutputAudioRawFrame(
            audio=audio.audio_frames,
            sample_rate=audio.sample_rate,
            num_channels=audio.num_channels,
--- a/src/pipecat/transports/base_output.py
+++ b/src/pipecat/transports/base_output.py
@@ -21,7 +21,7 @@ from loguru import logger
 from PIL import Image

 from pipecat.audio.mixers.base_audio_mixer import BaseAudioMixer
-from pipecat.audio.utils import create_stream_resampler
+from pipecat.audio.utils import create_stream_resampler, is_silence
 from pipecat.frames.frames import (
    BotSpeakingFrame,
    BotStartedSpeakingFrame,
@@ -35,6 +35,7 @@ from pipecat.frames.frames import (
    OutputDTMFUrgentFrame,
    OutputImageRawFrame,
    OutputTransportReadyFrame,
+    SpeechOutputAudioRawFrame,
    SpriteFrame,
    StartFrame,
    StartInterruptionFrame,
@@ -671,10 +672,24 @@ class BaseOutputTransport(FrameProcessor):
            TOTAL_CHUNK_MS = self._params.audio_out_10ms_chunks * 10
            BOT_SPEAKING_CHUNK_PERIOD = max(int(200 / TOTAL_CHUNK_MS), 1)
            bot_speaking_counter = 0
+            speech_last_speaking_time = 0
+
            async for frame in self._next_frame():
                # Notify the bot started speaking upstream if necessary and that
                # it's actually speaking.
+                is_speaking = False
                if isinstance(frame, TTSAudioRawFrame):
+                    is_speaking = True
+                elif isinstance(frame, SpeechOutputAudioRawFrame):
+                    if not is_silence(frame.audio):
+                        is_speaking = True
+                        speech_last_speaking_time = time.time()
+                    else:
+                        silence_duration = time.time() - speech_last_speaking_time
+                        if silence_duration > BOT_VAD_STOP_SECS:
+                            await self._bot_stopped_speaking()
+
+                if is_speaking:
                    await self._bot_started_speaking()
                    if bot_speaking_counter % BOT_SPEAKING_CHUNK_PERIOD == 0:
                        await self._transport.push_frame(BotSpeakingFrame())