From 64592b274b9fbdc9ca7d057685be7a09f65a17ad Mon Sep 17 00:00:00 2001 From: Filipi Fuchter Date: Tue, 5 Aug 2025 17:11:34 -0300 Subject: [PATCH] Fixed an issue where `BotStartedSpeakingFrame` and `BotStoppedSpeakingFrame` were not emitted when using `TavusVideoService` or `HeyGenVideoService`. --- CHANGELOG.md | 3 +++ src/pipecat/audio/utils.py | 34 +++++++++++++++++++++++++++ src/pipecat/frames/frames.py | 12 ++++++++++ src/pipecat/services/heygen/video.py | 3 ++- src/pipecat/services/tavus/video.py | 3 ++- src/pipecat/transports/base_output.py | 17 +++++++++++++- 6 files changed, 69 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4da3cf32d..e18ca157d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -47,6 +47,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed +- Fixed an issue where `BotStartedSpeakingFrame` and `BotStoppedSpeakingFrame` + were not emitted when using `TavusVideoService` or `HeyGenVideoService`. + - Fixed an issue in `LiveKitTransport` where empty `AudioRawFrame`s were pushed down the pipeline. This resulted in warnings by the STT processor. - Fixed `PiperTTSService` to send text as a JSON object in the request body, diff --git a/src/pipecat/audio/utils.py b/src/pipecat/audio/utils.py index 6d5a12929..b2f76ec60 100644 --- a/src/pipecat/audio/utils.py +++ b/src/pipecat/audio/utils.py @@ -20,6 +20,10 @@ from pipecat.audio.resamplers.base_audio_resampler import BaseAudioResampler from pipecat.audio.resamplers.soxr_resampler import SOXRAudioResampler from pipecat.audio.resamplers.soxr_stream_resampler import SOXRStreamAudioResampler +# Normal speech usually results in many samples between ±500 and ±5000, depending on loudness and mic gain. +# So we are using a threshold that is well below what real speech produces. +SPEAKING_THRESHOLD = 20 + def create_default_resampler(**kwargs) -> BaseAudioResampler: """Create a default audio resampler instance. 
@@ -275,3 +279,33 @@ async def pcm_to_alaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler: out_alaw_bytes = audioop.lin2alaw(in_pcm_bytes, 2) return out_alaw_bytes + + +def is_silence(pcm_bytes: bytes) -> bool: + """Determine if an audio sample contains silence by checking amplitude levels. + + This function analyzes raw PCM audio data to detect silence by comparing + the maximum absolute amplitude against a predefined threshold. The audio + is expected to be clean speech or complete silence without background noise. + + Args: + pcm_bytes: Raw PCM audio data as bytes (16-bit signed integers). + + Returns: + bool: True if the audio sample is considered silence (at or below threshold), + False otherwise. + + Note: + Normal speech typically produces amplitude values between ±500 and ±5000, + depending on factors like loudness and microphone gain. The threshold + (SPEAKING_THRESHOLD) is set well below typical speech levels to + reliably detect silence vs. speech. + """ + # Convert raw audio bytes to a NumPy array of int16 samples + audio_data = np.frombuffer(pcm_bytes, dtype=np.int16) + + # Check the maximum absolute amplitude in the frame + max_value = np.abs(audio_data).max() + + # If max value is at or below SPEAKING_THRESHOLD, consider it as silence + return max_value <= SPEAKING_THRESHOLD diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py index bc9e8a06d..94e799aa6 100644 --- a/src/pipecat/frames/frames.py +++ b/src/pipecat/frames/frames.py @@ -238,6 +238,18 @@ class TTSAudioRawFrame(OutputAudioRawFrame): pass +@dataclass +class SpeechOutputAudioRawFrame(OutputAudioRawFrame): + """An audio frame that is part of a speech audio stream. + + This frame is part of a continuous stream of audio frames containing speech. + The audio stream might also contain silence frames, so a process to distinguish + between speech and silence might be needed. 
+ """ + + pass + + @dataclass class URLImageRawFrame(OutputImageRawFrame): """Image frame with an associated URL. diff --git a/src/pipecat/services/heygen/video.py b/src/pipecat/services/heygen/video.py index 011151fdf..237dfc36f 100644 --- a/src/pipecat/services/heygen/video.py +++ b/src/pipecat/services/heygen/video.py @@ -27,6 +27,7 @@ from pipecat.frames.frames import ( OutputAudioRawFrame, OutputImageRawFrame, OutputTransportReadyFrame, + SpeechOutputAudioRawFrame, StartFrame, TTSAudioRawFrame, UserStartedSpeakingFrame, @@ -157,7 +158,7 @@ class HeyGenVideoService(AIService): async def _on_participant_audio_data(self, audio_frame: AudioRawFrame): """Handle incoming audio data from participants.""" - frame = OutputAudioRawFrame( + frame = SpeechOutputAudioRawFrame( audio=audio_frame.audio, sample_rate=audio_frame.sample_rate, num_channels=audio_frame.num_channels, diff --git a/src/pipecat/services/tavus/video.py b/src/pipecat/services/tavus/video.py index 37b21257f..684bd5659 100644 --- a/src/pipecat/services/tavus/video.py +++ b/src/pipecat/services/tavus/video.py @@ -25,6 +25,7 @@ from pipecat.frames.frames import ( OutputAudioRawFrame, OutputImageRawFrame, OutputTransportReadyFrame, + SpeechOutputAudioRawFrame, StartFrame, StartInterruptionFrame, TTSAudioRawFrame, @@ -154,7 +155,7 @@ class TavusVideoService(AIService): self, participant_id: str, audio: AudioData, audio_source: str ): """Handle incoming audio data from participants.""" - frame = OutputAudioRawFrame( + frame = SpeechOutputAudioRawFrame( audio=audio.audio_frames, sample_rate=audio.sample_rate, num_channels=audio.num_channels, diff --git a/src/pipecat/transports/base_output.py b/src/pipecat/transports/base_output.py index 4c60f7490..313c36fc0 100644 --- a/src/pipecat/transports/base_output.py +++ b/src/pipecat/transports/base_output.py @@ -21,7 +21,7 @@ from loguru import logger from PIL import Image from pipecat.audio.mixers.base_audio_mixer import BaseAudioMixer -from pipecat.audio.utils 
import create_stream_resampler +from pipecat.audio.utils import create_stream_resampler, is_silence from pipecat.frames.frames import ( BotSpeakingFrame, BotStartedSpeakingFrame, @@ -35,6 +35,7 @@ from pipecat.frames.frames import ( OutputDTMFUrgentFrame, OutputImageRawFrame, OutputTransportReadyFrame, + SpeechOutputAudioRawFrame, SpriteFrame, StartFrame, StartInterruptionFrame, @@ -671,10 +672,24 @@ class BaseOutputTransport(FrameProcessor): TOTAL_CHUNK_MS = self._params.audio_out_10ms_chunks * 10 BOT_SPEAKING_CHUNK_PERIOD = max(int(200 / TOTAL_CHUNK_MS), 1) bot_speaking_counter = 0 + speech_last_speaking_time = 0 + async for frame in self._next_frame(): # Notify the bot started speaking upstream if necessary and that # it's actually speaking. + is_speaking = False if isinstance(frame, TTSAudioRawFrame): + is_speaking = True + elif isinstance(frame, SpeechOutputAudioRawFrame): + if not is_silence(frame.audio): + is_speaking = True + speech_last_speaking_time = time.time() + else: + silence_duration = time.time() - speech_last_speaking_time + if silence_duration > BOT_VAD_STOP_SECS: + await self._bot_stopped_speaking() + + if is_speaking: await self._bot_started_speaking() if bot_speaking_counter % BOT_SPEAKING_CHUNK_PERIOD == 0: await self._transport.push_frame(BotSpeakingFrame())