Merge pull request #2352 from pipecat-ai/filipi/webrtc_audio_frame

Detect whether the bot is speaking based on SpeechOutputAudioRawFrame
This commit is contained in:
Filipi da Silva Fuchter
2025-08-05 17:26:44 -03:00
committed by GitHub
6 changed files with 69 additions and 3 deletions

View File

@@ -47,6 +47,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
- Fixed an issue where `BotStartedSpeakingFrame` and `BotStoppedSpeakingFrame`
were not emitted when using `TavusVideoService` or `HeyGenVideoService`.
- Fixed an issue in `LiveKitTransport` where empty `AudioRawFrame`s were pushed
down the pipeline. This resulted in warnings by the STT processor.
- Fixed `PiperTTSService` to send text as a JSON object in the request body,

View File

@@ -20,6 +20,10 @@ from pipecat.audio.resamplers.base_audio_resampler import BaseAudioResampler
from pipecat.audio.resamplers.soxr_resampler import SOXRAudioResampler
from pipecat.audio.resamplers.soxr_stream_resampler import SOXRStreamAudioResampler
# Normal speech usually produces many samples between ±500 and ±5000, depending on loudness and mic gain.
# We therefore use a threshold that is well below what real speech produces.
SPEAKING_THRESHOLD = 20
def create_default_resampler(**kwargs) -> BaseAudioResampler:
"""Create a default audio resampler instance.
@@ -275,3 +279,33 @@ async def pcm_to_alaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler:
out_alaw_bytes = audioop.lin2alaw(in_pcm_bytes, 2)
return out_alaw_bytes
def is_silence(pcm_bytes: bytes) -> bool:
    """Determine whether a chunk of PCM audio contains only silence.

    The raw bytes are interpreted as 16-bit signed integer samples and the
    peak absolute amplitude is compared against ``SPEAKING_THRESHOLD``. The
    audio is expected to be clean speech or complete silence, without
    background noise.

    Args:
        pcm_bytes: Raw PCM audio data as bytes (16-bit signed integers).

    Returns:
        True if the audio is considered silence (peak amplitude at or below
        ``SPEAKING_THRESHOLD``, or no samples at all), False otherwise.

    Note:
        Normal speech typically produces amplitude values between ±500 and
        ±5000, depending on factors like loudness and microphone gain. The
        threshold is set well below typical speech levels to reliably
        separate silence from speech.
    """
    # An empty buffer carries no speech; bail out early because taking the
    # max of a zero-size array raises ValueError.
    if not pcm_bytes:
        return True

    # Convert raw audio bytes to a NumPy array of int16 samples.
    samples = np.frombuffer(pcm_bytes, dtype=np.int16)

    # Widen before taking the absolute value: abs(-32768) does not fit in
    # int16 and would wrap back to -32768, making a full-scale negative
    # sample look quieter than the threshold.
    peak = np.abs(samples.astype(np.int32)).max()

    # Return a plain Python bool rather than a NumPy scalar.
    return bool(peak <= SPEAKING_THRESHOLD)

View File

@@ -238,6 +238,18 @@ class TTSAudioRawFrame(OutputAudioRawFrame):
pass
@dataclass
class SpeechOutputAudioRawFrame(OutputAudioRawFrame):
    """Output audio frame that belongs to a speech audio stream.

    Frames of this type come from a continuous stream of audio that contains
    speech. The same stream may also carry silence frames, so consumers might
    need their own step to tell speech apart from silence.
    """
@dataclass
class URLImageRawFrame(OutputImageRawFrame):
"""Image frame with an associated URL.

View File

@@ -27,6 +27,7 @@ from pipecat.frames.frames import (
OutputAudioRawFrame,
OutputImageRawFrame,
OutputTransportReadyFrame,
SpeechOutputAudioRawFrame,
StartFrame,
TTSAudioRawFrame,
UserStartedSpeakingFrame,
@@ -157,7 +158,7 @@ class HeyGenVideoService(AIService):
async def _on_participant_audio_data(self, audio_frame: AudioRawFrame):
"""Handle incoming audio data from participants."""
frame = OutputAudioRawFrame(
frame = SpeechOutputAudioRawFrame(
audio=audio_frame.audio,
sample_rate=audio_frame.sample_rate,
num_channels=audio_frame.num_channels,

View File

@@ -25,6 +25,7 @@ from pipecat.frames.frames import (
OutputAudioRawFrame,
OutputImageRawFrame,
OutputTransportReadyFrame,
SpeechOutputAudioRawFrame,
StartFrame,
StartInterruptionFrame,
TTSAudioRawFrame,
@@ -154,7 +155,7 @@ class TavusVideoService(AIService):
self, participant_id: str, audio: AudioData, audio_source: str
):
"""Handle incoming audio data from participants."""
frame = OutputAudioRawFrame(
frame = SpeechOutputAudioRawFrame(
audio=audio.audio_frames,
sample_rate=audio.sample_rate,
num_channels=audio.num_channels,

View File

@@ -21,7 +21,7 @@ from loguru import logger
from PIL import Image
from pipecat.audio.mixers.base_audio_mixer import BaseAudioMixer
from pipecat.audio.utils import create_stream_resampler
from pipecat.audio.utils import create_stream_resampler, is_silence
from pipecat.frames.frames import (
BotSpeakingFrame,
BotStartedSpeakingFrame,
@@ -35,6 +35,7 @@ from pipecat.frames.frames import (
OutputDTMFUrgentFrame,
OutputImageRawFrame,
OutputTransportReadyFrame,
SpeechOutputAudioRawFrame,
SpriteFrame,
StartFrame,
StartInterruptionFrame,
@@ -671,10 +672,24 @@ class BaseOutputTransport(FrameProcessor):
TOTAL_CHUNK_MS = self._params.audio_out_10ms_chunks * 10
BOT_SPEAKING_CHUNK_PERIOD = max(int(200 / TOTAL_CHUNK_MS), 1)
bot_speaking_counter = 0
speech_last_speaking_time = 0
async for frame in self._next_frame():
# Notify the bot started speaking upstream if necessary and that
# it's actually speaking.
is_speaking = False
if isinstance(frame, TTSAudioRawFrame):
is_speaking = True
elif isinstance(frame, SpeechOutputAudioRawFrame):
if not is_silence(frame.audio):
is_speaking = True
speech_last_speaking_time = time.time()
else:
silence_duration = time.time() - speech_last_speaking_time
if silence_duration > BOT_VAD_STOP_SECS:
await self._bot_stopped_speaking()
if is_speaking:
await self._bot_started_speaking()
if bot_speaking_counter % BOT_SPEAKING_CHUNK_PERIOD == 0:
await self._transport.push_frame(BotSpeakingFrame())