Merge pull request #2352 from pipecat-ai/filipi/webrtc_audio_frame
Detect whether the bot is speaking or not based on the SpeechOutputAudioRawFrame
This commit is contained in:
@@ -47,6 +47,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue where `BotStartedSpeakingFrame` and `BotStoppedSpeakingFrame`
|
||||
were not emitted when using `TavusVideoService` or `HeyGenVideoService`.
|
||||
|
||||
- Fixed an issue in `LiveKitTransport` where empty `AudioRawFrame`s were pushed
|
||||
down the pipeline. This resulted in warnings by the STT processor.
|
||||
- Fixed `PiperTTSService` to send text as a JSON object in the request body,
|
||||
|
||||
@@ -20,6 +20,10 @@ from pipecat.audio.resamplers.base_audio_resampler import BaseAudioResampler
|
||||
from pipecat.audio.resamplers.soxr_resampler import SOXRAudioResampler
|
||||
from pipecat.audio.resamplers.soxr_stream_resampler import SOXRStreamAudioResampler
|
||||
|
||||
# Normal speech usually produces many samples between ±500 and ±5000, depending on loudness and mic gain.
|
||||
# So we are using a threshold that is well below what real speech produces.
|
||||
SPEAKING_THRESHOLD = 20
|
||||
|
||||
|
||||
def create_default_resampler(**kwargs) -> BaseAudioResampler:
|
||||
"""Create a default audio resampler instance.
|
||||
@@ -275,3 +279,33 @@ async def pcm_to_alaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler:
|
||||
out_alaw_bytes = audioop.lin2alaw(in_pcm_bytes, 2)
|
||||
|
||||
return out_alaw_bytes
|
||||
|
||||
|
||||
def is_silence(pcm_bytes: bytes) -> bool:
    """Determine if an audio sample contains silence by checking amplitude levels.

    This function analyzes raw PCM audio data to detect silence by comparing
    the peak absolute amplitude against a predefined threshold. The audio
    is expected to be clean speech or complete silence without background noise.

    Args:
        pcm_bytes: Raw PCM audio data as bytes (16-bit signed integers).

    Returns:
        bool: True if the audio sample is considered silence (peak amplitude
            at or below SPEAKING_THRESHOLD, or no samples at all),
            False otherwise.

    Raises:
        ValueError: If the length of ``pcm_bytes`` is not a multiple of the
            16-bit sample size (raised by ``np.frombuffer``).

    Note:
        Normal speech typically produces amplitude values between ±500 and
        ±5000, depending on factors like loudness and microphone gain. The
        threshold (SPEAKING_THRESHOLD) is set well below typical speech levels
        to reliably detect silence vs. speech.
    """
    # Convert raw audio bytes to a NumPy array of int16 samples
    audio_data = np.frombuffer(pcm_bytes, dtype=np.int16)

    # A buffer without samples carries no speech; calling max() on a
    # zero-size array would raise ValueError, so answer early.
    if audio_data.size == 0:
        return True

    # Widen to int32 before taking the absolute value: abs(-32768) does not
    # fit in int16 and would wrap back to -32768, making a full-scale
    # negative sample look like silence.
    peak_amplitude = np.abs(audio_data.astype(np.int32)).max()

    # At or below SPEAKING_THRESHOLD counts as silence. Convert the NumPy
    # boolean to a plain bool to honor the annotated return type.
    return bool(peak_amplitude <= SPEAKING_THRESHOLD)
|
||||
|
||||
@@ -238,6 +238,18 @@ class TTSAudioRawFrame(OutputAudioRawFrame):
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
class SpeechOutputAudioRawFrame(OutputAudioRawFrame):
    """Output audio frame that belongs to a speech audio stream.

    Frames of this type form a continuous stream of audio that contains
    speech. Because that stream may also include silence frames, consumers
    might need a way to tell speech apart from silence.
    """
|
||||
|
||||
|
||||
@dataclass
|
||||
class URLImageRawFrame(OutputImageRawFrame):
|
||||
"""Image frame with an associated URL.
|
||||
|
||||
@@ -27,6 +27,7 @@ from pipecat.frames.frames import (
|
||||
OutputAudioRawFrame,
|
||||
OutputImageRawFrame,
|
||||
OutputTransportReadyFrame,
|
||||
SpeechOutputAudioRawFrame,
|
||||
StartFrame,
|
||||
TTSAudioRawFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
@@ -157,7 +158,7 @@ class HeyGenVideoService(AIService):
|
||||
|
||||
async def _on_participant_audio_data(self, audio_frame: AudioRawFrame):
|
||||
"""Handle incoming audio data from participants."""
|
||||
frame = OutputAudioRawFrame(
|
||||
frame = SpeechOutputAudioRawFrame(
|
||||
audio=audio_frame.audio,
|
||||
sample_rate=audio_frame.sample_rate,
|
||||
num_channels=audio_frame.num_channels,
|
||||
|
||||
@@ -25,6 +25,7 @@ from pipecat.frames.frames import (
|
||||
OutputAudioRawFrame,
|
||||
OutputImageRawFrame,
|
||||
OutputTransportReadyFrame,
|
||||
SpeechOutputAudioRawFrame,
|
||||
StartFrame,
|
||||
StartInterruptionFrame,
|
||||
TTSAudioRawFrame,
|
||||
@@ -154,7 +155,7 @@ class TavusVideoService(AIService):
|
||||
self, participant_id: str, audio: AudioData, audio_source: str
|
||||
):
|
||||
"""Handle incoming audio data from participants."""
|
||||
frame = OutputAudioRawFrame(
|
||||
frame = SpeechOutputAudioRawFrame(
|
||||
audio=audio.audio_frames,
|
||||
sample_rate=audio.sample_rate,
|
||||
num_channels=audio.num_channels,
|
||||
|
||||
@@ -21,7 +21,7 @@ from loguru import logger
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.audio.mixers.base_audio_mixer import BaseAudioMixer
|
||||
from pipecat.audio.utils import create_stream_resampler
|
||||
from pipecat.audio.utils import create_stream_resampler, is_silence
|
||||
from pipecat.frames.frames import (
|
||||
BotSpeakingFrame,
|
||||
BotStartedSpeakingFrame,
|
||||
@@ -35,6 +35,7 @@ from pipecat.frames.frames import (
|
||||
OutputDTMFUrgentFrame,
|
||||
OutputImageRawFrame,
|
||||
OutputTransportReadyFrame,
|
||||
SpeechOutputAudioRawFrame,
|
||||
SpriteFrame,
|
||||
StartFrame,
|
||||
StartInterruptionFrame,
|
||||
@@ -671,10 +672,24 @@ class BaseOutputTransport(FrameProcessor):
|
||||
TOTAL_CHUNK_MS = self._params.audio_out_10ms_chunks * 10
|
||||
BOT_SPEAKING_CHUNK_PERIOD = max(int(200 / TOTAL_CHUNK_MS), 1)
|
||||
bot_speaking_counter = 0
|
||||
speech_last_speaking_time = 0
|
||||
|
||||
async for frame in self._next_frame():
|
||||
# Notify the bot started speaking upstream if necessary and that
|
||||
# it's actually speaking.
|
||||
is_speaking = False
|
||||
if isinstance(frame, TTSAudioRawFrame):
|
||||
is_speaking = True
|
||||
elif isinstance(frame, SpeechOutputAudioRawFrame):
|
||||
if not is_silence(frame.audio):
|
||||
is_speaking = True
|
||||
speech_last_speaking_time = time.time()
|
||||
else:
|
||||
silence_duration = time.time() - speech_last_speaking_time
|
||||
if silence_duration > BOT_VAD_STOP_SECS:
|
||||
await self._bot_stopped_speaking()
|
||||
|
||||
if is_speaking:
|
||||
await self._bot_started_speaking()
|
||||
if bot_speaking_counter % BOT_SPEAKING_CHUNK_PERIOD == 0:
|
||||
await self._transport.push_frame(BotSpeakingFrame())
|
||||
|
||||
Reference in New Issue
Block a user