SegmentedSTTService: use VAD events to detect valid audio
This commit is contained in:
@@ -149,6 +149,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed a `SegmentedSTTService` issue that was causing audio to be sent
|
||||
prematurely to the STT service. Instead of analyzing the volume in this
|
||||
service we rely on VAD events which use both VAD and volume.
|
||||
|
||||
- Fixed a `GeminiMultimodalLiveLLMService` issue that was causing messages to be
|
||||
duplicated in the context when pushing `LLMMessagesAppendFrame` frames.
|
||||
|
||||
|
||||
@@ -14,7 +14,6 @@ from loguru import logger
|
||||
|
||||
from pipecat.adapters.base_llm_adapter import BaseLLMAdapter
|
||||
from pipecat.adapters.services.open_ai_adapter import OpenAILLMAdapter
|
||||
from pipecat.audio.utils import calculate_audio_volume, exp_smoothing
|
||||
from pipecat.frames.frames import (
|
||||
AudioRawFrame,
|
||||
BotStartedSpeakingFrame,
|
||||
@@ -38,6 +37,8 @@ from pipecat.frames.frames import (
|
||||
TTSTextFrame,
|
||||
TTSUpdateSettingsFrame,
|
||||
UserImageRequestFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
VisionImageRawFrame,
|
||||
)
|
||||
from pipecat.metrics.metrics import MetricsData
|
||||
@@ -859,79 +860,64 @@ class STTService(AIService):
|
||||
|
||||
|
||||
class SegmentedSTTService(STTService):
|
||||
"""SegmentedSTTService is an STTService that will detect speech and will run
|
||||
speech-to-text on speech segments only, instead of a continous stream.
|
||||
"""SegmentedSTTService is an STTService that uses VAD events to detect
|
||||
speech and will run speech-to-text on speech segments only, instead of a
|
||||
continous stream. Since it uses VAD it means that VAD needs to be enabled in
|
||||
the pipeline.
|
||||
|
||||
This service always keeps a small audio buffer to take into account that VAD
|
||||
events are delayed from when the user speech really starts.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
min_volume: float = 0.6,
|
||||
max_silence_secs: float = 0.3,
|
||||
max_buffer_secs: float = 1.5,
|
||||
sample_rate: Optional[int] = None,
|
||||
**kwargs,
|
||||
):
|
||||
def __init__(self, *, sample_rate: Optional[int] = None, **kwargs):
|
||||
super().__init__(sample_rate=sample_rate, **kwargs)
|
||||
self._min_volume = min_volume
|
||||
self._max_silence_secs = max_silence_secs
|
||||
self._max_buffer_secs = max_buffer_secs
|
||||
self._content = None
|
||||
self._wave = None
|
||||
self._silence_num_frames = 0
|
||||
# Volume exponential smoothing
|
||||
self._smoothing_factor = 0.2
|
||||
self._prev_volume = 0
|
||||
|
||||
async def process_audio_frame(self, frame: AudioRawFrame, direction: FrameDirection):
|
||||
# Try to filter out empty background noise
|
||||
volume = self._get_smoothed_volume(frame)
|
||||
if volume >= self._min_volume:
|
||||
# If volume is high enough, write new data to wave file
|
||||
self._wave.writeframes(frame.audio)
|
||||
self._silence_num_frames = 0
|
||||
else:
|
||||
self._silence_num_frames += frame.num_frames
|
||||
self._prev_volume = volume
|
||||
|
||||
# If buffer is not empty and we have enough data or there's been a long
|
||||
# silence, transcribe the audio gathered so far.
|
||||
silence_secs = self._silence_num_frames / self.sample_rate
|
||||
buffer_secs = self._wave.getnframes() / self.sample_rate
|
||||
if self._content.tell() > 0 and (
|
||||
buffer_secs > self._max_buffer_secs or silence_secs > self._max_silence_secs
|
||||
):
|
||||
self._silence_num_frames = 0
|
||||
self._wave.close()
|
||||
self._content.seek(0)
|
||||
await self.process_generator(self.run_stt(self._content.read()))
|
||||
(self._content, self._wave) = self._new_wave()
|
||||
self._audio_buffer = bytearray()
|
||||
self._audio_buffer_size_1s = 0
|
||||
self._user_speaking = False
|
||||
|
||||
async def start(self, frame: StartFrame):
|
||||
await super().start(frame)
|
||||
if not self._wave:
|
||||
(self._content, self._wave) = self._new_wave()
|
||||
self._audio_buffer_size_1s = self.sample_rate * 2
|
||||
|
||||
async def stop(self, frame: EndFrame):
|
||||
await super().stop(frame)
|
||||
self._wave.close()
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
async def cancel(self, frame: CancelFrame):
|
||||
await super().cancel(frame)
|
||||
self._wave.close()
|
||||
if isinstance(frame, UserStartedSpeakingFrame):
|
||||
await self._handle_user_started_speaking(frame)
|
||||
elif isinstance(frame, UserStoppedSpeakingFrame):
|
||||
await self._handle_user_stopped_speaking(frame)
|
||||
|
||||
async def _handle_user_started_speaking(self, frame: UserStartedSpeakingFrame):
|
||||
self._user_speaking = True
|
||||
|
||||
async def _handle_user_stopped_speaking(self, frame: UserStoppedSpeakingFrame):
|
||||
self._user_speaking = False
|
||||
|
||||
def _new_wave(self):
|
||||
content = io.BytesIO()
|
||||
ww = wave.open(content, "wb")
|
||||
ww.setsampwidth(2)
|
||||
ww.setnchannels(1)
|
||||
ww.setframerate(self.sample_rate)
|
||||
return (content, ww)
|
||||
wav = wave.open(content, "wb")
|
||||
wav.setsampwidth(2)
|
||||
wav.setnchannels(1)
|
||||
wav.setframerate(self.sample_rate)
|
||||
wav.writeframes(self._audio_buffer)
|
||||
wav.close()
|
||||
content.seek(0)
|
||||
|
||||
def _get_smoothed_volume(self, frame: AudioRawFrame) -> float:
|
||||
volume = calculate_audio_volume(frame.audio, frame.sample_rate)
|
||||
return exp_smoothing(volume, self._prev_volume, self._smoothing_factor)
|
||||
await self.process_generator(self.run_stt(content.read()))
|
||||
|
||||
# Start clean.
|
||||
self._audio_buffer.clear()
|
||||
|
||||
async def process_audio_frame(self, frame: AudioRawFrame, direction: FrameDirection):
|
||||
# If the user is speaking the audio buffer will keep growin.
|
||||
self._audio_buffer += frame.audio
|
||||
|
||||
# If the user is not speaking we keep just a little bit of audio.
|
||||
if not self._user_speaking and len(self._audio_buffer) > self._audio_buffer_size_1s:
|
||||
discarded = len(self._audio_buffer) - self._audio_buffer_size_1s
|
||||
self._audio_buffer = self._audio_buffer[discarded:]
|
||||
|
||||
|
||||
class ImageGenService(AIService):
|
||||
|
||||
Reference in New Issue
Block a user