Merge pull request #397 from pipecat-ai/khk/rtvi-vad-params

VADParamsUpdateFrame and handling thereof
This commit is contained in:
Kwindla Hultman Kramer
2024-08-18 21:14:58 -07:00
committed by GitHub
3 changed files with 37 additions and 12 deletions

View File

@@ -456,3 +456,11 @@ class FunctionCallResultFrame(DataFrame):
tool_call_id: str
arguments: str
result: any
@dataclass
class VADParamsUpdateFrame(ControlFrame):
"""A control frame containing a request to update VAD params. Intended
to be pushed upstream from RTVI processor.
"""
# New VAD settings carried by this frame; the input transport forwards this
# value unchanged to the analyzer's set_params().
# NOTE(review): annotated as a plain dict, but set_params() is annotated to
# take VADParams and reads `.start_secs` / `.stop_secs` as attributes —
# confirm whether this should be a VADParams instance.
params: dict

View File

@@ -20,9 +20,10 @@ from pipecat.frames.frames import (
StopInterruptionFrame,
SystemFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame)
UserStoppedSpeakingFrame,
VADParamsUpdateFrame)
from pipecat.transports.base_transport import TransportParams
from pipecat.vad.vad_analyzer import VADAnalyzer, VADState
from pipecat.vad.vad_analyzer import VADAnalyzer, VADParams, VADState
from loguru import logger
@@ -102,6 +103,11 @@ class BaseInputTransport(FrameProcessor):
# finish and the task finishes when EndFrame is processed.
await self._internal_push_frame(frame, direction)
await self.stop(frame)
elif isinstance(frame, VADParamsUpdateFrame):
vad_analyzer = self.vad_analyzer()
if not vad_analyzer:
pass
vad_analyzer.set_params(frame.params)
# Other frames
else:
await self._internal_push_frame(frame, direction)

View File

@@ -11,6 +11,8 @@ from pydantic.main import BaseModel
from pipecat.utils.audio import calculate_audio_volume, exp_smoothing
from loguru import logger
class VADState(Enum):
QUIET = 1
@@ -31,17 +33,8 @@ class VADAnalyzer:
def __init__(self, *, sample_rate: int, num_channels: int, params: VADParams):
self._sample_rate = sample_rate
self._num_channels = num_channels
self._params = params
self._vad_frames = self.num_frames_required()
self._vad_frames_num_bytes = self._vad_frames * num_channels * 2
vad_frames_per_sec = self._vad_frames / self._sample_rate
self._vad_start_frames = round(self._params.start_secs / vad_frames_per_sec)
self._vad_stop_frames = round(self._params.stop_secs / vad_frames_per_sec)
self._vad_starting_count = 0
self._vad_stopping_count = 0
self._vad_state: VADState = VADState.QUIET
self.set_params(params)
self._vad_buffer = b""
@@ -53,6 +46,10 @@ class VADAnalyzer:
# Read accessor: returns the sample rate stored on this analyzer by __init__
# (self._sample_rate).
def sample_rate(self):
return self._sample_rate
# Read-only property: returns the channel count stored on this analyzer by
# __init__ (self._num_channels).
@property
def num_channels(self):
return self._num_channels
# Abstract hook implemented by concrete analyzers. The returned int is stored
# by set_params() as self._vad_frames and is used there to derive the VAD
# buffer size in bytes and the start/stop frame thresholds.
@abstractmethod
def num_frames_required(self) -> int:
pass
@@ -61,6 +58,20 @@ class VADAnalyzer:
# Placeholder hook: takes an audio `buffer` and is annotated to return a float
# confidence. The body here is only `pass`.
# NOTE(review): presumably declared abstract like num_frames_required (the
# decorator line is not visible in this view) — confirm.
def voice_confidence(self, buffer) -> float:
pass
def set_params(self, params: VADParams):
    """Install new VAD parameters and recompute everything derived from them.

    Stores ``params``, re-queries ``num_frames_required()`` to size one VAD
    analysis window, converts the ``start_secs`` / ``stop_secs`` durations
    into a number of windows, and resets the detection state to QUIET with
    both transition counters at zero.

    Args:
        params: The VAD settings to apply; ``start_secs`` and ``stop_secs``
            are read from it.
    """
    logger.debug(f"Setting VAD params to: {params}")
    self._params = params

    # Size of one analysis window, in frames and in bytes.
    frames_per_window = self.num_frames_required()
    self._vad_frames = frames_per_window
    # The factor 2 is presumably bytes per sample (16-bit audio) — confirm.
    self._vad_frames_num_bytes = frames_per_window * self._num_channels * 2

    # Duration of one window in seconds; the thresholds are how many such
    # windows fit in the configured start/stop durations.
    window_secs = frames_per_window / self._sample_rate
    self._vad_start_frames = round(params.start_secs / window_secs)
    self._vad_stop_frames = round(params.stop_secs / window_secs)

    # Any in-progress detection is discarded when parameters change.
    self._vad_state: VADState = VADState.QUIET
    self._vad_starting_count = 0
    self._vad_stopping_count = 0
def _get_smoothed_volume(self, audio: bytes) -> float:
    """Return the volume of ``audio`` after exponential smoothing.

    The raw volume is computed with ``calculate_audio_volume`` at this
    analyzer's sample rate and then blended with ``self._prev_volume`` via
    ``exp_smoothing`` using ``self._smoothing_factor``. This method does not
    update ``self._prev_volume`` itself.
    """
    return exp_smoothing(
        calculate_audio_volume(audio, self._sample_rate),
        self._prev_volume,
        self._smoothing_factor,
    )