diff --git a/src/pipecat/audio/vad/vad_analyzer.py b/src/pipecat/audio/vad/vad_analyzer.py index a57181008..3ca21a208 100644 --- a/src/pipecat/audio/vad/vad_analyzer.py +++ b/src/pipecat/audio/vad/vad_analyzer.py @@ -6,7 +6,7 @@ from abc import ABC, abstractmethod from enum import Enum -from typing import Optional, Tuple +from typing import Optional from loguru import logger from pydantic import BaseModel @@ -88,24 +88,12 @@ class VADAnalyzer(ABC): volume = calculate_audio_volume(audio, self.sample_rate) return exp_smoothing(volume, self._prev_volume, self._smoothing_factor) - def analyze_audio(self, buffer) -> Tuple[VADState, Optional[str]]: - """Analyze audio for voice activity. - - Args: - buffer: Audio buffer to analyze - - Returns: - Tuple containing: - - VADState: Current VAD state - - Optional[str]: Event type if a speech event occurred ("speech_started", - "speech_stopped"), or None if no event occurred - """ + def analyze_audio(self, buffer) -> VADState: self._vad_buffer += buffer - event_type = None num_required_bytes = self._vad_frames_num_bytes if len(self._vad_buffer) < num_required_bytes: - return self._vad_state, event_type + return self._vad_state audio_frames = self._vad_buffer[:num_required_bytes] self._vad_buffer = self._vad_buffer[num_required_bytes:] @@ -144,7 +132,6 @@ class VADAnalyzer(ABC): ): self._vad_state = VADState.SPEAKING self._vad_starting_count = 0 - event_type = "speech_started" if ( self._vad_state == VADState.STOPPING @@ -152,6 +139,5 @@ class VADAnalyzer(ABC): ): self._vad_state = VADState.QUIET self._vad_stopping_count = 0 - event_type = "speech_stopped" - return self._vad_state, event_type + return self._vad_state diff --git a/src/pipecat/transports/base_input.py b/src/pipecat/transports/base_input.py index 57511a96a..acb0058c6 100644 --- a/src/pipecat/transports/base_input.py +++ b/src/pipecat/transports/base_input.py @@ -230,13 +230,9 @@ class BaseInputTransport(FrameProcessor): async def _vad_analyze(self, audio_frame: InputAudioRawFrame) -> VADState: state = VADState.QUIET if self.vad_analyzer: - state, event_type = await self.get_event_loop().run_in_executor( + state = await self.get_event_loop().run_in_executor( self._executor, self.vad_analyzer.analyze_audio, audio_frame.audio ) - - if event_type: - await self._handle_vad_event(event_type) - return state async def _handle_vad(self, audio_frame: InputAudioRawFrame, vad_state: VADState): @@ -254,10 +250,13 @@ class BaseInputTransport(FrameProcessor): self._params.turn_analyzer is None or not self._params.turn_analyzer.speech_triggered ) - if can_create_user_frames: - if new_vad_state == VADState.SPEAKING: + if new_vad_state == VADState.SPEAKING: + await self.push_frame(VADUserStartedSpeakingFrame()) + if can_create_user_frames: frame = UserStartedSpeakingFrame() - elif new_vad_state == VADState.QUIET: + elif new_vad_state == VADState.QUIET: + await self.push_frame(VADUserStoppedSpeakingFrame()) + if can_create_user_frames: frame = UserStoppedSpeakingFrame() if frame: @@ -266,16 +265,6 @@ class BaseInputTransport(FrameProcessor): vad_state = new_vad_state return vad_state - async def _handle_vad_event(self, event_type: str): - """Handle VAD speech events by creating and pushing appropriate frames.""" - if event_type == "speech_started": - logger.debug("VAD detected definitive speech start") - await self.push_frame(VADUserStartedSpeakingFrame()) - - elif event_type == "speech_stopped": - logger.debug("VAD detected definitive speech stop") - await self.push_frame(VADUserStoppedSpeakingFrame()) - async def _handle_end_of_turn(self): if self.turn_analyzer: state, prediction = await self.turn_analyzer.analyze_end_of_turn()