From 3ea9cfd2518cd0abfab3375f2afc741ff242c32a Mon Sep 17 00:00:00 2001 From: Filipi Fuchter Date: Thu, 17 Apr 2025 16:46:15 -0300 Subject: [PATCH] Keeping the _speech_triggered as true if the state is incomplete. --- src/pipecat/audio/turn/base_smart_turn.py | 9 +++++---- src/pipecat/transports/base_input.py | 23 +++++++++++++---------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/src/pipecat/audio/turn/base_smart_turn.py b/src/pipecat/audio/turn/base_smart_turn.py index c08fd8fb3..cc18b2880 100644 --- a/src/pipecat/audio/turn/base_smart_turn.py +++ b/src/pipecat/audio/turn/base_smart_turn.py @@ -82,7 +82,7 @@ class BaseSmartTurn(ABC): f"End of Turn complete due to stop_secs. Silence in ms: {self._silence_ms}" ) state = EndOfTurnState.COMPLETE - self._clear() + self._clear(state) else: # Trim buffer to prevent unbounded growth before speech max_buffer_time = ( @@ -101,14 +101,15 @@ class BaseSmartTurn(ABC): logger.debug("Analyzing End of Turn...") state = self._process_speech_segment(self._audio_buffer) if state == EndOfTurnState.COMPLETE or USE_ONLY_LAST_VAD_SEGMENT: - self._clear() + self._clear(state) logger.debug(f"End of Turn result: {state}") return state - def _clear(self): + def _clear(self, turn_state: EndOfTurnState): # Reset internal state for next turn logger.debug("Clearing audio buffer...") - self._speech_triggered = False + # If the state is still incomplete, keep the _speech_triggered as True + self._speech_triggered = turn_state == EndOfTurnState.INCOMPLETE self._audio_buffer = [] self._speech_start_time = None self._silence_ms = 0 diff --git a/src/pipecat/transports/base_input.py b/src/pipecat/transports/base_input.py index 7ae6d3e64..b43b71a49 100644 --- a/src/pipecat/transports/base_input.py +++ b/src/pipecat/transports/base_input.py @@ -221,6 +221,18 @@ class BaseInputTransport(FrameProcessor): await self.push_frame(UserEndOfTurnFrame()) await self._handle_user_interruption(UserStoppedSpeakingFrame()) + async def _run_turn_analyzer(self, frame: InputAudioRawFrame, vad_state: VADState, previous_vad_state: VADState): + is_speech = vad_state == VADState.SPEAKING or vad_state == VADState.STARTING + # If silence exceeds threshold, we are going to receive EndOfTurnState.COMPLETE + end_of_turn_state = self._params.end_of_turn_analyzer.append_audio( + frame.audio, is_speech + ) + if end_of_turn_state == EndOfTurnState.COMPLETE: + await self._handle_end_of_turn_complete(end_of_turn_state) + # Otherwise we are going to trigger to check if the turn is completed based on the VAD + elif vad_state == VADState.QUIET and vad_state != previous_vad_state: + await self._handle_end_of_turn() + async def _audio_task_handler(self): vad_state: VADState = VADState.QUIET while True: @@ -240,16 +252,7 @@ class BaseInputTransport(FrameProcessor): audio_passthrough = self._params.vad_audio_passthrough if self._params.end_of_turn_analyzer: - is_speech = vad_state == VADState.SPEAKING or vad_state == VADState.STARTING - # If silence exceeds threshold, we are going to receive EndOfTurnState.COMPLETE - end_of_turn_state = self._params.end_of_turn_analyzer.append_audio( - frame.audio, is_speech - ) - if end_of_turn_state == EndOfTurnState.COMPLETE: - await self._handle_end_of_turn_complete(end_of_turn_state) - # Otherwise we are going to trigger to check if the turn is completed based on the VAD - elif vad_state == VADState.QUIET and vad_state != previous_vad_state: - await self._handle_end_of_turn() + await self._run_turn_analyzer(frame, vad_state, previous_vad_state) # Push audio downstream if passthrough. if audio_passthrough: