Keeping the _speech_triggered as true if the state is incomplete.

2025-04-17 16:46:15 -03:00
parent a80f82cdb6
commit 3ea9cfd251
2 changed files with 18 additions and 14 deletions
--- a/src/pipecat/audio/turn/base_smart_turn.py
+++ b/src/pipecat/audio/turn/base_smart_turn.py
@@ -82,7 +82,7 @@ class BaseSmartTurn(ABC):
                        f"End of Turn complete due to stop_secs. Silence in ms: {self._silence_ms}"
                    )
                    state = EndOfTurnState.COMPLETE
-                    self._clear()
+                    self._clear(state)
            else:
                # Trim buffer to prevent unbounded growth before speech
                max_buffer_time = (
@@ -101,14 +101,15 @@ class BaseSmartTurn(ABC):
        logger.debug("Analyzing End of Turn...")
        state = self._process_speech_segment(self._audio_buffer)
        if state == EndOfTurnState.COMPLETE or USE_ONLY_LAST_VAD_SEGMENT:
-            self._clear()
+            self._clear(state)
        logger.debug(f"End of Turn result: {state}")
        return state

-    def _clear(self):
+    def _clear(self, turn_state: EndOfTurnState):
        # Reset internal state for next turn
        logger.debug("Clearing audio buffer...")
-        self._speech_triggered = False
+        # If the state is still incomplete, keep the _speech_triggered as True
+        self._speech_triggered = turn_state == EndOfTurnState.INCOMPLETE
        self._audio_buffer = []
        self._speech_start_time = None
        self._silence_ms = 0
--- a/src/pipecat/transports/base_input.py
+++ b/src/pipecat/transports/base_input.py
@@ -221,6 +221,18 @@ class BaseInputTransport(FrameProcessor):
            await self.push_frame(UserEndOfTurnFrame())
            await self._handle_user_interruption(UserStoppedSpeakingFrame())

+    async def _run_turn_analyzer(self, frame: InputAudioRawFrame, vad_state: VADState, previous_vad_state: VADState):
+        is_speech = vad_state == VADState.SPEAKING or vad_state == VADState.STARTING
+        # If silence exceeds threshold, we are going to receive EndOfTurnState.COMPLETE
+        end_of_turn_state = self._params.end_of_turn_analyzer.append_audio(
+            frame.audio, is_speech
+        )
+        if end_of_turn_state == EndOfTurnState.COMPLETE:
+            await self._handle_end_of_turn_complete(end_of_turn_state)
+        # Otherwise we are going to trigger to check if the turn is completed based on the VAD
+        elif vad_state == VADState.QUIET and vad_state != previous_vad_state:
+            await self._handle_end_of_turn()
+
    async def _audio_task_handler(self):
        vad_state: VADState = VADState.QUIET
        while True:
@@ -240,16 +252,7 @@ class BaseInputTransport(FrameProcessor):
                audio_passthrough = self._params.vad_audio_passthrough

            if self._params.end_of_turn_analyzer:
-                is_speech = vad_state == VADState.SPEAKING or vad_state == VADState.STARTING
-                # If silence exceeds threshold, we are going to receive EndOfTurnState.COMPLETE
-                end_of_turn_state = self._params.end_of_turn_analyzer.append_audio(
-                    frame.audio, is_speech
-                )
-                if end_of_turn_state == EndOfTurnState.COMPLETE:
-                    await self._handle_end_of_turn_complete(end_of_turn_state)
-                # Otherwise we are going to trigger to check if the turn is completed based on the VAD
-                elif vad_state == VADState.QUIET and vad_state != previous_vad_state:
-                    await self._handle_end_of_turn()
+                await self._run_turn_analyzer(frame, vad_state, previous_vad_state)

            # Push audio downstream if passthrough.
            if audio_passthrough: