Handling interruption.

2026-05-20 16:26:47 -03:00
parent f568b1d8df
commit b360fbf7fc
1 changed files with 33 additions and 6 deletions
--- a/scripts/daily/test_tavus_transport.py
+++ b/scripts/daily/test_tavus_transport.py
@@ -52,6 +52,9 @@ class DailyProxyApp(EventHandler):
        # Raw PCM buffer — filled at DECLARED_SAMPLE_RATE speed, drained at TRUE_SAMPLE_RATE speed.
        self._buffer = bytearray()
        self._audio_task: asyncio.Task | None = None
+        # Tracks whether the previous frame was silence, to detect speech→silence transitions.
+        # Initialised True so leading silence before first speech is dropped.
+        self._last_was_silence: bool = True

        self._client: CallClient = CallClient(event_handler=self)
        self._client.update_subscription_profiles(
@@ -149,7 +152,7 @@ class DailyProxyApp(EventHandler):
        )

    @staticmethod
-    def _is_silence(data: bytes, threshold: int = 10) -> bool:
+    def _is_silence(data: bytes, threshold: int = 5) -> bool:
        # Interpret as 16-bit signed PCM samples and check peak amplitude.
        # WebRTC-injected silence is all zeros; real TTS audio has non-trivial
        # amplitude. This lets us skip buffering frames that Pipecat never wrote,
@@ -165,17 +168,45 @@ class DailyProxyApp(EventHandler):
        fast-audio effect we want to observe. The buffer then drains at
        TRUE_SAMPLE_RATE speed once speech stops.

-        Silence frames are dropped from the buffer entirely.
+        On the first silence after speech (speech→silence transition) we insert
+        one real-time-equivalent chunk of silence to represent the natural pause.
+        All subsequent silence frames are dropped so the buffer drains back down.
+        This also limits the impact of any TTS-emitted silence to a single chunk
+        per transition rather than letting it accumulate.
        """
        new_bytes = audio_data.audio_frames
        if self._is_silence(new_bytes):
+            if not self._last_was_silence:
+                # First silence after speech: mark the pause with one chunk.
+                self._buffer.extend(bytes(len(new_bytes) // SPEEDUP))
+                self._last_was_silence = True
            return

+        self._last_was_silence = False
        self._buffer.extend(new_bytes)

    def _audio_data_received(self, participant_id: str, audio_data: AudioData, audio_source: str):
        asyncio.run_coroutine_threadsafe(self._buffer_audio(audio_data), self._loop)

+    async def _handle_interrupt(self):
+        """Clear the audio buffer, mimicking the avatar stopping mid-speech."""
+        dropped = len(self._buffer)
+        self._buffer.clear()
+        self._last_was_silence = True
+        logger.info(
+            f"Interrupt received — dropped {dropped}B ({dropped / (TRUE_SAMPLE_RATE * 2):.3f}s) from buffer"
+        )
+
+    #
+    # Daily (EventHandler)
+    #
+
+    def on_app_message(self, message, sender):
+        if not isinstance(message, dict):
+            return
+        if message.get("event_type") == "conversation.interrupt":
+            asyncio.run_coroutine_threadsafe(self._handle_interrupt(), self._loop)
+
    async def _audio_task_handler(self):
        """Drain the buffer at TRUE_SAMPLE_RATE speed (real-time playback)."""
        chunk_frames = int(TRUE_SAMPLE_RATE * 20 / 1000)  # 20 ms chunks
@@ -202,10 +233,6 @@ class DailyProxyApp(EventHandler):
                    )
                    last_log_time = now

-    #
-    # Daily (EventHandler)
-    #
-
    def on_participant_joined(self, participant):
        participant_name = participant["info"]["userName"]
        logger.info(f"Participant {participant_name} joined")