From b360fbf7fc07241e47891b2048e3abec2e8f6ab9 Mon Sep 17 00:00:00 2001
From: filipi87 <filipi87@gmail.com>
Date: Wed, 20 May 2026 16:26:47 -0300
Subject: [PATCH] Handling interruption.

---
 scripts/daily/test_tavus_transport.py | 39 ++++++++++++++++++++++-----
 1 file changed, 33 insertions(+), 6 deletions(-)

diff --git a/scripts/daily/test_tavus_transport.py b/scripts/daily/test_tavus_transport.py
index ed9e4fccf..109f93321 100644
--- a/scripts/daily/test_tavus_transport.py
+++ b/scripts/daily/test_tavus_transport.py
@@ -52,6 +52,9 @@ class DailyProxyApp(EventHandler):
         # Raw PCM buffer — filled at DECLARED_SAMPLE_RATE speed, drained at TRUE_SAMPLE_RATE speed.
         self._buffer = bytearray()
         self._audio_task: asyncio.Task | None = None
+        # Tracks whether the previous frame was silence, to detect speech→silence transitions.
+        # Initialised True so leading silence before first speech is dropped.
+        self._last_was_silence: bool = True
 
         self._client: CallClient = CallClient(event_handler=self)
         self._client.update_subscription_profiles(
@@ -149,7 +152,7 @@ class DailyProxyApp(EventHandler):
         )
 
     @staticmethod
-    def _is_silence(data: bytes, threshold: int = 10) -> bool:
+    def _is_silence(data: bytes, threshold: int = 5) -> bool:
         # Interpret as 16-bit signed PCM samples and check peak amplitude.
         # WebRTC-injected silence is all zeros; real TTS audio has non-trivial
         # amplitude. This lets us skip buffering frames that Pipecat never wrote,
@@ -165,17 +168,45 @@ class DailyProxyApp(EventHandler):
         fast-audio effect we want to observe. The buffer then drains at
         TRUE_SAMPLE_RATE speed once speech stops.
 
-        Silence frames are dropped from the buffer entirely.
+        On the first silence after speech (speech→silence transition) we insert
+        one real-time-equivalent chunk of silence to represent the natural pause.
+        All subsequent silence frames are dropped so the buffer drains back down.
+        This also limits the impact of any TTS-emitted silence to a single chunk
+        per transition rather than letting it accumulate.
         """
         new_bytes = audio_data.audio_frames
         if self._is_silence(new_bytes):
+            if not self._last_was_silence:
+                # First silence after speech: mark the pause with one chunk.
+                self._buffer.extend(bytes(len(new_bytes) // SPEEDUP))
+                self._last_was_silence = True
             return
 
+        self._last_was_silence = False
         self._buffer.extend(new_bytes)
 
     def _audio_data_received(self, participant_id: str, audio_data: AudioData, audio_source: str):
         asyncio.run_coroutine_threadsafe(self._buffer_audio(audio_data), self._loop)
 
+    async def _handle_interrupt(self):
+        """Clear the audio buffer, mimicking the avatar stopping mid-speech."""
+        dropped = len(self._buffer)
+        self._buffer.clear()
+        self._last_was_silence = True
+        logger.info(
+            f"Interrupt received — dropped {dropped}B ({dropped / (TRUE_SAMPLE_RATE * 2):.3f}s) from buffer"
+        )
+
+    #
+    # Daily (EventHandler)
+    #
+
+    def on_app_message(self, message, sender):
+        if not isinstance(message, dict):
+            return
+        if message.get("event_type") == "conversation.interrupt":
+            asyncio.run_coroutine_threadsafe(self._handle_interrupt(), self._loop)
+
     async def _audio_task_handler(self):
         """Drain the buffer at TRUE_SAMPLE_RATE speed (real-time playback)."""
         chunk_frames = int(TRUE_SAMPLE_RATE * 20 / 1000)  # 20 ms chunks
@@ -202,10 +233,6 @@ class DailyProxyApp(EventHandler):
                     )
                     last_log_time = now
 
-    #
-    # Daily (EventHandler)
-    #
-
     def on_participant_joined(self, participant):
         participant_name = participant["info"]["userName"]
         logger.info(f"Participant {participant_name} joined")