From e3eea0c02fd361a63c1bce61bacdd5f5adee62e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= <aleix@daily.co>
Date: Wed, 16 Oct 2024 16:35:42 -0700
Subject: [PATCH] vad: add support for interruption to SileroVAD processor

---
 CHANGELOG.md                                  |  10 ++
 examples/foundational/07-interruptible-vad.py | 106 ++++++++++++++++++
 src/pipecat/vad/silero.py                     |  28 ++++-
 3 files changed, 142 insertions(+), 2 deletions(-)
 create mode 100644 examples/foundational/07-interruptible-vad.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 18bf85ffe..182677961 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Renamed `OpenAILLMServiceRealtimeBeta` to `OpenAIRealtimeBetaLLMService` to
   match other services.
 
+### Fixed
+
+- Fixed `SileroVAD` processor to support interruptions properly.
+
+### Other
+
+- Added `examples/foundational/07-interruptible-vad.py`. This is the same as
+  `07-interruptible.py` but using the `SileroVAD` processor instead of passing
+  the `VADAnalyzer` in the transport.
+
 ## [0.0.45] - 2024-10-16
 
 ### Changed
diff --git a/examples/foundational/07-interruptible-vad.py b/examples/foundational/07-interruptible-vad.py
new file mode 100644
index 000000000..716e3ee03
--- /dev/null
+++ b/examples/foundational/07-interruptible-vad.py
@@ -0,0 +1,106 @@
+#
+# Copyright (c) 2024, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import asyncio
+import aiohttp
+import os
+import sys
+
+from pipecat.frames.frames import LLMMessagesFrame
+from pipecat.pipeline.pipeline import Pipeline
+from pipecat.pipeline.runner import PipelineRunner
+from pipecat.pipeline.task import PipelineParams, PipelineTask
+from pipecat.processors.aggregators.llm_response import (
+    LLMAssistantResponseAggregator,
+    LLMUserResponseAggregator,
+)
+from pipecat.services.cartesia import CartesiaTTSService
+from pipecat.services.openai import OpenAILLMService
+from pipecat.transports.services.daily import DailyParams, DailyTransport
+from pipecat.vad.silero import SileroVAD
+
+from runner import configure
+
+from loguru import logger
+
+from dotenv import load_dotenv
+
+load_dotenv(override=True)
+
+logger.remove(0)
+logger.add(sys.stderr, level="DEBUG")
+
+
+async def main():
+    async with aiohttp.ClientSession() as session:
+        (room_url, token) = await configure(session)
+
+        transport = DailyTransport(
+            room_url,
+            token,
+            "Respond bot",
+            DailyParams(
+                audio_in_enabled=True,
+                audio_out_enabled=True,
+                transcription_enabled=True,
+            ),
+        )
+
+        vad = SileroVAD()
+
+        tts = CartesiaTTSService(
+            api_key=os.getenv("CARTESIA_API_KEY"),
+            voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22",  # British Lady
+        )
+
+        llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
+
+        messages = [
+            {
+                "role": "system",
+                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
+            },
+        ]
+
+        tma_in = LLMUserResponseAggregator(messages)
+        tma_out = LLMAssistantResponseAggregator(messages)
+
+        pipeline = Pipeline(
+            [
+                transport.input(),  # Transport user input
+                vad,
+                tma_in,  # User responses
+                llm,  # LLM
+                tts,  # TTS
+                transport.output(),  # Transport bot output
+                tma_out,  # Assistant spoken responses
+            ]
+        )
+
+        task = PipelineTask(
+            pipeline,
+            PipelineParams(
+                allow_interruptions=True,
+                enable_metrics=True,
+                enable_usage_metrics=True,
+                report_only_initial_ttfb=True,
+            ),
+        )
+
+        @transport.event_handler("on_first_participant_joined")
+        async def on_first_participant_joined(transport, participant):
+            transport.capture_participant_transcription(participant["id"])
+            # Kick off the conversation.
+            messages.append({"role": "system", "content": "Please introduce yourself to the user."})
+            await task.queue_frames([LLMMessagesFrame(messages)])
+
+        runner = PipelineRunner()
+
+        await runner.run(task)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/src/pipecat/vad/silero.py b/src/pipecat/vad/silero.py
index 3e852b38b..399ef6b48 100644
--- a/src/pipecat/vad/silero.py
+++ b/src/pipecat/vad/silero.py
@@ -11,6 +11,8 @@ import numpy as np
 from pipecat.frames.frames import (
     AudioRawFrame,
     Frame,
+    StartInterruptionFrame,
+    StopInterruptionFrame,
     UserStartedSpeakingFrame,
     UserStoppedSpeakingFrame,
 )
@@ -200,6 +202,27 @@ class SileroVAD(FrameProcessor):
         else:
             await self.push_frame(frame, direction)
 
+    #
+    # Handle interruptions
+    #
+
+    async def _handle_interruptions(self, frame: Frame):
+        if self.interruptions_allowed:
+            # Make sure we notify about interruptions quickly out-of-band.
+            if isinstance(frame, UserStartedSpeakingFrame):
+                logger.debug("User started speaking")
+                await self._start_interruption()
+                # Push an out-of-band frame (i.e. not using the ordered push
+                # frame task) to stop everything, specially at the output
+                # transport.
+                await self.push_frame(StartInterruptionFrame())
+            elif isinstance(frame, UserStoppedSpeakingFrame):
+                logger.debug("User stopped speaking")
+                await self._stop_interruption()
+                await self.push_frame(StopInterruptionFrame())
+
+        await self.push_frame(frame)
+
     async def _analyze_audio(self, frame: AudioRawFrame):
         # Check VAD and push event if necessary. We just care about changes
         # from QUIET to SPEAKING and vice versa.
@@ -217,5 +240,6 @@ class SileroVAD(FrameProcessor):
                 new_frame = UserStoppedSpeakingFrame()
 
             if new_frame:
-                await self.push_frame(new_frame)
-                self._processor_vad_state = new_vad_state
+                await self._handle_interruptions(new_frame)
+
+            self._processor_vad_state = new_vad_state