demo: DelayProcessor

2025-09-11 16:05:08 +08:00
1 changed files with 63 additions and 1 deletions
--- a/examples/foundational/07-interruptible.py
+++ b/examples/foundational/07-interruptible.py
@@ -4,17 +4,19 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #

+import asyncio
 import os

 from dotenv import load_dotenv
 from loguru import logger

 from pipecat.audio.vad.silero import SileroVADAnalyzer
-from pipecat.frames.frames import LLMRunFrame
+from pipecat.frames.frames import Frame, LLMFullResponseEndFrame, LLMRunFrame, LLMTextFrame
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
+from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 from pipecat.runner.types import RunnerArguments
 from pipecat.runner.utils import create_transport
 from pipecat.services.cartesia.tts import CartesiaTTSService
@@ -26,6 +28,62 @@ from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams

 load_dotenv(override=True)

+
+class DelayProcessor(FrameProcessor):
+    """Custom processor that queues LLM text frames until response is complete.
+
+    This creates a more natural conversation flow by preventing the agent from
+    responding immediately after the user stops speaking. It queues all LLMTextFrames
+    until it sees an LLMFullResponseEndFrame, then waits for the specified delay
+    before releasing all queued frames at once.
+    """
+
+    def __init__(self, *, delay_seconds: float = 1.0, **kwargs) -> None:
+        """Initialize the DelayProcessor.
+
+        Args:
+            delay_seconds: Number of seconds to delay before releasing queued frames (default: 1.0)
+        """
+        super().__init__(**kwargs)
+        self._delay_seconds = delay_seconds
+        self._queued_frames = []
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
+        """Process frames, queuing LLM text frames until response is complete.
+
+        Args:
+            frame: The frame to process
+            direction: Direction of the frame in the pipeline
+        """
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, LLMTextFrame):
+            # Queue LLM text frames instead of pushing them immediately
+            logger.debug(f"Queuing LLMTextFrame: {frame.text}")
+            self._queued_frames.append((frame, direction))
+        elif isinstance(frame, LLMFullResponseEndFrame):
+            # When we see the end frame, wait for delay then push all queued frames
+            logger.debug(
+                f"LLM response complete, delaying {self._delay_seconds} seconds before releasing {len(self._queued_frames)} queued frames"
+            )
+            await asyncio.sleep(self._delay_seconds)
+
+            # Push all queued LLM text frames
+            for queued_frame, queued_direction in self._queued_frames:
+                logger.debug(f"Releasing queued LLMTextFrame: {queued_frame.text}")
+                await self.push_frame(queued_frame, queued_direction)
+
+            # Clear the queue
+            self._queued_frames.clear()
+
+            # Push the end frame
+            logger.debug("Pushing LLMFullResponseEndFrame")
+            await self.push_frame(frame, direction)
+        else:
+            # Push all other frames immediately
+            await self.push_frame(frame, direction)
+
+
 # We store functions so objects (e.g. SileroVADAnalyzer) don't get
 # instantiated. The function will be called when the desired transport gets
 # selected.
@@ -70,12 +128,16 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    context = OpenAILLMContext(messages)
    context_aggregator = llm.create_context_aggregator(context)

+    # Create delay processor to add 1-second delay before agent responses
+    delay_processor = DelayProcessor(delay_seconds=1.0)
+
    pipeline = Pipeline(
        [
            transport.input(),  # Transport user input
            stt,
            context_aggregator.user(),  # User responses
            llm,  # LLM
+            delay_processor,  # Add delay before TTS
            tts,  # TTS
            transport.output(),  # Transport bot output
            context_aggregator.assistant(),  # Assistant spoken responses