Merge pull request #3074 from pipecat-ai/pk/tweak-moondream-example

Update Moondream example so that Moondream service output makes it in…
2025-11-17 16:52:18 -05:00
parent 47f78df497 5095fc6a64
commit 515eaeeb1a
1 changed files with 37 additions and 3 deletions
--- a/examples/foundational/14d-function-calling-moondream-video.py
+++ b/examples/foundational/14d-function-calling-moondream-video.py
@@ -15,14 +15,21 @@ from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
 from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
 from pipecat.audio.vad.silero import SileroVADAnalyzer
 from pipecat.audio.vad.vad_analyzer import VADParams
-from pipecat.frames.frames import LLMRunFrame, UserImageRequestFrame
+from pipecat.frames.frames import (
+    Frame,
+    LLMFullResponseEndFrame,
+    LLMFullResponseStartFrame,
+    LLMRunFrame,
+    TextFrame,
+    UserImageRequestFrame,
+)
 from pipecat.pipeline.parallel_pipeline import ParallelPipeline
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineTask
 from pipecat.processors.aggregators.llm_context import LLMContext
 from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
-from pipecat.processors.frame_processor import FrameDirection
+from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 from pipecat.runner.types import RunnerArguments
 from pipecat.runner.utils import (
    create_transport,
@@ -66,6 +73,27 @@ async def fetch_user_image(params: FunctionCallParams):
    # await params.result_callback({"result": "Image is being captured."})


+class MoondreamTextFrameWrapper(FrameProcessor):
+    """Wraps Moondream-provided TextFrames with LLM response start/end frames.
+
+    This processor detects TextFrames and automatically wraps them with
+    LLMFullResponseStartFrame and LLMFullResponseEndFrame to provide proper
+    response boundaries for downstream processors.
+    """
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        await super().process_frame(frame, direction)
+
+        # If we receive a TextFrame, wrap it with response start/end frames
+        if isinstance(frame, TextFrame):
+            await self.push_frame(LLMFullResponseStartFrame(), direction)
+            await self.push_frame(frame, direction)
+            await self.push_frame(LLMFullResponseEndFrame(), direction)
+        else:
+            # For all other frames, just pass them through
+            await self.push_frame(frame, direction)
+
+
 # We store functions so objects (e.g. SileroVADAnalyzer) don't get
 # instantiated. The function will be called when the desired transport gets
 # selected.
@@ -130,6 +158,12 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    # If you run into weird description, try with use_cpu=True
    moondream = MoondreamService()

+    # Wrap TextFrames with LLM response start/end frames, which makes Moondream
+    # output be treated like LLM responses for the purpose of context
+    # aggregation. Without this, the assistant context aggregator would ignore
+    # Moondream output (if the TTS service is disabled).
+    moondream_text_wrapper = MoondreamTextFrameWrapper()
+
    pipeline = Pipeline(
        [
            transport.input(),  # Transport user input
@@ -137,7 +171,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
            context_aggregator.user(),  # User responses
            ParallelPipeline(
                [llm],  # LLM
-                [moondream],
+                [moondream, moondream_text_wrapper],
            ),
            tts,  # TTS
            transport.output(),  # Transport bot output