diff --git a/examples/foundational/14d-function-calling-moondream-video.py b/examples/foundational/14d-function-calling-moondream-video.py index 6aeb2b892..9544818b9 100644 --- a/examples/foundational/14d-function-calling-moondream-video.py +++ b/examples/foundational/14d-function-calling-moondream-video.py @@ -15,14 +15,21 @@ from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3 from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.audio.vad.vad_analyzer import VADParams -from pipecat.frames.frames import LLMRunFrame, UserImageRequestFrame +from pipecat.frames.frames import ( + Frame, + LLMFullResponseEndFrame, + LLMFullResponseStartFrame, + LLMRunFrame, + TextFrame, + UserImageRequestFrame, +) from pipecat.pipeline.parallel_pipeline import ParallelPipeline from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineTask from pipecat.processors.aggregators.llm_context import LLMContext from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair -from pipecat.processors.frame_processor import FrameDirection +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.runner.types import RunnerArguments from pipecat.runner.utils import ( create_transport, @@ -66,6 +73,27 @@ async def fetch_user_image(params: FunctionCallParams): # await params.result_callback({"result": "Image is being captured."}) +class MoondreamTextFrameWrapper(FrameProcessor): + """Wraps Moondream-provided TextFrames with LLM response start/end frames. + + This processor detects TextFrames and automatically wraps them with + LLMFullResponseStartFrame and LLMFullResponseEndFrame to provide proper + response boundaries for downstream processors. + """ + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + # If we receive a TextFrame, wrap it with response start/end frames + if isinstance(frame, TextFrame): + await self.push_frame(LLMFullResponseStartFrame(), direction) + await self.push_frame(frame, direction) + await self.push_frame(LLMFullResponseEndFrame(), direction) + else: + # For all other frames, just pass them through + await self.push_frame(frame, direction) + + # We store functions so objects (e.g. SileroVADAnalyzer) don't get # instantiated. The function will be called when the desired transport gets # selected. @@ -130,6 +158,12 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): # If you run into weird description, try with use_cpu=True moondream = MoondreamService() + # Wrap TextFrames with LLM response start/end frames, which makes Moondream + # output be treated like LLM responses for the purpose of context + # aggregation. Without this, the assistant context aggregator would ignore + # Moondream output (if the TTS service is disabled). + moondream_text_wrapper = MoondreamTextFrameWrapper() + pipeline = Pipeline( [ transport.input(), # Transport user input @@ -137,7 +171,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): context_aggregator.user(), # User responses ParallelPipeline( [llm], # LLM - [moondream], + [moondream, moondream_text_wrapper], ), tts, # TTS transport.output(), # Transport bot output