scripts(evals): add vision support

2025-08-11 15:04:46 -07:00
parent fb18ae174e
commit 1cfbfcaf11
5 changed files with 48 additions and 7 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -50,6 +50,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### Other

+- Added vision support to release evals so that the foundational example 12
+  series can be run.
+
 - Added foundational example `15a-switch-languages.py` to release evals. It is
  able to detect if we switched the language properly.

--- a/scripts/evals/assets/cat.jpg
+++ b/scripts/evals/assets/cat.jpg
--- a/scripts/evals/eval.py
+++ b/scripts/evals/eval.py
@@ -4,7 +4,6 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #

-import argparse
 import asyncio
 import io
 import os
@@ -13,11 +12,12 @@ import time
 import wave
 from datetime import datetime
 from pathlib import Path
-from typing import List, Optional
+from typing import List, Optional, Tuple

 import aiofiles
 from deepgram import LiveOptions
 from loguru import logger
+from PIL.ImageFile import ImageFile
 from utils import (
    EvalResult,
    load_module_from_path,
@@ -30,7 +30,7 @@ from pipecat.adapters.schemas.function_schema import FunctionSchema
 from pipecat.adapters.schemas.tools_schema import ToolsSchema
 from pipecat.audio.vad.silero import SileroVADAnalyzer
 from pipecat.audio.vad.vad_analyzer import VADParams
-from pipecat.frames.frames import EndTaskFrame
+from pipecat.frames.frames import EndTaskFrame, OutputImageRawFrame
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -49,6 +49,8 @@ SCRIPT_DIR = Path(__file__).resolve().parent
 PIPELINE_IDLE_TIMEOUT_SECS = 60
 EVAL_TIMEOUT_SECS = 90

+EvalPrompt = str | Tuple[str, ImageFile]
+

 class EvalRunner:
    def __init__(
@@ -87,7 +89,7 @@ class EvalRunner:
    async def assert_eval_false(self):
        await self._queue.put(False)

-    async def run_eval(self, example_file: str, prompt: str, eval: Optional[str] = None):
+    async def run_eval(self, example_file: str, prompt: EvalPrompt, eval: Optional[str] = None):
        if not re.match(self._pattern, example_file):
            return

@@ -178,6 +180,7 @@ async def run_example_pipeline(script_path: Path):
        DailyParams(
            audio_in_enabled=True,
            audio_out_enabled=True,
+            video_in_enabled=True,
            vad_analyzer=SileroVADAnalyzer(),
        ),
    )
@@ -189,7 +192,10 @@ async def run_example_pipeline(script_path: Path):


 async def run_eval_pipeline(
-    eval_runner: EvalRunner, example_file: str, prompt: str, eval: Optional[str]
+    eval_runner: EvalRunner,
+    example_file: str,
+    prompt: EvalPrompt,
+    eval: Optional[str],
 ):
    logger.info(f"Starting eval bot")

@@ -202,6 +208,7 @@ async def run_eval_pipeline(
        DailyParams(
            audio_in_enabled=True,
            audio_out_enabled=True,
+            video_out_enabled=True,
            vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=2.0)),
        ),
    )
@@ -242,6 +249,14 @@ async def run_eval_pipeline(
    )
    tools = ToolsSchema(standard_tools=[eval_function])

+    # A prompt is either plain text or a (text, image) tuple; unpack it.
+    example_prompt = ""
+    example_image: Optional[ImageFile] = None
+    if isinstance(prompt, str):
+        example_prompt = prompt
+    elif isinstance(prompt, tuple):
+        example_prompt, example_image = prompt
+
    # See if we need to include an eval prompt.
    eval_prompt = ""
    if eval:
@@ -250,7 +265,7 @@ async def run_eval_pipeline(
    messages = [
        {
            "role": "system",
-            "content": f"You are an LLM eval, be extremly brief. Your goal is to only ask one question: {prompt}. Call the eval function only if the user answers the question and check if the answer is correct (words as numbers are valid). {eval_prompt}",
+            "content": f"You are an LLM eval, be extremly brief. Your goal is to only ask one question: {example_prompt}. Call the eval function only if the user answers the question and check if the answer is correct (words as numbers are valid). {eval_prompt}",
        },
    ]

@@ -288,6 +303,14 @@ async def run_eval_pipeline(
    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        logger.info(f"Client connected")
+        if example_image:
+            await task.queue_frame(
+                OutputImageRawFrame(
+                    image=example_image.tobytes(),
+                    size=example_image.size,
+                    format="RGB",
+                )
+            )
        await audio_buffer.start_recording()

    @transport.event_handler("on_client_disconnected")
--- a/scripts/evals/run-release-evals.py
+++ b/scripts/evals/run-release-evals.py
@@ -13,12 +13,15 @@ from pathlib import Path
 from dotenv import load_dotenv
 from eval import EvalRunner
 from loguru import logger
+from PIL import Image
 from utils import check_env_variables

 load_dotenv(override=True)

 SCRIPT_DIR = Path(__file__).resolve().parent

+ASSETS_DIR = SCRIPT_DIR / "assets"
+
 FOUNDATIONAL_DIR = SCRIPT_DIR.parent.parent / "examples" / "foundational"


@@ -39,6 +42,10 @@ EVAL_ONLINE_SEARCH = f"Today is {datetime.now(timezone.utc).strftime('%B %d, %Y'
 PROMPT_SWITCH_LANGUAGE = "Say something in Spanish."
 EVAL_SWITCH_LANGUAGE = "Check if the user is now talking in Spanish."

+# Vision
+PROMPT_VISION = ("What do you see?", Image.open(ASSETS_DIR / "cat.jpg"))
+EVAL_VISION = "A cat description."
+
 TESTS_07 = [
    # 07 series
    ("07-interruptible.py", PROMPT_SIMPLE_MATH, None),
@@ -81,6 +88,13 @@ TESTS_07 = [
    # ("07u-interruptible-ultravox.py", PROMPT_SIMPLE_MATH, None),
 ]

+TESTS_12 = [
+    ("12-describe-video.py", PROMPT_VISION, EVAL_VISION),
+    ("12a-describe-video-gemini-flash.py", PROMPT_VISION, EVAL_VISION),
+    ("12b-describe-video-gpt-4o.py", PROMPT_VISION, EVAL_VISION),
+    ("12c-describe-video-anthropic.py", PROMPT_VISION, EVAL_VISION),
+]
+
 TESTS_14 = [
    ("14-function-calling.py", PROMPT_WEATHER, EVAL_WEATHER),
    ("14a-function-calling-anthropic.py", PROMPT_WEATHER, EVAL_WEATHER),
@@ -142,6 +156,7 @@ TESTS_43 = [

 TESTS = [
    *TESTS_07,
+    *TESTS_12,
    *TESTS_14,
    *TESTS_15,
    *TESTS_19,
--- a/src/pipecat/transports/services/daily.py
+++ b/src/pipecat/transports/services/daily.py
@@ -560,7 +560,7 @@ class DailyTransportClient(EventHandler):
        self._out_sample_rate = self._params.audio_out_sample_rate or frame.audio_out_sample_rate

        if self._params.audio_in_enabled:
-            if self._params.audio_in_user_tracks and not self._audio_task:
+            if self._params.audio_in_user_tracks and not self._audio_task and self._task_manager:
                self._audio_queue = WatchdogQueue(self._task_manager)
                self._audio_task = self._task_manager.create_task(
                    self._callback_task_handler(self._audio_queue),