scripts(evals): add vision support

This commit is contained in:
Aleix Conchillo Flaqué
2025-08-11 15:04:46 -07:00
parent fb18ae174e
commit 1cfbfcaf11
5 changed files with 48 additions and 7 deletions

View File

@@ -50,6 +50,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Other
- Added vision support to release evals so we can run the foundational examples 12
series.
- Added foundational example `15a-switch-languages.py` to release evals. It can
detect whether the language was switched properly.

Binary file not shown.

After

Width:  |  Height:  |  Size: 63 KiB

View File

@@ -4,7 +4,6 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import asyncio
import io
import os
@@ -13,11 +12,12 @@ import time
import wave
from datetime import datetime
from pathlib import Path
from typing import List, Optional
from typing import List, Optional, Tuple
import aiofiles
from deepgram import LiveOptions
from loguru import logger
from PIL.ImageFile import ImageFile
from utils import (
EvalResult,
load_module_from_path,
@@ -30,7 +30,7 @@ from pipecat.adapters.schemas.function_schema import FunctionSchema
from pipecat.adapters.schemas.tools_schema import ToolsSchema
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.frames.frames import EndTaskFrame
from pipecat.frames.frames import EndTaskFrame, OutputImageRawFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -49,6 +49,8 @@ SCRIPT_DIR = Path(__file__).resolve().parent
PIPELINE_IDLE_TIMEOUT_SECS = 60
EVAL_TIMEOUT_SECS = 90
# An eval prompt is either the question text alone or a (question text, image)
# pair for evals that also need an image.
EvalPrompt = str | Tuple[str, ImageFile]
class EvalRunner:
def __init__(
@@ -87,7 +89,7 @@ class EvalRunner:
async def assert_eval_false(self):
await self._queue.put(False)
async def run_eval(self, example_file: str, prompt: str, eval: Optional[str] = None):
async def run_eval(self, example_file: str, prompt: EvalPrompt, eval: Optional[str] = None):
if not re.match(self._pattern, example_file):
return
@@ -178,6 +180,7 @@ async def run_example_pipeline(script_path: Path):
DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
video_in_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
)
@@ -189,7 +192,10 @@ async def run_example_pipeline(script_path: Path):
async def run_eval_pipeline(
eval_runner: EvalRunner, example_file: str, prompt: str, eval: Optional[str]
eval_runner: EvalRunner,
example_file: str,
prompt: EvalPrompt,
eval: Optional[str],
):
logger.info(f"Starting eval bot")
@@ -202,6 +208,7 @@ async def run_eval_pipeline(
DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
video_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=2.0)),
),
)
@@ -242,6 +249,14 @@ async def run_eval_pipeline(
)
tools = ToolsSchema(standard_tools=[eval_function])
# Load example prompt depending on image.
example_prompt = ""
example_image: Optional[ImageFile] = None
if isinstance(prompt, str):
example_prompt = prompt
elif isinstance(prompt, tuple):
example_prompt, example_image = prompt
# See if we need to include an eval prompt.
eval_prompt = ""
if eval:
@@ -250,7 +265,7 @@ async def run_eval_pipeline(
messages = [
{
"role": "system",
"content": f"You are an LLM eval, be extremly brief. Your goal is to only ask one question: {prompt}. Call the eval function only if the user answers the question and check if the answer is correct (words as numbers are valid). {eval_prompt}",
"content": f"You are an LLM eval, be extremly brief. Your goal is to only ask one question: {example_prompt}. Call the eval function only if the user answers the question and check if the answer is correct (words as numbers are valid). {eval_prompt}",
},
]
@@ -288,6 +303,14 @@ async def run_eval_pipeline(
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
    """Queue the eval image (if any) and start recording when a client connects.

    Args:
        transport: The transport that emitted the event (not used here).
        client: The client that connected (not used here).
    """
    logger.info(f"Client connected")
    if example_image is not None:
        # The frame is declared as RGB, so the raw bytes must really be RGB.
        # PIL can decode an asset in another mode (e.g. L, P, RGBA, CMYK), in
        # which case `tobytes()` would not match the declared format; convert
        # only when needed to avoid an extra copy.
        rgb_image = (
            example_image if example_image.mode == "RGB" else example_image.convert("RGB")
        )
        await task.queue_frame(
            OutputImageRawFrame(
                image=rgb_image.tobytes(),
                size=rgb_image.size,
                format="RGB",
            )
        )
    await audio_buffer.start_recording()
@transport.event_handler("on_client_disconnected")

View File

@@ -13,12 +13,15 @@ from pathlib import Path
from dotenv import load_dotenv
from eval import EvalRunner
from loguru import logger
from PIL import Image
from utils import check_env_variables
load_dotenv(override=True)
SCRIPT_DIR = Path(__file__).resolve().parent
ASSETS_DIR = SCRIPT_DIR / "assets"
FOUNDATIONAL_DIR = SCRIPT_DIR.parent.parent / "examples" / "foundational"
@@ -39,6 +42,10 @@ EVAL_ONLINE_SEARCH = f"Today is {datetime.now(timezone.utc).strftime('%B %d, %Y'
PROMPT_SWITCH_LANGUAGE = "Say something in Spanish."
EVAL_SWITCH_LANGUAGE = "Check if the user is now talking in Spanish."
# Vision
# The prompt is a (question, image) pair rather than a plain string; the image
# is opened from ASSETS_DIR when this module is imported.
PROMPT_VISION = ("What do you see?", Image.open(ASSETS_DIR / "cat.jpg"))
# Used as the third element of each TESTS_12 entry.
EVAL_VISION = "A cat description."
TESTS_07 = [
# 07 series
("07-interruptible.py", PROMPT_SIMPLE_MATH, None),
@@ -81,6 +88,13 @@ TESTS_07 = [
# ("07u-interruptible-ultravox.py", PROMPT_SIMPLE_MATH, None),
]
# Each entry is (example file, prompt, eval); every 12-series example uses the
# same vision prompt and eval text.
TESTS_12 = [
# 12 series
("12-describe-video.py", PROMPT_VISION, EVAL_VISION),
("12a-describe-video-gemini-flash.py", PROMPT_VISION, EVAL_VISION),
("12b-describe-video-gpt-4o.py", PROMPT_VISION, EVAL_VISION),
("12c-describe-video-anthropic.py", PROMPT_VISION, EVAL_VISION),
]
TESTS_14 = [
("14-function-calling.py", PROMPT_WEATHER, EVAL_WEATHER),
("14a-function-calling-anthropic.py", PROMPT_WEATHER, EVAL_WEATHER),
@@ -142,6 +156,7 @@ TESTS_43 = [
TESTS = [
*TESTS_07,
*TESTS_12,
*TESTS_14,
*TESTS_15,
*TESTS_19,

View File

@@ -560,7 +560,7 @@ class DailyTransportClient(EventHandler):
self._out_sample_rate = self._params.audio_out_sample_rate or frame.audio_out_sample_rate
if self._params.audio_in_enabled:
if self._params.audio_in_user_tracks and not self._audio_task:
if self._params.audio_in_user_tracks and not self._audio_task and self._task_manager:
self._audio_queue = WatchdogQueue(self._task_manager)
self._audio_task = self._task_manager.create_task(
self._callback_task_handler(self._audio_queue),