diff --git a/CHANGELOG.md b/CHANGELOG.md index 55b92591a..339cea16b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -50,6 +50,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Other +- Added vision support to release evals so we can run the foundational examples 12 + series. + - Added foundational example `15a-switch-languages.py` to release evals. It is able to detect if we switched the language properly. diff --git a/scripts/evals/assets/cat.jpg b/scripts/evals/assets/cat.jpg new file mode 100644 index 000000000..700b5fc92 Binary files /dev/null and b/scripts/evals/assets/cat.jpg differ diff --git a/scripts/evals/eval.py b/scripts/evals/eval.py index c12f75fa5..b91f27c6e 100644 --- a/scripts/evals/eval.py +++ b/scripts/evals/eval.py @@ -4,7 +4,6 @@ # SPDX-License-Identifier: BSD 2-Clause License # -import argparse import asyncio import io import os @@ -13,11 +12,12 @@ import time import wave from datetime import datetime from pathlib import Path -from typing import List, Optional +from typing import List, Optional, Tuple import aiofiles from deepgram import LiveOptions from loguru import logger +from PIL.ImageFile import ImageFile from utils import ( EvalResult, load_module_from_path, @@ -30,7 +30,7 @@ from pipecat.adapters.schemas.function_schema import FunctionSchema from pipecat.adapters.schemas.tools_schema import ToolsSchema from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.audio.vad.vad_analyzer import VADParams -from pipecat.frames.frames import EndTaskFrame +from pipecat.frames.frames import EndTaskFrame, OutputImageRawFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask @@ -49,6 +49,8 @@ SCRIPT_DIR = Path(__file__).resolve().parent PIPELINE_IDLE_TIMEOUT_SECS = 60 EVAL_TIMEOUT_SECS = 90 +EvalPrompt = str | Tuple[str, ImageFile] + class EvalRunner: def __init__( @@ -87,7 +89,7 @@ class 
EvalRunner: async def assert_eval_false(self): await self._queue.put(False) - async def run_eval(self, example_file: str, prompt: str, eval: Optional[str] = None): + async def run_eval(self, example_file: str, prompt: EvalPrompt, eval: Optional[str] = None): if not re.match(self._pattern, example_file): return @@ -178,6 +180,7 @@ async def run_example_pipeline(script_path: Path): DailyParams( audio_in_enabled=True, audio_out_enabled=True, + video_in_enabled=True, vad_analyzer=SileroVADAnalyzer(), ), ) @@ -189,7 +192,10 @@ async def run_example_pipeline(script_path: Path): async def run_eval_pipeline( - eval_runner: EvalRunner, example_file: str, prompt: str, eval: Optional[str] + eval_runner: EvalRunner, + example_file: str, + prompt: EvalPrompt, + eval: Optional[str], ): logger.info(f"Starting eval bot") @@ -202,6 +208,7 @@ async def run_eval_pipeline( DailyParams( audio_in_enabled=True, audio_out_enabled=True, + video_out_enabled=True, vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=2.0)), ), ) @@ -242,6 +249,14 @@ async def run_eval_pipeline( ) tools = ToolsSchema(standard_tools=[eval_function]) + # Load example prompt depending on image. + example_prompt = "" + example_image: Optional[ImageFile] = None + if isinstance(prompt, str): + example_prompt = prompt + elif isinstance(prompt, tuple): + example_prompt, example_image = prompt + # See if we need to include an eval prompt. eval_prompt = "" if eval: @@ -250,7 +265,7 @@ async def run_eval_pipeline( messages = [ { "role": "system", - "content": f"You are an LLM eval, be extremly brief. Your goal is to only ask one question: {prompt}. Call the eval function only if the user answers the question and check if the answer is correct (words as numbers are valid). {eval_prompt}", + "content": f"You are an LLM eval, be extremly brief. Your goal is to only ask one question: {example_prompt}. 
Call the eval function only if the user answers the question and check if the answer is correct (words as numbers are valid). {eval_prompt}", }, ] @@ -288,6 +303,14 @@ async def run_eval_pipeline( @transport.event_handler("on_client_connected") async def on_client_connected(transport, client): logger.info(f"Client connected") + if example_image: + await task.queue_frame( + OutputImageRawFrame( + image=example_image.tobytes(), + size=example_image.size, + format="RGB", + ) + ) await audio_buffer.start_recording() @transport.event_handler("on_client_disconnected") diff --git a/scripts/evals/run-release-evals.py b/scripts/evals/run-release-evals.py index 5aac70c6b..6b6ac7eda 100644 --- a/scripts/evals/run-release-evals.py +++ b/scripts/evals/run-release-evals.py @@ -13,12 +13,15 @@ from pathlib import Path from dotenv import load_dotenv from eval import EvalRunner from loguru import logger +from PIL import Image from utils import check_env_variables load_dotenv(override=True) SCRIPT_DIR = Path(__file__).resolve().parent +ASSETS_DIR = SCRIPT_DIR / "assets" + FOUNDATIONAL_DIR = SCRIPT_DIR.parent.parent / "examples" / "foundational" @@ -39,6 +42,10 @@ EVAL_ONLINE_SEARCH = f"Today is {datetime.now(timezone.utc).strftime('%B %d, %Y' PROMPT_SWITCH_LANGUAGE = "Say something in Spanish." EVAL_SWITCH_LANGUAGE = "Check if the user is now talking in Spanish." +# Vision +PROMPT_VISION = ("What do you see?", Image.open(ASSETS_DIR / "cat.jpg")) +EVAL_VISION = "A cat description." 
+ TESTS_07 = [ # 07 series ("07-interruptible.py", PROMPT_SIMPLE_MATH, None), @@ -81,6 +88,13 @@ TESTS_07 = [ # ("07u-interruptible-ultravox.py", PROMPT_SIMPLE_MATH, None), ] +TESTS_12 = [ + ("12-describe-video.py", PROMPT_VISION, EVAL_VISION), + ("12a-describe-video-gemini-flash.py", PROMPT_VISION, EVAL_VISION), + ("12b-describe-video-gpt-4o.py", PROMPT_VISION, EVAL_VISION), + ("12c-describe-video-anthropic.py", PROMPT_VISION, EVAL_VISION), +] + TESTS_14 = [ ("14-function-calling.py", PROMPT_WEATHER, EVAL_WEATHER), ("14a-function-calling-anthropic.py", PROMPT_WEATHER, EVAL_WEATHER), @@ -142,6 +156,7 @@ TESTS_43 = [ TESTS = [ *TESTS_07, + *TESTS_12, *TESTS_14, *TESTS_15, *TESTS_19, diff --git a/src/pipecat/transports/services/daily.py b/src/pipecat/transports/services/daily.py index daf879be7..77f7f5e29 100644 --- a/src/pipecat/transports/services/daily.py +++ b/src/pipecat/transports/services/daily.py @@ -560,7 +560,7 @@ class DailyTransportClient(EventHandler): self._out_sample_rate = self._params.audio_out_sample_rate or frame.audio_out_sample_rate if self._params.audio_in_enabled: - if self._params.audio_in_user_tracks and not self._audio_task: + if self._params.audio_in_user_tracks and not self._audio_task and self._task_manager: self._audio_queue = WatchdogQueue(self._task_manager) self._audio_task = self._task_manager.create_task( self._callback_task_handler(self._audio_queue),