scripts(evals): add vision support
This commit is contained in:
@@ -50,6 +50,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
### Other
|
||||
|
||||
- Added vision support to release evals so we can run the foundational examples 12
|
||||
series.
|
||||
|
||||
- Added foundational example `15a-switch-languages.py` to release evals. It is
|
||||
able to detect if we switched the language properly.
|
||||
|
||||
|
||||
BIN
scripts/evals/assets/cat.jpg
Normal file
BIN
scripts/evals/assets/cat.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 63 KiB |
@@ -4,7 +4,6 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import io
|
||||
import os
|
||||
@@ -13,11 +12,12 @@ import time
|
||||
import wave
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import aiofiles
|
||||
from deepgram import LiveOptions
|
||||
from loguru import logger
|
||||
from PIL.ImageFile import ImageFile
|
||||
from utils import (
|
||||
EvalResult,
|
||||
load_module_from_path,
|
||||
@@ -30,7 +30,7 @@ from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import EndTaskFrame
|
||||
from pipecat.frames.frames import EndTaskFrame, OutputImageRawFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
@@ -49,6 +49,8 @@ SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
PIPELINE_IDLE_TIMEOUT_SECS = 60
|
||||
EVAL_TIMEOUT_SECS = 90
|
||||
|
||||
EvalPrompt = str | Tuple[str, ImageFile]
|
||||
|
||||
|
||||
class EvalRunner:
|
||||
def __init__(
|
||||
@@ -87,7 +89,7 @@ class EvalRunner:
|
||||
async def assert_eval_false(self):
|
||||
await self._queue.put(False)
|
||||
|
||||
async def run_eval(self, example_file: str, prompt: str, eval: Optional[str] = None):
|
||||
async def run_eval(self, example_file: str, prompt: EvalPrompt, eval: Optional[str] = None):
|
||||
if not re.match(self._pattern, example_file):
|
||||
return
|
||||
|
||||
@@ -178,6 +180,7 @@ async def run_example_pipeline(script_path: Path):
|
||||
DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_in_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
)
|
||||
@@ -189,7 +192,10 @@ async def run_example_pipeline(script_path: Path):
|
||||
|
||||
|
||||
async def run_eval_pipeline(
|
||||
eval_runner: EvalRunner, example_file: str, prompt: str, eval: Optional[str]
|
||||
eval_runner: EvalRunner,
|
||||
example_file: str,
|
||||
prompt: EvalPrompt,
|
||||
eval: Optional[str],
|
||||
):
|
||||
logger.info(f"Starting eval bot")
|
||||
|
||||
@@ -202,6 +208,7 @@ async def run_eval_pipeline(
|
||||
DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=2.0)),
|
||||
),
|
||||
)
|
||||
@@ -242,6 +249,14 @@ async def run_eval_pipeline(
|
||||
)
|
||||
tools = ToolsSchema(standard_tools=[eval_function])
|
||||
|
||||
# Load example prompt depending on image.
|
||||
example_prompt = ""
|
||||
example_image: Optional[ImageFile] = None
|
||||
if isinstance(prompt, str):
|
||||
example_prompt = prompt
|
||||
elif isinstance(prompt, tuple):
|
||||
example_prompt, example_image = prompt
|
||||
|
||||
# See if we need to include an eval prompt.
|
||||
eval_prompt = ""
|
||||
if eval:
|
||||
@@ -250,7 +265,7 @@ async def run_eval_pipeline(
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": f"You are an LLM eval, be extremly brief. Your goal is to only ask one question: {prompt}. Call the eval function only if the user answers the question and check if the answer is correct (words as numbers are valid). {eval_prompt}",
|
||||
"content": f"You are an LLM eval, be extremly brief. Your goal is to only ask one question: {example_prompt}. Call the eval function only if the user answers the question and check if the answer is correct (words as numbers are valid). {eval_prompt}",
|
||||
},
|
||||
]
|
||||
|
||||
@@ -288,6 +303,14 @@ async def run_eval_pipeline(
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
if example_image:
|
||||
await task.queue_frame(
|
||||
OutputImageRawFrame(
|
||||
image=example_image.tobytes(),
|
||||
size=example_image.size,
|
||||
format="RGB",
|
||||
)
|
||||
)
|
||||
await audio_buffer.start_recording()
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -13,12 +13,15 @@ from pathlib import Path
|
||||
from dotenv import load_dotenv
|
||||
from eval import EvalRunner
|
||||
from loguru import logger
|
||||
from PIL import Image
|
||||
from utils import check_env_variables
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
|
||||
ASSETS_DIR = SCRIPT_DIR / "assets"
|
||||
|
||||
FOUNDATIONAL_DIR = SCRIPT_DIR.parent.parent / "examples" / "foundational"
|
||||
|
||||
|
||||
@@ -39,6 +42,10 @@ EVAL_ONLINE_SEARCH = f"Today is {datetime.now(timezone.utc).strftime('%B %d, %Y'
|
||||
PROMPT_SWITCH_LANGUAGE = "Say something in Spanish."
|
||||
EVAL_SWITCH_LANGUAGE = "Check if the user is now talking in Spanish."
|
||||
|
||||
# Vision
|
||||
PROMPT_VISION = ("What do you see?", Image.open(ASSETS_DIR / "cat.jpg"))
|
||||
EVAL_VISION = "A cat description."
|
||||
|
||||
TESTS_07 = [
|
||||
# 07 series
|
||||
("07-interruptible.py", PROMPT_SIMPLE_MATH, None),
|
||||
@@ -81,6 +88,13 @@ TESTS_07 = [
|
||||
# ("07u-interruptible-ultravox.py", PROMPT_SIMPLE_MATH, None),
|
||||
]
|
||||
|
||||
TESTS_12 = [
|
||||
("12-describe-video.py", PROMPT_VISION, EVAL_VISION),
|
||||
("12a-describe-video-gemini-flash.py", PROMPT_VISION, EVAL_VISION),
|
||||
("12b-describe-video-gpt-4o.py", PROMPT_VISION, EVAL_VISION),
|
||||
("12c-describe-video-anthropic.py", PROMPT_VISION, EVAL_VISION),
|
||||
]
|
||||
|
||||
TESTS_14 = [
|
||||
("14-function-calling.py", PROMPT_WEATHER, EVAL_WEATHER),
|
||||
("14a-function-calling-anthropic.py", PROMPT_WEATHER, EVAL_WEATHER),
|
||||
@@ -142,6 +156,7 @@ TESTS_43 = [
|
||||
|
||||
TESTS = [
|
||||
*TESTS_07,
|
||||
*TESTS_12,
|
||||
*TESTS_14,
|
||||
*TESTS_15,
|
||||
*TESTS_19,
|
||||
|
||||
@@ -560,7 +560,7 @@ class DailyTransportClient(EventHandler):
|
||||
self._out_sample_rate = self._params.audio_out_sample_rate or frame.audio_out_sample_rate
|
||||
|
||||
if self._params.audio_in_enabled:
|
||||
if self._params.audio_in_user_tracks and not self._audio_task:
|
||||
if self._params.audio_in_user_tracks and not self._audio_task and self._task_manager:
|
||||
self._audio_queue = WatchdogQueue(self._task_manager)
|
||||
self._audio_task = self._task_manager.create_task(
|
||||
self._callback_task_handler(self._audio_queue),
|
||||
|
||||
Reference in New Issue
Block a user