Files
pipecat/scripts/evals/run-release-evals.py
2026-01-07 16:58:13 -05:00

296 lines
9.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#
# Copyright (c) 2024-2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import asyncio
import sys
from datetime import datetime, timezone
from pathlib import Path
from dotenv import load_dotenv
from eval import EvalConfig, EvalRunner
from loguru import logger
from PIL import Image
from utils import check_env_variables
# Load settings from a .env file into the process environment before any of the
# definitions below are evaluated. Per the python-dotenv documentation,
# override=True lets values from the file replace variables that are already
# set in the environment.
load_dotenv(override=True)
# Filesystem anchors, all derived from this script's resolved location rather
# than from the current working directory.
SCRIPT_DIR = Path(__file__).resolve().parent
# Fixture files referenced by the eval configs (e.g. cat.jpg).
ASSETS_DIR = Path(SCRIPT_DIR, "assets")
# Two levels above the script directory, then examples/foundational.
FOUNDATIONAL_DIR = Path(SCRIPT_DIR.parent.parent, "examples", "foundational")
# Shared eval scenarios. Each EvalConfig carries a `prompt` and an `eval`
# description; how they are consumed is defined by EvalConfig/EvalRunner in the
# local `eval` module.

# Basic arithmetic exchange.
EVAL_SIMPLE_MATH = EvalConfig(
    prompt="A simple math addition.",
    eval="The user answers the math addition correctly.",
)

# Weather question; the expected answer must mention degrees.
EVAL_WEATHER = EvalConfig(
    prompt="What's the weather in San Francisco? Temperature should be in fahrenheits.",
    eval="The user talks about the weather in San Francisco, including the degrees.",
)

# Current-date question. The expected date is formatted once, when this module
# is imported, from the UTC clock.
EVAL_ONLINE_SEARCH = EvalConfig(
    prompt="What's the current date in UTC?",
    eval=f"Current date in UTC is {datetime.now(timezone.utc):%A, %B %d, %Y}.",
)

# Language-switching request.
EVAL_SWITCH_LANGUAGE = EvalConfig(
    prompt="Say something in Spanish.",
    eval="The user talks in Spanish.",
)

# Camera/vision scenario: the prompt is a (text, image) tuple. Image.open runs
# at import time against assets/cat.jpg.
EVAL_VISION_CAMERA = EvalConfig(
    prompt=("Briefly describe what you see.", Image.open(ASSETS_DIR / "cat.jpg")),
    eval="The user provides a cat description.",
)
def EVAL_VISION_IMAGE(*, eval_speaks_first: bool = False):
    """Build a new EvalConfig for the "describe this image" scenario.

    A fresh config is created on every call, so each caller gets its own
    instance.

    Args:
        eval_speaks_first: Forwarded unchanged to ``EvalConfig``.

    Returns:
        An ``EvalConfig`` whose ``runner_args_body`` holds the path to
        ``assets/cat.jpg`` and the question text.
    """
    question = "Briefly describe this image."
    body = {
        "image_path": ASSETS_DIR / "cat.jpg",
        "question": question,
    }
    return EvalConfig(
        prompt=question,
        eval="The user provides a cat description.",
        eval_speaks_first=eval_speaks_first,
        runner_args_body=body,
    )
# Voicemail scenario (eval_speaks_first=True).
EVAL_VOICEMAIL = EvalConfig(
    prompt="Please leave a message.",
    eval="The user provides a reasonable voicemail message.",
    eval_speaks_first=True,
)

# Plain greeting scenario (eval_speaks_first=True).
EVAL_CONVERSATION = EvalConfig(
    prompt="Hello, this is Mark.",
    eval="The user provides any reasonable conversational response to the greeting.",
    eval_speaks_first=True,
)

# Flight-status lookup; uses the EvalConfig default for eval_speaks_first.
EVAL_FLIGHT_STATUS = EvalConfig(
    prompt="Check the status of flight AA100.",
    eval=(
        "The user says something about the status of flight AA100, "
        "such as whether it's on time or delayed."
    ),
)

# Food order scenario (eval_speaks_first=True).
EVAL_ORDER = EvalConfig(
    prompt="I'd like to order a chocolate iced doughnut and a regular brewed coffee.",
    eval="The user acknowledges the order of a chocolate iced doughnut and regular brewed coffee.",
    eval_speaks_first=True,
)
# 07 series: the "interruptible" examples.
# Each entry is an (example script filename, EvalConfig) pair; main() unpacks
# these pairs and passes them to EvalRunner.run_eval(). Every active entry uses
# EVAL_SIMPLE_MATH except 07m-interruptible-aws-strands.py, which uses
# EVAL_WEATHER.
# NOTE(review): filenames are presumably resolved relative to FOUNDATIONAL_DIR
# (passed to EvalRunner as examples_dir) - confirm in the eval module.
TESTS_07 = [
# 07 series
("07-interruptible.py", EVAL_SIMPLE_MATH),
("07-interruptible-cartesia-http.py", EVAL_SIMPLE_MATH),
("07a-interruptible-speechmatics.py", EVAL_SIMPLE_MATH),
("07a-interruptible-speechmatics-vad.py", EVAL_SIMPLE_MATH),
("07aa-interruptible-soniox.py", EVAL_SIMPLE_MATH),
("07ab-interruptible-inworld.py", EVAL_SIMPLE_MATH),
("07ab-interruptible-inworld-http.py", EVAL_SIMPLE_MATH),
("07ac-interruptible-asyncai.py", EVAL_SIMPLE_MATH),
("07ac-interruptible-asyncai-http.py", EVAL_SIMPLE_MATH),
# Disabled: needs a license key to run.
# ("07ad-interruptible-aicoustics.py", EVAL_SIMPLE_MATH),
("07ae-interruptible-hume.py", EVAL_SIMPLE_MATH),
("07af-interruptible-gradium.py", EVAL_SIMPLE_MATH),
("07b-interruptible-langchain.py", EVAL_SIMPLE_MATH),
("07c-interruptible-deepgram.py", EVAL_SIMPLE_MATH),
("07c-interruptible-deepgram-flux.py", EVAL_SIMPLE_MATH),
("07c-interruptible-deepgram-http.py", EVAL_SIMPLE_MATH),
("07c-interruptible-deepgram-vad.py", EVAL_SIMPLE_MATH),
("07d-interruptible-elevenlabs.py", EVAL_SIMPLE_MATH),
("07d-interruptible-elevenlabs-http.py", EVAL_SIMPLE_MATH),
("07f-interruptible-azure.py", EVAL_SIMPLE_MATH),
("07f-interruptible-azure-http.py", EVAL_SIMPLE_MATH),
("07g-interruptible-openai.py", EVAL_SIMPLE_MATH),
("07h-interruptible-openpipe.py", EVAL_SIMPLE_MATH),
("07j-interruptible-gladia.py", EVAL_SIMPLE_MATH),
("07k-interruptible-lmnt.py", EVAL_SIMPLE_MATH),
("07l-interruptible-groq.py", EVAL_SIMPLE_MATH),
("07m-interruptible-aws.py", EVAL_SIMPLE_MATH),
("07m-interruptible-aws-strands.py", EVAL_WEATHER),
("07n-interruptible-gemini.py", EVAL_SIMPLE_MATH),
("07n-interruptible-google.py", EVAL_SIMPLE_MATH),
("07n-interruptible-google-http.py", EVAL_SIMPLE_MATH),
("07o-interruptible-assemblyai.py", EVAL_SIMPLE_MATH),
("07q-interruptible-rime.py", EVAL_SIMPLE_MATH),
("07q-interruptible-rime-http.py", EVAL_SIMPLE_MATH),
("07r-interruptible-nvidia.py", EVAL_SIMPLE_MATH),
("07s-interruptible-google-audio-in.py", EVAL_SIMPLE_MATH),
("07t-interruptible-fish.py", EVAL_SIMPLE_MATH),
("07v-interruptible-neuphonic.py", EVAL_SIMPLE_MATH),
("07v-interruptible-neuphonic-http.py", EVAL_SIMPLE_MATH),
("07w-interruptible-fal.py", EVAL_SIMPLE_MATH),
("07y-interruptible-minimax.py", EVAL_SIMPLE_MATH),
("07z-interruptible-sarvam.py", EVAL_SIMPLE_MATH),
("07z-interruptible-sarvam-http.py", EVAL_SIMPLE_MATH),
# Disabled: needs a local XTTS docker instance running.
# ("07i-interruptible-xtts.py", EVAL_SIMPLE_MATH),
# Disabled: needs a Krisp license.
# ("07p-interruptible-krisp.py", EVAL_SIMPLE_MATH),
]
# 12 series: image-description examples.
# Every entry calls EVAL_VISION_IMAGE(), so each one gets its own EvalConfig
# instance. The first four pass eval_speaks_first=True; the moondream example
# uses the function's default (False).
TESTS_12 = [
("12-describe-image-openai.py", EVAL_VISION_IMAGE(eval_speaks_first=True)),
("12a-describe-image-anthropic.py", EVAL_VISION_IMAGE(eval_speaks_first=True)),
("12b-describe-image-aws.py", EVAL_VISION_IMAGE(eval_speaks_first=True)),
("12c-describe-image-gemini-flash.py", EVAL_VISION_IMAGE(eval_speaks_first=True)),
("12d-describe-image-moondream.py", EVAL_VISION_IMAGE()),
]
# 14 series: function-calling examples.
# The plain function-calling scripts are paired with EVAL_WEATHER; the "-video"
# variants are paired with EVAL_VISION_CAMERA instead.
TESTS_14 = [
("14-function-calling.py", EVAL_WEATHER),
("14a-function-calling-anthropic.py", EVAL_WEATHER),
("14e-function-calling-google.py", EVAL_WEATHER),
("14f-function-calling-groq.py", EVAL_WEATHER),
("14g-function-calling-grok.py", EVAL_WEATHER),
("14h-function-calling-azure.py", EVAL_WEATHER),
("14i-function-calling-fireworks.py", EVAL_WEATHER),
("14j-function-calling-nvidia.py", EVAL_WEATHER),
("14k-function-calling-cerebras.py", EVAL_WEATHER),
("14m-function-calling-openrouter.py", EVAL_WEATHER),
("14n-function-calling-perplexity.py", EVAL_WEATHER),
("14p-function-calling-gemini-vertex-ai.py", EVAL_WEATHER),
("14q-function-calling-qwen.py", EVAL_WEATHER),
("14r-function-calling-aws.py", EVAL_WEATHER),
("14v-function-calling-openai.py", EVAL_WEATHER),
("14w-function-calling-mistral.py", EVAL_WEATHER),
("14x-function-calling-openpipe.py", EVAL_WEATHER),
# Video variants: judged on the camera image description.
("14d-function-calling-anthropic-video.py", EVAL_VISION_CAMERA),
("14d-function-calling-aws-video.py", EVAL_VISION_CAMERA),
("14d-function-calling-gemini-flash-video.py", EVAL_VISION_CAMERA),
("14d-function-calling-moondream-video.py", EVAL_VISION_CAMERA),
("14d-function-calling-openai-video.py", EVAL_VISION_CAMERA),
# Disabled: currently not working.
# ("14c-function-calling-together.py", EVAL_WEATHER),
# ("14l-function-calling-deepseek.py", EVAL_WEATHER),
# ("14o-function-calling-gemini-openai-format.py", EVAL_WEATHER),
]
# 15 series: language-switching example, paired with EVAL_SWITCH_LANGUAGE.
TESTS_15 = [
("15a-switch-languages.py", EVAL_SWITCH_LANGUAGE),
]
# 19 series: realtime examples (OpenAI and Azure variants), all paired with
# EVAL_WEATHER.
TESTS_19 = [
("19-openai-realtime.py", EVAL_WEATHER),
("19-openai-realtime-beta.py", EVAL_WEATHER),
# Disabled: OpenAI Realtime is not released on Azure yet.
# ("19a-azure-realtime.py", EVAL_WEATHER),
("19a-azure-realtime-beta.py", EVAL_WEATHER),
("19b-openai-realtime-text.py", EVAL_WEATHER),
("19b-openai-realtime-beta-text.py", EVAL_WEATHER),
]
# 21 series: Tavus video service example, paired with EVAL_SIMPLE_MATH.
TESTS_21 = [
("21a-tavus-video-service.py", EVAL_SIMPLE_MATH),
]
# 26 series: Gemini Live examples. The paired config varies per script: simple
# math, weather (function calling), camera description (video) and current
# date (Google search).
TESTS_26 = [
("26-gemini-live.py", EVAL_SIMPLE_MATH),
("26a-gemini-live-transcription.py", EVAL_SIMPLE_MATH),
("26b-gemini-live-function-calling.py", EVAL_WEATHER),
("26c-gemini-live-video.py", EVAL_VISION_CAMERA),
("26e-gemini-live-google-search.py", EVAL_ONLINE_SEARCH),
("26h-gemini-live-vertex-function-calling.py", EVAL_WEATHER),
# Disabled: currently not working.
# ("26d-gemini-live-text.py", EVAL_SIMPLE_MATH),
]
# 27 series: Simli layer example, paired with EVAL_SIMPLE_MATH.
TESTS_27 = [
("27-simli-layer.py", EVAL_SIMPLE_MATH),
]
# 40 series: AWS Nova Sonic example, paired with EVAL_SIMPLE_MATH.
TESTS_40 = [
("40-aws-nova-sonic.py", EVAL_SIMPLE_MATH),
]
# 43 series: HeyGen video service example, paired with EVAL_SIMPLE_MATH.
TESTS_43 = [
("43a-heygen-video-service.py", EVAL_SIMPLE_MATH),
]
# 44 series: voicemail detection. The same script is listed twice on purpose,
# once with the voicemail config and once with the conversation config, so it
# is evaluated under both scenarios.
TESTS_44 = [
("44-voicemail-detection.py", EVAL_VOICEMAIL),
("44-voicemail-detection.py", EVAL_CONVERSATION),
]
# 49 series: "thinking" examples. The plain variants are paired with
# EVAL_SIMPLE_MATH, the "-functions-" variants with EVAL_FLIGHT_STATUS.
TESTS_49 = [
("49a-thinking-anthropic.py", EVAL_SIMPLE_MATH),
("49b-thinking-google.py", EVAL_SIMPLE_MATH),
("49c-thinking-functions-anthropic.py", EVAL_FLIGHT_STATUS),
("49d-thinking-functions-google.py", EVAL_FLIGHT_STATUS),
]
# 50 series: Ultravox realtime example, paired with EVAL_ORDER.
TESTS_50 = [
("50-ultravox-realtime.py", EVAL_ORDER),
]
# 51 series: Grok realtime example, paired with EVAL_WEATHER.
TESTS_51 = [
("51-grok-realtime.py", EVAL_WEATHER),
]
# Master list iterated by main(): every per-series list, concatenated in series
# order into one new list of (example script filename, EvalConfig) pairs.
TESTS = (
    TESTS_07
    + TESTS_12
    + TESTS_14
    + TESTS_15
    + TESTS_19
    + TESTS_21
    + TESTS_26
    + TESTS_27
    + TESTS_40
    + TESTS_43
    + TESTS_44
    + TESTS_49
    + TESTS_50
    + TESTS_51
)
async def main(args: argparse.Namespace):
    """Run every eval listed in ``TESTS`` and print the collected results.

    Args:
        args: Parsed command-line options. This function reads ``verbose``,
            ``name``, ``pattern`` and ``audio``.

    Returns:
        None. Returns early, before any eval runs, when
        ``check_env_variables()`` reports a problem.
    """
    # NOTE(review): returning here lets the script finish normally, so the
    # process exits with status 0 even though nothing ran - confirm that is
    # what CI callers expect.
    if not check_env_variables():
        return

    # Drop handler id 0 (loguru's pre-installed default sink) so that nothing
    # is logged unless -v was given.
    logger.remove(0)

    # -vv (or more) selects TRACE; everything else maps to DEBUG. The level is
    # handed to the runner even when no stderr sink is installed here.
    if args.verbose >= 2:
        log_level = "TRACE"
    else:
        log_level = "DEBUG"
    if args.verbose:
        logger.add(sys.stderr, level=log_level)

    runner = EvalRunner(
        examples_dir=FOUNDATIONAL_DIR,
        name=args.name,
        pattern=args.pattern,
        record_audio=args.audio,
        log_level=log_level,
    )

    # Each TESTS entry is an (example script, EvalConfig) pair. Evals are
    # awaited one after another, not concurrently.
    for test, eval_config in TESTS:
        await runner.run_eval(test, eval_config)

    runner.print_results()
if __name__ == "__main__":
    # Command-line entry point: declare the options, parse argv, then drive the
    # async main() to completion.
    cli = argparse.ArgumentParser(description="Pipecat Eval Runner")
    cli.add_argument("--audio", "-a", action="store_true", help="Record audio for each test")
    cli.add_argument("--name", "-n", help="Name for the current runner (e.g. 'v.0.0.68')")
    cli.add_argument("--pattern", "-p", help="Only run tests that match the pattern")
    # -v may be repeated; the resulting count is read by main() as args.verbose.
    cli.add_argument("--verbose", "-v", action="count", default=0)

    args = cli.parse_args()
    asyncio.run(main(args))