Files
pipecat/scripts/evals/run-release-evals.py
2026-05-21 11:45:17 -04:00

304 lines
13 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#
# Copyright (c) 2024–2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import asyncio
import sys
from datetime import UTC, datetime, timezone
from pathlib import Path
from dotenv import load_dotenv
from eval import EvalConfig, EvalRunner
from loguru import logger
from PIL import Image
from utils import check_env_variables
# Load variables from a .env file before anything below runs. Per the
# python-dotenv documentation, override=True lets values from the .env file
# replace variables that are already set in the process environment.
load_dotenv(override=True)
# Absolute directory that contains this script.
SCRIPT_DIR = Path(__file__).resolve().parent
# Static fixtures used by the eval configs below (e.g. cat.jpg).
ASSETS_DIR = SCRIPT_DIR / "assets"
# The "examples" directory two levels above this script's directory; it is
# handed to EvalRunner as `examples_dir` in main().
FOUNDATIONAL_DIR = SCRIPT_DIR.parent.parent / "examples"
# Reusable evaluation scenarios. Each EvalConfig pairs a `prompt` with an
# `eval` string describing the outcome that counts as a pass.
# NOTE(review): field semantics are defined in the local `eval` module;
# presumably "the user" in the `eval` text is the bot under test as seen by
# the evaluator -- confirm there.

EVAL_SIMPLE_MATH = EvalConfig(
    prompt="A simple math addition.",
    eval="The user answers the math addition correctly.",
)

EVAL_WEATHER = EvalConfig(
    prompt="What's the weather in San Francisco? Temperature should be in Fahrenheit.",
    eval="The user talks about the weather in San Francisco, including the degrees.",
)

EVAL_WEATHER_AND_RESTAURANT = EvalConfig(
    prompt=(
        "What's the weather in San Francisco, and what's a good restaurant there? "
        "Temperature should be in Fahrenheit."
    ),
    eval=(
        "The user talks about the weather in San Francisco, including the degrees, "
        "and provides a restaurant recommendation."
    ),
)

# NOTE: the expected date is formatted once, when this module is imported,
# not when the eval actually runs.
EVAL_ONLINE_SEARCH = EvalConfig(
    prompt="What's the current date in UTC?",
    eval=f"Current date in UTC is {datetime.now(UTC).strftime('%A, %B %d, %Y')}.",
)

EVAL_SWITCH_LANGUAGE = EvalConfig(
    prompt="Say something in Spanish.",
    eval="The user talks in Spanish.",
)

# The prompt here is a (text, image) tuple. Image.open() is called at import
# time, so assets/cat.jpg has to be present for this module to load.
EVAL_VISION_CAMERA = EvalConfig(
    prompt=(
        "Briefly describe what you see.",
        Image.open(ASSETS_DIR / "cat.jpg"),
    ),
    eval="The user provides a cat description.",
)
def EVAL_VISION_IMAGE(*, eval_speaks_first: bool = False):
    """Build a new EvalConfig for the "describe this image" scenario.

    A new EvalConfig is constructed on every call. The same question text is
    used both as the prompt and as the ``question`` entry of
    ``runner_args_body``, alongside the path to the cat.jpg asset.

    Args:
        eval_speaks_first: Forwarded unchanged to ``EvalConfig``.

    Returns:
        The newly constructed ``EvalConfig``.
    """
    question = "Briefly describe this image."
    image_request = {
        "image_path": ASSETS_DIR / "cat.jpg",
        "question": question,
    }
    return EvalConfig(
        prompt=question,
        eval="The user provides a cat description.",
        eval_speaks_first=eval_speaks_first,
        runner_args_body=image_request,
    )
# Scenario-specific configs. Several of them set eval_speaks_first=True.
# NOTE(review): presumably that flag makes the evaluator open the
# conversation instead of the bot -- confirm in the `eval` module.

EVAL_VOICEMAIL = EvalConfig(
    prompt="Please leave a message.",
    eval="The user provides a reasonable voicemail message.",
    eval_speaks_first=True,
)

EVAL_CONVERSATION = EvalConfig(
    prompt="Hello, this is Mark.",
    eval="The user provides any reasonable conversational response to the greeting.",
    eval_speaks_first=True,
)

EVAL_FLIGHT_STATUS = EvalConfig(
    prompt="Check the status of flight AA100.",
    eval=(
        "The user says something about the status of flight AA100, "
        "such as whether it's on time or delayed."
    ),
)

EVAL_ORDER = EvalConfig(
    prompt="I'd like to order a chocolate iced doughnut and a regular brewed coffee.",
    eval="The user acknowledges the order of a chocolate iced doughnut and regular brewed coffee.",
    eval_speaks_first=True,
)

EVAL_COMPLETE_TURN = EvalConfig(
    prompt="I would go to Japan because I love the culture and want to try authentic ramen.",
    eval=(
        "The user provides a relevant response about Japan or travel, "
        "showing the conversation continues normally."
    ),
)
# Voice pipeline examples. Every entry is an (example script, EvalConfig)
# pair; the script path is presumably resolved against FOUNDATIONAL_DIR by
# EvalRunner. Entries run in the order listed.
TESTS_VOICE = [
    ("voice/voice-cartesia.py", EVAL_SIMPLE_MATH),
    ("voice/voice-cartesia-http.py", EVAL_SIMPLE_MATH),
    ("voice/voice-speechmatics.py", EVAL_SIMPLE_MATH),
    ("voice/voice-speechmatics-vad.py", EVAL_SIMPLE_MATH),
    ("voice/voice-langchain.py", EVAL_SIMPLE_MATH),
    ("voice/voice-deepgram.py", EVAL_SIMPLE_MATH),
    ("voice/voice-deepgram-flux.py", EVAL_SIMPLE_MATH),
    ("voice/voice-deepgram-http.py", EVAL_SIMPLE_MATH),
    ("voice/voice-elevenlabs.py", EVAL_SIMPLE_MATH),
    ("voice/voice-elevenlabs-http.py", EVAL_SIMPLE_MATH),
    ("voice/voice-xai.py", EVAL_SIMPLE_MATH),
    ("voice/voice-xai-http.py", EVAL_SIMPLE_MATH),
    ("voice/voice-azure.py", EVAL_SIMPLE_MATH),
    ("voice/voice-azure-http.py", EVAL_SIMPLE_MATH),
    ("voice/voice-openai.py", EVAL_SIMPLE_MATH),
    ("voice/voice-openai-http.py", EVAL_SIMPLE_MATH),
    ("voice/voice-gladia.py", EVAL_SIMPLE_MATH),
    ("voice/voice-gladia-vad.py", EVAL_SIMPLE_MATH),
    ("voice/voice-lmnt.py", EVAL_SIMPLE_MATH),
    ("voice/voice-groq.py", EVAL_SIMPLE_MATH),
    ("voice/voice-aws.py", EVAL_SIMPLE_MATH),
    ("voice/voice-aws-strands.py", EVAL_WEATHER),
    ("voice/voice-google-gemini-tts.py", EVAL_SIMPLE_MATH),
    ("voice/voice-google.py", EVAL_SIMPLE_MATH),
    ("voice/voice-google-http.py", EVAL_SIMPLE_MATH),
    ("voice/voice-assemblyai.py", EVAL_SIMPLE_MATH),
    ("voice/voice-krisp-viva.py", EVAL_SIMPLE_MATH),
    ("voice/voice-rime.py", EVAL_SIMPLE_MATH),
    ("voice/voice-rime-http.py", EVAL_SIMPLE_MATH),
    ("voice/voice-nvidia.py", EVAL_SIMPLE_MATH),
    ("voice/voice-google-audio-in.py", EVAL_SIMPLE_MATH),
    ("voice/voice-fish.py", EVAL_SIMPLE_MATH),
    ("voice/voice-neuphonic.py", EVAL_SIMPLE_MATH),
    ("voice/voice-neuphonic-http.py", EVAL_SIMPLE_MATH),
    ("voice/voice-fal.py", EVAL_SIMPLE_MATH),
    ("voice/voice-minimax.py", EVAL_SIMPLE_MATH),
    ("voice/voice-sarvam.py", EVAL_SIMPLE_MATH),
    ("voice/voice-sarvam-http.py", EVAL_SIMPLE_MATH),
    ("voice/voice-soniox.py", EVAL_SIMPLE_MATH),
    ("voice/voice-inworld.py", EVAL_SIMPLE_MATH),
    ("voice/voice-inworld-http.py", EVAL_SIMPLE_MATH),
    ("voice/voice-asyncai.py", EVAL_SIMPLE_MATH),
    ("voice/voice-asyncai-http.py", EVAL_SIMPLE_MATH),
    ("voice/voice-aicoustics.py", EVAL_SIMPLE_MATH),
    ("voice/voice-hume.py", EVAL_SIMPLE_MATH),
    ("voice/voice-gradium.py", EVAL_SIMPLE_MATH),
    ("voice/voice-camb.py", EVAL_SIMPLE_MATH),
    ("voice/voice-piper.py", EVAL_SIMPLE_MATH),
    ("voice/voice-kokoro.py", EVAL_SIMPLE_MATH),
    ("voice/voice-resemble.py", EVAL_SIMPLE_MATH),
    ("voice/voice-smallest.py", EVAL_SIMPLE_MATH),
    ("voice/voice-mistral.py", EVAL_SIMPLE_MATH),
    ("voice/voice-openai-responses.py", EVAL_SIMPLE_MATH),
    ("voice/voice-openai-responses-http.py", EVAL_SIMPLE_MATH),
    # Disabled: needs a local XTTS docker instance running.
    # ("voice/voice-xtts.py", EVAL_SIMPLE_MATH),
]
# Vision examples. Each entry calls EVAL_VISION_IMAGE() so it gets its own
# EvalConfig; only the moondream example keeps the default
# eval_speaks_first=False.
TESTS_VISION = [
    ("vision/vision-openai.py", EVAL_VISION_IMAGE(eval_speaks_first=True)),
    ("vision/vision-openai-responses.py", EVAL_VISION_IMAGE(eval_speaks_first=True)),
    ("vision/vision-openai-responses-http.py", EVAL_VISION_IMAGE(eval_speaks_first=True)),
    ("vision/vision-anthropic.py", EVAL_VISION_IMAGE(eval_speaks_first=True)),
    ("vision/vision-aws.py", EVAL_VISION_IMAGE(eval_speaks_first=True)),
    ("vision/vision-gemini-flash.py", EVAL_VISION_IMAGE(eval_speaks_first=True)),
    ("vision/vision-moondream.py", EVAL_VISION_IMAGE()),
]
# Function-calling examples, as (example script, EvalConfig) pairs.
#
# For a few major services we also test parallel function calling by running
# the same script a second time with EVAL_WEATHER_AND_RESTAURANT. We don't do
# this for every service: it's expensive, and most of them rely on the same
# OpenAI-compatible implementation.
TESTS_FUNCTION_CALLING = [
    ("getting-started/07-function-calling.py", EVAL_WEATHER),
    ("getting-started/07-function-calling.py", EVAL_WEATHER_AND_RESTAURANT),
    ("function-calling/function-calling-openai-responses.py", EVAL_WEATHER),
    ("function-calling/function-calling-openai-responses.py", EVAL_WEATHER_AND_RESTAURANT),
    ("function-calling/function-calling-openai-responses-http.py", EVAL_WEATHER),
    ("function-calling/function-calling-openai-responses-http.py", EVAL_WEATHER_AND_RESTAURANT),
    ("function-calling/function-calling-anthropic.py", EVAL_WEATHER),
    ("function-calling/function-calling-anthropic.py", EVAL_WEATHER_AND_RESTAURANT),
    ("function-calling/function-calling-openai.py", EVAL_WEATHER),
    ("function-calling/function-calling-google.py", EVAL_WEATHER),
    ("function-calling/function-calling-google.py", EVAL_WEATHER_AND_RESTAURANT),
    ("function-calling/function-calling-groq.py", EVAL_WEATHER),
    ("function-calling/function-calling-grok.py", EVAL_WEATHER),
    ("function-calling/function-calling-azure.py", EVAL_WEATHER),
    ("function-calling/function-calling-fireworks.py", EVAL_WEATHER),
    ("function-calling/function-calling-nvidia.py", EVAL_WEATHER),
    ("function-calling/function-calling-cerebras.py", EVAL_WEATHER),
    ("function-calling/function-calling-openrouter.py", EVAL_WEATHER),
    ("function-calling/function-calling-perplexity.py", EVAL_WEATHER),
    ("function-calling/function-calling-google-vertex.py", EVAL_WEATHER),
    ("function-calling/function-calling-qwen.py", EVAL_WEATHER),
    ("function-calling/function-calling-aws.py", EVAL_WEATHER),
    ("function-calling/function-calling-sambanova.py", EVAL_WEATHER),
    ("function-calling/function-calling-aws.py", EVAL_WEATHER_AND_RESTAURANT),
    ("function-calling/function-calling-nebius.py", EVAL_WEATHER),
    ("function-calling/function-calling-mistral.py", EVAL_WEATHER),
    ("function-calling/function-calling-sarvam.py", EVAL_WEATHER),
    ("function-calling/function-calling-novita.py", EVAL_WEATHER),
    ("function-calling/function-calling-deepseek.py", EVAL_WEATHER),
    ("function-calling/function-calling-inception.py", EVAL_WEATHER),
    # Video variants: all share the camera-description config.
    ("function-calling/function-calling-anthropic-video.py", EVAL_VISION_CAMERA),
    ("function-calling/function-calling-aws-video.py", EVAL_VISION_CAMERA),
    ("function-calling/function-calling-google-video.py", EVAL_VISION_CAMERA),
    ("function-calling/function-calling-moondream-video.py", EVAL_VISION_CAMERA),
    ("function-calling/function-calling-openai-video.py", EVAL_VISION_CAMERA),
    ("function-calling/function-calling-openai-responses-video.py", EVAL_VISION_CAMERA),
    ("function-calling/function-calling-openai-responses-video-http.py", EVAL_VISION_CAMERA),
    # Disabled: currently not working.
    # ("function-calling/function-calling-together.py", EVAL_WEATHER),
]
# Feature examples. The voicemail-detection script is listed twice, once
# with the voicemail config and once with the conversation config.
TESTS_FEATURES = [
    ("features/features-switch-languages.py", EVAL_SWITCH_LANGUAGE),
    ("features/features-voicemail-detection.py", EVAL_VOICEMAIL),
    ("features/features-voicemail-detection.py", EVAL_CONVERSATION),
    ("features/features-concurrent-llm-evaluation.py", EVAL_SIMPLE_MATH),
]
# Realtime (speech-to-speech) examples, as (example script, EvalConfig) pairs.
TESTS_REALTIME = [
    ("realtime/realtime-openai.py", EVAL_WEATHER),
    # Disabled: OpenAI Realtime is not released on Azure yet.
    # ("realtime/realtime-azure.py", EVAL_WEATHER),
    ("realtime/realtime-openai-text.py", EVAL_WEATHER),
    ("realtime/realtime-openai-live-video.py", EVAL_VISION_CAMERA),
    ("realtime/realtime-gemini-live.py", EVAL_WEATHER),
    ("realtime/realtime-gemini-live-local-vad.py", EVAL_SIMPLE_MATH),
    ("realtime/realtime-gemini-live-video.py", EVAL_VISION_CAMERA),
    ("realtime/realtime-gemini-live-google-search.py", EVAL_ONLINE_SEARCH),
    ("realtime/realtime-gemini-live-vertex.py", EVAL_WEATHER),
    ("realtime/realtime-aws-nova-sonic.py", EVAL_SIMPLE_MATH),
    ("realtime/realtime-ultravox.py", EVAL_ORDER),
    ("realtime/realtime-grok.py", EVAL_WEATHER),
]
# Video-avatar examples; all of them use the simple math config.
TESTS_VIDEO_AVATAR = [
    ("video-avatar/video-avatar-tavus-video-service.py", EVAL_SIMPLE_MATH),
    ("video-avatar/video-avatar-heygen-video-service.py", EVAL_SIMPLE_MATH),
    ("video-avatar/video-avatar-simli-video-service.py", EVAL_SIMPLE_MATH),
    ("video-avatar/video-avatar-lemonslice-transport.py", EVAL_SIMPLE_MATH),
]
# Turn-management examples (filtering of incomplete turns), with and without
# function calling.
TESTS_TURN_MANAGEMENT = [
    ("turn-management/turn-management-filter-incomplete-turns.py", EVAL_COMPLETE_TURN),
    (
        "turn-management/turn-management-filter-incomplete-turns-function-calling.py",
        EVAL_WEATHER,
    ),
]
# "Thinking" examples: plain math for the base scripts, flight status for
# the function-calling variants.
TESTS_THINKING = [
    ("thinking/thinking-anthropic.py", EVAL_SIMPLE_MATH),
    ("thinking/thinking-google.py", EVAL_SIMPLE_MATH),
    ("thinking/thinking-functions-anthropic.py", EVAL_FLIGHT_STATUS),
    ("thinking/thinking-functions-google.py", EVAL_FLIGHT_STATUS),
]
# The complete release suite: every category list flattened into one list,
# in the order main() will run them.
TESTS = [
    *TESTS_VOICE,
    *TESTS_VISION,
    *TESTS_FUNCTION_CALLING,
    *TESTS_FEATURES,
    *TESTS_REALTIME,
    *TESTS_VIDEO_AVATAR,
    *TESTS_TURN_MANAGEMENT,
    *TESTS_THINKING,
]
async def main(args: argparse.Namespace):
    """Run every entry in TESTS through an EvalRunner and print the results.

    Returns early, without running anything, when ``check_env_variables()``
    reports a falsy result.

    Args:
        args: Parsed command-line options. The attributes read here are
            ``verbose``, ``name``, ``pattern`` and ``audio``.
    """
    if not check_env_variables():
        return

    # Logging setup: drop handler id 0 (loguru's pre-installed default sink
    # per its documentation) and only attach a stderr sink when -v was given.
    logger.remove(0)
    if args.verbose >= 2:
        log_level = "TRACE"
    else:
        log_level = "DEBUG"
    if args.verbose:
        logger.add(sys.stderr, level=log_level)

    # The runner receives log_level even when no stderr sink was attached
    # above (it is "DEBUG" in that case).
    runner = EvalRunner(
        examples_dir=FOUNDATIONAL_DIR,
        name=args.name,
        pattern=args.pattern,
        record_audio=args.audio,
        log_level=log_level,
    )

    # Each TESTS entry is an (example script, EvalConfig) pair; evals are
    # awaited one at a time, in list order.
    for test, eval_config in TESTS:
        await runner.run_eval(test, eval_config)

    runner.print_results()
if __name__ == "__main__":
    # Script entry point: parse the command line, then drive main() on a new
    # asyncio event loop.
    parser = argparse.ArgumentParser(description="Pipecat Eval Runner")
    parser.add_argument(
        "--audio",
        "-a",
        action="store_true",
        help="Record audio for each test",
    )
    parser.add_argument(
        "--name",
        "-n",
        help="Name for the current runner (e.g. 'v.0.0.68')",
    )
    parser.add_argument(
        "--pattern",
        "-p",
        help="Only run tests that match the pattern",
    )
    # -v may be repeated; main() treats two or more as TRACE level.
    parser.add_argument(
        "--verbose",
        "-v",
        action="count",
        default=0,
    )
    args = parser.parse_args()

    asyncio.run(main(args))