Files
pipecat/scripts/evals/run-release-evals.py
Paul Kompfner 1a4a6f4edf refactor(gemini-live): bring tool-result handling in line with the canonical realtime pattern
Lays groundwork for cancel_on_interruption=False support on Gemini Live by
restructuring _process_completed_function_calls to match the shape used by
AWSNovaSonicLLMService and OpenAIRealtimeLLMService in #4441: a single-pass
forward iteration over raw context messages that detects async-tool
messages via async_tool_messages.parse_message and routes them — started
skipped silently, intermediate logged-as-error and surfaced via push_error,
final delivered via the formal FunctionResponse channel.

Replaces the prior two-pass structure that went through the adapter for
sync results — the service now uses a lightweight self._tool_call_id_to_name
map (populated when the model issues tool calls) for the name lookup the
adapter used to provide. Extracts a new GeminiLLMAdapter.to_function_response_dict
static method for the dict-coercion logic that wraps non-dict tool returns
as {value: <result>} for Gemini's FunctionResponse.response field; the
adapter's existing inline copy in _from_standard_message uses it too.

Example consolidation:

- Folds realtime-gemini-live-function-calling.py into the base
  realtime-gemini-live.py example so the base exercises function calling
  out of the box (matching realtime-openai.py and realtime-aws-nova-sonic.py).
- Renames realtime-gemini-live-vertex-function-calling.py to
  realtime-gemini-live-vertex.py, mirroring the consolidation.
- Adds realtime-gemini-live-async-tool.py.
- Updates scripts/evals/run-release-evals.py for the renames.

This commit alone doesn't make cancel_on_interruption=False fully work on
Gemini Live — additional investigation is pending. This is foundational
work to be built on.
2026-05-08 16:42:54 -04:00

302 lines
12 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#
# Copyright (c) 2024–2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import asyncio
import sys
from datetime import UTC, datetime, timezone
from pathlib import Path
from dotenv import load_dotenv
from eval import EvalConfig, EvalRunner
from loguru import logger
from PIL import Image
from utils import check_env_variables
# Load variables from a .env file before anything reads the environment.
# override=True: values from .env take precedence over variables that are
# already set in the process environment.
load_dotenv(override=True)

# Absolute directory that contains this script.
SCRIPT_DIR = Path(__file__).resolve().parent
# Static assets used by the evals (e.g. cat.jpg for the vision prompts).
ASSETS_DIR = SCRIPT_DIR / "assets"
# The "examples" directory two levels above this script; handed to EvalRunner
# as `examples_dir` in main().
FOUNDATIONAL_DIR = SCRIPT_DIR.parent.parent / "examples"
# Shared eval configurations. Each EvalConfig pairs a `prompt` with an `eval`
# criterion string.
# NOTE(review): presumably "the user" in the criteria refers to the example bot
# under test, as seen from the evaluator's side — confirm against eval.py.

# Basic arithmetic exchange; used by most voice examples.
EVAL_SIMPLE_MATH = EvalConfig(
    prompt="A simple math addition.",
    eval="The user answers the math addition correctly.",
)

# Single weather question (the function-calling examples use this).
EVAL_WEATHER = EvalConfig(
    prompt="What's the weather in San Francisco? Temperature should be in Fahrenheit.",
    eval="The user talks about the weather in San Francisco, including the degrees.",
)

# Two requests in one prompt; used for the parallel function calling tests.
EVAL_WEATHER_AND_RESTAURANT = EvalConfig(
    prompt="What's the weather in San Francisco, and what's a good restaurant there? Temperature should be in Fahrenheit.",
    eval="The user talks about the weather in San Francisco, including the degrees, and provides a restaurant recommendation.",
)

# The expected date is formatted once, when this module is imported.
# NOTE(review): a run that crosses midnight UTC would compare against the
# previous day's date.
EVAL_ONLINE_SEARCH = EvalConfig(
    prompt="What's the current date in UTC?",
    eval=f"Current date in UTC is {datetime.now(UTC).strftime('%A, %B %d, %Y')}.",
)

EVAL_SWITCH_LANGUAGE = EvalConfig(
    prompt="Say something in Spanish.",
    eval="The user talks in Spanish.",
)

# The prompt is a (text, image) tuple; the image is opened when this module is
# imported.
EVAL_VISION_CAMERA = EvalConfig(
    prompt=("Briefly describe what you see.", Image.open(ASSETS_DIR / "cat.jpg")),
    eval="The user provides a cat description.",
)
def EVAL_VISION_IMAGE(*, eval_speaks_first: bool = False):
    """Build a fresh EvalConfig for the still-image description eval.

    The same question text is used both as the prompt and as the "question"
    entry of ``runner_args_body``, which also carries the path to the cat
    image in the assets directory.

    Args:
        eval_speaks_first: Forwarded unchanged to ``EvalConfig``.

    Returns:
        A new ``EvalConfig`` instance on every call.
    """
    question = "Briefly describe this image."
    runner_body = {
        "image_path": ASSETS_DIR / "cat.jpg",
        "question": question,
    }
    return EvalConfig(
        prompt=question,
        eval="The user provides a cat description.",
        eval_speaks_first=eval_speaks_first,
        runner_args_body=runner_body,
    )
# Configs with eval_speaks_first=True.
# NOTE(review): presumably this makes the evaluator deliver the prompt before
# the bot speaks — confirm against EvalConfig.

# Voicemail-style greeting; paired with EVAL_CONVERSATION on the voicemail
# detection example.
EVAL_VOICEMAIL = EvalConfig(
    prompt="Please leave a message.",
    eval="The user provides a reasonable voicemail message.",
    eval_speaks_first=True,
)

# Human-style greeting; the counterpart to EVAL_VOICEMAIL.
EVAL_CONVERSATION = EvalConfig(
    prompt="Hello, this is Mark.",
    eval="The user provides any reasonable conversational response to the greeting.",
    eval_speaks_first=True,
)

# Used by the thinking + function-calling examples.
EVAL_FLIGHT_STATUS = EvalConfig(
    prompt="Check the status of flight AA100.",
    eval="The user says something about the status of flight AA100, such as whether it's on time or delayed.",
)

EVAL_ORDER = EvalConfig(
    prompt="I'd like to order a chocolate iced doughnut and a regular brewed coffee.",
    eval="The user acknowledges the order of a chocolate iced doughnut and regular brewed coffee.",
    eval_speaks_first=True,
)

# A complete sentence, used by the filter-incomplete-turns example.
EVAL_COMPLETE_TURN = EvalConfig(
    prompt="I would go to Japan because I love the culture and want to try authentic ramen.",
    eval="The user provides a relevant response about Japan or travel, showing the conversation continues normally.",
)
# Voice examples as (example script path, EvalConfig) pairs.
# NOTE(review): paths are presumably resolved relative to FOUNDATIONAL_DIR,
# which main() passes to EvalRunner as `examples_dir` — confirm in eval.py.
TESTS_VOICE = [
    ("voice/voice-cartesia.py", EVAL_SIMPLE_MATH),
    ("voice/voice-cartesia-http.py", EVAL_SIMPLE_MATH),
    ("voice/voice-speechmatics.py", EVAL_SIMPLE_MATH),
    ("voice/voice-speechmatics-vad.py", EVAL_SIMPLE_MATH),
    ("voice/voice-langchain.py", EVAL_SIMPLE_MATH),
    ("voice/voice-deepgram.py", EVAL_SIMPLE_MATH),
    ("voice/voice-deepgram-flux.py", EVAL_SIMPLE_MATH),
    ("voice/voice-deepgram-http.py", EVAL_SIMPLE_MATH),
    ("voice/voice-elevenlabs.py", EVAL_SIMPLE_MATH),
    ("voice/voice-elevenlabs-http.py", EVAL_SIMPLE_MATH),
    ("voice/voice-xai.py", EVAL_SIMPLE_MATH),
    ("voice/voice-xai-http.py", EVAL_SIMPLE_MATH),
    ("voice/voice-azure.py", EVAL_SIMPLE_MATH),
    ("voice/voice-azure-http.py", EVAL_SIMPLE_MATH),
    ("voice/voice-openai.py", EVAL_SIMPLE_MATH),
    ("voice/voice-openai-http.py", EVAL_SIMPLE_MATH),
    ("voice/voice-gladia.py", EVAL_SIMPLE_MATH),
    ("voice/voice-gladia-vad.py", EVAL_SIMPLE_MATH),
    ("voice/voice-lmnt.py", EVAL_SIMPLE_MATH),
    ("voice/voice-groq.py", EVAL_SIMPLE_MATH),
    ("voice/voice-aws.py", EVAL_SIMPLE_MATH),
    ("voice/voice-aws-strands.py", EVAL_WEATHER),
    ("voice/voice-google-gemini-tts.py", EVAL_SIMPLE_MATH),
    ("voice/voice-google.py", EVAL_SIMPLE_MATH),
    ("voice/voice-google-http.py", EVAL_SIMPLE_MATH),
    ("voice/voice-assemblyai.py", EVAL_SIMPLE_MATH),
    ("voice/voice-krisp-viva.py", EVAL_SIMPLE_MATH),
    ("voice/voice-rime.py", EVAL_SIMPLE_MATH),
    ("voice/voice-rime-http.py", EVAL_SIMPLE_MATH),
    ("voice/voice-nvidia.py", EVAL_SIMPLE_MATH),
    ("voice/voice-google-audio-in.py", EVAL_SIMPLE_MATH),
    ("voice/voice-fish.py", EVAL_SIMPLE_MATH),
    ("voice/voice-neuphonic.py", EVAL_SIMPLE_MATH),
    ("voice/voice-neuphonic-http.py", EVAL_SIMPLE_MATH),
    ("voice/voice-fal.py", EVAL_SIMPLE_MATH),
    ("voice/voice-minimax.py", EVAL_SIMPLE_MATH),
    ("voice/voice-sarvam.py", EVAL_SIMPLE_MATH),
    ("voice/voice-sarvam-http.py", EVAL_SIMPLE_MATH),
    ("voice/voice-soniox.py", EVAL_SIMPLE_MATH),
    ("voice/voice-inworld.py", EVAL_SIMPLE_MATH),
    ("voice/voice-inworld-http.py", EVAL_SIMPLE_MATH),
    ("voice/voice-asyncai.py", EVAL_SIMPLE_MATH),
    ("voice/voice-asyncai-http.py", EVAL_SIMPLE_MATH),
    ("voice/voice-aicoustics.py", EVAL_SIMPLE_MATH),
    ("voice/voice-hume.py", EVAL_SIMPLE_MATH),
    ("voice/voice-gradium.py", EVAL_SIMPLE_MATH),
    ("voice/voice-camb.py", EVAL_SIMPLE_MATH),
    ("voice/voice-piper.py", EVAL_SIMPLE_MATH),
    ("voice/voice-kokoro.py", EVAL_SIMPLE_MATH),
    ("voice/voice-resemble.py", EVAL_SIMPLE_MATH),
    ("voice/voice-smallest.py", EVAL_SIMPLE_MATH),
    ("voice/voice-mistral.py", EVAL_SIMPLE_MATH),
    ("voice/voice-openai-responses.py", EVAL_SIMPLE_MATH),
    ("voice/voice-openai-responses-http.py", EVAL_SIMPLE_MATH),
    # Disabled: needs a local XTTS docker instance running.
    # ("voice/voice-xtts.py", EVAL_SIMPLE_MATH),
]
# Still-image vision examples as (example script path, EvalConfig) pairs.
# Each entry gets its own EvalConfig from EVAL_VISION_IMAGE(); only the
# moondream example keeps the default eval_speaks_first=False.
TESTS_VISION = [
    ("vision/vision-openai.py", EVAL_VISION_IMAGE(eval_speaks_first=True)),
    ("vision/vision-openai-responses.py", EVAL_VISION_IMAGE(eval_speaks_first=True)),
    ("vision/vision-openai-responses-http.py", EVAL_VISION_IMAGE(eval_speaks_first=True)),
    ("vision/vision-anthropic.py", EVAL_VISION_IMAGE(eval_speaks_first=True)),
    ("vision/vision-aws.py", EVAL_VISION_IMAGE(eval_speaks_first=True)),
    ("vision/vision-gemini-flash.py", EVAL_VISION_IMAGE(eval_speaks_first=True)),
    ("vision/vision-moondream.py", EVAL_VISION_IMAGE()),
]
# Function-calling examples as (example script path, EvalConfig) pairs.
#
# For a few major services, we also test parallel function calling by listing
# the same example a second time with EVAL_WEATHER_AND_RESTAURANT.
# (We don't bother doing this with every single service, as it's expensive and
# most rely on the same OpenAI-compatible implementation.)
TESTS_FUNCTION_CALLING = [
    ("getting-started/07-function-calling.py", EVAL_WEATHER),
    ("getting-started/07-function-calling.py", EVAL_WEATHER_AND_RESTAURANT),
    ("function-calling/function-calling-openai-responses.py", EVAL_WEATHER),
    ("function-calling/function-calling-openai-responses.py", EVAL_WEATHER_AND_RESTAURANT),
    ("function-calling/function-calling-openai-responses-http.py", EVAL_WEATHER),
    ("function-calling/function-calling-openai-responses-http.py", EVAL_WEATHER_AND_RESTAURANT),
    ("function-calling/function-calling-anthropic.py", EVAL_WEATHER),
    ("function-calling/function-calling-anthropic.py", EVAL_WEATHER_AND_RESTAURANT),
    ("function-calling/function-calling-openai.py", EVAL_WEATHER),
    ("function-calling/function-calling-google.py", EVAL_WEATHER),
    ("function-calling/function-calling-google.py", EVAL_WEATHER_AND_RESTAURANT),
    ("function-calling/function-calling-groq.py", EVAL_WEATHER),
    ("function-calling/function-calling-grok.py", EVAL_WEATHER),
    ("function-calling/function-calling-azure.py", EVAL_WEATHER),
    ("function-calling/function-calling-fireworks.py", EVAL_WEATHER),
    ("function-calling/function-calling-nvidia.py", EVAL_WEATHER),
    ("function-calling/function-calling-cerebras.py", EVAL_WEATHER),
    ("function-calling/function-calling-openrouter.py", EVAL_WEATHER),
    ("function-calling/function-calling-perplexity.py", EVAL_WEATHER),
    ("function-calling/function-calling-google-vertex.py", EVAL_WEATHER),
    ("function-calling/function-calling-qwen.py", EVAL_WEATHER),
    ("function-calling/function-calling-aws.py", EVAL_WEATHER),
    ("function-calling/function-calling-sambanova.py", EVAL_WEATHER),
    ("function-calling/function-calling-aws.py", EVAL_WEATHER_AND_RESTAURANT),
    ("function-calling/function-calling-nebius.py", EVAL_WEATHER),
    ("function-calling/function-calling-mistral.py", EVAL_WEATHER),
    ("function-calling/function-calling-sarvam.py", EVAL_WEATHER),
    ("function-calling/function-calling-novita.py", EVAL_WEATHER),
    ("function-calling/function-calling-deepseek.py", EVAL_WEATHER),
    # Video: these use the camera-image prompt instead of the weather prompt.
    ("function-calling/function-calling-anthropic-video.py", EVAL_VISION_CAMERA),
    ("function-calling/function-calling-aws-video.py", EVAL_VISION_CAMERA),
    ("function-calling/function-calling-google-video.py", EVAL_VISION_CAMERA),
    ("function-calling/function-calling-moondream-video.py", EVAL_VISION_CAMERA),
    ("function-calling/function-calling-openai-video.py", EVAL_VISION_CAMERA),
    ("function-calling/function-calling-openai-responses-video.py", EVAL_VISION_CAMERA),
    ("function-calling/function-calling-openai-responses-video-http.py", EVAL_VISION_CAMERA),
    # Disabled: currently not working.
    # ("function-calling/function-calling-together.py", EVAL_WEATHER),
]
# Feature examples as (example script path, EvalConfig) pairs.
# The voicemail-detection example is listed twice: once with a voicemail-style
# prompt and once with a conversational greeting.
TESTS_FEATURES = [
    ("features/features-switch-languages.py", EVAL_SWITCH_LANGUAGE),
    ("features/features-voicemail-detection.py", EVAL_VOICEMAIL),
    ("features/features-voicemail-detection.py", EVAL_CONVERSATION),
    ("features/features-concurrent-llm-evaluation.py", EVAL_SIMPLE_MATH),
]
# Realtime (speech-to-speech) examples as (example script path, EvalConfig)
# pairs.
TESTS_REALTIME = [
    ("realtime/realtime-openai.py", EVAL_WEATHER),
    # Disabled: OpenAI Realtime not released on Azure yet.
    # ("realtime/realtime-azure.py", EVAL_WEATHER),
    ("realtime/realtime-openai-text.py", EVAL_WEATHER),
    ("realtime/realtime-openai-live-video.py", EVAL_VISION_CAMERA),
    ("realtime/realtime-gemini-live.py", EVAL_WEATHER),
    ("realtime/realtime-gemini-live-local-vad.py", EVAL_SIMPLE_MATH),
    ("realtime/realtime-gemini-live-video.py", EVAL_VISION_CAMERA),
    ("realtime/realtime-gemini-live-google-search.py", EVAL_ONLINE_SEARCH),
    ("realtime/realtime-gemini-live-vertex.py", EVAL_WEATHER),
    ("realtime/realtime-aws-nova-sonic.py", EVAL_SIMPLE_MATH),
    ("realtime/realtime-ultravox.py", EVAL_ORDER),
    ("realtime/realtime-grok.py", EVAL_WEATHER),
]
# Video-avatar examples as (example script path, EvalConfig) pairs.
TESTS_VIDEO_AVATAR = [
    ("video-avatar/video-avatar-tavus-video-service.py", EVAL_SIMPLE_MATH),
    ("video-avatar/video-avatar-heygen-video-service.py", EVAL_SIMPLE_MATH),
    ("video-avatar/video-avatar-simli-video-service.py", EVAL_SIMPLE_MATH),
    ("video-avatar/video-avatar-lemonslice-transport.py", EVAL_SIMPLE_MATH),
]
# Turn-management examples as (example script path, EvalConfig) pairs.
TESTS_TURN_MANAGEMENT = [
    ("turn-management/turn-management-filter-incomplete-turns.py", EVAL_COMPLETE_TURN),
]
# Thinking examples as (example script path, EvalConfig) pairs; the
# "functions" variants use the flight-status prompt.
TESTS_THINKING = [
    ("thinking/thinking-anthropic.py", EVAL_SIMPLE_MATH),
    ("thinking/thinking-google.py", EVAL_SIMPLE_MATH),
    ("thinking/thinking-functions-anthropic.py", EVAL_FLIGHT_STATUS),
    ("thinking/thinking-functions-google.py", EVAL_FLIGHT_STATUS),
]
# The full release suite: every category list flattened into one list, in the
# order the evals are handed to the runner by main().
TESTS = [
    *TESTS_VOICE,
    *TESTS_VISION,
    *TESTS_FUNCTION_CALLING,
    *TESTS_FEATURES,
    *TESTS_REALTIME,
    *TESTS_VIDEO_AVATAR,
    *TESTS_TURN_MANAGEMENT,
    *TESTS_THINKING,
]
async def main(args: argparse.Namespace):
    """Run every eval in ``TESTS`` sequentially and print the results.

    Returns early, without running anything, when ``check_env_variables()``
    reports a falsy result.

    Args:
        args: Parsed command-line options. Reads ``verbose`` (count),
            ``name``, ``pattern`` and ``audio``.
    """
    env_ok = check_env_variables()
    if not env_ok:
        return

    # Drop loguru's handler with id 0, then re-attach stderr only when at
    # least one -v was given. The level is still computed (and passed to the
    # runner) when no handler is attached here.
    logger.remove(0)
    verbosity = args.verbose
    if verbosity >= 2:
        log_level = "TRACE"
    else:
        log_level = "DEBUG"
    if verbosity:
        logger.add(sys.stderr, level=log_level)

    runner_options = {
        "examples_dir": FOUNDATIONAL_DIR,
        "name": args.name,
        "pattern": args.pattern,
        "record_audio": args.audio,
        "log_level": log_level,
    }
    runner = EvalRunner(**runner_options)

    # Each entry is an (example script path, EvalConfig) pair; evals are
    # awaited one at a time, in list order.
    for example_path, eval_config in TESTS:
        await runner.run_eval(example_path, eval_config)

    runner.print_results()
if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them, run main().
    parser = argparse.ArgumentParser(description="Pipecat Eval Runner")

    # (flags, add_argument keyword options), registered in this order.
    cli_options = (
        (("--audio", "-a"), {"action": "store_true", "help": "Record audio for each test"}),
        (("--name", "-n"), {"help": "Name for the current runner (e.g. 'v.0.0.68')"}),
        (("--pattern", "-p"), {"help": "Only run tests that match the pattern"}),
        # Repeatable: -v attaches DEBUG logging to stderr, -vv switches to TRACE.
        (("--verbose", "-v"), {"action": "count", "default": 0}),
    )
    for flags, options in cli_options:
        parser.add_argument(*flags, **options)

    args = parser.parse_args()
    asyncio.run(main(args))