From b30af3e1557deed97131660920142598c528a0ea Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Mon, 18 Aug 2025 14:57:16 -0400 Subject: [PATCH] Tests specify USER_SPEAKS_FIRST or BOT_SPEAKS_FIRST --- scripts/evals/run-release-evals.py | 171 +++++++++++++++-------------- 1 file changed, 89 insertions(+), 82 deletions(-) diff --git a/scripts/evals/run-release-evals.py b/scripts/evals/run-release-evals.py index f30b71193..938200464 100644 --- a/scripts/evals/run-release-evals.py +++ b/scripts/evals/run-release-evals.py @@ -24,8 +24,9 @@ ASSETS_DIR = SCRIPT_DIR / "assets" FOUNDATIONAL_DIR = SCRIPT_DIR.parent.parent / "examples" / "foundational" -# User speaks first +# Speaking order constants USER_SPEAKS_FIRST = True +BOT_SPEAKS_FIRST = False # Math PROMPT_SIMPLE_MATH = "A simple math addition." @@ -56,113 +57,123 @@ EVAL_CONVERSATION = "A start of a conversation, not a voicemail." TESTS_07 = [ # 07 series - ("07-interruptible.py", PROMPT_SIMPLE_MATH, None), - ("07-interruptible-cartesia-http.py", PROMPT_SIMPLE_MATH, None), - ("07a-interruptible-speechmatics.py", PROMPT_SIMPLE_MATH, None), - ("07aa-interruptible-soniox.py", PROMPT_SIMPLE_MATH, None), - ("07ab-interruptible-inworld-http.py", PROMPT_SIMPLE_MATH, None), - ("07ac-interruptible-asyncai.py", PROMPT_SIMPLE_MATH, None), - ("07ac-interruptible-asyncai-http.py", PROMPT_SIMPLE_MATH, None), - ("07b-interruptible-langchain.py", PROMPT_SIMPLE_MATH, None), - ("07c-interruptible-deepgram.py", PROMPT_SIMPLE_MATH, None), - ("07d-interruptible-elevenlabs.py", PROMPT_SIMPLE_MATH, None), - ("07d-interruptible-elevenlabs-http.py", PROMPT_SIMPLE_MATH, None), - ("07e-interruptible-playht.py", PROMPT_SIMPLE_MATH, None), - ("07e-interruptible-playht-http.py", PROMPT_SIMPLE_MATH, None), - ("07f-interruptible-azure.py", PROMPT_SIMPLE_MATH, None), - ("07g-interruptible-openai.py", PROMPT_SIMPLE_MATH, None), - ("07h-interruptible-openpipe.py", PROMPT_SIMPLE_MATH, None), - ("07j-interruptible-gladia.py", PROMPT_SIMPLE_MATH, None), - ("07k-interruptible-lmnt.py", PROMPT_SIMPLE_MATH, None), - ("07l-interruptible-groq.py", PROMPT_SIMPLE_MATH, None), - ("07m-interruptible-aws.py", PROMPT_SIMPLE_MATH, None), - ("07n-interruptible-gemini.py", PROMPT_SIMPLE_MATH, None), - ("07n-interruptible-google.py", PROMPT_SIMPLE_MATH, None), - ("07o-interruptible-assemblyai.py", PROMPT_SIMPLE_MATH, None), - ("07q-interruptible-rime.py", PROMPT_SIMPLE_MATH, None), - ("07q-interruptible-rime-http.py", PROMPT_SIMPLE_MATH, None), - ("07r-interruptible-riva-nim.py", PROMPT_SIMPLE_MATH, None), - ("07s-interruptible-google-audio-in.py", PROMPT_SIMPLE_MATH, None), - ("07t-interruptible-fish.py", PROMPT_SIMPLE_MATH, None), - ("07v-interruptible-neuphonic.py", PROMPT_SIMPLE_MATH, None), - ("07v-interruptible-neuphonic-http.py", PROMPT_SIMPLE_MATH, None), - ("07w-interruptible-fal.py", PROMPT_SIMPLE_MATH, None), - ("07y-interruptible-minimax.py", PROMPT_SIMPLE_MATH, None), - ("07z-interruptible-sarvam.py", PROMPT_SIMPLE_MATH, None), + ("07-interruptible.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07-interruptible-cartesia-http.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07a-interruptible-speechmatics.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07aa-interruptible-soniox.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07ab-interruptible-inworld-http.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07ac-interruptible-asyncai.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07ac-interruptible-asyncai-http.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07b-interruptible-langchain.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07c-interruptible-deepgram.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07d-interruptible-elevenlabs.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07d-interruptible-elevenlabs-http.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07e-interruptible-playht.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07e-interruptible-playht-http.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07f-interruptible-azure.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07g-interruptible-openai.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07h-interruptible-openpipe.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07j-interruptible-gladia.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07k-interruptible-lmnt.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07l-interruptible-groq.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07m-interruptible-aws.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07n-interruptible-gemini.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07n-interruptible-google.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07o-interruptible-assemblyai.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07q-interruptible-rime.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07q-interruptible-rime-http.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07r-interruptible-riva-nim.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07s-interruptible-google-audio-in.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07t-interruptible-fish.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07v-interruptible-neuphonic.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07v-interruptible-neuphonic-http.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07w-interruptible-fal.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07y-interruptible-minimax.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07z-interruptible-sarvam.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), # Needs a local XTTS docker instance running. - # ("07i-interruptible-xtts.py", PROMPT_SIMPLE_MATH, None), + # ("07i-interruptible-xtts.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), # Needs a Krisp license. - # ("07p-interruptible-krisp.py", PROMPT_SIMPLE_MATH, None), + # ("07p-interruptible-krisp.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), # Needs GPU resources. - # ("07u-interruptible-ultravox.py", PROMPT_SIMPLE_MATH, None), + # ("07u-interruptible-ultravox.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), ] TESTS_12 = [ - ("12-describe-video.py", PROMPT_VISION, EVAL_VISION), - ("12a-describe-video-gemini-flash.py", PROMPT_VISION, EVAL_VISION), - ("12b-describe-video-gpt-4o.py", PROMPT_VISION, EVAL_VISION), - ("12c-describe-video-anthropic.py", PROMPT_VISION, EVAL_VISION), + ("12-describe-video.py", PROMPT_VISION, EVAL_VISION, BOT_SPEAKS_FIRST), + ("12a-describe-video-gemini-flash.py", PROMPT_VISION, EVAL_VISION, BOT_SPEAKS_FIRST), + ("12b-describe-video-gpt-4o.py", PROMPT_VISION, EVAL_VISION, BOT_SPEAKS_FIRST), + ("12c-describe-video-anthropic.py", PROMPT_VISION, EVAL_VISION, BOT_SPEAKS_FIRST), ] TESTS_14 = [ - ("14-function-calling.py", PROMPT_WEATHER, EVAL_WEATHER), - ("14a-function-calling-anthropic.py", PROMPT_WEATHER, EVAL_WEATHER), - ("14b-function-calling-anthropic-video.py", PROMPT_WEATHER, EVAL_WEATHER), - ("14d-function-calling-video.py", PROMPT_WEATHER, EVAL_WEATHER), - ("14e-function-calling-google.py", PROMPT_WEATHER, EVAL_WEATHER), - ("14f-function-calling-groq.py", PROMPT_WEATHER, EVAL_WEATHER), - ("14g-function-calling-grok.py", PROMPT_WEATHER, EVAL_WEATHER), - ("14h-function-calling-azure.py", PROMPT_WEATHER, EVAL_WEATHER), - ("14i-function-calling-fireworks.py", PROMPT_WEATHER, EVAL_WEATHER), - ("14j-function-calling-nim.py", PROMPT_WEATHER, EVAL_WEATHER), - ("14m-function-calling-openrouter.py", PROMPT_WEATHER, EVAL_WEATHER), - ("14n-function-calling-perplexity.py", PROMPT_WEATHER, EVAL_WEATHER), - ("14p-function-calling-gemini-vertex-ai.py", PROMPT_WEATHER, EVAL_WEATHER), - ("14q-function-calling-qwen.py", PROMPT_WEATHER, EVAL_WEATHER), - ("14r-function-calling-aws.py", PROMPT_WEATHER, EVAL_WEATHER), - ("14v-function-calling-openai.py", PROMPT_WEATHER, EVAL_WEATHER), - ("14w-function-calling-mistral.py", PROMPT_WEATHER, EVAL_WEATHER), + ("14-function-calling.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), + ("14a-function-calling-anthropic.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), + ("14b-function-calling-anthropic-video.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), + ("14d-function-calling-video.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), + ("14e-function-calling-google.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), + ("14f-function-calling-groq.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), + ("14g-function-calling-grok.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), + ("14h-function-calling-azure.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), + ("14i-function-calling-fireworks.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), + ("14j-function-calling-nim.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), + ("14m-function-calling-openrouter.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), + ("14n-function-calling-perplexity.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), + ("14p-function-calling-gemini-vertex-ai.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), + ("14q-function-calling-qwen.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), + ("14r-function-calling-aws.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), + ("14v-function-calling-openai.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), + ("14w-function-calling-mistral.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), # Currently not working. - # ("14c-function-calling-together.py", PROMPT_WEATHER, EVAL_WEATHER), - # ("14k-function-calling-cerebras.py", PROMPT_WEATHER, EVAL_WEATHER), - # ("14l-function-calling-deepseek.py", PROMPT_WEATHER, EVAL_WEATHER), - # ("14o-function-calling-gemini-openai-format.py", PROMPT_WEATHER, EVAL_WEATHER), + # ("14c-function-calling-together.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), + # ("14k-function-calling-cerebras.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), + # ("14l-function-calling-deepseek.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), + # ("14o-function-calling-gemini-openai-format.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), ] TESTS_15 = [ - ("15a-switch-languages.py", PROMPT_SWITCH_LANGUAGE, EVAL_SWITCH_LANGUAGE), + ("15a-switch-languages.py", PROMPT_SWITCH_LANGUAGE, EVAL_SWITCH_LANGUAGE, BOT_SPEAKS_FIRST), ] TESTS_19 = [ - ("19-openai-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER), - ("19a-azure-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER), - ("19b-openai-realtime-beta-text.py", PROMPT_WEATHER, EVAL_WEATHER), + ("19-openai-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), + ("19a-azure-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), + ("19b-openai-realtime-beta-text.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), ] TESTS_21 = [ - ("21a-tavus-video-service.py", PROMPT_SIMPLE_MATH, None), + ("21a-tavus-video-service.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), ] TESTS_26 = [ - ("26-gemini-multimodal-live.py", PROMPT_SIMPLE_MATH, None), - ("26a-gemini-multimodal-live-transcription.py", PROMPT_SIMPLE_MATH, None), - ("26b-gemini-multimodal-live-function-calling.py", PROMPT_WEATHER, EVAL_WEATHER), - ("26c-gemini-multimodal-live-video.py", PROMPT_SIMPLE_MATH, None), - ("26e-gemini-multimodal-google-search.py", PROMPT_ONLINE_SEARCH, EVAL_ONLINE_SEARCH), + ("26-gemini-multimodal-live.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("26a-gemini-multimodal-live-transcription.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ( + "26b-gemini-multimodal-live-function-calling.py", + PROMPT_WEATHER, + EVAL_WEATHER, + BOT_SPEAKS_FIRST, + ), + ("26c-gemini-multimodal-live-video.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ( + "26e-gemini-multimodal-google-search.py", + PROMPT_ONLINE_SEARCH, + EVAL_ONLINE_SEARCH, + BOT_SPEAKS_FIRST, + ), # Currently not working. - # ("26d-gemini-multimodal-live-text.py", PROMPT_SIMPLE_MATH, None), + # ("26d-gemini-multimodal-live-text.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), ] TESTS_27 = [ - ("27-simli-layer.py", PROMPT_SIMPLE_MATH, None), + ("27-simli-layer.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), ] TESTS_40 = [ - ("40-aws-nova-sonic.py", PROMPT_SIMPLE_MATH, None), + ("40-aws-nova-sonic.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), ] TESTS_43 = [ - ("43a-heygen-video-service.py", PROMPT_SIMPLE_MATH, None), + ("43a-heygen-video-service.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), ] TESTS_44 = [ @@ -203,13 +214,9 @@ async def main(args: argparse.Namespace): log_level=log_level, ) - # Parse test config: (test, prompt, eval) or (test, prompt, eval, user_speaks_first) + # Parse test config: (test, prompt, eval, user_speaks_first) for test_config in TESTS: - if len(test_config) == 3: - test, prompt, eval = test_config - user_speaks_first = False - else: - test, prompt, eval, user_speaks_first = test_config + test, prompt, eval, user_speaks_first = test_config await runner.run_eval(test, prompt, eval, user_speaks_first)