From 12979293ad3fa74e650067bc30d3b2ee2d3a1fea Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Thu, 11 Dec 2025 15:58:48 -0500 Subject: [PATCH] Add thinking examples to eval suite --- examples/foundational/49a-thinking-anthropic.py | 17 ++++++++++++----- examples/foundational/49b-thinking-google.py | 17 ++++++++++++----- .../49c-thinking-functions-anthropic.py | 12 ++++++++++-- .../49d-thinking-functions-google.py | 12 ++++++++++-- scripts/evals/run-release-evals.py | 13 +++++++++++++ 5 files changed, 57 insertions(+), 14 deletions(-) diff --git a/examples/foundational/49a-thinking-anthropic.py b/examples/foundational/49a-thinking-anthropic.py index 6017f335e..a4a315ac5 100644 --- a/examples/foundational/49a-thinking-anthropic.py +++ b/examples/foundational/49a-thinking-anthropic.py @@ -111,16 +111,23 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): @transport.event_handler("on_client_connected") async def on_client_connected(transport, client): logger.info(f"Client connected") - # Kick off the conversation, using a prompt conducive to demonstrating - # thinking (chosen from Google and Anthropic docs). + # Kick off the conversation. messages.append( { "role": "user", - "content": "Analogize photosynthesis and growing up.", - # "content": "Compare and contrast electric cars and hybrid cars." - # "content": "Are there an infinite number of prime numbers such that n mod 4 == 3?" + "content": "Say hello briefly.", } ) + # Here are some example prompts conducive to demonstrating + # thinking (picked from Google and Anthropic docs). + # messages.append( + # { + # "role": "user", + # "content": "Analogize photosynthesis and growing up. Keep your answer concise.", + # # "content": "Compare and contrast electric cars and hybrid cars." + # # "content": "Are there an infinite number of prime numbers such that n mod 4 == 3?" 
+ # } + # ) await task.queue_frames([LLMRunFrame()]) @transport.event_handler("on_client_disconnected") diff --git a/examples/foundational/49b-thinking-google.py b/examples/foundational/49b-thinking-google.py index 85df6da39..40d82dec0 100644 --- a/examples/foundational/49b-thinking-google.py +++ b/examples/foundational/49b-thinking-google.py @@ -116,16 +116,23 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): @transport.event_handler("on_client_connected") async def on_client_connected(transport, client): logger.info(f"Client connected") - # Kick off the conversation, using a prompt conducive to demonstrating - # thinking (chosen from Google and Anthropic docs). + # Kick off the conversation. messages.append( { "role": "user", - "content": "Analogize photosynthesis and growing up.", - # "content": "Compare and contrast electric cars and hybrid cars." - # "content": "Are there an infinite number of prime numbers such that n mod 4 == 3?" + "content": "Say hello briefly.", } ) + # Here are some example prompts conducive to demonstrating + # thinking (picked from Google and Anthropic docs). + # messages.append( + # { + # "role": "user", + # "content": "Analogize photosynthesis and growing up. Keep your answer concise.", + # # "content": "Compare and contrast electric cars and hybrid cars." + # # "content": "Are there an infinite number of prime numbers such that n mod 4 == 3?" 
+ # } + # ) await task.queue_frames([LLMRunFrame()]) @transport.event_handler("on_client_disconnected") diff --git a/examples/foundational/49c-thinking-functions-anthropic.py b/examples/foundational/49c-thinking-functions-anthropic.py index 3d71f2c47..e821b9d09 100644 --- a/examples/foundational/49c-thinking-functions-anthropic.py +++ b/examples/foundational/49c-thinking-functions-anthropic.py @@ -138,13 +138,21 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): async def on_client_connected(transport, client): logger.info(f"Client connected") # Kick off the conversation. - # This example comes from Gemini docs. messages.append( { "role": "user", - "content": "Check the status of flight AA100 and, if it's delayed, book me a taxi 2 hours before its departure time.", + "content": "Say hello briefly.", } ) + # Here is an example prompt conducive to demonstrating thinking and + # function calling. + # This example comes from Gemini docs. + # messages.append( + # { + # "role": "user", + # "content": "Check the status of flight AA100 and, if it's delayed, book me a taxi 2 hours before its departure time.", + # } + # ) await task.queue_frames([LLMRunFrame()]) @transport.event_handler("on_client_disconnected") diff --git a/examples/foundational/49d-thinking-functions-google.py b/examples/foundational/49d-thinking-functions-google.py index 3ec2b62d8..cdf4621b1 100644 --- a/examples/foundational/49d-thinking-functions-google.py +++ b/examples/foundational/49d-thinking-functions-google.py @@ -143,13 +143,21 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): async def on_client_connected(transport, client): logger.info(f"Client connected") # Kick off the conversation. - # This example comes from Gemini docs. 
messages.append( { "role": "user", - "content": "Check the status of flight AA100 and, if it's delayed, book me a taxi 2 hours before its departure time.", + "content": "Say hello briefly.", } ) + # Here is an example prompt conducive to demonstrating thinking and + # function calling. + # This example comes from Gemini docs. + # messages.append( + # { + # "role": "user", + # "content": "Check the status of flight AA100 and, if it's delayed, book me a taxi 2 hours before its departure time.", + # } + # ) await task.queue_frames([LLMRunFrame()]) @transport.event_handler("on_client_disconnected") diff --git a/scripts/evals/run-release-evals.py b/scripts/evals/run-release-evals.py index f45128133..863514c64 100644 --- a/scripts/evals/run-release-evals.py +++ b/scripts/evals/run-release-evals.py @@ -74,6 +74,11 @@ EVAL_CONVERSATION = EvalConfig( eval_speaks_first=True, ) +EVAL_FLIGHT_STATUS = EvalConfig( + prompt="Check the status of flight AA100.", + eval="The user says something about the status of flight AA100, such as whether it's on time or delayed.", +) + TESTS_07 = [ # 07 series @@ -204,6 +209,13 @@ TESTS_44 = [ ("44-voicemail-detection.py", EVAL_CONVERSATION), ] +TESTS_49 = [ + ("49a-thinking-anthropic.py", EVAL_SIMPLE_MATH), + ("49b-thinking-google.py", EVAL_SIMPLE_MATH), + ("49c-thinking-functions-anthropic.py", EVAL_FLIGHT_STATUS), + ("49d-thinking-functions-google.py", EVAL_FLIGHT_STATUS), +] + TESTS = [ *TESTS_07, *TESTS_12, @@ -216,6 +228,7 @@ TESTS = [ *TESTS_40, *TESTS_43, *TESTS_44, + *TESTS_49, ]