From 12979293ad3fa74e650067bc30d3b2ee2d3a1fea Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Thu, 11 Dec 2025 15:58:48 -0500
Subject: [PATCH] Add thinking examples to eval suite

---
 examples/foundational/49a-thinking-anthropic.py | 17 ++++++++++++-----
 examples/foundational/49b-thinking-google.py    | 17 ++++++++++++-----
 .../49c-thinking-functions-anthropic.py         | 12 ++++++++++--
 .../49d-thinking-functions-google.py            | 12 ++++++++++--
 scripts/evals/run-release-evals.py              | 13 +++++++++++++
 5 files changed, 57 insertions(+), 14 deletions(-)

diff --git a/examples/foundational/49a-thinking-anthropic.py b/examples/foundational/49a-thinking-anthropic.py
index 6017f335e..a4a315ac5 100644
--- a/examples/foundational/49a-thinking-anthropic.py
+++ b/examples/foundational/49a-thinking-anthropic.py
@@ -111,16 +111,23 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
     @transport.event_handler("on_client_connected")
     async def on_client_connected(transport, client):
         logger.info(f"Client connected")
-        # Kick off the conversation, using a prompt conducive to demonstrating
-        # thinking (chosen from Google and Anthropic docs).
+        # Kick off the conversation.
         messages.append(
             {
                 "role": "user",
-                "content": "Analogize photosynthesis and growing up.",
-                # "content": "Compare and contrast electric cars and hybrid cars."
-                # "content": "Are there an infinite number of prime numbers such that n mod 4 == 3?"
+                "content": "Say hello briefly.",
             }
         )
+        # Here are some example prompts conducive to demonstrating
+        # thinking (picked from Google and Anthropic docs).
+        # messages.append(
+        #     {
+        #         "role": "user",
+        #         "content": "Analogize photosynthesis and growing up. Keep your answer concise.",
+        #         # "content": "Compare and contrast electric cars and hybrid cars."
+        #         # "content": "Are there an infinite number of prime numbers such that n mod 4 == 3?"
+        #     }
+        # )
         await task.queue_frames([LLMRunFrame()])
 
     @transport.event_handler("on_client_disconnected")
diff --git a/examples/foundational/49b-thinking-google.py b/examples/foundational/49b-thinking-google.py
index 85df6da39..40d82dec0 100644
--- a/examples/foundational/49b-thinking-google.py
+++ b/examples/foundational/49b-thinking-google.py
@@ -116,16 +116,23 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
     @transport.event_handler("on_client_connected")
     async def on_client_connected(transport, client):
         logger.info(f"Client connected")
-        # Kick off the conversation, using a prompt conducive to demonstrating
-        # thinking (chosen from Google and Anthropic docs).
+        # Kick off the conversation.
         messages.append(
             {
                 "role": "user",
-                "content": "Analogize photosynthesis and growing up.",
-                # "content": "Compare and contrast electric cars and hybrid cars."
-                # "content": "Are there an infinite number of prime numbers such that n mod 4 == 3?"
+                "content": "Say hello briefly.",
             }
         )
+        # Here are some example prompts conducive to demonstrating
+        # thinking (picked from Google and Anthropic docs).
+        # messages.append(
+        #     {
+        #         "role": "user",
+        #         "content": "Analogize photosynthesis and growing up. Keep your answer concise.",
+        #         # "content": "Compare and contrast electric cars and hybrid cars."
+        #         # "content": "Are there an infinite number of prime numbers such that n mod 4 == 3?"
+        #     }
+        # )
         await task.queue_frames([LLMRunFrame()])
 
     @transport.event_handler("on_client_disconnected")
diff --git a/examples/foundational/49c-thinking-functions-anthropic.py b/examples/foundational/49c-thinking-functions-anthropic.py
index 3d71f2c47..e821b9d09 100644
--- a/examples/foundational/49c-thinking-functions-anthropic.py
+++ b/examples/foundational/49c-thinking-functions-anthropic.py
@@ -138,13 +138,21 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
     async def on_client_connected(transport, client):
         logger.info(f"Client connected")
         # Kick off the conversation.
-        # This example comes from Gemini docs.
         messages.append(
             {
                 "role": "user",
-                "content": "Check the status of flight AA100 and, if it's delayed, book me a taxi 2 hours before its departure time.",
+                "content": "Say hello briefly.",
             }
         )
+        # Here is an example prompt conducive to demonstrating thinking and
+        # function calling.
+        # This example comes from Gemini docs.
+        # messages.append(
+        #     {
+        #         "role": "user",
+        #         "content": "Check the status of flight AA100 and, if it's delayed, book me a taxi 2 hours before its departure time.",
+        #     }
+        # )
         await task.queue_frames([LLMRunFrame()])
 
     @transport.event_handler("on_client_disconnected")
diff --git a/examples/foundational/49d-thinking-functions-google.py b/examples/foundational/49d-thinking-functions-google.py
index 3ec2b62d8..cdf4621b1 100644
--- a/examples/foundational/49d-thinking-functions-google.py
+++ b/examples/foundational/49d-thinking-functions-google.py
@@ -143,13 +143,21 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
     async def on_client_connected(transport, client):
         logger.info(f"Client connected")
         # Kick off the conversation.
-        # This example comes from Gemini docs.
         messages.append(
             {
                 "role": "user",
-                "content": "Check the status of flight AA100 and, if it's delayed, book me a taxi 2 hours before its departure time.",
+                "content": "Say hello briefly.",
             }
         )
+        # Here is an example prompt conducive to demonstrating thinking and
+        # function calling.
+        # This example comes from Gemini docs.
+        # messages.append(
+        #     {
+        #         "role": "user",
+        #         "content": "Check the status of flight AA100 and, if it's delayed, book me a taxi 2 hours before its departure time.",
+        #     }
+        # )
         await task.queue_frames([LLMRunFrame()])
 
     @transport.event_handler("on_client_disconnected")
diff --git a/scripts/evals/run-release-evals.py b/scripts/evals/run-release-evals.py
index f45128133..863514c64 100644
--- a/scripts/evals/run-release-evals.py
+++ b/scripts/evals/run-release-evals.py
@@ -74,6 +74,11 @@ EVAL_CONVERSATION = EvalConfig(
     eval_speaks_first=True,
 )
 
+EVAL_FLIGHT_STATUS = EvalConfig(
+    prompt="Check the status of flight AA100.",
+    eval="The user says something about the status of flight AA100, such as whether it's on time or delayed.",
+)
+
 
 TESTS_07 = [
     # 07 series
@@ -204,6 +209,13 @@ TESTS_44 = [
     ("44-voicemail-detection.py", EVAL_CONVERSATION),
 ]
 
+TESTS_49 = [
+    ("49a-thinking-anthropic.py", EVAL_SIMPLE_MATH),
+    ("49b-thinking-google.py", EVAL_SIMPLE_MATH),
+    ("49c-thinking-functions-anthropic.py", EVAL_FLIGHT_STATUS),
+    ("49d-thinking-functions-google.py", EVAL_FLIGHT_STATUS),
+]
+
 TESTS = [
     *TESTS_07,
     *TESTS_12,
@@ -216,6 +228,7 @@ TESTS = [
     *TESTS_40,
     *TESTS_43,
     *TESTS_44,
+    *TESTS_49,
 ]