Add thinking examples to eval suite

2025-12-11 15:58:48 -05:00
parent 28248e9b00
commit 12979293ad
5 changed files with 57 additions and 14 deletions
--- a/examples/foundational/49a-thinking-anthropic.py
+++ b/examples/foundational/49a-thinking-anthropic.py
@@ -111,16 +111,23 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        logger.info(f"Client connected")
-        # Kick off the conversation, using a prompt conducive to demonstrating
-        # thinking (chosen from Google and Anthropic docs).
+        # Kick off the conversation.
        messages.append(
            {
                "role": "user",
-                "content": "Analogize photosynthesis and growing up.",
-                # "content": "Compare and contrast electric cars and hybrid cars."
-                # "content": "Are there an infinite number of prime numbers such that n mod 4 == 3?"
+                "content": "Say hello briefly.",
            }
        )
+        # Here are some example prompts conducive to demonstrating
+        # thinking (picked from Google and Anthropic docs).
+        # messages.append(
+        #     {
+        #         "role": "user",
+        #         "content": "Analogize photosynthesis and growing up. Keep your answer concise.",
+        #         # "content": "Compare and contrast electric cars and hybrid cars."
+        #         # "content": "Are there an infinite number of prime numbers such that n mod 4 == 3?"
+        #     }
+        # )
        await task.queue_frames([LLMRunFrame()])

    @transport.event_handler("on_client_disconnected")
--- a/examples/foundational/49b-thinking-google.py
+++ b/examples/foundational/49b-thinking-google.py
@@ -116,16 +116,23 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        logger.info(f"Client connected")
-        # Kick off the conversation, using a prompt conducive to demonstrating
-        # thinking (chosen from Google and Anthropic docs).
+        # Kick off the conversation.
        messages.append(
            {
                "role": "user",
-                "content": "Analogize photosynthesis and growing up.",
-                # "content": "Compare and contrast electric cars and hybrid cars."
-                # "content": "Are there an infinite number of prime numbers such that n mod 4 == 3?"
+                "content": "Say hello briefly.",
            }
        )
+        # Here are some example prompts conducive to demonstrating
+        # thinking (picked from Google and Anthropic docs).
+        # messages.append(
+        #     {
+        #         "role": "user",
+        #         "content": "Analogize photosynthesis and growing up. Keep your answer concise.",
+        #         # "content": "Compare and contrast electric cars and hybrid cars."
+        #         # "content": "Are there an infinite number of prime numbers such that n mod 4 == 3?"
+        #     }
+        # )
        await task.queue_frames([LLMRunFrame()])

    @transport.event_handler("on_client_disconnected")
--- a/examples/foundational/49c-thinking-functions-anthropic.py
+++ b/examples/foundational/49c-thinking-functions-anthropic.py
@@ -138,13 +138,21 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    async def on_client_connected(transport, client):
        logger.info(f"Client connected")
        # Kick off the conversation.
-        # This example comes from Gemini docs.
        messages.append(
            {
                "role": "user",
-                "content": "Check the status of flight AA100 and, if it's delayed, book me a taxi 2 hours before its departure time.",
+                "content": "Say hello briefly.",
            }
        )
+        # Here is an example prompt conducive to demonstrating thinking and
+        # function calling.
+        # This example comes from Gemini docs.
+        # messages.append(
+        #     {
+        #         "role": "user",
+        #         "content": "Check the status of flight AA100 and, if it's delayed, book me a taxi 2 hours before its departure time.",
+        #     }
+        # )
        await task.queue_frames([LLMRunFrame()])

    @transport.event_handler("on_client_disconnected")
--- a/examples/foundational/49d-thinking-functions-google.py
+++ b/examples/foundational/49d-thinking-functions-google.py
@@ -143,13 +143,21 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    async def on_client_connected(transport, client):
        logger.info(f"Client connected")
        # Kick off the conversation.
-        # This example comes from Gemini docs.
        messages.append(
            {
                "role": "user",
-                "content": "Check the status of flight AA100 and, if it's delayed, book me a taxi 2 hours before its departure time.",
+                "content": "Say hello briefly.",
            }
        )
+        # Here is an example prompt conducive to demonstrating thinking and
+        # function calling.
+        # This example comes from Gemini docs.
+        # messages.append(
+        #     {
+        #         "role": "user",
+        #         "content": "Check the status of flight AA100 and, if it's delayed, book me a taxi 2 hours before its departure time.",
+        #     }
+        # )
        await task.queue_frames([LLMRunFrame()])

    @transport.event_handler("on_client_disconnected")
--- a/scripts/evals/run-release-evals.py
+++ b/scripts/evals/run-release-evals.py
@@ -74,6 +74,11 @@ EVAL_CONVERSATION = EvalConfig(
    eval_speaks_first=True,
 )

+EVAL_FLIGHT_STATUS = EvalConfig(
+    prompt="Check the status of flight AA100.",
+    eval="The user says something about the status of flight AA100, such as whether it's on time or delayed.",
+)
+

 TESTS_07 = [
    # 07 series
@@ -204,6 +209,13 @@ TESTS_44 = [
    ("44-voicemail-detection.py", EVAL_CONVERSATION),
 ]

+TESTS_49 = [
+    ("49a-thinking-anthropic.py", EVAL_SIMPLE_MATH),
+    ("49b-thinking-google.py", EVAL_SIMPLE_MATH),
+    ("49c-thinking-functions-anthropic.py", EVAL_FLIGHT_STATUS),
+    ("49d-thinking-functions-google.py", EVAL_FLIGHT_STATUS),
+]
+
 TESTS = [
    *TESTS_07,
    *TESTS_12,
@@ -216,6 +228,7 @@ TESTS = [
    *TESTS_40,
    *TESTS_43,
    *TESTS_44,
+    *TESTS_49,
 ]