Add thinking examples to eval suite

This commit is contained in:
Paul Kompfner
2025-12-11 15:58:48 -05:00
parent 28248e9b00
commit 12979293ad
5 changed files with 57 additions and 14 deletions

View File

@@ -111,16 +111,23 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation, using a prompt conducive to demonstrating
# thinking (chosen from Google and Anthropic docs).
# Kick off the conversation.
messages.append(
{
"role": "user",
"content": "Analogize photosynthesis and growing up.",
# "content": "Compare and contrast electric cars and hybrid cars."
# "content": "Are there an infinite number of prime numbers such that n mod 4 == 3?"
"content": "Say hello briefly.",
}
)
# Here are some example prompts conducive to demonstrating
# thinking (picked from Google and Anthropic docs).
# messages.append(
# {
# "role": "user",
# "content": "Analogize photosynthesis and growing up. Keep your answer concise.",
# # "content": "Compare and contrast electric cars and hybrid cars."
# # "content": "Are there an infinite number of prime numbers such that n mod 4 == 3?"
# }
# )
await task.queue_frames([LLMRunFrame()])
@transport.event_handler("on_client_disconnected")

View File

@@ -116,16 +116,23 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation, using a prompt conducive to demonstrating
# thinking (chosen from Google and Anthropic docs).
# Kick off the conversation.
messages.append(
{
"role": "user",
"content": "Analogize photosynthesis and growing up.",
# "content": "Compare and contrast electric cars and hybrid cars."
# "content": "Are there an infinite number of prime numbers such that n mod 4 == 3?"
"content": "Say hello briefly.",
}
)
# Here are some example prompts conducive to demonstrating
# thinking (picked from Google and Anthropic docs).
# messages.append(
# {
# "role": "user",
# "content": "Analogize photosynthesis and growing up. Keep your answer concise.",
# # "content": "Compare and contrast electric cars and hybrid cars."
# # "content": "Are there an infinite number of prime numbers such that n mod 4 == 3?"
# }
# )
await task.queue_frames([LLMRunFrame()])
@transport.event_handler("on_client_disconnected")

View File

@@ -138,13 +138,21 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation.
# This example comes from Gemini docs.
messages.append(
{
"role": "user",
"content": "Check the status of flight AA100 and, if it's delayed, book me a taxi 2 hours before its departure time.",
"content": "Say hello briefly.",
}
)
# Here is an example prompt conducive to demonstrating thinking and
# function calling.
# This example comes from Gemini docs.
# messages.append(
# {
# "role": "user",
# "content": "Check the status of flight AA100 and, if it's delayed, book me a taxi 2 hours before its departure time.",
# }
# )
await task.queue_frames([LLMRunFrame()])
@transport.event_handler("on_client_disconnected")

View File

@@ -143,13 +143,21 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation.
# This example comes from Gemini docs.
messages.append(
{
"role": "user",
"content": "Check the status of flight AA100 and, if it's delayed, book me a taxi 2 hours before its departure time.",
"content": "Say hello briefly.",
}
)
# Here is an example prompt conducive to demonstrating thinking and
# function calling.
# This example comes from Gemini docs.
# messages.append(
# {
# "role": "user",
# "content": "Check the status of flight AA100 and, if it's delayed, book me a taxi 2 hours before its departure time.",
# }
# )
await task.queue_frames([LLMRunFrame()])
@transport.event_handler("on_client_disconnected")

View File

@@ -74,6 +74,11 @@ EVAL_CONVERSATION = EvalConfig(
eval_speaks_first=True,
)
# Eval that asks for the status of flight AA100 and expects the reply to
# mention that status (e.g. on time or delayed). Paired below with the
# "thinking-functions" examples.
EVAL_FLIGHT_STATUS = EvalConfig(
prompt="Check the status of flight AA100.",
eval="The user says something about the status of flight AA100, such as whether it's on time or delayed.",
)
TESTS_07 = [
# 07 series
@@ -204,6 +209,13 @@ TESTS_44 = [
("44-voicemail-detection.py", EVAL_CONVERSATION),
]
TESTS_49 = [
# 49 series: thinking examples (a/b) and thinking + function-calling
# examples (c/d), each paired with the eval config used to check it.
("49a-thinking-anthropic.py", EVAL_SIMPLE_MATH),
("49b-thinking-google.py", EVAL_SIMPLE_MATH),
("49c-thinking-functions-anthropic.py", EVAL_FLIGHT_STATUS),
("49d-thinking-functions-google.py", EVAL_FLIGHT_STATUS),
]
TESTS = [
*TESTS_07,
*TESTS_12,
@@ -216,6 +228,7 @@ TESTS = [
*TESTS_40,
*TESTS_43,
*TESTS_44,
*TESTS_49,
]