Add thinking examples to eval suite
This commit is contained in:
@@ -111,16 +111,23 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation, using a prompt conducive to demonstrating
|
||||
# thinking (chosen from Google and Anthropic docs).
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Analogize photosynthesis and growing up.",
|
||||
# "content": "Compare and contrast electric cars and hybrid cars."
|
||||
# "content": "Are there an infinite number of prime numbers such that n mod 4 == 3?"
|
||||
"content": "Say hello briefly.",
|
||||
}
|
||||
)
|
||||
# Here are some example prompts conducive to demonstrating
|
||||
# thinking (picked from Google and Anthropic docs).
|
||||
# messages.append(
|
||||
# {
|
||||
# "role": "user",
|
||||
# "content": "Analogize photosynthesis and growing up. Keep your answer concise.",
|
||||
# # "content": "Compare and contrast electric cars and hybrid cars."
|
||||
# # "content": "Are there an infinite number of prime numbers such that n mod 4 == 3?"
|
||||
# }
|
||||
# )
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -116,16 +116,23 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation, using a prompt conducive to demonstrating
|
||||
# thinking (chosen from Google and Anthropic docs).
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Analogize photosynthesis and growing up.",
|
||||
# "content": "Compare and contrast electric cars and hybrid cars."
|
||||
# "content": "Are there an infinite number of prime numbers such that n mod 4 == 3?"
|
||||
"content": "Say hello briefly.",
|
||||
}
|
||||
)
|
||||
# Here are some example prompts conducive to demonstrating
|
||||
# thinking (picked from Google and Anthropic docs).
|
||||
# messages.append(
|
||||
# {
|
||||
# "role": "user",
|
||||
# "content": "Analogize photosynthesis and growing up. Keep your answer concise.",
|
||||
# # "content": "Compare and contrast electric cars and hybrid cars."
|
||||
# # "content": "Are there an infinite number of prime numbers such that n mod 4 == 3?"
|
||||
# }
|
||||
# )
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -138,13 +138,21 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
# This example comes from Gemini docs.
|
||||
messages.append(
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Check the status of flight AA100 and, if it's delayed, book me a taxi 2 hours before its departure time.",
|
||||
"content": "Say hello briefly.",
|
||||
}
|
||||
)
|
||||
# Here is an example prompt conducive to demonstrating thinking and
|
||||
# function calling.
|
||||
# This example comes from Gemini docs.
|
||||
# messages.append(
|
||||
# {
|
||||
# "role": "user",
|
||||
# "content": "Check the status of flight AA100 and, if it's delayed, book me a taxi 2 hours before its departure time.",
|
||||
# }
|
||||
# )
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -143,13 +143,21 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
# This example comes from Gemini docs.
|
||||
messages.append(
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Check the status of flight AA100 and, if it's delayed, book me a taxi 2 hours before its departure time.",
|
||||
"content": "Say hello briefly.",
|
||||
}
|
||||
)
|
||||
# Here is an example prompt conducive to demonstrating thinking and
|
||||
# function calling.
|
||||
# This example comes from Gemini docs.
|
||||
# messages.append(
|
||||
# {
|
||||
# "role": "user",
|
||||
# "content": "Check the status of flight AA100 and, if it's delayed, book me a taxi 2 hours before its departure time.",
|
||||
# }
|
||||
# )
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -74,6 +74,11 @@ EVAL_CONVERSATION = EvalConfig(
|
||||
eval_speaks_first=True,
|
||||
)
|
||||
|
||||
EVAL_FLIGHT_STATUS = EvalConfig(
|
||||
prompt="Check the status of flight AA100.",
|
||||
eval="The user says something about the status of flight AA100, such as whether it's on time or delayed.",
|
||||
)
|
||||
|
||||
|
||||
TESTS_07 = [
|
||||
# 07 series
|
||||
@@ -204,6 +209,13 @@ TESTS_44 = [
|
||||
("44-voicemail-detection.py", EVAL_CONVERSATION),
|
||||
]
|
||||
|
||||
TESTS_49 = [
|
||||
("49a-thinking-anthropic.py", EVAL_SIMPLE_MATH),
|
||||
("49b-thinking-google.py", EVAL_SIMPLE_MATH),
|
||||
("49c-thinking-functions-anthropic.py", EVAL_FLIGHT_STATUS),
|
||||
("49d-thinking-functions-google.py", EVAL_FLIGHT_STATUS),
|
||||
]
|
||||
|
||||
TESTS = [
|
||||
*TESTS_07,
|
||||
*TESTS_12,
|
||||
@@ -216,6 +228,7 @@ TESTS = [
|
||||
*TESTS_40,
|
||||
*TESTS_43,
|
||||
*TESTS_44,
|
||||
*TESTS_49,
|
||||
]
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user