scripts(evals): improve eval prompting

This commit is contained in:
Aleix Conchillo Flaqué
2025-10-29 18:09:10 -07:00
parent 8fa6cbac51
commit 74fb6e7676
2 changed files with 12 additions and 13 deletions

View File

@@ -271,10 +271,9 @@ async def run_eval_pipeline(
elif isinstance(eval_config.prompt, tuple):
example_prompt, example_image = eval_config.prompt
eval_prompt = f"The answer is correct if it matches: {eval}."
common_system_prompt = (
"The user might say things other than the answer and that's allowed. "
f"You should only call the eval function with your assessment when the user actually answers the question. {eval_prompt}"
f"You should only call the eval function when the user: {eval_config.eval}"
)
if eval_config.eval_speaks_first:
system_prompt = f"You are an LLM eval, be extremly brief. You will start the conversation by saying: '{example_prompt}'. {common_system_prompt}"

View File

@@ -26,32 +26,32 @@ FOUNDATIONAL_DIR = SCRIPT_DIR.parent.parent / "examples" / "foundational"
EVAL_SIMPLE_MATH = EvalConfig(
prompt="A simple math addition.",
eval="Correct math addition.",
eval="The user answers the math addition correctly.",
)
EVAL_WEATHER = EvalConfig(
prompt="What's the weather in San Francisco?",
eval="Something specific about the current weather in San Francisco, including the degrees.",
eval="The user says something specific about the current weather in San Francisco, including the degrees.",
)
EVAL_ONLINE_SEARCH = EvalConfig(
prompt="What's the date right now in London?",
eval=f"Today is {datetime.now(timezone.utc).strftime('%B %d, %Y')}.",
eval=f"The user says today is {datetime.now(timezone.utc).strftime('%B %d, %Y')} in London.",
)
EVAL_SWITCH_LANGUAGE = EvalConfig(
prompt="Say something in Spanish.",
eval="The user is now talking in Spanish.",
eval="The user talks in Spanish.",
)
EVAL_VISION_CAMERA = EvalConfig(
prompt=("Briefly describe what you see.", Image.open(ASSETS_DIR / "cat.jpg")),
eval="A cat description.",
eval="The user provides a cat description.",
)
EVAL_VISION_IMAGE = EvalConfig(
prompt="Briefly describe this image.",
eval="A cat description.",
eval="The user provides a cat description.",
eval_speaks_first=True,
runner_args_body={
"image_path": ASSETS_DIR / "cat.jpg",
@@ -60,14 +60,14 @@ EVAL_VISION_IMAGE = EvalConfig(
)
EVAL_VOICEMAIL = EvalConfig(
prompt="Please leave a message after the beep.",
eval="Assess the conversation and determine if it is a voicemail.",
prompt="Please leave a message.",
eval="The user leaves a voicemail message.",
eval_speaks_first=True,
)
EVAL_CONVERSATION = EvalConfig(
prompt="Hello, this is Mark.",
eval="A start of a conversation, not a voicemail.",
eval="The user replies with a greeting.",
eval_speaks_first=True,
)
@@ -172,11 +172,11 @@ TESTS_21 = [
]
TESTS_26 = [
("26-gemini-multimodal-live.py", EVAL_SIMPLE_MATH),
("26-gemini-live.py", EVAL_SIMPLE_MATH),
("26a-gemini-live-transcription.py", EVAL_SIMPLE_MATH),
("26b-gemini-live-function-calling.py", EVAL_WEATHER),
("26c-gemini-live-video.py", EVAL_SIMPLE_MATH),
("26e-gemini-multimodal-google-search.py", EVAL_ONLINE_SEARCH),
("26e-gemini-live-google-search.py", EVAL_ONLINE_SEARCH),
("26h-gemini-live-vertex-function-calling.py", EVAL_WEATHER),
# Currently not working.
# ("26d-gemini-live-text.py", EVAL_SIMPLE_MATH),