diff --git a/scripts/evals/eval.py b/scripts/evals/eval.py index bbc170803..3818cd568 100644 --- a/scripts/evals/eval.py +++ b/scripts/evals/eval.py @@ -93,7 +93,7 @@ class EvalRunner: self, example_file: str, prompt: EvalPrompt, - eval: Optional[str] = None, + eval: str, user_speaks_first: bool = False, ): if not re.match(self._pattern, example_file): @@ -203,7 +203,7 @@ async def run_eval_pipeline( eval_runner: EvalRunner, example_file: str, prompt: EvalPrompt, - eval: Optional[str], + eval: str, user_speaks_first: bool = False, ): logger.info(f"Starting eval bot") @@ -266,15 +266,12 @@ async def run_eval_pipeline( elif isinstance(prompt, tuple): example_prompt, example_image = prompt - # See if we need to include an eval prompt. - eval_prompt = "" - if eval: - if user_speaks_first: - eval_prompt = f"After the user responds, evaluate if their response is appropriate for the context and matches: [{eval}]." - system_prompt = f"You will start the conversation by saying: '{prompt}'. {eval_prompt} Then call the eval function with your assessment." - else: - eval_prompt = f"The answer is correct if the user says [{eval}]." - system_prompt = f"You are an LLM eval, be extremly brief. Your goal is to only ask one question: {example_prompt}. Call the eval function only if the user answers the question and check if the answer is correct (words as numbers are valid). {eval_prompt}" + eval_prompt = f"The answer is correct if it's appropriate for the context and matches: {eval}." + common_system_prompt = f"Call the eval function with your assessment only if the user answers the question. {eval_prompt}" + if user_speaks_first: + system_prompt = f"You are an LLM eval, be extremly brief. You will start the conversation by saying: '{example_prompt}'. {common_system_prompt}" + else: + system_prompt = f"You are an LLM eval, be extremly brief. Your goal is to first ask one question: {example_prompt}. {common_system_prompt}" messages = [ { diff --git a/scripts/evals/run-release-evals.py b/scripts/evals/run-release-evals.py index 938200464..8e4e32419 100644 --- a/scripts/evals/run-release-evals.py +++ b/scripts/evals/run-release-evals.py @@ -30,6 +30,7 @@ BOT_SPEAKS_FIRST = False # Math PROMPT_SIMPLE_MATH = "A simple math addition." +EVAL_SIMPLE_MATH = "Correct math addition." # Weather PROMPT_WEATHER = "What's the weather in San Francisco?" @@ -43,7 +44,7 @@ EVAL_ONLINE_SEARCH = f"Today is {datetime.now(timezone.utc).strftime('%B %d, %Y' # Switch language PROMPT_SWITCH_LANGUAGE = "Say something in Spanish." -EVAL_SWITCH_LANGUAGE = "Check if the user is now talking in Spanish." +EVAL_SWITCH_LANGUAGE = "The user is now talking in Spanish." # Vision PROMPT_VISION = ("What do you see?", Image.open(ASSETS_DIR / "cat.jpg")) @@ -57,45 +58,55 @@ EVAL_CONVERSATION = "A start of a conversation, not a voicemail." TESTS_07 = [ # 07 series - ("07-interruptible.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07-interruptible-cartesia-http.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07a-interruptible-speechmatics.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07aa-interruptible-soniox.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07ab-interruptible-inworld-http.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07ac-interruptible-asyncai.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07ac-interruptible-asyncai-http.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07b-interruptible-langchain.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07c-interruptible-deepgram.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07d-interruptible-elevenlabs.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07d-interruptible-elevenlabs-http.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07e-interruptible-playht.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07e-interruptible-playht-http.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07f-interruptible-azure.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07g-interruptible-openai.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07h-interruptible-openpipe.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07j-interruptible-gladia.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07k-interruptible-lmnt.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07l-interruptible-groq.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07m-interruptible-aws.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07n-interruptible-gemini.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07n-interruptible-google.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07o-interruptible-assemblyai.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07q-interruptible-rime.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07q-interruptible-rime-http.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07r-interruptible-riva-nim.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07s-interruptible-google-audio-in.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07t-interruptible-fish.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07v-interruptible-neuphonic.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07v-interruptible-neuphonic-http.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07w-interruptible-fal.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07y-interruptible-minimax.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("07z-interruptible-sarvam.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("07-interruptible.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07-interruptible-cartesia-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07a-interruptible-speechmatics.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07aa-interruptible-soniox.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07ab-interruptible-inworld-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07ac-interruptible-asyncai.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07ac-interruptible-asyncai-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07b-interruptible-langchain.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07c-interruptible-deepgram.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07d-interruptible-elevenlabs.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ( + "07d-interruptible-elevenlabs-http.py", + PROMPT_SIMPLE_MATH, + EVAL_SIMPLE_MATH, + BOT_SPEAKS_FIRST, + ), + ("07e-interruptible-playht.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07e-interruptible-playht-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07f-interruptible-azure.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07g-interruptible-openai.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07h-interruptible-openpipe.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07j-interruptible-gladia.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07k-interruptible-lmnt.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07l-interruptible-groq.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07m-interruptible-aws.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07n-interruptible-gemini.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07n-interruptible-google.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07o-interruptible-assemblyai.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07q-interruptible-rime.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07q-interruptible-rime-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07r-interruptible-riva-nim.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ( + "07s-interruptible-google-audio-in.py", + PROMPT_SIMPLE_MATH, + EVAL_SIMPLE_MATH, + BOT_SPEAKS_FIRST, + ), + ("07t-interruptible-fish.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07v-interruptible-neuphonic.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07v-interruptible-neuphonic-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07w-interruptible-fal.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07y-interruptible-minimax.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("07z-interruptible-sarvam.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), # Needs a local XTTS docker instance running. - # ("07i-interruptible-xtts.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + # ("07i-interruptible-xtts.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), # Needs a Krisp license. - # ("07p-interruptible-krisp.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + # ("07p-interruptible-krisp.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), # Needs GPU resources. - # ("07u-interruptible-ultravox.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + # ("07u-interruptible-ultravox.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), ] TESTS_12 = [ @@ -141,19 +152,24 @@ TESTS_19 = [ ] TESTS_21 = [ - ("21a-tavus-video-service.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("21a-tavus-video-service.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), ] TESTS_26 = [ - ("26-gemini-multimodal-live.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), - ("26a-gemini-multimodal-live-transcription.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("26-gemini-multimodal-live.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ( + "26a-gemini-multimodal-live-transcription.py", + PROMPT_SIMPLE_MATH, + EVAL_SIMPLE_MATH, + BOT_SPEAKS_FIRST, + ), ( "26b-gemini-multimodal-live-function-calling.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST, ), - ("26c-gemini-multimodal-live-video.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("26c-gemini-multimodal-live-video.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), ( "26e-gemini-multimodal-google-search.py", PROMPT_ONLINE_SEARCH, @@ -161,19 +177,19 @@ TESTS_26 = [ BOT_SPEAKS_FIRST, ), # Currently not working. - # ("26d-gemini-multimodal-live-text.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + # ("26d-gemini-multimodal-live-text.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), ] TESTS_27 = [ - ("27-simli-layer.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("27-simli-layer.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), ] TESTS_40 = [ - ("40-aws-nova-sonic.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("40-aws-nova-sonic.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), ] TESTS_43 = [ - ("43a-heygen-video-service.py", PROMPT_SIMPLE_MATH, None, BOT_SPEAKS_FIRST), + ("43a-heygen-video-service.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), ] TESTS_44 = [