Merge pull request #2831 from pipecat-ai/mb/update-evals

Update release evals for OpenAI Realtime, Gemini Live Vertex; shorten…
2025-10-10 12:34:27 -04:00
parent 8d2a98e0e7 4b415721e2
commit f7fe673ad1
14 changed files with 212 additions and 145 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1487,7 +1487,7 @@ quality and critical bugs impacting `ParallelPipelines` functionality.**
 - Added `session_token` parameter to `AWSNovaSonicLLMService`.

 - Added Gemini Multimodal Live File API for uploading, fetching, listing, and
-  deleting files. See `26f-gemini-multimodal-live-files-api.py` for example usage.
+  deleting files. See `26f-gemini-live-files-api.py` for example usage.

 ### Changed

@@ -3493,7 +3493,7 @@ stt = DeepgramSTTService(..., live_options=LiveOptions(model="nova-2-general"))
 - Added the new modalities option and helper function to set Gemini output
  modalities.

- Added `examples/foundational/26d-gemini-multimodal-live-text.py` which is
+- Added `examples/foundational/26d-gemini-live-text.py` which is
  using Gemini as TEXT modality and using another TTS provider for TTS process.

 ### Changed
@@ -3680,9 +3680,9 @@ stt = DeepgramSTTService(..., live_options=LiveOptions(model="nova-2-general"))
 - Added new foundational examples for `GeminiMultimodalLiveLLMService`:

  - `26-gemini-multimodal-live.py`
-  - `26a-gemini-multimodal-live-transcription.py`
-  - `26b-gemini-multimodal-live-video.py`
-  - `26c-gemini-multimodal-live-video.py`
+  - `26a-gemini-live-transcription.py`
+  - `26b-gemini-live-video.py`
+  - `26c-gemini-live-video.py`

 - Added `SimliVideoService`. This is an integration for Simli AI avatars.
  (see https://www.simli.com)
--- a/examples/foundational/26-gemini-multimodal-live.py
+++ b/examples/foundational/26-gemini-multimodal-live.py
--- a/examples/foundational/26a-gemini-multimodal-live-transcription.py
+++ b/examples/foundational/26a-gemini-multimodal-live-transcription.py
--- a/examples/foundational/26b-gemini-multimodal-live-function-calling.py
+++ b/examples/foundational/26b-gemini-multimodal-live-function-calling.py
@@ -122,7 +122,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
        required=["location"],
    )
    search_tool = {"google_search": {}}
-    # KNOWN ISSUE: If using GeminiVertexMultimodalLiveLLMService, it appears
+    # KNOWN ISSUE: If using GeminiVertexLiveLLMService, it appears
    # you cannot use the "google_search" tool alongside other tools.
    # See https://github.com/googleapis/python-genai/issues/941.
    tools = ToolsSchema(
--- a/examples/foundational/26c-gemini-multimodal-live-video.py
+++ b/examples/foundational/26c-gemini-multimodal-live-video.py
--- a/examples/foundational/26d-gemini-multimodal-live-text.py
+++ b/examples/foundational/26d-gemini-multimodal-live-text.py
@@ -80,7 +80,7 @@ transport_params = {
 async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    logger.info(f"Starting bot")

-    # KNOWN ISSUE: If using GeminiVertexMultimodalLiveLLMService, it appears
+    # KNOWN ISSUE: If using GeminiVertexLiveLLMService, it appears
    # you cannot specify a modality other than AUDIO.
    llm = GeminiLiveLLMService(
        api_key=os.getenv("GOOGLE_API_KEY"),
--- a/examples/foundational/26e-gemini-multimodal-google-search.py
+++ b/examples/foundational/26e-gemini-multimodal-google-search.py
--- a/examples/foundational/26f-gemini-multimodal-live-files-api.py
+++ b/examples/foundational/26f-gemini-multimodal-live-files-api.py
--- a/examples/foundational/26g-gemini-multimodal-live-groundingMetadata.py
+++ b/examples/foundational/26g-gemini-multimodal-live-groundingMetadata.py
--- a/examples/foundational/26h-gemini-live-vertex-function-calling.py
+++ b/examples/foundational/26h-gemini-live-vertex-function-calling.py
@@ -0,0 +1,191 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+
+import os
+from datetime import datetime
+
+from dotenv import load_dotenv
+from google.genai.types import HttpOptions
+from loguru import logger
+
+from pipecat.adapters.schemas.function_schema import FunctionSchema
+from pipecat.adapters.schemas.tools_schema import AdapterType, ToolsSchema
+from pipecat.audio.vad.silero import SileroVADAnalyzer
+from pipecat.audio.vad.vad_analyzer import VADParams
+from pipecat.frames.frames import LLMRunFrame
+from pipecat.pipeline.pipeline import Pipeline
+from pipecat.pipeline.runner import PipelineRunner
+from pipecat.pipeline.task import PipelineParams, PipelineTask
+from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
+from pipecat.runner.types import RunnerArguments
+from pipecat.runner.utils import create_transport
+from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
+from pipecat.services.google.gemini_live.llm_vertex import GeminiLiveVertexLLMService
+from pipecat.services.llm_service import FunctionCallParams
+from pipecat.transports.base_transport import BaseTransport, TransportParams
+from pipecat.transports.daily.transport import DailyParams
+from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
+
+load_dotenv(override=True)
+
+
+async def fetch_weather_from_api(params: FunctionCallParams):
+    temperature = 75 if params.arguments["format"] == "fahrenheit" else 24
+    await params.result_callback(
+        {
+            "conditions": "nice",
+            "temperature": temperature,
+            "format": params.arguments["format"],
+            "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
+        }
+    )
+
+
+async def fetch_restaurant_recommendation(params: FunctionCallParams):
+    await params.result_callback({"name": "The Golden Dragon"})
+
+
+system_instruction = """
+You are a helpful assistant who can answer questions and use tools.
+
+You have three tools available to you:
+1. get_current_weather: Use this tool to get the current weather in a specific location.
+2. get_restaurant_recommendation: Use this tool to get a restaurant recommendation in a specific location.
+"""
+
+
+# We store functions so objects (e.g. SileroVADAnalyzer) don't get
+# instantiated. The function will be called when the desired transport gets
+# selected.
+transport_params = {
+    "daily": lambda: DailyParams(
+        audio_in_enabled=True,
+        audio_out_enabled=True,
+        # set stop_secs to something roughly similar to the internal setting
+        # of the Multimodal Live api, just to align events. This doesn't really
+        # matter because we can only use the Multimodal Live API's phrase
+        # endpointing, for now.
+        vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
+    ),
+    "twilio": lambda: FastAPIWebsocketParams(
+        audio_in_enabled=True,
+        audio_out_enabled=True,
+        # set stop_secs to something roughly similar to the internal setting
+        # of the Multimodal Live api, just to align events. This doesn't really
+        # matter because we can only use the Multimodal Live API's phrase
+        # endpointing, for now.
+        vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
+    ),
+    "webrtc": lambda: TransportParams(
+        audio_in_enabled=True,
+        audio_out_enabled=True,
+        # set stop_secs to something roughly similar to the internal setting
+        # of the Multimodal Live api, just to align events. This doesn't really
+        # matter because we can only use the Multimodal Live API's phrase
+        # endpointing, for now.
+        vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
+    ),
+}
+
+
+async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
+    logger.info(f"Starting bot")
+
+    weather_function = FunctionSchema(
+        name="get_current_weather",
+        description="Get the current weather",
+        properties={
+            "location": {
+                "type": "string",
+                "description": "The city and state, e.g. San Francisco, CA",
+            },
+            "format": {
+                "type": "string",
+                "enum": ["celsius", "fahrenheit"],
+                "description": "The temperature unit to use. Infer this from the user's location.",
+            },
+        },
+        required=["location", "format"],
+    )
+    restaurant_function = FunctionSchema(
+        name="get_restaurant_recommendation",
+        description="Get a restaurant recommendation",
+        properties={
+            "location": {
+                "type": "string",
+                "description": "The city and state, e.g. San Francisco, CA",
+            },
+        },
+        required=["location"],
+    )
+    # KNOWN ISSUE: If using GeminiVertexLiveLLMService, it appears
+    # you cannot use the "google_search" tool alongside other tools.
+    # See https://github.com/googleapis/python-genai/issues/941.
+    tools = ToolsSchema(standard_tools=[weather_function, restaurant_function])
+
+    llm = GeminiLiveVertexLLMService(
+        credentials=os.getenv("GOOGLE_VERTEX_TEST_CREDENTIALS"),
+        project_id=os.getenv("GOOGLE_CLOUD_PROJECT_ID"),
+        location=os.getenv("GOOGLE_CLOUD_LOCATION"),
+        system_instruction=system_instruction,
+        voice_id="Puck",  # Aoede, Charon, Fenrir, Kore, Puck
+        tools=tools,
+    )
+
+    llm.register_function("get_current_weather", fetch_weather_from_api)
+    llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
+
+    context = OpenAILLMContext(
+        [{"role": "user", "content": "Say hello."}],
+    )
+    context_aggregator = llm.create_context_aggregator(context)
+
+    pipeline = Pipeline(
+        [
+            transport.input(),
+            context_aggregator.user(),
+            llm,
+            transport.output(),
+            context_aggregator.assistant(),
+        ]
+    )
+
+    task = PipelineTask(
+        pipeline,
+        params=PipelineParams(
+            enable_metrics=True,
+            enable_usage_metrics=True,
+        ),
+        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
+    )
+
+    @transport.event_handler("on_client_connected")
+    async def on_client_connected(transport, client):
+        logger.info(f"Client connected")
+        # Kick off the conversation.
+        await task.queue_frames([LLMRunFrame()])
+
+    @transport.event_handler("on_client_disconnected")
+    async def on_client_disconnected(transport, client):
+        logger.info(f"Client disconnected")
+        await task.cancel()
+
+    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
+
+    await runner.run(task)
+
+
+async def bot(runner_args: RunnerArguments):
+    """Main bot entry point compatible with Pipecat Cloud."""
+    transport = await create_transport(runner_args, transport_params)
+    await run_bot(transport, runner_args)
+
+
+if __name__ == "__main__":
+    from pipecat.runner.run import main
+
+    main()
--- a/examples/foundational/26h-gemini-multimodal-live-vertex.py
+++ b/examples/foundational/26h-gemini-multimodal-live-vertex.py
@@ -1,133 +0,0 @@
-#
-# Copyright (c) 2024–2025, Daily
-#
-# SPDX-License-Identifier: BSD 2-Clause License
-#
-
-import os
-
-from dotenv import load_dotenv
-from loguru import logger
-
-from pipecat.audio.vad.silero import SileroVADAnalyzer
-from pipecat.audio.vad.vad_analyzer import VADParams
-from pipecat.frames.frames import LLMMessagesAppendFrame
-from pipecat.pipeline.pipeline import Pipeline
-from pipecat.pipeline.runner import PipelineRunner
-from pipecat.pipeline.task import PipelineParams, PipelineTask
-from pipecat.runner.types import RunnerArguments
-from pipecat.runner.utils import create_transport
-from pipecat.services.google.gemini_live.llm_vertex import GeminiLiveVertexLLMService
-from pipecat.transports.base_transport import BaseTransport, TransportParams
-from pipecat.transports.daily.transport import DailyParams
-from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
-
-# Load environment variables
-load_dotenv(override=True)
-
-
-# We store functions so objects (e.g. SileroVADAnalyzer) don't get
-# instantiated. The function will be called when the desired transport gets
-# selected.
-transport_params = {
-    "daily": lambda: DailyParams(
-        audio_in_enabled=True,
-        audio_out_enabled=True,
-        # set stop_secs to something roughly similar to the internal setting
-        # of the Multimodal Live api, just to align events.
-        vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
-    ),
-    "twilio": lambda: FastAPIWebsocketParams(
-        audio_in_enabled=True,
-        audio_out_enabled=True,
-        # set stop_secs to something roughly similar to the internal setting
-        # of the Multimodal Live api, just to align events.
-        vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
-    ),
-    "webrtc": lambda: TransportParams(
-        audio_in_enabled=True,
-        audio_out_enabled=True,
-        # set stop_secs to something roughly similar to the internal setting
-        # of the Multimodal Live api, just to align events.
-        vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
-    ),
-}
-
-
-async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
-    logger.info(f"Starting bot")
-
-    # Create the Gemini Vertex Multimodal Live LLM service
-    system_instruction = f"""
-    You are a helpful AI assistant.
-    Your goal is to demonstrate your capabilities in a helpful and engaging way.
-    Your output will be converted to audio so don't include special characters in your answers.
-    Respond to what the user said in a creative and helpful way.
-    """
-
-    llm = GeminiLiveVertexLLMService(
-        credentials=os.getenv("GOOGLE_VERTEX_TEST_CREDENTIALS"),
-        project_id=os.getenv("GOOGLE_CLOUD_PROJECT_ID"),
-        location=os.getenv("GOOGLE_CLOUD_LOCATION"),
-        system_instruction=system_instruction,
-        voice_id="Puck",  # Aoede, Charon, Fenrir, Kore, Puck
-    )
-
-    # Build the pipeline
-    pipeline = Pipeline(
-        [
-            transport.input(),
-            llm,
-            transport.output(),
-        ]
-    )
-
-    # Configure the pipeline task
-    task = PipelineTask(
-        pipeline,
-        params=PipelineParams(
-            enable_metrics=True,
-            enable_usage_metrics=True,
-        ),
-        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
-    )
-
-    # Handle client connection event
-    @transport.event_handler("on_client_connected")
-    async def on_client_connected(transport, client):
-        logger.info(f"Client connected")
-        # Kick off the conversation.
-        await task.queue_frames(
-            [
-                LLMMessagesAppendFrame(
-                    messages=[
-                        {
-                            "role": "user",
-                            "content": f"Greet the user and introduce yourself.",
-                        }
-                    ]
-                )
-            ]
-        )
-
-    # Handle client disconnection events
-    @transport.event_handler("on_client_disconnected")
-    async def on_client_disconnected(transport, client):
-        logger.info(f"Client disconnected")
-        await task.cancel()
-
-    # Run the pipeline
-    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
-    await runner.run(task)
-
-
-async def bot(runner_args: RunnerArguments):
-    """Main bot entry point compatible with Pipecat Cloud."""
-    transport = await create_transport(runner_args, transport_params)
-    await run_bot(transport, runner_args)
-
-
-if __name__ == "__main__":
-    from pipecat.runner.run import main
-
-    main()
--- a/examples/foundational/26i-gemini-multimodal-live-graceful-end.py
+++ b/examples/foundational/26i-gemini-multimodal-live-graceful-end.py
--- a/examples/foundational/README.md
+++ b/examples/foundational/README.md
@@ -105,7 +105,7 @@ uv run 07-interruptible.py -t twilio -x NGROK_HOST_NAME
 ### Vision & Multimodal

 - **[12a-describe-video-gemini-flash.py](./12a-describe-video-gemini-flash.py)**: Bot describes user's video (Video input, Multimodal LLMs)
- **[26c-gemini-multimodal-live-video.py](./26c-gemini-multimodal-live-video.py)**: Gemini with video input (Streaming video, Function calls)
+- **[26c-gemini-live-video.py](./26c-gemini-live-video.py)**: Gemini with video input (Streaming video, Function calls)

 ### Voice & Language

--- a/scripts/evals/run-release-evals.py
+++ b/scripts/evals/run-release-evals.py
@@ -147,7 +147,10 @@ TESTS_15 = [
 ]

 TESTS_19 = [
+    ("19-openai-realtime.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
    ("19-openai-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
+    # OpenAI Realtime not released on Azure yet
+    # ("19a-azure-realtime.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
    ("19a-azure-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
    ("19b-openai-realtime-text.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
    ("19b-openai-realtime-beta-text.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
@@ -160,18 +163,18 @@ TESTS_21 = [
 TESTS_26 = [
    ("26-gemini-multimodal-live.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
    (
-        "26a-gemini-multimodal-live-transcription.py",
+        "26a-gemini-live-transcription.py",
        PROMPT_SIMPLE_MATH,
        EVAL_SIMPLE_MATH,
        BOT_SPEAKS_FIRST,
    ),
    (
-        "26b-gemini-multimodal-live-function-calling.py",
+        "26b-gemini-live-function-calling.py",
        PROMPT_WEATHER,
        EVAL_WEATHER,
        BOT_SPEAKS_FIRST,
    ),
-    ("26c-gemini-multimodal-live-video.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
+    ("26c-gemini-live-video.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
    (
        "26e-gemini-multimodal-google-search.py",
        PROMPT_ONLINE_SEARCH,
@@ -179,7 +182,13 @@ TESTS_26 = [
        BOT_SPEAKS_FIRST,
    ),
    # Currently not working.
-    # ("26d-gemini-multimodal-live-text.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
+    # ("26d-gemini-live-text.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
+    (
+        "26h-gemini-live-vertex-function-calling.py",
+        PROMPT_WEATHER,
+        EVAL_WEATHER,
+        BOT_SPEAKS_FIRST,
+    ),
 ]

 TESTS_27 = [