diff --git a/CHANGELOG.md b/CHANGELOG.md index 9763f78f2..70324c65e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1487,7 +1487,7 @@ quality and critical bugs impacting `ParallelPipelines` functionality.** - Added `session_token` parameter to `AWSNovaSonicLLMService`. - Added Gemini Multimodal Live File API for uploading, fetching, listing, and - deleting files. See `26f-gemini-multimodal-live-files-api.py` for example usage. + deleting files. See `26f-gemini-live-files-api.py` for example usage. ### Changed @@ -3493,7 +3493,7 @@ stt = DeepgramSTTService(..., live_options=LiveOptions(model="nova-2-general")) - Added the new modalities option and helper function to set Gemini output modalities. -- Added `examples/foundational/26d-gemini-multimodal-live-text.py` which is +- Added `examples/foundational/26d-gemini-live-text.py` which is using Gemini as TEXT modality and using another TTS provider for TTS process. ### Changed @@ -3680,9 +3680,9 @@ stt = DeepgramSTTService(..., live_options=LiveOptions(model="nova-2-general")) - Added new foundational examples for `GeminiMultimodalLiveLLMService`: - `26-gemini-multimodal-live.py` - - `26a-gemini-multimodal-live-transcription.py` - - `26b-gemini-multimodal-live-video.py` - - `26c-gemini-multimodal-live-video.py` + - `26a-gemini-live-transcription.py` + - `26b-gemini-live-video.py` + - `26c-gemini-live-video.py` - Added `SimliVideoService`. This is an integration for Simli AI avatars. 
(see https://www.simli.com) diff --git a/examples/foundational/26-gemini-multimodal-live.py b/examples/foundational/26-gemini-live.py similarity index 100% rename from examples/foundational/26-gemini-multimodal-live.py rename to examples/foundational/26-gemini-live.py diff --git a/examples/foundational/26a-gemini-multimodal-live-transcription.py b/examples/foundational/26a-gemini-live-transcription.py similarity index 100% rename from examples/foundational/26a-gemini-multimodal-live-transcription.py rename to examples/foundational/26a-gemini-live-transcription.py diff --git a/examples/foundational/26b-gemini-multimodal-live-function-calling.py b/examples/foundational/26b-gemini-live-function-calling.py similarity index 98% rename from examples/foundational/26b-gemini-multimodal-live-function-calling.py rename to examples/foundational/26b-gemini-live-function-calling.py index 8fecf0de6..65d159bb0 100644 --- a/examples/foundational/26b-gemini-multimodal-live-function-calling.py +++ b/examples/foundational/26b-gemini-live-function-calling.py @@ -122,7 +122,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): required=["location"], ) search_tool = {"google_search": {}} - # KNOWN ISSUE: If using GeminiVertexMultimodalLiveLLMService, it appears + # KNOWN ISSUE: If using GeminiLiveVertexLLMService, it appears # you cannot use the "google_search" tool alongside other tools. # See https://github.com/googleapis/python-genai/issues/941. 
tools = ToolsSchema( diff --git a/examples/foundational/26c-gemini-multimodal-live-video.py b/examples/foundational/26c-gemini-live-video.py similarity index 100% rename from examples/foundational/26c-gemini-multimodal-live-video.py rename to examples/foundational/26c-gemini-live-video.py diff --git a/examples/foundational/26d-gemini-multimodal-live-text.py b/examples/foundational/26d-gemini-live-text.py similarity index 98% rename from examples/foundational/26d-gemini-multimodal-live-text.py rename to examples/foundational/26d-gemini-live-text.py index 05b792c00..42387f76d 100644 --- a/examples/foundational/26d-gemini-multimodal-live-text.py +++ b/examples/foundational/26d-gemini-live-text.py @@ -80,7 +80,7 @@ transport_params = { async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): logger.info(f"Starting bot") - # KNOWN ISSUE: If using GeminiVertexMultimodalLiveLLMService, it appears + # KNOWN ISSUE: If using GeminiLiveVertexLLMService, it appears # you cannot specify a modality other than AUDIO. 
llm = GeminiLiveLLMService( api_key=os.getenv("GOOGLE_API_KEY"), diff --git a/examples/foundational/26e-gemini-multimodal-google-search.py b/examples/foundational/26e-gemini-live-google-search.py similarity index 100% rename from examples/foundational/26e-gemini-multimodal-google-search.py rename to examples/foundational/26e-gemini-live-google-search.py diff --git a/examples/foundational/26f-gemini-multimodal-live-files-api.py b/examples/foundational/26f-gemini-live-files-api.py similarity index 100% rename from examples/foundational/26f-gemini-multimodal-live-files-api.py rename to examples/foundational/26f-gemini-live-files-api.py diff --git a/examples/foundational/26g-gemini-multimodal-live-groundingMetadata.py b/examples/foundational/26g-gemini-live-groundingMetadata.py similarity index 100% rename from examples/foundational/26g-gemini-multimodal-live-groundingMetadata.py rename to examples/foundational/26g-gemini-live-groundingMetadata.py diff --git a/examples/foundational/26h-gemini-live-vertex-function-calling.py b/examples/foundational/26h-gemini-live-vertex-function-calling.py new file mode 100644 index 000000000..c0344a052 --- /dev/null +++ b/examples/foundational/26h-gemini-live-vertex-function-calling.py @@ -0,0 +1,191 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + + +import os +from datetime import datetime + +from dotenv import load_dotenv +from google.genai.types import HttpOptions +from loguru import logger + +from pipecat.adapters.schemas.function_schema import FunctionSchema +from pipecat.adapters.schemas.tools_schema import AdapterType, ToolsSchema +from pipecat.audio.vad.silero import SileroVADAnalyzer +from pipecat.audio.vad.vad_analyzer import VADParams +from pipecat.frames.frames import LLMRunFrame +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineParams, PipelineTask +from 
pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext +from pipecat.runner.types import RunnerArguments +from pipecat.runner.utils import create_transport +from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService +from pipecat.services.google.gemini_live.llm_vertex import GeminiLiveVertexLLMService +from pipecat.services.llm_service import FunctionCallParams +from pipecat.transports.base_transport import BaseTransport, TransportParams +from pipecat.transports.daily.transport import DailyParams +from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams + +load_dotenv(override=True) + + +async def fetch_weather_from_api(params: FunctionCallParams): + temperature = 75 if params.arguments["format"] == "fahrenheit" else 24 + await params.result_callback( + { + "conditions": "nice", + "temperature": temperature, + "format": params.arguments["format"], + "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"), + } + ) + + +async def fetch_restaurant_recommendation(params: FunctionCallParams): + await params.result_callback({"name": "The Golden Dragon"}) + + +system_instruction = """ +You are a helpful assistant who can answer questions and use tools. + +You have two tools available to you: +1. get_current_weather: Use this tool to get the current weather in a specific location. +2. get_restaurant_recommendation: Use this tool to get a restaurant recommendation in a specific location. +""" + + +# We store functions so objects (e.g. SileroVADAnalyzer) don't get +# instantiated. The function will be called when the desired transport gets +# selected. +transport_params = { + "daily": lambda: DailyParams( + audio_in_enabled=True, + audio_out_enabled=True, + # set stop_secs to something roughly similar to the internal setting + # of the Multimodal Live api, just to align events. This doesn't really + # matter because we can only use the Multimodal Live API's phrase + # endpointing, for now. 
+ vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)), + ), + "twilio": lambda: FastAPIWebsocketParams( + audio_in_enabled=True, + audio_out_enabled=True, + # set stop_secs to something roughly similar to the internal setting + # of the Multimodal Live api, just to align events. This doesn't really + # matter because we can only use the Multimodal Live API's phrase + # endpointing, for now. + vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)), + ), + "webrtc": lambda: TransportParams( + audio_in_enabled=True, + audio_out_enabled=True, + # set stop_secs to something roughly similar to the internal setting + # of the Multimodal Live api, just to align events. This doesn't really + # matter because we can only use the Multimodal Live API's phrase + # endpointing, for now. + vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)), + ), +} + + +async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): + logger.info(f"Starting bot") + + weather_function = FunctionSchema( + name="get_current_weather", + description="Get the current weather", + properties={ + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA", + }, + "format": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "The temperature unit to use. Infer this from the user's location.", + }, + }, + required=["location", "format"], + ) + restaurant_function = FunctionSchema( + name="get_restaurant_recommendation", + description="Get a restaurant recommendation", + properties={ + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA", + }, + }, + required=["location"], + ) + # KNOWN ISSUE: If using GeminiLiveVertexLLMService, it appears + # you cannot use the "google_search" tool alongside other tools. + # See https://github.com/googleapis/python-genai/issues/941. 
+ tools = ToolsSchema(standard_tools=[weather_function, restaurant_function]) + + llm = GeminiLiveVertexLLMService( + credentials=os.getenv("GOOGLE_VERTEX_TEST_CREDENTIALS"), + project_id=os.getenv("GOOGLE_CLOUD_PROJECT_ID"), + location=os.getenv("GOOGLE_CLOUD_LOCATION"), + system_instruction=system_instruction, + voice_id="Puck", # Aoede, Charon, Fenrir, Kore, Puck + tools=tools, + ) + + llm.register_function("get_current_weather", fetch_weather_from_api) + llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation) + + context = OpenAILLMContext( + [{"role": "user", "content": "Say hello."}], + ) + context_aggregator = llm.create_context_aggregator(context) + + pipeline = Pipeline( + [ + transport.input(), + context_aggregator.user(), + llm, + transport.output(), + context_aggregator.assistant(), + ] + ) + + task = PipelineTask( + pipeline, + params=PipelineParams( + enable_metrics=True, + enable_usage_metrics=True, + ), + idle_timeout_secs=runner_args.pipeline_idle_timeout_secs, + ) + + @transport.event_handler("on_client_connected") + async def on_client_connected(transport, client): + logger.info(f"Client connected") + # Kick off the conversation. 
+ await task.queue_frames([LLMRunFrame()]) + + @transport.event_handler("on_client_disconnected") + async def on_client_disconnected(transport, client): + logger.info(f"Client disconnected") + await task.cancel() + + runner = PipelineRunner(handle_sigint=runner_args.handle_sigint) + + await runner.run(task) + + +async def bot(runner_args: RunnerArguments): + """Main bot entry point compatible with Pipecat Cloud.""" + transport = await create_transport(runner_args, transport_params) + await run_bot(transport, runner_args) + + +if __name__ == "__main__": + from pipecat.runner.run import main + + main() diff --git a/examples/foundational/26h-gemini-multimodal-live-vertex.py b/examples/foundational/26h-gemini-multimodal-live-vertex.py deleted file mode 100644 index 79bfac531..000000000 --- a/examples/foundational/26h-gemini-multimodal-live-vertex.py +++ /dev/null @@ -1,133 +0,0 @@ -# -# Copyright (c) 2024–2025, Daily -# -# SPDX-License-Identifier: BSD 2-Clause License -# - -import os - -from dotenv import load_dotenv -from loguru import logger - -from pipecat.audio.vad.silero import SileroVADAnalyzer -from pipecat.audio.vad.vad_analyzer import VADParams -from pipecat.frames.frames import LLMMessagesAppendFrame -from pipecat.pipeline.pipeline import Pipeline -from pipecat.pipeline.runner import PipelineRunner -from pipecat.pipeline.task import PipelineParams, PipelineTask -from pipecat.runner.types import RunnerArguments -from pipecat.runner.utils import create_transport -from pipecat.services.google.gemini_live.llm_vertex import GeminiLiveVertexLLMService -from pipecat.transports.base_transport import BaseTransport, TransportParams -from pipecat.transports.daily.transport import DailyParams -from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams - -# Load environment variables -load_dotenv(override=True) - - -# We store functions so objects (e.g. SileroVADAnalyzer) don't get -# instantiated. 
The function will be called when the desired transport gets -# selected. -transport_params = { - "daily": lambda: DailyParams( - audio_in_enabled=True, - audio_out_enabled=True, - # set stop_secs to something roughly similar to the internal setting - # of the Multimodal Live api, just to align events. - vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)), - ), - "twilio": lambda: FastAPIWebsocketParams( - audio_in_enabled=True, - audio_out_enabled=True, - # set stop_secs to something roughly similar to the internal setting - # of the Multimodal Live api, just to align events. - vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)), - ), - "webrtc": lambda: TransportParams( - audio_in_enabled=True, - audio_out_enabled=True, - # set stop_secs to something roughly similar to the internal setting - # of the Multimodal Live api, just to align events. - vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)), - ), -} - - -async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): - logger.info(f"Starting bot") - - # Create the Gemini Vertex Multimodal Live LLM service - system_instruction = f""" - You are a helpful AI assistant. - Your goal is to demonstrate your capabilities in a helpful and engaging way. - Your output will be converted to audio so don't include special characters in your answers. - Respond to what the user said in a creative and helpful way. 
- """ - - llm = GeminiLiveVertexLLMService( - credentials=os.getenv("GOOGLE_VERTEX_TEST_CREDENTIALS"), - project_id=os.getenv("GOOGLE_CLOUD_PROJECT_ID"), - location=os.getenv("GOOGLE_CLOUD_LOCATION"), - system_instruction=system_instruction, - voice_id="Puck", # Aoede, Charon, Fenrir, Kore, Puck - ) - - # Build the pipeline - pipeline = Pipeline( - [ - transport.input(), - llm, - transport.output(), - ] - ) - - # Configure the pipeline task - task = PipelineTask( - pipeline, - params=PipelineParams( - enable_metrics=True, - enable_usage_metrics=True, - ), - idle_timeout_secs=runner_args.pipeline_idle_timeout_secs, - ) - - # Handle client connection event - @transport.event_handler("on_client_connected") - async def on_client_connected(transport, client): - logger.info(f"Client connected") - # Kick off the conversation. - await task.queue_frames( - [ - LLMMessagesAppendFrame( - messages=[ - { - "role": "user", - "content": f"Greet the user and introduce yourself.", - } - ] - ) - ] - ) - - # Handle client disconnection events - @transport.event_handler("on_client_disconnected") - async def on_client_disconnected(transport, client): - logger.info(f"Client disconnected") - await task.cancel() - - # Run the pipeline - runner = PipelineRunner(handle_sigint=runner_args.handle_sigint) - await runner.run(task) - - -async def bot(runner_args: RunnerArguments): - """Main bot entry point compatible with Pipecat Cloud.""" - transport = await create_transport(runner_args, transport_params) - await run_bot(transport, runner_args) - - -if __name__ == "__main__": - from pipecat.runner.run import main - - main() diff --git a/examples/foundational/26i-gemini-multimodal-live-graceful-end.py b/examples/foundational/26i-gemini-live-graceful-end.py similarity index 100% rename from examples/foundational/26i-gemini-multimodal-live-graceful-end.py rename to examples/foundational/26i-gemini-live-graceful-end.py diff --git a/examples/foundational/README.md b/examples/foundational/README.md 
index 9a6c26005..c1ead3ece 100644 --- a/examples/foundational/README.md +++ b/examples/foundational/README.md @@ -105,7 +105,7 @@ uv run 07-interruptible.py -t twilio -x NGROK_HOST_NAME ### Vision & Multimodal - **[12a-describe-video-gemini-flash.py](./12a-describe-video-gemini-flash.py)**: Bot describes user's video (Video input, Multimodal LLMs) -- **[26c-gemini-multimodal-live-video.py](./26c-gemini-multimodal-live-video.py)**: Gemini with video input (Streaming video, Function calls) +- **[26c-gemini-live-video.py](./26c-gemini-live-video.py)**: Gemini with video input (Streaming video, Function calls) ### Voice & Language diff --git a/scripts/evals/run-release-evals.py b/scripts/evals/run-release-evals.py index 206cf848e..44cb69aed 100644 --- a/scripts/evals/run-release-evals.py +++ b/scripts/evals/run-release-evals.py @@ -147,7 +147,10 @@ TESTS_15 = [ ] TESTS_19 = [ + ("19-openai-realtime.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), ("19-openai-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), + # OpenAI Realtime not released on Azure yet + # ("19a-azure-realtime.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), ("19a-azure-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), ("19b-openai-realtime-text.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), ("19b-openai-realtime-beta-text.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST), @@ -160,18 +163,18 @@ TESTS_21 = [ TESTS_26 = [ - ("26-gemini-multimodal-live.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("26-gemini-live.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), ( - "26a-gemini-multimodal-live-transcription.py", + "26a-gemini-live-transcription.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST, ), ( - "26b-gemini-multimodal-live-function-calling.py", + "26b-gemini-live-function-calling.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST, ), - ("26c-gemini-multimodal-live-video.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ("26c-gemini-live-video.py", PROMPT_SIMPLE_MATH, 
EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), ( - "26e-gemini-multimodal-google-search.py", + "26e-gemini-live-google-search.py", PROMPT_ONLINE_SEARCH, @@ -179,7 +182,13 @@ TESTS_26 = [ BOT_SPEAKS_FIRST, ), # Currently not working. - # ("26d-gemini-multimodal-live-text.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + # ("26d-gemini-live-text.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST), + ( + "26h-gemini-live-vertex-function-calling.py", + PROMPT_WEATHER, + EVAL_WEATHER, + BOT_SPEAKS_FIRST, + ), ] TESTS_27 = [