diff --git a/CHANGELOG.md b/CHANGELOG.md index ae7e3dcb1..3d585698b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added support for Nano Banana models to `GoogleLLMService`. For example, you + can now use the `gemini-2.5-flash-image` model to generate images. + - `PermissionError` is now caught if NLTK's `punkt_tab` can't be downloaded. - Added `HumeTTSService` for text-to-speech synthesis using Hume AI's @@ -18,6 +21,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `hume` optional dependency group for Hume AI TTS integration. +### Changed + +- Updated default `GoogleLLMService` model to `gemini-2.5-flash`. + ### Fixed - Fixed RTVI incoming message handling, broken in 0.0.87. diff --git a/examples/foundational/07n-interruptible-gemini-image.py b/examples/foundational/07n-interruptible-gemini-image.py new file mode 100644 index 000000000..61b8e650a --- /dev/null +++ b/examples/foundational/07n-interruptible-gemini-image.py @@ -0,0 +1,151 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +""" +A conversational AI bot using Gemini for the LLM and Google services for STT and TTS. + +This example demonstrates how to use Gemini's image generation capabilities. 
+ +Features showcased: +- Gemini LLM for conversation and image generation +- Google TTS and STT + +Run with: + python examples/foundational/07n-interruptible-gemini-image.py + +Make sure to set your environment variables: + export GOOGLE_API_KEY=your_api_key_here +""" + +import os + +from dotenv import load_dotenv +from loguru import logger + +from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams +from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3 +from pipecat.audio.vad.silero import SileroVADAnalyzer +from pipecat.audio.vad.vad_analyzer import VADParams +from pipecat.frames.frames import LLMRunFrame +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.processors.aggregators.llm_context import LLMContext +from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair +from pipecat.runner.types import RunnerArguments +from pipecat.runner.utils import create_transport +from pipecat.services.google.llm import GoogleLLMService +from pipecat.services.google.stt import GoogleSTTService +from pipecat.services.google.tts import GoogleTTSService +from pipecat.transcriptions.language import Language +from pipecat.transports.base_transport import BaseTransport, TransportParams +from pipecat.transports.daily.transport import DailyParams +from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams + +load_dotenv(override=True) + +# We store functions so objects (e.g. SileroVADAnalyzer) don't get +# instantiated. The function will be called when the desired transport gets +# selected. 
+transport_params = { + "daily": lambda: DailyParams( + audio_in_enabled=True, + audio_out_enabled=True, + video_out_enabled=True, + video_out_width=1024, + video_out_height=1024, + vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)), + turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()), + ), + "webrtc": lambda: TransportParams( + audio_in_enabled=True, + audio_out_enabled=True, + video_out_enabled=True, + video_out_width=1024, + video_out_height=1024, + vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)), + turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()), + ), +} + + +async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): + logger.info(f"Starting bot") + + stt = GoogleSTTService( + params=GoogleSTTService.InputParams(languages=Language.EN_US), + credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"), + ) + + tts = GoogleTTSService( + voice_id="en-US-Chirp3-HD-Charon", + params=GoogleTTSService.InputParams(language=Language.EN_US), + credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"), + ) + + llm = GoogleLLMService( + api_key=os.getenv("GOOGLE_API_KEY"), + model="gemini-2.5-flash-image", + ) + + messages = [ + { + "role": "system", + "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. 
Respond to what the user said in a creative and helpful way.", + }, + ] + + context = LLMContext(messages) + context_aggregator = LLMContextAggregatorPair(context) + + pipeline = Pipeline( + [ + transport.input(), # Transport user input + stt, # STT + context_aggregator.user(), # User responses + llm, # LLM + tts, # Gemini TTS + transport.output(), # Transport bot output + context_aggregator.assistant(), # Assistant spoken responses + ] + ) + + task = PipelineTask( + pipeline, + params=PipelineParams( + enable_metrics=True, + enable_usage_metrics=True, + ), + idle_timeout_secs=runner_args.pipeline_idle_timeout_secs, + ) + + @transport.event_handler("on_client_connected") + async def on_client_connected(transport, client): + logger.info(f"Client connected") + # Kick off the conversation with a styled introduction + messages.append({"role": "system", "content": "Please introduce yourself to the user."}) + await task.queue_frames([LLMRunFrame()]) + + @transport.event_handler("on_client_disconnected") + async def on_client_disconnected(transport, client): + logger.info(f"Client disconnected") + await task.cancel() + + runner = PipelineRunner(handle_sigint=runner_args.handle_sigint) + + await runner.run(task) + + +async def bot(runner_args: RunnerArguments): + """Main bot entry point compatible with Pipecat Cloud.""" + transport = await create_transport(runner_args, transport_params) + await run_bot(transport, runner_args) + + +if __name__ == "__main__": + from pipecat.runner.run import main + + main() diff --git a/pyproject.toml b/pyproject.toml index 0adbf281e..375ca5245 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,7 +62,7 @@ fal = [ "fal-client~=0.5.9" ] fireworks = [] fish = [ "ormsgpack~=1.7.0", "pipecat-ai[websockets-base]" ] gladia = [ "pipecat-ai[websockets-base]" ] -google = [ "google-cloud-speech~=2.32.0", "google-cloud-texttospeech~=2.26.0", "google-genai~=1.24.0", "pipecat-ai[websockets-base]" ] +google = [ "google-cloud-speech>=2.33.0,<3", 
"google-cloud-texttospeech>=2.31.0,<3", "google-genai>=1.41.0,<2", "pipecat-ai[websockets-base]" ] grok = [] groq = [ "groq~=0.23.0" ] gstreamer = [ "pygobject~=3.50.0" ] diff --git a/src/pipecat/services/google/llm.py b/src/pipecat/services/google/llm.py index 70d4ca2bf..b45c276d0 100644 --- a/src/pipecat/services/google/llm.py +++ b/src/pipecat/services/google/llm.py @@ -35,6 +35,7 @@ from pipecat.frames.frames import ( LLMMessagesFrame, LLMTextFrame, LLMUpdateSettingsFrame, + OutputImageRawFrame, UserImageRawFrame, ) from pipecat.metrics.metrics import LLMTokenUsage @@ -72,6 +73,9 @@ try: HttpOptions, Part, ) + + # Temporary hack to be able to process Nano Banana returned images. + genai._api_client.READ_BUFFER_SIZE = 5 * 1024 * 1024 except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error("In order to use Google AI, you need to `pip install pipecat-ai[google]`.") @@ -682,7 +686,7 @@ class GoogleLLMService(LLMService): self, *, api_key: str, - model: str = "gemini-2.0-flash", + model: str = "gemini-2.5-flash", params: Optional[InputParams] = None, system_instruction: Optional[str] = None, tools: Optional[List[Dict[str, Any]]] = None, @@ -710,6 +714,7 @@ class GoogleLLMService(LLMService): self._api_key = api_key self._system_instruction = system_instruction self._http_options = http_options + self._create_client(api_key, http_options) self._settings = { "max_tokens": params.max_tokens, @@ -788,6 +793,9 @@ class GoogleLLMService(LLMService): # and can be configured to turn it off. if not self._model_name.startswith("gemini-2.5-flash"): return + # If we have an image model, we don't use a budget either. + if "image" in self._model_name: + return # If thinking_config is already set, don't override it. 
if "thinking_config" in generation_params: return @@ -927,6 +935,12 @@ class GoogleLLMService(LLMService): arguments=function_call.args or {}, ) ) + elif part.inline_data and part.inline_data.data: + image = Image.open(io.BytesIO(part.inline_data.data)) + frame = OutputImageRawFrame( + image=image.tobytes(), size=image.size, format="RGB" + ) + await self.push_frame(frame) if ( candidate.grounding_metadata diff --git a/uv.lock b/uv.lock index bef2f3a60..8e4b59009 100644 --- a/uv.lock +++ b/uv.lock @@ -1809,7 +1809,7 @@ wheels = [ [[package]] name = "google-cloud-speech" -version = "2.32.0" +version = "2.33.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "google-api-core", extra = ["grpc"] }, @@ -1817,14 +1817,14 @@ dependencies = [ { name = "proto-plus" }, { name = "protobuf" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/dc/fc/7e47328069850f084ee17e26b5572de067e30fdab862e381702222d237b7/google_cloud_speech-2.32.0.tar.gz", hash = "sha256:89c2618b131d310c6c00e7c04d290ffa9a5d68c20191030766a7737850f04e77", size = 387621, upload-time = "2025-04-14T10:16:35.386Z" } +sdist = { url = "https://files.pythonhosted.org/packages/9a/74/9c5a556f8af19cab461058aa15e1409e7afa453ca2383473a24a12801ef7/google_cloud_speech-2.33.0.tar.gz", hash = "sha256:fd08511b5124fdaa768d71a4054e84a5d8eb02531cb6f84f311c0387ea1314ed", size = 389072, upload-time = "2025-06-11T23:56:37.231Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/85/a4/f41f2737cd0597f2aa5855b0a12f353fad4506868887590671230df81c77/google_cloud_speech-2.32.0-py3-none-any.whl", hash = "sha256:537b279d8697fe5b5bc5f485f2d48a6b343fc76f73385b5776806c37bc5f8ea1", size = 334148, upload-time = "2025-04-14T10:16:33.89Z" }, + { url = "https://files.pythonhosted.org/packages/12/1d/880342b2541b4bad888ad8ab2ac77d4b5dad25b32a2a1c5f21140c14c8e3/google_cloud_speech-2.33.0-py3-none-any.whl", hash = "sha256:4ba16c8517c24a6abcde877289b0f40b719090504bf06b1adea248198ccd50a5", size = 335681, 
upload-time = "2025-06-11T23:56:36.026Z" }, ] [[package]] name = "google-cloud-texttospeech" -version = "2.26.0" +version = "2.31.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "google-api-core", extra = ["grpc"] }, @@ -1832,9 +1832,9 @@ dependencies = [ { name = "proto-plus" }, { name = "protobuf" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/5b/3d/214506e1163138159a3ba172adc0945970843ce9a8c5332db06772806dff/google_cloud_texttospeech-2.26.0.tar.gz", hash = "sha256:43af1b88a6b9becde69a3bbf8aa80cdfa5f12f8999e56bcf9dec374354ed7f6a", size = 181084, upload-time = "2025-04-14T10:16:39.737Z" } +sdist = { url = "https://files.pythonhosted.org/packages/ec/4b/7ccadbec28ee255a3176c3de0a14705c4b6469777f1c7ddbf4452fa893e3/google_cloud_texttospeech-2.31.0.tar.gz", hash = "sha256:1f0c0c6448f175e1e2f63d96fb13af5d9abee6970bbb22c1e4036f53136a5588", size = 184880, upload-time = "2025-09-25T14:03:22.786Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/f0/eb/fb3a2c16f5612c4a131b2bfa242aaf7800ec0cee479759d9de2cc919ba70/google_cloud_texttospeech-2.26.0-py3-none-any.whl", hash = "sha256:837835aadeb261983d139ef1c5e60c99f80199e22330bf4f62e217360b9e07b8", size = 188122, upload-time = "2025-04-14T10:16:38.466Z" }, + { url = "https://files.pythonhosted.org/packages/18/3e/54ff1a5af26f90c5d76e7e80b9208f8484035b5bd8fb6a06c819fed6a8c9/google_cloud_texttospeech-2.31.0-py3-none-any.whl", hash = "sha256:9442134b4b8e7e3d179dfd3850a5a953a6a6a9cf000a3640caddb85cf97ab69b", size = 191280, upload-time = "2025-09-25T14:03:16.667Z" }, ] [[package]] @@ -1874,7 +1874,7 @@ wheels = [ [[package]] name = "google-genai" -version = "1.24.0" +version = "1.41.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -1886,9 +1886,9 @@ dependencies = [ { name = "typing-extensions" }, { name = "websockets" }, ] -sdist = { url = 
"https://files.pythonhosted.org/packages/8d/cf/37ac8cd4752e28e547b8a52765fe48a2ada2d0d286ea03f46e4d8c69ff4f/google_genai-1.24.0.tar.gz", hash = "sha256:bc896e30ad26d05a2af3d17c2ba10ea214a94f1c0cdb93d5c004dc038774e75a", size = 226740, upload-time = "2025-07-01T22:14:24.365Z" } +sdist = { url = "https://files.pythonhosted.org/packages/72/8b/ee20bcf707769b3b0e1106c3b5c811507736af7e8a60f29a70af1750ba19/google_genai-1.41.0.tar.gz", hash = "sha256:134f861bb0ace4e34af0501ecb75ceee15f7662fd8120698cd185e8cb39f2800", size = 245812, upload-time = "2025-10-02T22:30:29.699Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/30/28/a35f64fc02e599808101617a21d447d241dadeba2aac1f4dc2d1179b8218/google_genai-1.24.0-py3-none-any.whl", hash = "sha256:98be8c51632576289ecc33cd84bcdaf4356ef0bef04ac7578660c49175af22b9", size = 226065, upload-time = "2025-07-01T22:14:23.177Z" }, + { url = "https://files.pythonhosted.org/packages/15/14/e5e8fbca8863fee718208566c4e927b8e9f45fd46ec5cf89e24759da545b/google_genai-1.41.0-py3-none-any.whl", hash = "sha256:111a3ee64c1a0927d3879faddb368234594432479a40c311e5fe4db338ca8778", size = 245931, upload-time = "2025-10-02T22:30:27.885Z" }, ] [[package]] @@ -4558,9 +4558,9 @@ requires-dist = [ { name = "fastapi", marker = "extra == 'runner'", specifier = ">=0.115.6,<0.117.0" }, { name = "fastapi", marker = "extra == 'websocket'", specifier = ">=0.115.6,<0.117.0" }, { name = "faster-whisper", marker = "extra == 'whisper'", specifier = "~=1.1.1" }, - { name = "google-cloud-speech", marker = "extra == 'google'", specifier = "~=2.32.0" }, - { name = "google-cloud-texttospeech", marker = "extra == 'google'", specifier = "~=2.26.0" }, - { name = "google-genai", marker = "extra == 'google'", specifier = "~=1.24.0" }, + { name = "google-cloud-speech", marker = "extra == 'google'", specifier = ">=2.33.0,<3" }, + { name = "google-cloud-texttospeech", marker = "extra == 'google'", specifier = ">=2.31.0,<3" }, + { name = "google-genai", marker = "extra == 
'google'", specifier = ">=1.41.0,<2" }, { name = "groq", marker = "extra == 'groq'", specifier = "~=0.23.0" }, { name = "hume", marker = "extra == 'hume'", specifier = ">=0.11.2" }, { name = "langchain", marker = "extra == 'langchain'", specifier = "~=0.3.20" },