Merge pull request #2792 from pipecat-ai/aleix/google-nano-banana

GoogleLLMService: added support for image generation
This commit is contained in:
Aleix Conchillo Flaqué
2025-10-06 22:42:14 -07:00
committed by GitHub
5 changed files with 186 additions and 14 deletions

View File

@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- Added support for Nano Banana models to `GoogleLLMService`. For example, you
can now use the `gemini-2.5-flash-image` model to generate images.
- `PermissionError` is now caught if NLTK's `punkt_tab` can't be downloaded.
- Added `HumeTTSService` for text-to-speech synthesis using Hume AI's
@@ -18,6 +21,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added `hume` optional dependency group for Hume AI TTS integration.
### Changed
- Updated default `GoogleLLMService` model to `gemini-2.5-flash`.
### Fixed
- Fixed RTVI incoming message handling, broken in 0.0.87.

View File

@@ -0,0 +1,151 @@
#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
"""
A conversational AI bot using Gemini as the LLM, with Google STT and TTS.
This example demonstrates how to use Gemini's image generation capabilities.
Features showcased:
- Gemini LLM for conversation and image generation
- Google TTS and STT
Run with:
python examples/foundational/07n-interruptible-gemini-image.py
Make sure to set your environment variables:
export GOOGLE_API_KEY=your_api_key_here
export GOOGLE_TEST_CREDENTIALS=your_google_cloud_credentials_here
"""
import os
from dotenv import load_dotenv
from loguru import logger
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.frames.frames import LLMRunFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.google.llm import GoogleLLMService
from pipecat.services.google.stt import GoogleSTTService
from pipecat.services.google.tts import GoogleTTSService
from pipecat.transcriptions.language import Language
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
load_dotenv(override=True)
def _common_transport_kwargs():
    """Build the settings shared by every transport in this example.

    Invoked from the factories below, so the VAD and turn analyzers are
    created fresh on each call and only once a transport is requested.
    """
    return {
        "audio_in_enabled": True,
        "audio_out_enabled": True,
        "video_out_enabled": True,
        "video_out_width": 1024,
        "video_out_height": 1024,
        "vad_analyzer": SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
        "turn_analyzer": LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
    }


# Each value is a zero-argument factory rather than a ready-made params
# object, so nothing (e.g. SileroVADAnalyzer) is instantiated up front. The
# factory is called when the desired transport gets selected.
transport_params = {
    "daily": lambda: DailyParams(**_common_transport_kwargs()),
    "webrtc": lambda: TransportParams(**_common_transport_kwargs()),
}
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    """Build and run the Gemini image-generation voice pipeline.

    Wires Google STT, the ``gemini-2.5-flash-image`` LLM and Google TTS
    between the transport's input and output, registers the client
    connect/disconnect handlers, and then awaits the pipeline runner.

    Args:
        transport: Transport providing the pipeline's input and output
            processors and the client connection events.
        runner_args: Runner settings; supplies the pipeline idle timeout and
            whether the runner should handle SIGINT.
    """
    logger.info("Starting bot")

    # STT and TTS read their credentials from GOOGLE_TEST_CREDENTIALS; the LLM
    # uses GOOGLE_API_KEY instead.
    stt = GoogleSTTService(
        params=GoogleSTTService.InputParams(languages=Language.EN_US),
        credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
    )

    tts = GoogleTTSService(
        voice_id="en-US-Chirp3-HD-Charon",
        params=GoogleTTSService.InputParams(language=Language.EN_US),
        credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
    )

    llm = GoogleLLMService(
        api_key=os.getenv("GOOGLE_API_KEY"),
        model="gemini-2.5-flash-image",
    )

    messages = [
        {
            "role": "system",
            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
        },
    ]

    context = LLMContext(messages)
    context_aggregator = LLMContextAggregatorPair(context)

    pipeline = Pipeline(
        [
            transport.input(),  # Transport user input
            stt,  # Google STT
            context_aggregator.user(),  # User responses
            llm,  # Gemini LLM (text and image generation)
            tts,  # Google TTS
            transport.output(),  # Transport bot output
            context_aggregator.assistant(),  # Assistant spoken responses
        ]
    )

    task = PipelineTask(
        pipeline,
        params=PipelineParams(
            enable_metrics=True,
            enable_usage_metrics=True,
        ),
        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
    )

    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        logger.info("Client connected")
        # Kick off the conversation with an introduction.
        # NOTE(review): `messages` is the list handed to LLMContext above; this
        # relies on the context keeping a reference to it rather than a copy.
        messages.append({"role": "system", "content": "Please introduce yourself to the user."})
        await task.queue_frames([LLMRunFrame()])

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(transport, client):
        logger.info("Client disconnected")
        # Cancel the pipeline task once the client has gone.
        await task.cancel()

    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)

    await runner.run(task)
async def bot(runner_args: RunnerArguments):
    """Bot entry point, compatible with Pipecat Cloud.

    Creates the transport for ``runner_args`` from ``transport_params`` and
    passes it on to ``run_bot``.
    """
    await run_bot(await create_transport(runner_args, transport_params), runner_args)
if __name__ == "__main__":
    # Imported here so the runner module is only loaded when this file is
    # executed directly as a script.
    from pipecat.runner.run import main as run_main

    # Hand control to the entry point exposed by pipecat.runner.run.
    run_main()

View File

@@ -62,7 +62,7 @@ fal = [ "fal-client~=0.5.9" ]
fireworks = []
fish = [ "ormsgpack~=1.7.0", "pipecat-ai[websockets-base]" ]
gladia = [ "pipecat-ai[websockets-base]" ]
google = [ "google-cloud-speech~=2.32.0", "google-cloud-texttospeech~=2.26.0", "google-genai~=1.24.0", "pipecat-ai[websockets-base]" ]
google = [ "google-cloud-speech>=2.33.0,<3", "google-cloud-texttospeech>=2.31.0,<3", "google-genai>=1.41.0,<2", "pipecat-ai[websockets-base]" ]
grok = []
groq = [ "groq~=0.23.0" ]
gstreamer = [ "pygobject~=3.50.0" ]

View File

@@ -35,6 +35,7 @@ from pipecat.frames.frames import (
LLMMessagesFrame,
LLMTextFrame,
LLMUpdateSettingsFrame,
OutputImageRawFrame,
UserImageRawFrame,
)
from pipecat.metrics.metrics import LLMTokenUsage
@@ -72,6 +73,9 @@ try:
HttpOptions,
Part,
)
# Temporary hack to be able to process Nano Banana returned images.
genai._api_client.READ_BUFFER_SIZE = 5 * 1024 * 1024
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error("In order to use Google AI, you need to `pip install pipecat-ai[google]`.")
@@ -682,7 +686,7 @@ class GoogleLLMService(LLMService):
self,
*,
api_key: str,
model: str = "gemini-2.0-flash",
model: str = "gemini-2.5-flash",
params: Optional[InputParams] = None,
system_instruction: Optional[str] = None,
tools: Optional[List[Dict[str, Any]]] = None,
@@ -710,6 +714,7 @@ class GoogleLLMService(LLMService):
self._api_key = api_key
self._system_instruction = system_instruction
self._http_options = http_options
self._create_client(api_key, http_options)
self._settings = {
"max_tokens": params.max_tokens,
@@ -788,6 +793,9 @@ class GoogleLLMService(LLMService):
# and can be configured to turn it off.
if not self._model_name.startswith("gemini-2.5-flash"):
return
# If we have an image model, we don't use a budget either.
if "image" in self._model_name:
return
# If thinking_config is already set, don't override it.
if "thinking_config" in generation_params:
return
@@ -927,6 +935,12 @@ class GoogleLLMService(LLMService):
arguments=function_call.args or {},
)
)
elif part.inline_data and part.inline_data.data:
image = Image.open(io.BytesIO(part.inline_data.data))
frame = OutputImageRawFrame(
image=image.tobytes(), size=image.size, format="RGB"
)
await self.push_frame(frame)
if (
candidate.grounding_metadata

24
uv.lock generated
View File

@@ -1809,7 +1809,7 @@ wheels = [
[[package]]
name = "google-cloud-speech"
version = "2.32.0"
version = "2.33.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "google-api-core", extra = ["grpc"] },
@@ -1817,14 +1817,14 @@ dependencies = [
{ name = "proto-plus" },
{ name = "protobuf" },
]
sdist = { url = "https://files.pythonhosted.org/packages/dc/fc/7e47328069850f084ee17e26b5572de067e30fdab862e381702222d237b7/google_cloud_speech-2.32.0.tar.gz", hash = "sha256:89c2618b131d310c6c00e7c04d290ffa9a5d68c20191030766a7737850f04e77", size = 387621, upload-time = "2025-04-14T10:16:35.386Z" }
sdist = { url = "https://files.pythonhosted.org/packages/9a/74/9c5a556f8af19cab461058aa15e1409e7afa453ca2383473a24a12801ef7/google_cloud_speech-2.33.0.tar.gz", hash = "sha256:fd08511b5124fdaa768d71a4054e84a5d8eb02531cb6f84f311c0387ea1314ed", size = 389072, upload-time = "2025-06-11T23:56:37.231Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/85/a4/f41f2737cd0597f2aa5855b0a12f353fad4506868887590671230df81c77/google_cloud_speech-2.32.0-py3-none-any.whl", hash = "sha256:537b279d8697fe5b5bc5f485f2d48a6b343fc76f73385b5776806c37bc5f8ea1", size = 334148, upload-time = "2025-04-14T10:16:33.89Z" },
{ url = "https://files.pythonhosted.org/packages/12/1d/880342b2541b4bad888ad8ab2ac77d4b5dad25b32a2a1c5f21140c14c8e3/google_cloud_speech-2.33.0-py3-none-any.whl", hash = "sha256:4ba16c8517c24a6abcde877289b0f40b719090504bf06b1adea248198ccd50a5", size = 335681, upload-time = "2025-06-11T23:56:36.026Z" },
]
[[package]]
name = "google-cloud-texttospeech"
version = "2.26.0"
version = "2.31.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "google-api-core", extra = ["grpc"] },
@@ -1832,9 +1832,9 @@ dependencies = [
{ name = "proto-plus" },
{ name = "protobuf" },
]
sdist = { url = "https://files.pythonhosted.org/packages/5b/3d/214506e1163138159a3ba172adc0945970843ce9a8c5332db06772806dff/google_cloud_texttospeech-2.26.0.tar.gz", hash = "sha256:43af1b88a6b9becde69a3bbf8aa80cdfa5f12f8999e56bcf9dec374354ed7f6a", size = 181084, upload-time = "2025-04-14T10:16:39.737Z" }
sdist = { url = "https://files.pythonhosted.org/packages/ec/4b/7ccadbec28ee255a3176c3de0a14705c4b6469777f1c7ddbf4452fa893e3/google_cloud_texttospeech-2.31.0.tar.gz", hash = "sha256:1f0c0c6448f175e1e2f63d96fb13af5d9abee6970bbb22c1e4036f53136a5588", size = 184880, upload-time = "2025-09-25T14:03:22.786Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/f0/eb/fb3a2c16f5612c4a131b2bfa242aaf7800ec0cee479759d9de2cc919ba70/google_cloud_texttospeech-2.26.0-py3-none-any.whl", hash = "sha256:837835aadeb261983d139ef1c5e60c99f80199e22330bf4f62e217360b9e07b8", size = 188122, upload-time = "2025-04-14T10:16:38.466Z" },
{ url = "https://files.pythonhosted.org/packages/18/3e/54ff1a5af26f90c5d76e7e80b9208f8484035b5bd8fb6a06c819fed6a8c9/google_cloud_texttospeech-2.31.0-py3-none-any.whl", hash = "sha256:9442134b4b8e7e3d179dfd3850a5a953a6a6a9cf000a3640caddb85cf97ab69b", size = 191280, upload-time = "2025-09-25T14:03:16.667Z" },
]
[[package]]
@@ -1874,7 +1874,7 @@ wheels = [
[[package]]
name = "google-genai"
version = "1.24.0"
version = "1.41.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "anyio" },
@@ -1886,9 +1886,9 @@ dependencies = [
{ name = "typing-extensions" },
{ name = "websockets" },
]
sdist = { url = "https://files.pythonhosted.org/packages/8d/cf/37ac8cd4752e28e547b8a52765fe48a2ada2d0d286ea03f46e4d8c69ff4f/google_genai-1.24.0.tar.gz", hash = "sha256:bc896e30ad26d05a2af3d17c2ba10ea214a94f1c0cdb93d5c004dc038774e75a", size = 226740, upload-time = "2025-07-01T22:14:24.365Z" }
sdist = { url = "https://files.pythonhosted.org/packages/72/8b/ee20bcf707769b3b0e1106c3b5c811507736af7e8a60f29a70af1750ba19/google_genai-1.41.0.tar.gz", hash = "sha256:134f861bb0ace4e34af0501ecb75ceee15f7662fd8120698cd185e8cb39f2800", size = 245812, upload-time = "2025-10-02T22:30:29.699Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/30/28/a35f64fc02e599808101617a21d447d241dadeba2aac1f4dc2d1179b8218/google_genai-1.24.0-py3-none-any.whl", hash = "sha256:98be8c51632576289ecc33cd84bcdaf4356ef0bef04ac7578660c49175af22b9", size = 226065, upload-time = "2025-07-01T22:14:23.177Z" },
{ url = "https://files.pythonhosted.org/packages/15/14/e5e8fbca8863fee718208566c4e927b8e9f45fd46ec5cf89e24759da545b/google_genai-1.41.0-py3-none-any.whl", hash = "sha256:111a3ee64c1a0927d3879faddb368234594432479a40c311e5fe4db338ca8778", size = 245931, upload-time = "2025-10-02T22:30:27.885Z" },
]
[[package]]
@@ -4558,9 +4558,9 @@ requires-dist = [
{ name = "fastapi", marker = "extra == 'runner'", specifier = ">=0.115.6,<0.117.0" },
{ name = "fastapi", marker = "extra == 'websocket'", specifier = ">=0.115.6,<0.117.0" },
{ name = "faster-whisper", marker = "extra == 'whisper'", specifier = "~=1.1.1" },
{ name = "google-cloud-speech", marker = "extra == 'google'", specifier = "~=2.32.0" },
{ name = "google-cloud-texttospeech", marker = "extra == 'google'", specifier = "~=2.26.0" },
{ name = "google-genai", marker = "extra == 'google'", specifier = "~=1.24.0" },
{ name = "google-cloud-speech", marker = "extra == 'google'", specifier = ">=2.33.0,<3" },
{ name = "google-cloud-texttospeech", marker = "extra == 'google'", specifier = ">=2.31.0,<3" },
{ name = "google-genai", marker = "extra == 'google'", specifier = ">=1.41.0,<2" },
{ name = "groq", marker = "extra == 'groq'", specifier = "~=0.23.0" },
{ name = "hume", marker = "extra == 'hume'", specifier = ">=0.11.2" },
{ name = "langchain", marker = "extra == 'langchain'", specifier = "~=0.3.20" },