Warn when TEXT modality is set for Gemini Live, and remove 26d text example

All recent Gemini Live models (including the default
gemini-2.5-flash-native-audio-preview-12-2025, and going at least as
far back as gemini-2.5-flash-native-audio-preview-09-2025) only
support AUDIO as a response modality. We considered using
`modalities=TEXT` as a Pipecat-level signal to suppress audio output
frames (so developers could pair Gemini Live with an external TTS),
but the output transcription from the API arrives too late relative
to the audio to be useful for driving an external TTS service.

For now, just log a warning when a TEXT modality is configured
(at init or via set_model_modalities) and proceed as normal. The 26d
text-modality example is removed since it no longer represents a
viable configuration.
This commit is contained in:
Paul Kompfner
2026-03-27 16:15:27 -04:00
parent 4f9c8a6860
commit 04adb697be
2 changed files with 11 additions and 142 deletions

View File

@@ -1,142 +0,0 @@
#
# Copyright (c) 2024-2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import os
from dotenv import load_dotenv
from loguru import logger
from pipecat.frames.frames import LLMRunFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService, GeminiModalities
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
load_dotenv(override=True)
SYSTEM_INSTRUCTION = f"""
"You are Gemini Chatbot, a friendly, helpful robot.
Your goal is to demonstrate your capabilities in a succinct way.
Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points.
Respond to what the user said in a creative and helpful way. Keep your responses brief. One or two sentences at most.
"""
# We use lambdas to defer transport parameter creation until the transport
# type is selected at runtime.
transport_params = {
"daily": lambda: DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
),
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
),
"webrtc": lambda: TransportParams(
audio_in_enabled=True,
audio_out_enabled=True,
),
}
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")
# KNOWN ISSUE: If using GeminiLiveVertexLLMService, you cannot specify a
# modality other than AUDIO (at least not if using the service's default
# model, which is a native audio model:
# https://cloud.google.com/vertex-ai/generative-ai/docs/live-api/tools#native-audio).
llm = GeminiLiveLLMService(
api_key=os.getenv("GOOGLE_API_KEY"),
settings=GeminiLiveLLMService.Settings(
system_instruction=SYSTEM_INSTRUCTION,
modalities=GeminiModalities.TEXT,
),
tools=[{"google_search": {}}, {"code_execution": {}}],
)
# Optionally, you can set the response modalities via a function
# llm.set_model_modalities(
# GeminiMultimodalModalities.TEXT
# )
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"), voice_id="71a7ad14-091c-4e8e-a314-022ece01c121"
)
messages = [
{
"role": "developer",
"content": 'Start by saying "Hello, I\'m Gemini".',
},
]
# Set up conversation context and management
# The context_aggregator will automatically collect conversation context
context = LLMContext(messages)
# Server-side VAD is enabled by default; no local VAD is added.
user_aggregator, assistant_aggregator = LLMContextAggregatorPair(context)
pipeline = Pipeline(
[
transport.input(),
user_aggregator,
llm,
tts,
transport.output(),
assistant_aggregator,
]
)
task = PipelineTask(
pipeline,
params=PipelineParams(
enable_metrics=True,
enable_usage_metrics=True,
),
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
)
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation.
await task.queue_frames([LLMRunFrame()])
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Client disconnected")
await task.cancel()
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
await runner.run(task)
async def bot(runner_args: RunnerArguments):
"""Main bot entry point compatible with Pipecat Cloud."""
transport = await create_transport(runner_args, transport_params)
await run_bot(transport, runner_args)
if __name__ == "__main__":
from pipecat.runner.run import main
main()

View File

@@ -787,6 +787,13 @@ class GeminiLiveLLMService(LLMService):
if settings is not None:
default_settings.apply_update(settings)
# Warn if user requested TEXT modality
if default_settings.modalities == GeminiModalities.TEXT:
logger.warning(
f"Modality {default_settings.modalities.value!r} may not be supported by recent "
"Gemini Live models."
)
super().__init__(
base_url=base_url,
settings=default_settings,
@@ -920,6 +927,10 @@ class GeminiLiveLLMService(LLMService):
Args:
modalities: The modalities to use for responses.
"""
if modalities == GeminiModalities.TEXT:
logger.warning(
f"Modality {modalities.value!r} may not be supported by recent Gemini Live models."
)
self._settings.modalities = modalities
def set_language(self, language: Language):