Warn when TEXT modality is set for Gemini Live, and remove 26d text example
All recent Gemini Live models (including the default gemini-2.5-flash-native-audio-preview-12-2025, and going at least as far back as gemini-2.5-flash-native-audio-preview-09-2025) only support AUDIO as a response modality. We considered using `modalities=TEXT` as a Pipecat-level signal to suppress audio output frames (so developers could pair Gemini Live with an external TTS), but the output transcription from the API arrives too late relative to the audio to be useful for driving an external TTS service. For now, just log a warning when a TEXT modality is configured (at init or via set_model_modalities) and proceed as normal. The 26d text-modality example is removed since it no longer represents a viable configuration.
This commit is contained in:
@@ -1,142 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService, GeminiModalities
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# System prompt passed to the LLM service as its system instruction.
# A plain string literal is used: there are no placeholders to interpolate, so
# an f-string prefix is unnecessary. The prompt no longer begins with a stray,
# unmatched double-quote character.
SYSTEM_INSTRUCTION = """
You are Gemini Chatbot, a friendly, helpful robot.

Your goal is to demonstrate your capabilities in a succinct way.

Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points.

Respond to what the user said in a creative and helpful way. Keep your responses brief. One or two sentences at most.
"""
|
||||
|
||||
|
||||
# Each entry is a zero-argument factory rather than a ready-made params
# object, so the params are only constructed for the transport type that is
# selected at runtime.
transport_params = dict(
    daily=lambda: DailyParams(audio_in_enabled=True, audio_out_enabled=True),
    twilio=lambda: FastAPIWebsocketParams(audio_in_enabled=True, audio_out_enabled=True),
    webrtc=lambda: TransportParams(audio_in_enabled=True, audio_out_enabled=True),
)
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    """Build and run the Gemini Live (TEXT modality) + Cartesia TTS pipeline.

    Args:
        transport: Transport whose ``input()`` / ``output()`` processors bracket
            the pipeline and whose client connect/disconnect events drive the
            conversation lifecycle.
        runner_args: Runner arguments; ``pipeline_idle_timeout_secs`` and
            ``handle_sigint`` are read from it.
    """
    logger.info("Starting bot")

    # KNOWN ISSUE: If using GeminiLiveVertexLLMService, you cannot specify a
    # modality other than AUDIO (at least not if using the service's default
    # model, which is a native audio model:
    # https://cloud.google.com/vertex-ai/generative-ai/docs/live-api/tools#native-audio).
    #
    # NOTE: the TEXT modality requested below may not be supported by recent
    # Gemini Live models.
    llm = GeminiLiveLLMService(
        api_key=os.getenv("GOOGLE_API_KEY"),
        settings=GeminiLiveLLMService.Settings(
            system_instruction=SYSTEM_INSTRUCTION,
            modalities=GeminiModalities.TEXT,
        ),
        tools=[{"google_search": {}}, {"code_execution": {}}],
    )

    # Optionally, you can set the response modalities via a function:
    #   llm.set_model_modalities(GeminiModalities.TEXT)

    tts = CartesiaTTSService(
        api_key=os.getenv("CARTESIA_API_KEY"), voice_id="71a7ad14-091c-4e8e-a314-022ece01c121"
    )

    messages = [
        {
            "role": "developer",
            "content": 'Start by saying "Hello, I\'m Gemini".',
        },
    ]

    # Set up conversation context and management
    # The context_aggregator will automatically collect conversation context
    context = LLMContext(messages)
    # Server-side VAD is enabled by default; no local VAD is added.
    user_aggregator, assistant_aggregator = LLMContextAggregatorPair(context)

    # The external TTS service sits directly after the LLM in the pipeline.
    pipeline = Pipeline(
        [
            transport.input(),
            user_aggregator,
            llm,
            tts,
            transport.output(),
            assistant_aggregator,
        ]
    )

    task = PipelineTask(
        pipeline,
        params=PipelineParams(
            enable_metrics=True,
            enable_usage_metrics=True,
        ),
        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
    )

    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        logger.info("Client connected")
        # Kick off the conversation.
        await task.queue_frames([LLMRunFrame()])

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(transport, client):
        logger.info("Client disconnected")
        # Cancel the pipeline task once the client has disconnected.
        await task.cancel()

    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)

    await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
    """Pipecat Cloud compatible entry point: create the transport, then run the bot on it.

    Args:
        runner_args: Runner arguments passed both to ``create_transport`` (together
            with ``transport_params``) and on to ``run_bot``.
    """
    selected_transport = await create_transport(runner_args, transport_params)
    await run_bot(selected_transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Import the runner entry point only when this file is executed as a script.
    from pipecat.runner.run import main as run_main

    run_main()
|
||||
@@ -787,6 +787,13 @@ class GeminiLiveLLMService(LLMService):
|
||||
if settings is not None:
|
||||
default_settings.apply_update(settings)
|
||||
|
||||
# Warn if user requested TEXT modality
|
||||
if default_settings.modalities == GeminiModalities.TEXT:
|
||||
logger.warning(
|
||||
f"Modality {default_settings.modalities.value!r} may not be supported by recent "
|
||||
"Gemini Live models."
|
||||
)
|
||||
|
||||
super().__init__(
|
||||
base_url=base_url,
|
||||
settings=default_settings,
|
||||
@@ -920,6 +927,10 @@ class GeminiLiveLLMService(LLMService):
|
||||
Args:
|
||||
modalities: The modalities to use for responses.
|
||||
"""
|
||||
if modalities == GeminiModalities.TEXT:
|
||||
logger.warning(
|
||||
f"Modality {modalities.value!r} may not be supported by recent Gemini Live models."
|
||||
)
|
||||
self._settings.modalities = modalities
|
||||
|
||||
def set_language(self, language: Language):
|
||||
|
||||
Reference in New Issue
Block a user