Update GeminiTTSService for streaming, other Google TTS improvements

This commit is contained in:
Mark Backman
2025-11-12 09:43:52 -05:00
parent d823a3edec
commit edbf96b3c5
4 changed files with 580 additions and 273 deletions

View File

@@ -20,6 +20,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added Hindi support for Rime TTS services.
- Updated `GeminiTTSService` to use the Google Cloud Text-to-Speech streaming API
instead of the deprecated Gemini API. It now uses `credentials` /
`credentials_path` for authentication; the `api_key` parameter is deprecated.
Also added support for a `prompt` parameter for style instructions and for
expressive markup tags. Streaming synthesis significantly improves
latency.
- Updated the language mappings for the Google and Gemini TTS services to match
the official documentation.
### Deprecated
- The `api_key` parameter in `GeminiTTSService` is deprecated. Use
`credentials` or `credentials_path` instead for Google Cloud authentication.
### Fixed
- Fixed subtle issue of assistant context messages ending up with double spaces

View File

@@ -4,24 +4,6 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
"""
A conversational AI bot using Gemini for both LLM and TTS.
This example demonstrates how to use Gemini's TTS capabilities with the new
GeminiTTSService, which uses Gemini's TTS-specific models instead of Google Cloud TTS.
Features showcased:
- Gemini LLM for conversation
- Gemini TTS with natural voice control
- Support for different voice personalities
- Style and tone control through natural language prompts
Run with:
python examples/foundational/gemini-tts.py
Make sure to set your environment variables:
export GOOGLE_API_KEY=your_api_key_here
"""
import os
@@ -84,10 +66,13 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
)
tts = GeminiTTSService(
api_key=os.getenv("GOOGLE_API_KEY"),
model="gemini-2.5-flash-preview-tts", # TTS-specific model
credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
model="gemini-2.5-flash-tts",
voice_id="Charon",
params=GeminiTTSService.InputParams(language=Language.EN_US),
params=GeminiTTSService.InputParams(
language=Language.EN_US,
prompt="You are a helpful AI assistant. Speak in a natural, conversational tone.",
),
)
llm = GoogleLLMService(
@@ -101,13 +86,20 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
"role": "system",
"content": """You are a helpful AI assistant in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way.
IMPORTANT: Since you're using Gemini TTS which supports natural voice control, you can include speaking instructions in your responses. For example:
- "Say cheerfully: Welcome to our conversation!"
- "Read this in a calm, professional tone: Here are the details you requested."
- "Speak in an excited whisper: I have some great news to share!"
- "Say slowly and clearly: Let me explain this step by step."
IMPORTANT: You're using Gemini TTS which supports expressive markup tags. You can use these tags in your responses:
- [sigh] - Insert a sigh sound
- [laughing] - Insert a laugh
- [uhm] - Insert a hesitation sound
- [whispering] - Speak the next part in a whisper
- [shouting] - Speak the next part louder
- [extremely fast] - Speak the next part very quickly
- [short pause], [medium pause], [long pause] - Add pauses for dramatic effect
Feel free to use natural language instructions to control your voice style, tone, pace, and emotion. The TTS system will interpret these instructions and adjust the speech accordingly.
Examples:
- "Well [sigh] that's a tricky question."
- "[laughing] That's a great joke!"
- "[whispering] Let me tell you a secret."
- "The answer is... [long pause] ...42!"
Your output will be converted to audio, so avoid special characters in your answers. Respond to what the user said in a creative and helpful way.""",
},
@@ -140,11 +132,11 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation with a styled introduction
# Kick off the conversation
messages.append(
{
"role": "system",
"content": "Say cheerfully and warmly: Hello! I'm your AI assistant powered by Gemini's new TTS technology. I can speak with different voices, tones, and styles. How can I help you today?",
"content": "Hello! I'm your AI assistant. I can help you with a variety of tasks. What would you like to know?",
}
)
await task.queue_frames([LLMRunFrame()])

View File

@@ -16,6 +16,7 @@ for natural voice control and multi-speaker conversations.
import json
import os
import warnings
from pipecat.utils.tracing.service_decorators import traced_tts
@@ -51,19 +52,13 @@ except ModuleNotFoundError as e:
)
raise Exception(f"Missing module: {e}")
try:
from google import genai
from google.genai import types
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error("In order to use Gemini TTS, you need to `pip install pipecat-ai[google]`.")
raise Exception(f"Missing module: {e}")
def language_to_google_tts_language(language: Language) -> Optional[str]:
"""Convert a Language enum to Google TTS language code.
Source:
https://docs.cloud.google.com/text-to-speech/docs/chirp3-hd
Args:
language: The Language enum value to convert.
@@ -71,9 +66,6 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
The corresponding Google TTS language code, or None if not supported.
"""
LANGUAGE_MAP = {
# Afrikaans
Language.AF: "af-ZA",
Language.AF_ZA: "af-ZA",
# Arabic
Language.AR: "ar-XA",
# Bengali
@@ -82,14 +74,9 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
# Bulgarian
Language.BG: "bg-BG",
Language.BG_BG: "bg-BG",
# Catalan
Language.CA: "ca-ES",
Language.CA_ES: "ca-ES",
# Chinese (Mandarin and Cantonese)
Language.ZH: "cmn-CN",
Language.ZH_CN: "cmn-CN",
Language.ZH_TW: "cmn-TW",
Language.ZH_HK: "yue-HK",
# Croatian
Language.HR: "hr-HR",
Language.HR_HR: "hr-HR",
# Czech
Language.CS: "cs-CZ",
Language.CS_CZ: "cs-CZ",
@@ -109,9 +96,6 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
# Estonian
Language.ET: "et-EE",
Language.ET_EE: "et-EE",
# Filipino
Language.FIL: "fil-PH",
Language.FIL_PH: "fil-PH",
# Finnish
Language.FI: "fi-FI",
Language.FI_FI: "fi-FI",
@@ -119,9 +103,6 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
Language.FR: "fr-FR",
Language.FR_CA: "fr-CA",
Language.FR_FR: "fr-FR",
# Galician
Language.GL: "gl-ES",
Language.GL_ES: "gl-ES",
# German
Language.DE: "de-DE",
Language.DE_DE: "de-DE",
@@ -140,9 +121,6 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
# Hungarian
Language.HU: "hu-HU",
Language.HU_HU: "hu-HU",
# Icelandic
Language.IS: "is-IS",
Language.IS_IS: "is-IS",
# Indonesian
Language.ID: "id-ID",
Language.ID_ID: "id-ID",
@@ -164,12 +142,12 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
# Lithuanian
Language.LT: "lt-LT",
Language.LT_LT: "lt-LT",
# Malay
Language.MS: "ms-MY",
Language.MS_MY: "ms-MY",
# Malayalam
Language.ML: "ml-IN",
Language.ML_IN: "ml-IN",
# Chinese (Mandarin)
Language.ZH: "cmn-CN",
Language.ZH_CN: "cmn-CN",
# Marathi
Language.MR: "mr-IN",
Language.MR_IN: "mr-IN",
@@ -181,12 +159,8 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
Language.PL: "pl-PL",
Language.PL_PL: "pl-PL",
# Portuguese
Language.PT: "pt-PT",
Language.PT: "pt-BR",
Language.PT_BR: "pt-BR",
Language.PT_PT: "pt-PT",
# Punjabi
Language.PA: "pa-IN",
Language.PA_IN: "pa-IN",
# Romanian
Language.RO: "ro-RO",
Language.RO_RO: "ro-RO",
@@ -199,10 +173,16 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
# Slovak
Language.SK: "sk-SK",
Language.SK_SK: "sk-SK",
# Slovenian
Language.SL: "sl-SI",
Language.SL_SI: "sl-SI",
# Spanish
Language.ES: "es-ES",
Language.ES_ES: "es-ES",
Language.ES_US: "es-US",
# Swahili
Language.SW: "sw-KE",
Language.SW_KE: "sw-KE",
# Swedish
Language.SV: "sv-SE",
Language.SV_SE: "sv-SE",
@@ -221,6 +201,9 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
# Ukrainian
Language.UK: "uk-UA",
Language.UK_UA: "uk-UA",
# Urdu
Language.UR: "ur-IN",
Language.UR_IN: "ur-IN",
# Vietnamese
Language.VI: "vi-VN",
Language.VI_VN: "vi-VN",
@@ -229,6 +212,267 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
return resolve_language(language, LANGUAGE_MAP, use_base_code=False)
def language_to_gemini_tts_language(language: Language) -> Optional[str]:
    """Look up the Gemini TTS language code for a Language enum value.

    The table below follows the language list published at:
    https://docs.cloud.google.com/text-to-speech/docs/gemini-tts#available_languages

    Args:
        language: The Language enum value to translate.

    Returns:
        The matching Gemini TTS language code, or None when the language
        is not supported.
    """
    # Availability notes (GA / Preview) are carried over per entry or per group.
    gemini_language_codes = {
        # Afrikaans -- Preview
        Language.AF: "af-ZA",
        Language.AF_ZA: "af-ZA",
        # Albanian -- Preview
        Language.SQ: "sq-AL",
        Language.SQ_AL: "sq-AL",
        # Amharic -- Preview
        Language.AM: "am-ET",
        Language.AM_ET: "am-ET",
        # Arabic
        Language.AR: "ar-EG",  # GA (Egypt)
        Language.AR_EG: "ar-EG",
        Language.AR_001: "ar-001",  # Preview (World)
        # Armenian -- Preview
        Language.HY: "hy-AM",
        Language.HY_AM: "hy-AM",
        # Azerbaijani -- Preview
        Language.AZ: "az-AZ",
        Language.AZ_AZ: "az-AZ",
        # Basque -- Preview
        Language.EU: "eu-ES",
        Language.EU_ES: "eu-ES",
        # Belarusian -- Preview
        Language.BE: "be-BY",
        Language.BE_BY: "be-BY",
        # Bengali -- GA
        Language.BN: "bn-BD",
        Language.BN_BD: "bn-BD",
        # Bulgarian -- Preview
        Language.BG: "bg-BG",
        Language.BG_BG: "bg-BG",
        # Burmese -- Preview
        Language.MY: "my-MM",
        Language.MY_MM: "my-MM",
        # Catalan -- Preview
        Language.CA: "ca-ES",
        Language.CA_ES: "ca-ES",
        # Cebuano -- Preview
        Language.CEB: "ceb-PH",
        Language.CEB_PH: "ceb-PH",
        # Mandarin Chinese
        Language.ZH: "cmn-CN",  # Preview
        Language.ZH_CN: "cmn-CN",
        Language.ZH_TW: "cmn-TW",  # Preview
        # Croatian -- Preview
        Language.HR: "hr-HR",
        Language.HR_HR: "hr-HR",
        # Czech -- Preview
        Language.CS: "cs-CZ",
        Language.CS_CZ: "cs-CZ",
        # Danish -- Preview
        Language.DA: "da-DK",
        Language.DA_DK: "da-DK",
        # Dutch -- GA
        Language.NL: "nl-NL",
        Language.NL_NL: "nl-NL",
        # English
        Language.EN: "en-US",  # GA
        Language.EN_US: "en-US",
        Language.EN_AU: "en-AU",  # Preview
        Language.EN_GB: "en-GB",  # Preview
        Language.EN_IN: "en-IN",  # GA
        # Estonian -- Preview
        Language.ET: "et-EE",
        Language.ET_EE: "et-EE",
        # Filipino -- Preview
        Language.FIL: "fil-PH",
        Language.FIL_PH: "fil-PH",
        # Finnish -- Preview
        Language.FI: "fi-FI",
        Language.FI_FI: "fi-FI",
        # French
        Language.FR: "fr-FR",  # GA
        Language.FR_FR: "fr-FR",
        Language.FR_CA: "fr-CA",  # Preview
        # Galician -- Preview
        Language.GL: "gl-ES",
        Language.GL_ES: "gl-ES",
        # Georgian -- Preview
        Language.KA: "ka-GE",
        Language.KA_GE: "ka-GE",
        # German -- GA
        Language.DE: "de-DE",
        Language.DE_DE: "de-DE",
        # Greek -- Preview
        Language.EL: "el-GR",
        Language.EL_GR: "el-GR",
        # Gujarati -- Preview
        Language.GU: "gu-IN",
        Language.GU_IN: "gu-IN",
        # Haitian Creole -- Preview
        Language.HT: "ht-HT",
        Language.HT_HT: "ht-HT",
        # Hebrew -- Preview
        Language.HE: "he-IL",
        Language.HE_IL: "he-IL",
        # Hindi -- GA
        Language.HI: "hi-IN",
        Language.HI_IN: "hi-IN",
        # Hungarian -- Preview
        Language.HU: "hu-HU",
        Language.HU_HU: "hu-HU",
        # Icelandic -- Preview
        Language.IS: "is-IS",
        Language.IS_IS: "is-IS",
        # Indonesian -- GA
        Language.ID: "id-ID",
        Language.ID_ID: "id-ID",
        # Italian -- GA
        Language.IT: "it-IT",
        Language.IT_IT: "it-IT",
        # Japanese -- GA
        Language.JA: "ja-JP",
        Language.JA_JP: "ja-JP",
        # Javanese -- Preview
        Language.JV: "jv-JV",
        Language.JV_JV: "jv-JV",
        # Kannada -- Preview
        Language.KN: "kn-IN",
        Language.KN_IN: "kn-IN",
        # Konkani -- Preview
        Language.KOK: "kok-IN",
        Language.KOK_IN: "kok-IN",
        # Korean -- GA
        Language.KO: "ko-KR",
        Language.KO_KR: "ko-KR",
        # Lao -- Preview
        Language.LO: "lo-LA",
        Language.LO_LA: "lo-LA",
        # Latin -- Preview
        Language.LA: "la-VA",
        Language.LA_VA: "la-VA",
        # Latvian -- Preview
        Language.LV: "lv-LV",
        Language.LV_LV: "lv-LV",
        # Lithuanian -- Preview
        Language.LT: "lt-LT",
        Language.LT_LT: "lt-LT",
        # Luxembourgish -- Preview
        Language.LB: "lb-LU",
        Language.LB_LU: "lb-LU",
        # Macedonian -- Preview
        Language.MK: "mk-MK",
        Language.MK_MK: "mk-MK",
        # Maithili -- Preview
        Language.MAI: "mai-IN",
        Language.MAI_IN: "mai-IN",
        # Malagasy -- Preview
        Language.MG: "mg-MG",
        Language.MG_MG: "mg-MG",
        # Malay -- Preview
        Language.MS: "ms-MY",
        Language.MS_MY: "ms-MY",
        # Malayalam -- Preview
        Language.ML: "ml-IN",
        Language.ML_IN: "ml-IN",
        # Marathi -- GA
        Language.MR: "mr-IN",
        Language.MR_IN: "mr-IN",
        # Mongolian -- Preview
        Language.MN: "mn-MN",
        Language.MN_MN: "mn-MN",
        # Nepali -- Preview
        Language.NE: "ne-NP",
        Language.NE_NP: "ne-NP",
        # Norwegian
        Language.NO: "nb-NO",  # Preview (Bokmål)
        Language.NB: "nb-NO",
        Language.NB_NO: "nb-NO",
        Language.NN: "nn-NO",  # Preview (Nynorsk)
        Language.NN_NO: "nn-NO",
        # Odia -- Preview
        Language.OR: "or-IN",
        Language.OR_IN: "or-IN",
        # Pashto -- Preview
        Language.PS: "ps-AF",
        Language.PS_AF: "ps-AF",
        # Persian -- Preview
        Language.FA: "fa-IR",
        Language.FA_IR: "fa-IR",
        # Polish -- GA
        Language.PL: "pl-PL",
        Language.PL_PL: "pl-PL",
        # Portuguese
        Language.PT: "pt-BR",  # GA (Brazil)
        Language.PT_BR: "pt-BR",
        Language.PT_PT: "pt-PT",  # Preview (Portugal)
        # Punjabi -- Preview
        Language.PA: "pa-IN",
        Language.PA_IN: "pa-IN",
        # Romanian -- GA
        Language.RO: "ro-RO",
        Language.RO_RO: "ro-RO",
        # Russian -- GA
        Language.RU: "ru-RU",
        Language.RU_RU: "ru-RU",
        # Serbian -- Preview
        Language.SR: "sr-RS",
        Language.SR_RS: "sr-RS",
        # Sindhi -- Preview
        Language.SD: "sd-IN",
        Language.SD_IN: "sd-IN",
        # Sinhala -- Preview
        Language.SI: "si-LK",
        Language.SI_LK: "si-LK",
        # Slovak -- Preview
        Language.SK: "sk-SK",
        Language.SK_SK: "sk-SK",
        # Slovenian -- Preview
        Language.SL: "sl-SI",
        Language.SL_SI: "sl-SI",
        # Spanish
        Language.ES: "es-ES",  # GA
        Language.ES_ES: "es-ES",
        Language.ES_419: "es-419",  # Preview (Latin America)
        Language.ES_MX: "es-MX",  # Preview (Mexico)
        # Swahili -- Preview
        Language.SW: "sw-KE",
        Language.SW_KE: "sw-KE",
        # Swedish -- Preview
        Language.SV: "sv-SE",
        Language.SV_SE: "sv-SE",
        # Tamil -- GA
        Language.TA: "ta-IN",
        Language.TA_IN: "ta-IN",
        # Telugu -- GA
        Language.TE: "te-IN",
        Language.TE_IN: "te-IN",
        # Thai -- GA
        Language.TH: "th-TH",
        Language.TH_TH: "th-TH",
        # Turkish -- GA
        Language.TR: "tr-TR",
        Language.TR_TR: "tr-TR",
        # Ukrainian -- GA
        Language.UK: "uk-UA",
        Language.UK_UA: "uk-UA",
        # Urdu -- Preview
        Language.UR: "ur-PK",
        Language.UR_PK: "ur-PK",
        # Vietnamese -- GA
        Language.VI: "vi-VN",
        Language.VI_VN: "vi-VN",
    }

    service_code = resolve_language(language, gemini_language_codes, use_base_code=False)
    return service_code
class GoogleHttpTTSService(TTSService):
"""Google Cloud Text-to-Speech HTTP service with SSML support.
@@ -498,7 +742,139 @@ class GoogleHttpTTSService(TTSService):
yield ErrorFrame(error=error_message)
class GoogleTTSService(TTSService):
class GoogleBaseTTSService(TTSService):
    """Base class for Google Cloud Text-to-Speech streaming services.

    Provides the client construction and streaming synthesis logic shared by
    the Google streaming TTS services. It is intended to be subclassed rather
    than used directly; use GoogleTTSService or GeminiTTSService instead.

    ``_stream_tts`` reads ``self._client``, so a subclass must assign it
    (typically from ``_create_client``) before synthesizing.
    """

    def _create_client(
        self, credentials: Optional[str], credentials_path: Optional[str]
    ) -> texttospeech_v1.TextToSpeechAsyncClient:
        """Create an authenticated Google Text-to-Speech client.

        Credentials are resolved in this order: the ``credentials`` JSON
        string, then the ``credentials_path`` file, then Google application
        default credentials.

        Args:
            credentials: JSON string with service account credentials.
            credentials_path: Path to service account JSON file.

        Returns:
            Authenticated TextToSpeechAsyncClient instance.

        Raises:
            ValueError: If no valid credentials are provided. When the
                application default credentials lookup failed, that
                GoogleAuthError is attached as the cause.
        """
        creds: Optional[service_account.Credentials] = None
        auth_error: Optional[GoogleAuthError] = None

        if credentials:
            # Use provided credentials JSON string
            json_account_info = json.loads(credentials)
            creds = service_account.Credentials.from_service_account_info(json_account_info)
        elif credentials_path:
            # Use service account JSON file if provided
            creds = service_account.Credentials.from_service_account_file(credentials_path)
        else:
            try:
                # default() also returns a project id, which is not needed here.
                creds, _ = default(scopes=["https://www.googleapis.com/auth/cloud-platform"])
            except GoogleAuthError as e:
                # Keep the failure so the ValueError below can report why the
                # default credentials lookup did not work.
                auth_error = e

        if not creds:
            raise ValueError("No valid credentials provided.") from auth_error

        return texttospeech_v1.TextToSpeechAsyncClient(credentials=creds)

    def can_generate_metrics(self) -> bool:
        """Check if this service can generate processing metrics.

        Returns:
            True, as Google streaming TTS services support metrics generation.
        """
        return True

    @property
    def includes_inter_frame_spaces(self) -> bool:
        """Indicates that Google and Gemini TTSTextFrames include necessary inter-frame spaces.

        Returns:
            True, indicating that Google's text frames include necessary inter-frame spaces.
        """
        return True

    def language_to_service_language(self, language: Language) -> Optional[str]:
        """Convert a Language enum to Google TTS language format.

        Args:
            language: The language to convert.

        Returns:
            The Google TTS-specific language code, or None if not supported.
        """
        return language_to_google_tts_language(language)

    async def _stream_tts(
        self,
        streaming_config: texttospeech_v1.StreamingSynthesizeConfig,
        text: str,
        prompt: Optional[str] = None,
    ) -> AsyncGenerator[Frame, None]:
        """Shared streaming synthesis logic.

        Sends the configuration followed by the text to the streaming API,
        then re-chunks the returned audio into ``self.chunk_size``-byte frames.

        Args:
            streaming_config: The streaming configuration.
            text: The text to synthesize.
            prompt: Optional prompt for style instructions (Gemini only).

        Yields:
            Frame: A TTSStartedFrame, the audio frames containing the
            synthesized speech, then a TTSStoppedFrame.
        """
        config_request = texttospeech_v1.StreamingSynthesizeRequest(
            streaming_config=streaming_config
        )

        async def request_generator():
            # First request carries only the configuration; the second
            # carries the input text (and the prompt, when one is set).
            yield config_request
            synthesis_input_params = {"text": text}
            if prompt is not None:
                synthesis_input_params["prompt"] = prompt
            yield texttospeech_v1.StreamingSynthesizeRequest(
                input=texttospeech_v1.StreamingSynthesisInput(**synthesis_input_params)
            )

        streaming_responses = await self._client.streaming_synthesize(request_generator())
        await self.start_tts_usage_metrics(text)

        yield TTSStartedFrame()

        audio_buffer = b""
        ttfb_stopped = False
        chunk_size = self.chunk_size

        async for response in streaming_responses:
            chunk = response.audio_content
            if not chunk:
                continue

            # Stop the time-to-first-byte metric on the first non-empty chunk only.
            if not ttfb_stopped:
                await self.stop_ttfb_metrics()
                ttfb_stopped = True

            audio_buffer += chunk
            # Emit as many full-size frames as the buffer currently holds.
            while len(audio_buffer) >= chunk_size:
                piece = audio_buffer[:chunk_size]
                audio_buffer = audio_buffer[chunk_size:]
                yield TTSAudioRawFrame(piece, self.sample_rate, 1)

        # Flush whatever is left over as a final, shorter frame.
        if audio_buffer:
            yield TTSAudioRawFrame(audio_buffer, self.sample_rate, 1)

        yield TTSStoppedFrame()
class GoogleTTSService(GoogleBaseTTSService):
"""Google Cloud Text-to-Speech streaming service.
Provides real-time text-to-speech synthesis using Google Cloud's streaming API
@@ -570,62 +946,6 @@ class GoogleTTSService(TTSService):
credentials, credentials_path
)
def _create_client(
self, credentials: Optional[str], credentials_path: Optional[str]
) -> texttospeech_v1.TextToSpeechAsyncClient:
creds: Optional[service_account.Credentials] = None
# Create a Google Cloud service account for the Cloud Text-to-Speech API
# Using either the provided credentials JSON string or the path to a service account JSON
# file, create a Google Cloud service account and use it to authenticate with the API.
if credentials:
# Use provided credentials JSON string
json_account_info = json.loads(credentials)
creds = service_account.Credentials.from_service_account_info(json_account_info)
elif credentials_path:
# Use service account JSON file if provided
creds = service_account.Credentials.from_service_account_file(credentials_path)
else:
try:
creds, project_id = default(
scopes=["https://www.googleapis.com/auth/cloud-platform"]
)
except GoogleAuthError:
pass
if not creds:
raise ValueError("No valid credentials provided.")
return texttospeech_v1.TextToSpeechAsyncClient(credentials=creds)
def can_generate_metrics(self) -> bool:
"""Check if this service can generate processing metrics.
Returns:
True, as Google streaming TTS service supports metrics generation.
"""
return True
@property
def includes_inter_frame_spaces(self) -> bool:
"""Indicates that Google TTSTextFrames include necessary inter-frame spaces.
Returns:
True, indicating that Google's text frames include necessary inter-frame spaces.
"""
return True
def language_to_service_language(self, language: Language) -> Optional[str]:
"""Convert a Language enum to Google TTS language format.
Args:
language: The language to convert.
Returns:
The Google TTS-specific language code, or None if not supported.
"""
return language_to_google_tts_language(language)
async def _update_settings(self, settings: Mapping[str, Any]):
"""Override to handle speaking_rate updates for streaming API.
@@ -657,6 +977,7 @@ class GoogleTTSService(TTSService):
try:
await self.start_ttfb_metrics()
# Build voice selection params
if self._voice_cloning_key:
voice_clone_params = texttospeech_v1.VoiceCloneParams(
voice_cloning_key=self._voice_cloning_key
@@ -669,6 +990,7 @@ class GoogleTTSService(TTSService):
language_code=self._settings["language"], name=self._voice_id
)
# Create streaming config
streaming_config = texttospeech_v1.StreamingSynthesizeConfig(
voice=voice,
streaming_audio_config=texttospeech_v1.StreamingAudioConfig(
@@ -677,45 +999,10 @@ class GoogleTTSService(TTSService):
speaking_rate=self._settings["speaking_rate"],
),
)
config_request = texttospeech_v1.StreamingSynthesizeRequest(
streaming_config=streaming_config
)
async def request_generator():
yield config_request
yield texttospeech_v1.StreamingSynthesizeRequest(
input=texttospeech_v1.StreamingSynthesisInput(text=text)
)
streaming_responses = await self._client.streaming_synthesize(request_generator())
await self.start_tts_usage_metrics(text)
yield TTSStartedFrame()
audio_buffer = b""
first_chunk_for_ttfb = False
CHUNK_SIZE = self.chunk_size
async for response in streaming_responses:
chunk = response.audio_content
if not chunk:
continue
if not first_chunk_for_ttfb:
await self.stop_ttfb_metrics()
first_chunk_for_ttfb = True
audio_buffer += chunk
while len(audio_buffer) >= CHUNK_SIZE:
piece = audio_buffer[:CHUNK_SIZE]
audio_buffer = audio_buffer[CHUNK_SIZE:]
yield TTSAudioRawFrame(piece, self.sample_rate, 1)
if audio_buffer:
yield TTSAudioRawFrame(audio_buffer, self.sample_rate, 1)
yield TTSStoppedFrame()
# Use base class streaming logic
async for frame in self._stream_tts(streaming_config, text):
yield frame
except Exception as e:
logger.exception(f"{self} error generating TTS: {e}")
@@ -723,25 +1010,29 @@ class GoogleTTSService(TTSService):
yield ErrorFrame(error=error_message)
class GeminiTTSService(TTSService):
"""Gemini Text-to-Speech service using Gemini TTS models.
class GeminiTTSService(GoogleBaseTTSService):
"""Gemini Text-to-Speech streaming service using Gemini TTS models.
Provides text-to-speech synthesis using Gemini's TTS-specific models
(gemini-2.5-flash-preview-tts and gemini-2.5-pro-preview-tts) with
support for natural voice control, multiple speakers, and voice styles.
Provides real-time text-to-speech synthesis using Gemini's TTS-specific models
(gemini-2.5-flash-tts and gemini-2.5-pro-tts) with support for natural
voice control, prompts for style instructions, expressive markup tags,
and multi-speaker conversations.
Note:
Requires Google AI API key. This uses the Gemini API, not Google Cloud TTS.
Audio-out is currently a preview feature.
Requires Google Cloud credentials via service account JSON, credentials file,
or default application credentials (GOOGLE_APPLICATION_CREDENTIALS).
Uses the Google Cloud Text-to-Speech streaming API for low-latency synthesis.
Example::
tts = GeminiTTSService(
api_key="your-google-ai-api-key",
model="gemini-2.5-flash-preview-tts",
credentials_path="/path/to/service-account.json",
model="gemini-2.5-flash-tts",
voice_id="Kore",
params=GeminiTTSService.InputParams(
language=Language.EN_US,
prompt="Say this in a friendly and helpful tone"
)
)
"""
@@ -750,36 +1041,36 @@ class GeminiTTSService(TTSService):
# List of available Gemini TTS voices
AVAILABLE_VOICES = [
"Zephyr",
"Puck",
"Achernar",
"Achird",
"Algenib",
"Algieba",
"Alnilam",
"Aoede",
"Autonoe",
"Callirhoe",
"Charon",
"Kore",
"Despina",
"Enceladus",
"Erinome",
"Fenrir",
"Gacrux",
"Iapetus",
"Kore",
"Laomedeia",
"Leda",
"Orus",
"Aoede",
"Callirhoe",
"Autonoe",
"Enceladus",
"Iapetus",
"Umbriel",
"Algieba",
"Despina",
"Erinome",
"Algenib",
"Rasalgethi",
"Laomedeia",
"Achernar",
"Alnilam",
"Schedar",
"Gacrux",
"Puck",
"Pulcherrima",
"Achird",
"Zubenelgenubi",
"Vindemiatrix",
"Rasalgethi",
"Sadachbia",
"Sadaltager",
"Schedar",
"Sulafar",
"Umbriel",
"Vindemiatrix",
"Zephyr",
"Zubenelgenubi",
]
class InputParams(BaseModel):
@@ -787,19 +1078,23 @@ class GeminiTTSService(TTSService):
Parameters:
language: Language for synthesis. Defaults to English.
prompt: Optional style instructions for how to synthesize the content.
multi_speaker: Whether to enable multi-speaker support.
speaker_configs: List of speaker configurations for multi-speaker mode.
"""
language: Optional[Language] = Language.EN
prompt: Optional[str] = None
multi_speaker: bool = False
speaker_configs: Optional[List[dict]] = None
def __init__(
self,
*,
api_key: str,
model: str = "gemini-2.5-flash-preview-tts",
api_key: Optional[str] = None,
model: str = "gemini-2.5-flash-tts",
credentials: Optional[str] = None,
credentials_path: Optional[str] = None,
voice_id: str = "Kore",
sample_rate: Optional[int] = None,
params: Optional[InputParams] = None,
@@ -808,14 +1103,30 @@ class GeminiTTSService(TTSService):
"""Initializes the Gemini TTS service.
Args:
api_key: Google AI API key for authentication.
api_key:
.. deprecated:: 0.0.95
The `api_key` parameter is deprecated. Use `credentials` or
`credentials_path` instead for Google Cloud authentication.
model: Gemini TTS model to use. Must be a TTS model like
"gemini-2.5-flash-preview-tts" or "gemini-2.5-pro-preview-tts".
"gemini-2.5-flash-tts" or "gemini-2.5-pro-tts".
credentials: JSON string containing Google Cloud service account credentials.
credentials_path: Path to Google Cloud service account JSON file.
voice_id: Voice name from the available Gemini voices.
sample_rate: Audio sample rate in Hz. If None, uses Google's default 24kHz.
params: TTS configuration parameters.
**kwargs: Additional arguments passed to parent TTSService.
"""
# Handle deprecated api_key parameter
if api_key is not None:
warnings.warn(
"The 'api_key' parameter is deprecated and will be removed in a future version. "
"Use 'credentials' or 'credentials_path' instead for Google Cloud authentication.",
DeprecationWarning,
stacklevel=2,
)
if sample_rate and sample_rate != self.GOOGLE_SAMPLE_RATE:
logger.warning(
f"Google TTS only supports {self.GOOGLE_SAMPLE_RATE}Hz sample rate. "
@@ -828,35 +1139,20 @@ class GeminiTTSService(TTSService):
if voice_id not in self.AVAILABLE_VOICES:
logger.warning(f"Voice '{voice_id}' not in known voices list. Using anyway.")
self._api_key = api_key
self._model = model
self._voice_id = voice_id
self._settings = {
"language": self.language_to_service_language(params.language)
if params.language
else "en-US",
"prompt": params.prompt,
"multi_speaker": params.multi_speaker,
"speaker_configs": params.speaker_configs,
}
self._client = genai.Client(api_key=api_key)
def can_generate_metrics(self) -> bool:
"""Check if this service can generate processing metrics.
Returns:
True, as Gemini TTS service supports metrics generation.
"""
return True
@property
def includes_inter_frame_spaces(self) -> bool:
"""Indicates that Gemini TTSTextFrames include necessary inter-frame spaces.
Returns:
True, indicating that Gemini's text frames include necessary inter-frame spaces.
"""
return True
self._client: texttospeech_v1.TextToSpeechAsyncClient = self._create_client(
credentials, credentials_path
)
def language_to_service_language(self, language: Language) -> Optional[str]:
"""Convert a Language enum to Gemini TTS language format.
@@ -867,7 +1163,7 @@ class GeminiTTSService(TTSService):
Returns:
The Gemini TTS-specific language code, or None if not supported.
"""
return language_to_google_tts_language(language)
return language_to_gemini_tts_language(language)
def set_voice(self, voice_id: str):
"""Set the voice for TTS generation.
@@ -892,88 +1188,73 @@ class GeminiTTSService(TTSService):
f"Current rate of {self.sample_rate}Hz may cause issues."
)
@traced_tts
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
"""Generate speech from text using Gemini TTS models.
async def _update_settings(self, settings: Mapping[str, Any]):
"""Override to handle prompt updates.
Args:
text: The text to synthesize into speech. Can include natural language
instructions for style, tone, etc.
settings: Dictionary of settings to update. Can include 'prompt' (str)
"""
if "prompt" in settings:
self._settings["prompt"] = settings["prompt"]
await super()._update_settings(settings)
@traced_tts
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
"""Generate streaming speech from text using Gemini TTS models.
Args:
text: The text to synthesize into speech. Can include markup tags
like [sigh], [laughing], [whispering] for expressive control.
Yields:
Frame: Audio frames containing the synthesized speech.
Frame: Audio frames containing the synthesized speech as it's generated.
"""
logger.debug(f"{self}: Generating TTS [{text}]")
try:
await self.start_ttfb_metrics()
# Build the speech config
# Build voice selection params
if self._settings["multi_speaker"] and self._settings["speaker_configs"]:
# Multi-speaker mode
speaker_voice_configs = []
for speaker_config in self._settings["speaker_configs"]:
speaker_voice_configs.append(
types.SpeakerVoiceConfig(
speaker=speaker_config["speaker"],
voice_config=types.VoiceConfig(
prebuilt_voice_config=types.PrebuiltVoiceConfig(
voice_name=speaker_config.get("voice_id", self._voice_id)
)
),
texttospeech_v1.MultispeakerPrebuiltVoice(
speaker_alias=speaker_config["speaker_alias"],
speaker_id=speaker_config.get("speaker_id", self._voice_id),
)
)
speech_config = types.SpeechConfig(
multi_speaker_voice_config=types.MultiSpeakerVoiceConfig(
speaker_voice_configs=speaker_voice_configs
)
multi_speaker_voice_config = texttospeech_v1.MultiSpeakerVoiceConfig(
speaker_voice_configs=speaker_voice_configs
)
voice = texttospeech_v1.VoiceSelectionParams(
language_code=self._settings["language"],
model_name=self._model,
multi_speaker_voice_config=multi_speaker_voice_config,
)
else:
# Single speaker mode
speech_config = types.SpeechConfig(
voice_config=types.VoiceConfig(
prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=self._voice_id)
)
voice = texttospeech_v1.VoiceSelectionParams(
language_code=self._settings["language"],
name=self._voice_id,
model_name=self._model,
)
# Create the generation config
generation_config = types.GenerateContentConfig(
response_modalities=["AUDIO"],
speech_config=speech_config,
# Create streaming config
streaming_config = texttospeech_v1.StreamingSynthesizeConfig(
voice=voice,
streaming_audio_config=texttospeech_v1.StreamingAudioConfig(
audio_encoding=texttospeech_v1.AudioEncoding.PCM,
sample_rate_hertz=self.sample_rate,
),
)
# Generate the content
response = await self._client.aio.models.generate_content(
model=self._model,
contents=text,
config=generation_config,
)
await self.start_tts_usage_metrics(text)
yield TTSStartedFrame()
# Extract audio data from response
if response.candidates and len(response.candidates) > 0:
candidate = response.candidates[0]
if candidate.content and candidate.content.parts:
for part in candidate.content.parts:
if part.inline_data and part.inline_data.mime_type.startswith("audio/"):
audio_data = part.inline_data.data
await self.stop_ttfb_metrics()
# Gemini TTS returns PCM audio data, chunk it appropriately
CHUNK_SIZE = self.chunk_size
for i in range(0, len(audio_data), CHUNK_SIZE):
chunk = audio_data[i : i + CHUNK_SIZE]
if not chunk:
break
frame = TTSAudioRawFrame(chunk, self.sample_rate, 1)
yield frame
yield TTSStoppedFrame()
# Use base class streaming logic with prompt support
async for frame in self._stream_tts(streaming_config, text, self._settings["prompt"]):
yield frame
except Exception as e:
logger.exception(f"{self} error generating TTS: {e}")

View File

@@ -66,6 +66,7 @@ class Language(StrEnum):
AR_TN = "ar-TN"
AR_XA = "ar-XA"
AR_YE = "ar-YE"
AR_001 = "ar-001"
# Assamese
AS = "as"
@@ -83,6 +84,7 @@ class Language(StrEnum):
# Belarusian
BE = "be"
BE_BY = "be-BY"
# Bulgarian
BG = "bg"
@@ -109,6 +111,7 @@ class Language(StrEnum):
# Cebuano
CEB = "ceb"
CEB_PH = "ceb-PH"
# Mandarin Chinese
CMN = "cmn"
@@ -181,6 +184,7 @@ class Language(StrEnum):
ES_US = "es-US"
ES_UY = "es-UY"
ES_VE = "es-VE"
ES_419 = "es-419"
# Estonian
ET = "et"
@@ -250,6 +254,7 @@ class Language(StrEnum):
# Haitian Creole
HT = "ht"
HT_HT = "ht-HT"
# Hungarian
HU = "hu"
@@ -288,6 +293,7 @@ class Language(StrEnum):
# Javanese
JV = "jv"
JV_ID = "jv-ID"
JV_JV = "jv-JV"
JW = "jw" # Fal requires for Javanese
# Georgian
@@ -309,6 +315,10 @@ class Language(StrEnum):
KN = "kn"
KN_IN = "kn-IN"
# Konkani
KOK = "kok"
KOK_IN = "kok-IN"
# Korean
KO = "ko"
KO_KR = "ko-KR"
@@ -322,9 +332,11 @@ class Language(StrEnum):
# Latin
LA = "la"
LA_VA = "la-VA"
# Luxembourgish
LB = "lb"
LB_LU = "lb-LU"
# Lingala
LN = "ln"
@@ -349,6 +361,7 @@ class Language(StrEnum):
# Malagasy
MG = "mg"
MG_MG = "mg-MG"
# Maori
MI = "mi"
@@ -357,6 +370,10 @@ class Language(StrEnum):
MK = "mk"
MK_MK = "mk-MK"
# Maithili
MAI = "mai"
MAI_IN = "mai-IN"
# Malayalam
ML = "ml"
ML_IN = "ml-IN"
@@ -387,6 +404,7 @@ class Language(StrEnum):
NB_NO = "nb-NO"
NO = "no"
NN = "nn" # Norwegian Nynorsk
NN_NO = "nn-NO"
# Nepali
NE = "ne"
@@ -440,6 +458,7 @@ class Language(StrEnum):
# Sindhi
SD = "sd"
SD_IN = "sd-IN"
# Sinhala
SI = "si"