diff --git a/CHANGELOG.md b/CHANGELOG.md index de9cf3328..bb7223b7c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added Hindi support for Rime TTS services. +- Updated `GeminiTTSService` to use Google Cloud Text-to-Speech streaming API + instead of the deprecated Gemini API. Now uses `credentials` / + `credentials_path` for authentication. The `api_key` parameter is deprecated. + Also, added support for `prompt` parameter for style instructions and + expressive markup tags. Significantly improved latency with streaming + synthesis. + +- Updated language mappings for the Google and Gemini TTS services to match + official documentation. + +### Deprecated + +- The `api_key` parameter in `GeminiTTSService` is deprecated. Use + `credentials` or `credentials_path` instead for Google Cloud authentication. + ### Fixed - Fixed subtle issue of assistant context messages ending up with double spaces diff --git a/examples/foundational/07n-interruptible-gemini.py b/examples/foundational/07n-interruptible-gemini.py index 4da14f908..3a244a128 100644 --- a/examples/foundational/07n-interruptible-gemini.py +++ b/examples/foundational/07n-interruptible-gemini.py @@ -4,24 +4,6 @@ # SPDX-License-Identifier: BSD 2-Clause License # -""" -A conversational AI bot using Gemini for both LLM and TTS. - -This example demonstrates how to use Gemini's TTS capabilities with the new -GeminiTTSService, which uses Gemini's TTS-specific models instead of Google Cloud TTS. - -Features showcased: -- Gemini LLM for conversation -- Gemini TTS with natural voice control -- Support for different voice personalities -- Style and tone control through natural language prompts - -Run with: - python examples/foundational/gemini-tts.py - -Make sure to set your environment variables: - export GOOGLE_API_KEY=your_api_key_here -""" import os @@ -84,10 +66,13 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): ) tts = GeminiTTSService( - api_key=os.getenv("GOOGLE_API_KEY"), - model="gemini-2.5-flash-preview-tts", # TTS-specific model + credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"), + model="gemini-2.5-flash-tts", voice_id="Charon", - params=GeminiTTSService.InputParams(language=Language.EN_US), + params=GeminiTTSService.InputParams( + language=Language.EN_US, + prompt="You are a helpful AI assistant. Speak in a natural, conversational tone.", + ), ) llm = GoogleLLMService( @@ -101,13 +86,20 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): "role": "system", "content": """You are a helpful AI assistant in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. - IMPORTANT: Since you're using Gemini TTS which supports natural voice control, you can include speaking instructions in your responses. For example: - - "Say cheerfully: Welcome to our conversation!" - - "Read this in a calm, professional tone: Here are the details you requested." - - "Speak in an excited whisper: I have some great news to share!" - - "Say slowly and clearly: Let me explain this step by step." + IMPORTANT: You're using Gemini TTS which supports expressive markup tags. You can use these tags in your responses: + - [sigh] - Insert a sigh sound + - [laughing] - Insert a laugh + - [uhm] - Insert a hesitation sound + - [whispering] - Speak the next part in a whisper + - [shouting] - Speak the next part louder + - [extremely fast] - Speak the next part very quickly + - [short pause], [medium pause], [long pause] - Add pauses for dramatic effect - Feel free to use natural language instructions to control your voice style, tone, pace, and emotion. The TTS system will interpret these instructions and adjust the speech accordingly. + Examples: + - "Well [sigh] that's a tricky question." + - "[laughing] That's a great joke!" + - "[whispering] Let me tell you a secret." + - "The answer is... [long pause] ...42!" Your output will be converted to audio, so avoid special characters in your answers. Respond to what the user said in a creative and helpful way.""", }, @@ -140,11 +132,11 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): @transport.event_handler("on_client_connected") async def on_client_connected(transport, client): logger.info(f"Client connected") - # Kick off the conversation with a styled introduction + # Kick off the conversation messages.append( { "role": "system", - "content": "Say cheerfully and warmly: Hello! I'm your AI assistant powered by Gemini's new TTS technology. I can speak with different voices, tones, and styles. How can I help you today?", + "content": "Hello! I'm your AI assistant. I can help you with a variety of tasks. What would you like to know?", } ) await task.queue_frames([LLMRunFrame()]) diff --git a/src/pipecat/services/google/tts.py b/src/pipecat/services/google/tts.py index bd3dbc203..b20532676 100644 --- a/src/pipecat/services/google/tts.py +++ b/src/pipecat/services/google/tts.py @@ -16,6 +16,7 @@ for natural voice control and multi-speaker conversations. import json import os +import warnings from pipecat.utils.tracing.service_decorators import traced_tts @@ -51,19 +52,13 @@ except ModuleNotFoundError as e: ) raise Exception(f"Missing module: {e}") -try: - from google import genai - from google.genai import types - -except ModuleNotFoundError as e: - logger.error(f"Exception: {e}") - logger.error("In order to use Gemini TTS, you need to `pip install pipecat-ai[google]`.") - raise Exception(f"Missing module: {e}") - def language_to_google_tts_language(language: Language) -> Optional[str]: """Convert a Language enum to Google TTS language code. + Source: + https://docs.cloud.google.com/text-to-speech/docs/chirp3-hd + Args: language: The Language enum value to convert. @@ -71,9 +66,6 @@ def language_to_google_tts_language(language: Language) -> Optional[str]: The corresponding Google TTS language code, or None if not supported. """ LANGUAGE_MAP = { - # Afrikaans - Language.AF: "af-ZA", - Language.AF_ZA: "af-ZA", # Arabic Language.AR: "ar-XA", # Bengali @@ -82,14 +74,9 @@ def language_to_google_tts_language(language: Language) -> Optional[str]: # Bulgarian Language.BG: "bg-BG", Language.BG_BG: "bg-BG", - # Catalan - Language.CA: "ca-ES", - Language.CA_ES: "ca-ES", - # Chinese (Mandarin and Cantonese) - Language.ZH: "cmn-CN", - Language.ZH_CN: "cmn-CN", - Language.ZH_TW: "cmn-TW", - Language.ZH_HK: "yue-HK", + # Croatian + Language.HR: "hr-HR", + Language.HR_HR: "hr-HR", # Czech Language.CS: "cs-CZ", Language.CS_CZ: "cs-CZ", @@ -109,9 +96,6 @@ def language_to_google_tts_language(language: Language) -> Optional[str]: # Estonian Language.ET: "et-EE", Language.ET_EE: "et-EE", - # Filipino - Language.FIL: "fil-PH", - Language.FIL_PH: "fil-PH", # Finnish Language.FI: "fi-FI", Language.FI_FI: "fi-FI", @@ -119,9 +103,6 @@ def language_to_google_tts_language(language: Language) -> Optional[str]: Language.FR: "fr-FR", Language.FR_CA: "fr-CA", Language.FR_FR: "fr-FR", - # Galician - Language.GL: "gl-ES", - Language.GL_ES: "gl-ES", # German Language.DE: "de-DE", Language.DE_DE: "de-DE", @@ -140,9 +121,6 @@ def language_to_google_tts_language(language: Language) -> Optional[str]: # Hungarian Language.HU: "hu-HU", Language.HU_HU: "hu-HU", - # Icelandic - Language.IS: "is-IS", - Language.IS_IS: "is-IS", # Indonesian Language.ID: "id-ID", Language.ID_ID: "id-ID", @@ -164,12 +142,12 @@ def language_to_google_tts_language(language: Language) -> Optional[str]: # Lithuanian Language.LT: "lt-LT", Language.LT_LT: "lt-LT", - # Malay - Language.MS: "ms-MY", - Language.MS_MY: "ms-MY", # Malayalam Language.ML: "ml-IN", Language.ML_IN: "ml-IN", + # Chinese (Mandarin) + Language.ZH: "cmn-CN", + Language.ZH_CN: "cmn-CN", # Marathi Language.MR: "mr-IN", Language.MR_IN: "mr-IN", @@ -181,12 +159,8 @@ def language_to_google_tts_language(language: Language) -> Optional[str]: Language.PL: "pl-PL", Language.PL_PL: "pl-PL", # Portuguese - Language.PT: "pt-PT", + Language.PT: "pt-BR", Language.PT_BR: "pt-BR", - Language.PT_PT: "pt-PT", - # Punjabi - Language.PA: "pa-IN", - Language.PA_IN: "pa-IN", # Romanian Language.RO: "ro-RO", Language.RO_RO: "ro-RO", @@ -199,10 +173,16 @@ def language_to_google_tts_language(language: Language) -> Optional[str]: # Slovak Language.SK: "sk-SK", Language.SK_SK: "sk-SK", + # Slovenian + Language.SL: "sl-SI", + Language.SL_SI: "sl-SI", # Spanish Language.ES: "es-ES", Language.ES_ES: "es-ES", Language.ES_US: "es-US", + # Swahili + Language.SW: "sw-KE", + Language.SW_KE: "sw-KE", # Swedish Language.SV: "sv-SE", Language.SV_SE: "sv-SE", @@ -221,6 +201,9 @@ def language_to_google_tts_language(language: Language) -> Optional[str]: # Ukrainian Language.UK: "uk-UA", Language.UK_UA: "uk-UA", + # Urdu + Language.UR: "ur-IN", + Language.UR_IN: "ur-IN", # Vietnamese Language.VI: "vi-VN", Language.VI_VN: "vi-VN", @@ -229,6 +212,267 @@ def language_to_google_tts_language(language: Language) -> Optional[str]: return resolve_language(language, LANGUAGE_MAP, use_base_code=False) +def language_to_gemini_tts_language(language: Language) -> Optional[str]: + """Convert a Language enum to Gemini TTS language code. + + Source: + https://docs.cloud.google.com/text-to-speech/docs/gemini-tts#available_languages + + Args: + language: The Language enum value to convert. + + Returns: + The corresponding Gemini TTS language code, or None if not supported. + """ + LANGUAGE_MAP = { + # Afrikaans (Preview) + Language.AF: "af-ZA", + Language.AF_ZA: "af-ZA", + # Albanian (Preview) + Language.SQ: "sq-AL", + Language.SQ_AL: "sq-AL", + # Amharic (Preview) + Language.AM: "am-ET", + Language.AM_ET: "am-ET", + # Arabic + Language.AR: "ar-EG", # GA: Egypt + Language.AR_EG: "ar-EG", + Language.AR_001: "ar-001", # Preview: World + # Armenian (Preview) + Language.HY: "hy-AM", + Language.HY_AM: "hy-AM", + # Azerbaijani (Preview) + Language.AZ: "az-AZ", + Language.AZ_AZ: "az-AZ", + # Basque (Preview) + Language.EU: "eu-ES", + Language.EU_ES: "eu-ES", + # Belarusian (Preview) + Language.BE: "be-BY", + Language.BE_BY: "be-BY", + # Bengali (GA) + Language.BN: "bn-BD", + Language.BN_BD: "bn-BD", + # Bulgarian (Preview) + Language.BG: "bg-BG", + Language.BG_BG: "bg-BG", + # Burmese (Preview) + Language.MY: "my-MM", + Language.MY_MM: "my-MM", + # Catalan (Preview) + Language.CA: "ca-ES", + Language.CA_ES: "ca-ES", + # Cebuano (Preview) + Language.CEB: "ceb-PH", + Language.CEB_PH: "ceb-PH", + # Chinese (Mandarin) + Language.ZH: "cmn-CN", # Preview + Language.ZH_CN: "cmn-CN", + Language.ZH_TW: "cmn-TW", # Preview + # Croatian (Preview) + Language.HR: "hr-HR", + Language.HR_HR: "hr-HR", + # Czech (Preview) + Language.CS: "cs-CZ", + Language.CS_CZ: "cs-CZ", + # Danish (Preview) + Language.DA: "da-DK", + Language.DA_DK: "da-DK", + # Dutch (GA) + Language.NL: "nl-NL", + Language.NL_NL: "nl-NL", + # English + Language.EN: "en-US", # GA + Language.EN_US: "en-US", + Language.EN_AU: "en-AU", # Preview + Language.EN_GB: "en-GB", # Preview + Language.EN_IN: "en-IN", # GA + # Estonian (Preview) + Language.ET: "et-EE", + Language.ET_EE: "et-EE", + # Filipino (Preview) + Language.FIL: "fil-PH", + Language.FIL_PH: "fil-PH", + # Finnish (Preview) + Language.FI: "fi-FI", + Language.FI_FI: "fi-FI", + # French + Language.FR: "fr-FR", # GA + Language.FR_FR: "fr-FR", + Language.FR_CA: "fr-CA", # Preview + # Galician (Preview) + Language.GL: "gl-ES", + Language.GL_ES: "gl-ES", + # Georgian (Preview) + Language.KA: "ka-GE", + Language.KA_GE: "ka-GE", + # German (GA) + Language.DE: "de-DE", + Language.DE_DE: "de-DE", + # Greek (Preview) + Language.EL: "el-GR", + Language.EL_GR: "el-GR", + # Gujarati (Preview) + Language.GU: "gu-IN", + Language.GU_IN: "gu-IN", + # Haitian Creole (Preview) + Language.HT: "ht-HT", + Language.HT_HT: "ht-HT", + # Hebrew (Preview) + Language.HE: "he-IL", + Language.HE_IL: "he-IL", + # Hindi (GA) + Language.HI: "hi-IN", + Language.HI_IN: "hi-IN", + # Hungarian (Preview) + Language.HU: "hu-HU", + Language.HU_HU: "hu-HU", + # Icelandic (Preview) + Language.IS: "is-IS", + Language.IS_IS: "is-IS", + # Indonesian (GA) + Language.ID: "id-ID", + Language.ID_ID: "id-ID", + # Italian (GA) + Language.IT: "it-IT", + Language.IT_IT: "it-IT", + # Japanese (GA) + Language.JA: "ja-JP", + Language.JA_JP: "ja-JP", + # Javanese (Preview) + Language.JV: "jv-JV", + Language.JV_JV: "jv-JV", + # Kannada (Preview) + Language.KN: "kn-IN", + Language.KN_IN: "kn-IN", + # Konkani (Preview) + Language.KOK: "kok-IN", + Language.KOK_IN: "kok-IN", + # Korean (GA) + Language.KO: "ko-KR", + Language.KO_KR: "ko-KR", + # Lao (Preview) + Language.LO: "lo-LA", + Language.LO_LA: "lo-LA", + # Latin (Preview) + Language.LA: "la-VA", + Language.LA_VA: "la-VA", + # Latvian (Preview) + Language.LV: "lv-LV", + Language.LV_LV: "lv-LV", + # Lithuanian (Preview) + Language.LT: "lt-LT", + Language.LT_LT: "lt-LT", + # Luxembourgish (Preview) + Language.LB: "lb-LU", + Language.LB_LU: "lb-LU", + # Macedonian (Preview) + Language.MK: "mk-MK", + Language.MK_MK: "mk-MK", + # Maithili (Preview) + Language.MAI: "mai-IN", + Language.MAI_IN: "mai-IN", + # Malagasy (Preview) + Language.MG: "mg-MG", + Language.MG_MG: "mg-MG", + # Malay (Preview) + Language.MS: "ms-MY", + Language.MS_MY: "ms-MY", + # Malayalam (Preview) + Language.ML: "ml-IN", + Language.ML_IN: "ml-IN", + # Marathi (GA) + Language.MR: "mr-IN", + Language.MR_IN: "mr-IN", + # Mongolian (Preview) + Language.MN: "mn-MN", + Language.MN_MN: "mn-MN", + # Nepali (Preview) + Language.NE: "ne-NP", + Language.NE_NP: "ne-NP", + # Norwegian + Language.NO: "nb-NO", # Preview: Bokmål + Language.NB: "nb-NO", + Language.NB_NO: "nb-NO", + Language.NN: "nn-NO", # Preview: Nynorsk + Language.NN_NO: "nn-NO", + # Odia (Preview) + Language.OR: "or-IN", + Language.OR_IN: "or-IN", + # Pashto (Preview) + Language.PS: "ps-AF", + Language.PS_AF: "ps-AF", + # Persian (Preview) + Language.FA: "fa-IR", + Language.FA_IR: "fa-IR", + # Polish (GA) + Language.PL: "pl-PL", + Language.PL_PL: "pl-PL", + # Portuguese + Language.PT: "pt-BR", # GA: Brazil + Language.PT_BR: "pt-BR", + Language.PT_PT: "pt-PT", # Preview: Portugal + # Punjabi (Preview) + Language.PA: "pa-IN", + Language.PA_IN: "pa-IN", + # Romanian (GA) + Language.RO: "ro-RO", + Language.RO_RO: "ro-RO", + # Russian (GA) + Language.RU: "ru-RU", + Language.RU_RU: "ru-RU", + # Serbian (Preview) + Language.SR: "sr-RS", + Language.SR_RS: "sr-RS", + # Sindhi (Preview) + Language.SD: "sd-IN", + Language.SD_IN: "sd-IN", + # Sinhala (Preview) + Language.SI: "si-LK", + Language.SI_LK: "si-LK", + # Slovak (Preview) + Language.SK: "sk-SK", + Language.SK_SK: "sk-SK", + # Slovenian (Preview) + Language.SL: "sl-SI", + Language.SL_SI: "sl-SI", + # Spanish + Language.ES: "es-ES", # GA + Language.ES_ES: "es-ES", + Language.ES_419: "es-419", # Preview: Latin America + Language.ES_MX: "es-MX", # Preview: Mexico + # Swahili (Preview) + Language.SW: "sw-KE", + Language.SW_KE: "sw-KE", + # Swedish (Preview) + Language.SV: "sv-SE", + Language.SV_SE: "sv-SE", + # Tamil (GA) + Language.TA: "ta-IN", + Language.TA_IN: "ta-IN", + # Telugu (GA) + Language.TE: "te-IN", + Language.TE_IN: "te-IN", + # Thai (GA) + Language.TH: "th-TH", + Language.TH_TH: "th-TH", + # Turkish (GA) + Language.TR: "tr-TR", + Language.TR_TR: "tr-TR", + # Ukrainian (GA) + Language.UK: "uk-UA", + Language.UK_UA: "uk-UA", + # Urdu (Preview) + Language.UR: "ur-PK", + Language.UR_PK: "ur-PK", + # Vietnamese (GA) + Language.VI: "vi-VN", + Language.VI_VN: "vi-VN", + } + + return resolve_language(language, LANGUAGE_MAP, use_base_code=False) + + class GoogleHttpTTSService(TTSService): """Google Cloud Text-to-Speech HTTP service with SSML support. @@ -498,7 +742,139 @@ class GoogleHttpTTSService(TTSService): yield ErrorFrame(error=error_message) -class GoogleTTSService(TTSService): +class GoogleBaseTTSService(TTSService): + """Base class for Google Cloud Text-to-Speech streaming services. + + Provides shared streaming synthesis logic for Google TTS services. + This is an abstract base class. Use GoogleTTSService or GeminiTTSService instead. + """ + + def _create_client( + self, credentials: Optional[str], credentials_path: Optional[str] + ) -> texttospeech_v1.TextToSpeechAsyncClient: + """Create authenticated Google Text-to-Speech client. + + Args: + credentials: JSON string with service account credentials. + credentials_path: Path to service account JSON file. + + Returns: + Authenticated TextToSpeechAsyncClient instance. + + Raises: + ValueError: If no valid credentials are provided. + """ + creds: Optional[service_account.Credentials] = None + + if credentials: + # Use provided credentials JSON string + json_account_info = json.loads(credentials) + creds = service_account.Credentials.from_service_account_info(json_account_info) + elif credentials_path: + # Use service account JSON file if provided + creds = service_account.Credentials.from_service_account_file(credentials_path) + else: + try: + creds, project_id = default( + scopes=["https://www.googleapis.com/auth/cloud-platform"] + ) + except GoogleAuthError: + pass + + if not creds: + raise ValueError("No valid credentials provided.") + + return texttospeech_v1.TextToSpeechAsyncClient(credentials=creds) + + def can_generate_metrics(self) -> bool: + """Check if this service can generate processing metrics. + + Returns: + True, as Google streaming TTS services support metrics generation. + """ + return True + + @property + def includes_inter_frame_spaces(self) -> bool: + """Indicates that Google and Gemini TTSTextFrames include necessary inter-frame spaces. + + Returns: + True, indicating that Google's text frames include necessary inter-frame spaces. + """ + return True + + def language_to_service_language(self, language: Language) -> Optional[str]: + """Convert a Language enum to Google TTS language format. + + Args: + language: The language to convert. + + Returns: + The Google TTS-specific language code, or None if not supported. + """ + return language_to_google_tts_language(language) + + async def _stream_tts( + self, + streaming_config: texttospeech_v1.StreamingSynthesizeConfig, + text: str, + prompt: Optional[str] = None, + ) -> AsyncGenerator[Frame, None]: + """Shared streaming synthesis logic. + + Args: + streaming_config: The streaming configuration. + text: The text to synthesize. + prompt: Optional prompt for style instructions (Gemini only). + + Yields: + Frame: Audio frames containing the synthesized speech. + """ + config_request = texttospeech_v1.StreamingSynthesizeRequest( + streaming_config=streaming_config + ) + + async def request_generator(): + yield config_request + synthesis_input_params = {"text": text} + if prompt is not None: + synthesis_input_params["prompt"] = prompt + yield texttospeech_v1.StreamingSynthesizeRequest( + input=texttospeech_v1.StreamingSynthesisInput(**synthesis_input_params) + ) + + streaming_responses = await self._client.streaming_synthesize(request_generator()) + await self.start_tts_usage_metrics(text) + + yield TTSStartedFrame() + + audio_buffer = b"" + first_chunk_for_ttfb = False + + CHUNK_SIZE = self.chunk_size + + async for response in streaming_responses: + chunk = response.audio_content + if not chunk: + continue + + if not first_chunk_for_ttfb: + await self.stop_ttfb_metrics() + first_chunk_for_ttfb = True + + audio_buffer += chunk + while len(audio_buffer) >= CHUNK_SIZE: + piece = audio_buffer[:CHUNK_SIZE] + audio_buffer = audio_buffer[CHUNK_SIZE:] + yield TTSAudioRawFrame(piece, self.sample_rate, 1) + + if audio_buffer: + yield TTSAudioRawFrame(audio_buffer, self.sample_rate, 1) + + yield TTSStoppedFrame() + + +class GoogleTTSService(GoogleBaseTTSService): """Google Cloud Text-to-Speech streaming service. Provides real-time text-to-speech synthesis using Google Cloud's streaming API @@ -570,62 +946,6 @@ class GoogleTTSService(TTSService): credentials, credentials_path ) - def _create_client( - self, credentials: Optional[str], credentials_path: Optional[str] - ) -> texttospeech_v1.TextToSpeechAsyncClient: - creds: Optional[service_account.Credentials] = None - - # Create a Google Cloud service account for the Cloud Text-to-Speech API - # Using either the provided credentials JSON string or the path to a service account JSON - # file, create a Google Cloud service account and use it to authenticate with the API. - if credentials: - # Use provided credentials JSON string - json_account_info = json.loads(credentials) - creds = service_account.Credentials.from_service_account_info(json_account_info) - elif credentials_path: - # Use service account JSON file if provided - creds = service_account.Credentials.from_service_account_file(credentials_path) - else: - try: - creds, project_id = default( - scopes=["https://www.googleapis.com/auth/cloud-platform"] - ) - except GoogleAuthError: - pass - - if not creds: - raise ValueError("No valid credentials provided.") - - return texttospeech_v1.TextToSpeechAsyncClient(credentials=creds) - - def can_generate_metrics(self) -> bool: - """Check if this service can generate processing metrics. - - Returns: - True, as Google streaming TTS service supports metrics generation. - """ - return True - - @property - def includes_inter_frame_spaces(self) -> bool: - """Indicates that Google TTSTextFrames include necessary inter-frame spaces. - - Returns: - True, indicating that Google's text frames include necessary inter-frame spaces. - """ - return True - - def language_to_service_language(self, language: Language) -> Optional[str]: - """Convert a Language enum to Google TTS language format. - - Args: - language: The language to convert. - - Returns: - The Google TTS-specific language code, or None if not supported. - """ - return language_to_google_tts_language(language) - async def _update_settings(self, settings: Mapping[str, Any]): """Override to handle speaking_rate updates for streaming API. @@ -657,6 +977,7 @@ class GoogleTTSService(TTSService): try: await self.start_ttfb_metrics() + # Build voice selection params if self._voice_cloning_key: voice_clone_params = texttospeech_v1.VoiceCloneParams( voice_cloning_key=self._voice_cloning_key @@ -669,6 +990,7 @@ class GoogleTTSService(TTSService): language_code=self._settings["language"], name=self._voice_id ) + # Create streaming config streaming_config = texttospeech_v1.StreamingSynthesizeConfig( voice=voice, streaming_audio_config=texttospeech_v1.StreamingAudioConfig( @@ -677,45 +999,10 @@ class GoogleTTSService(TTSService): speaking_rate=self._settings["speaking_rate"], ), ) - config_request = texttospeech_v1.StreamingSynthesizeRequest( - streaming_config=streaming_config - ) - async def request_generator(): - yield config_request - yield texttospeech_v1.StreamingSynthesizeRequest( - input=texttospeech_v1.StreamingSynthesisInput(text=text) - ) - - streaming_responses = await self._client.streaming_synthesize(request_generator()) - await self.start_tts_usage_metrics(text) - - yield TTSStartedFrame() - - audio_buffer = b"" - first_chunk_for_ttfb = False - - CHUNK_SIZE = self.chunk_size - - async for response in streaming_responses: - chunk = response.audio_content - if not chunk: - continue - - if not first_chunk_for_ttfb: - await self.stop_ttfb_metrics() - first_chunk_for_ttfb = True - - audio_buffer += chunk - while len(audio_buffer) >= CHUNK_SIZE: - piece = audio_buffer[:CHUNK_SIZE] - audio_buffer = audio_buffer[CHUNK_SIZE:] - yield TTSAudioRawFrame(piece, self.sample_rate, 1) - - if audio_buffer: - yield TTSAudioRawFrame(audio_buffer, self.sample_rate, 1) - - yield TTSStoppedFrame() + # Use base class streaming logic + async for frame in self._stream_tts(streaming_config, text): + yield frame except Exception as e: logger.exception(f"{self} error generating TTS: {e}") @@ -723,25 +1010,29 @@ class GoogleTTSService(TTSService): yield ErrorFrame(error=error_message) -class GeminiTTSService(TTSService): - """Gemini Text-to-Speech service using Gemini TTS models. +class GeminiTTSService(GoogleBaseTTSService): + """Gemini Text-to-Speech streaming service using Gemini TTS models. - Provides text-to-speech synthesis using Gemini's TTS-specific models - (gemini-2.5-flash-preview-tts and gemini-2.5-pro-preview-tts) with - support for natural voice control, multiple speakers, and voice styles. + Provides real-time text-to-speech synthesis using Gemini's TTS-specific models + (gemini-2.5-flash-tts and gemini-2.5-pro-tts) with support for natural + voice control, prompts for style instructions, expressive markup tags, + and multi-speaker conversations. Note: - Requires Google AI API key. This uses the Gemini API, not Google Cloud TTS. - Audio-out is currently a preview feature. + Requires Google Cloud credentials via service account JSON, credentials file, + or default application credentials (GOOGLE_APPLICATION_CREDENTIALS). + + Uses the Google Cloud Text-to-Speech streaming API for low-latency synthesis. Example:: tts = GeminiTTSService( - api_key="your-google-ai-api-key", - model="gemini-2.5-flash-preview-tts", + credentials_path="/path/to/service-account.json", + model="gemini-2.5-flash-tts", voice_id="Kore", params=GeminiTTSService.InputParams( language=Language.EN_US, + prompt="Say this in a friendly and helpful tone" ) ) """ @@ -750,36 +1041,36 @@ class GeminiTTSService(TTSService): # List of available Gemini TTS voices AVAILABLE_VOICES = [ - "Zephyr", - "Puck", + "Achernar", + "Achird", + "Algenib", + "Algieba", + "Alnilam", + "Aoede", + "Autonoe", + "Callirhoe", "Charon", - "Kore", + "Despina", + "Enceladus", + "Erinome", "Fenrir", + "Gacrux", + "Iapetus", + "Kore", + "Laomedeia", "Leda", "Orus", - "Aoede", - "Callirhoe", - "Autonoe", - "Enceladus", - "Iapetus", - "Umbriel", - "Algieba", - "Despina", - "Erinome", - "Algenib", - "Rasalgethi", - "Laomedeia", - "Achernar", - "Alnilam", - "Schedar", - "Gacrux", + "Puck", "Pulcherrima", - "Achird", - "Zubenelgenubi", - "Vindemiatrix", + "Rasalgethi", "Sadachbia", "Sadaltager", + "Schedar", "Sulafar", + "Umbriel", + "Vindemiatrix", + "Zephyr", + "Zubenelgenubi", ] class InputParams(BaseModel): @@ -787,19 +1078,23 @@ class GeminiTTSService(TTSService): Parameters: language: Language for synthesis. Defaults to English. + prompt: Optional style instructions for how to synthesize the content. multi_speaker: Whether to enable multi-speaker support. speaker_configs: List of speaker configurations for multi-speaker mode. """ language: Optional[Language] = Language.EN + prompt: Optional[str] = None multi_speaker: bool = False speaker_configs: Optional[List[dict]] = None def __init__( self, *, - api_key: str, - model: str = "gemini-2.5-flash-preview-tts", + api_key: Optional[str] = None, + model: str = "gemini-2.5-flash-tts", + credentials: Optional[str] = None, + credentials_path: Optional[str] = None, voice_id: str = "Kore", sample_rate: Optional[int] = None, params: Optional[InputParams] = None, @@ -808,14 +1103,30 @@ class GeminiTTSService(TTSService): """Initializes the Gemini TTS service. Args: - api_key: Google AI API key for authentication. + api_key: + + .. deprecated:: 0.0.95 + The `api_key` parameter is deprecated. Use `credentials` or + `credentials_path` instead for Google Cloud authentication. + model: Gemini TTS model to use. Must be a TTS model like - "gemini-2.5-flash-preview-tts" or "gemini-2.5-pro-preview-tts". + "gemini-2.5-flash-tts" or "gemini-2.5-pro-tts". + credentials: JSON string containing Google Cloud service account credentials. + credentials_path: Path to Google Cloud service account JSON file. voice_id: Voice name from the available Gemini voices. sample_rate: Audio sample rate in Hz. If None, uses Google's default 24kHz. params: TTS configuration parameters. **kwargs: Additional arguments passed to parent TTSService. """ + # Handle deprecated api_key parameter + if api_key is not None: + warnings.warn( + "The 'api_key' parameter is deprecated and will be removed in a future version. " + "Use 'credentials' or 'credentials_path' instead for Google Cloud authentication.", + DeprecationWarning, + stacklevel=2, + ) + if sample_rate and sample_rate != self.GOOGLE_SAMPLE_RATE: logger.warning( f"Google TTS only supports {self.GOOGLE_SAMPLE_RATE}Hz sample rate. " @@ -828,35 +1139,20 @@ class GeminiTTSService(TTSService): if voice_id not in self.AVAILABLE_VOICES: logger.warning(f"Voice '{voice_id}' not in known voices list. Using anyway.") - self._api_key = api_key self._model = model self._voice_id = voice_id self._settings = { "language": self.language_to_service_language(params.language) if params.language else "en-US", + "prompt": params.prompt, "multi_speaker": params.multi_speaker, "speaker_configs": params.speaker_configs, } - self._client = genai.Client(api_key=api_key) - - def can_generate_metrics(self) -> bool: - """Check if this service can generate processing metrics. - - Returns: - True, as Gemini TTS service supports metrics generation. - """ - return True - - @property - def includes_inter_frame_spaces(self) -> bool: - """Indicates that Gemini TTSTextFrames include necessary inter-frame spaces. - - Returns: - True, indicating that Gemini's text frames include necessary inter-frame spaces. - """ - return True + self._client: texttospeech_v1.TextToSpeechAsyncClient = self._create_client( + credentials, credentials_path + ) def language_to_service_language(self, language: Language) -> Optional[str]: """Convert a Language enum to Gemini TTS language format. @@ -867,7 +1163,7 @@ class GeminiTTSService(TTSService): Returns: The Gemini TTS-specific language code, or None if not supported. """ - return language_to_google_tts_language(language) + return language_to_gemini_tts_language(language) def set_voice(self, voice_id: str): """Set the voice for TTS generation. @@ -892,88 +1188,73 @@ class GeminiTTSService(TTSService): f"Current rate of {self.sample_rate}Hz may cause issues." ) - @traced_tts - async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: - """Generate speech from text using Gemini TTS models. + async def _update_settings(self, settings: Mapping[str, Any]): + """Override to handle prompt updates. Args: - text: The text to synthesize into speech. Can include natural language - instructions for style, tone, etc. + settings: Dictionary of settings to update. Can include 'prompt' (str) + """ + if "prompt" in settings: + self._settings["prompt"] = settings["prompt"] + await super()._update_settings(settings) + + @traced_tts + async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: + """Generate streaming speech from text using Gemini TTS models. + + Args: + text: The text to synthesize into speech. Can include markup tags + like [sigh], [laughing], [whispering] for expressive control. Yields: - Frame: Audio frames containing the synthesized speech. + Frame: Audio frames containing the synthesized speech as it's generated. """ logger.debug(f"{self}: Generating TTS [{text}]") try: await self.start_ttfb_metrics() - # Build the speech config + # Build voice selection params if self._settings["multi_speaker"] and self._settings["speaker_configs"]: # Multi-speaker mode speaker_voice_configs = [] for speaker_config in self._settings["speaker_configs"]: speaker_voice_configs.append( - types.SpeakerVoiceConfig( - speaker=speaker_config["speaker"], - voice_config=types.VoiceConfig( - prebuilt_voice_config=types.PrebuiltVoiceConfig( - voice_name=speaker_config.get("voice_id", self._voice_id) - ) - ), + texttospeech_v1.MultispeakerPrebuiltVoice( + speaker_alias=speaker_config["speaker_alias"], + speaker_id=speaker_config.get("speaker_id", self._voice_id), ) ) - speech_config = types.SpeechConfig( - multi_speaker_voice_config=types.MultiSpeakerVoiceConfig( - speaker_voice_configs=speaker_voice_configs - ) + multi_speaker_voice_config = texttospeech_v1.MultiSpeakerVoiceConfig( + speaker_voice_configs=speaker_voice_configs + ) + + voice = texttospeech_v1.VoiceSelectionParams( + language_code=self._settings["language"], + model_name=self._model, + multi_speaker_voice_config=multi_speaker_voice_config, ) else: # Single speaker mode - speech_config = types.SpeechConfig( - voice_config=types.VoiceConfig( - prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=self._voice_id) - ) + voice = texttospeech_v1.VoiceSelectionParams( + language_code=self._settings["language"], + name=self._voice_id, + model_name=self._model, ) - # Create the generation config - generation_config = types.GenerateContentConfig( - response_modalities=["AUDIO"], - speech_config=speech_config, + # Create streaming config + streaming_config = texttospeech_v1.StreamingSynthesizeConfig( + voice=voice, + streaming_audio_config=texttospeech_v1.StreamingAudioConfig( + audio_encoding=texttospeech_v1.AudioEncoding.PCM, + sample_rate_hertz=self.sample_rate, + ), ) - # Generate the content - response = await self._client.aio.models.generate_content( - model=self._model, - contents=text, - config=generation_config, - ) - - await self.start_tts_usage_metrics(text) - - yield TTSStartedFrame() - - # Extract audio data from response - if response.candidates and len(response.candidates) > 0: - candidate = response.candidates[0] - if candidate.content and candidate.content.parts: - for part in candidate.content.parts: - if part.inline_data and part.inline_data.mime_type.startswith("audio/"): - audio_data = part.inline_data.data - await self.stop_ttfb_metrics() - - # Gemini TTS returns PCM audio data, chunk it appropriately - CHUNK_SIZE = self.chunk_size - - for i in range(0, len(audio_data), CHUNK_SIZE): - chunk = audio_data[i : i + CHUNK_SIZE] - if not chunk: - break - frame = TTSAudioRawFrame(chunk, self.sample_rate, 1) - yield frame - - yield TTSStoppedFrame() + # Use base class streaming logic with prompt support + async for frame in self._stream_tts(streaming_config, text, self._settings["prompt"]): + yield frame except Exception as e: logger.exception(f"{self} error generating TTS: {e}") diff --git a/src/pipecat/transcriptions/language.py b/src/pipecat/transcriptions/language.py index cc84346c7..01c75d49f 100644 --- a/src/pipecat/transcriptions/language.py +++ b/src/pipecat/transcriptions/language.py @@ -66,6 +66,7 @@ class Language(StrEnum): AR_TN = "ar-TN" AR_XA = "ar-XA" AR_YE = "ar-YE" + AR_001 = "ar-001" # Assamese AS = "as" @@ -83,6 +84,7 @@ class Language(StrEnum): # Belarusian BE = "be" + BE_BY = "be-BY" # Bulgarian BG = "bg" @@ -109,6 +111,7 @@ class Language(StrEnum): # Cebuano CEB = "ceb" + CEB_PH = "ceb-PH" # Mandarin Chinese CMN = "cmn" @@ -181,6 +184,7 @@ class Language(StrEnum): ES_US = "es-US" ES_UY = "es-UY" ES_VE = "es-VE" + ES_419 = "es-419" # Estonian ET = "et" @@ -250,6 +254,7 @@ class Language(StrEnum): # Haitian Creole HT = "ht" + HT_HT = "ht-HT" # Hungarian HU = "hu" @@ -288,6 +293,7 @@ class Language(StrEnum): # Javanese JV = "jv" JV_ID = "jv-ID" + JV_JV = "jv-JV" JW = "jw" # Fal requires for Javanese # Georgian @@ -309,6 +315,10 @@ class Language(StrEnum): KN = "kn" KN_IN = "kn-IN" + # Konkani + KOK = "kok" + KOK_IN = "kok-IN" + # Korean KO = "ko" KO_KR = "ko-KR" @@ -322,9 +332,11 @@ class Language(StrEnum): # Latin LA = "la" + LA_VA = "la-VA" # Luxembourgish LB = "lb" + LB_LU = "lb-LU" # Lingala LN = "ln" @@ -349,6 +361,7 @@ class Language(StrEnum): # Malagasy MG = "mg" + MG_MG = "mg-MG" # Maori MI = "mi" @@ -357,6 +370,10 @@ class Language(StrEnum): MK = "mk" MK_MK = "mk-MK" + # Maithili + MAI = "mai" + MAI_IN = "mai-IN" + # Malayalam ML = "ml" ML_IN = "ml-IN" @@ -387,6 +404,7 @@ class Language(StrEnum): NB_NO = "nb-NO" NO = "no" NN = "nn" # Norwegian Nynorsk + NN_NO = "nn-NO" # Nepali NE = "ne" @@ -440,6 +458,7 @@ class Language(StrEnum): # Sindhi SD = "sd" + SD_IN = "sd-IN" # Sinhala SI = "si"