Update GeminiTTSService for streaming, other Google TTS improvements

2025-11-12 09:43:52 -05:00
parent d823a3edec
commit edbf96b3c5
4 changed files with 580 additions and 273 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 - Added Hindi support for Rime TTS services.

+- Updated `GeminiTTSService` to use the Google Cloud Text-to-Speech streaming
+  API instead of the deprecated Gemini API. It now authenticates with
+  `credentials` / `credentials_path`; the `api_key` parameter is deprecated.
+  Also added a `prompt` parameter for style instructions and support for
+  expressive markup tags. Streaming synthesis significantly improves
+  latency.
+
+- Updated language mappings for the Google and Gemini TTS services to match
+  the official documentation.
+
+### Deprecated
+
+- The `api_key` parameter in `GeminiTTSService` is deprecated. Use
+  `credentials` or `credentials_path` instead for Google Cloud authentication.
+
 ### Fixed

 - Fixed subtle issue of assistant context messages ending up with double spaces
--- a/examples/foundational/07n-interruptible-gemini.py
+++ b/examples/foundational/07n-interruptible-gemini.py
@@ -4,24 +4,6 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #

-"""
-A conversational AI bot using Gemini for both LLM and TTS.
-
-This example demonstrates how to use Gemini's TTS capabilities with the new
-GeminiTTSService, which uses Gemini's TTS-specific models instead of Google Cloud TTS.
-
-Features showcased:
- Gemini LLM for conversation
- Gemini TTS with natural voice control
- Support for different voice personalities
- Style and tone control through natural language prompts
-
-Run with:
-    python examples/foundational/gemini-tts.py
-
-Make sure to set your environment variables:
-    export GOOGLE_API_KEY=your_api_key_here
-"""

 import os

@@ -84,10 +66,13 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    )

    tts = GeminiTTSService(
-        api_key=os.getenv("GOOGLE_API_KEY"),
-        model="gemini-2.5-flash-preview-tts",  # TTS-specific model
+        credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
+        model="gemini-2.5-flash-tts",
        voice_id="Charon",
-        params=GeminiTTSService.InputParams(language=Language.EN_US),
+        params=GeminiTTSService.InputParams(
+            language=Language.EN_US,
+            prompt="You are a helpful AI assistant. Speak in a natural, conversational tone.",
+        ),
    )

    llm = GoogleLLMService(
@@ -101,13 +86,20 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
            "role": "system",
            "content": """You are a helpful AI assistant in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way.

-            IMPORTANT: Since you're using Gemini TTS which supports natural voice control, you can include speaking instructions in your responses. For example:
-            - "Say cheerfully: Welcome to our conversation!"
-            - "Read this in a calm, professional tone: Here are the details you requested."
-            - "Speak in an excited whisper: I have some great news to share!"
-            - "Say slowly and clearly: Let me explain this step by step."
+            IMPORTANT: You're using Gemini TTS which supports expressive markup tags. You can use these tags in your responses:
+            - [sigh] - Insert a sigh sound
+            - [laughing] - Insert a laugh
+            - [uhm] - Insert a hesitation sound
+            - [whispering] - Speak the next part in a whisper
+            - [shouting] - Speak the next part louder
+            - [extremely fast] - Speak the next part very quickly
+            - [short pause], [medium pause], [long pause] - Add pauses for dramatic effect

-            Feel free to use natural language instructions to control your voice style, tone, pace, and emotion. The TTS system will interpret these instructions and adjust the speech accordingly.
+            Examples:
+            - "Well [sigh] that's a tricky question."
+            - "[laughing] That's a great joke!"
+            - "[whispering] Let me tell you a secret."
+            - "The answer is... [long pause] ...42!"

            Your output will be converted to audio, so avoid special characters in your answers. Respond to what the user said in a creative and helpful way.""",
        },
@@ -140,11 +132,11 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        logger.info(f"Client connected")
-        # Kick off the conversation with a styled introduction
+        # Kick off the conversation
        messages.append(
            {
                "role": "system",
-                "content": "Say cheerfully and warmly: Hello! I'm your AI assistant powered by Gemini's new TTS technology. I can speak with different voices, tones, and styles. How can I help you today?",
+                "content": "Hello! I'm your AI assistant. I can help you with a variety of tasks. What would you like to know?",
            }
        )
        await task.queue_frames([LLMRunFrame()])
--- a/src/pipecat/services/google/tts.py
+++ b/src/pipecat/services/google/tts.py
@@ -16,6 +16,7 @@ for natural voice control and multi-speaker conversations.

 import json
 import os
+import warnings

 from pipecat.utils.tracing.service_decorators import traced_tts

@@ -51,19 +52,13 @@ except ModuleNotFoundError as e:
    )
    raise Exception(f"Missing module: {e}")

-try:
-    from google import genai
-    from google.genai import types
-
-except ModuleNotFoundError as e:
-    logger.error(f"Exception: {e}")
-    logger.error("In order to use Gemini TTS, you need to `pip install pipecat-ai[google]`.")
-    raise Exception(f"Missing module: {e}")
-

 def language_to_google_tts_language(language: Language) -> Optional[str]:
    """Convert a Language enum to Google TTS language code.

+    Source:
+    https://docs.cloud.google.com/text-to-speech/docs/chirp3-hd
+
    Args:
        language: The Language enum value to convert.

@@ -71,9 +66,6 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
        The corresponding Google TTS language code, or None if not supported.
    """
    LANGUAGE_MAP = {
-        # Afrikaans
-        Language.AF: "af-ZA",
-        Language.AF_ZA: "af-ZA",
        # Arabic
        Language.AR: "ar-XA",
        # Bengali
@@ -82,14 +74,9 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
        # Bulgarian
        Language.BG: "bg-BG",
        Language.BG_BG: "bg-BG",
-        # Catalan
-        Language.CA: "ca-ES",
-        Language.CA_ES: "ca-ES",
-        # Chinese (Mandarin and Cantonese)
-        Language.ZH: "cmn-CN",
-        Language.ZH_CN: "cmn-CN",
-        Language.ZH_TW: "cmn-TW",
-        Language.ZH_HK: "yue-HK",
+        # Croatian
+        Language.HR: "hr-HR",
+        Language.HR_HR: "hr-HR",
        # Czech
        Language.CS: "cs-CZ",
        Language.CS_CZ: "cs-CZ",
@@ -109,9 +96,6 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
        # Estonian
        Language.ET: "et-EE",
        Language.ET_EE: "et-EE",
-        # Filipino
-        Language.FIL: "fil-PH",
-        Language.FIL_PH: "fil-PH",
        # Finnish
        Language.FI: "fi-FI",
        Language.FI_FI: "fi-FI",
@@ -119,9 +103,6 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
        Language.FR: "fr-FR",
        Language.FR_CA: "fr-CA",
        Language.FR_FR: "fr-FR",
-        # Galician
-        Language.GL: "gl-ES",
-        Language.GL_ES: "gl-ES",
        # German
        Language.DE: "de-DE",
        Language.DE_DE: "de-DE",
@@ -140,9 +121,6 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
        # Hungarian
        Language.HU: "hu-HU",
        Language.HU_HU: "hu-HU",
-        # Icelandic
-        Language.IS: "is-IS",
-        Language.IS_IS: "is-IS",
        # Indonesian
        Language.ID: "id-ID",
        Language.ID_ID: "id-ID",
@@ -164,12 +142,12 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
        # Lithuanian
        Language.LT: "lt-LT",
        Language.LT_LT: "lt-LT",
-        # Malay
-        Language.MS: "ms-MY",
-        Language.MS_MY: "ms-MY",
        # Malayalam
        Language.ML: "ml-IN",
        Language.ML_IN: "ml-IN",
+        # Chinese (Mandarin)
+        Language.ZH: "cmn-CN",
+        Language.ZH_CN: "cmn-CN",
        # Marathi
        Language.MR: "mr-IN",
        Language.MR_IN: "mr-IN",
@@ -181,12 +159,8 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
        Language.PL: "pl-PL",
        Language.PL_PL: "pl-PL",
        # Portuguese
-        Language.PT: "pt-PT",
+        Language.PT: "pt-BR",
        Language.PT_BR: "pt-BR",
-        Language.PT_PT: "pt-PT",
-        # Punjabi
-        Language.PA: "pa-IN",
-        Language.PA_IN: "pa-IN",
        # Romanian
        Language.RO: "ro-RO",
        Language.RO_RO: "ro-RO",
@@ -199,10 +173,16 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
        # Slovak
        Language.SK: "sk-SK",
        Language.SK_SK: "sk-SK",
+        # Slovenian
+        Language.SL: "sl-SI",
+        Language.SL_SI: "sl-SI",
        # Spanish
        Language.ES: "es-ES",
        Language.ES_ES: "es-ES",
        Language.ES_US: "es-US",
+        # Swahili
+        Language.SW: "sw-KE",
+        Language.SW_KE: "sw-KE",
        # Swedish
        Language.SV: "sv-SE",
        Language.SV_SE: "sv-SE",
@@ -221,6 +201,9 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
        # Ukrainian
        Language.UK: "uk-UA",
        Language.UK_UA: "uk-UA",
+        # Urdu
+        Language.UR: "ur-IN",
+        Language.UR_IN: "ur-IN",
        # Vietnamese
        Language.VI: "vi-VN",
        Language.VI_VN: "vi-VN",
@@ -229,6 +212,267 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
    return resolve_language(language, LANGUAGE_MAP, use_base_code=False)


+def language_to_gemini_tts_language(language: Language) -> Optional[str]:
+    """Convert a Language enum to Gemini TTS language code.
+
+    Source:
+    https://docs.cloud.google.com/text-to-speech/docs/gemini-tts#available_languages
+
+    Args:
+        language: The Language enum value to convert.
+
+    Returns:
+        The corresponding Gemini TTS language code, or None if not supported.
+    """
+    LANGUAGE_MAP = {
+        # Afrikaans (Preview)
+        Language.AF: "af-ZA",
+        Language.AF_ZA: "af-ZA",
+        # Albanian (Preview)
+        Language.SQ: "sq-AL",
+        Language.SQ_AL: "sq-AL",
+        # Amharic (Preview)
+        Language.AM: "am-ET",
+        Language.AM_ET: "am-ET",
+        # Arabic
+        Language.AR: "ar-EG",  # GA: Egypt
+        Language.AR_EG: "ar-EG",
+        Language.AR_001: "ar-001",  # Preview: World
+        # Armenian (Preview)
+        Language.HY: "hy-AM",
+        Language.HY_AM: "hy-AM",
+        # Azerbaijani (Preview)
+        Language.AZ: "az-AZ",
+        Language.AZ_AZ: "az-AZ",
+        # Basque (Preview)
+        Language.EU: "eu-ES",
+        Language.EU_ES: "eu-ES",
+        # Belarusian (Preview)
+        Language.BE: "be-BY",
+        Language.BE_BY: "be-BY",
+        # Bengali (GA)
+        Language.BN: "bn-BD",
+        Language.BN_BD: "bn-BD",
+        # Bulgarian (Preview)
+        Language.BG: "bg-BG",
+        Language.BG_BG: "bg-BG",
+        # Burmese (Preview)
+        Language.MY: "my-MM",
+        Language.MY_MM: "my-MM",
+        # Catalan (Preview)
+        Language.CA: "ca-ES",
+        Language.CA_ES: "ca-ES",
+        # Cebuano (Preview)
+        Language.CEB: "ceb-PH",
+        Language.CEB_PH: "ceb-PH",
+        # Chinese (Mandarin)
+        Language.ZH: "cmn-CN",  # Preview
+        Language.ZH_CN: "cmn-CN",
+        Language.ZH_TW: "cmn-TW",  # Preview
+        # Croatian (Preview)
+        Language.HR: "hr-HR",
+        Language.HR_HR: "hr-HR",
+        # Czech (Preview)
+        Language.CS: "cs-CZ",
+        Language.CS_CZ: "cs-CZ",
+        # Danish (Preview)
+        Language.DA: "da-DK",
+        Language.DA_DK: "da-DK",
+        # Dutch (GA)
+        Language.NL: "nl-NL",
+        Language.NL_NL: "nl-NL",
+        # English
+        Language.EN: "en-US",  # GA
+        Language.EN_US: "en-US",
+        Language.EN_AU: "en-AU",  # Preview
+        Language.EN_GB: "en-GB",  # Preview
+        Language.EN_IN: "en-IN",  # GA
+        # Estonian (Preview)
+        Language.ET: "et-EE",
+        Language.ET_EE: "et-EE",
+        # Filipino (Preview)
+        Language.FIL: "fil-PH",
+        Language.FIL_PH: "fil-PH",
+        # Finnish (Preview)
+        Language.FI: "fi-FI",
+        Language.FI_FI: "fi-FI",
+        # French
+        Language.FR: "fr-FR",  # GA
+        Language.FR_FR: "fr-FR",
+        Language.FR_CA: "fr-CA",  # Preview
+        # Galician (Preview)
+        Language.GL: "gl-ES",
+        Language.GL_ES: "gl-ES",
+        # Georgian (Preview)
+        Language.KA: "ka-GE",
+        Language.KA_GE: "ka-GE",
+        # German (GA)
+        Language.DE: "de-DE",
+        Language.DE_DE: "de-DE",
+        # Greek (Preview)
+        Language.EL: "el-GR",
+        Language.EL_GR: "el-GR",
+        # Gujarati (Preview)
+        Language.GU: "gu-IN",
+        Language.GU_IN: "gu-IN",
+        # Haitian Creole (Preview)
+        Language.HT: "ht-HT",
+        Language.HT_HT: "ht-HT",
+        # Hebrew (Preview)
+        Language.HE: "he-IL",
+        Language.HE_IL: "he-IL",
+        # Hindi (GA)
+        Language.HI: "hi-IN",
+        Language.HI_IN: "hi-IN",
+        # Hungarian (Preview)
+        Language.HU: "hu-HU",
+        Language.HU_HU: "hu-HU",
+        # Icelandic (Preview)
+        Language.IS: "is-IS",
+        Language.IS_IS: "is-IS",
+        # Indonesian (GA)
+        Language.ID: "id-ID",
+        Language.ID_ID: "id-ID",
+        # Italian (GA)
+        Language.IT: "it-IT",
+        Language.IT_IT: "it-IT",
+        # Japanese (GA)
+        Language.JA: "ja-JP",
+        Language.JA_JP: "ja-JP",
+        # Javanese (Preview)
+        Language.JV: "jv-JV",
+        Language.JV_JV: "jv-JV",
+        # Kannada (Preview)
+        Language.KN: "kn-IN",
+        Language.KN_IN: "kn-IN",
+        # Konkani (Preview)
+        Language.KOK: "kok-IN",
+        Language.KOK_IN: "kok-IN",
+        # Korean (GA)
+        Language.KO: "ko-KR",
+        Language.KO_KR: "ko-KR",
+        # Lao (Preview)
+        Language.LO: "lo-LA",
+        Language.LO_LA: "lo-LA",
+        # Latin (Preview)
+        Language.LA: "la-VA",
+        Language.LA_VA: "la-VA",
+        # Latvian (Preview)
+        Language.LV: "lv-LV",
+        Language.LV_LV: "lv-LV",
+        # Lithuanian (Preview)
+        Language.LT: "lt-LT",
+        Language.LT_LT: "lt-LT",
+        # Luxembourgish (Preview)
+        Language.LB: "lb-LU",
+        Language.LB_LU: "lb-LU",
+        # Macedonian (Preview)
+        Language.MK: "mk-MK",
+        Language.MK_MK: "mk-MK",
+        # Maithili (Preview)
+        Language.MAI: "mai-IN",
+        Language.MAI_IN: "mai-IN",
+        # Malagasy (Preview)
+        Language.MG: "mg-MG",
+        Language.MG_MG: "mg-MG",
+        # Malay (Preview)
+        Language.MS: "ms-MY",
+        Language.MS_MY: "ms-MY",
+        # Malayalam (Preview)
+        Language.ML: "ml-IN",
+        Language.ML_IN: "ml-IN",
+        # Marathi (GA)
+        Language.MR: "mr-IN",
+        Language.MR_IN: "mr-IN",
+        # Mongolian (Preview)
+        Language.MN: "mn-MN",
+        Language.MN_MN: "mn-MN",
+        # Nepali (Preview)
+        Language.NE: "ne-NP",
+        Language.NE_NP: "ne-NP",
+        # Norwegian
+        Language.NO: "nb-NO",  # Preview: Bokmål
+        Language.NB: "nb-NO",
+        Language.NB_NO: "nb-NO",
+        Language.NN: "nn-NO",  # Preview: Nynorsk
+        Language.NN_NO: "nn-NO",
+        # Odia (Preview)
+        Language.OR: "or-IN",
+        Language.OR_IN: "or-IN",
+        # Pashto (Preview)
+        Language.PS: "ps-AF",
+        Language.PS_AF: "ps-AF",
+        # Persian (Preview)
+        Language.FA: "fa-IR",
+        Language.FA_IR: "fa-IR",
+        # Polish (GA)
+        Language.PL: "pl-PL",
+        Language.PL_PL: "pl-PL",
+        # Portuguese
+        Language.PT: "pt-BR",  # GA: Brazil
+        Language.PT_BR: "pt-BR",
+        Language.PT_PT: "pt-PT",  # Preview: Portugal
+        # Punjabi (Preview)
+        Language.PA: "pa-IN",
+        Language.PA_IN: "pa-IN",
+        # Romanian (GA)
+        Language.RO: "ro-RO",
+        Language.RO_RO: "ro-RO",
+        # Russian (GA)
+        Language.RU: "ru-RU",
+        Language.RU_RU: "ru-RU",
+        # Serbian (Preview)
+        Language.SR: "sr-RS",
+        Language.SR_RS: "sr-RS",
+        # Sindhi (Preview)
+        Language.SD: "sd-IN",
+        Language.SD_IN: "sd-IN",
+        # Sinhala (Preview)
+        Language.SI: "si-LK",
+        Language.SI_LK: "si-LK",
+        # Slovak (Preview)
+        Language.SK: "sk-SK",
+        Language.SK_SK: "sk-SK",
+        # Slovenian (Preview)
+        Language.SL: "sl-SI",
+        Language.SL_SI: "sl-SI",
+        # Spanish
+        Language.ES: "es-ES",  # GA
+        Language.ES_ES: "es-ES",
+        Language.ES_419: "es-419",  # Preview: Latin America
+        Language.ES_MX: "es-MX",  # Preview: Mexico
+        # Swahili (Preview)
+        Language.SW: "sw-KE",
+        Language.SW_KE: "sw-KE",
+        # Swedish (Preview)
+        Language.SV: "sv-SE",
+        Language.SV_SE: "sv-SE",
+        # Tamil (GA)
+        Language.TA: "ta-IN",
+        Language.TA_IN: "ta-IN",
+        # Telugu (GA)
+        Language.TE: "te-IN",
+        Language.TE_IN: "te-IN",
+        # Thai (GA)
+        Language.TH: "th-TH",
+        Language.TH_TH: "th-TH",
+        # Turkish (GA)
+        Language.TR: "tr-TR",
+        Language.TR_TR: "tr-TR",
+        # Ukrainian (GA)
+        Language.UK: "uk-UA",
+        Language.UK_UA: "uk-UA",
+        # Urdu (Preview)
+        Language.UR: "ur-PK",
+        Language.UR_PK: "ur-PK",
+        # Vietnamese (GA)
+        Language.VI: "vi-VN",
+        Language.VI_VN: "vi-VN",
+    }
+
+    return resolve_language(language, LANGUAGE_MAP, use_base_code=False)
+
+
 class GoogleHttpTTSService(TTSService):
    """Google Cloud Text-to-Speech HTTP service with SSML support.

@@ -498,7 +742,139 @@ class GoogleHttpTTSService(TTSService):
            yield ErrorFrame(error=error_message)


-class GoogleTTSService(TTSService):
+class GoogleBaseTTSService(TTSService):
+    """Base class for Google Cloud Text-to-Speech streaming services.
+
+    Provides shared streaming synthesis logic for Google TTS services.
+    This is an abstract base class. Use GoogleTTSService or GeminiTTSService instead.
+    """
+
+    def _create_client(
+        self, credentials: Optional[str], credentials_path: Optional[str]
+    ) -> texttospeech_v1.TextToSpeechAsyncClient:
+        """Create authenticated Google Text-to-Speech client.
+
+        Args:
+            credentials: JSON string with service account credentials.
+            credentials_path: Path to service account JSON file.
+
+        Returns:
+            Authenticated TextToSpeechAsyncClient instance.
+
+        Raises:
+            ValueError: If no valid credentials are provided.
+        """
+        creds: Optional[service_account.Credentials] = None
+
+        if credentials:
+            # Use provided credentials JSON string
+            json_account_info = json.loads(credentials)
+            creds = service_account.Credentials.from_service_account_info(json_account_info)
+        elif credentials_path:
+            # Use service account JSON file if provided
+            creds = service_account.Credentials.from_service_account_file(credentials_path)
+        else:
+            try:
+                creds, project_id = default(
+                    scopes=["https://www.googleapis.com/auth/cloud-platform"]
+                )
+            except GoogleAuthError:
+                pass
+
+        if not creds:
+            raise ValueError("No valid credentials provided.")
+
+        return texttospeech_v1.TextToSpeechAsyncClient(credentials=creds)
+
+    def can_generate_metrics(self) -> bool:
+        """Check if this service can generate processing metrics.
+
+        Returns:
+            True, as Google streaming TTS services support metrics generation.
+        """
+        return True
+
+    @property
+    def includes_inter_frame_spaces(self) -> bool:
+        """Indicates that Google and Gemini TTSTextFrames include necessary inter-frame spaces.
+
+        Returns:
+            True, indicating that Google's text frames include necessary inter-frame spaces.
+        """
+        return True
+
+    def language_to_service_language(self, language: Language) -> Optional[str]:
+        """Convert a Language enum to Google TTS language format.
+
+        Args:
+            language: The language to convert.
+
+        Returns:
+            The Google TTS-specific language code, or None if not supported.
+        """
+        return language_to_google_tts_language(language)
+
+    async def _stream_tts(
+        self,
+        streaming_config: texttospeech_v1.StreamingSynthesizeConfig,
+        text: str,
+        prompt: Optional[str] = None,
+    ) -> AsyncGenerator[Frame, None]:
+        """Shared streaming synthesis logic.
+
+        Args:
+            streaming_config: The streaming configuration.
+            text: The text to synthesize.
+            prompt: Optional prompt for style instructions (Gemini only).
+
+        Yields:
+            Frame: Audio frames containing the synthesized speech.
+        """
+        config_request = texttospeech_v1.StreamingSynthesizeRequest(
+            streaming_config=streaming_config
+        )
+
+        async def request_generator():
+            yield config_request
+            synthesis_input_params = {"text": text}
+            if prompt is not None:
+                synthesis_input_params["prompt"] = prompt
+            yield texttospeech_v1.StreamingSynthesizeRequest(
+                input=texttospeech_v1.StreamingSynthesisInput(**synthesis_input_params)
+            )
+
+        streaming_responses = await self._client.streaming_synthesize(request_generator())
+        await self.start_tts_usage_metrics(text)
+
+        yield TTSStartedFrame()
+
+        audio_buffer = b""
+        first_chunk_for_ttfb = False
+
+        CHUNK_SIZE = self.chunk_size
+
+        async for response in streaming_responses:
+            chunk = response.audio_content
+            if not chunk:
+                continue
+
+            if not first_chunk_for_ttfb:
+                await self.stop_ttfb_metrics()
+                first_chunk_for_ttfb = True
+
+            audio_buffer += chunk
+            while len(audio_buffer) >= CHUNK_SIZE:
+                piece = audio_buffer[:CHUNK_SIZE]
+                audio_buffer = audio_buffer[CHUNK_SIZE:]
+                yield TTSAudioRawFrame(piece, self.sample_rate, 1)
+
+        if audio_buffer:
+            yield TTSAudioRawFrame(audio_buffer, self.sample_rate, 1)
+
+        yield TTSStoppedFrame()
+
+
+class GoogleTTSService(GoogleBaseTTSService):
    """Google Cloud Text-to-Speech streaming service.

    Provides real-time text-to-speech synthesis using Google Cloud's streaming API
@@ -570,62 +946,6 @@ class GoogleTTSService(TTSService):
            credentials, credentials_path
        )

-    def _create_client(
-        self, credentials: Optional[str], credentials_path: Optional[str]
-    ) -> texttospeech_v1.TextToSpeechAsyncClient:
-        creds: Optional[service_account.Credentials] = None
-
-        # Create a Google Cloud service account for the Cloud Text-to-Speech API
-        # Using either the provided credentials JSON string or the path to a service account JSON
-        # file, create a Google Cloud service account and use it to authenticate with the API.
-        if credentials:
-            # Use provided credentials JSON string
-            json_account_info = json.loads(credentials)
-            creds = service_account.Credentials.from_service_account_info(json_account_info)
-        elif credentials_path:
-            # Use service account JSON file if provided
-            creds = service_account.Credentials.from_service_account_file(credentials_path)
-        else:
-            try:
-                creds, project_id = default(
-                    scopes=["https://www.googleapis.com/auth/cloud-platform"]
-                )
-            except GoogleAuthError:
-                pass
-
-        if not creds:
-            raise ValueError("No valid credentials provided.")
-
-        return texttospeech_v1.TextToSpeechAsyncClient(credentials=creds)
-
-    def can_generate_metrics(self) -> bool:
-        """Check if this service can generate processing metrics.
-
-        Returns:
-            True, as Google streaming TTS service supports metrics generation.
-        """
-        return True
-
-    @property
-    def includes_inter_frame_spaces(self) -> bool:
-        """Indicates that Google TTSTextFrames include necessary inter-frame spaces.
-
-        Returns:
-            True, indicating that Google's text frames include necessary inter-frame spaces.
-        """
-        return True
-
-    def language_to_service_language(self, language: Language) -> Optional[str]:
-        """Convert a Language enum to Google TTS language format.
-
-        Args:
-            language: The language to convert.
-
-        Returns:
-            The Google TTS-specific language code, or None if not supported.
-        """
-        return language_to_google_tts_language(language)
-
    async def _update_settings(self, settings: Mapping[str, Any]):
        """Override to handle speaking_rate updates for streaming API.

@@ -657,6 +977,7 @@ class GoogleTTSService(TTSService):
        try:
            await self.start_ttfb_metrics()

+            # Build voice selection params
            if self._voice_cloning_key:
                voice_clone_params = texttospeech_v1.VoiceCloneParams(
                    voice_cloning_key=self._voice_cloning_key
@@ -669,6 +990,7 @@ class GoogleTTSService(TTSService):
                    language_code=self._settings["language"], name=self._voice_id
                )

+            # Create streaming config
            streaming_config = texttospeech_v1.StreamingSynthesizeConfig(
                voice=voice,
                streaming_audio_config=texttospeech_v1.StreamingAudioConfig(
@@ -677,45 +999,10 @@ class GoogleTTSService(TTSService):
                    speaking_rate=self._settings["speaking_rate"],
                ),
            )
-            config_request = texttospeech_v1.StreamingSynthesizeRequest(
-                streaming_config=streaming_config
-            )

-            async def request_generator():
-                yield config_request
-                yield texttospeech_v1.StreamingSynthesizeRequest(
-                    input=texttospeech_v1.StreamingSynthesisInput(text=text)
-                )
-
-            streaming_responses = await self._client.streaming_synthesize(request_generator())
-            await self.start_tts_usage_metrics(text)
-
-            yield TTSStartedFrame()
-
-            audio_buffer = b""
-            first_chunk_for_ttfb = False
-
-            CHUNK_SIZE = self.chunk_size
-
-            async for response in streaming_responses:
-                chunk = response.audio_content
-                if not chunk:
-                    continue
-
-                if not first_chunk_for_ttfb:
-                    await self.stop_ttfb_metrics()
-                    first_chunk_for_ttfb = True
-
-                audio_buffer += chunk
-                while len(audio_buffer) >= CHUNK_SIZE:
-                    piece = audio_buffer[:CHUNK_SIZE]
-                    audio_buffer = audio_buffer[CHUNK_SIZE:]
-                    yield TTSAudioRawFrame(piece, self.sample_rate, 1)
-
-            if audio_buffer:
-                yield TTSAudioRawFrame(audio_buffer, self.sample_rate, 1)
-
-            yield TTSStoppedFrame()
+            # Use base class streaming logic
+            async for frame in self._stream_tts(streaming_config, text):
+                yield frame

        except Exception as e:
            logger.exception(f"{self} error generating TTS: {e}")
@@ -723,25 +1010,29 @@ class GoogleTTSService(TTSService):
            yield ErrorFrame(error=error_message)


-class GeminiTTSService(TTSService):
-    """Gemini Text-to-Speech service using Gemini TTS models.
+class GeminiTTSService(GoogleBaseTTSService):
+    """Gemini Text-to-Speech streaming service using Gemini TTS models.

-    Provides text-to-speech synthesis using Gemini's TTS-specific models
-    (gemini-2.5-flash-preview-tts and gemini-2.5-pro-preview-tts) with
-    support for natural voice control, multiple speakers, and voice styles.
+    Provides real-time text-to-speech synthesis using Gemini's TTS-specific models
+    (gemini-2.5-flash-tts and gemini-2.5-pro-tts) with support for natural
+    voice control, prompts for style instructions, expressive markup tags,
+    and multi-speaker conversations.

    Note:
-        Requires Google AI API key. This uses the Gemini API, not Google Cloud TTS.
-        Audio-out is currently a preview feature.
+        Requires Google Cloud credentials via service account JSON, credentials file,
+        or default application credentials (GOOGLE_APPLICATION_CREDENTIALS).
+
+        Uses the Google Cloud Text-to-Speech streaming API for low-latency synthesis.

    Example::

        tts = GeminiTTSService(
-            api_key="your-google-ai-api-key",
-            model="gemini-2.5-flash-preview-tts",
+            credentials_path="/path/to/service-account.json",
+            model="gemini-2.5-flash-tts",
            voice_id="Kore",
            params=GeminiTTSService.InputParams(
                language=Language.EN_US,
+                prompt="Say this in a friendly and helpful tone"
            )
        )
    """
@@ -750,36 +1041,36 @@ class GeminiTTSService(TTSService):

    # List of available Gemini TTS voices
    AVAILABLE_VOICES = [
-        "Zephyr",
-        "Puck",
+        "Achernar",
+        "Achird",
+        "Algenib",
+        "Algieba",
+        "Alnilam",
+        "Aoede",
+        "Autonoe",
+        "Callirhoe",
        "Charon",
-        "Kore",
+        "Despina",
+        "Enceladus",
+        "Erinome",
        "Fenrir",
+        "Gacrux",
+        "Iapetus",
+        "Kore",
+        "Laomedeia",
        "Leda",
        "Orus",
-        "Aoede",
-        "Callirhoe",
-        "Autonoe",
-        "Enceladus",
-        "Iapetus",
-        "Umbriel",
-        "Algieba",
-        "Despina",
-        "Erinome",
-        "Algenib",
-        "Rasalgethi",
-        "Laomedeia",
-        "Achernar",
-        "Alnilam",
-        "Schedar",
-        "Gacrux",
+        "Puck",
        "Pulcherrima",
-        "Achird",
-        "Zubenelgenubi",
-        "Vindemiatrix",
+        "Rasalgethi",
        "Sadachbia",
        "Sadaltager",
+        "Schedar",
        "Sulafar",
+        "Umbriel",
+        "Vindemiatrix",
+        "Zephyr",
+        "Zubenelgenubi",
    ]

    class InputParams(BaseModel):
@@ -787,19 +1078,23 @@ class GeminiTTSService(TTSService):

        Parameters:
            language: Language for synthesis. Defaults to English.
+            prompt: Optional style instructions for how to synthesize the content.
            multi_speaker: Whether to enable multi-speaker support.
            speaker_configs: List of speaker configurations for multi-speaker mode.
        """

        language: Optional[Language] = Language.EN
+        prompt: Optional[str] = None
        multi_speaker: bool = False
        speaker_configs: Optional[List[dict]] = None

    def __init__(
        self,
        *,
-        api_key: str,
-        model: str = "gemini-2.5-flash-preview-tts",
+        api_key: Optional[str] = None,
+        model: str = "gemini-2.5-flash-tts",
+        credentials: Optional[str] = None,
+        credentials_path: Optional[str] = None,
        voice_id: str = "Kore",
        sample_rate: Optional[int] = None,
        params: Optional[InputParams] = None,
@@ -808,14 +1103,30 @@ class GeminiTTSService(TTSService):
        """Initializes the Gemini TTS service.

        Args:
-            api_key: Google AI API key for authentication.
+            api_key:
+
+                .. deprecated:: 0.0.95
+                    The `api_key` parameter is deprecated. Use `credentials` or
+                    `credentials_path` instead for Google Cloud authentication.
+
            model: Gemini TTS model to use. Must be a TTS model like
-                   "gemini-2.5-flash-preview-tts" or "gemini-2.5-pro-preview-tts".
+                   "gemini-2.5-flash-tts" or "gemini-2.5-pro-tts".
+            credentials: JSON string containing Google Cloud service account credentials.
+            credentials_path: Path to Google Cloud service account JSON file.
            voice_id: Voice name from the available Gemini voices.
            sample_rate: Audio sample rate in Hz. If None, uses Google's default 24kHz.
            params: TTS configuration parameters.
            **kwargs: Additional arguments passed to parent TTSService.
        """
+        # Handle deprecated api_key parameter
+        if api_key is not None:
+            warnings.warn(
+                "The 'api_key' parameter is deprecated and will be removed in a future version. "
+                "Use 'credentials' or 'credentials_path' instead for Google Cloud authentication.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+
        if sample_rate and sample_rate != self.GOOGLE_SAMPLE_RATE:
            logger.warning(
                f"Google TTS only supports {self.GOOGLE_SAMPLE_RATE}Hz sample rate. "
@@ -828,35 +1139,20 @@ class GeminiTTSService(TTSService):
        if voice_id not in self.AVAILABLE_VOICES:
            logger.warning(f"Voice '{voice_id}' not in known voices list. Using anyway.")

-        self._api_key = api_key
        self._model = model
        self._voice_id = voice_id
        self._settings = {
            "language": self.language_to_service_language(params.language)
            if params.language
            else "en-US",
+            "prompt": params.prompt,
            "multi_speaker": params.multi_speaker,
            "speaker_configs": params.speaker_configs,
        }

-        self._client = genai.Client(api_key=api_key)
-
-    def can_generate_metrics(self) -> bool:
-        """Check if this service can generate processing metrics.
-
-        Returns:
-            True, as Gemini TTS service supports metrics generation.
-        """
-        return True
-
-    @property
-    def includes_inter_frame_spaces(self) -> bool:
-        """Indicates that Gemini TTSTextFrames include necessary inter-frame spaces.
-
-        Returns:
-            True, indicating that Gemini's text frames include necessary inter-frame spaces.
-        """
-        return True
+        self._client: texttospeech_v1.TextToSpeechAsyncClient = self._create_client(
+            credentials, credentials_path
+        )

    def language_to_service_language(self, language: Language) -> Optional[str]:
        """Convert a Language enum to Gemini TTS language format.
@@ -867,7 +1163,7 @@ class GeminiTTSService(TTSService):
        Returns:
            The Gemini TTS-specific language code, or None if not supported.
        """
-        return language_to_google_tts_language(language)
+        return language_to_gemini_tts_language(language)

    def set_voice(self, voice_id: str):
        """Set the voice for TTS generation.
@@ -892,88 +1188,73 @@ class GeminiTTSService(TTSService):
                f"Current rate of {self.sample_rate}Hz may cause issues."
            )

-    @traced_tts
-    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
-        """Generate speech from text using Gemini TTS models.
+    async def _update_settings(self, settings: Mapping[str, Any]):
+        """Override to handle prompt updates.

        Args:
-            text: The text to synthesize into speech. Can include natural language
-                  instructions for style, tone, etc.
+            settings: Settings to update. May include 'prompt' (str) with new style instructions.
+        """
+        if "prompt" in settings:
+            self._settings["prompt"] = settings["prompt"]
+        await super()._update_settings(settings)
+
+    @traced_tts
+    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        """Generate streaming speech from text using Gemini TTS models.
+
+        Args:
+            text: The text to synthesize into speech. Can include markup tags
+                  like [sigh], [laughing], [whispering] for expressive control.

        Yields:
-            Frame: Audio frames containing the synthesized speech.
+            Frame: Audio frames containing the synthesized speech as it's generated.
        """
        logger.debug(f"{self}: Generating TTS [{text}]")

        try:
            await self.start_ttfb_metrics()

-            # Build the speech config
+            # Build voice selection params
            if self._settings["multi_speaker"] and self._settings["speaker_configs"]:
                # Multi-speaker mode
                speaker_voice_configs = []
                for speaker_config in self._settings["speaker_configs"]:
                    speaker_voice_configs.append(
-                        types.SpeakerVoiceConfig(
-                            speaker=speaker_config["speaker"],
-                            voice_config=types.VoiceConfig(
-                                prebuilt_voice_config=types.PrebuiltVoiceConfig(
-                                    voice_name=speaker_config.get("voice_id", self._voice_id)
-                                )
-                            ),
+                        texttospeech_v1.MultispeakerPrebuiltVoice(
+                            speaker_alias=speaker_config["speaker_alias"],
+                            speaker_id=speaker_config.get("speaker_id", self._voice_id),
                        )
                    )

-                speech_config = types.SpeechConfig(
-                    multi_speaker_voice_config=types.MultiSpeakerVoiceConfig(
-                        speaker_voice_configs=speaker_voice_configs
-                    )
+                multi_speaker_voice_config = texttospeech_v1.MultiSpeakerVoiceConfig(
+                    speaker_voice_configs=speaker_voice_configs
+                )
+
+                voice = texttospeech_v1.VoiceSelectionParams(
+                    language_code=self._settings["language"],
+                    model_name=self._model,
+                    multi_speaker_voice_config=multi_speaker_voice_config,
                )
            else:
                # Single speaker mode
-                speech_config = types.SpeechConfig(
-                    voice_config=types.VoiceConfig(
-                        prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=self._voice_id)
-                    )
+                voice = texttospeech_v1.VoiceSelectionParams(
+                    language_code=self._settings["language"],
+                    name=self._voice_id,
+                    model_name=self._model,
                )

-            # Create the generation config
-            generation_config = types.GenerateContentConfig(
-                response_modalities=["AUDIO"],
-                speech_config=speech_config,
+            # Create streaming config
+            streaming_config = texttospeech_v1.StreamingSynthesizeConfig(
+                voice=voice,
+                streaming_audio_config=texttospeech_v1.StreamingAudioConfig(
+                    audio_encoding=texttospeech_v1.AudioEncoding.PCM,
+                    sample_rate_hertz=self.sample_rate,
+                ),
            )

-            # Generate the content
-            response = await self._client.aio.models.generate_content(
-                model=self._model,
-                contents=text,
-                config=generation_config,
-            )
-
-            await self.start_tts_usage_metrics(text)
-
-            yield TTSStartedFrame()
-
-            # Extract audio data from response
-            if response.candidates and len(response.candidates) > 0:
-                candidate = response.candidates[0]
-                if candidate.content and candidate.content.parts:
-                    for part in candidate.content.parts:
-                        if part.inline_data and part.inline_data.mime_type.startswith("audio/"):
-                            audio_data = part.inline_data.data
-                            await self.stop_ttfb_metrics()
-
-                            # Gemini TTS returns PCM audio data, chunk it appropriately
-                            CHUNK_SIZE = self.chunk_size
-
-                            for i in range(0, len(audio_data), CHUNK_SIZE):
-                                chunk = audio_data[i : i + CHUNK_SIZE]
-                                if not chunk:
-                                    break
-                                frame = TTSAudioRawFrame(chunk, self.sample_rate, 1)
-                                yield frame
-
-            yield TTSStoppedFrame()
+            # Use base class streaming logic with prompt support
+            async for frame in self._stream_tts(streaming_config, text, self._settings["prompt"]):
+                yield frame

        except Exception as e:
            logger.exception(f"{self} error generating TTS: {e}")
--- a/src/pipecat/transcriptions/language.py
+++ b/src/pipecat/transcriptions/language.py
@@ -66,6 +66,7 @@ class Language(StrEnum):
    AR_TN = "ar-TN"
    AR_XA = "ar-XA"
    AR_YE = "ar-YE"
+    AR_001 = "ar-001"

    # Assamese
    AS = "as"
@@ -83,6 +84,7 @@ class Language(StrEnum):

    # Belarusian
    BE = "be"
+    BE_BY = "be-BY"

    # Bulgarian
    BG = "bg"
@@ -109,6 +111,7 @@ class Language(StrEnum):

    # Cebuano
    CEB = "ceb"
+    CEB_PH = "ceb-PH"

    # Mandarin Chinese
    CMN = "cmn"
@@ -181,6 +184,7 @@ class Language(StrEnum):
    ES_US = "es-US"
    ES_UY = "es-UY"
    ES_VE = "es-VE"
+    ES_419 = "es-419"

    # Estonian
    ET = "et"
@@ -250,6 +254,7 @@ class Language(StrEnum):

    # Haitian Creole
    HT = "ht"
+    HT_HT = "ht-HT"

    # Hungarian
    HU = "hu"
@@ -288,6 +293,7 @@ class Language(StrEnum):
    # Javanese
    JV = "jv"
    JV_ID = "jv-ID"
+    JV_JV = "jv-JV"
    JW = "jw"  # Fal requires for Javanese

    # Georgian
@@ -309,6 +315,10 @@ class Language(StrEnum):
    KN = "kn"
    KN_IN = "kn-IN"

+    # Konkani
+    KOK = "kok"
+    KOK_IN = "kok-IN"
+
    # Korean
    KO = "ko"
    KO_KR = "ko-KR"
@@ -322,9 +332,11 @@ class Language(StrEnum):

    # Latin
    LA = "la"
+    LA_VA = "la-VA"

    # Luxembourgish
    LB = "lb"
+    LB_LU = "lb-LU"

    # Lingala
    LN = "ln"
@@ -349,6 +361,7 @@ class Language(StrEnum):

    # Malagasy
    MG = "mg"
+    MG_MG = "mg-MG"

    # Maori
    MI = "mi"
@@ -357,6 +370,10 @@ class Language(StrEnum):
    MK = "mk"
    MK_MK = "mk-MK"

+    # Maithili
+    MAI = "mai"
+    MAI_IN = "mai-IN"
+
    # Malayalam
    ML = "ml"
    ML_IN = "ml-IN"
@@ -387,6 +404,7 @@ class Language(StrEnum):
    NB_NO = "nb-NO"
    NO = "no"
    NN = "nn"  # Norwegian Nynorsk
+    NN_NO = "nn-NO"

    # Nepali
    NE = "ne"
@@ -440,6 +458,7 @@ class Language(StrEnum):

    # Sindhi
    SD = "sd"
+    SD_IN = "sd-IN"

    # Sinhala
    SI = "si"