Update GeminiTTSService for streaming, other Google TTS improvements
This commit is contained in:
15
CHANGELOG.md
15
CHANGELOG.md
@@ -20,6 +20,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
- Added Hindi support for Rime TTS services.
|
||||
|
||||
- Updated `GeminiTTSService` to use Google Cloud Text-to-Speech streaming API
|
||||
instead of the deprecated Gemini API. Now uses `credentials` /
|
||||
`credentials_path` for authentication. The `api_key` parameter is deprecated.
|
||||
Also added support for the `prompt` parameter for style instructions and
|
||||
expressive markup tags. Significantly improved latency with streaming
|
||||
synthesis.
|
||||
|
||||
- Updated language mappings for the Google and Gemini TTS services to match
|
||||
official documentation.
|
||||
|
||||
### Deprecated
|
||||
|
||||
- The `api_key` parameter in `GeminiTTSService` is deprecated. Use
|
||||
`credentials` or `credentials_path` instead for Google Cloud authentication.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed subtle issue of assistant context messages ending up with double spaces
|
||||
|
||||
@@ -4,24 +4,6 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""
|
||||
A conversational AI bot using Gemini for both LLM and TTS.
|
||||
|
||||
This example demonstrates how to use Gemini's TTS capabilities with the new
|
||||
GeminiTTSService, which uses Gemini's TTS-specific models instead of Google Cloud TTS.
|
||||
|
||||
Features showcased:
|
||||
- Gemini LLM for conversation
|
||||
- Gemini TTS with natural voice control
|
||||
- Support for different voice personalities
|
||||
- Style and tone control through natural language prompts
|
||||
|
||||
Run with:
|
||||
python examples/foundational/gemini-tts.py
|
||||
|
||||
Make sure to set your environment variables:
|
||||
export GOOGLE_API_KEY=your_api_key_here
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
@@ -84,10 +66,13 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
)
|
||||
|
||||
tts = GeminiTTSService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
model="gemini-2.5-flash-preview-tts", # TTS-specific model
|
||||
credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
|
||||
model="gemini-2.5-flash-tts",
|
||||
voice_id="Charon",
|
||||
params=GeminiTTSService.InputParams(language=Language.EN_US),
|
||||
params=GeminiTTSService.InputParams(
|
||||
language=Language.EN_US,
|
||||
prompt="You are a helpful AI assistant. Speak in a natural, conversational tone.",
|
||||
),
|
||||
)
|
||||
|
||||
llm = GoogleLLMService(
|
||||
@@ -101,13 +86,20 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
"role": "system",
|
||||
"content": """You are a helpful AI assistant in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way.
|
||||
|
||||
IMPORTANT: Since you're using Gemini TTS which supports natural voice control, you can include speaking instructions in your responses. For example:
|
||||
- "Say cheerfully: Welcome to our conversation!"
|
||||
- "Read this in a calm, professional tone: Here are the details you requested."
|
||||
- "Speak in an excited whisper: I have some great news to share!"
|
||||
- "Say slowly and clearly: Let me explain this step by step."
|
||||
IMPORTANT: You're using Gemini TTS which supports expressive markup tags. You can use these tags in your responses:
|
||||
- [sigh] - Insert a sigh sound
|
||||
- [laughing] - Insert a laugh
|
||||
- [uhm] - Insert a hesitation sound
|
||||
- [whispering] - Speak the next part in a whisper
|
||||
- [shouting] - Speak the next part louder
|
||||
- [extremely fast] - Speak the next part very quickly
|
||||
- [short pause], [medium pause], [long pause] - Add pauses for dramatic effect
|
||||
|
||||
Feel free to use natural language instructions to control your voice style, tone, pace, and emotion. The TTS system will interpret these instructions and adjust the speech accordingly.
|
||||
Examples:
|
||||
- "Well [sigh] that's a tricky question."
|
||||
- "[laughing] That's a great joke!"
|
||||
- "[whispering] Let me tell you a secret."
|
||||
- "The answer is... [long pause] ...42!"
|
||||
|
||||
Your output will be converted to audio, so avoid special characters in your answers. Respond to what the user said in a creative and helpful way.""",
|
||||
},
|
||||
@@ -140,11 +132,11 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation with a styled introduction
|
||||
# Kick off the conversation
|
||||
messages.append(
|
||||
{
|
||||
"role": "system",
|
||||
"content": "Say cheerfully and warmly: Hello! I'm your AI assistant powered by Gemini's new TTS technology. I can speak with different voices, tones, and styles. How can I help you today?",
|
||||
"content": "Hello! I'm your AI assistant. I can help you with a variety of tasks. What would you like to know?",
|
||||
}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@@ -16,6 +16,7 @@ for natural voice control and multi-speaker conversations.
|
||||
|
||||
import json
|
||||
import os
|
||||
import warnings
|
||||
|
||||
from pipecat.utils.tracing.service_decorators import traced_tts
|
||||
|
||||
@@ -51,19 +52,13 @@ except ModuleNotFoundError as e:
|
||||
)
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
try:
|
||||
from google import genai
|
||||
from google.genai import types
|
||||
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error("In order to use Gemini TTS, you need to `pip install pipecat-ai[google]`.")
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
def language_to_google_tts_language(language: Language) -> Optional[str]:
|
||||
"""Convert a Language enum to Google TTS language code.
|
||||
|
||||
Source:
|
||||
https://docs.cloud.google.com/text-to-speech/docs/chirp3-hd
|
||||
|
||||
Args:
|
||||
language: The Language enum value to convert.
|
||||
|
||||
@@ -71,9 +66,6 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
|
||||
The corresponding Google TTS language code, or None if not supported.
|
||||
"""
|
||||
LANGUAGE_MAP = {
|
||||
# Afrikaans
|
||||
Language.AF: "af-ZA",
|
||||
Language.AF_ZA: "af-ZA",
|
||||
# Arabic
|
||||
Language.AR: "ar-XA",
|
||||
# Bengali
|
||||
@@ -82,14 +74,9 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
|
||||
# Bulgarian
|
||||
Language.BG: "bg-BG",
|
||||
Language.BG_BG: "bg-BG",
|
||||
# Catalan
|
||||
Language.CA: "ca-ES",
|
||||
Language.CA_ES: "ca-ES",
|
||||
# Chinese (Mandarin and Cantonese)
|
||||
Language.ZH: "cmn-CN",
|
||||
Language.ZH_CN: "cmn-CN",
|
||||
Language.ZH_TW: "cmn-TW",
|
||||
Language.ZH_HK: "yue-HK",
|
||||
# Croatian
|
||||
Language.HR: "hr-HR",
|
||||
Language.HR_HR: "hr-HR",
|
||||
# Czech
|
||||
Language.CS: "cs-CZ",
|
||||
Language.CS_CZ: "cs-CZ",
|
||||
@@ -109,9 +96,6 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
|
||||
# Estonian
|
||||
Language.ET: "et-EE",
|
||||
Language.ET_EE: "et-EE",
|
||||
# Filipino
|
||||
Language.FIL: "fil-PH",
|
||||
Language.FIL_PH: "fil-PH",
|
||||
# Finnish
|
||||
Language.FI: "fi-FI",
|
||||
Language.FI_FI: "fi-FI",
|
||||
@@ -119,9 +103,6 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
|
||||
Language.FR: "fr-FR",
|
||||
Language.FR_CA: "fr-CA",
|
||||
Language.FR_FR: "fr-FR",
|
||||
# Galician
|
||||
Language.GL: "gl-ES",
|
||||
Language.GL_ES: "gl-ES",
|
||||
# German
|
||||
Language.DE: "de-DE",
|
||||
Language.DE_DE: "de-DE",
|
||||
@@ -140,9 +121,6 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
|
||||
# Hungarian
|
||||
Language.HU: "hu-HU",
|
||||
Language.HU_HU: "hu-HU",
|
||||
# Icelandic
|
||||
Language.IS: "is-IS",
|
||||
Language.IS_IS: "is-IS",
|
||||
# Indonesian
|
||||
Language.ID: "id-ID",
|
||||
Language.ID_ID: "id-ID",
|
||||
@@ -164,12 +142,12 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
|
||||
# Lithuanian
|
||||
Language.LT: "lt-LT",
|
||||
Language.LT_LT: "lt-LT",
|
||||
# Malay
|
||||
Language.MS: "ms-MY",
|
||||
Language.MS_MY: "ms-MY",
|
||||
# Malayalam
|
||||
Language.ML: "ml-IN",
|
||||
Language.ML_IN: "ml-IN",
|
||||
# Chinese (Mandarin)
|
||||
Language.ZH: "cmn-CN",
|
||||
Language.ZH_CN: "cmn-CN",
|
||||
# Marathi
|
||||
Language.MR: "mr-IN",
|
||||
Language.MR_IN: "mr-IN",
|
||||
@@ -181,12 +159,8 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
|
||||
Language.PL: "pl-PL",
|
||||
Language.PL_PL: "pl-PL",
|
||||
# Portuguese
|
||||
Language.PT: "pt-PT",
|
||||
Language.PT: "pt-BR",
|
||||
Language.PT_BR: "pt-BR",
|
||||
Language.PT_PT: "pt-PT",
|
||||
# Punjabi
|
||||
Language.PA: "pa-IN",
|
||||
Language.PA_IN: "pa-IN",
|
||||
# Romanian
|
||||
Language.RO: "ro-RO",
|
||||
Language.RO_RO: "ro-RO",
|
||||
@@ -199,10 +173,16 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
|
||||
# Slovak
|
||||
Language.SK: "sk-SK",
|
||||
Language.SK_SK: "sk-SK",
|
||||
# Slovenian
|
||||
Language.SL: "sl-SI",
|
||||
Language.SL_SI: "sl-SI",
|
||||
# Spanish
|
||||
Language.ES: "es-ES",
|
||||
Language.ES_ES: "es-ES",
|
||||
Language.ES_US: "es-US",
|
||||
# Swahili
|
||||
Language.SW: "sw-KE",
|
||||
Language.SW_KE: "sw-KE",
|
||||
# Swedish
|
||||
Language.SV: "sv-SE",
|
||||
Language.SV_SE: "sv-SE",
|
||||
@@ -221,6 +201,9 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
|
||||
# Ukrainian
|
||||
Language.UK: "uk-UA",
|
||||
Language.UK_UA: "uk-UA",
|
||||
# Urdu
|
||||
Language.UR: "ur-IN",
|
||||
Language.UR_IN: "ur-IN",
|
||||
# Vietnamese
|
||||
Language.VI: "vi-VN",
|
||||
Language.VI_VN: "vi-VN",
|
||||
@@ -229,6 +212,267 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
|
||||
return resolve_language(language, LANGUAGE_MAP, use_base_code=False)
|
||||
|
||||
|
||||
def language_to_gemini_tts_language(language: Language) -> Optional[str]:
    """Map a Language enum member to the locale code used by Gemini TTS.

    The table follows the language list published at:
    https://docs.cloud.google.com/text-to-speech/docs/gemini-tts#available_languages

    Section comments carry the availability status (GA or Preview) recorded
    for each language; trailing comments mark per-locale exceptions.

    Args:
        language: The Language enum value to convert.

    Returns:
        The corresponding Gemini TTS language code, or None if not supported.
    """
    gemini_locales = {
        # Afrikaans (Preview)
        Language.AF: "af-ZA",
        Language.AF_ZA: "af-ZA",
        # Albanian (Preview)
        Language.SQ: "sq-AL",
        Language.SQ_AL: "sq-AL",
        # Amharic (Preview)
        Language.AM: "am-ET",
        Language.AM_ET: "am-ET",
        # Arabic
        Language.AR: "ar-EG",  # GA: Egypt
        Language.AR_EG: "ar-EG",
        Language.AR_001: "ar-001",  # Preview: World
        # Armenian (Preview)
        Language.HY: "hy-AM",
        Language.HY_AM: "hy-AM",
        # Azerbaijani (Preview)
        Language.AZ: "az-AZ",
        Language.AZ_AZ: "az-AZ",
        # Basque (Preview)
        Language.EU: "eu-ES",
        Language.EU_ES: "eu-ES",
        # Belarusian (Preview)
        Language.BE: "be-BY",
        Language.BE_BY: "be-BY",
        # Bengali (GA)
        Language.BN: "bn-BD",
        Language.BN_BD: "bn-BD",
        # Bulgarian (Preview)
        Language.BG: "bg-BG",
        Language.BG_BG: "bg-BG",
        # Burmese (Preview)
        Language.MY: "my-MM",
        Language.MY_MM: "my-MM",
        # Catalan (Preview)
        Language.CA: "ca-ES",
        Language.CA_ES: "ca-ES",
        # Cebuano (Preview)
        Language.CEB: "ceb-PH",
        Language.CEB_PH: "ceb-PH",
        # Chinese (Mandarin)
        Language.ZH: "cmn-CN",  # Preview
        Language.ZH_CN: "cmn-CN",
        Language.ZH_TW: "cmn-TW",  # Preview
        # Croatian (Preview)
        Language.HR: "hr-HR",
        Language.HR_HR: "hr-HR",
        # Czech (Preview)
        Language.CS: "cs-CZ",
        Language.CS_CZ: "cs-CZ",
        # Danish (Preview)
        Language.DA: "da-DK",
        Language.DA_DK: "da-DK",
        # Dutch (GA)
        Language.NL: "nl-NL",
        Language.NL_NL: "nl-NL",
        # English
        Language.EN: "en-US",  # GA
        Language.EN_US: "en-US",
        Language.EN_AU: "en-AU",  # Preview
        Language.EN_GB: "en-GB",  # Preview
        Language.EN_IN: "en-IN",  # GA
        # Estonian (Preview)
        Language.ET: "et-EE",
        Language.ET_EE: "et-EE",
        # Filipino (Preview)
        Language.FIL: "fil-PH",
        Language.FIL_PH: "fil-PH",
        # Finnish (Preview)
        Language.FI: "fi-FI",
        Language.FI_FI: "fi-FI",
        # French
        Language.FR: "fr-FR",  # GA
        Language.FR_FR: "fr-FR",
        Language.FR_CA: "fr-CA",  # Preview
        # Galician (Preview)
        Language.GL: "gl-ES",
        Language.GL_ES: "gl-ES",
        # Georgian (Preview)
        Language.KA: "ka-GE",
        Language.KA_GE: "ka-GE",
        # German (GA)
        Language.DE: "de-DE",
        Language.DE_DE: "de-DE",
        # Greek (Preview)
        Language.EL: "el-GR",
        Language.EL_GR: "el-GR",
        # Gujarati (Preview)
        Language.GU: "gu-IN",
        Language.GU_IN: "gu-IN",
        # Haitian Creole (Preview)
        Language.HT: "ht-HT",
        Language.HT_HT: "ht-HT",
        # Hebrew (Preview)
        Language.HE: "he-IL",
        Language.HE_IL: "he-IL",
        # Hindi (GA)
        Language.HI: "hi-IN",
        Language.HI_IN: "hi-IN",
        # Hungarian (Preview)
        Language.HU: "hu-HU",
        Language.HU_HU: "hu-HU",
        # Icelandic (Preview)
        Language.IS: "is-IS",
        Language.IS_IS: "is-IS",
        # Indonesian (GA)
        Language.ID: "id-ID",
        Language.ID_ID: "id-ID",
        # Italian (GA)
        Language.IT: "it-IT",
        Language.IT_IT: "it-IT",
        # Japanese (GA)
        Language.JA: "ja-JP",
        Language.JA_JP: "ja-JP",
        # Javanese (Preview)
        Language.JV: "jv-JV",
        Language.JV_JV: "jv-JV",
        # Kannada (Preview)
        Language.KN: "kn-IN",
        Language.KN_IN: "kn-IN",
        # Konkani (Preview)
        Language.KOK: "kok-IN",
        Language.KOK_IN: "kok-IN",
        # Korean (GA)
        Language.KO: "ko-KR",
        Language.KO_KR: "ko-KR",
        # Lao (Preview)
        Language.LO: "lo-LA",
        Language.LO_LA: "lo-LA",
        # Latin (Preview)
        Language.LA: "la-VA",
        Language.LA_VA: "la-VA",
        # Latvian (Preview)
        Language.LV: "lv-LV",
        Language.LV_LV: "lv-LV",
        # Lithuanian (Preview)
        Language.LT: "lt-LT",
        Language.LT_LT: "lt-LT",
        # Luxembourgish (Preview)
        Language.LB: "lb-LU",
        Language.LB_LU: "lb-LU",
        # Macedonian (Preview)
        Language.MK: "mk-MK",
        Language.MK_MK: "mk-MK",
        # Maithili (Preview)
        Language.MAI: "mai-IN",
        Language.MAI_IN: "mai-IN",
        # Malagasy (Preview)
        Language.MG: "mg-MG",
        Language.MG_MG: "mg-MG",
        # Malay (Preview)
        Language.MS: "ms-MY",
        Language.MS_MY: "ms-MY",
        # Malayalam (Preview)
        Language.ML: "ml-IN",
        Language.ML_IN: "ml-IN",
        # Marathi (GA)
        Language.MR: "mr-IN",
        Language.MR_IN: "mr-IN",
        # Mongolian (Preview)
        Language.MN: "mn-MN",
        Language.MN_MN: "mn-MN",
        # Nepali (Preview)
        Language.NE: "ne-NP",
        Language.NE_NP: "ne-NP",
        # Norwegian
        Language.NO: "nb-NO",  # Preview: Bokmål
        Language.NB: "nb-NO",
        Language.NB_NO: "nb-NO",
        Language.NN: "nn-NO",  # Preview: Nynorsk
        Language.NN_NO: "nn-NO",
        # Odia (Preview)
        Language.OR: "or-IN",
        Language.OR_IN: "or-IN",
        # Pashto (Preview)
        Language.PS: "ps-AF",
        Language.PS_AF: "ps-AF",
        # Persian (Preview)
        Language.FA: "fa-IR",
        Language.FA_IR: "fa-IR",
        # Polish (GA)
        Language.PL: "pl-PL",
        Language.PL_PL: "pl-PL",
        # Portuguese
        Language.PT: "pt-BR",  # GA: Brazil
        Language.PT_BR: "pt-BR",
        Language.PT_PT: "pt-PT",  # Preview: Portugal
        # Punjabi (Preview)
        Language.PA: "pa-IN",
        Language.PA_IN: "pa-IN",
        # Romanian (GA)
        Language.RO: "ro-RO",
        Language.RO_RO: "ro-RO",
        # Russian (GA)
        Language.RU: "ru-RU",
        Language.RU_RU: "ru-RU",
        # Serbian (Preview)
        Language.SR: "sr-RS",
        Language.SR_RS: "sr-RS",
        # Sindhi (Preview)
        Language.SD: "sd-IN",
        Language.SD_IN: "sd-IN",
        # Sinhala (Preview)
        Language.SI: "si-LK",
        Language.SI_LK: "si-LK",
        # Slovak (Preview)
        Language.SK: "sk-SK",
        Language.SK_SK: "sk-SK",
        # Slovenian (Preview)
        Language.SL: "sl-SI",
        Language.SL_SI: "sl-SI",
        # Spanish
        Language.ES: "es-ES",  # GA
        Language.ES_ES: "es-ES",
        Language.ES_419: "es-419",  # Preview: Latin America
        Language.ES_MX: "es-MX",  # Preview: Mexico
        # Swahili (Preview)
        Language.SW: "sw-KE",
        Language.SW_KE: "sw-KE",
        # Swedish (Preview)
        Language.SV: "sv-SE",
        Language.SV_SE: "sv-SE",
        # Tamil (GA)
        Language.TA: "ta-IN",
        Language.TA_IN: "ta-IN",
        # Telugu (GA)
        Language.TE: "te-IN",
        Language.TE_IN: "te-IN",
        # Thai (GA)
        Language.TH: "th-TH",
        Language.TH_TH: "th-TH",
        # Turkish (GA)
        Language.TR: "tr-TR",
        Language.TR_TR: "tr-TR",
        # Ukrainian (GA)
        Language.UK: "uk-UA",
        Language.UK_UA: "uk-UA",
        # Urdu (Preview)
        Language.UR: "ur-PK",
        Language.UR_PK: "ur-PK",
        # Vietnamese (GA)
        Language.VI: "vi-VN",
        Language.VI_VN: "vi-VN",
    }

    # Resolution (including any fallback behaviour) is delegated to the
    # shared helper; use_base_code=False is passed through unchanged.
    return resolve_language(language, gemini_locales, use_base_code=False)
|
||||
|
||||
|
||||
class GoogleHttpTTSService(TTSService):
|
||||
"""Google Cloud Text-to-Speech HTTP service with SSML support.
|
||||
|
||||
@@ -498,7 +742,139 @@ class GoogleHttpTTSService(TTSService):
|
||||
yield ErrorFrame(error=error_message)
|
||||
|
||||
|
||||
class GoogleTTSService(TTSService):
|
||||
class GoogleBaseTTSService(TTSService):
    """Base class for Google Cloud Text-to-Speech streaming services.

    Provides shared client creation and streaming synthesis logic for
    Google TTS services.
    This is an abstract base class. Use GoogleTTSService or GeminiTTSService instead.
    """

    def _create_client(
        self, credentials: Optional[str], credentials_path: Optional[str]
    ) -> texttospeech_v1.TextToSpeechAsyncClient:
        """Create authenticated Google Text-to-Speech client.

        Credentials are resolved in priority order: the inline JSON string,
        then the service account file, then whatever ``default(...)`` finds
        for the cloud-platform scope.

        Args:
            credentials: JSON string with service account credentials.
            credentials_path: Path to service account JSON file.

        Returns:
            Authenticated TextToSpeechAsyncClient instance.

        Raises:
            ValueError: If no valid credentials are provided. When the default
                credential lookup fails, the underlying GoogleAuthError is
                attached as the cause so the real reason is not lost.
        """
        creds: Optional[service_account.Credentials] = None

        if credentials:
            # Use provided credentials JSON string
            json_account_info = json.loads(credentials)
            creds = service_account.Credentials.from_service_account_info(json_account_info)
        elif credentials_path:
            # Use service account JSON file if provided
            creds = service_account.Credentials.from_service_account_file(credentials_path)
        else:
            try:
                # The second element of the returned tuple is not needed here.
                creds, _ = default(scopes=["https://www.googleapis.com/auth/cloud-platform"])
            except GoogleAuthError as e:
                # Same exception type and message callers already see, but
                # chained so the auth failure shows up in the traceback.
                raise ValueError("No valid credentials provided.") from e

        if not creds:
            raise ValueError("No valid credentials provided.")

        return texttospeech_v1.TextToSpeechAsyncClient(credentials=creds)

    def can_generate_metrics(self) -> bool:
        """Check if this service can generate processing metrics.

        Returns:
            True, as Google streaming TTS services support metrics generation.
        """
        return True

    @property
    def includes_inter_frame_spaces(self) -> bool:
        """Indicates that Google and Gemini TTSTextFrames include necessary inter-frame spaces.

        Returns:
            True, indicating that Google's text frames include necessary inter-frame spaces.
        """
        return True

    def language_to_service_language(self, language: Language) -> Optional[str]:
        """Convert a Language enum to Google TTS language format.

        Args:
            language: The language to convert.

        Returns:
            The Google TTS-specific language code, or None if not supported.
        """
        return language_to_google_tts_language(language)

    async def _stream_tts(
        self,
        streaming_config: texttospeech_v1.StreamingSynthesizeConfig,
        text: str,
        prompt: Optional[str] = None,
    ) -> AsyncGenerator[Frame, None]:
        """Shared streaming synthesis logic.

        Sends the configuration request followed by a single input request,
        then re-chunks the returned audio into pieces of ``self.chunk_size``
        bytes. Emits a TTSStartedFrame before the audio and a TTSStoppedFrame
        after it. TTFB metrics are stopped on the first non-empty audio chunk.

        Args:
            streaming_config: The streaming configuration.
            text: The text to synthesize.
            prompt: Optional prompt for style instructions (Gemini only).

        Yields:
            Frame: Audio frames containing the synthesized speech.
        """
        config_request = texttospeech_v1.StreamingSynthesizeRequest(
            streaming_config=streaming_config
        )

        async def request_generator():
            # The API expects the config message first, then the input.
            yield config_request
            synthesis_input_params = {"text": text}
            if prompt is not None:
                synthesis_input_params["prompt"] = prompt
            yield texttospeech_v1.StreamingSynthesizeRequest(
                input=texttospeech_v1.StreamingSynthesisInput(**synthesis_input_params)
            )

        streaming_responses = await self._client.streaming_synthesize(request_generator())
        await self.start_tts_usage_metrics(text)

        yield TTSStartedFrame()

        # A mutable buffer lets consumed bytes be dropped in place instead of
        # rebuilding an immutable bytes object for every emitted frame.
        audio_buffer = bytearray()
        ttfb_stopped = False
        chunk_size = self.chunk_size

        async for response in streaming_responses:
            chunk = response.audio_content
            if not chunk:
                continue

            if not ttfb_stopped:
                await self.stop_ttfb_metrics()
                ttfb_stopped = True

            audio_buffer += chunk
            while len(audio_buffer) >= chunk_size:
                # Copy out as immutable bytes before trimming the buffer.
                piece = bytes(audio_buffer[:chunk_size])
                del audio_buffer[:chunk_size]
                yield TTSAudioRawFrame(piece, self.sample_rate, 1)

        # Flush whatever is left that did not fill a whole chunk.
        if audio_buffer:
            yield TTSAudioRawFrame(bytes(audio_buffer), self.sample_rate, 1)

        yield TTSStoppedFrame()
|
||||
|
||||
|
||||
class GoogleTTSService(GoogleBaseTTSService):
|
||||
"""Google Cloud Text-to-Speech streaming service.
|
||||
|
||||
Provides real-time text-to-speech synthesis using Google Cloud's streaming API
|
||||
@@ -570,62 +946,6 @@ class GoogleTTSService(TTSService):
|
||||
credentials, credentials_path
|
||||
)
|
||||
|
||||
def _create_client(
    self, credentials: Optional[str], credentials_path: Optional[str]
) -> texttospeech_v1.TextToSpeechAsyncClient:
    """Create an authenticated Cloud Text-to-Speech async client.

    Credentials are resolved in priority order: the inline JSON string,
    then the service account file, then whatever ``default(...)`` finds
    for the cloud-platform scope.

    Args:
        credentials: JSON string with service account credentials.
        credentials_path: Path to service account JSON file.

    Returns:
        Authenticated TextToSpeechAsyncClient instance.

    Raises:
        ValueError: If no valid credentials are provided. When the default
            credential lookup fails, the underlying GoogleAuthError is
            attached as the cause so the real reason is not lost.
    """
    creds: Optional[service_account.Credentials] = None

    # Create a Google Cloud service account for the Cloud Text-to-Speech API
    # Using either the provided credentials JSON string or the path to a service account JSON
    # file, create a Google Cloud service account and use it to authenticate with the API.
    if credentials:
        # Use provided credentials JSON string
        json_account_info = json.loads(credentials)
        creds = service_account.Credentials.from_service_account_info(json_account_info)
    elif credentials_path:
        # Use service account JSON file if provided
        creds = service_account.Credentials.from_service_account_file(credentials_path)
    else:
        try:
            # The second element of the returned tuple is not needed here.
            creds, _ = default(scopes=["https://www.googleapis.com/auth/cloud-platform"])
        except GoogleAuthError as e:
            # Same exception type and message callers already see, but
            # chained so the auth failure shows up in the traceback.
            raise ValueError("No valid credentials provided.") from e

    if not creds:
        raise ValueError("No valid credentials provided.")

    return texttospeech_v1.TextToSpeechAsyncClient(credentials=creds)
|
||||
|
||||
def can_generate_metrics(self) -> bool:
    """Report whether this service produces processing metrics.

    Returns:
        Always True for this streaming TTS service.
    """
    return True
|
||||
|
||||
@property
def includes_inter_frame_spaces(self) -> bool:
    """Whether the TTSTextFrames from this service already carry inter-frame spaces.

    Returns:
        Always True: Google's text frames include the necessary inter-frame spaces.
    """
    return True
|
||||
|
||||
def language_to_service_language(self, language: Language) -> Optional[str]:
    """Translate a Language enum member into a Google TTS language code.

    Delegates to the module-level ``language_to_google_tts_language`` helper.

    Args:
        language: The language to convert.

    Returns:
        The Google TTS-specific language code, or None if not supported.
    """
    service_code = language_to_google_tts_language(language)
    return service_code
|
||||
|
||||
async def _update_settings(self, settings: Mapping[str, Any]):
|
||||
"""Override to handle speaking_rate updates for streaming API.
|
||||
|
||||
@@ -657,6 +977,7 @@ class GoogleTTSService(TTSService):
|
||||
try:
|
||||
await self.start_ttfb_metrics()
|
||||
|
||||
# Build voice selection params
|
||||
if self._voice_cloning_key:
|
||||
voice_clone_params = texttospeech_v1.VoiceCloneParams(
|
||||
voice_cloning_key=self._voice_cloning_key
|
||||
@@ -669,6 +990,7 @@ class GoogleTTSService(TTSService):
|
||||
language_code=self._settings["language"], name=self._voice_id
|
||||
)
|
||||
|
||||
# Create streaming config
|
||||
streaming_config = texttospeech_v1.StreamingSynthesizeConfig(
|
||||
voice=voice,
|
||||
streaming_audio_config=texttospeech_v1.StreamingAudioConfig(
|
||||
@@ -677,45 +999,10 @@ class GoogleTTSService(TTSService):
|
||||
speaking_rate=self._settings["speaking_rate"],
|
||||
),
|
||||
)
|
||||
config_request = texttospeech_v1.StreamingSynthesizeRequest(
|
||||
streaming_config=streaming_config
|
||||
)
|
||||
|
||||
async def request_generator():
|
||||
yield config_request
|
||||
yield texttospeech_v1.StreamingSynthesizeRequest(
|
||||
input=texttospeech_v1.StreamingSynthesisInput(text=text)
|
||||
)
|
||||
|
||||
streaming_responses = await self._client.streaming_synthesize(request_generator())
|
||||
await self.start_tts_usage_metrics(text)
|
||||
|
||||
yield TTSStartedFrame()
|
||||
|
||||
audio_buffer = b""
|
||||
first_chunk_for_ttfb = False
|
||||
|
||||
CHUNK_SIZE = self.chunk_size
|
||||
|
||||
async for response in streaming_responses:
|
||||
chunk = response.audio_content
|
||||
if not chunk:
|
||||
continue
|
||||
|
||||
if not first_chunk_for_ttfb:
|
||||
await self.stop_ttfb_metrics()
|
||||
first_chunk_for_ttfb = True
|
||||
|
||||
audio_buffer += chunk
|
||||
while len(audio_buffer) >= CHUNK_SIZE:
|
||||
piece = audio_buffer[:CHUNK_SIZE]
|
||||
audio_buffer = audio_buffer[CHUNK_SIZE:]
|
||||
yield TTSAudioRawFrame(piece, self.sample_rate, 1)
|
||||
|
||||
if audio_buffer:
|
||||
yield TTSAudioRawFrame(audio_buffer, self.sample_rate, 1)
|
||||
|
||||
yield TTSStoppedFrame()
|
||||
# Use base class streaming logic
|
||||
async for frame in self._stream_tts(streaming_config, text):
|
||||
yield frame
|
||||
|
||||
except Exception as e:
|
||||
logger.exception(f"{self} error generating TTS: {e}")
|
||||
@@ -723,25 +1010,29 @@ class GoogleTTSService(TTSService):
|
||||
yield ErrorFrame(error=error_message)
|
||||
|
||||
|
||||
class GeminiTTSService(TTSService):
|
||||
"""Gemini Text-to-Speech service using Gemini TTS models.
|
||||
class GeminiTTSService(GoogleBaseTTSService):
|
||||
"""Gemini Text-to-Speech streaming service using Gemini TTS models.
|
||||
|
||||
Provides text-to-speech synthesis using Gemini's TTS-specific models
|
||||
(gemini-2.5-flash-preview-tts and gemini-2.5-pro-preview-tts) with
|
||||
support for natural voice control, multiple speakers, and voice styles.
|
||||
Provides real-time text-to-speech synthesis using Gemini's TTS-specific models
|
||||
(gemini-2.5-flash-tts and gemini-2.5-pro-tts) with support for natural
|
||||
voice control, prompts for style instructions, expressive markup tags,
|
||||
and multi-speaker conversations.
|
||||
|
||||
Note:
|
||||
Requires Google AI API key. This uses the Gemini API, not Google Cloud TTS.
|
||||
Audio-out is currently a preview feature.
|
||||
Requires Google Cloud credentials via service account JSON, credentials file,
|
||||
or default application credentials (GOOGLE_APPLICATION_CREDENTIALS).
|
||||
|
||||
Uses the Google Cloud Text-to-Speech streaming API for low-latency synthesis.
|
||||
|
||||
Example::
|
||||
|
||||
tts = GeminiTTSService(
|
||||
api_key="your-google-ai-api-key",
|
||||
model="gemini-2.5-flash-preview-tts",
|
||||
credentials_path="/path/to/service-account.json",
|
||||
model="gemini-2.5-flash-tts",
|
||||
voice_id="Kore",
|
||||
params=GeminiTTSService.InputParams(
|
||||
language=Language.EN_US,
|
||||
prompt="Say this in a friendly and helpful tone"
|
||||
)
|
||||
)
|
||||
"""
|
||||
@@ -750,36 +1041,36 @@ class GeminiTTSService(TTSService):
|
||||
|
||||
# List of available Gemini TTS voices
|
||||
AVAILABLE_VOICES = [
|
||||
"Zephyr",
|
||||
"Puck",
|
||||
"Achernar",
|
||||
"Achird",
|
||||
"Algenib",
|
||||
"Algieba",
|
||||
"Alnilam",
|
||||
"Aoede",
|
||||
"Autonoe",
|
||||
"Callirhoe",
|
||||
"Charon",
|
||||
"Kore",
|
||||
"Despina",
|
||||
"Enceladus",
|
||||
"Erinome",
|
||||
"Fenrir",
|
||||
"Gacrux",
|
||||
"Iapetus",
|
||||
"Kore",
|
||||
"Laomedeia",
|
||||
"Leda",
|
||||
"Orus",
|
||||
"Aoede",
|
||||
"Callirhoe",
|
||||
"Autonoe",
|
||||
"Enceladus",
|
||||
"Iapetus",
|
||||
"Umbriel",
|
||||
"Algieba",
|
||||
"Despina",
|
||||
"Erinome",
|
||||
"Algenib",
|
||||
"Rasalgethi",
|
||||
"Laomedeia",
|
||||
"Achernar",
|
||||
"Alnilam",
|
||||
"Schedar",
|
||||
"Gacrux",
|
||||
"Puck",
|
||||
"Pulcherrima",
|
||||
"Achird",
|
||||
"Zubenelgenubi",
|
||||
"Vindemiatrix",
|
||||
"Rasalgethi",
|
||||
"Sadachbia",
|
||||
"Sadaltager",
|
||||
"Schedar",
|
||||
"Sulafar",
|
||||
"Umbriel",
|
||||
"Vindemiatrix",
|
||||
"Zephyr",
|
||||
"Zubenelgenubi",
|
||||
]
|
||||
|
||||
class InputParams(BaseModel):
|
||||
@@ -787,19 +1078,23 @@ class GeminiTTSService(TTSService):
|
||||
|
||||
Parameters:
|
||||
language: Language for synthesis. Defaults to English.
|
||||
prompt: Optional style instructions for how to synthesize the content.
|
||||
multi_speaker: Whether to enable multi-speaker support.
|
||||
speaker_configs: List of speaker configurations for multi-speaker mode.
|
||||
"""
|
||||
|
||||
language: Optional[Language] = Language.EN
|
||||
prompt: Optional[str] = None
|
||||
multi_speaker: bool = False
|
||||
speaker_configs: Optional[List[dict]] = None
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
api_key: str,
|
||||
model: str = "gemini-2.5-flash-preview-tts",
|
||||
api_key: Optional[str] = None,
|
||||
model: str = "gemini-2.5-flash-tts",
|
||||
credentials: Optional[str] = None,
|
||||
credentials_path: Optional[str] = None,
|
||||
voice_id: str = "Kore",
|
||||
sample_rate: Optional[int] = None,
|
||||
params: Optional[InputParams] = None,
|
||||
@@ -808,14 +1103,30 @@ class GeminiTTSService(TTSService):
|
||||
"""Initializes the Gemini TTS service.
|
||||
|
||||
Args:
|
||||
api_key: Google AI API key for authentication.
|
||||
api_key:
|
||||
|
||||
.. deprecated:: 0.0.95
|
||||
The `api_key` parameter is deprecated. Use `credentials` or
|
||||
`credentials_path` instead for Google Cloud authentication.
|
||||
|
||||
model: Gemini TTS model to use. Must be a TTS model like
|
||||
"gemini-2.5-flash-preview-tts" or "gemini-2.5-pro-preview-tts".
|
||||
"gemini-2.5-flash-tts" or "gemini-2.5-pro-tts".
|
||||
credentials: JSON string containing Google Cloud service account credentials.
|
||||
credentials_path: Path to Google Cloud service account JSON file.
|
||||
voice_id: Voice name from the available Gemini voices.
|
||||
sample_rate: Audio sample rate in Hz. If None, uses Google's default 24kHz.
|
||||
params: TTS configuration parameters.
|
||||
**kwargs: Additional arguments passed to parent TTSService.
|
||||
"""
|
||||
# Handle deprecated api_key parameter
|
||||
if api_key is not None:
|
||||
warnings.warn(
|
||||
"The 'api_key' parameter is deprecated and will be removed in a future version. "
|
||||
"Use 'credentials' or 'credentials_path' instead for Google Cloud authentication.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
if sample_rate and sample_rate != self.GOOGLE_SAMPLE_RATE:
|
||||
logger.warning(
|
||||
f"Google TTS only supports {self.GOOGLE_SAMPLE_RATE}Hz sample rate. "
|
||||
@@ -828,35 +1139,20 @@ class GeminiTTSService(TTSService):
|
||||
if voice_id not in self.AVAILABLE_VOICES:
|
||||
logger.warning(f"Voice '{voice_id}' not in known voices list. Using anyway.")
|
||||
|
||||
self._api_key = api_key
|
||||
self._model = model
|
||||
self._voice_id = voice_id
|
||||
self._settings = {
|
||||
"language": self.language_to_service_language(params.language)
|
||||
if params.language
|
||||
else "en-US",
|
||||
"prompt": params.prompt,
|
||||
"multi_speaker": params.multi_speaker,
|
||||
"speaker_configs": params.speaker_configs,
|
||||
}
|
||||
|
||||
self._client = genai.Client(api_key=api_key)
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
"""Check if this service can generate processing metrics.
|
||||
|
||||
Returns:
|
||||
True, as Gemini TTS service supports metrics generation.
|
||||
"""
|
||||
return True
|
||||
|
||||
@property
|
||||
def includes_inter_frame_spaces(self) -> bool:
|
||||
"""Indicates that Gemini TTSTextFrames include necessary inter-frame spaces.
|
||||
|
||||
Returns:
|
||||
True, indicating that Gemini's text frames include necessary inter-frame spaces.
|
||||
"""
|
||||
return True
|
||||
self._client: texttospeech_v1.TextToSpeechAsyncClient = self._create_client(
|
||||
credentials, credentials_path
|
||||
)
|
||||
|
||||
def language_to_service_language(self, language: Language) -> Optional[str]:
|
||||
"""Convert a Language enum to Gemini TTS language format.
|
||||
@@ -867,7 +1163,7 @@ class GeminiTTSService(TTSService):
|
||||
Returns:
|
||||
The Gemini TTS-specific language code, or None if not supported.
|
||||
"""
|
||||
return language_to_google_tts_language(language)
|
||||
return language_to_gemini_tts_language(language)
|
||||
|
||||
def set_voice(self, voice_id: str):
|
||||
"""Set the voice for TTS generation.
|
||||
@@ -892,88 +1188,73 @@ class GeminiTTSService(TTSService):
|
||||
f"Current rate of {self.sample_rate}Hz may cause issues."
|
||||
)
|
||||
|
||||
@traced_tts
|
||||
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
||||
"""Generate speech from text using Gemini TTS models.
|
||||
async def _update_settings(self, settings: Mapping[str, Any]):
|
||||
"""Override to handle prompt updates.
|
||||
|
||||
Args:
|
||||
text: The text to synthesize into speech. Can include natural language
|
||||
instructions for style, tone, etc.
|
||||
settings: Dictionary of settings to update. Can include 'prompt' (str)
|
||||
"""
|
||||
if "prompt" in settings:
|
||||
self._settings["prompt"] = settings["prompt"]
|
||||
await super()._update_settings(settings)
|
||||
|
||||
@traced_tts
|
||||
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
||||
"""Generate streaming speech from text using Gemini TTS models.
|
||||
|
||||
Args:
|
||||
text: The text to synthesize into speech. Can include markup tags
|
||||
like [sigh], [laughing], [whispering] for expressive control.
|
||||
|
||||
Yields:
|
||||
Frame: Audio frames containing the synthesized speech.
|
||||
Frame: Audio frames containing the synthesized speech as it's generated.
|
||||
"""
|
||||
logger.debug(f"{self}: Generating TTS [{text}]")
|
||||
|
||||
try:
|
||||
await self.start_ttfb_metrics()
|
||||
|
||||
# Build the speech config
|
||||
# Build voice selection params
|
||||
if self._settings["multi_speaker"] and self._settings["speaker_configs"]:
|
||||
# Multi-speaker mode
|
||||
speaker_voice_configs = []
|
||||
for speaker_config in self._settings["speaker_configs"]:
|
||||
speaker_voice_configs.append(
|
||||
types.SpeakerVoiceConfig(
|
||||
speaker=speaker_config["speaker"],
|
||||
voice_config=types.VoiceConfig(
|
||||
prebuilt_voice_config=types.PrebuiltVoiceConfig(
|
||||
voice_name=speaker_config.get("voice_id", self._voice_id)
|
||||
)
|
||||
),
|
||||
texttospeech_v1.MultispeakerPrebuiltVoice(
|
||||
speaker_alias=speaker_config["speaker_alias"],
|
||||
speaker_id=speaker_config.get("speaker_id", self._voice_id),
|
||||
)
|
||||
)
|
||||
|
||||
speech_config = types.SpeechConfig(
|
||||
multi_speaker_voice_config=types.MultiSpeakerVoiceConfig(
|
||||
speaker_voice_configs=speaker_voice_configs
|
||||
)
|
||||
multi_speaker_voice_config = texttospeech_v1.MultiSpeakerVoiceConfig(
|
||||
speaker_voice_configs=speaker_voice_configs
|
||||
)
|
||||
|
||||
voice = texttospeech_v1.VoiceSelectionParams(
|
||||
language_code=self._settings["language"],
|
||||
model_name=self._model,
|
||||
multi_speaker_voice_config=multi_speaker_voice_config,
|
||||
)
|
||||
else:
|
||||
# Single speaker mode
|
||||
speech_config = types.SpeechConfig(
|
||||
voice_config=types.VoiceConfig(
|
||||
prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=self._voice_id)
|
||||
)
|
||||
voice = texttospeech_v1.VoiceSelectionParams(
|
||||
language_code=self._settings["language"],
|
||||
name=self._voice_id,
|
||||
model_name=self._model,
|
||||
)
|
||||
|
||||
# Create the generation config
|
||||
generation_config = types.GenerateContentConfig(
|
||||
response_modalities=["AUDIO"],
|
||||
speech_config=speech_config,
|
||||
# Create streaming config
|
||||
streaming_config = texttospeech_v1.StreamingSynthesizeConfig(
|
||||
voice=voice,
|
||||
streaming_audio_config=texttospeech_v1.StreamingAudioConfig(
|
||||
audio_encoding=texttospeech_v1.AudioEncoding.PCM,
|
||||
sample_rate_hertz=self.sample_rate,
|
||||
),
|
||||
)
|
||||
|
||||
# Generate the content
|
||||
response = await self._client.aio.models.generate_content(
|
||||
model=self._model,
|
||||
contents=text,
|
||||
config=generation_config,
|
||||
)
|
||||
|
||||
await self.start_tts_usage_metrics(text)
|
||||
|
||||
yield TTSStartedFrame()
|
||||
|
||||
# Extract audio data from response
|
||||
if response.candidates and len(response.candidates) > 0:
|
||||
candidate = response.candidates[0]
|
||||
if candidate.content and candidate.content.parts:
|
||||
for part in candidate.content.parts:
|
||||
if part.inline_data and part.inline_data.mime_type.startswith("audio/"):
|
||||
audio_data = part.inline_data.data
|
||||
await self.stop_ttfb_metrics()
|
||||
|
||||
# Gemini TTS returns PCM audio data, chunk it appropriately
|
||||
CHUNK_SIZE = self.chunk_size
|
||||
|
||||
for i in range(0, len(audio_data), CHUNK_SIZE):
|
||||
chunk = audio_data[i : i + CHUNK_SIZE]
|
||||
if not chunk:
|
||||
break
|
||||
frame = TTSAudioRawFrame(chunk, self.sample_rate, 1)
|
||||
yield frame
|
||||
|
||||
yield TTSStoppedFrame()
|
||||
# Use base class streaming logic with prompt support
|
||||
async for frame in self._stream_tts(streaming_config, text, self._settings["prompt"]):
|
||||
yield frame
|
||||
|
||||
except Exception as e:
|
||||
logger.exception(f"{self} error generating TTS: {e}")
|
||||
|
||||
@@ -66,6 +66,7 @@ class Language(StrEnum):
|
||||
AR_TN = "ar-TN"
|
||||
AR_XA = "ar-XA"
|
||||
AR_YE = "ar-YE"
|
||||
AR_001 = "ar-001"
|
||||
|
||||
# Assamese
|
||||
AS = "as"
|
||||
@@ -83,6 +84,7 @@ class Language(StrEnum):
|
||||
|
||||
# Belarusian
|
||||
BE = "be"
|
||||
BE_BY = "be-BY"
|
||||
|
||||
# Bulgarian
|
||||
BG = "bg"
|
||||
@@ -109,6 +111,7 @@ class Language(StrEnum):
|
||||
|
||||
# Cebuano
|
||||
CEB = "ceb"
|
||||
CEB_PH = "ceb-PH"
|
||||
|
||||
# Mandarin Chinese
|
||||
CMN = "cmn"
|
||||
@@ -181,6 +184,7 @@ class Language(StrEnum):
|
||||
ES_US = "es-US"
|
||||
ES_UY = "es-UY"
|
||||
ES_VE = "es-VE"
|
||||
ES_419 = "es-419"
|
||||
|
||||
# Estonian
|
||||
ET = "et"
|
||||
@@ -250,6 +254,7 @@ class Language(StrEnum):
|
||||
|
||||
# Haitian Creole
|
||||
HT = "ht"
|
||||
HT_HT = "ht-HT"
|
||||
|
||||
# Hungarian
|
||||
HU = "hu"
|
||||
@@ -288,6 +293,7 @@ class Language(StrEnum):
|
||||
# Javanese
|
||||
JV = "jv"
|
||||
JV_ID = "jv-ID"
|
||||
JV_JV = "jv-JV"
|
||||
JW = "jw" # Fal requires for Javanese
|
||||
|
||||
# Georgian
|
||||
@@ -309,6 +315,10 @@ class Language(StrEnum):
|
||||
KN = "kn"
|
||||
KN_IN = "kn-IN"
|
||||
|
||||
# Konkani
|
||||
KOK = "kok"
|
||||
KOK_IN = "kok-IN"
|
||||
|
||||
# Korean
|
||||
KO = "ko"
|
||||
KO_KR = "ko-KR"
|
||||
@@ -322,9 +332,11 @@ class Language(StrEnum):
|
||||
|
||||
# Latin
|
||||
LA = "la"
|
||||
LA_VA = "la-VA"
|
||||
|
||||
# Luxembourgish
|
||||
LB = "lb"
|
||||
LB_LU = "lb-LU"
|
||||
|
||||
# Lingala
|
||||
LN = "ln"
|
||||
@@ -349,6 +361,7 @@ class Language(StrEnum):
|
||||
|
||||
# Malagasy
|
||||
MG = "mg"
|
||||
MG_MG = "mg-MG"
|
||||
|
||||
# Maori
|
||||
MI = "mi"
|
||||
@@ -357,6 +370,10 @@ class Language(StrEnum):
|
||||
MK = "mk"
|
||||
MK_MK = "mk-MK"
|
||||
|
||||
# Maithili
|
||||
MAI = "mai"
|
||||
MAI_IN = "mai-IN"
|
||||
|
||||
# Malayalam
|
||||
ML = "ml"
|
||||
ML_IN = "ml-IN"
|
||||
@@ -387,6 +404,7 @@ class Language(StrEnum):
|
||||
NB_NO = "nb-NO"
|
||||
NO = "no"
|
||||
NN = "nn" # Norwegian Nynorsk
|
||||
NN_NO = "nn-NO"
|
||||
|
||||
# Nepali
|
||||
NE = "ne"
|
||||
@@ -440,6 +458,7 @@ class Language(StrEnum):
|
||||
|
||||
# Sindhi
|
||||
SD = "sd"
|
||||
SD_IN = "sd-IN"
|
||||
|
||||
# Sinhala
|
||||
SI = "si"
|
||||
|
||||
Reference in New Issue
Block a user