# Gemini Live language codes keyed by pipecat ``Language`` value.
#
# Built once at import time so that lookups do not re-create the table on
# every call. Region-less entries (e.g. ``Language.FR``) are mapped to a
# default regional variant.
_GEMINI_LANGUAGE_CODES = {
    # Arabic
    Language.AR: "ar-XA",
    # Bengali
    Language.BN_IN: "bn-IN",
    # Chinese (Mandarin)
    Language.CMN: "cmn-CN",
    Language.CMN_CN: "cmn-CN",
    Language.ZH: "cmn-CN",  # Map general Chinese to Mandarin for Gemini
    Language.ZH_CN: "cmn-CN",  # Map Simplified Chinese to Mandarin for Gemini
    # German
    Language.DE: "de-DE",
    Language.DE_DE: "de-DE",
    # English
    Language.EN: "en-US",  # Region-less English defaults to US English
    Language.EN_US: "en-US",
    Language.EN_AU: "en-AU",
    Language.EN_GB: "en-GB",
    Language.EN_IN: "en-IN",
    # Spanish
    Language.ES: "es-ES",  # Default to Spain Spanish
    Language.ES_ES: "es-ES",
    Language.ES_US: "es-US",
    # French
    Language.FR: "fr-FR",  # Default to France French
    Language.FR_FR: "fr-FR",
    Language.FR_CA: "fr-CA",
    # Gujarati
    Language.GU: "gu-IN",
    Language.GU_IN: "gu-IN",
    # Hindi
    Language.HI: "hi-IN",
    Language.HI_IN: "hi-IN",
    # Indonesian
    Language.ID: "id-ID",
    Language.ID_ID: "id-ID",
    # Italian
    Language.IT: "it-IT",
    Language.IT_IT: "it-IT",
    # Japanese
    Language.JA: "ja-JP",
    Language.JA_JP: "ja-JP",
    # Kannada
    Language.KN: "kn-IN",
    Language.KN_IN: "kn-IN",
    # Korean
    Language.KO: "ko-KR",
    Language.KO_KR: "ko-KR",
    # Malayalam
    Language.ML: "ml-IN",
    Language.ML_IN: "ml-IN",
    # Marathi
    Language.MR: "mr-IN",
    Language.MR_IN: "mr-IN",
    # Dutch
    Language.NL: "nl-NL",
    Language.NL_NL: "nl-NL",
    # Polish
    Language.PL: "pl-PL",
    Language.PL_PL: "pl-PL",
    # Portuguese (Brazil)
    Language.PT_BR: "pt-BR",
    # Russian
    Language.RU: "ru-RU",
    Language.RU_RU: "ru-RU",
    # Tamil
    Language.TA: "ta-IN",
    Language.TA_IN: "ta-IN",
    # Telugu
    Language.TE: "te-IN",
    Language.TE_IN: "te-IN",
    # Thai
    Language.TH: "th-TH",
    Language.TH_TH: "th-TH",
    # Turkish
    Language.TR: "tr-TR",
    Language.TR_TR: "tr-TR",
    # Vietnamese
    Language.VI: "vi-VN",
    Language.VI_VN: "vi-VN",
}


def language_to_gemini_language(language: Language) -> Optional[str]:
    """Map a Language enum value to a Gemini Live supported language code.

    Source:
    https://ai.google.dev/api/generate-content#MediaResolution

    Args:
        language: The pipecat ``Language`` value to translate.

    Returns:
        The Gemini Live language code (for example ``"en-US"``), or None if
        the language has no entry in the mapping table. Callers are expected
        to choose their own fallback when None is returned.
    """
    return _GEMINI_LANGUAGE_CODES.get(language)
def set_language(self, language: Language):
    """Set the language for generation.

    The language is translated to a Gemini Live language code and stored in
    ``self._language_code`` and ``self._settings["language"]``; the requested
    ``Language`` value itself is kept in ``self._language``.

    Args:
        language: The requested language. If it has no Gemini Live mapping,
            a warning is logged and "en-US" is used instead.
    """
    language_code = language_to_gemini_language(language)
    if not language_code:
        # Make the substitution visible: without this warning the only log
        # line would report "en-US" and hide that the request was rejected.
        logger.warning(
            f"Language {language} is not supported by Gemini Live, falling back to en-US"
        )
        language_code = "en-US"

    self._language = language
    self._language_code = language_code
    self._settings["language"] = language_code
    logger.info(f"Set Gemini language to: {self._language_code}")
@@ -431,6 +539,7 @@ class GeminiMultimodalLiveLLMService(LLMService): "voice_config": { "prebuilt_voice_config": {"voice_name": self._voice_id} }, + "language_code": self._settings["language"], }, }, }, diff --git a/src/pipecat/transcriptions/language.py b/src/pipecat/transcriptions/language.py index 197564740..a6de4f46e 100644 --- a/src/pipecat/transcriptions/language.py +++ b/src/pipecat/transcriptions/language.py @@ -83,6 +83,10 @@ class Language(StrEnum): CA = "ca" CA_ES = "ca-ES" + # Mandarin Chinese + CMN = "cmn" + CMN_CN = "cmn-CN" + # Czech CS = "cs" CS_CZ = "cs-CZ"