From d75a02dc51d780ea6b95d4d548db59836a3dc140 Mon Sep 17 00:00:00 2001
From: Mark Backman
Date: Tue, 1 Oct 2024 21:01:44 -0400
Subject: [PATCH] Use Language enum and set languages accordingly

---
Notes:
    Each service gains a language_to_<service>_language() helper that maps
    the shared Language enum to the provider's own language code and
    returns None for languages the provider mapping does not list.

    Review fix: CartesiaHttpTTSService now falls back to "en" when no
    language is given, matching CartesiaTTSService in this same patch
    (it previously fell back to None). The cartesia.py post-image blob id
    on the index line predates this one-line fix.

 src/pipecat/services/aws.py        | 66 +++++++++++++++++++-
 src/pipecat/services/azure.py      | 96 +++++++++++++++++++++++++++++-
 src/pipecat/services/cartesia.py   |  6 +-
 src/pipecat/services/deepgram.py   |  2 +-
 src/pipecat/services/elevenlabs.py | 74 ++++++++++++++++++++++-
 src/pipecat/services/gladia.py     | 83 +++++++++++++++++++++++++-
 src/pipecat/services/google.py     | 94 ++++++++++++++++++++++++++++-
 src/pipecat/services/lmnt.py       | 31 +++++++++-
 src/pipecat/services/xtts.py       | 49 ++++++++++++++-
 9 files changed, 484 insertions(+), 17 deletions(-)

diff --git a/src/pipecat/services/aws.py b/src/pipecat/services/aws.py
index 7004c21f7..210e0e700 100644
--- a/src/pipecat/services/aws.py
+++ b/src/pipecat/services/aws.py
@@ -17,6 +17,7 @@ from pipecat.frames.frames import (
     TTSStoppedFrame,
 )
 from pipecat.services.ai_services import TTSService
+from pipecat.transcriptions.language import Language
 
 try:
     import boto3
@@ -29,10 +30,71 @@ except ModuleNotFoundError as e:
     raise Exception(f"Missing module: {e}")
 
 
+def language_to_aws_language(language: Language) -> str | None:
+    match language:
+        case Language.CA:
+            return "ca-ES"
+        case Language.ZH:
+            return "cmn-CN"
+        case Language.DA:
+            return "da-DK"
+        case Language.NL:
+            return "nl-NL"
+        case Language.NL_BE:
+            return "nl-BE"
+        case Language.EN:
+            return "en-US"
+        case Language.EN_US:
+            return "en-US"
+        case Language.EN_AU:
+            return "en-AU"
+        case Language.EN_GB:
+            return "en-GB"
+        case Language.EN_NZ:
+            return "en-NZ"
+        case Language.EN_IN:
+            return "en-IN"
+        case Language.FI:
+            return "fi-FI"
+        case Language.FR:
+            return "fr-FR"
+        case Language.FR_CA:
+            return "fr-CA"
+        case Language.DE:
+            return "de-DE"
+        case Language.HI:
+            return "hi-IN"
+        case Language.IT:
+            return "it-IT"
+        case Language.JA:
+            return "ja-JP"
+        case Language.KO:
+            return "ko-KR"
+        case Language.NO:
+            return "nb-NO"
+        case Language.PL:
+            return "pl-PL"
+        case Language.PT:
+            return "pt-PT"
+        case Language.PT_BR:
+            return "pt-BR"
+        case Language.RO:
+            return "ro-RO"
+        case Language.RU:
+            return "ru-RU"
+        case Language.ES:
+            return "es-ES"
+        case Language.SV:
+            return "sv-SE"
+        case Language.TR:
+            return "tr-TR"
+    return None
+
+
 class AWSTTSService(TTSService):
     class InputParams(BaseModel):
         engine: Optional[str] = None
-        language: Optional[str] = None
+        language: Optional[Language] = Language.EN
         pitch: Optional[str] = None
         rate: Optional[str] = None
         volume: Optional[str] = None
@@ -59,7 +121,7 @@ class AWSTTSService(TTSService):
         self._settings = {
             "sample_rate": sample_rate,
             "engine": params.engine,
-            "language": params.language,
+            "language": language_to_aws_language(params.language) if params.language else "en-US",
             "pitch": params.pitch,
             "rate": params.rate,
             "volume": params.volume,
diff --git a/src/pipecat/services/azure.py b/src/pipecat/services/azure.py
index 1b2b9a3f2..078b7df6b 100644
--- a/src/pipecat/services/azure.py
+++ b/src/pipecat/services/azure.py
@@ -27,6 +27,7 @@ from pipecat.frames.frames import (
 )
 from pipecat.services.ai_services import ImageGenService, STTService, TTSService
 from pipecat.services.openai import BaseOpenAILLMService
+from pipecat.transcriptions.language import Language
 from pipecat.utils.time import time_now_iso8601
 
 # See .env.example for Azure configuration needed
@@ -70,10 +71,101 @@ class AzureLLMService(BaseOpenAILLMService):
         )
 
 
+def language_to_azure_language(language: Language) -> str | None:
+    match language:
+        case Language.BG:
+            return "bg-BG"
+        case Language.CA:
+            return "ca-ES"
+        case Language.ZH:
+            return "zh-CN"
+        case Language.ZH_TW:
+            return "zh-TW"
+        case Language.CS:
+            return "cs-CZ"
+        case Language.DA:
+            return "da-DK"
+        case Language.NL:
+            return "nl-NL"
+        case Language.EN:
+            return "en-US"
+        case Language.EN_US:
+            return "en-US"
+        case Language.EN_AU:
+            return "en-AU"
+        case Language.EN_GB:
+            return "en-GB"
+        case Language.EN_NZ:
+            return "en-NZ"
+        case Language.EN_IN:
+            return "en-IN"
+        case Language.ET:
+            return "et-EE"
+        case Language.FI:
+            return "fi-FI"
+        case Language.NL_BE:
+            return "nl-BE"
+        case Language.FR:
+            return "fr-FR"
+        case Language.FR_CA:
+            return "fr-CA"
+        case Language.DE:
+            return "de-DE"
+        case Language.DE_CH:
+            return "de-CH"
+        case Language.EL:
+            return "el-GR"
+        case Language.HI:
+            return "hi-IN"
+        case Language.HU:
+            return "hu-HU"
+        case Language.ID:
+            return "id-ID"
+        case Language.IT:
+            return "it-IT"
+        case Language.JA:
+            return "ja-JP"
+        case Language.KO:
+            return "ko-KR"
+        case Language.LV:
+            return "lv-LV"
+        case Language.LT:
+            return "lt-LT"
+        case Language.MS:
+            return "ms-MY"
+        case Language.NO:
+            return "nb-NO"
+        case Language.PL:
+            return "pl-PL"
+        case Language.PT:
+            return "pt-PT"
+        case Language.PT_BR:
+            return "pt-BR"
+        case Language.RO:
+            return "ro-RO"
+        case Language.RU:
+            return "ru-RU"
+        case Language.SK:
+            return "sk-SK"
+        case Language.ES:
+            return "es-ES"
+        case Language.SV:
+            return "sv-SE"
+        case Language.TH:
+            return "th-TH"
+        case Language.TR:
+            return "tr-TR"
+        case Language.UK:
+            return "uk-UA"
+        case Language.VI:
+            return "vi-VN"
+    return None
+
+
 class AzureTTSService(TTSService):
     class InputParams(BaseModel):
         emphasis: Optional[str] = None
-        language: Optional[str] = "en-US"
+        language: Optional[Language] = Language.EN
         pitch: Optional[str] = None
         rate: Optional[str] = "1.05"
         role: Optional[str] = None
@@ -99,7 +191,7 @@ class AzureTTSService(TTSService):
         self._settings = {
             "sample_rate": sample_rate,
             "emphasis": params.emphasis,
-            "language": params.language,
+            "language": language_to_azure_language(params.language) if params.language else "en-US",
             "pitch": params.pitch,
             "rate": params.rate,
             "role": params.role,
diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py
index 0817879c4..22e09cb2b 100644
--- a/src/pipecat/services/cartesia.py
+++ b/src/pipecat/services/cartesia.py
@@ -106,7 +106,7 @@ class CartesiaTTSService(WordTTSService):
                 "encoding": params.encoding,
                 "sample_rate": params.sample_rate,
             },
-            "language": language_to_cartesia_language(params.language) if params.language else None,
+            "language": language_to_cartesia_language(params.language) if params.language else "en",
             "speed": params.speed,
             "emotion": params.emotion,
         }
@@ -280,7 +280,7 @@ class CartesiaHttpTTSService(TTSService):
         encoding: Optional[str] = "pcm_s16le"
         sample_rate: Optional[int] = 16000
         container: Optional[str] = "raw"
-        language: Optional[str] = "en"
+        language: Optional[Language] = Language.EN
         speed: Optional[Union[str, float]] = ""
         emotion: Optional[List[str]] = []
 
@@ -303,7 +303,7 @@ class CartesiaHttpTTSService(TTSService):
                 "encoding": params.encoding,
                 "sample_rate": params.sample_rate,
             },
-            "language": params.language,
+            "language": language_to_cartesia_language(params.language) if params.language else "en",
             "speed": params.speed,
             "emotion": params.emotion,
         }
diff --git a/src/pipecat/services/deepgram.py b/src/pipecat/services/deepgram.py
index 40fe0168d..55a3ba68e 100644
--- a/src/pipecat/services/deepgram.py
+++ b/src/pipecat/services/deepgram.py
@@ -120,7 +120,7 @@ class DeepgramSTTService(STTService):
         url: str = "",
         live_options: LiveOptions = LiveOptions(
             encoding="linear16",
-            language="en-US",
+            language=Language.EN,
             model="nova-2-conversationalai",
             sample_rate=16000,
             channels=1,
diff --git a/src/pipecat/services/elevenlabs.py b/src/pipecat/services/elevenlabs.py
index 871b3eec6..695988b03 100644
--- a/src/pipecat/services/elevenlabs.py
+++ b/src/pipecat/services/elevenlabs.py
@@ -50,6 +50,76 @@ def sample_rate_from_output_format(output_format: str) -> int:
     return 16000
 
 
+def language_to_elevenlabs_language(language: Language) -> str | None:
+    match language:
+        case Language.BG:
+            return "bg"
+        case Language.ZH:
+            return "zh"
+        case Language.CS:
+            return "cs"
+        case Language.DA:
+            return "da"
+        case Language.NL:
+            return "nl"
+        case (
+            Language.EN
+            | Language.EN_US
+            | Language.EN_AU
+            | Language.EN_GB
+            | Language.EN_NZ
+            | Language.EN_IN
+        ):
+            return "en"
+        case Language.FI:
+            return "fi"
+        case Language.FR | Language.FR_CA:
+            return "fr"
+        case Language.DE | Language.DE_CH:
+            return "de"
+        case Language.EL:
+            return "el"
+        case Language.HI:
+            return "hi"
+        case Language.HU:
+            return "hu"
+        case Language.ID:
+            return "id"
+        case Language.IT:
+            return "it"
+        case Language.JA:
+            return "ja"
+        case Language.KO:
+            return "ko"
+        case Language.MS:
+            return "ms"
+        case Language.NO:
+            return "no"
+        case Language.PL:
+            return "pl"
+        case Language.PT:
+            return "pt-PT"
+        case Language.PT_BR:
+            return "pt-BR"
+        case Language.RO:
+            return "ro"
+        case Language.RU:
+            return "ru"
+        case Language.SK:
+            return "sk"
+        case Language.ES:
+            return "es"
+        case Language.SV:
+            return "sv"
+        case Language.TR:
+            return "tr"
+        case Language.UK:
+            return "uk"
+        case Language.VI:
+            return "vi"
+    return None
+
+
 def calculate_word_times(
     alignment_info: Mapping[str, Any], cumulative_time: float
 ) -> List[Tuple[str, float]]:
@@ -128,7 +198,9 @@ class ElevenLabsTTSService(WordTTSService):
         self._url = url
         self._settings = {
             "sample_rate": sample_rate_from_output_format(params.output_format),
-            "language": params.language,
+            "language": language_to_elevenlabs_language(params.language)
+            if params.language
+            else "en",
             "output_format": params.output_format,
             "optimize_streaming_latency": params.optimize_streaming_latency,
             "stability": params.stability,
diff --git a/src/pipecat/services/gladia.py b/src/pipecat/services/gladia.py
index 16f3dab97..0938a1cee 100644
--- a/src/pipecat/services/gladia.py
+++ b/src/pipecat/services/gladia.py
@@ -20,6 +20,7 @@ from pipecat.frames.frames import (
     TranscriptionFrame,
 )
 from pipecat.services.ai_services import STTService
+from pipecat.transcriptions.language import Language
 from pipecat.utils.time import time_now_iso8601
 
 # See .env.example for Gladia configuration needed
@@ -33,10 +34,88 @@ except ModuleNotFoundError as e:
     raise Exception(f"Missing module: {e}")
 
 
+def language_to_gladia_language(language: Language) -> str | None:
+    match language:
+        case Language.BG:
+            return "bg"
+        case Language.CA:
+            return "ca"
+        case Language.ZH:
+            return "zh"
+        case Language.CS:
+            return "cs"
+        case Language.DA:
+            return "da"
+        case Language.NL:
+            return "nl"
+        case (
+            Language.EN
+            | Language.EN_US
+            | Language.EN_AU
+            | Language.EN_GB
+            | Language.EN_NZ
+            | Language.EN_IN
+        ):
+            return "en"
+        case Language.ET:
+            return "et"
+        case Language.FI:
+            return "fi"
+        case Language.FR | Language.FR_CA:
+            return "fr"
+        case Language.DE | Language.DE_CH:
+            return "de"
+        case Language.EL:
+            return "el"
+        case Language.HI:
+            return "hi"
+        case Language.HU:
+            return "hu"
+        case Language.ID:
+            return "id"
+        case Language.IT:
+            return "it"
+        case Language.JA:
+            return "ja"
+        case Language.KO:
+            return "ko"
+        case Language.LV:
+            return "lv"
+        case Language.LT:
+            return "lt"
+        case Language.MS:
+            return "ms"
+        case Language.NO:
+            return "no"
+        case Language.PL:
+            return "pl"
+        case Language.PT | Language.PT_BR:
+            return "pt"
+        case Language.RO:
+            return "ro"
+        case Language.RU:
+            return "ru"
+        case Language.SK:
+            return "sk"
+        case Language.ES:
+            return "es"
+        case Language.SV:
+            return "sv"
+        case Language.TH:
+            return "th"
+        case Language.TR:
+            return "tr"
+        case Language.UK:
+            return "uk"
+        case Language.VI:
+            return "vi"
+    return None
+
+
 class GladiaSTTService(STTService):
     class InputParams(BaseModel):
         sample_rate: Optional[int] = 16000
-        language: Optional[str] = "english"
+        language: Optional[Language] = Language.EN
         transcription_hint: Optional[str] = None
         endpointing: Optional[int] = 200
         prosody: Optional[bool] = None
@@ -56,7 +135,7 @@ class GladiaSTTService(STTService):
         self._url = url
         self._settings = {
             "sample_rate": params.sample_rate,
-            "language": params.language,
+            "language": language_to_gladia_language(params.language) if params.language else "en",
             "transcription_hint": params.transcription_hint,
             "endpointing": params.endpointing,
             "prosody": params.prosody,
diff --git a/src/pipecat/services/google.py b/src/pipecat/services/google.py
index 05fff2056..55a2576d2 100644
--- a/src/pipecat/services/google.py
+++ b/src/pipecat/services/google.py
@@ -30,6 +30,7 @@ from pipecat.processors.aggregators.openai_llm_context import (
 )
 from pipecat.processors.frame_processor import FrameDirection
 from pipecat.services.ai_services import LLMService, TTSService
+from pipecat.transcriptions.language import Language
 
 try:
     import google.ai.generativelanguage as glm
@@ -145,13 +146,100 @@ class GoogleLLMService(LLMService):
             await self._process_context(context)
 
 
+def language_to_google_language(language: Language) -> str | None:
+    match language:
+        case Language.BG:
+            return "bg-BG"
+        case Language.CA:
+            return "ca-ES"
+        case Language.ZH:
+            return "cmn-CN"
+        case Language.ZH_TW:
+            return "cmn-TW"
+        case Language.CS:
+            return "cs-CZ"
+        case Language.DA:
+            return "da-DK"
+        case Language.NL:
+            return "nl-NL"
+        case Language.EN:
+            return "en-US"
+        case Language.EN_US:
+            return "en-US"
+        case Language.EN_AU:
+            return "en-AU"
+        case Language.EN_GB:
+            return "en-GB"
+        case Language.EN_IN:
+            return "en-IN"
+        case Language.ET:
+            return "et-EE"
+        case Language.FI:
+            return "fi-FI"
+        case Language.NL_BE:
+            return "nl-BE"
+        case Language.FR:
+            return "fr-FR"
+        case Language.FR_CA:
+            return "fr-CA"
+        case Language.DE:
+            return "de-DE"
+        case Language.EL:
+            return "el-GR"
+        case Language.HI:
+            return "hi-IN"
+        case Language.HU:
+            return "hu-HU"
+        case Language.ID:
+            return "id-ID"
+        case Language.IT:
+            return "it-IT"
+        case Language.JA:
+            return "ja-JP"
+        case Language.KO:
+            return "ko-KR"
+        case Language.LV:
+            return "lv-LV"
+        case Language.LT:
+            return "lt-LT"
+        case Language.MS:
+            return "ms-MY"
+        case Language.NO:
+            return "nb-NO"
+        case Language.PL:
+            return "pl-PL"
+        case Language.PT:
+            return "pt-PT"
+        case Language.PT_BR:
+            return "pt-BR"
+        case Language.RO:
+            return "ro-RO"
+        case Language.RU:
+            return "ru-RU"
+        case Language.SK:
+            return "sk-SK"
+        case Language.ES:
+            return "es-ES"
+        case Language.SV:
+            return "sv-SE"
+        case Language.TH:
+            return "th-TH"
+        case Language.TR:
+            return "tr-TR"
+        case Language.UK:
+            return "uk-UA"
+        case Language.VI:
+            return "vi-VN"
+    return None
+
+
 class GoogleTTSService(TTSService):
     class InputParams(BaseModel):
         pitch: Optional[str] = None
         rate: Optional[str] = None
         volume: Optional[str] = None
         emphasis: Optional[Literal["strong", "moderate", "reduced", "none"]] = None
-        language: Optional[str] = "en-US"
+        language: Optional[Language] = Language.EN
         gender: Optional[Literal["male", "female", "neutral"]] = None
         google_style: Optional[Literal["apologetic", "calm", "empathetic", "firm", "lively"]] = None
 
@@ -173,7 +261,9 @@ class GoogleTTSService(TTSService):
             "rate": params.rate,
             "volume": params.volume,
             "emphasis": params.emphasis,
-            "language": params.language,
+            "language": language_to_google_language(params.language)
+            if params.language
+            else "en-US",
             "gender": params.gender,
             "google_style": params.google_style,
         }
diff --git a/src/pipecat/services/lmnt.py b/src/pipecat/services/lmnt.py
index c828e7a7a..1ea3f1e62 100644
--- a/src/pipecat/services/lmnt.py
+++ b/src/pipecat/services/lmnt.py
@@ -22,6 +22,7 @@ from pipecat.frames.frames import (
 )
 from pipecat.processors.frame_processor import FrameDirection
 from pipecat.services.ai_services import TTSService
+from pipecat.transcriptions.language import Language
 
 # See .env.example for LMNT configuration needed
 try:
@@ -34,6 +35,32 @@ except ModuleNotFoundError as e:
     raise Exception(f"Missing module: {e}")
 
 
+def language_to_lmnt_language(language: Language) -> str | None:
+    match language:
+        case Language.DE:
+            return "de"
+        case (
+            Language.EN
+            | Language.EN_US
+            | Language.EN_AU
+            | Language.EN_GB
+            | Language.EN_NZ
+            | Language.EN_IN
+        ):
+            return "en"
+        case Language.ES:
+            return "es"
+        case Language.FR | Language.FR_CA:
+            return "fr"
+        case Language.PT | Language.PT_BR:
+            return "pt"
+        case Language.ZH | Language.ZH_TW:
+            return "zh"
+        case Language.KO:
+            return "ko"
+    return None
+
+
 class LmntTTSService(TTSService):
     def __init__(
         self,
@@ -41,7 +68,7 @@ class LmntTTSService(TTSService):
         api_key: str,
         voice_id: str,
         sample_rate: int = 24000,
-        language: str = "en",
+        language: Language = Language.EN,
         **kwargs,
     ):
         # Let TTSService produce TTSStoppedFrames after a short delay of
@@ -55,7 +82,7 @@ class LmntTTSService(TTSService):
                 "encoding": "pcm_s16le",
                 "sample_rate": sample_rate,
             },
-            "language": language,
+            "language": language_to_lmnt_language(language) if language else "en",
         }
 
         self.set_voice(voice_id)
diff --git a/src/pipecat/services/xtts.py b/src/pipecat/services/xtts.py
index 7826cfcd8..eb20d5f3c 100644
--- a/src/pipecat/services/xtts.py
+++ b/src/pipecat/services/xtts.py
@@ -19,6 +19,7 @@ from pipecat.frames.frames import (
     TTSStoppedFrame,
 )
 from pipecat.services.ai_services import TTSService
+from pipecat.transcriptions.language import Language
 
 try:
     import resampy
@@ -36,12 +37,56 @@ except ModuleNotFoundError as e:
 # https://github.com/coqui-ai/xtts-streaming-server
 
 
+def language_to_xtts_language(language: Language) -> str | None:
+    match language:
+        case Language.CS:
+            return "cs"
+        case Language.DE:
+            return "de"
+        case (
+            Language.EN
+            | Language.EN_US
+            | Language.EN_AU
+            | Language.EN_GB
+            | Language.EN_NZ
+            | Language.EN_IN
+        ):
+            return "en"
+        case Language.ES:
+            return "es"
+        case Language.FR:
+            return "fr"
+        case Language.HI:
+            return "hi"
+        case Language.HU:
+            return "hu"
+        case Language.IT:
+            return "it"
+        case Language.JA:
+            return "ja"
+        case Language.KO:
+            return "ko"
+        case Language.NL:
+            return "nl"
+        case Language.PL:
+            return "pl"
+        case Language.PT | Language.PT_BR:
+            return "pt"
+        case Language.RU:
+            return "ru"
+        case Language.TR:
+            return "tr"
+        case Language.ZH:
+            return "zh-cn"
+    return None
+
+
 class XTTSService(TTSService):
     def __init__(
         self,
         *,
         voice_id: str,
-        language: str,
+        language: Language,
         base_url: str,
         aiohttp_session: aiohttp.ClientSession,
         **kwargs,
@@ -49,7 +94,7 @@ class XTTSService(TTSService):
         super().__init__(**kwargs)
 
         self._settings = {
-            "language": language,
+            "language": language_to_xtts_language(language) if language else "en",
             "base_url": base_url,
         }
         self.set_voice(voice_id)