Use Language enum and set languages accordingly

This commit is contained in:
Mark Backman
2024-10-01 21:01:44 -04:00
parent 28643b453d
commit d75a02dc51
9 changed files with 484 additions and 17 deletions

View File

@@ -17,6 +17,7 @@ from pipecat.frames.frames import (
TTSStoppedFrame,
)
from pipecat.services.ai_services import TTSService
from pipecat.transcriptions.language import Language
try:
import boto3
@@ -29,10 +30,71 @@ except ModuleNotFoundError as e:
raise Exception(f"Missing module: {e}")
def language_to_aws_language(language: Language) -> str | None:
    """Translate a ``Language`` member into its AWS language code.

    Args:
        language: The language to translate.

    Returns:
        The AWS language code string (for example ``"en-US"``), or ``None``
        when the language has no entry in the table below.
    """
    aws_codes = {
        Language.CA: "ca-ES",
        Language.ZH: "cmn-CN",
        Language.DA: "da-DK",
        Language.NL: "nl-NL",
        Language.NL_BE: "nl-BE",
        # Plain English and US English share the same code.
        Language.EN: "en-US",
        Language.EN_US: "en-US",
        Language.EN_AU: "en-AU",
        Language.EN_GB: "en-GB",
        Language.EN_NZ: "en-NZ",
        Language.EN_IN: "en-IN",
        Language.FI: "fi-FI",
        Language.FR: "fr-FR",
        Language.FR_CA: "fr-CA",
        Language.DE: "de-DE",
        Language.HI: "hi-IN",
        Language.IT: "it-IT",
        Language.JA: "ja-JP",
        Language.KO: "ko-KR",
        Language.NO: "nb-NO",
        Language.PL: "pl-PL",
        Language.PT: "pt-PT",
        Language.PT_BR: "pt-BR",
        Language.RO: "ro-RO",
        Language.RU: "ru-RU",
        Language.ES: "es-ES",
        Language.SV: "sv-SE",
        Language.TR: "tr-TR",
    }
    # dict.get yields None for anything missing from the table.
    return aws_codes.get(language)
class AWSTTSService(TTSService):
class InputParams(BaseModel):
engine: Optional[str] = None
language: Optional[str] = None
language: Optional[Language] = Language.EN
pitch: Optional[str] = None
rate: Optional[str] = None
volume: Optional[str] = None
@@ -59,7 +121,7 @@ class AWSTTSService(TTSService):
self._settings = {
"sample_rate": sample_rate,
"engine": params.engine,
"language": params.language,
"language": language_to_aws_language(params.language) if params.language else "en-US",
"pitch": params.pitch,
"rate": params.rate,
"volume": params.volume,

View File

@@ -27,6 +27,7 @@ from pipecat.frames.frames import (
)
from pipecat.services.ai_services import ImageGenService, STTService, TTSService
from pipecat.services.openai import BaseOpenAILLMService
from pipecat.transcriptions.language import Language
from pipecat.utils.time import time_now_iso8601
# See .env.example for Azure configuration needed
@@ -70,10 +71,101 @@ class AzureLLMService(BaseOpenAILLMService):
)
def language_to_azure_language(language: Language) -> str | None:
    """Translate a ``Language`` member into its Azure language code.

    Args:
        language: The language to translate.

    Returns:
        The Azure language code string (for example ``"en-US"``), or ``None``
        when the language has no entry in the table below.
    """
    azure_codes = {
        Language.BG: "bg-BG",
        Language.CA: "ca-ES",
        Language.ZH: "zh-CN",
        Language.ZH_TW: "zh-TW",
        Language.CS: "cs-CZ",
        Language.DA: "da-DK",
        Language.NL: "nl-NL",
        # Plain English and US English share the same code.
        Language.EN: "en-US",
        Language.EN_US: "en-US",
        Language.EN_AU: "en-AU",
        Language.EN_GB: "en-GB",
        Language.EN_NZ: "en-NZ",
        Language.EN_IN: "en-IN",
        Language.ET: "et-EE",
        Language.FI: "fi-FI",
        Language.NL_BE: "nl-BE",
        Language.FR: "fr-FR",
        Language.FR_CA: "fr-CA",
        Language.DE: "de-DE",
        Language.DE_CH: "de-CH",
        Language.EL: "el-GR",
        Language.HI: "hi-IN",
        Language.HU: "hu-HU",
        Language.ID: "id-ID",
        Language.IT: "it-IT",
        Language.JA: "ja-JP",
        Language.KO: "ko-KR",
        Language.LV: "lv-LV",
        Language.LT: "lt-LT",
        Language.MS: "ms-MY",
        Language.NO: "nb-NO",
        Language.PL: "pl-PL",
        Language.PT: "pt-PT",
        Language.PT_BR: "pt-BR",
        Language.RO: "ro-RO",
        Language.RU: "ru-RU",
        Language.SK: "sk-SK",
        Language.ES: "es-ES",
        Language.SV: "sv-SE",
        Language.TH: "th-TH",
        Language.TR: "tr-TR",
        Language.UK: "uk-UA",
        Language.VI: "vi-VN",
    }
    # dict.get yields None for anything missing from the table.
    return azure_codes.get(language)
class AzureTTSService(TTSService):
class InputParams(BaseModel):
emphasis: Optional[str] = None
language: Optional[str] = "en-US"
language: Optional[Language] = Language.EN
pitch: Optional[str] = None
rate: Optional[str] = "1.05"
role: Optional[str] = None
@@ -99,7 +191,7 @@ class AzureTTSService(TTSService):
self._settings = {
"sample_rate": sample_rate,
"emphasis": params.emphasis,
"language": params.language,
"language": language_to_azure_language(params.language) if params.language else "en-US",
"pitch": params.pitch,
"rate": params.rate,
"role": params.role,

View File

@@ -106,7 +106,7 @@ class CartesiaTTSService(WordTTSService):
"encoding": params.encoding,
"sample_rate": params.sample_rate,
},
"language": language_to_cartesia_language(params.language) if params.language else None,
"language": language_to_cartesia_language(params.language) if params.language else "en",
"speed": params.speed,
"emotion": params.emotion,
}
@@ -280,7 +280,7 @@ class CartesiaHttpTTSService(TTSService):
encoding: Optional[str] = "pcm_s16le"
sample_rate: Optional[int] = 16000
container: Optional[str] = "raw"
language: Optional[str] = "en"
language: Optional[Language] = Language.EN
speed: Optional[Union[str, float]] = ""
emotion: Optional[List[str]] = []
@@ -303,7 +303,7 @@ class CartesiaHttpTTSService(TTSService):
"encoding": params.encoding,
"sample_rate": params.sample_rate,
},
"language": params.language,
"language": language_to_cartesia_language(params.language) if params.language else None,
"speed": params.speed,
"emotion": params.emotion,
}

View File

@@ -120,7 +120,7 @@ class DeepgramSTTService(STTService):
url: str = "",
live_options: LiveOptions = LiveOptions(
encoding="linear16",
language="en-US",
language=Language.EN,
model="nova-2-conversationalai",
sample_rate=16000,
channels=1,

View File

@@ -50,6 +50,76 @@ def sample_rate_from_output_format(output_format: str) -> int:
return 16000
def language_to_elevenlabs_language(language: Language) -> str | None:
    """Translate a ``Language`` member into its ElevenLabs language code.

    Regional variants of English, French and German collapse onto the base
    language code; the two Portuguese variants keep distinct codes.

    Args:
        language: The language to translate.

    Returns:
        The ElevenLabs language code string (for example ``"en"``), or
        ``None`` when the language has no entry in the table below.
    """
    elevenlabs_codes = {
        Language.BG: "bg",
        Language.ZH: "zh",
        Language.CS: "cs",
        Language.DA: "da",
        Language.NL: "nl",
        # Every English variant maps to the same code.
        Language.EN: "en",
        Language.EN_US: "en",
        Language.EN_AU: "en",
        Language.EN_GB: "en",
        Language.EN_NZ: "en",
        Language.EN_IN: "en",
        Language.FI: "fi",
        Language.FR: "fr",
        Language.FR_CA: "fr",
        Language.DE: "de",
        Language.DE_CH: "de",
        Language.EL: "el",
        Language.HI: "hi",
        Language.HU: "hu",
        Language.ID: "id",
        Language.IT: "it",
        Language.JA: "ja",
        Language.KO: "ko",
        Language.MS: "ms",
        Language.NO: "no",
        Language.PL: "pl",
        Language.PT: "pt-PT",
        Language.PT_BR: "pt-BR",
        Language.RO: "ro",
        Language.RU: "ru",
        Language.SK: "sk",
        Language.ES: "es",
        Language.SV: "sv",
        Language.TR: "tr",
        Language.UK: "uk",
        Language.VI: "vi",
    }
    # dict.get yields None for anything missing from the table.
    return elevenlabs_codes.get(language)
def calculate_word_times(
alignment_info: Mapping[str, Any], cumulative_time: float
) -> List[Tuple[str, float]]:
@@ -128,7 +198,9 @@ class ElevenLabsTTSService(WordTTSService):
self._url = url
self._settings = {
"sample_rate": sample_rate_from_output_format(params.output_format),
"language": params.language,
"language": language_to_elevenlabs_language(params.language)
if params.language
else "en",
"output_format": params.output_format,
"optimize_streaming_latency": params.optimize_streaming_latency,
"stability": params.stability,

View File

@@ -20,6 +20,7 @@ from pipecat.frames.frames import (
TranscriptionFrame,
)
from pipecat.services.ai_services import STTService
from pipecat.transcriptions.language import Language
from pipecat.utils.time import time_now_iso8601
# See .env.example for Gladia configuration needed
@@ -33,10 +34,88 @@ except ModuleNotFoundError as e:
raise Exception(f"Missing module: {e}")
def language_to_gladia_language(language: Language) -> str | None:
    """Translate a ``Language`` member into its Gladia language code.

    Regional variants of English, French, German and Portuguese collapse onto
    the base language code.

    Args:
        language: The language to translate.

    Returns:
        The Gladia language code string (for example ``"en"``), or ``None``
        when the language has no entry in the table below.
    """
    gladia_codes = {
        Language.BG: "bg",
        Language.CA: "ca",
        Language.ZH: "zh",
        Language.CS: "cs",
        Language.DA: "da",
        Language.NL: "nl",
        # Every English variant maps to the same code.
        Language.EN: "en",
        Language.EN_US: "en",
        Language.EN_AU: "en",
        Language.EN_GB: "en",
        Language.EN_NZ: "en",
        Language.EN_IN: "en",
        Language.ET: "et",
        Language.FI: "fi",
        Language.FR: "fr",
        Language.FR_CA: "fr",
        Language.DE: "de",
        Language.DE_CH: "de",
        Language.EL: "el",
        Language.HI: "hi",
        Language.HU: "hu",
        Language.ID: "id",
        Language.IT: "it",
        Language.JA: "ja",
        Language.KO: "ko",
        Language.LV: "lv",
        Language.LT: "lt",
        Language.MS: "ms",
        Language.NO: "no",
        Language.PL: "pl",
        Language.PT: "pt",
        Language.PT_BR: "pt",
        Language.RO: "ro",
        Language.RU: "ru",
        Language.SK: "sk",
        Language.ES: "es",
        Language.SV: "sv",
        Language.TH: "th",
        Language.TR: "tr",
        Language.UK: "uk",
        Language.VI: "vi",
    }
    # dict.get yields None for anything missing from the table.
    return gladia_codes.get(language)
class GladiaSTTService(STTService):
class InputParams(BaseModel):
sample_rate: Optional[int] = 16000
language: Optional[str] = "english"
language: Optional[Language] = Language.EN
transcription_hint: Optional[str] = None
endpointing: Optional[int] = 200
prosody: Optional[bool] = None
@@ -56,7 +135,7 @@ class GladiaSTTService(STTService):
self._url = url
self._settings = {
"sample_rate": params.sample_rate,
"language": params.language,
"language": language_to_gladia_language(params.language) if params.language else "en",
"transcription_hint": params.transcription_hint,
"endpointing": params.endpointing,
"prosody": params.prosody,

View File

@@ -30,6 +30,7 @@ from pipecat.processors.aggregators.openai_llm_context import (
)
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.ai_services import LLMService, TTSService
from pipecat.transcriptions.language import Language
try:
import google.ai.generativelanguage as glm
@@ -145,13 +146,100 @@ class GoogleLLMService(LLMService):
await self._process_context(context)
def language_to_google_language(language: Language) -> str | None:
    """Translate a ``Language`` member into its Google language code.

    Args:
        language: The language to translate.

    Returns:
        The Google language code string (for example ``"en-US"``), or
        ``None`` when the language has no entry in the table below.
    """
    google_codes = {
        Language.BG: "bg-BG",
        Language.CA: "ca-ES",
        Language.ZH: "cmn-CN",
        Language.ZH_TW: "cmn-TW",
        Language.CS: "cs-CZ",
        Language.DA: "da-DK",
        Language.NL: "nl-NL",
        # Plain English and US English share the same code.
        Language.EN: "en-US",
        Language.EN_US: "en-US",
        Language.EN_AU: "en-AU",
        Language.EN_GB: "en-GB",
        Language.EN_IN: "en-IN",
        Language.ET: "et-EE",
        Language.FI: "fi-FI",
        Language.NL_BE: "nl-BE",
        Language.FR: "fr-FR",
        Language.FR_CA: "fr-CA",
        Language.DE: "de-DE",
        Language.EL: "el-GR",
        Language.HI: "hi-IN",
        Language.HU: "hu-HU",
        Language.ID: "id-ID",
        Language.IT: "it-IT",
        Language.JA: "ja-JP",
        Language.KO: "ko-KR",
        Language.LV: "lv-LV",
        Language.LT: "lt-LT",
        Language.MS: "ms-MY",
        Language.NO: "nb-NO",
        Language.PL: "pl-PL",
        Language.PT: "pt-PT",
        Language.PT_BR: "pt-BR",
        Language.RO: "ro-RO",
        Language.RU: "ru-RU",
        Language.SK: "sk-SK",
        Language.ES: "es-ES",
        Language.SV: "sv-SE",
        Language.TH: "th-TH",
        Language.TR: "tr-TR",
        Language.UK: "uk-UA",
        Language.VI: "vi-VN",
    }
    # dict.get yields None for anything missing from the table.
    return google_codes.get(language)
class GoogleTTSService(TTSService):
class InputParams(BaseModel):
pitch: Optional[str] = None
rate: Optional[str] = None
volume: Optional[str] = None
emphasis: Optional[Literal["strong", "moderate", "reduced", "none"]] = None
language: Optional[str] = "en-US"
language: Optional[Language] = Language.EN
gender: Optional[Literal["male", "female", "neutral"]] = None
google_style: Optional[Literal["apologetic", "calm", "empathetic", "firm", "lively"]] = None
@@ -173,7 +261,9 @@ class GoogleTTSService(TTSService):
"rate": params.rate,
"volume": params.volume,
"emphasis": params.emphasis,
"language": params.language,
"language": language_to_google_language(params.language)
if params.language
else "en-US",
"gender": params.gender,
"google_style": params.google_style,
}

View File

@@ -22,6 +22,7 @@ from pipecat.frames.frames import (
)
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.ai_services import TTSService
from pipecat.transcriptions.language import Language
# See .env.example for LMNT configuration needed
try:
@@ -34,6 +35,32 @@ except ModuleNotFoundError as e:
raise Exception(f"Missing module: {e}")
def language_to_lmnt_language(language: Language) -> str | None:
    """Translate a ``Language`` member into its LMNT language code.

    Regional variants of English, French, Portuguese and Chinese collapse
    onto the base language code.

    Args:
        language: The language to translate.

    Returns:
        The LMNT language code string (for example ``"en"``), or ``None``
        when the language has no entry in the table below.
    """
    lmnt_codes = {
        Language.DE: "de",
        # Every English variant maps to the same code.
        Language.EN: "en",
        Language.EN_US: "en",
        Language.EN_AU: "en",
        Language.EN_GB: "en",
        Language.EN_NZ: "en",
        Language.EN_IN: "en",
        Language.ES: "es",
        Language.FR: "fr",
        Language.FR_CA: "fr",
        Language.PT: "pt",
        Language.PT_BR: "pt",
        Language.ZH: "zh",
        Language.ZH_TW: "zh",
        Language.KO: "ko",
    }
    # dict.get yields None for anything missing from the table.
    return lmnt_codes.get(language)
class LmntTTSService(TTSService):
def __init__(
self,
@@ -41,7 +68,7 @@ class LmntTTSService(TTSService):
api_key: str,
voice_id: str,
sample_rate: int = 24000,
language: str = "en",
language: Language = Language.EN,
**kwargs,
):
# Let TTSService produce TTSStoppedFrames after a short delay of
@@ -55,7 +82,7 @@ class LmntTTSService(TTSService):
"encoding": "pcm_s16le",
"sample_rate": sample_rate,
},
"language": language,
"language": language_to_lmnt_language(language) if language else "en",
}
self.set_voice(voice_id)

View File

@@ -19,6 +19,7 @@ from pipecat.frames.frames import (
TTSStoppedFrame,
)
from pipecat.services.ai_services import TTSService
from pipecat.transcriptions.language import Language
try:
import resampy
@@ -36,12 +37,56 @@ except ModuleNotFoundError as e:
# https://github.com/coqui-ai/xtts-streaming-server
def language_to_xtts_language(language: Language) -> str | None:
    """Translate a ``Language`` member into its XTTS language code.

    Regional variants of English and Portuguese collapse onto the base
    language code.

    Args:
        language: The language to translate.

    Returns:
        The XTTS language code string (for example ``"en"``), or ``None``
        when the language has no entry in the table below.
    """
    xtts_codes = {
        Language.CS: "cs",
        Language.DE: "de",
        # Every English variant maps to the same code.
        Language.EN: "en",
        Language.EN_US: "en",
        Language.EN_AU: "en",
        Language.EN_GB: "en",
        Language.EN_NZ: "en",
        Language.EN_IN: "en",
        Language.ES: "es",
        Language.FR: "fr",
        Language.HI: "hi",
        Language.HU: "hu",
        Language.IT: "it",
        Language.JA: "ja",
        Language.KO: "ko",
        Language.NL: "nl",
        Language.PL: "pl",
        Language.PT: "pt",
        Language.PT_BR: "pt",
        Language.RU: "ru",
        Language.TR: "tr",
        Language.ZH: "zh-cn",
    }
    # dict.get yields None for anything missing from the table.
    return xtts_codes.get(language)
class XTTSService(TTSService):
def __init__(
self,
*,
voice_id: str,
language: str,
language: Language,
base_url: str,
aiohttp_session: aiohttp.ClientSession,
**kwargs,
@@ -49,7 +94,7 @@ class XTTSService(TTSService):
super().__init__(**kwargs)
self._settings = {
"language": language,
"language": language_to_xtts_language(language) if language else "en",
"base_url": base_url,
}
self.set_voice(voice_id)