diff --git a/CHANGELOG.md b/CHANGELOG.md index f5c87c1ec..0c5c30b0e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added new language support for `AWSTranscribeSTTService`. All languages + supporting streaming data input are now supported: + https://docs.aws.amazon.com/transcribe/latest/dg/supported-languages.html + - The development runner now handles custom `body` data for `DailyTransport`. The `body` data is passed to the Pipecat client. You can POST to the `/start` endpoint with a request body of: diff --git a/src/pipecat/services/aws/stt.py b/src/pipecat/services/aws/stt.py index 57cd391b6..f67f0979f 100644 --- a/src/pipecat/services/aws/stt.py +++ b/src/pipecat/services/aws/stt.py @@ -314,6 +314,10 @@ class AWSTranscribeSTTService(STTService): def language_to_service_language(self, language: Language) -> str | None: """Convert internal language enum to AWS Transcribe language code. + Source: + https://docs.aws.amazon.com/transcribe/latest/dg/supported-languages.html + All language codes that support streaming are included. + Args: language: Internal language enumeration value. @@ -321,17 +325,145 @@ class AWSTranscribeSTTService(STTService): AWS Transcribe compatible language code, or None if unsupported. 
""" language_map = { - Language.EN: "en-US", - Language.ES: "es-US", - Language.FR: "fr-FR", - Language.DE: "de-DE", + # Afrikaans + Language.AF: "af-ZA", + Language.AF_ZA: "af-ZA", + # Arabic + Language.AR: "ar-SA", # Default to Modern Standard Arabic + Language.AR_AE: "ar-AE", # Gulf Arabic + Language.AR_SA: "ar-SA", # Modern Standard Arabic + # Basque + Language.EU: "eu-ES", + Language.EU_ES: "eu-ES", + # Catalan + Language.CA: "ca-ES", + Language.CA_ES: "ca-ES", + # Chinese + Language.ZH: "zh-CN", # Default to Simplified + Language.ZH_CN: "zh-CN", # Simplified + Language.ZH_TW: "zh-TW", # Traditional + Language.ZH_HK: "zh-HK", # Cantonese (also yue-HK) + Language.YUE: "zh-HK", # Cantonese fallback + # Croatian + Language.HR: "hr-HR", + Language.HR_HR: "hr-HR", + # Czech + Language.CS: "cs-CZ", + Language.CS_CZ: "cs-CZ", + # Danish + Language.DA: "da-DK", + Language.DA_DK: "da-DK", + # Dutch + Language.NL: "nl-NL", + Language.NL_NL: "nl-NL", + # English + Language.EN: "en-US", # Default to US + Language.EN_AU: "en-AU", # Australian + Language.EN_GB: "en-GB", # British + Language.EN_IN: "en-IN", # Indian + Language.EN_IE: "en-IE", # Irish + Language.EN_NZ: "en-NZ", # New Zealand + # Note: Scottish (en-AB) and Welsh (en-WL) don't have direct Language enum matches + Language.EN_ZA: "en-ZA", # South African + Language.EN_US: "en-US", # US + # Persian/Farsi + Language.FA: "fa-IR", + Language.FA_IR: "fa-IR", + # Finnish + Language.FI: "fi-FI", + Language.FI_FI: "fi-FI", + # French + Language.FR: "fr-FR", # Default to France + Language.FR_FR: "fr-FR", + Language.FR_CA: "fr-CA", # Canadian + # Galician + Language.GL: "gl-ES", + Language.GL_ES: "gl-ES", + # Georgian + Language.KA: "ka-GE", + Language.KA_GE: "ka-GE", + # German + Language.DE: "de-DE", # Default to Germany + Language.DE_DE: "de-DE", + Language.DE_CH: "de-CH", # Swiss + # Greek + Language.EL: "el-GR", + Language.EL_GR: "el-GR", + # Hebrew + Language.HE: "he-IL", + Language.HE_IL: "he-IL", + # Hindi + 
Language.HI: "hi-IN", + Language.HI_IN: "hi-IN", + # Indonesian + Language.ID: "id-ID", + Language.ID_ID: "id-ID", + # Italian Language.IT: "it-IT", - Language.PT: "pt-BR", + Language.IT_IT: "it-IT", + # Japanese Language.JA: "ja-JP", + Language.JA_JP: "ja-JP", + # Korean Language.KO: "ko-KR", - Language.ZH: "zh-CN", + Language.KO_KR: "ko-KR", + # Latvian + Language.LV: "lv-LV", + Language.LV_LV: "lv-LV", + # Malay + Language.MS: "ms-MY", + Language.MS_MY: "ms-MY", + # Norwegian + Language.NB: "no-NO", # Norwegian Bokmål + Language.NB_NO: "no-NO", + Language.NO: "no-NO", + # Polish Language.PL: "pl-PL", + Language.PL_PL: "pl-PL", + # Portuguese + Language.PT: "pt-PT", # Default to Portugal + Language.PT_PT: "pt-PT", + Language.PT_BR: "pt-BR", # Brazilian + # Romanian + Language.RO: "ro-RO", + Language.RO_RO: "ro-RO", + # Russian + Language.RU: "ru-RU", + Language.RU_RU: "ru-RU", + # Serbian + Language.SR: "sr-RS", + Language.SR_RS: "sr-RS", + # Slovak + Language.SK: "sk-SK", + Language.SK_SK: "sk-SK", + # Somali + Language.SO: "so-SO", + Language.SO_SO: "so-SO", + # Spanish + Language.ES: "es-ES", # Default to Spain + Language.ES_ES: "es-ES", + Language.ES_US: "es-US", # US Spanish + # Swedish + Language.SV: "sv-SE", + Language.SV_SE: "sv-SE", + # Tagalog/Filipino + Language.TL: "tl-PH", + Language.FIL: "tl-PH", # Filipino maps to Tagalog + Language.FIL_PH: "tl-PH", + # Thai + Language.TH: "th-TH", + Language.TH_TH: "th-TH", + # Ukrainian + Language.UK: "uk-UA", + Language.UK_UA: "uk-UA", + # Vietnamese + Language.VI: "vi-VN", + Language.VI_VN: "vi-VN", + # Zulu + Language.ZU: "zu-ZA", + Language.ZU_ZA: "zu-ZA", } + return language_map.get(language) @traced_stt