Merge pull request #2362 from pipecat-ai/mb/aws-stt-languages

AWSTranscribeSTTService add support for new languages
This commit is contained in:
Mark Backman
2025-08-05 14:00:50 -07:00
committed by GitHub
2 changed files with 142 additions and 6 deletions

View File

@@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- Added new language support for `AWSTranscribeSTTService`. All languages
supporting streaming data input are now supported:
https://docs.aws.amazon.com/transcribe/latest/dg/supported-languages.html
- The development runner now handles custom `body` data for `DailyTransport`.
The `body` data is passed to the Pipecat client. You can POST to the `/start`
endpoint with a request body of:

View File

@@ -314,6 +314,10 @@ class AWSTranscribeSTTService(STTService):
def language_to_service_language(self, language: Language) -> str | None:
    """Convert internal language enum to AWS Transcribe language code.

    Source:
    https://docs.aws.amazon.com/transcribe/latest/dg/supported-languages.html
    All language codes that support streaming are included.

    Bare language values (for example ``Language.ES``) resolve to the
    default regional variant noted next to the corresponding entry.

    Args:
        language: Internal language enumeration value.

    Returns:
        AWS Transcribe compatible language code, or None if unsupported.
    """
    # Every enum member is listed exactly once. A key repeated inside a dict
    # literal is silently overridden by its last occurrence, which makes the
    # earlier entry dead and misleading to readers.
    language_map = {
        # Afrikaans
        Language.AF: "af-ZA",
        Language.AF_ZA: "af-ZA",
        # Arabic
        Language.AR: "ar-SA",  # Default to Modern Standard Arabic
        Language.AR_AE: "ar-AE",  # Gulf Arabic
        Language.AR_SA: "ar-SA",  # Modern Standard Arabic
        # Basque
        Language.EU: "eu-ES",
        Language.EU_ES: "eu-ES",
        # Catalan
        Language.CA: "ca-ES",
        Language.CA_ES: "ca-ES",
        # Chinese
        Language.ZH: "zh-CN",  # Default to Simplified
        Language.ZH_CN: "zh-CN",  # Simplified
        Language.ZH_TW: "zh-TW",  # Traditional
        Language.ZH_HK: "zh-HK",  # Cantonese (also yue-HK)
        Language.YUE: "zh-HK",  # Cantonese fallback
        # Croatian
        Language.HR: "hr-HR",
        Language.HR_HR: "hr-HR",
        # Czech
        Language.CS: "cs-CZ",
        Language.CS_CZ: "cs-CZ",
        # Danish
        Language.DA: "da-DK",
        Language.DA_DK: "da-DK",
        # Dutch
        Language.NL: "nl-NL",
        Language.NL_NL: "nl-NL",
        # English
        Language.EN: "en-US",  # Default to US
        Language.EN_AU: "en-AU",  # Australian
        Language.EN_GB: "en-GB",  # British
        Language.EN_IN: "en-IN",  # Indian
        Language.EN_IE: "en-IE",  # Irish
        Language.EN_NZ: "en-NZ",  # New Zealand
        # Note: Scottish (en-AB) and Welsh (en-WL) don't have direct Language enum matches
        Language.EN_ZA: "en-ZA",  # South African
        Language.EN_US: "en-US",  # US
        # Persian/Farsi
        Language.FA: "fa-IR",
        Language.FA_IR: "fa-IR",
        # Finnish
        Language.FI: "fi-FI",
        Language.FI_FI: "fi-FI",
        # French
        Language.FR: "fr-FR",  # Default to France
        Language.FR_FR: "fr-FR",
        Language.FR_CA: "fr-CA",  # Canadian
        # Galician
        Language.GL: "gl-ES",
        Language.GL_ES: "gl-ES",
        # Georgian
        Language.KA: "ka-GE",
        Language.KA_GE: "ka-GE",
        # German
        Language.DE: "de-DE",  # Default to Germany
        Language.DE_DE: "de-DE",
        Language.DE_CH: "de-CH",  # Swiss
        # Greek
        Language.EL: "el-GR",
        Language.EL_GR: "el-GR",
        # Hebrew
        Language.HE: "he-IL",
        Language.HE_IL: "he-IL",
        # Hindi
        Language.HI: "hi-IN",
        Language.HI_IN: "hi-IN",
        # Indonesian
        Language.ID: "id-ID",
        Language.ID_ID: "id-ID",
        # Italian
        Language.IT: "it-IT",
        Language.IT_IT: "it-IT",
        # Japanese
        Language.JA: "ja-JP",
        Language.JA_JP: "ja-JP",
        # Korean
        Language.KO: "ko-KR",
        Language.KO_KR: "ko-KR",
        # Latvian
        Language.LV: "lv-LV",
        Language.LV_LV: "lv-LV",
        # Malay
        Language.MS: "ms-MY",
        Language.MS_MY: "ms-MY",
        # Norwegian
        Language.NB: "no-NO",  # Norwegian Bokmål
        Language.NB_NO: "no-NO",
        Language.NO: "no-NO",
        # Polish
        Language.PL: "pl-PL",
        Language.PL_PL: "pl-PL",
        # Portuguese
        Language.PT: "pt-PT",  # Default to Portugal
        Language.PT_PT: "pt-PT",
        Language.PT_BR: "pt-BR",  # Brazilian
        # Romanian
        Language.RO: "ro-RO",
        Language.RO_RO: "ro-RO",
        # Russian
        Language.RU: "ru-RU",
        Language.RU_RU: "ru-RU",
        # Serbian
        Language.SR: "sr-RS",
        Language.SR_RS: "sr-RS",
        # Slovak
        Language.SK: "sk-SK",
        Language.SK_SK: "sk-SK",
        # Somali
        Language.SO: "so-SO",
        Language.SO_SO: "so-SO",
        # Spanish
        Language.ES: "es-ES",  # Default to Spain
        Language.ES_ES: "es-ES",
        Language.ES_US: "es-US",  # US Spanish
        # Swedish
        Language.SV: "sv-SE",
        Language.SV_SE: "sv-SE",
        # Tagalog/Filipino
        Language.TL: "tl-PH",
        Language.FIL: "tl-PH",  # Filipino maps to Tagalog
        Language.FIL_PH: "tl-PH",
        # Thai
        Language.TH: "th-TH",
        Language.TH_TH: "th-TH",
        # Ukrainian
        Language.UK: "uk-UA",
        Language.UK_UA: "uk-UA",
        # Vietnamese
        Language.VI: "vi-VN",
        Language.VI_VN: "vi-VN",
        # Zulu
        Language.ZU: "zu-ZA",
        Language.ZU_ZA: "zu-ZA",
    }
    # dict.get yields None for any language missing from the table.
    return language_map.get(language)
@traced_stt