Merge pull request #3346 from pipecat-ai/mb/cartesia-pronunciation-dict

Cartesia TTS: Add support for pronunciation_dict_id
This commit is contained in:
Mark Backman
2026-01-06 08:52:09 -05:00
committed by GitHub
2 changed files with 13 additions and 0 deletions

1
changelog/3346.added.md Normal file
View File

@@ -0,0 +1 @@
- Added an optional `pronunciation_dict_id` parameter to `CartesiaTTSService.InputParams` and `CartesiaHttpTTSService.InputParams` to apply custom pronunciations from a Cartesia pronunciation dictionary. It defaults to `None` and is only included in the outgoing TTS request when set.

View File

@@ -213,12 +213,14 @@ class CartesiaTTSService(AudioContextWordTTSService):
generation_config: Generation configuration for Sonic-3 models. Includes volume,
speed (numeric), and emotion (string) parameters.
pronunciation_dict_id: The ID of the pronunciation dictionary to use for custom pronunciations.
"""
language: Optional[Language] = Language.EN
speed: Optional[Literal["slow", "normal", "fast"]] = None
emotion: Optional[List[str]] = []
generation_config: Optional[GenerationConfig] = None
pronunciation_dict_id: Optional[str] = None
def __init__(
self,
@@ -300,6 +302,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
"speed": params.speed,
"emotion": params.emotion,
"generation_config": params.generation_config,
"pronunciation_dict_id": params.pronunciation_dict_id,
}
self.set_model_name(model)
self.set_voice(voice_id)
@@ -444,6 +447,9 @@ class CartesiaTTSService(AudioContextWordTTSService):
exclude_none=True
)
if self._settings["pronunciation_dict_id"]:
msg["pronunciation_dict_id"] = self._settings["pronunciation_dict_id"]
return json.dumps(msg)
async def start(self, frame: StartFrame):
@@ -636,12 +642,14 @@ class CartesiaHttpTTSService(TTSService):
generation_config: Generation configuration for Sonic-3 models. Includes volume,
speed (numeric), and emotion (string) parameters.
pronunciation_dict_id: The ID of the pronunciation dictionary to use for custom pronunciations.
"""
language: Optional[Language] = Language.EN
speed: Optional[Literal["slow", "normal", "fast"]] = None
emotion: Optional[List[str]] = Field(default_factory=list)
generation_config: Optional[GenerationConfig] = None
pronunciation_dict_id: Optional[str] = None
def __init__(
self,
@@ -690,6 +698,7 @@ class CartesiaHttpTTSService(TTSService):
"speed": params.speed,
"emotion": params.emotion,
"generation_config": params.generation_config,
"pronunciation_dict_id": params.pronunciation_dict_id,
}
self.set_voice(voice_id)
self.set_model_name(model)
@@ -788,6 +797,9 @@ class CartesiaHttpTTSService(TTSService):
exclude_none=True
)
if self._settings["pronunciation_dict_id"]:
payload["pronunciation_dict_id"] = self._settings["pronunciation_dict_id"]
yield TTSStartedFrame()
session = await self._client._get_session()