Merge pull request #3346 from pipecat-ai/mb/cartesia-pronunciation-dict

Cartesia TTS: Add support for pronunciation_dict_id
2026-01-06 08:52:09 -05:00
parent 81b28beef5 d8be1282b5
commit c61a5e7173
2 changed files with 13 additions and 0 deletions
--- a/changelog/3346.added.md
+++ b/changelog/3346.added.md
@@ -0,0 +1 @@
+- Added `pronunciation_dict_id` parameter to `CartesiaTTSService.InputParams` and `CartesiaHttpTTSService.InputParams` to support Cartesia's pronunciation dictionary feature for custom pronunciations.
--- a/src/pipecat/services/cartesia/tts.py
+++ b/src/pipecat/services/cartesia/tts.py
@@ -213,12 +213,14 @@ class CartesiaTTSService(AudioContextWordTTSService):

            generation_config: Generation configuration for Sonic-3 models. Includes volume,
                speed (numeric), and emotion (string) parameters.
+            pronunciation_dict_id: The ID of the pronunciation dictionary to use for custom pronunciations.
        """

        language: Optional[Language] = Language.EN
        speed: Optional[Literal["slow", "normal", "fast"]] = None
        emotion: Optional[List[str]] = []
        generation_config: Optional[GenerationConfig] = None
+        pronunciation_dict_id: Optional[str] = None

    def __init__(
        self,
@@ -300,6 +302,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
            "speed": params.speed,
            "emotion": params.emotion,
            "generation_config": params.generation_config,
+            "pronunciation_dict_id": params.pronunciation_dict_id,
        }
        self.set_model_name(model)
        self.set_voice(voice_id)
@@ -444,6 +447,9 @@ class CartesiaTTSService(AudioContextWordTTSService):
                exclude_none=True
            )

+        if self._settings["pronunciation_dict_id"]:
+            msg["pronunciation_dict_id"] = self._settings["pronunciation_dict_id"]
+
        return json.dumps(msg)

    async def start(self, frame: StartFrame):
@@ -636,12 +642,14 @@ class CartesiaHttpTTSService(TTSService):

            generation_config: Generation configuration for Sonic-3 models. Includes volume,
                speed (numeric), and emotion (string) parameters.
+            pronunciation_dict_id: The ID of the pronunciation dictionary to use for custom pronunciations.
        """

        language: Optional[Language] = Language.EN
        speed: Optional[Literal["slow", "normal", "fast"]] = None
        emotion: Optional[List[str]] = Field(default_factory=list)
        generation_config: Optional[GenerationConfig] = None
+        pronunciation_dict_id: Optional[str] = None

    def __init__(
        self,
@@ -690,6 +698,7 @@ class CartesiaHttpTTSService(TTSService):
            "speed": params.speed,
            "emotion": params.emotion,
            "generation_config": params.generation_config,
+            "pronunciation_dict_id": params.pronunciation_dict_id,
        }
        self.set_voice(voice_id)
        self.set_model_name(model)
@@ -788,6 +797,9 @@ class CartesiaHttpTTSService(TTSService):
                    exclude_none=True
                )

+            if self._settings["pronunciation_dict_id"]:
+                payload["pronunciation_dict_id"] = self._settings["pronunciation_dict_id"]
+
            yield TTSStartedFrame()

            session = await self._client._get_session()
				`@@ -0,0 +1 @@`
				- Added `pronunciation_dict_id` parameter to `CartesiaTTSService.InputParams` and `CartesiaHttpTTSService.InputParams` to support Cartesia's pronunciation dictionary feature for custom pronunciations.