diff --git a/changelog/3346.added.md b/changelog/3346.added.md new file mode 100644 index 000000000..b72d23b55 --- /dev/null +++ b/changelog/3346.added.md @@ -0,0 +1 @@ +- Added `pronunciation_dict_id` parameter to `CartesiaTTSService.InputParams` and `CartesiaHttpTTSService.InputParams` to support Cartesia's pronunciation dictionary feature for custom pronunciations. diff --git a/src/pipecat/services/cartesia/tts.py b/src/pipecat/services/cartesia/tts.py index 536c64776..ee200eac4 100644 --- a/src/pipecat/services/cartesia/tts.py +++ b/src/pipecat/services/cartesia/tts.py @@ -213,12 +213,14 @@ class CartesiaTTSService(AudioContextWordTTSService): generation_config: Generation configuration for Sonic-3 models. Includes volume, speed (numeric), and emotion (string) parameters. + pronunciation_dict_id: The ID of the pronunciation dictionary to use for custom pronunciations. """ language: Optional[Language] = Language.EN speed: Optional[Literal["slow", "normal", "fast"]] = None emotion: Optional[List[str]] = [] generation_config: Optional[GenerationConfig] = None + pronunciation_dict_id: Optional[str] = None def __init__( self, @@ -300,6 +302,7 @@ class CartesiaTTSService(AudioContextWordTTSService): "speed": params.speed, "emotion": params.emotion, "generation_config": params.generation_config, + "pronunciation_dict_id": params.pronunciation_dict_id, } self.set_model_name(model) self.set_voice(voice_id) @@ -444,6 +447,9 @@ class CartesiaTTSService(AudioContextWordTTSService): exclude_none=True ) + if self._settings["pronunciation_dict_id"]: + msg["pronunciation_dict_id"] = self._settings["pronunciation_dict_id"] + return json.dumps(msg) async def start(self, frame: StartFrame): @@ -636,12 +642,14 @@ class CartesiaHttpTTSService(TTSService): generation_config: Generation configuration for Sonic-3 models. Includes volume, speed (numeric), and emotion (string) parameters. + pronunciation_dict_id: The ID of the pronunciation dictionary to use for custom pronunciations. """ language: Optional[Language] = Language.EN speed: Optional[Literal["slow", "normal", "fast"]] = None emotion: Optional[List[str]] = Field(default_factory=list) generation_config: Optional[GenerationConfig] = None + pronunciation_dict_id: Optional[str] = None def __init__( self, @@ -690,6 +698,7 @@ class CartesiaHttpTTSService(TTSService): "speed": params.speed, "emotion": params.emotion, "generation_config": params.generation_config, + "pronunciation_dict_id": params.pronunciation_dict_id, } self.set_voice(voice_id) self.set_model_name(model) @@ -788,6 +797,9 @@ class CartesiaHttpTTSService(TTSService): exclude_none=True ) + if self._settings["pronunciation_dict_id"]: + payload["pronunciation_dict_id"] = self._settings["pronunciation_dict_id"] + yield TTSStartedFrame() session = await self._client._get_session()