From 2ee481d5411c90e947f5d064b149ced64f160021 Mon Sep 17 00:00:00 2001 From: Manish Kumar Date: Sat, 30 Aug 2025 22:59:10 +0530 Subject: [PATCH 1/2] feat: add voice cloning and speaking rate to GoogleTTSService --- src/pipecat/services/google/tts.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/pipecat/services/google/tts.py b/src/pipecat/services/google/tts.py index 6fb99a728..676896cc7 100644 --- a/src/pipecat/services/google/tts.py +++ b/src/pipecat/services/google/tts.py @@ -500,9 +500,11 @@ class GoogleTTSService(TTSService): Parameters: language: Language for synthesis. Defaults to English. + speaking_rate: The speaking rate, in the range [0.25, 2.0]. """ language: Optional[Language] = Language.EN + speaking_rate: Optional[float] = None def __init__( self, @@ -510,6 +512,7 @@ class GoogleTTSService(TTSService): credentials: Optional[str] = None, credentials_path: Optional[str] = None, voice_id: str = "en-US-Chirp3-HD-Charon", + voice_cloning_key: Optional[str] = None, sample_rate: Optional[int] = None, params: InputParams = InputParams(), **kwargs, @@ -532,8 +535,10 @@ class GoogleTTSService(TTSService): "language": self.language_to_service_language(params.language) if params.language else "en-US", + "speaking_rate": params.speaking_rate, } self.set_voice(voice_id) + self._voice_cloning_key = voice_cloning_key self._client: texttospeech_v1.TextToSpeechAsyncClient = self._create_client( credentials, credentials_path ) @@ -600,15 +605,24 @@ class GoogleTTSService(TTSService): try: await self.start_ttfb_metrics() - voice = texttospeech_v1.VoiceSelectionParams( - language_code=self._settings["language"], name=self._voice_id - ) + if self._voice_cloning_key: + voice_clone_params = texttospeech_v1.VoiceCloneParams( + voice_cloning_key=self._voice_cloning_key + ) + voice = texttospeech_v1.VoiceSelectionParams( + language_code=self._settings["language"], voice_clone=voice_clone_params + ) + else: + voice = 
texttospeech_v1.VoiceSelectionParams( + language_code=self._settings["language"], name=self._voice_id + ) streaming_config = texttospeech_v1.StreamingSynthesizeConfig( voice=voice, streaming_audio_config=texttospeech_v1.StreamingAudioConfig( audio_encoding=texttospeech_v1.AudioEncoding.PCM, sample_rate_hertz=self.sample_rate, + speaking_rate=self._settings["speaking_rate"], ), ) config_request = texttospeech_v1.StreamingSynthesizeRequest( From 4699ee8d86289f5f474fecdba89840dc13ace42e Mon Sep 17 00:00:00 2001 From: Manish Kumar Date: Thu, 4 Sep 2025 22:45:51 +0530 Subject: [PATCH 2/2] docs: add docstring for voice_cloning_key and update CHANGELOG --- CHANGELOG.md | 2 ++ src/pipecat/services/google/tts.py | 1 + 2 files changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5396a1eb7..d8d80693e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Added +- Added `voice_cloning_key` to `GoogleTTSService` to support custom cloned voices. +- Added `speaking_rate` to `GoogleTTSService.InputParams` to control the speaking rate. - `BaseOutputTransport` now implements `write_dtmf()` by loading DTMF audio and sending it through the transport. This makes sending DTMF generic across all output transports. diff --git a/src/pipecat/services/google/tts.py b/src/pipecat/services/google/tts.py index 676896cc7..bfda3292a 100644 --- a/src/pipecat/services/google/tts.py +++ b/src/pipecat/services/google/tts.py @@ -523,6 +523,7 @@ class GoogleTTSService(TTSService): credentials: JSON string containing Google Cloud service account credentials. credentials_path: Path to Google Cloud service account JSON file. voice_id: Google TTS voice identifier (e.g., "en-US-Chirp3-HD-Charon"). + voice_cloning_key: The voice cloning key for Chirp 3 custom voices. sample_rate: Audio sample rate in Hz. If None, uses default. params: Language configuration parameters. 
**kwargs: Additional arguments passed to parent TTSService.