Merge pull request #2547 from manishkjs/feat/google-tts-voice-cloning

feat: add voice cloning and speaking rate to GoogleTTSService
2025-09-11 14:32:21 -07:00
parent c26d336e34 4699ee8d86
commit 0c30cc6ea6
2 changed files with 20 additions and 3 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -167,6 +167,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
          # Handle navigation failure
  ```

+- Added `voice_cloning_key` to `GoogleTTSService` to support custom cloned voices.
+- Added `speaking_rate` to `GoogleTTSService.InputParams` to control how fast the synthesized speech is spoken.
 - `BaseOutputTransport` now implements `write_dtmf()` by loading DTMF audio and
  sending it through the transport. This makes sending DTMF generic across all
  output transports.
--- a/src/pipecat/services/google/tts.py
+++ b/src/pipecat/services/google/tts.py
@@ -500,9 +500,11 @@ class GoogleTTSService(TTSService):

        Parameters:
            language: Language for synthesis. Defaults to English.
+            speaking_rate: The speaking rate, in the range [0.25, 4.0]. Defaults to None.
        """

        language: Optional[Language] = Language.EN
+        speaking_rate: Optional[float] = None

    def __init__(
        self,
@@ -510,6 +512,7 @@ class GoogleTTSService(TTSService):
        credentials: Optional[str] = None,
        credentials_path: Optional[str] = None,
        voice_id: str = "en-US-Chirp3-HD-Charon",
+        voice_cloning_key: Optional[str] = None,
        sample_rate: Optional[int] = None,
        params: InputParams = InputParams(),
        **kwargs,
@@ -520,6 +523,7 @@ class GoogleTTSService(TTSService):
            credentials: JSON string containing Google Cloud service account credentials.
            credentials_path: Path to Google Cloud service account JSON file.
            voice_id: Google TTS voice identifier (e.g., "en-US-Chirp3-HD-Charon").
+            voice_cloning_key: Chirp 3 custom voice cloning key. If set, used instead of voice_id.
            sample_rate: Audio sample rate in Hz. If None, uses default.
            params: Language configuration parameters.
            **kwargs: Additional arguments passed to parent TTSService.
@@ -532,8 +536,10 @@ class GoogleTTSService(TTSService):
            "language": self.language_to_service_language(params.language)
            if params.language
            else "en-US",
+            "speaking_rate": params.speaking_rate,
        }
        self.set_voice(voice_id)
+        self._voice_cloning_key = voice_cloning_key
        self._client: texttospeech_v1.TextToSpeechAsyncClient = self._create_client(
            credentials, credentials_path
        )
@@ -600,15 +606,24 @@ class GoogleTTSService(TTSService):
        try:
            await self.start_ttfb_metrics()

-            voice = texttospeech_v1.VoiceSelectionParams(
-                language_code=self._settings["language"], name=self._voice_id
-            )
+            if self._voice_cloning_key:
+                voice_clone_params = texttospeech_v1.VoiceCloneParams(
+                    voice_cloning_key=self._voice_cloning_key
+                )
+                voice = texttospeech_v1.VoiceSelectionParams(
+                    language_code=self._settings["language"], voice_clone=voice_clone_params
+                )
+            else:
+                voice = texttospeech_v1.VoiceSelectionParams(
+                    language_code=self._settings["language"], name=self._voice_id
+                )

            streaming_config = texttospeech_v1.StreamingSynthesizeConfig(
                voice=voice,
                streaming_audio_config=texttospeech_v1.StreamingAudioConfig(
                    audio_encoding=texttospeech_v1.AudioEncoding.PCM,
                    sample_rate_hertz=self.sample_rate,
+                    speaking_rate=self._settings["speaking_rate"],
                ),
            )
            config_request = texttospeech_v1.StreamingSynthesizeRequest(