Merge pull request #2547 from manishkjs/feat/google-tts-voice-cloning

feat: add voice cloning and speaking rate to GoogleTTSService
This commit is contained in:
Aleix Conchillo Flaqué
2025-09-11 14:32:21 -07:00
committed by GitHub
2 changed files with 20 additions and 3 deletions

View File

@@ -167,6 +167,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
# Handle navigation failure
```
- Added `voice_cloning_key` to `GoogleTTSService` to support custom cloned voices.
- Added `speaking_rate` to `GoogleTTSService.InputParams` to control the speaking rate.
- `BaseOutputTransport` now implements `write_dtmf()` by loading DTMF audio and
sending it through the transport. This makes sending DTMF generic across all
output transports.

View File

@@ -500,9 +500,11 @@ class GoogleTTSService(TTSService):
Parameters:
language: Language for synthesis. Defaults to English.
speaking_rate: The speaking rate, in the range [0.25, 4.0].
"""
language: Optional[Language] = Language.EN
speaking_rate: Optional[float] = None
def __init__(
self,
@@ -510,6 +512,7 @@ class GoogleTTSService(TTSService):
credentials: Optional[str] = None,
credentials_path: Optional[str] = None,
voice_id: str = "en-US-Chirp3-HD-Charon",
voice_cloning_key: Optional[str] = None,
sample_rate: Optional[int] = None,
params: InputParams = InputParams(),
**kwargs,
@@ -520,6 +523,7 @@ class GoogleTTSService(TTSService):
credentials: JSON string containing Google Cloud service account credentials.
credentials_path: Path to Google Cloud service account JSON file.
voice_id: Google TTS voice identifier (e.g., "en-US-Chirp3-HD-Charon").
voice_cloning_key: The voice cloning key for Chirp 3 custom voices.
sample_rate: Audio sample rate in Hz. If None, uses default.
params: Language configuration parameters.
**kwargs: Additional arguments passed to parent TTSService.
@@ -532,8 +536,10 @@ class GoogleTTSService(TTSService):
"language": self.language_to_service_language(params.language)
if params.language
else "en-US",
"speaking_rate": params.speaking_rate,
}
self.set_voice(voice_id)
self._voice_cloning_key = voice_cloning_key
self._client: texttospeech_v1.TextToSpeechAsyncClient = self._create_client(
credentials, credentials_path
)
@@ -600,15 +606,24 @@ class GoogleTTSService(TTSService):
try:
await self.start_ttfb_metrics()
voice = texttospeech_v1.VoiceSelectionParams(
language_code=self._settings["language"], name=self._voice_id
)
if self._voice_cloning_key:
voice_clone_params = texttospeech_v1.VoiceCloneParams(
voice_cloning_key=self._voice_cloning_key
)
voice = texttospeech_v1.VoiceSelectionParams(
language_code=self._settings["language"], voice_clone=voice_clone_params
)
else:
voice = texttospeech_v1.VoiceSelectionParams(
language_code=self._settings["language"], name=self._voice_id
)
streaming_config = texttospeech_v1.StreamingSynthesizeConfig(
voice=voice,
streaming_audio_config=texttospeech_v1.StreamingAudioConfig(
audio_encoding=texttospeech_v1.AudioEncoding.PCM,
sample_rate_hertz=self.sample_rate,
speaking_rate=self._settings["speaking_rate"],
),
)
config_request = texttospeech_v1.StreamingSynthesizeRequest(