Merge pull request #3101 from hwuiwon/hw/inworld-talking-speed

feat: Add speaking rate control to Inworld TTS service.
2025-11-20 09:50:55 -05:00
parent 51bdd8b728 ead361f665
commit ab58f72322
2 changed files with 17 additions and 7 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -24,6 +24,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 - Added word-level timestamps support to Hume TTS service

+- Added optional speaking rate control to `InworldTTSService`.
+
 ### Changed

 - ⚠️ Breaking change: `LLMContext.create_image_message()`,
@@ -89,6 +91,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 - Prevented `HeyGenVideoService` from automatically disconnecting after 5 minutes.

+- Fixed `InworldTTSService` audio config payload to use the camelCase keys
+  (`audioConfig`, `audioEncoding`, `sampleRateHertz`) expected by the Inworld API.
+
 ## [0.0.94] - 2025-11-10

 ### Changed
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -146,6 +146,8 @@ class InworldTTSService(TTSService):
        Parameters:
            temperature: Voice temperature control for synthesis variability (e.g., 1.1).
                        Valid range: [0, 2]. Higher values increase variability.
+            speaking_rate: Speaking speed control. Valid range: [0.5, 1.5]. When unset, it is
+                           omitted from the request and Inworld's default (1.0) applies.

        Note:
            Language is automatically inferred from the input text by Inworld's TTS models,
@@ -153,6 +155,7 @@ class InworldTTSService(TTSService):
        """

        temperature: Optional[float] = None  # optional temperature control (range: [0, 2])
+        speaking_rate: Optional[float] = None  # optional speaking rate control (range: [0.5, 1.5])

    def __init__(
        self,
@@ -198,6 +201,7 @@ class InworldTTSService(TTSService):
                     - Other formats as supported by Inworld API
            params: Optional input parameters for additional configuration. Use this to specify:
                   - temperature: Voice temperature control for variability (range: [0, 2], e.g., 1.1, optional)
+                   - speaking_rate: Speaking speed control (range: [0.5, 1.5], optional)
                   Language is automatically inferred from input text.
            **kwargs: Additional arguments passed to the parent TTSService class.

@@ -228,15 +232,18 @@ class InworldTTSService(TTSService):
        self._settings = {
            "voiceId": voice_id,  # Voice selection from direct parameter
            "modelId": model,  # TTS model selection from direct parameter
-            "audio_config": {  # Audio format configuration
-                "audio_encoding": encoding,  # Format: LINEAR16, MP3, etc.
-                "sample_rate_hertz": 0,  # Will be set in start() from parent service
+            "audioConfig": {  # Audio format configuration
+                "audioEncoding": encoding,  # Format: LINEAR16, MP3, etc.
+                "sampleRateHertz": 0,  # Will be set in start() from parent service
            },
        }

        # Add optional temperature parameter if provided (valid range: [0, 2])
        if params and params.temperature is not None:
            self._settings["temperature"] = params.temperature
+        # Add optional speaking rate if provided (valid range: [0.5, 1.5])
+        if params and params.speaking_rate is not None:
+            self._settings["audioConfig"]["speakingRate"] = params.speaking_rate

        # Register voice and model with parent service for metrics and tracking
        self.set_voice(voice_id)  # Used for logging and metrics
@@ -257,7 +264,7 @@ class InworldTTSService(TTSService):
            frame: The start frame containing initialization parameters.
        """
        await super().start(frame)
-        self._settings["audio_config"]["sample_rate_hertz"] = self.sample_rate
+        self._settings["audioConfig"]["sampleRateHertz"] = self.sample_rate

    async def stop(self, frame: EndFrame):
        """Stop the Inworld TTS service.
@@ -323,9 +330,7 @@ class InworldTTSService(TTSService):
            "text": text,  # Text to synthesize
            "voiceId": self._settings["voiceId"],  # Voice selection (Ashley, Hades, etc.)
            "modelId": self._settings["modelId"],  # TTS model (inworld-tts-1)
-            "audio_config": self._settings[
-                "audio_config"
-            ],  # Audio format settings (LINEAR16, 48kHz)
+            "audioConfig": self._settings["audioConfig"],  # Audio format settings (LINEAR16, 48kHz)
        }

        # Add optional temperature parameter if configured (valid range: [0, 2])