Add append_trailing_space to TTSService to prevent vocalizing trailing punctuation; update DeepgramTTSService and RimeTTSService to use the arg

2026-01-13 09:24:23 -05:00
parent 2296caf529
commit 31daa889e8
5 changed files with 30 additions and 6 deletions
--- a/changelog/3424.added.md
+++ b/changelog/3424.added.md
@@ -0,0 +1 @@
+- Added `append_trailing_space` parameter to `TTSService` to automatically append a trailing space to text before sending to TTS, helping prevent some services from vocalizing trailing punctuation.
--- a/changelog/3424.changed.md
+++ b/changelog/3424.changed.md
@@ -0,0 +1 @@
+- `DeepgramTTSService` and `RimeTTSService` now set `append_trailing_space` to `True` to prevent trailing punctuation from being vocalized (e.g., a final period pronounced as “dot”).
--- a/src/pipecat/services/deepgram/tts.py
+++ b/src/pipecat/services/deepgram/tts.py
@@ -85,6 +85,7 @@ class DeepgramTTSService(WebsocketTTSService):
            sample_rate=sample_rate,
            pause_frame_processing=True,
            push_stop_frames=True,
+            append_trailing_space=True,
            **kwargs,
        )

@@ -291,9 +292,7 @@ class DeepgramTTSService(WebsocketTTSService):
        Yields:
            Frame: Audio frames containing the synthesized speech, plus start/stop frames.
        """
-        # Append trailing space to prevent TTS from vocalizing trailing periods as "dot"
-        text_with_trailing_space = text + " "
-        logger.debug(f"{self}: Generating TTS [{text_with_trailing_space}]")
+        logger.debug(f"{self}: Generating TTS [{text}]")

        try:
            # Reconnect if the websocket is closed
@@ -301,14 +300,14 @@ class DeepgramTTSService(WebsocketTTSService):
                await self._connect()

            await self.start_ttfb_metrics()
-            await self.start_tts_usage_metrics(text_with_trailing_space)
+            await self.start_tts_usage_metrics(text)

            yield TTSStartedFrame()

            # Send text message to Deepgram
            # Note: We don't send Flush here - that should only be sent when the
            # LLM finishes a complete response via flush_audio()
-            speak_msg = {"type": "Speak", "text": text_with_trailing_space}
+            speak_msg = {"type": "Speak", "text": text}
            await self._get_websocket().send(json.dumps(speak_msg))

            # The audio frames will be handled in _receive_messages
--- a/src/pipecat/services/rime/tts.py
+++ b/src/pipecat/services/rime/tts.py
@@ -130,6 +130,7 @@ class RimeTTSService(AudioContextWordTTSService):
            push_text_frames=False,
            push_stop_frames=True,
            pause_frame_processing=True,
+            append_trailing_space=True,
            sample_rate=sample_rate,
            **kwargs,
        )
--- a/src/pipecat/services/tts_service.py
+++ b/src/pipecat/services/tts_service.py
@@ -101,6 +101,9 @@ class TTSService(AIService):
        silence_time_s: float = 2.0,
        # if True, we will pause processing frames while we are receiving audio
        pause_frame_processing: bool = False,
+        # if True, append a trailing space to text before sending to TTS
+        # (helps prevent some TTS services from vocalizing trailing punctuation)
+        append_trailing_space: bool = False,
        # TTS output sample rate
        sample_rate: Optional[int] = None,
        # Text aggregator to aggregate incoming tokens and decide when to push to the TTS.
@@ -132,6 +135,8 @@ class TTSService(AIService):
            push_silence_after_stop: Whether to push silence audio after TTSStoppedFrame.
            silence_time_s: Duration of silence to push when push_silence_after_stop is True.
            pause_frame_processing: Whether to pause frame processing during audio generation.
+            append_trailing_space: Whether to append a trailing space to text before sending to TTS.
+                This helps prevent some TTS services from vocalizing trailing punctuation (e.g., "dot").
            sample_rate: Output sample rate for generated audio.
            text_aggregator: Custom text aggregator for processing incoming text.

@@ -161,6 +166,7 @@ class TTSService(AIService):
        self._push_silence_after_stop: bool = push_silence_after_stop
        self._silence_time_s: float = silence_time_s
        self._pause_frame_processing: bool = pause_frame_processing
+        self._append_trailing_space: bool = append_trailing_space
        self._init_sample_rate = sample_rate
        self._sample_rate = 0
        self._voice_id: str = ""
@@ -273,6 +279,19 @@ class TTSService(AIService):
        """
        return Language(language)

+    def _prepare_text_for_tts(self, text: str) -> str:
+        """Prepare text for TTS by applying any transformations required by the TTS service.
+
+        Args:
+            text: The text to prepare.
+
+        Returns:
+            The prepared text with transformations applied.
+        """
+        if self._append_trailing_space and not text.endswith(" "):
+            return text + " "
+        return text
+
    async def update_setting(self, key: str, value: Any):
        """Update a service-specific setting.

@@ -603,7 +622,10 @@ class TTSService(AIService):
        for aggregation_type, transform in self._text_transforms:
            if aggregation_type == type or aggregation_type == "*":
                transformed_text = await transform(transformed_text, type)
-        await self.process_generator(self.run_tts(transformed_text))
+
+        # Apply any final text preparation (e.g., trailing space)
+        prepared_text = self._prepare_text_for_tts(transformed_text)
+        await self.process_generator(self.run_tts(prepared_text))

        await self.stop_processing_metrics()
				`@@ -0,0 +1 @@`
				- Added `append_trailing_space` parameter to `TTSService` to automatically append a trailing space to text before sending to TTS, helping prevent some services from vocalizing trailing punctuation.
				`@@ -0,0 +1 @@`
				- `DeepgramTTSService` and `RimeTTSService` now set `append_trailing_space` to `True` to prevent trailing punctuation from being vocalized (e.g., a final period pronounced as “dot”).