From 31daa889e83b960fab79d66b2ab014d930e15a2e Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Tue, 13 Jan 2026 09:24:23 -0500 Subject: [PATCH] Add append_trailing_space to TTSService to prevent vocalizing trailing punctuation; update DeepgramTTSService and RimeTTSService to use the arg --- changelog/3424.added.md | 1 + changelog/3424.changed.md | 1 + src/pipecat/services/deepgram/tts.py | 9 ++++----- src/pipecat/services/rime/tts.py | 1 + src/pipecat/services/tts_service.py | 24 +++++++++++++++++++++++- 5 files changed, 30 insertions(+), 6 deletions(-) create mode 100644 changelog/3424.added.md create mode 100644 changelog/3424.changed.md diff --git a/changelog/3424.added.md b/changelog/3424.added.md new file mode 100644 index 000000000..61cc8ea77 --- /dev/null +++ b/changelog/3424.added.md @@ -0,0 +1 @@ +- Added `append_trailing_space` parameter to `TTSService` to automatically append a trailing space to text before sending to TTS, helping prevent some services from vocalizing trailing punctuation. diff --git a/changelog/3424.changed.md b/changelog/3424.changed.md new file mode 100644 index 000000000..2e665ca2d --- /dev/null +++ b/changelog/3424.changed.md @@ -0,0 +1 @@ +- `DeepgramTTSService` and `RimeTTSService` now set `append_trailing_space` to `True` to prevent trailing punctuation from being vocalized (e.g., a final period spoken as “dot”). diff --git a/src/pipecat/services/deepgram/tts.py b/src/pipecat/services/deepgram/tts.py index ec41baf26..a53ec56d2 100644 --- a/src/pipecat/services/deepgram/tts.py +++ b/src/pipecat/services/deepgram/tts.py @@ -85,6 +85,7 @@ class DeepgramTTSService(WebsocketTTSService): sample_rate=sample_rate, pause_frame_processing=True, push_stop_frames=True, + append_trailing_space=True, **kwargs, ) @@ -291,9 +292,7 @@ class DeepgramTTSService(WebsocketTTSService): Yields: Frame: Audio frames containing the synthesized speech, plus start/stop frames. 
""" - # Append trailing space to prevent TTS from vocalizing trailing periods as "dot" - text_with_trailing_space = text + " " - logger.debug(f"{self}: Generating TTS [{text_with_trailing_space}]") + logger.debug(f"{self}: Generating TTS [{text}]") try: # Reconnect if the websocket is closed @@ -301,14 +300,14 @@ class DeepgramTTSService(WebsocketTTSService): await self._connect() await self.start_ttfb_metrics() - await self.start_tts_usage_metrics(text_with_trailing_space) + await self.start_tts_usage_metrics(text) yield TTSStartedFrame() # Send text message to Deepgram # Note: We don't send Flush here - that should only be sent when the # LLM finishes a complete response via flush_audio() - speak_msg = {"type": "Speak", "text": text_with_trailing_space} + speak_msg = {"type": "Speak", "text": text} await self._get_websocket().send(json.dumps(speak_msg)) # The audio frames will be handled in _receive_messages diff --git a/src/pipecat/services/rime/tts.py b/src/pipecat/services/rime/tts.py index b6fe25e0e..39f1a626c 100644 --- a/src/pipecat/services/rime/tts.py +++ b/src/pipecat/services/rime/tts.py @@ -130,6 +130,7 @@ class RimeTTSService(AudioContextWordTTSService): push_text_frames=False, push_stop_frames=True, pause_frame_processing=True, + append_trailing_space=True, sample_rate=sample_rate, **kwargs, ) diff --git a/src/pipecat/services/tts_service.py b/src/pipecat/services/tts_service.py index 4b1d20b9b..e04c4b649 100644 --- a/src/pipecat/services/tts_service.py +++ b/src/pipecat/services/tts_service.py @@ -101,6 +101,9 @@ class TTSService(AIService): silence_time_s: float = 2.0, # if True, we will pause processing frames while we are receiving audio pause_frame_processing: bool = False, + # if True, append a trailing space to text before sending to TTS + # (helps prevent some TTS services from vocalizing trailing punctuation) + append_trailing_space: bool = False, # TTS output sample rate sample_rate: Optional[int] = None, # Text aggregator to aggregate 
incoming tokens and decide when to push to the TTS. @@ -132,6 +135,8 @@ class TTSService(AIService): push_silence_after_stop: Whether to push silence audio after TTSStoppedFrame. silence_time_s: Duration of silence to push when push_silence_after_stop is True. pause_frame_processing: Whether to pause frame processing during audio generation. + append_trailing_space: Whether to append a trailing space to text before sending to TTS. + This helps prevent some TTS services from vocalizing trailing punctuation (e.g., "dot"). sample_rate: Output sample rate for generated audio. text_aggregator: Custom text aggregator for processing incoming text. @@ -161,6 +166,7 @@ class TTSService(AIService): self._push_silence_after_stop: bool = push_silence_after_stop self._silence_time_s: float = silence_time_s self._pause_frame_processing: bool = pause_frame_processing + self._append_trailing_space: bool = append_trailing_space self._init_sample_rate = sample_rate self._sample_rate = 0 self._voice_id: str = "" @@ -273,6 +279,19 @@ class TTSService(AIService): """ return Language(language) + def _prepare_text_for_tts(self, text: str) -> str: + """Prepare text for TTS by applying any transformations required by the TTS service. + + Args: + text: The text to prepare. + + Returns: + The prepared text with transformations applied. + """ + if self._append_trailing_space and not text.endswith(" "): + return text + " " + return text + async def update_setting(self, key: str, value: Any): """Update a service-specific setting. 
@@ -603,7 +622,10 @@ class TTSService(AIService): for aggregation_type, transform in self._text_transforms: if aggregation_type == type or aggregation_type == "*": transformed_text = await transform(transformed_text, type) - await self.process_generator(self.run_tts(transformed_text)) + + # Apply any final text preparation (e.g., trailing space) + prepared_text = self._prepare_text_for_tts(transformed_text) + await self.process_generator(self.run_tts(prepared_text)) await self.stop_processing_metrics()