Add append_trailing_space to TTSService to prevent vocalizing trailing punctuation; update DeepgramTTSService and RimeTTSService to use the arg

This commit is contained in:
Mark Backman
2026-01-13 09:24:23 -05:00
parent 2296caf529
commit 31daa889e8
5 changed files with 30 additions and 6 deletions

1
changelog/3424.added.md Normal file
View File

@@ -0,0 +1 @@
- Added `append_trailing_space` parameter to `TTSService` to automatically append a trailing space to text before sending to TTS, helping prevent some services from vocalizing trailing punctuation.

View File

@@ -0,0 +1 @@
- `DeepgramTTSService` and `RimeTTSService` now set `append_trailing_space` to `True` to prevent trailing punctuation from being vocalized (e.g., a final period being pronounced as “dot”).

View File

@@ -85,6 +85,7 @@ class DeepgramTTSService(WebsocketTTSService):
sample_rate=sample_rate,
pause_frame_processing=True,
push_stop_frames=True,
append_trailing_space=True,
**kwargs,
)
@@ -291,9 +292,7 @@ class DeepgramTTSService(WebsocketTTSService):
Yields:
Frame: Audio frames containing the synthesized speech, plus start/stop frames.
"""
# Append trailing space to prevent TTS from vocalizing trailing periods as "dot"
text_with_trailing_space = text + " "
logger.debug(f"{self}: Generating TTS [{text_with_trailing_space}]")
logger.debug(f"{self}: Generating TTS [{text}]")
try:
# Reconnect if the websocket is closed
@@ -301,14 +300,14 @@ class DeepgramTTSService(WebsocketTTSService):
await self._connect()
await self.start_ttfb_metrics()
await self.start_tts_usage_metrics(text_with_trailing_space)
await self.start_tts_usage_metrics(text)
yield TTSStartedFrame()
# Send text message to Deepgram
# Note: We don't send Flush here - that should only be sent when the
# LLM finishes a complete response via flush_audio()
speak_msg = {"type": "Speak", "text": text_with_trailing_space}
speak_msg = {"type": "Speak", "text": text}
await self._get_websocket().send(json.dumps(speak_msg))
# The audio frames will be handled in _receive_messages

View File

@@ -130,6 +130,7 @@ class RimeTTSService(AudioContextWordTTSService):
push_text_frames=False,
push_stop_frames=True,
pause_frame_processing=True,
append_trailing_space=True,
sample_rate=sample_rate,
**kwargs,
)

View File

@@ -101,6 +101,9 @@ class TTSService(AIService):
silence_time_s: float = 2.0,
# if True, we will pause processing frames while we are receiving audio
pause_frame_processing: bool = False,
# if True, append a trailing space to text before sending to TTS
# (helps prevent some TTS services from vocalizing trailing punctuation)
append_trailing_space: bool = False,
# TTS output sample rate
sample_rate: Optional[int] = None,
# Text aggregator to aggregate incoming tokens and decide when to push to the TTS.
@@ -132,6 +135,8 @@ class TTSService(AIService):
push_silence_after_stop: Whether to push silence audio after TTSStoppedFrame.
silence_time_s: Duration of silence to push when push_silence_after_stop is True.
pause_frame_processing: Whether to pause frame processing during audio generation.
append_trailing_space: Whether to append a trailing space to text before sending to TTS.
This helps prevent some TTS services from vocalizing trailing punctuation (e.g., "dot").
sample_rate: Output sample rate for generated audio.
text_aggregator: Custom text aggregator for processing incoming text.
@@ -161,6 +166,7 @@ class TTSService(AIService):
self._push_silence_after_stop: bool = push_silence_after_stop
self._silence_time_s: float = silence_time_s
self._pause_frame_processing: bool = pause_frame_processing
self._append_trailing_space: bool = append_trailing_space
self._init_sample_rate = sample_rate
self._sample_rate = 0
self._voice_id: str = ""
@@ -273,6 +279,19 @@ class TTSService(AIService):
"""
return Language(language)
def _prepare_text_for_tts(self, text: str) -> str:
    """Return the text in the final form that is handed to the TTS service.

    The only adjustment made here is the optional trailing space: when
    ``self._append_trailing_space`` is truthy and ``text`` does not already
    end with a space character, a single space is appended.

    Args:
        text: The text about to be synthesized.

    Returns:
        ``text`` unchanged, or ``text`` followed by one space.
    """
    # Nothing to do when the feature is off or the space is already present.
    if not self._append_trailing_space or text.endswith(" "):
        return text
    return text + " "
async def update_setting(self, key: str, value: Any):
"""Update a service-specific setting.
@@ -603,7 +622,10 @@ class TTSService(AIService):
for aggregation_type, transform in self._text_transforms:
if aggregation_type == type or aggregation_type == "*":
transformed_text = await transform(transformed_text, type)
await self.process_generator(self.run_tts(transformed_text))
# Apply any final text preparation (e.g., trailing space)
prepared_text = self._prepare_text_for_tts(transformed_text)
await self.process_generator(self.run_tts(prepared_text))
await self.stop_processing_metrics()