Add append_trailing_space to TTSService to prevent vocalizing trailing punctuation; update DeepgramTTSService and RimeTTSService to use the arg
This commit is contained in:
1
changelog/3424.added.md
Normal file
1
changelog/3424.added.md
Normal file
@@ -0,0 +1 @@
|
||||
- Added `append_trailing_space` parameter to `TTSService` to automatically append a trailing space to text before sending to TTS, helping prevent some services from vocalizing trailing punctuation.
|
||||
1
changelog/3424.changed.md
Normal file
1
changelog/3424.changed.md
Normal file
@@ -0,0 +1 @@
|
||||
- `DeepgramTTSService` and `RimeTTSService` now set `append_trailing_space` to `True` to prevent trailing punctuation from being vocalized (e.g., a final period being spoken as “dot”).
|
||||
@@ -85,6 +85,7 @@ class DeepgramTTSService(WebsocketTTSService):
|
||||
sample_rate=sample_rate,
|
||||
pause_frame_processing=True,
|
||||
push_stop_frames=True,
|
||||
append_trailing_space=True,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@@ -291,9 +292,7 @@ class DeepgramTTSService(WebsocketTTSService):
|
||||
Yields:
|
||||
Frame: Audio frames containing the synthesized speech, plus start/stop frames.
|
||||
"""
|
||||
# Append trailing space to prevent TTS from vocalizing trailing periods as "dot"
|
||||
text_with_trailing_space = text + " "
|
||||
logger.debug(f"{self}: Generating TTS [{text_with_trailing_space}]")
|
||||
logger.debug(f"{self}: Generating TTS [{text}]")
|
||||
|
||||
try:
|
||||
# Reconnect if the websocket is closed
|
||||
@@ -301,14 +300,14 @@ class DeepgramTTSService(WebsocketTTSService):
|
||||
await self._connect()
|
||||
|
||||
await self.start_ttfb_metrics()
|
||||
await self.start_tts_usage_metrics(text_with_trailing_space)
|
||||
await self.start_tts_usage_metrics(text)
|
||||
|
||||
yield TTSStartedFrame()
|
||||
|
||||
# Send text message to Deepgram
|
||||
# Note: We don't send Flush here - that should only be sent when the
|
||||
# LLM finishes a complete response via flush_audio()
|
||||
speak_msg = {"type": "Speak", "text": text_with_trailing_space}
|
||||
speak_msg = {"type": "Speak", "text": text}
|
||||
await self._get_websocket().send(json.dumps(speak_msg))
|
||||
|
||||
# The audio frames will be handled in _receive_messages
|
||||
|
||||
@@ -130,6 +130,7 @@ class RimeTTSService(AudioContextWordTTSService):
|
||||
push_text_frames=False,
|
||||
push_stop_frames=True,
|
||||
pause_frame_processing=True,
|
||||
append_trailing_space=True,
|
||||
sample_rate=sample_rate,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@@ -101,6 +101,9 @@ class TTSService(AIService):
|
||||
silence_time_s: float = 2.0,
|
||||
# if True, we will pause processing frames while we are receiving audio
|
||||
pause_frame_processing: bool = False,
|
||||
# if True, append a trailing space to text before sending to TTS
|
||||
# (helps prevent some TTS services from vocalizing trailing punctuation)
|
||||
append_trailing_space: bool = False,
|
||||
# TTS output sample rate
|
||||
sample_rate: Optional[int] = None,
|
||||
# Text aggregator to aggregate incoming tokens and decide when to push to the TTS.
|
||||
@@ -132,6 +135,8 @@ class TTSService(AIService):
|
||||
push_silence_after_stop: Whether to push silence audio after TTSStoppedFrame.
|
||||
silence_time_s: Duration of silence to push when push_silence_after_stop is True.
|
||||
pause_frame_processing: Whether to pause frame processing during audio generation.
|
||||
append_trailing_space: Whether to append a trailing space to text before sending to TTS.
|
||||
This helps prevent some TTS services from vocalizing trailing punctuation (e.g., "dot").
|
||||
sample_rate: Output sample rate for generated audio.
|
||||
text_aggregator: Custom text aggregator for processing incoming text.
|
||||
|
||||
@@ -161,6 +166,7 @@ class TTSService(AIService):
|
||||
self._push_silence_after_stop: bool = push_silence_after_stop
|
||||
self._silence_time_s: float = silence_time_s
|
||||
self._pause_frame_processing: bool = pause_frame_processing
|
||||
self._append_trailing_space: bool = append_trailing_space
|
||||
self._init_sample_rate = sample_rate
|
||||
self._sample_rate = 0
|
||||
self._voice_id: str = ""
|
||||
@@ -273,6 +279,19 @@ class TTSService(AIService):
|
||||
"""
|
||||
return Language(language)
|
||||
|
||||
def _prepare_text_for_tts(self, text: str) -> str:
    """Apply the final service-level adjustment to text before synthesis.

    When the service was created with ``append_trailing_space`` enabled, a
    single space is appended to the text unless it already ends with one.
    The text is otherwise returned unchanged.

    Args:
        text: The text to prepare.

    Returns:
        The prepared text: ``text`` plus one trailing space when the option
        is enabled and ``text`` does not already end with a space, else
        ``text`` itself.
    """
    # Only a literal space counts as "already terminated"; other trailing
    # whitespace (or an empty string) still gets the space appended.
    needs_space = self._append_trailing_space and not text.endswith(" ")
    return f"{text} " if needs_space else text
|
||||
|
||||
async def update_setting(self, key: str, value: Any):
|
||||
"""Update a service-specific setting.
|
||||
|
||||
@@ -603,7 +622,10 @@ class TTSService(AIService):
|
||||
for aggregation_type, transform in self._text_transforms:
|
||||
if aggregation_type == type or aggregation_type == "*":
|
||||
transformed_text = await transform(transformed_text, type)
|
||||
await self.process_generator(self.run_tts(transformed_text))
|
||||
|
||||
# Apply any final text preparation (e.g., trailing space)
|
||||
prepared_text = self._prepare_text_for_tts(transformed_text)
|
||||
await self.process_generator(self.run_tts(prepared_text))
|
||||
|
||||
await self.stop_processing_metrics()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user