diff --git a/src/pipecat/services/nvidia/sagemaker/tts.py b/src/pipecat/services/nvidia/sagemaker/tts.py index 8a0e3cd5d..90d85bc97 100644 --- a/src/pipecat/services/nvidia/sagemaker/tts.py +++ b/src/pipecat/services/nvidia/sagemaker/tts.py @@ -479,9 +479,7 @@ class NvidiaSageMakerTTSService(InterruptibleTTSService): """Send text to NIM; audio arrives asynchronously via _receive_messages.""" logger.debug(f"{self}: Generating TTS [{text}]") - text = sanitize_text_for_tts(text) - - logger.debug(f"{self}: sanitized text: {text}") + text = text.strip() if not text or not any(c.isalnum() for c in text): return diff --git a/src/pipecat/utils/text/tts_text_sanitizer.py b/src/pipecat/utils/text/tts_text_sanitizer.py deleted file mode 100644 index 919c0946d..000000000 --- a/src/pipecat/utils/text/tts_text_sanitizer.py +++ /dev/null @@ -1,131 +0,0 @@ -# -# Copyright (c) 2024-2026, Daily -# -# SPDX-License-Identifier: BSD 2-Clause License -# - -"""Utility for stripping non-speakable characters and markdown formatting from text. - -Both NvidiaSageMakerHTTPTTSService and NvidiaSageMakerWebsocketTTSService -use :func:`sanitize_text_for_tts` so the logic lives in one place. -""" - -import re - -# --------------------------------------------------------------------------- -# Emoji / symbol ranges -# --------------------------------------------------------------------------- - -_EMOJI_PATTERN = re.compile( - "[" - "\U0001f600-\U0001f64f" # Emoticons - "\U0001f300-\U0001f5ff" # Misc Symbols and Pictographs - "\U0001f680-\U0001f6ff" # Transport and Map - "\U0001f700-\U0001f77f" # Alchemical Symbols - "\U0001f780-\U0001f7ff" # Geometric Shapes Extended - "\U0001f800-\U0001f8ff" # Supplemental Arrows-C - "\U0001f900-\U0001f9ff" # Supplemental Symbols and Pictographs - "\U0001fa00-\U0001fa6f" # Chess Symbols - "\U0001fa70-\U0001faff" # Symbols and Pictographs Extended-A - "\U00002702-\U000027b0" # Dingbats - "\U0001f1e0-\U0001f1ff" # Flags (iOS) - "]+", - flags=re.UNICODE, -) - - -def sanitize_text_for_tts(text: str) -> str: - """Remove emojis and markdown formatting that should not be spoken aloud. - - Transformations applied (in order): - 1. Fenced code blocks (``` ... ```) → removed entirely - 2. Markdown headers (# / ## / …) → header text kept, # stripped - 3. Horizontal rules (--- / *** / ___) → removed - 4. Table separator rows (|---|---|) → removed - 5. Table data rows (| a | b |) → cells joined with commas - 6. Bold / italic markers (**x**, *x*, __x__, _x_) → text kept, markers stripped - 7. Blockquote markers (> …) → marker stripped, text kept - 8. Inline code backticks (`x`) → backticks stripped, text kept - 9. Emojis → removed - 10. Curly quotes → straight quotes - Em/en dashes → comma (natural pause, not a spoken symbol) - Separator hyphens ( - ) → comma - Unordered list bullets (^- , ^* ) → removed - Remaining bare * and _ → removed - Other non-speakable symbols → removed - 11. Collapse extra whitespace - - Args: - text: Raw text, potentially containing markdown and/or emoji. - - Returns: - Plain text suitable for speech synthesis. - """ - # 1. Fenced code blocks - text = re.sub(r"```[\s\S]*?```", "", text) - - # 2. Markdown headers (# Heading → Heading) - text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE) - - # 3. Horizontal rules (---, ***, ___ on their own line) - text = re.sub(r"^\s*[-*_]{3,}\s*$", "", text, flags=re.MULTILINE) - - # 4. Table separator rows |---|:---:|---| - text = re.sub(r"^\s*\|[\s\-|:]+\|\s*$", "", text, flags=re.MULTILINE) - - # 5. Table data rows | cell | cell | → cell, cell - def _table_row_to_csv(m: re.Match) -> str: - cells = [c.strip() for c in m.group(1).split("|")] - return ", ".join(c for c in cells if c) - - text = re.sub(r"^\s*\|(.+)\|\s*$", _table_row_to_csv, text, flags=re.MULTILINE) - - # 6. Bold / italic (**x**, *x*, __x__, _x_) - # Handle triple before double before single to avoid partial matches. - text = re.sub(r"\*{3}([^*]+)\*{3}", r"\1", text) - text = re.sub(r"\*{2}([^*]+)\*{2}", r"\1", text) - text = re.sub(r"\*([^*\s][^*]*[^*\s]|\S)\*", r"\1", text) - text = re.sub(r"_{3}([^_]+)_{3}", r"\1", text) - text = re.sub(r"_{2}([^_]+)_{2}", r"\1", text) - text = re.sub(r"_([^_\s][^_]*[^_\s]|\S)_", r"\1", text) - - # 7. Blockquote markers - text = re.sub(r"^\s*>\s*", "", text, flags=re.MULTILINE) - - # 8. Inline code backticks - text = re.sub(r"`([^`]*)`", r"\1", text) - - # 9. Emojis - text = _EMOJI_PATTERN.sub("", text) - - # 10. Typographic and non-speakable characters - - # Curly quotes → straight equivalents (speakable) - text = text.replace("\u2018", "'") # LEFT SINGLE QUOTATION MARK - text = text.replace("\u2019", "'") # RIGHT SINGLE QUOTATION MARK - text = text.replace("\u201c", '"') # LEFT DOUBLE QUOTATION MARK - text = text.replace("\u201d", '"') # RIGHT DOUBLE QUOTATION MARK - - # Em/en dashes → comma (they mark a pause, not a spoken symbol) - text = text.replace("\u2014", ", ") # EM DASH - text = text.replace("\u2013", ", ") # EN DASH - - # Hyphen used as a separator ( - ) → comma; keep word-hyphens (e.g. "well-known") - text = re.sub(r"(?<= )- | -(?= )", ", ", text) - - # Unordered list bullets at line start (not caught by step 3) - text = re.sub(r"^[\s]*[-*]\s+", "", text, flags=re.MULTILINE) - - # Remaining bare * and _ (e.g. orphaned markers) - text = text.replace("*", "") - text = text.replace("_", " ") - - # Other symbols that TTS engines typically misread or glitch on - text = re.sub(r"[\\|<>{}[\]~^=+#@]", "", text) - - # 11. Collapse whitespace - text = re.sub(r"\n{3,}", "\n\n", text) # no more than one blank line - text = re.sub(r"[ \t]+", " ", text) # collapse spaces/tabs - # text = text.strip() - - return text