From 54620133d4473e3f8d10d26112fbf002ece6b8ea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= <aleix@daily.co>
Date: Tue, 18 Mar 2025 23:08:34 -0700
Subject: [PATCH] services: add spelling out support to CartesiaTTSService and
 RimeTTSService

---
 CHANGELOG.md                     | 3 +++
 src/pipecat/services/cartesia.py | 6 +++++-
 src/pipecat/services/rime.py     | 6 +++++-
 3 files changed, 13 insertions(+), 2 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5abdf4ef5..3006ed5ca 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -142,6 +142,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+- Fixed a `CartesiaTTSService` and `RimeTTSService` issue where text between
+  spelling out tags would be treated as an end of sentence.
+
 - Fixed a `match_endofsentence` issue that would result in floating point
   numbers to be considered an end of sentence.
 
diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py
index 8b7f57c63..5a795d750 100644
--- a/src/pipecat/services/cartesia.py
+++ b/src/pipecat/services/cartesia.py
@@ -7,7 +7,7 @@
 import base64
 import json
 import uuid
-from typing import AsyncGenerator, List, Optional, Union
+from typing import AsyncGenerator, List, Optional, Sequence, Union
 
 from loguru import logger
 from pydantic import BaseModel
@@ -26,6 +26,8 @@ from pipecat.frames.frames import (
 from pipecat.processors.frame_processor import FrameDirection
 from pipecat.services.ai_services import AudioContextWordTTSService, TTSService
 from pipecat.transcriptions.language import Language
+from pipecat.utils.text.base_text_aggregator import BaseTextAggregator
+from pipecat.utils.text.skip_tags_aggregator import SkipTagsAggregator
 
 # See .env.example for Cartesia configuration needed
 try:
@@ -89,6 +91,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
         encoding: str = "pcm_s16le",
         container: str = "raw",
         params: InputParams = InputParams(),
+        text_aggregators: Sequence[BaseTextAggregator] = [],
         **kwargs,
     ):
         # Aggregating sentences still gives cleaner-sounding results and fewer
@@ -106,6 +109,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
             push_text_frames=False,
             pause_frame_processing=True,
             sample_rate=sample_rate,
+            text_aggregators=text_aggregators or [SkipTagsAggregator([("<spell>", "</spell>")])],
             **kwargs,
         )
 
diff --git a/src/pipecat/services/rime.py b/src/pipecat/services/rime.py
index b2610b06c..471f82d66 100644
--- a/src/pipecat/services/rime.py
+++ b/src/pipecat/services/rime.py
@@ -7,7 +7,7 @@
 import base64
 import json
 import uuid
-from typing import AsyncGenerator, Optional
+from typing import AsyncGenerator, Optional, Sequence
 
 import aiohttp
 from loguru import logger
@@ -27,6 +27,8 @@ from pipecat.frames.frames import (
 from pipecat.processors.frame_processor import FrameDirection
 from pipecat.services.ai_services import AudioContextWordTTSService, TTSService
 from pipecat.transcriptions.language import Language
+from pipecat.utils.text.base_text_aggregator import BaseTextAggregator
+from pipecat.utils.text.skip_tags_aggregator import SkipTagsAggregator
 
 try:
     import websockets
@@ -78,6 +80,7 @@ class RimeTTSService(AudioContextWordTTSService):
         model: str = "mistv2",
         sample_rate: Optional[int] = None,
         params: InputParams = InputParams(),
+        text_aggregators: Sequence[BaseTextAggregator] = [],
         **kwargs,
     ):
         """Initialize Rime TTS service.
@@ -97,6 +100,7 @@ class RimeTTSService(AudioContextWordTTSService):
             push_stop_frames=True,
             pause_frame_processing=True,
             sample_rate=sample_rate,
+            text_aggregators=text_aggregators or [SkipTagsAggregator([("spell(", ")")])],
             **kwargs,
         )