services: add spelling out support to CartesiaTTSService and RimeTTSService

2025-03-18 23:08:34 -07:00
parent e7224473f2
commit 54620133d4
3 changed files with 13 additions and 2 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -142,6 +142,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### Fixed

+- Fixed a `CartesiaTTSService` and `RimeTTSService` issue where text between
+  spelling-out tags could be treated as an end of sentence.
+
 - Fixed a `match_endofsentence` issue that would result in floating point
  numbers to be considered an end of sentence.

--- a/src/pipecat/services/cartesia.py
+++ b/src/pipecat/services/cartesia.py
@@ -7,7 +7,7 @@
 import base64
 import json
 import uuid
-from typing import AsyncGenerator, List, Optional, Union
+from typing import AsyncGenerator, List, Optional, Sequence, Union

 from loguru import logger
 from pydantic import BaseModel
@@ -26,6 +26,8 @@ from pipecat.frames.frames import (
 from pipecat.processors.frame_processor import FrameDirection
 from pipecat.services.ai_services import AudioContextWordTTSService, TTSService
 from pipecat.transcriptions.language import Language
+from pipecat.utils.text.base_text_aggregator import BaseTextAggregator
+from pipecat.utils.text.skip_tags_aggregator import SkipTagsAggregator

 # See .env.example for Cartesia configuration needed
 try:
@@ -89,6 +91,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
        encoding: str = "pcm_s16le",
        container: str = "raw",
        params: InputParams = InputParams(),
+        text_aggregators: Sequence[BaseTextAggregator] = [],
        **kwargs,
    ):
        # Aggregating sentences still gives cleaner-sounding results and fewer
@@ -106,6 +109,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
            push_text_frames=False,
            pause_frame_processing=True,
            sample_rate=sample_rate,
+            text_aggregators=text_aggregators or [SkipTagsAggregator([("<spell>", "</spell>")])],
            **kwargs,
        )

--- a/src/pipecat/services/rime.py
+++ b/src/pipecat/services/rime.py
@@ -7,7 +7,7 @@
 import base64
 import json
 import uuid
-from typing import AsyncGenerator, Optional
+from typing import AsyncGenerator, Optional, Sequence

 import aiohttp
 from loguru import logger
@@ -27,6 +27,8 @@ from pipecat.frames.frames import (
 from pipecat.processors.frame_processor import FrameDirection
 from pipecat.services.ai_services import AudioContextWordTTSService, TTSService
 from pipecat.transcriptions.language import Language
+from pipecat.utils.text.base_text_aggregator import BaseTextAggregator
+from pipecat.utils.text.skip_tags_aggregator import SkipTagsAggregator

 try:
    import websockets
@@ -78,6 +80,7 @@ class RimeTTSService(AudioContextWordTTSService):
        model: str = "mistv2",
        sample_rate: Optional[int] = None,
        params: InputParams = InputParams(),
+        text_aggregators: Sequence[BaseTextAggregator] = [],
        **kwargs,
    ):
        """Initialize Rime TTS service.
@@ -97,6 +100,7 @@ class RimeTTSService(AudioContextWordTTSService):
            push_stop_frames=True,
            pause_frame_processing=True,
            sample_rate=sample_rate,
+            text_aggregators=text_aggregators or [SkipTagsAggregator([("spell(", ")")])],
            **kwargs,
        )