From 54620133d4473e3f8d10d26112fbf002ece6b8ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 18 Mar 2025 23:08:34 -0700 Subject: [PATCH] services: add spelling out support to CartesiaTTSService and RimeTTSService --- CHANGELOG.md | 3 +++ src/pipecat/services/cartesia.py | 6 +++++- src/pipecat/services/rime.py | 6 +++++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5abdf4ef5..3006ed5ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -142,6 +142,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed +- Fixed a `CartesiaTTSService` and `RimeTTSService` issue that would treat + text between spelling out tags as an end of sentence. + - Fixed a `match_endofsentence` issue that would result in floating point numbers to be considered an end of sentence. diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index 8b7f57c63..5a795d750 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -7,7 +7,7 @@ import base64 import json import uuid -from typing import AsyncGenerator, List, Optional, Union +from typing import AsyncGenerator, List, Optional, Sequence, Union from loguru import logger from pydantic import BaseModel @@ -26,6 +26,8 @@ from pipecat.frames.frames import ( from pipecat.processors.frame_processor import FrameDirection from pipecat.services.ai_services import AudioContextWordTTSService, TTSService from pipecat.transcriptions.language import Language +from pipecat.utils.text.base_text_aggregator import BaseTextAggregator +from pipecat.utils.text.skip_tags_aggregator import SkipTagsAggregator # See .env.example for Cartesia configuration needed try: @@ -89,6 +91,7 @@ class CartesiaTTSService(AudioContextWordTTSService): encoding: str = "pcm_s16le", container: str = "raw", params: InputParams = InputParams(), + text_aggregators: Sequence[BaseTextAggregator] = [], **kwargs, ): # 
Aggregating sentences still gives cleaner-sounding results and fewer @@ -106,6 +109,7 @@ class CartesiaTTSService(AudioContextWordTTSService): push_text_frames=False, pause_frame_processing=True, sample_rate=sample_rate, + text_aggregators=text_aggregators or [SkipTagsAggregator([("<spell>", "</spell>")])], **kwargs, ) diff --git a/src/pipecat/services/rime.py b/src/pipecat/services/rime.py index b2610b06c..471f82d66 100644 --- a/src/pipecat/services/rime.py +++ b/src/pipecat/services/rime.py @@ -7,7 +7,7 @@ import base64 import json import uuid -from typing import AsyncGenerator, Optional +from typing import AsyncGenerator, Optional, Sequence import aiohttp from loguru import logger @@ -27,6 +27,8 @@ from pipecat.frames.frames import ( from pipecat.processors.frame_processor import FrameDirection from pipecat.services.ai_services import AudioContextWordTTSService, TTSService from pipecat.transcriptions.language import Language +from pipecat.utils.text.base_text_aggregator import BaseTextAggregator +from pipecat.utils.text.skip_tags_aggregator import SkipTagsAggregator try: import websockets @@ -78,6 +80,7 @@ class RimeTTSService(AudioContextWordTTSService): model: str = "mistv2", sample_rate: Optional[int] = None, params: InputParams = InputParams(), + text_aggregators: Sequence[BaseTextAggregator] = [], **kwargs, ): """Initialize Rime TTS service. @@ -97,6 +100,7 @@ class RimeTTSService(AudioContextWordTTSService): push_stop_frames=True, pause_frame_processing=True, sample_rate=sample_rate, + text_aggregators=text_aggregators or [SkipTagsAggregator([("spell(", ")")])], **kwargs, )