diff --git a/CHANGELOG.md b/CHANGELOG.md index 5abdf4ef5..3006ed5ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -142,6 +142,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed +- Fixed a `CartesiaTTSService` and `RimeTTSService` issue that would treat + text between spelling out tags as an end of sentence. + - Fixed a `match_endofsentence` issue that would result in floating point numbers to be considered an end of sentence. diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index 8b7f57c63..5a795d750 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -7,7 +7,7 @@ import base64 import json import uuid -from typing import AsyncGenerator, List, Optional, Union +from typing import AsyncGenerator, List, Optional, Sequence, Union from loguru import logger from pydantic import BaseModel @@ -26,6 +26,8 @@ from pipecat.frames.frames import ( from pipecat.processors.frame_processor import FrameDirection from pipecat.services.ai_services import AudioContextWordTTSService, TTSService from pipecat.transcriptions.language import Language +from pipecat.utils.text.base_text_aggregator import BaseTextAggregator +from pipecat.utils.text.skip_tags_aggregator import SkipTagsAggregator # See .env.example for Cartesia configuration needed try: @@ -89,6 +91,7 @@ class CartesiaTTSService(AudioContextWordTTSService): encoding: str = "pcm_s16le", container: str = "raw", params: InputParams = InputParams(), + text_aggregators: Sequence[BaseTextAggregator] = [], **kwargs, ): # Aggregating sentences still gives cleaner-sounding results and fewer @@ -106,6 +109,7 @@ class CartesiaTTSService(AudioContextWordTTSService): push_text_frames=False, pause_frame_processing=True, sample_rate=sample_rate, + text_aggregators=text_aggregators or [SkipTagsAggregator([("<spell>", "</spell>")])], **kwargs, ) diff --git a/src/pipecat/services/rime.py b/src/pipecat/services/rime.py index b2610b06c..471f82d66 100644 
--- a/src/pipecat/services/rime.py +++ b/src/pipecat/services/rime.py @@ -7,7 +7,7 @@ import base64 import json import uuid -from typing import AsyncGenerator, Optional +from typing import AsyncGenerator, Optional, Sequence import aiohttp from loguru import logger @@ -27,6 +27,8 @@ from pipecat.frames.frames import ( from pipecat.processors.frame_processor import FrameDirection from pipecat.services.ai_services import AudioContextWordTTSService, TTSService from pipecat.transcriptions.language import Language +from pipecat.utils.text.base_text_aggregator import BaseTextAggregator +from pipecat.utils.text.skip_tags_aggregator import SkipTagsAggregator try: import websockets @@ -78,6 +80,7 @@ class RimeTTSService(AudioContextWordTTSService): model: str = "mistv2", sample_rate: Optional[int] = None, params: InputParams = InputParams(), + text_aggregators: Sequence[BaseTextAggregator] = [], **kwargs, ): """Initialize Rime TTS service. @@ -97,6 +100,7 @@ class RimeTTSService(AudioContextWordTTSService): push_stop_frames=True, pause_frame_processing=True, sample_rate=sample_rate, + text_aggregators=text_aggregators or [SkipTagsAggregator([("spell(", ")")])], **kwargs, )