services: add spelling out support to CartesiaTTSService and RimeTTSService

This commit is contained in:
Aleix Conchillo Flaqué
2025-03-18 23:08:34 -07:00
parent e7224473f2
commit 54620133d4
3 changed files with 13 additions and 2 deletions

View File

@@ -142,6 +142,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
- Fixed a `CartesiaTTSService` and `RimeTTSService` issue where text between
spelling-out tags could be treated as the end of a sentence.
- Fixed a `match_endofsentence` issue that caused floating-point numbers to be
treated as the end of a sentence.

View File

@@ -7,7 +7,7 @@
import base64
import json
import uuid
from typing import AsyncGenerator, List, Optional, Union
from typing import AsyncGenerator, List, Optional, Sequence, Union
from loguru import logger
from pydantic import BaseModel
@@ -26,6 +26,8 @@ from pipecat.frames.frames import (
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.ai_services import AudioContextWordTTSService, TTSService
from pipecat.transcriptions.language import Language
from pipecat.utils.text.base_text_aggregator import BaseTextAggregator
from pipecat.utils.text.skip_tags_aggregator import SkipTagsAggregator
# See .env.example for Cartesia configuration needed
try:
@@ -89,6 +91,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
encoding: str = "pcm_s16le",
container: str = "raw",
params: InputParams = InputParams(),
text_aggregators: Sequence[BaseTextAggregator] = [],
**kwargs,
):
# Aggregating sentences still gives cleaner-sounding results and fewer
@@ -106,6 +109,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
push_text_frames=False,
pause_frame_processing=True,
sample_rate=sample_rate,
text_aggregators=text_aggregators or [SkipTagsAggregator([("<spell>", "</spell>")])],
**kwargs,
)

View File

@@ -7,7 +7,7 @@
import base64
import json
import uuid
from typing import AsyncGenerator, Optional
from typing import AsyncGenerator, Optional, Sequence
import aiohttp
from loguru import logger
@@ -27,6 +27,8 @@ from pipecat.frames.frames import (
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.ai_services import AudioContextWordTTSService, TTSService
from pipecat.transcriptions.language import Language
from pipecat.utils.text.base_text_aggregator import BaseTextAggregator
from pipecat.utils.text.skip_tags_aggregator import SkipTagsAggregator
try:
import websockets
@@ -78,6 +80,7 @@ class RimeTTSService(AudioContextWordTTSService):
model: str = "mistv2",
sample_rate: Optional[int] = None,
params: InputParams = InputParams(),
text_aggregators: Sequence[BaseTextAggregator] = [],
**kwargs,
):
"""Initialize Rime TTS service.
@@ -97,6 +100,7 @@ class RimeTTSService(AudioContextWordTTSService):
push_stop_frames=True,
pause_frame_processing=True,
sample_rate=sample_rate,
text_aggregators=text_aggregators or [SkipTagsAggregator([("spell(", ")")])],
**kwargs,
)