services: add spelling out support to CartesiaTTSService and RimeTTSService
This commit is contained in:
@@ -142,6 +142,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed a `CartesiaTTSService` and `RimeTTSService` issue that would consider
|
||||
text between spelling out tags as an end of sentence.
|
||||
|
||||
- Fixed a `match_endofsentence` issue that would result in floating point
|
||||
numbers being considered an end of sentence.
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
import base64
|
||||
import json
|
||||
import uuid
|
||||
from typing import AsyncGenerator, List, Optional, Union
|
||||
from typing import AsyncGenerator, List, Optional, Sequence, Union
|
||||
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
@@ -26,6 +26,8 @@ from pipecat.frames.frames import (
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.ai_services import AudioContextWordTTSService, TTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.utils.text.base_text_aggregator import BaseTextAggregator
|
||||
from pipecat.utils.text.skip_tags_aggregator import SkipTagsAggregator
|
||||
|
||||
# See .env.example for Cartesia configuration needed
|
||||
try:
|
||||
@@ -89,6 +91,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
||||
encoding: str = "pcm_s16le",
|
||||
container: str = "raw",
|
||||
params: InputParams = InputParams(),
|
||||
text_aggregators: Sequence[BaseTextAggregator] = [],
|
||||
**kwargs,
|
||||
):
|
||||
# Aggregating sentences still gives cleaner-sounding results and fewer
|
||||
@@ -106,6 +109,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
||||
push_text_frames=False,
|
||||
pause_frame_processing=True,
|
||||
sample_rate=sample_rate,
|
||||
text_aggregators=text_aggregators or [SkipTagsAggregator([("<spell>", "</spell>")])],
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
import base64
|
||||
import json
|
||||
import uuid
|
||||
from typing import AsyncGenerator, Optional
|
||||
from typing import AsyncGenerator, Optional, Sequence
|
||||
|
||||
import aiohttp
|
||||
from loguru import logger
|
||||
@@ -27,6 +27,8 @@ from pipecat.frames.frames import (
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.ai_services import AudioContextWordTTSService, TTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.utils.text.base_text_aggregator import BaseTextAggregator
|
||||
from pipecat.utils.text.skip_tags_aggregator import SkipTagsAggregator
|
||||
|
||||
try:
|
||||
import websockets
|
||||
@@ -78,6 +80,7 @@ class RimeTTSService(AudioContextWordTTSService):
|
||||
model: str = "mistv2",
|
||||
sample_rate: Optional[int] = None,
|
||||
params: InputParams = InputParams(),
|
||||
text_aggregators: Sequence[BaseTextAggregator] = [],
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize Rime TTS service.
|
||||
@@ -97,6 +100,7 @@ class RimeTTSService(AudioContextWordTTSService):
|
||||
push_stop_frames=True,
|
||||
pause_frame_processing=True,
|
||||
sample_rate=sample_rate,
|
||||
text_aggregators=text_aggregators or [SkipTagsAggregator([("spell(", ")")])],
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user