Merge pull request #4390 from pipecat-ai/mb/cartesia-tts-api-updates
feat(cartesia): align TTS services with latest API and buffering guidance
This commit is contained in:
1
changelog/4390.added.md
Normal file
1
changelog/4390.added.md
Normal file
@@ -0,0 +1 @@
|
||||
- Added a `max_buffer_delay_ms` constructor argument to `CartesiaTTSService` for controlling Cartesia's server-side text buffering. When unset, Pipecat picks a sensible default based on `text_aggregation_mode`: `0` in `SENTENCE` mode (custom buffering — avoids stacking client-side aggregation on top of Cartesia's default 3000ms server buffer) and unset in `TOKEN` mode (Cartesia's managed buffering applies). Pass an explicit value (0–5000ms) to override.
|
||||
1
changelog/4390.changed.2.md
Normal file
1
changelog/4390.changed.2.md
Normal file
@@ -0,0 +1 @@
|
||||
- Default `cartesia_version` for `CartesiaTTSService` bumped from `2025-04-16` to `2026-03-01`, matching `CartesiaHttpTTSService` and unlocking the `use_normalized_timestamps` and `max_buffer_delay_ms` fields.
|
||||
1
changelog/4390.changed.md
Normal file
1
changelog/4390.changed.md
Normal file
@@ -0,0 +1 @@
|
||||
- ⚠️ `CartesiaTTSService` now sends `use_normalized_timestamps: true` instead of the deprecated `use_original_timestamps` field. Word timestamps now reflect what was actually spoken (post text-normalization and pronunciation-dictionary substitution), matching the convention Pipecat uses for ElevenLabs. This is a behavior change for `sonic-3` users, who were previously receiving timestamps tied to the input transcript.
|
||||
1
changelog/4390.fixed.2.md
Normal file
1
changelog/4390.fixed.2.md
Normal file
@@ -0,0 +1 @@
|
||||
- Fixed `CartesiaHttpTTSService` pushing two `ErrorFrame`s on a non-200 response — one with the API's error text and a second, less informative "Unknown error" frame from the outer exception handler. It now pushes a single frame that includes the HTTP status code and returns cleanly.
|
||||
1
changelog/4390.fixed.3.md
Normal file
1
changelog/4390.fixed.3.md
Normal file
@@ -0,0 +1 @@
|
||||
- Fixed Cartesia tag helpers (`SPELL`, `EMOTION_TAG`, `PAUSE_TAG`, `VOLUME_TAG`, `SPEED_TAG`) raising `TypeError` when called on an instance (e.g. `tts.SPELL("hi")`). They're now `@staticmethod` and callable from both the class and an instance.
|
||||
1
changelog/4390.fixed.md
Normal file
1
changelog/4390.fixed.md
Normal file
@@ -0,0 +1 @@
|
||||
- Fixed `CartesiaTTSService` surfacing `flush_done` messages from Cartesia as `ErrorFrame`s. The latest API emits a `flush_done` per transcript when server-side buffering is disabled; Pipecat now consumes them silently since each turn already has its own `context_id`.
|
||||
@@ -42,7 +42,6 @@
|
||||
"src/pipecat/services/azure/stt.py",
|
||||
"src/pipecat/services/azure/tts.py",
|
||||
"src/pipecat/services/cartesia/stt.py",
|
||||
"src/pipecat/services/cartesia/tts.py",
|
||||
"src/pipecat/services/deepgram/flux/base.py",
|
||||
"src/pipecat/services/deepgram/flux/sagemaker/stt.py",
|
||||
"src/pipecat/services/deepgram/flux/stt.py",
|
||||
|
||||
@@ -232,12 +232,13 @@ class CartesiaTTSService(WebsocketTTSService):
|
||||
*,
|
||||
api_key: str,
|
||||
voice_id: str | None = None,
|
||||
cartesia_version: str = "2025-04-16",
|
||||
cartesia_version: str = "2026-03-01",
|
||||
url: str = "wss://api.cartesia.ai/tts/websocket",
|
||||
model: str | None = None,
|
||||
sample_rate: int | None = None,
|
||||
encoding: str = "pcm_s16le",
|
||||
container: str = "raw",
|
||||
max_buffer_delay_ms: int | None = None,
|
||||
params: InputParams | None = None,
|
||||
settings: Settings | None = None,
|
||||
text_aggregation_mode: TextAggregationMode | None = None,
|
||||
@@ -263,6 +264,12 @@ class CartesiaTTSService(WebsocketTTSService):
|
||||
sample_rate: Audio sample rate. If None, uses default.
|
||||
encoding: Audio encoding format.
|
||||
container: Audio container format.
|
||||
max_buffer_delay_ms: Server-side buffering window before generation
|
||||
starts. ``0`` disables server buffering (custom buffering); any
|
||||
value in (0, 5000] enables managed buffering. If ``None``,
|
||||
derived from ``text_aggregation_mode``: ``0`` for ``SENTENCE``
|
||||
(avoids stacking client and server buffering), unset for
|
||||
``TOKEN`` (uses Cartesia's 3000ms default).
|
||||
params: Additional input parameters for voice customization.
|
||||
|
||||
.. deprecated:: 0.0.105
|
||||
@@ -353,6 +360,15 @@ class CartesiaTTSService(WebsocketTTSService):
|
||||
self._output_encoding = encoding
|
||||
self._output_sample_rate = 0 # Set in start() from self.sample_rate
|
||||
|
||||
# Cartesia warns against the "middle ground" of client-side sentence
|
||||
# aggregation plus the server's default 3000ms buffer. When the user
|
||||
# doesn't pick a value, send 0 in SENTENCE mode (custom buffering) and
|
||||
# leave it unset in TOKEN mode so the server default applies (managed
|
||||
# buffering).
|
||||
if max_buffer_delay_ms is None and not self._is_streaming_tokens:
|
||||
max_buffer_delay_ms = 0
|
||||
self._max_buffer_delay_ms = max_buffer_delay_ms
|
||||
|
||||
self._receive_task = None
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
@@ -375,22 +391,27 @@ class CartesiaTTSService(WebsocketTTSService):
|
||||
return language_to_cartesia_language(language)
|
||||
|
||||
# A set of Cartesia-specific helpers for text transformations
|
||||
@staticmethod
|
||||
def SPELL(text: str) -> str:
|
||||
"""Wrap text in Cartesia spell tag."""
|
||||
return f"<spell>{text}</spell>"
|
||||
|
||||
@staticmethod
|
||||
def EMOTION_TAG(emotion: CartesiaEmotion) -> str:
|
||||
"""Convenience method to create an emotion tag."""
|
||||
return f'<emotion value="{emotion}" />'
|
||||
|
||||
@staticmethod
|
||||
def PAUSE_TAG(seconds: float) -> str:
|
||||
"""Convenience method to create a pause tag."""
|
||||
return f'<break time="{seconds}s" />'
|
||||
|
||||
@staticmethod
|
||||
def VOLUME_TAG(volume: float) -> str:
|
||||
"""Convenience method to create a volume tag."""
|
||||
return f'<volume ratio="{volume}" />'
|
||||
|
||||
@staticmethod
|
||||
def SPEED_TAG(speed: float) -> str:
|
||||
"""Convenience method to create a speed tag."""
|
||||
return f'<speed ratio="{speed}" />'
|
||||
@@ -466,9 +487,12 @@ class CartesiaTTSService(WebsocketTTSService):
|
||||
"sample_rate": self._output_sample_rate,
|
||||
},
|
||||
"add_timestamps": add_timestamps,
|
||||
"use_original_timestamps": False if self._settings.model == "sonic" else True,
|
||||
"use_normalized_timestamps": True,
|
||||
}
|
||||
|
||||
if self._max_buffer_delay_ms is not None:
|
||||
msg["max_buffer_delay_ms"] = self._max_buffer_delay_ms
|
||||
|
||||
if self._settings.language:
|
||||
msg["language"] = self._settings.language
|
||||
|
||||
@@ -647,6 +671,13 @@ class CartesiaTTSService(WebsocketTTSService):
|
||||
await self.stop_all_metrics()
|
||||
await self.push_error(error_msg=f"Error: {msg}")
|
||||
self.reset_active_audio_context()
|
||||
elif msg["type"] == "flush_done":
|
||||
# Cartesia emits flush_done as a per-transcript boundary marker
|
||||
# within a context (e.g. when max_buffer_delay_ms=0 causes the
|
||||
# server to flush each submission). We don't need it: each turn
|
||||
# already has its own context_id and audio chunks are tagged
|
||||
# with it. Acknowledge silently.
|
||||
pass
|
||||
else:
|
||||
await self.push_error(error_msg=f"Error, unknown message type: {msg}")
|
||||
|
||||
@@ -885,6 +916,9 @@ class CartesiaHttpTTSService(TTSService):
|
||||
logger.debug(f"{self}: Generating TTS [{text}]")
|
||||
|
||||
try:
|
||||
if self._session is None:
|
||||
raise RuntimeError("HTTP session is not initialized; call start() before run_tts()")
|
||||
|
||||
voice_config = {"mode": "id", "id": self._settings.voice}
|
||||
|
||||
output_format = {
|
||||
@@ -921,8 +955,10 @@ class CartesiaHttpTTSService(TTSService):
|
||||
async with self._session.post(url, json=payload, headers=headers) as response:
|
||||
if response.status != 200:
|
||||
error_text = await response.text()
|
||||
yield ErrorFrame(error=f"Cartesia API error: {error_text}")
|
||||
raise Exception(f"Cartesia API returned status {response.status}: {error_text}")
|
||||
yield ErrorFrame(
|
||||
error=f"Cartesia API error (status {response.status}): {error_text}"
|
||||
)
|
||||
return
|
||||
|
||||
audio_data = await response.read()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user