From e546541e2094cf587d38f1e5e5db59101dd4669c Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Thu, 30 Apr 2026 09:25:31 -0400 Subject: [PATCH 1/6] feat(cartesia): align WebSocket TTS with latest API and buffering guidance - Bump default cartesia_version to 2026-03-01. - Replace deprecated use_original_timestamps with use_normalized_timestamps so word timestamps match what was actually spoken. - Add max_buffer_delay_ms init arg; auto-derive 0 in SENTENCE mode to avoid the doc-warned "middle ground" of client + server buffering, leave unset in TOKEN mode for managed buffering. - Silently consume flush_done messages now emitted per transcript when server-side buffering is disabled. --- src/pipecat/services/cartesia/tts.py | 30 ++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/src/pipecat/services/cartesia/tts.py b/src/pipecat/services/cartesia/tts.py index 320d5a913..264493fcd 100644 --- a/src/pipecat/services/cartesia/tts.py +++ b/src/pipecat/services/cartesia/tts.py @@ -232,12 +232,13 @@ class CartesiaTTSService(WebsocketTTSService): *, api_key: str, voice_id: str | None = None, - cartesia_version: str = "2025-04-16", + cartesia_version: str = "2026-03-01", url: str = "wss://api.cartesia.ai/tts/websocket", model: str | None = None, sample_rate: int | None = None, encoding: str = "pcm_s16le", container: str = "raw", + max_buffer_delay_ms: int | None = None, params: InputParams | None = None, settings: Settings | None = None, text_aggregation_mode: TextAggregationMode | None = None, @@ -263,6 +264,12 @@ class CartesiaTTSService(WebsocketTTSService): sample_rate: Audio sample rate. If None, uses default. encoding: Audio encoding format. container: Audio container format. + max_buffer_delay_ms: Server-side buffering window before generation + starts. ``0`` disables server buffering (custom buffering); any + value in (0, 5000] enables managed buffering. 
If ``None``, + derived from ``text_aggregation_mode``: ``0`` for ``SENTENCE`` + (avoids stacking client and server buffering), unset for + ``TOKEN`` (uses Cartesia's 3000ms default). params: Additional input parameters for voice customization. .. deprecated:: 0.0.105 @@ -353,6 +360,15 @@ class CartesiaTTSService(WebsocketTTSService): self._output_encoding = encoding self._output_sample_rate = 0 # Set in start() from self.sample_rate + # Cartesia warns against the "middle ground" of client-side sentence + # aggregation plus the server's default 3000ms buffer. When the user + # doesn't pick a value, send 0 in SENTENCE mode (custom buffering) and + # leave it unset in TOKEN mode so the server default applies (managed + # buffering). + if max_buffer_delay_ms is None and not self._is_streaming_tokens: + max_buffer_delay_ms = 0 + self._max_buffer_delay_ms = max_buffer_delay_ms + self._receive_task = None def can_generate_metrics(self) -> bool: @@ -466,9 +482,12 @@ class CartesiaTTSService(WebsocketTTSService): "sample_rate": self._output_sample_rate, }, "add_timestamps": add_timestamps, - "use_original_timestamps": False if self._settings.model == "sonic" else True, + "use_normalized_timestamps": True, } + if self._max_buffer_delay_ms is not None: + msg["max_buffer_delay_ms"] = self._max_buffer_delay_ms + if self._settings.language: msg["language"] = self._settings.language @@ -647,6 +666,13 @@ class CartesiaTTSService(WebsocketTTSService): await self.stop_all_metrics() await self.push_error(error_msg=f"Error: {msg}") self.reset_active_audio_context() + elif msg["type"] == "flush_done": + # Cartesia emits flush_done as a per-transcript boundary marker + # within a context (e.g. when max_buffer_delay_ms=0 causes the + # server to flush each submission). We don't need it: each turn + # already has its own context_id and audio chunks are tagged + # with it. Acknowledge silently. 
+ pass else: await self.push_error(error_msg=f"Error, unknown message type: {msg}") From e508642b0a2fb33e170a8b2b7f25635c29900e18 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Thu, 30 Apr 2026 09:31:22 -0400 Subject: [PATCH 2/6] refactor(cartesia): mark tag helpers as @staticmethod SPELL/EMOTION_TAG/PAUSE_TAG/VOLUME_TAG/SPEED_TAG are stateless and worked only via class-level access. Decorating them lets instance access work too and silences the missing-self lint warning. --- src/pipecat/services/cartesia/tts.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/pipecat/services/cartesia/tts.py b/src/pipecat/services/cartesia/tts.py index 264493fcd..192d5dc90 100644 --- a/src/pipecat/services/cartesia/tts.py +++ b/src/pipecat/services/cartesia/tts.py @@ -391,22 +391,27 @@ class CartesiaTTSService(WebsocketTTSService): return language_to_cartesia_language(language) # A set of Cartesia-specific helpers for text transformations + @staticmethod def SPELL(text: str) -> str: """Wrap text in Cartesia spell tag.""" return f"{text}" + @staticmethod def EMOTION_TAG(emotion: CartesiaEmotion) -> str: """Convenience method to create an emotion tag.""" return f'' + @staticmethod def PAUSE_TAG(seconds: float) -> str: """Convenience method to create a pause tag.""" return f'' + @staticmethod def VOLUME_TAG(volume: float) -> str: """Convenience method to create a volume tag.""" return f'' + @staticmethod def SPEED_TAG(speed: float) -> str: """Convenience method to create a speed tag.""" return f'' From 3e5aabc5f2b445f3e82d0f350ba52eb1f782b3e6 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Thu, 30 Apr 2026 09:33:33 -0400 Subject: [PATCH 3/6] fix(cartesia): guard HTTP session before use Pyright flagged the .post() call on a possibly-None _session. Raise a clear RuntimeError if start() wasn't called instead of crashing on the attribute access. 
--- pyrightconfig.json | 1 - src/pipecat/services/cartesia/tts.py | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pyrightconfig.json b/pyrightconfig.json index 5483be730..8f6b951c6 100644 --- a/pyrightconfig.json +++ b/pyrightconfig.json @@ -42,7 +42,6 @@ "src/pipecat/services/azure/stt.py", "src/pipecat/services/azure/tts.py", "src/pipecat/services/cartesia/stt.py", - "src/pipecat/services/cartesia/tts.py", "src/pipecat/services/deepgram/flux/base.py", "src/pipecat/services/deepgram/flux/sagemaker/stt.py", "src/pipecat/services/deepgram/flux/stt.py", diff --git a/src/pipecat/services/cartesia/tts.py b/src/pipecat/services/cartesia/tts.py index 192d5dc90..f91fa90b5 100644 --- a/src/pipecat/services/cartesia/tts.py +++ b/src/pipecat/services/cartesia/tts.py @@ -916,6 +916,9 @@ class CartesiaHttpTTSService(TTSService): logger.debug(f"{self}: Generating TTS [{text}]") try: + if self._session is None: + raise RuntimeError("HTTP session is not initialized; call start() before run_tts()") + voice_config = {"mode": "id", "id": self._settings.voice} output_format = { From 21547c8680974448eacb8abec5ec1b453e52ddb7 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Thu, 30 Apr 2026 09:41:43 -0400 Subject: [PATCH 4/6] fix(cartesia): stop double-yielding ErrorFrame on HTTP non-200 The non-200 branch yielded an ErrorFrame and then raised, which the outer except caught and yielded a second, less informative "Unknown error" frame. Return after the yield and fold the status code into the message. 
--- src/pipecat/services/cartesia/tts.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/pipecat/services/cartesia/tts.py b/src/pipecat/services/cartesia/tts.py index f91fa90b5..c6c539e97 100644 --- a/src/pipecat/services/cartesia/tts.py +++ b/src/pipecat/services/cartesia/tts.py @@ -955,8 +955,10 @@ class CartesiaHttpTTSService(TTSService): async with self._session.post(url, json=payload, headers=headers) as response: if response.status != 200: error_text = await response.text() - yield ErrorFrame(error=f"Cartesia API error: {error_text}") - raise Exception(f"Cartesia API returned status {response.status}: {error_text}") + yield ErrorFrame( + error=f"Cartesia API error (status {response.status}): {error_text}" + ) + return audio_data = await response.read() From fb42a7dcf32193b8c2f8a75ce218f5527c94cb7c Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Thu, 30 Apr 2026 09:45:16 -0400 Subject: [PATCH 5/6] Add changelog for #4390 --- changelog/4390.added.md | 1 + changelog/4390.changed.2.md | 1 + changelog/4390.changed.md | 1 + changelog/4390.fixed.2.md | 1 + changelog/4390.fixed.3.md | 1 + changelog/4390.fixed.md | 1 + 6 files changed, 6 insertions(+) create mode 100644 changelog/4390.added.md create mode 100644 changelog/4390.changed.2.md create mode 100644 changelog/4390.changed.md create mode 100644 changelog/4390.fixed.2.md create mode 100644 changelog/4390.fixed.3.md create mode 100644 changelog/4390.fixed.md diff --git a/changelog/4390.added.md b/changelog/4390.added.md new file mode 100644 index 000000000..8c7cb4a99 --- /dev/null +++ b/changelog/4390.added.md @@ -0,0 +1 @@ +- Added a `max_buffer_delay_ms` constructor argument to `CartesiaTTSService` for controlling Cartesia's server-side text buffering. 
When unset, Pipecat picks a sensible default based on `text_aggregation_mode`: `0` in `SENTENCE` mode (custom buffering — avoids stacking client-side aggregation on top of Cartesia's default 3000ms server buffer) and unset in `TOKEN` mode (Cartesia's managed buffering applies). Pass an explicit value (0–5000ms) to override. diff --git a/changelog/4390.changed.2.md b/changelog/4390.changed.2.md new file mode 100644 index 000000000..70018745e --- /dev/null +++ b/changelog/4390.changed.2.md @@ -0,0 +1 @@ +- Default `cartesia_version` for `CartesiaTTSService` bumped from `2025-04-16` to `2026-03-01`, matching `CartesiaHttpTTSService` and unlocking the `use_normalized_timestamps` and `max_buffer_delay_ms` fields. diff --git a/changelog/4390.changed.md b/changelog/4390.changed.md new file mode 100644 index 000000000..cfc9840f8 --- /dev/null +++ b/changelog/4390.changed.md @@ -0,0 +1 @@ +- ⚠️ `CartesiaTTSService` now sends `use_normalized_timestamps: false` in place of the deprecated `use_original_timestamps` field, so that the text in word timestamps matches the input text. Previously the service sent `use_original_timestamps: false` for the `sonic` model and `true` for every other model; the new field is sent with the same value regardless of model. diff --git a/changelog/4390.fixed.2.md b/changelog/4390.fixed.2.md new file mode 100644 index 000000000..cc0f62057 --- /dev/null +++ b/changelog/4390.fixed.2.md @@ -0,0 +1 @@ +- Fixed `CartesiaHttpTTSService` pushing two `ErrorFrame`s on a non-200 response — one with the API's error text and a second, less informative "Unknown error" frame from the outer exception handler. It now pushes a single frame that includes the HTTP status code and returns cleanly. 
diff --git a/changelog/4390.fixed.3.md b/changelog/4390.fixed.3.md new file mode 100644 index 000000000..6de9fd130 --- /dev/null +++ b/changelog/4390.fixed.3.md @@ -0,0 +1 @@ +- Fixed Cartesia tag helpers (`SPELL`, `EMOTION_TAG`, `PAUSE_TAG`, `VOLUME_TAG`, `SPEED_TAG`) raising `TypeError` when called on an instance (e.g. `tts.SPELL("hi")`). They're now `@staticmethod` and callable from both the class and an instance. diff --git a/changelog/4390.fixed.md b/changelog/4390.fixed.md new file mode 100644 index 000000000..4cc9afb3f --- /dev/null +++ b/changelog/4390.fixed.md @@ -0,0 +1 @@ +- Fixed `CartesiaTTSService` surfacing `flush_done` messages from Cartesia as `ErrorFrame`s. The latest API emits a `flush_done` per transcript when server-side buffering is disabled; Pipecat now consumes them silently since each turn already has its own `context_id`. From 6487f895b35d5012411bbe6786eb18e19c966024 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Thu, 30 Apr 2026 14:21:14 -0400 Subject: [PATCH 6/6] Setting use_normalized_timestamps to False so that input and output text match --- src/pipecat/services/cartesia/tts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipecat/services/cartesia/tts.py b/src/pipecat/services/cartesia/tts.py index c6c539e97..6b3fc4b83 100644 --- a/src/pipecat/services/cartesia/tts.py +++ b/src/pipecat/services/cartesia/tts.py @@ -487,7 +487,7 @@ class CartesiaTTSService(WebsocketTTSService): "sample_rate": self._output_sample_rate, }, "add_timestamps": add_timestamps, - "use_normalized_timestamps": True, + "use_normalized_timestamps": False, } if self._max_buffer_delay_ms is not None: