diff --git a/CHANGELOG.md b/CHANGELOG.md index 85848f22b..1183c1483 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -105,6 +105,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed +- Fixed a `CartesiaTTSService` service issue that would cause audio overlapping + in some cases. + - Fixed a websocket-based service issue (e.g. `CartesiaTTSService`) that was preventing a reconnection after the server disconnected cleanly, which was causing an inifite loop instead. diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index d5ea46a30..49bcac2f1 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -27,7 +27,7 @@ from pipecat.frames.frames import ( TTSStoppedFrame, ) from pipecat.processors.frame_processor import FrameDirection -from pipecat.services.ai_services import TTSService, WordTTSService +from pipecat.services.ai_services import AudioContextWordTTSService, TTSService from pipecat.services.websocket_service import WebsocketService from pipecat.transcriptions.language import Language @@ -75,7 +75,7 @@ def language_to_cartesia_language(language: Language) -> Optional[str]: return result -class CartesiaTTSService(WordTTSService, WebsocketService): +class CartesiaTTSService(AudioContextWordTTSService, WebsocketService): class InputParams(BaseModel): language: Optional[Language] = Language.EN speed: Optional[Union[str, float]] = "" @@ -105,7 +105,7 @@ class CartesiaTTSService(WordTTSService, WebsocketService): # if we're interrupted. Cartesia gives us word-by-word timestamps. We # can use those to generate text frames ourselves aligned with the # playout timing of the audio! - WordTTSService.__init__( + AudioContextWordTTSService.__init__( self, aggregate_sentences=True, push_text_frames=False, @@ -191,12 +191,12 @@ class CartesiaTTSService(WordTTSService, WebsocketService): self._receive_task = self.create_task(self._receive_task_handler(self.push_error)) async def _disconnect(self): - await self._disconnect_websocket() - if self._receive_task: await self.cancel_task(self._receive_task) self._receive_task = None + await self._disconnect_websocket() + async def _connect_websocket(self): try: logger.debug("Connecting to Cartesia") @@ -239,21 +239,19 @@ class CartesiaTTSService(WordTTSService, WebsocketService): logger.trace(f"{self}: flushing audio") msg = self._build_msg(text="", continue_transcript=False) await self._websocket.send(msg) + self._context_id = None async def _receive_messages(self): async for message in self._get_websocket(): msg = json.loads(message) - if not msg or msg["context_id"] != self._context_id: + if not msg or not self.audio_context_available(msg["context_id"]): continue if msg["type"] == "done": await self.stop_ttfb_metrics() - # Unset _context_id but not the _context_id_start_timestamp - # because we are likely still playing out audio and need the - # timestamp to set send context frames. - self._context_id = None await self.add_word_timestamps( [("TTSStoppedFrame", 0), ("LLMFullResponseEndFrame", 0), ("Reset", 0)] ) + await self.remove_audio_context(msg["context_id"]) elif msg["type"] == "timestamps": await self.add_word_timestamps( list(zip(msg["word_timestamps"]["words"], msg["word_timestamps"]["start"])) @@ -266,12 +264,13 @@ class CartesiaTTSService(WordTTSService, WebsocketService): sample_rate=self.sample_rate, num_channels=1, ) - await self.push_frame(frame) + await self.append_to_audio_context(msg["context_id"], frame) elif msg["type"] == "error": logger.error(f"{self} error: {msg}") await self.push_frame(TTSStoppedFrame()) await self.stop_all_metrics() await self.push_error(ErrorFrame(f"{self} error: {msg['error']}")) + self._context_id = None else: logger.error(f"{self} error, unknown message type: {msg}") @@ -299,6 +298,7 @@ class CartesiaTTSService(WordTTSService, WebsocketService): await self.start_ttfb_metrics() yield TTSStartedFrame() self._context_id = str(uuid.uuid4()) + await self.create_audio_context(self._context_id) msg = self._build_msg(text=text or " ") # Text must contain at least one character