diff --git a/CHANGELOG.md b/CHANGELOG.md index d8696008a..d480e86b0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Upgraded to Cartesia's new Python library 1.0.0. `CartesiaTTSService` now + expects a voice ID instead of a voice name (you can get the voice ID from + Cartesia's playground). You can also specify the audio `sample_rate` and + `encoding` instead of the previous `output_format`. + ### Fixed - Fixed an issue with asynchronous STT services (Deepgram and Azure) that could diff --git a/examples/foundational/07d-interruptible-cartesia.py b/examples/foundational/07d-interruptible-cartesia.py index 8c4a16f02..610fdb5b8 100644 --- a/examples/foundational/07d-interruptible-cartesia.py +++ b/examples/foundational/07d-interruptible-cartesia.py @@ -38,7 +38,6 @@ async def main(room_url: str, token): "Respond bot", DailyParams( audio_out_enabled=True, - audio_out_sample_rate=44100, transcription_enabled=True, vad_enabled=True, vad_analyzer=SileroVADAnalyzer() @@ -47,8 +46,7 @@ async def main(room_url: str, token): tts = CartesiaTTSService( api_key=os.getenv("CARTESIA_API_KEY"), - voice_name="British Lady", - output_format="pcm_44100" + voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Barbershop Man ) llm = OpenAILLMService( diff --git a/examples/foundational/15-switch-voices.py b/examples/foundational/15-switch-voices.py index d7a36e63a..65806e369 100644 --- a/examples/foundational/15-switch-voices.py +++ b/examples/foundational/15-switch-voices.py @@ -66,7 +66,6 @@ async def main(room_url: str, token): "Pipecat", DailyParams( audio_out_enabled=True, - audio_out_sample_rate=44100, transcription_enabled=True, vad_enabled=True, vad_analyzer=SileroVADAnalyzer() @@ -75,20 +74,17 @@ async def main(room_url: str, token): news_lady = CartesiaTTSService( api_key=os.getenv("CARTESIA_API_KEY"), - voice_name="Newslady", - output_format="pcm_44100" + voice_id="bf991597-6c13-47e4-8411-91ec2de5c466", # Newslady ) british_lady = CartesiaTTSService( api_key=os.getenv("CARTESIA_API_KEY"), - voice_name="British Lady", - output_format="pcm_44100" + voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady ) barbershop_man = CartesiaTTSService( api_key=os.getenv("CARTESIA_API_KEY"), - voice_name="Barbershop Man", - output_format="pcm_44100" + voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Barbershop Man ) llm = OpenAILLMService( diff --git a/linux-py3.10-requirements.txt b/linux-py3.10-requirements.txt index 0874c8c1d..878e2d17e 100644 --- a/linux-py3.10-requirements.txt +++ b/linux-py3.10-requirements.txt @@ -44,7 +44,7 @@ blinker==1.8.2 # via flask cachetools==5.3.3 # via google-auth -cartesia==0.1.1 +cartesia==1.0.0 # via pipecat-ai (pyproject.toml) certifi==2024.6.2 # via diff --git a/macos-py3.10-requirements.txt b/macos-py3.10-requirements.txt index fad025c9f..a3f68e728 100644 --- a/macos-py3.10-requirements.txt +++ b/macos-py3.10-requirements.txt @@ -44,7 +44,7 @@ blinker==1.8.2 # via flask cachetools==5.3.3 # via google-auth -cartesia==0.1.1 +cartesia==1.0.0 # via pipecat-ai (pyproject.toml) certifi==2024.6.2 # via @@ -210,7 +210,7 @@ langchain-core==0.2.9 # langchain-community # langchain-openai # langchain-text-splitters -langchain-openai==0.1.9 +langchain-openai==0.1.10 # via pipecat-ai (pyproject.toml) langchain-text-splitters==0.2.1 # via langchain diff --git a/pyproject.toml b/pyproject.toml index 6d21d787b..33534e08a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ Website = "https://pipecat.ai" [project.optional-dependencies] anthropic = [ "anthropic~=0.25.7" ] azure = [ "azure-cognitiveservices-speech~=1.37.0" ] -cartesia = [ "cartesia~=0.1.1" ] +cartesia = [ "cartesia~=1.0.0" ] daily = [ "daily-python~=0.10.1" ] deepgram = [ "deepgram-sdk~=3.2.7" ] examples = [ "python-dotenv~=1.0.0", "flask~=3.0.3", "flask_cors~=4.0.1" ] diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index 95ba2e86a..308ffc7dc 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -16,14 +16,13 @@ from pipecat.frames.frames import ( EndFrame, ErrorFrame, Frame, - LLMFullResponseStartFrame, + LLMFullResponseEndFrame, StartFrame, StartInterruptionFrame, TTSStartedFrame, TTSStoppedFrame, TextFrame, VisionImageRawFrame, - LLMFullResponseEndFrame, ) from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.utils.audio import calculate_audio_volume diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index f81226576..448cd25ad 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -4,7 +4,7 @@ # SPDX-License-Identifier: BSD 2-Clause License # -from cartesia.tts import AsyncCartesiaTTS +from cartesia import AsyncCartesia from typing import AsyncGenerator @@ -20,22 +20,24 @@ class CartesiaTTSService(TTSService): self, *, api_key: str, - voice_name: str, - model_id: str = "upbeat-moon", - output_format: str = "pcm_16000", + voice_id: str, + model_id: str = "sonic-english", + encoding: str = "pcm_s16le", + sample_rate: int = 16000, **kwargs): super().__init__(**kwargs) self._api_key = api_key - self._voice_name = voice_name self._model_id = model_id - self._output_format = output_format + self._output_format = { + "container": "raw", + "encoding": encoding, + "sample_rate": sample_rate, + } try: - self._client = AsyncCartesiaTTS(api_key=self._api_key) - voices = self._client.get_voices() - voice_id = voices[self._voice_name]["id"] - self._voice = self._client.get_voice_embedding(voice_id=voice_id) + self._client = AsyncCartesia(api_key=self._api_key) + self._voice = self._client.voices.get(id=voice_id) except Exception as e: logger.error(f"{self} initialization error: {e}") @@ -48,16 +50,16 @@ class CartesiaTTSService(TTSService): try: await self.start_ttfb_metrics() - chunk_generator = await self._client.generate( + chunk_generator = await self._client.tts.sse( stream=True, transcript=text, - voice=self._voice, + voice_embedding=self._voice["embedding"], model_id=self._model_id, output_format=self._output_format, ) async for chunk in chunk_generator: await self.stop_ttfb_metrics() - yield AudioRawFrame(chunk["audio"], chunk["sampling_rate"], 1) + yield AudioRawFrame(chunk["audio"], self._output_format["sample_rate"], 1) except Exception as e: logger.error(f"{self} exception: {e}")