services(cartesia): upgrade to new cartesia 1.0.0

2024-06-25 11:49:40 -07:00
parent 84074e90ee
commit 4f38d989f5
8 changed files with 31 additions and 29 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+### Changed
+
+- Upgraded to Cartesia's new Python library 1.0.0. `CartesiaTTSService` now
+  takes a `voice_id` instead of a `voice_name` (you can get the voice ID from
+  Cartesia's playground). The audio format is now set with `sample_rate` and
+  `encoding`, which replace the previous `output_format`.
+
 ### Fixed

 - Fixed an issue with asynchronous STT services (Deepgram and Azure) that could
--- a/examples/foundational/07d-interruptible-cartesia.py
+++ b/examples/foundational/07d-interruptible-cartesia.py
@@ -38,7 +38,6 @@ async def main(room_url: str, token):
        "Respond bot",
        DailyParams(
            audio_out_enabled=True,
-            audio_out_sample_rate=44100,
            transcription_enabled=True,
            vad_enabled=True,
            vad_analyzer=SileroVADAnalyzer()
@@ -47,8 +46,7 @@ async def main(room_url: str, token):

    tts = CartesiaTTSService(
        api_key=os.getenv("CARTESIA_API_KEY"),
-        voice_name="British Lady",
-        output_format="pcm_44100"
+        voice_id="a0e99841-438c-4a64-b679-ae501e7d6091",  # Barbershop Man
    )

    llm = OpenAILLMService(
--- a/examples/foundational/15-switch-voices.py
+++ b/examples/foundational/15-switch-voices.py
@@ -66,7 +66,6 @@ async def main(room_url: str, token):
            "Pipecat",
            DailyParams(
                audio_out_enabled=True,
-                audio_out_sample_rate=44100,
                transcription_enabled=True,
                vad_enabled=True,
                vad_analyzer=SileroVADAnalyzer()
@@ -75,20 +74,17 @@ async def main(room_url: str, token):

        news_lady = CartesiaTTSService(
            api_key=os.getenv("CARTESIA_API_KEY"),
-            voice_name="Newslady",
-            output_format="pcm_44100"
+            voice_id="bf991597-6c13-47e4-8411-91ec2de5c466",  # Newslady
        )

        british_lady = CartesiaTTSService(
            api_key=os.getenv("CARTESIA_API_KEY"),
-            voice_name="British Lady",
-            output_format="pcm_44100"
+            voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22",  # British Lady
        )

        barbershop_man = CartesiaTTSService(
            api_key=os.getenv("CARTESIA_API_KEY"),
-            voice_name="Barbershop Man",
-            output_format="pcm_44100"
+            voice_id="a0e99841-438c-4a64-b679-ae501e7d6091",  # Barbershop Man
        )

        llm = OpenAILLMService(
--- a/linux-py3.10-requirements.txt
+++ b/linux-py3.10-requirements.txt
@@ -44,7 +44,7 @@ blinker==1.8.2
    # via flask
 cachetools==5.3.3
    # via google-auth
-cartesia==0.1.1
+cartesia==1.0.0
    # via pipecat-ai (pyproject.toml)
 certifi==2024.6.2
    # via
--- a/macos-py3.10-requirements.txt
+++ b/macos-py3.10-requirements.txt
@@ -44,7 +44,7 @@ blinker==1.8.2
    # via flask
 cachetools==5.3.3
    # via google-auth
-cartesia==0.1.1
+cartesia==1.0.0
    # via pipecat-ai (pyproject.toml)
 certifi==2024.6.2
    # via
@@ -210,7 +210,7 @@ langchain-core==0.2.9
    #   langchain-community
    #   langchain-openai
    #   langchain-text-splitters
-langchain-openai==0.1.9
+langchain-openai==0.1.10
    # via pipecat-ai (pyproject.toml)
 langchain-text-splitters==0.2.1
    # via langchain
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,7 +36,7 @@ Website = "https://pipecat.ai"
 [project.optional-dependencies]
 anthropic = [ "anthropic~=0.25.7" ]
 azure = [ "azure-cognitiveservices-speech~=1.37.0" ]
-cartesia = [ "cartesia~=0.1.1" ]
+cartesia = [ "cartesia~=1.0.0" ]
 daily = [ "daily-python~=0.10.1" ]
 deepgram = [ "deepgram-sdk~=3.2.7" ]
 examples = [ "python-dotenv~=1.0.0", "flask~=3.0.3", "flask_cors~=4.0.1" ]
--- a/src/pipecat/services/ai_services.py
+++ b/src/pipecat/services/ai_services.py
@@ -16,14 +16,13 @@ from pipecat.frames.frames import (
    EndFrame,
    ErrorFrame,
    Frame,
-    LLMFullResponseStartFrame,
+    LLMFullResponseEndFrame,
    StartFrame,
    StartInterruptionFrame,
    TTSStartedFrame,
    TTSStoppedFrame,
    TextFrame,
    VisionImageRawFrame,
-    LLMFullResponseEndFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 from pipecat.utils.audio import calculate_audio_volume
--- a/src/pipecat/services/cartesia.py
+++ b/src/pipecat/services/cartesia.py
@@ -4,7 +4,7 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #

-from cartesia.tts import AsyncCartesiaTTS
+from cartesia import AsyncCartesia

 from typing import AsyncGenerator

@@ -20,22 +20,24 @@ class CartesiaTTSService(TTSService):
            self,
            *,
            api_key: str,
-            voice_name: str,
-            model_id: str = "upbeat-moon",
-            output_format: str = "pcm_16000",
+            voice_id: str,
+            model_id: str = "sonic-english",
+            encoding: str = "pcm_s16le",
+            sample_rate: int = 16000,
            **kwargs):
        super().__init__(**kwargs)

        self._api_key = api_key
-        self._voice_name = voice_name
        self._model_id = model_id
-        self._output_format = output_format
+        self._output_format = {
+            "container": "raw",
+            "encoding": encoding,
+            "sample_rate": sample_rate,
+        }

        try:
-            self._client = AsyncCartesiaTTS(api_key=self._api_key)
-            voices = self._client.get_voices()
-            voice_id = voices[self._voice_name]["id"]
-            self._voice = self._client.get_voice_embedding(voice_id=voice_id)
+            self._client = AsyncCartesia(api_key=self._api_key)
+            self._voice = self._client.voices.get(id=voice_id)
        except Exception as e:
            logger.error(f"{self} initialization error: {e}")

@@ -48,16 +50,16 @@ class CartesiaTTSService(TTSService):
        try:
            await self.start_ttfb_metrics()

-            chunk_generator = await self._client.generate(
+            chunk_generator = await self._client.tts.sse(
                stream=True,
                transcript=text,
-                voice=self._voice,
+                voice_embedding=self._voice["embedding"],
                model_id=self._model_id,
                output_format=self._output_format,
            )

            async for chunk in chunk_generator:
                await self.stop_ttfb_metrics()
-                yield AudioRawFrame(chunk["audio"], chunk["sampling_rate"], 1)
+                yield AudioRawFrame(chunk["audio"], self._output_format["sample_rate"], 1)
        except Exception as e:
            logger.error(f"{self} exception: {e}")