services(cartesia): upgrade to new cartesia 1.0.0

This commit is contained in:
Aleix Conchillo Flaqué
2024-06-25 11:49:40 -07:00
parent 84074e90ee
commit 4f38d989f5
8 changed files with 31 additions and 29 deletions

View File

@@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
### Changed
- Upgraded to version 1.0.0 of Cartesia's new Python library.
`CartesiaTTSService` now expects a voice ID (`voice_id`) instead of a voice
name (`voice_name`); you can get the voice ID from Cartesia's playground. You
can also specify the audio `sample_rate` and `encoding` instead of the
previous `output_format`.
### Fixed
- Fixed an issue with asynchronous STT services (Deepgram and Azure) that could

View File

@@ -38,7 +38,6 @@ async def main(room_url: str, token):
"Respond bot",
DailyParams(
audio_out_enabled=True,
audio_out_sample_rate=44100,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer()
@@ -47,8 +46,7 @@ async def main(room_url: str, token):
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_name="British Lady",
output_format="pcm_44100"
voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Barbershop Man
)
llm = OpenAILLMService(

View File

@@ -66,7 +66,6 @@ async def main(room_url: str, token):
"Pipecat",
DailyParams(
audio_out_enabled=True,
audio_out_sample_rate=44100,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer()
@@ -75,20 +74,17 @@ async def main(room_url: str, token):
news_lady = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_name="Newslady",
output_format="pcm_44100"
voice_id="bf991597-6c13-47e4-8411-91ec2de5c466", # Newslady
)
british_lady = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_name="British Lady",
output_format="pcm_44100"
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
barbershop_man = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_name="Barbershop Man",
output_format="pcm_44100"
voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Barbershop Man
)
llm = OpenAILLMService(

View File

@@ -44,7 +44,7 @@ blinker==1.8.2
# via flask
cachetools==5.3.3
# via google-auth
cartesia==0.1.1
cartesia==1.0.0
# via pipecat-ai (pyproject.toml)
certifi==2024.6.2
# via

View File

@@ -44,7 +44,7 @@ blinker==1.8.2
# via flask
cachetools==5.3.3
# via google-auth
cartesia==0.1.1
cartesia==1.0.0
# via pipecat-ai (pyproject.toml)
certifi==2024.6.2
# via
@@ -210,7 +210,7 @@ langchain-core==0.2.9
# langchain-community
# langchain-openai
# langchain-text-splitters
langchain-openai==0.1.9
langchain-openai==0.1.10
# via pipecat-ai (pyproject.toml)
langchain-text-splitters==0.2.1
# via langchain

View File

@@ -36,7 +36,7 @@ Website = "https://pipecat.ai"
[project.optional-dependencies]
anthropic = [ "anthropic~=0.25.7" ]
azure = [ "azure-cognitiveservices-speech~=1.37.0" ]
cartesia = [ "cartesia~=0.1.1" ]
cartesia = [ "cartesia~=1.0.0" ]
daily = [ "daily-python~=0.10.1" ]
deepgram = [ "deepgram-sdk~=3.2.7" ]
examples = [ "python-dotenv~=1.0.0", "flask~=3.0.3", "flask_cors~=4.0.1" ]

View File

@@ -16,14 +16,13 @@ from pipecat.frames.frames import (
EndFrame,
ErrorFrame,
Frame,
LLMFullResponseStartFrame,
LLMFullResponseEndFrame,
StartFrame,
StartInterruptionFrame,
TTSStartedFrame,
TTSStoppedFrame,
TextFrame,
VisionImageRawFrame,
LLMFullResponseEndFrame,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.utils.audio import calculate_audio_volume

View File

@@ -4,7 +4,7 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
from cartesia.tts import AsyncCartesiaTTS
from cartesia import AsyncCartesia
from typing import AsyncGenerator
@@ -20,22 +20,24 @@ class CartesiaTTSService(TTSService):
self,
*,
api_key: str,
voice_name: str,
model_id: str = "upbeat-moon",
output_format: str = "pcm_16000",
voice_id: str,
model_id: str = "sonic-english",
encoding: str = "pcm_s16le",
sample_rate: int = 16000,
**kwargs):
super().__init__(**kwargs)
self._api_key = api_key
self._voice_name = voice_name
self._model_id = model_id
self._output_format = output_format
self._output_format = {
"container": "raw",
"encoding": encoding,
"sample_rate": sample_rate,
}
try:
self._client = AsyncCartesiaTTS(api_key=self._api_key)
voices = self._client.get_voices()
voice_id = voices[self._voice_name]["id"]
self._voice = self._client.get_voice_embedding(voice_id=voice_id)
self._client = AsyncCartesia(api_key=self._api_key)
self._voice = self._client.voices.get(id=voice_id)
except Exception as e:
logger.error(f"{self} initialization error: {e}")
@@ -48,16 +50,16 @@ class CartesiaTTSService(TTSService):
try:
await self.start_ttfb_metrics()
chunk_generator = await self._client.generate(
chunk_generator = await self._client.tts.sse(
stream=True,
transcript=text,
voice=self._voice,
voice_embedding=self._voice["embedding"],
model_id=self._model_id,
output_format=self._output_format,
)
async for chunk in chunk_generator:
await self.stop_ttfb_metrics()
yield AudioRawFrame(chunk["audio"], chunk["sampling_rate"], 1)
yield AudioRawFrame(chunk["audio"], self._output_format["sample_rate"], 1)
except Exception as e:
logger.error(f"{self} exception: {e}")