services(cartesia): upgrade to new cartesia 1.0.0
This commit is contained in:
@@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgraded to Cartesia's new Python library 1.0.0. `CartesiaTTSService` now
|
||||
  expects a `voice_id` instead of a `voice_name` (you can get the voice ID from
|
||||
Cartesia's playground). You can also specify the audio `sample_rate` and
|
||||
`encoding` instead of the previous `output_format`.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue with asynchronous STT services (Deepgram and Azure) that could
|
||||
|
||||
@@ -38,7 +38,6 @@ async def main(room_url: str, token):
|
||||
"Respond bot",
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
audio_out_sample_rate=44100,
|
||||
transcription_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer()
|
||||
@@ -47,8 +46,7 @@ async def main(room_url: str, token):
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_name="British Lady",
|
||||
output_format="pcm_44100"
|
||||
voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Barbershop Man
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
|
||||
@@ -66,7 +66,6 @@ async def main(room_url: str, token):
|
||||
"Pipecat",
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
audio_out_sample_rate=44100,
|
||||
transcription_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer()
|
||||
@@ -75,20 +74,17 @@ async def main(room_url: str, token):
|
||||
|
||||
news_lady = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_name="Newslady",
|
||||
output_format="pcm_44100"
|
||||
voice_id="bf991597-6c13-47e4-8411-91ec2de5c466", # Newslady
|
||||
)
|
||||
|
||||
british_lady = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_name="British Lady",
|
||||
output_format="pcm_44100"
|
||||
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
|
||||
)
|
||||
|
||||
barbershop_man = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_name="Barbershop Man",
|
||||
output_format="pcm_44100"
|
||||
voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Barbershop Man
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
|
||||
@@ -44,7 +44,7 @@ blinker==1.8.2
|
||||
# via flask
|
||||
cachetools==5.3.3
|
||||
# via google-auth
|
||||
cartesia==0.1.1
|
||||
cartesia==1.0.0
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
certifi==2024.6.2
|
||||
# via
|
||||
|
||||
@@ -44,7 +44,7 @@ blinker==1.8.2
|
||||
# via flask
|
||||
cachetools==5.3.3
|
||||
# via google-auth
|
||||
cartesia==0.1.1
|
||||
cartesia==1.0.0
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
certifi==2024.6.2
|
||||
# via
|
||||
@@ -210,7 +210,7 @@ langchain-core==0.2.9
|
||||
# langchain-community
|
||||
# langchain-openai
|
||||
# langchain-text-splitters
|
||||
langchain-openai==0.1.9
|
||||
langchain-openai==0.1.10
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
langchain-text-splitters==0.2.1
|
||||
# via langchain
|
||||
|
||||
@@ -36,7 +36,7 @@ Website = "https://pipecat.ai"
|
||||
[project.optional-dependencies]
|
||||
anthropic = [ "anthropic~=0.25.7" ]
|
||||
azure = [ "azure-cognitiveservices-speech~=1.37.0" ]
|
||||
cartesia = [ "cartesia~=0.1.1" ]
|
||||
cartesia = [ "cartesia~=1.0.0" ]
|
||||
daily = [ "daily-python~=0.10.1" ]
|
||||
deepgram = [ "deepgram-sdk~=3.2.7" ]
|
||||
examples = [ "python-dotenv~=1.0.0", "flask~=3.0.3", "flask_cors~=4.0.1" ]
|
||||
|
||||
@@ -16,14 +16,13 @@ from pipecat.frames.frames import (
|
||||
EndFrame,
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
StartFrame,
|
||||
StartInterruptionFrame,
|
||||
TTSStartedFrame,
|
||||
TTSStoppedFrame,
|
||||
TextFrame,
|
||||
VisionImageRawFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.utils.audio import calculate_audio_volume
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
# SPDX-License-Identifier: BSD-2-Clause
|
||||
#
|
||||
|
||||
from cartesia.tts import AsyncCartesiaTTS
|
||||
from cartesia import AsyncCartesia
|
||||
|
||||
from typing import AsyncGenerator
|
||||
|
||||
@@ -20,22 +20,24 @@ class CartesiaTTSService(TTSService):
|
||||
self,
|
||||
*,
|
||||
api_key: str,
|
||||
voice_name: str,
|
||||
model_id: str = "upbeat-moon",
|
||||
output_format: str = "pcm_16000",
|
||||
voice_id: str,
|
||||
model_id: str = "sonic-english",
|
||||
encoding: str = "pcm_s16le",
|
||||
sample_rate: int = 16000,
|
||||
**kwargs):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self._api_key = api_key
|
||||
self._voice_name = voice_name
|
||||
self._model_id = model_id
|
||||
self._output_format = output_format
|
||||
self._output_format = {
|
||||
"container": "raw",
|
||||
"encoding": encoding,
|
||||
"sample_rate": sample_rate,
|
||||
}
|
||||
|
||||
try:
|
||||
self._client = AsyncCartesiaTTS(api_key=self._api_key)
|
||||
voices = self._client.get_voices()
|
||||
voice_id = voices[self._voice_name]["id"]
|
||||
self._voice = self._client.get_voice_embedding(voice_id=voice_id)
|
||||
self._client = AsyncCartesia(api_key=self._api_key)
|
||||
self._voice = self._client.voices.get(id=voice_id)
|
||||
except Exception as e:
|
||||
logger.error(f"{self} initialization error: {e}")
|
||||
|
||||
@@ -48,16 +50,16 @@ class CartesiaTTSService(TTSService):
|
||||
try:
|
||||
await self.start_ttfb_metrics()
|
||||
|
||||
chunk_generator = await self._client.generate(
|
||||
chunk_generator = await self._client.tts.sse(
|
||||
stream=True,
|
||||
transcript=text,
|
||||
voice=self._voice,
|
||||
voice_embedding=self._voice["embedding"],
|
||||
model_id=self._model_id,
|
||||
output_format=self._output_format,
|
||||
)
|
||||
|
||||
async for chunk in chunk_generator:
|
||||
await self.stop_ttfb_metrics()
|
||||
yield AudioRawFrame(chunk["audio"], chunk["sampling_rate"], 1)
|
||||
yield AudioRawFrame(chunk["audio"], self._output_format["sample_rate"], 1)
|
||||
except Exception as e:
|
||||
logger.error(f"{self} exception: {e}")
|
||||
|
||||
Reference in New Issue
Block a user