diff --git a/examples/foundational/07c-interruptible-deepgram.py b/examples/foundational/07c-interruptible-deepgram.py
index fc33c246f..f3b4ee246 100644
--- a/examples/foundational/07c-interruptible-deepgram.py
+++ b/examples/foundational/07c-interruptible-deepgram.py
@@ -50,7 +50,7 @@ async def main():
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
- tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-helios-en")
+ tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice_id="aura-helios-en")
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
diff --git a/examples/foundational/07e-interruptible-playht.py b/examples/foundational/07e-interruptible-playht.py
index 9c48df93a..58d85ed79 100644
--- a/examples/foundational/07e-interruptible-playht.py
+++ b/examples/foundational/07e-interruptible-playht.py
@@ -4,11 +4,15 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
-import aiohttp
import asyncio
import os
import sys
+import aiohttp
+from dotenv import load_dotenv
+from loguru import logger
+from runner import configure
+
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -17,17 +21,11 @@ from pipecat.processors.aggregators.llm_response import (
LLMAssistantResponseAggregator,
LLMUserResponseAggregator,
)
-from pipecat.services.playht import PlayHTTTSService
from pipecat.services.openai import OpenAILLMService
+from pipecat.services.playht import PlayHTTTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
-from runner import configure
-
-from loguru import logger
-
-from dotenv import load_dotenv
-
load_dotenv(override=True)
logger.remove(0)
@@ -54,7 +52,7 @@ async def main():
tts = PlayHTTTSService(
user_id=os.getenv("PLAYHT_USER_ID"),
api_key=os.getenv("PLAYHT_API_KEY"),
- voice_url="s3://voice-cloning-zero-shot/801a663f-efd0-4254-98d0-5c175514c3e8/jennifer/manifest.json",
+ voice_id="s3://voice-cloning-zero-shot/801a663f-efd0-4254-98d0-5c175514c3e8/jennifer/manifest.json",
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
diff --git a/examples/foundational/07g-interruptible-openai-tts.py b/examples/foundational/07g-interruptible-openai-tts.py
index 70576c97a..b7671c42f 100644
--- a/examples/foundational/07g-interruptible-openai-tts.py
+++ b/examples/foundational/07g-interruptible-openai-tts.py
@@ -4,11 +4,15 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
-import aiohttp
import asyncio
import os
import sys
+import aiohttp
+from dotenv import load_dotenv
+from loguru import logger
+from runner import configure
+
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -17,17 +21,10 @@ from pipecat.processors.aggregators.llm_response import (
LLMAssistantResponseAggregator,
LLMUserResponseAggregator,
)
-from pipecat.services.openai import OpenAITTSService
-from pipecat.services.openai import OpenAILLMService
+from pipecat.services.openai import OpenAILLMService, OpenAITTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
-from runner import configure
-
-from loguru import logger
-
-from dotenv import load_dotenv
-
load_dotenv(override=True)
logger.remove(0)
@@ -51,7 +48,7 @@ async def main():
),
)
- tts = OpenAITTSService(api_key=os.getenv("OPENAI_API_KEY"), voice="alloy")
+ tts = OpenAITTSService(api_key=os.getenv("OPENAI_API_KEY"), voice_id="alloy")
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
diff --git a/examples/foundational/16-gpu-container-local-bot.py b/examples/foundational/16-gpu-container-local-bot.py
index 06bf45195..b3ae9686e 100644
--- a/examples/foundational/16-gpu-container-local-bot.py
+++ b/examples/foundational/16-gpu-container-local-bot.py
@@ -5,10 +5,14 @@
#
import asyncio
-import aiohttp
import os
import sys
+import aiohttp
+from dotenv import load_dotenv
+from loguru import logger
+from runner import configure
+
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -26,12 +30,6 @@ from pipecat.transports.services.daily import (
)
from pipecat.vad.silero import SileroVADAnalyzer
-from runner import configure
-
-from loguru import logger
-
-from dotenv import load_dotenv
-
load_dotenv(override=True)
logger.remove(0)
@@ -57,7 +55,7 @@ async def main():
tts = DeepgramTTSService(
aiohttp_session=session,
api_key=os.getenv("DEEPGRAM_API_KEY"),
- voice="aura-asteria-en",
+ voice_id="aura-asteria-en",
base_url="http://0.0.0.0:8080/v1/speak",
)
diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py
index 7cb93606d..e2ef78df5 100644
--- a/src/pipecat/frames/frames.py
+++ b/src/pipecat/frames/frames.py
@@ -530,10 +530,24 @@ class UserImageRequestFrame(ControlFrame):
class ServiceUpdateSettingsFrame(ControlFrame):
"""A control frame containing a request to update service settings."""
- service_type: str
settings: Dict[str, Any]
+@dataclass
+class LLMUpdateSettingsFrame(ServiceUpdateSettingsFrame):
+ pass
+
+
+@dataclass
+class TTSUpdateSettingsFrame(ServiceUpdateSettingsFrame):
+ pass
+
+
+@dataclass
+class STTUpdateSettingsFrame(ServiceUpdateSettingsFrame):
+ pass
+
+
@dataclass
class FunctionCallInProgressFrame(SystemFrame):
"""A frame signaling that a function call is in progress."""
diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py
index 6ddc4d426..64d79b582 100644
--- a/src/pipecat/services/ai_services.py
+++ b/src/pipecat/services/ai_services.py
@@ -8,7 +8,7 @@ import asyncio
import io
import wave
from abc import abstractmethod
-from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
from loguru import logger
@@ -19,14 +19,15 @@ from pipecat.frames.frames import (
ErrorFrame,
Frame,
LLMFullResponseEndFrame,
- ServiceUpdateSettingsFrame,
StartFrame,
StartInterruptionFrame,
+ STTUpdateSettingsFrame,
TextFrame,
TTSAudioRawFrame,
TTSSpeakFrame,
TTSStartedFrame,
TTSStoppedFrame,
+ TTSUpdateSettingsFrame,
UserImageRequestFrame,
VisionImageRawFrame,
)
@@ -44,6 +45,7 @@ class AIService(FrameProcessor):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self._model_name: str = ""
+ self._settings: Dict[str, Any] = {}
@property
def model_name(self) -> str:
@@ -62,6 +64,16 @@ class AIService(FrameProcessor):
async def cancel(self, frame: CancelFrame):
pass
+ async def _update_settings(self, settings: Dict[str, Any]):
+ for key, value in settings.items():
+ if key in self._settings:
+ logger.debug(f"Updating setting {key} to: [{value}] for {self.name}")
+ self._settings[key] = value
+ elif key == "model":
+ self.set_model_name(value)
+ else:
+ logger.warning(f"Unknown setting for {self.name} service: {key}")
+
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
@@ -168,6 +180,7 @@ class TTSService(AIService):
self._push_stop_frames: bool = push_stop_frames
self._stop_frame_timeout_s: float = stop_frame_timeout_s
self._sample_rate: int = sample_rate
+ self._voice_id: str = ""
self._settings: Dict[str, Any] = {}
self._stop_frame_task: Optional[asyncio.Task] = None
@@ -184,60 +197,8 @@ class TTSService(AIService):
self.set_model_name(model)
@abstractmethod
- async def set_voice(self, voice: str):
- pass
-
- @abstractmethod
- async def set_language(self, language: Language):
- pass
-
- @abstractmethod
- async def set_speed(self, speed: Union[str, float]):
- pass
-
- @abstractmethod
- async def set_emotion(self, emotion: List[str]):
- pass
-
- @abstractmethod
- async def set_engine(self, engine: str):
- pass
-
- @abstractmethod
- async def set_pitch(self, pitch: str):
- pass
-
- @abstractmethod
- async def set_rate(self, rate: str):
- pass
-
- @abstractmethod
- async def set_volume(self, volume: str):
- pass
-
- @abstractmethod
- async def set_emphasis(self, emphasis: str):
- pass
-
- @abstractmethod
- async def set_style(self, style: str):
- pass
-
- @abstractmethod
- async def set_style_degree(self, style_degree: str):
- pass
-
- @abstractmethod
- async def set_role(self, role: str):
- pass
-
- @abstractmethod
- async def set_gender(self, gender: str):
- pass
-
- @abstractmethod
- async def set_google_style(self, google_style: str):
- pass
+ def set_voice(self, voice: str):
+ self._voice_id = voice
@abstractmethod
async def flush_audio(self):
@@ -269,20 +230,18 @@ class TTSService(AIService):
async def _update_settings(self, settings: Dict[str, Any]):
for key, value in settings.items():
- setter = getattr(self, f"set_{key}", None)
- if setter and callable(setter):
- try:
- if key == "language":
- await setter(Language(value))
- else:
- await setter(value)
- except Exception as e:
- logger.warning(f"Error setting {key}: {e}")
+ if key in self._settings:
+ logger.debug(f"Updating TTS setting {key} to: [{value}]")
+ self._settings[key] = value
+ if key == "language":
+ self._settings[key] = Language(value)
+ elif key == "model":
+ self.set_model_name(value)
+ elif key == "voice":
+ self.set_voice(value)
else:
logger.warning(f"Unknown setting for TTS service: {key}")
- self._settings.update(settings)
-
async def say(self, text: str):
aggregate_sentences = self._aggregate_sentences
self._aggregate_sentences = False
@@ -309,7 +268,7 @@ class TTSService(AIService):
elif isinstance(frame, TTSSpeakFrame):
await self._push_tts_frames(frame.text)
await self.flush_audio()
- elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "tts":
+ elif isinstance(frame, TTSUpdateSettingsFrame):
await self._update_settings(frame.settings)
else:
await self.push_frame(frame, direction)
@@ -448,31 +407,24 @@ class STTService(AIService):
async def set_model(self, model: str):
self.set_model_name(model)
- @abstractmethod
- async def set_language(self, language: Language):
- pass
-
@abstractmethod
async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
"""Returns transcript as a string"""
pass
async def _update_settings(self, settings: Dict[str, Any]):
+ logger.debug(f"Updating STT settings: {self._settings}")
for key, value in settings.items():
- setter = getattr(self, f"set_{key}", None)
- if setter and callable(setter):
- try:
- if key == "language":
- await setter(Language(value))
- else:
- await setter(value)
- except Exception as e:
- logger.warning(f"Error setting {key}: {e}")
+ if key in self._settings:
+ logger.debug(f"Updating STT setting {key} to: [{value}]")
+ self._settings[key] = value
+ if key == "language":
+ self._settings[key] = Language(value)
+ elif key == "model":
+ self.set_model_name(value)
else:
logger.warning(f"Unknown setting for STT service: {key}")
- self._settings.update(settings)
-
async def process_audio_frame(self, frame: AudioRawFrame):
await self.process_generator(self.run_stt(frame.audio))
@@ -484,7 +436,7 @@ class STTService(AIService):
# In this service we accumulate audio internally and at the end we
# push a TextFrame. We don't really want to push audio frames down.
await self.process_audio_frame(frame)
- elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "stt":
+ elif isinstance(frame, STTUpdateSettingsFrame):
await self._update_settings(frame.settings)
else:
await self.push_frame(frame, direction)
diff --git a/src/pipecat/services/anthropic.py b/src/pipecat/services/anthropic.py
index 09d6c5402..1b7064209 100644
--- a/src/pipecat/services/anthropic.py
+++ b/src/pipecat/services/anthropic.py
@@ -25,7 +25,7 @@ from pipecat.frames.frames import (
LLMFullResponseEndFrame,
LLMFullResponseStartFrame,
LLMMessagesFrame,
- ServiceUpdateSettingsFrame,
+ LLMUpdateSettingsFrame,
StartInterruptionFrame,
TextFrame,
UserImageRawFrame,
@@ -96,12 +96,14 @@ class AnthropicLLMService(LLMService):
super().__init__(**kwargs)
self._client = AsyncAnthropic(api_key=api_key)
self.set_model_name(model)
- self._max_tokens = params.max_tokens
- self._enable_prompt_caching_beta: bool = params.enable_prompt_caching_beta or False
- self._temperature = params.temperature
- self._top_k = params.top_k
- self._top_p = params.top_p
- self._extra = params.extra if isinstance(params.extra, dict) else {}
+ self._settings = {
+ "max_tokens": params.max_tokens,
+ "enable_prompt_caching_beta": params.enable_prompt_caching_beta or False,
+ "temperature": params.temperature,
+ "top_k": params.top_k,
+ "top_p": params.top_p,
+ "extra": params.extra if isinstance(params.extra, dict) else {},
+ }
def can_generate_metrics(self) -> bool:
return True
@@ -120,30 +122,6 @@ class AnthropicLLMService(LLMService):
)
return AnthropicContextAggregatorPair(_user=user, _assistant=assistant)
- async def set_enable_prompt_caching_beta(self, enable_prompt_caching_beta: bool):
- logger.debug(f"Switching LLM enable_prompt_caching_beta to: [{enable_prompt_caching_beta}]")
- self._enable_prompt_caching_beta = enable_prompt_caching_beta
-
- async def set_max_tokens(self, max_tokens: int):
- logger.debug(f"Switching LLM max_tokens to: [{max_tokens}]")
- self._max_tokens = max_tokens
-
- async def set_temperature(self, temperature: float):
- logger.debug(f"Switching LLM temperature to: [{temperature}]")
- self._temperature = temperature
-
- async def set_top_k(self, top_k: float):
- logger.debug(f"Switching LLM top_k to: [{top_k}]")
- self._top_k = top_k
-
- async def set_top_p(self, top_p: float):
- logger.debug(f"Switching LLM top_p to: [{top_p}]")
- self._top_p = top_p
-
- async def set_extra(self, extra: Dict[str, Any]):
- logger.debug(f"Switching LLM extra to: [{extra}]")
- self._extra = extra
-
async def _process_context(self, context: OpenAILLMContext):
# Usage tracking. We track the usage reported by Anthropic in prompt_tokens and
# completion_tokens. We also estimate the completion tokens from output text
@@ -165,11 +143,11 @@ class AnthropicLLMService(LLMService):
)
messages = context.messages
- if self._enable_prompt_caching_beta:
+ if self._settings["enable_prompt_caching_beta"]:
messages = context.get_messages_with_cache_control_markers()
api_call = self._client.messages.create
- if self._enable_prompt_caching_beta:
+ if self._settings["enable_prompt_caching_beta"]:
api_call = self._client.beta.prompt_caching.messages.create
await self.start_ttfb_metrics()
@@ -179,14 +157,14 @@ class AnthropicLLMService(LLMService):
"system": context.system,
"messages": messages,
"model": self.model_name,
- "max_tokens": self._max_tokens,
+ "max_tokens": self._settings["max_tokens"],
"stream": True,
- "temperature": self._temperature,
- "top_k": self._top_k,
- "top_p": self._top_p,
+ "temperature": self._settings["temperature"],
+ "top_k": self._settings["top_k"],
+ "top_p": self._settings["top_p"],
}
- params.update(self._extra)
+ params.update(self._settings["extra"])
response = await api_call(**params)
@@ -284,17 +262,6 @@ class AnthropicLLMService(LLMService):
cache_read_input_tokens=cache_read_input_tokens,
)
- async def _update_settings(self, settings: Dict[str, Any]):
- for key, value in settings.items():
- setter = getattr(self, f"set_{key}", None)
- if setter and callable(setter):
- try:
- await setter(value)
- except Exception as e:
- logger.warning(f"Error setting {key}: {e}")
- else:
- logger.warning(f"Unknown setting for Anthropic LLM service: {key}")
-
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
@@ -309,11 +276,11 @@ class AnthropicLLMService(LLMService):
# UserImageRawFrames coming through the pipeline and add them
# to the context.
context = AnthropicLLMContext.from_image_frame(frame)
- elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "llm":
+ elif isinstance(frame, LLMUpdateSettingsFrame):
await self._update_settings(frame.settings)
elif isinstance(frame, LLMEnablePromptCachingFrame):
logger.debug(f"Setting enable prompt caching to: [{frame.enable}]")
- self._enable_prompt_caching_beta = frame.enable
+ self._settings["enable_prompt_caching_beta"] = frame.enable
else:
await self.push_frame(frame, direction)
diff --git a/src/pipecat/services/aws.py b/src/pipecat/services/aws.py
index 80240985f..7004c21f7 100644
--- a/src/pipecat/services/aws.py
+++ b/src/pipecat/services/aws.py
@@ -6,6 +6,7 @@
from typing import AsyncGenerator, Optional
+from loguru import logger
from pydantic import BaseModel
from pipecat.frames.frames import (
@@ -17,8 +18,6 @@ from pipecat.frames.frames import (
)
from pipecat.services.ai_services import TTSService
-from loguru import logger
-
try:
import boto3
from botocore.exceptions import BotoCoreError, ClientError
@@ -57,9 +56,16 @@ class AWSTTSService(TTSService):
aws_secret_access_key=api_key,
region_name=region,
)
- self._voice_id = voice_id
- self._sample_rate = sample_rate
- self._params = params
+ self._settings = {
+ "sample_rate": sample_rate,
+ "engine": params.engine,
+ "language": params.language,
+ "pitch": params.pitch,
+ "rate": params.rate,
+ "volume": params.volume,
+ }
+
+ self.set_voice(voice_id)
def can_generate_metrics(self) -> bool:
return True
@@ -67,18 +73,18 @@ class AWSTTSService(TTSService):
def _construct_ssml(self, text: str) -> str:
ssml = ""
- if self._params.language:
- ssml += f""
+ if self._settings["language"]:
+ ssml += f""
prosody_attrs = []
# Prosody tags are only supported for standard and neural engines
- if self._params.engine != "generative":
- if self._params.rate:
- prosody_attrs.append(f"rate='{self._params.rate}'")
- if self._params.pitch:
- prosody_attrs.append(f"pitch='{self._params.pitch}'")
- if self._params.volume:
- prosody_attrs.append(f"volume='{self._params.volume}'")
+ if self._settings["engine"] != "generative":
+ if self._settings["rate"]:
+ prosody_attrs.append(f"rate='{self._settings["rate"]}'")
+ if self._settings["pitch"]:
+ prosody_attrs.append(f"pitch='{self._settings["pitch"]}'")
+ if self._settings["volume"]:
+ prosody_attrs.append(f"volume='{self._settings["volume"]}'")
if prosody_attrs:
ssml += f""
@@ -90,41 +96,13 @@ class AWSTTSService(TTSService):
if prosody_attrs:
ssml += ""
- if self._params.language:
+ if self._settings["language"]:
ssml += ""
ssml += ""
return ssml
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice_id = voice
-
- async def set_engine(self, engine: str):
- logger.debug(f"Switching TTS engine to: [{engine}]")
- self._params.engine = engine
-
- async def set_language(self, language: str):
- logger.debug(f"Switching TTS language to: [{language}]")
- self._params.language = language
-
- async def set_pitch(self, pitch: str):
- logger.debug(f"Switching TTS pitch to: [{pitch}]")
- self._params.pitch = pitch
-
- async def set_rate(self, rate: str):
- logger.debug(f"Switching TTS rate to: [{rate}]")
- self._params.rate = rate
-
- async def set_volume(self, volume: str):
- logger.debug(f"Switching TTS volume to: [{volume}]")
- self._params.volume = volume
-
- async def set_params(self, params: InputParams):
- logger.debug(f"Switching TTS params to: [{params}]")
- self._params = params
-
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
@@ -139,8 +117,8 @@ class AWSTTSService(TTSService):
"TextType": "ssml",
"OutputFormat": "pcm",
"VoiceId": self._voice_id,
- "Engine": self._params.engine,
- "SampleRate": str(self._sample_rate),
+ "Engine": self._settings["engine"],
+ "SampleRate": str(self._settings["sample_rate"]),
}
# Filter out None values
@@ -160,7 +138,7 @@ class AWSTTSService(TTSService):
chunk = audio_data[i : i + chunk_size]
if len(chunk) > 0:
await self.stop_ttfb_metrics()
- frame = TTSAudioRawFrame(chunk, self._sample_rate, 1)
+ frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
yield frame
await self.push_frame(TTSStoppedFrame())
diff --git a/src/pipecat/services/azure.py b/src/pipecat/services/azure.py
index a1349cefe..1b2b9a3f2 100644
--- a/src/pipecat/services/azure.py
+++ b/src/pipecat/services/azure.py
@@ -4,12 +4,13 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
-import aiohttp
import asyncio
import io
-
from typing import AsyncGenerator, Optional
+import aiohttp
+from loguru import logger
+from PIL import Image
from pydantic import BaseModel
from pipecat.frames.frames import (
@@ -28,10 +29,6 @@ from pipecat.services.ai_services import ImageGenService, STTService, TTSService
from pipecat.services.openai import BaseOpenAILLMService
from pipecat.utils.time import time_now_iso8601
-from PIL import Image
-
-from loguru import logger
-
# See .env.example for Azure configuration needed
try:
from azure.cognitiveservices.speech import (
@@ -89,7 +86,7 @@ class AzureTTSService(TTSService):
*,
api_key: str,
region: str,
- voice="en-US-SaraNeural",
+ voice_id="en-US-SaraNeural",
sample_rate: int = 16000,
params: InputParams = InputParams(),
**kwargs,
@@ -99,114 +96,67 @@ class AzureTTSService(TTSService):
speech_config = SpeechConfig(subscription=api_key, region=region)
self._speech_synthesizer = SpeechSynthesizer(speech_config=speech_config, audio_config=None)
- self._voice = voice
- self._sample_rate = sample_rate
- self._params = params
+ self._settings = {
+ "sample_rate": sample_rate,
+ "emphasis": params.emphasis,
+ "language": params.language,
+ "pitch": params.pitch,
+ "rate": params.rate,
+ "role": params.role,
+ "style": params.style,
+ "style_degree": params.style_degree,
+ "volume": params.volume,
+ }
+
+ self.set_voice(voice_id)
def can_generate_metrics(self) -> bool:
return True
def _construct_ssml(self, text: str) -> str:
ssml = (
- f""
- f""
+ f""
""
)
- if self._params.style:
- ssml += f""
prosody_attrs = []
- if self._params.rate:
- prosody_attrs.append(f"rate='{self._params.rate}'")
- if self._params.pitch:
- prosody_attrs.append(f"pitch='{self._params.pitch}'")
- if self._params.volume:
- prosody_attrs.append(f"volume='{self._params.volume}'")
+ if self._settings["rate"]:
+ prosody_attrs.append(f"rate='{self._settings['rate']}'")
+ if self._settings["pitch"]:
+ prosody_attrs.append(f"pitch='{self._settings['pitch']}'")
+ if self._settings["volume"]:
+ prosody_attrs.append(f"volume='{self._settings['volume']}'")
ssml += f""
- if self._params.emphasis:
- ssml += f""
+ if self._settings["emphasis"]:
+ ssml += f""
ssml += text
- if self._params.emphasis:
+ if self._settings["emphasis"]:
ssml += ""
ssml += ""
- if self._params.style:
+ if self._settings["style"]:
ssml += ""
ssml += ""
return ssml
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice = voice
-
- async def set_emphasis(self, emphasis: str):
- logger.debug(f"Setting TTS emphasis to: [{emphasis}]")
- self._params.emphasis = emphasis
-
- async def set_language(self, language: str):
- logger.debug(f"Setting TTS language code to: [{language}]")
- self._params.language = language
-
- async def set_pitch(self, pitch: str):
- logger.debug(f"Setting TTS pitch to: [{pitch}]")
- self._params.pitch = pitch
-
- async def set_rate(self, rate: str):
- logger.debug(f"Setting TTS rate to: [{rate}]")
- self._params.rate = rate
-
- async def set_role(self, role: str):
- logger.debug(f"Setting TTS role to: [{role}]")
- self._params.role = role
-
- async def set_style(self, style: str):
- logger.debug(f"Setting TTS style to: [{style}]")
- self._params.style = style
-
- async def set_style_degree(self, style_degree: str):
- logger.debug(f"Setting TTS style degree to: [{style_degree}]")
- self._params.style_degree = style_degree
-
- async def set_volume(self, volume: str):
- logger.debug(f"Setting TTS volume to: [{volume}]")
- self._params.volume = volume
-
- async def set_params(self, **kwargs):
- valid_params = {
- "voice": self.set_voice,
- "emphasis": self.set_emphasis,
- "language_code": self.set_language,
- "pitch": self.set_pitch,
- "rate": self.set_rate,
- "role": self.set_role,
- "style": self.set_style,
- "style_degree": self.set_style_degree,
- "volume": self.set_volume,
- }
-
- for param, value in kwargs.items():
- if param in valid_params:
- await valid_params[param](value)
- else:
- logger.warning(f"Ignoring unknown parameter: {param}")
-
- logger.debug(f"Updated TTS parameters: {', '.join(kwargs.keys())}")
-
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
@@ -222,7 +172,9 @@ class AzureTTSService(TTSService):
await self.push_frame(TTSStartedFrame())
# Azure always sends a 44-byte header. Strip it off.
yield TTSAudioRawFrame(
- audio=result.audio_data[44:], sample_rate=self._sample_rate, num_channels=1
+ audio=result.audio_data[44:],
+ sample_rate=self._settings["sample_rate"],
+ num_channels=1,
)
await self.push_frame(TTSStoppedFrame())
elif result.reason == ResultReason.Canceled:
diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py
index 5f798b1e5..0817879c4 100644
--- a/src/pipecat/services/cartesia.py
+++ b/src/pipecat/services/cartesia.py
@@ -4,36 +4,35 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
+import asyncio
+import base64
import json
import uuid
-import base64
-import asyncio
+from typing import AsyncGenerator, List, Optional, Union
-from typing import AsyncGenerator, Optional, Union, List
+from loguru import logger
from pydantic.main import BaseModel
from pipecat.frames.frames import (
CancelFrame,
+ EndFrame,
ErrorFrame,
Frame,
- StartInterruptionFrame,
+ LLMFullResponseEndFrame,
StartFrame,
- EndFrame,
+ StartInterruptionFrame,
TTSAudioRawFrame,
TTSStartedFrame,
TTSStoppedFrame,
- LLMFullResponseEndFrame,
)
from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.ai_services import TTSService, WordTTSService
from pipecat.transcriptions.language import Language
-from pipecat.services.ai_services import WordTTSService, TTSService
-
-from loguru import logger
# See .env.example for Cartesia configuration needed
try:
- from cartesia import AsyncCartesia
import websockets
+ from cartesia import AsyncCartesia
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error(
@@ -66,7 +65,7 @@ class CartesiaTTSService(WordTTSService):
encoding: Optional[str] = "pcm_s16le"
sample_rate: Optional[int] = 16000
container: Optional[str] = "raw"
- language: Optional[str] = "en"
+ language: Optional[Language] = Language.EN
speed: Optional[Union[str, float]] = ""
emotion: Optional[List[str]] = []
@@ -77,7 +76,7 @@ class CartesiaTTSService(WordTTSService):
voice_id: str,
cartesia_version: str = "2024-06-10",
url: str = "wss://api.cartesia.ai/tts/websocket",
- model_id: str = "sonic-english",
+ model: str = "sonic-english",
params: InputParams = InputParams(),
**kwargs,
):
@@ -101,17 +100,18 @@ class CartesiaTTSService(WordTTSService):
self._api_key = api_key
self._cartesia_version = cartesia_version
self._url = url
- self._voice_id = voice_id
- self._model_id = model_id
- self.set_model_name(model_id)
- self._output_format = {
- "container": params.container,
- "encoding": params.encoding,
- "sample_rate": params.sample_rate,
+ self._settings = {
+ "output_format": {
+ "container": params.container,
+ "encoding": params.encoding,
+ "sample_rate": params.sample_rate,
+ },
+ "language": language_to_cartesia_language(params.language) if params.language else None,
+ "speed": params.speed,
+ "emotion": params.emotion,
}
- self._language = params.language
- self._speed = params.speed
- self._emotion = params.emotion
+ self.set_model_name(model)
+ self.set_voice(voice_id)
self._websocket = None
self._context_id = None
@@ -125,42 +125,28 @@ class CartesiaTTSService(WordTTSService):
await super().set_model(model)
logger.debug(f"Switching TTS model to: [{model}]")
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice_id = voice
-
- async def set_speed(self, speed: str):
- logger.debug(f"Switching TTS speed to: [{speed}]")
- self._speed = speed
-
- async def set_emotion(self, emotion: list[str]):
- logger.debug(f"Switching TTS emotion to: [{emotion}]")
- self._emotion = emotion
-
- async def set_language(self, language: Language):
- logger.debug(f"Switching TTS language to: [{language}]")
- self._language = language_to_cartesia_language(language)
-
def _build_msg(
self, text: str = "", continue_transcript: bool = True, add_timestamps: bool = True
):
- voice_config = {"mode": "id", "id": self._voice_id}
+ voice_config = {}
+ voice_config["mode"] = "id"
+ voice_config["id"] = self._voice_id
- if self._speed or self._emotion:
+ if self._settings["speed"] or self._settings["emotion"]:
voice_config["__experimental_controls"] = {}
- if self._speed:
- voice_config["__experimental_controls"]["speed"] = self._speed
- if self._emotion:
- voice_config["__experimental_controls"]["emotion"] = self._emotion
+ if self._settings["speed"]:
+ voice_config["__experimental_controls"]["speed"] = self._settings["speed"]
+ if self._settings["emotion"]:
+ voice_config["__experimental_controls"]["emotion"] = self._settings["emotion"]
msg = {
"transcript": text,
"continue": continue_transcript,
"context_id": self._context_id,
- "model_id": self._model_name,
+ "model_id": self.model_name,
"voice": voice_config,
- "output_format": self._output_format,
- "language": self._language,
+ "output_format": self._settings["output_format"],
+ "language": self._settings["language"],
"add_timestamps": add_timestamps,
}
return json.dumps(msg)
@@ -245,7 +231,7 @@ class CartesiaTTSService(WordTTSService):
self.start_word_timestamps()
frame = TTSAudioRawFrame(
audio=base64.b64decode(msg["data"]),
- sample_rate=self._output_format["sample_rate"],
+ sample_rate=self._settings["output_format"]["sample_rate"],
num_channels=1,
)
await self.push_frame(frame)
@@ -303,7 +289,7 @@ class CartesiaHttpTTSService(TTSService):
*,
api_key: str,
voice_id: str,
- model_id: str = "sonic-english",
+ model: str = "sonic-english",
base_url: str = "https://api.cartesia.ai",
params: InputParams = InputParams(),
**kwargs,
@@ -311,17 +297,18 @@ class CartesiaHttpTTSService(TTSService):
super().__init__(**kwargs)
self._api_key = api_key
- self._voice_id = voice_id
- self._model_id = model_id
- self.set_model_name(model_id)
- self._output_format = {
- "container": params.container,
- "encoding": params.encoding,
- "sample_rate": params.sample_rate,
+ self._settings = {
+ "output_format": {
+ "container": params.container,
+ "encoding": params.encoding,
+ "sample_rate": params.sample_rate,
+ },
+ "language": params.language,
+ "speed": params.speed,
+ "emotion": params.emotion,
}
- self._language = params.language
- self._speed = params.speed
- self._emotion = params.emotion
+ self.set_voice(voice_id)
+ self.set_model_name(model)
self._client = AsyncCartesia(api_key=api_key, base_url=base_url)
@@ -333,22 +320,6 @@ class CartesiaHttpTTSService(TTSService):
self._model_id = model
await super().set_model(model)
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice_id = voice
-
- async def set_speed(self, speed: str):
- logger.debug(f"Switching TTS speed to: [{speed}]")
- self._speed = speed
-
- async def set_emotion(self, emotion: list[str]):
- logger.debug(f"Switching TTS emotion to: [{emotion}]")
- self._emotion = emotion
-
- async def set_language(self, language: Language):
- logger.debug(f"Switching TTS language to: [{language}]")
- self._language = language_to_cartesia_language(language)
-
async def stop(self, frame: EndFrame):
await super().stop(frame)
await self._client.close()
@@ -365,19 +336,19 @@ class CartesiaHttpTTSService(TTSService):
try:
voice_controls = None
- if self._speed or self._emotion:
+ if self._settings["speed"] or self._settings["emotion"]:
voice_controls = {}
- if self._speed:
- voice_controls["speed"] = self._speed
- if self._emotion:
- voice_controls["emotion"] = self._emotion
+ if self._settings["speed"]:
+ voice_controls["speed"] = self._settings["speed"]
+ if self._settings["emotion"]:
+ voice_controls["emotion"] = self._settings["emotion"]
output = await self._client.tts.sse(
model_id=self._model_id,
transcript=text,
voice_id=self._voice_id,
- output_format=self._output_format,
- language=self._language,
+ output_format=self._settings["output_format"],
+ language=self._settings["language"],
stream=False,
_experimental_voice_controls=voice_controls,
)
@@ -386,7 +357,7 @@ class CartesiaHttpTTSService(TTSService):
frame = TTSAudioRawFrame(
audio=output["audio"],
- sample_rate=self._output_format["sample_rate"],
+ sample_rate=self._settings["output_format"]["sample_rate"],
num_channels=1,
)
yield frame
diff --git a/src/pipecat/services/deepgram.py b/src/pipecat/services/deepgram.py
index d109cce3c..40fe0168d 100644
--- a/src/pipecat/services/deepgram.py
+++ b/src/pipecat/services/deepgram.py
@@ -5,9 +5,10 @@
#
import asyncio
-
from typing import AsyncGenerator
+from loguru import logger
+
from pipecat.frames.frames import (
CancelFrame,
EndFrame,
@@ -24,8 +25,6 @@ from pipecat.services.ai_services import STTService, TTSService
from pipecat.transcriptions.language import Language
from pipecat.utils.time import time_now_iso8601
-from loguru import logger
-
# See .env.example for Deepgram configuration needed
try:
from deepgram import (
@@ -50,32 +49,30 @@ class DeepgramTTSService(TTSService):
self,
*,
api_key: str,
- voice: str = "aura-helios-en",
+ voice_id: str = "aura-helios-en",
sample_rate: int = 16000,
encoding: str = "linear16",
**kwargs,
):
super().__init__(**kwargs)
- self._voice = voice
- self._sample_rate = sample_rate
- self._encoding = encoding
+ self._settings = {
+ "sample_rate": sample_rate,
+ "encoding": encoding,
+ }
+ self.set_voice(voice_id)
self._deepgram_client = DeepgramClient(api_key=api_key)
def can_generate_metrics(self) -> bool:
return True
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice = voice
-
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
options = SpeakOptions(
- model=self._voice,
- encoding=self._encoding,
- sample_rate=self._sample_rate,
+ model=self._voice_id,
+ encoding=self._settings["encoding"],
+ sample_rate=self._settings["sample_rate"],
container="none",
)
@@ -103,7 +100,9 @@ class DeepgramTTSService(TTSService):
chunk = audio_buffer.read(chunk_size)
if not chunk:
break
- frame = TTSAudioRawFrame(audio=chunk, sample_rate=self._sample_rate, num_channels=1)
+ frame = TTSAudioRawFrame(
+ audio=chunk, sample_rate=self._settings["sample_rate"], num_channels=1
+ )
yield frame
await self.push_frame(TTSStoppedFrame())
@@ -135,7 +134,7 @@ class DeepgramSTTService(STTService):
):
super().__init__(**kwargs)
- self._live_options = live_options
+ self._settings = vars(live_options)
self._client = DeepgramClient(
api_key, config=DeepgramClientOptions(url=url, options={"keepalive": "true"})
@@ -147,7 +146,7 @@ class DeepgramSTTService(STTService):
@property
def vad_enabled(self):
- return self._live_options.vad_events
+ return self._settings["vad_events"]
def can_generate_metrics(self) -> bool:
return self.vad_enabled
@@ -155,13 +154,7 @@ class DeepgramSTTService(STTService):
async def set_model(self, model: str):
await super().set_model(model)
logger.debug(f"Switching STT model to: [{model}]")
- self._live_options.model = model
- await self._disconnect()
- await self._connect()
-
- async def set_language(self, language: Language):
- logger.debug(f"Switching STT language to: [{language}]")
- self._live_options.language = language
+ self._settings["model"] = model
await self._disconnect()
await self._connect()
@@ -182,7 +175,7 @@ class DeepgramSTTService(STTService):
yield None
async def _connect(self):
- if await self._connection.start(self._live_options):
+ if await self._connection.start(self._settings):
logger.debug(f"{self}: Connected to Deepgram")
else:
logger.error(f"{self}: Unable to connect to Deepgram")
diff --git a/src/pipecat/services/elevenlabs.py b/src/pipecat/services/elevenlabs.py
index 611f2a024..871b3eec6 100644
--- a/src/pipecat/services/elevenlabs.py
+++ b/src/pipecat/services/elevenlabs.py
@@ -24,6 +24,7 @@ from pipecat.frames.frames import (
)
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.ai_services import WordTTSService
+from pipecat.transcriptions.language import Language
# See .env.example for ElevenLabs configuration needed
try:
@@ -72,7 +73,7 @@ def calculate_word_times(
class ElevenLabsTTSService(WordTTSService):
class InputParams(BaseModel):
- language: Optional[str] = None
+ language: Optional[Language] = Language.EN
output_format: Literal["pcm_16000", "pcm_22050", "pcm_24000", "pcm_44100"] = "pcm_16000"
optimize_streaming_latency: Optional[str] = None
stability: Optional[float] = None
@@ -124,10 +125,19 @@ class ElevenLabsTTSService(WordTTSService):
)
self._api_key = api_key
- self._voice_id = voice_id
- self.set_model_name(model)
self._url = url
- self._params = params
+ self._settings = {
+ "sample_rate": sample_rate_from_output_format(params.output_format),
+ "language": params.language,
+ "output_format": params.output_format,
+ "optimize_streaming_latency": params.optimize_streaming_latency,
+ "stability": params.stability,
+ "similarity_boost": params.similarity_boost,
+ "style": params.style,
+ "use_speaker_boost": params.use_speaker_boost,
+ }
+ self.set_model_name(model)
+ self.set_voice(voice_id)
self._voice_settings = self._set_voice_settings()
# Websocket connection to ElevenLabs.
@@ -142,19 +152,22 @@ class ElevenLabsTTSService(WordTTSService):
def _set_voice_settings(self):
voice_settings = {}
- if self._params.stability is not None and self._params.similarity_boost is not None:
- voice_settings["stability"] = self._params.stability
- voice_settings["similarity_boost"] = self._params.similarity_boost
- if self._params.style is not None:
- voice_settings["style"] = self._params.style
- if self._params.use_speaker_boost is not None:
- voice_settings["use_speaker_boost"] = self._params.use_speaker_boost
+ if (
+ self._settings["stability"] is not None
+ and self._settings["similarity_boost"] is not None
+ ):
+ voice_settings["stability"] = self._settings["stability"]
+ voice_settings["similarity_boost"] = self._settings["similarity_boost"]
+ if self._settings["style"] is not None:
+ voice_settings["style"] = self._settings["style"]
+ if self._settings["use_speaker_boost"] is not None:
+ voice_settings["use_speaker_boost"] = self._settings["use_speaker_boost"]
else:
- if self._params.style is not None:
+ if self._settings["style"] is not None:
logger.warning(
"'style' is set but will not be applied because 'stability' and 'similarity_boost' are not both set."
)
- if self._params.use_speaker_boost is not None:
+ if self._settings["use_speaker_boost"] is not None:
logger.warning(
"'use_speaker_boost' is set but will not be applied because 'stability' and 'similarity_boost' are not both set."
)
@@ -167,33 +180,13 @@ class ElevenLabsTTSService(WordTTSService):
await self._disconnect()
await self._connect()
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice_id = voice
- await self._disconnect()
- await self._connect()
-
- async def set_voice_settings(
- self,
- stability: Optional[float] = None,
- similarity_boost: Optional[float] = None,
- style: Optional[float] = None,
- use_speaker_boost: Optional[bool] = None,
- ):
- self._params.stability = stability if stability is not None else self._params.stability
- self._params.similarity_boost = (
- similarity_boost if similarity_boost is not None else self._params.similarity_boost
- )
- self._params.style = style if style is not None else self._params.style
- self._params.use_speaker_boost = (
- use_speaker_boost if use_speaker_boost is not None else self._params.use_speaker_boost
- )
-
- self._set_voice_settings()
-
- if self._websocket:
- msg = {"voice_settings": self._voice_settings}
- await self._websocket.send(json.dumps(msg))
+ async def _update_settings(self, settings: Dict[str, Any]):
+ prev_voice = self._voice_id
+ await super()._update_settings(settings)
+ if not prev_voice == self._voice_id:
+ await self._disconnect()
+ await self._connect()
+ logger.debug(f"Switching TTS voice to: [{self._voice_id}]")
async def start(self, frame: StartFrame):
await super().start(frame)
@@ -223,19 +216,19 @@ class ElevenLabsTTSService(WordTTSService):
try:
voice_id = self._voice_id
model = self.model_name
- output_format = self._params.output_format
+ output_format = self._settings["output_format"]
url = f"{self._url}/v1/text-to-speech/{voice_id}/stream-input?model_id={model}&output_format={output_format}"
- if self._params.optimize_streaming_latency:
- url += f"&optimize_streaming_latency={self._params.optimize_streaming_latency}"
+ if self._settings["optimize_streaming_latency"]:
+ url += f"&optimize_streaming_latency={self._settings["optimize_streaming_latency"]}"
# language can only be used with the 'eleven_turbo_v2_5' model
- if self._params.language:
+ if self._settings["language"]:
if model == "eleven_turbo_v2_5":
- url += f"&language_code={self._params.language}"
+ url += f"&language_code={self._settings["language"]}"
else:
logger.debug(
- f"Language code [{self._params.language}] not applied. Language codes can only be used with the 'eleven_turbo_v2_5' model."
+ f"Language code [{self._settings["language"]}] not applied. Language codes can only be used with the 'eleven_turbo_v2_5' model."
)
self._websocket = await websockets.connect(url)
@@ -286,7 +279,7 @@ class ElevenLabsTTSService(WordTTSService):
self.start_word_timestamps()
audio = base64.b64decode(msg["audio"])
- frame = TTSAudioRawFrame(audio, self.sample_rate, 1)
+ frame = TTSAudioRawFrame(audio, self._settings["sample_rate"], 1)
await self.push_frame(frame)
if msg.get("alignment"):
diff --git a/src/pipecat/services/gladia.py b/src/pipecat/services/gladia.py
index a590d73cf..16f3dab97 100644
--- a/src/pipecat/services/gladia.py
+++ b/src/pipecat/services/gladia.py
@@ -6,8 +6,9 @@
import base64
import json
-
from typing import AsyncGenerator, Optional
+
+from loguru import logger
from pydantic.main import BaseModel
from pipecat.frames.frames import (
@@ -21,8 +22,6 @@ from pipecat.frames.frames import (
from pipecat.services.ai_services import STTService
from pipecat.utils.time import time_now_iso8601
-from loguru import logger
-
# See .env.example for Gladia configuration needed
try:
import websockets
@@ -55,7 +54,13 @@ class GladiaSTTService(STTService):
self._api_key = api_key
self._url = url
- self._params = params
+ self._settings = {
+ "sample_rate": params.sample_rate,
+ "language": params.language,
+ "transcription_hint": params.transcription_hint,
+ "endpointing": params.endpointing,
+ "prosody": params.prosody,
+ }
self._confidence = confidence
async def start(self, frame: StartFrame):
@@ -84,7 +89,11 @@ class GladiaSTTService(STTService):
"encoding": "WAV/PCM",
"model_type": "fast",
"language_behaviour": "manual",
- **self._params.model_dump(exclude_none=True),
+ "sample_rate": self._settings["sample_rate"],
+ "language": self._settings["language"],
+ "transcription_hint": self._settings["transcription_hint"],
+ "endpointing": self._settings["endpointing"],
+ "prosody": self._settings["prosody"],
}
await self._websocket.send(json.dumps(configuration))
diff --git a/src/pipecat/services/google.py b/src/pipecat/services/google.py
index 092b4703e..05fff2056 100644
--- a/src/pipecat/services/google.py
+++ b/src/pipecat/services/google.py
@@ -6,7 +6,7 @@
import asyncio
import json
-from typing import Any, AsyncGenerator, Dict, List, Literal, Optional
+from typing import AsyncGenerator, List, Literal, Optional
from loguru import logger
from pydantic import BaseModel
@@ -17,7 +17,7 @@ from pipecat.frames.frames import (
LLMFullResponseEndFrame,
LLMFullResponseStartFrame,
LLMMessagesFrame,
- ServiceUpdateSettingsFrame,
+ LLMUpdateSettingsFrame,
TextFrame,
TTSAudioRawFrame,
TTSStartedFrame,
@@ -64,21 +64,6 @@ class GoogleLLMService(LLMService):
self.set_model_name(model)
self._client = gai.GenerativeModel(model)
- async def set_model(self, model: str):
- logger.debug(f"Switching LLM model to: [{model}]")
- self._create_client(model)
-
- async def _update_settings(self, settings: Dict[str, Any]):
- for key, value in settings.items():
- setter = getattr(self, f"set_{key}", None)
- if setter and callable(setter):
- try:
- await setter(value)
- except Exception as e:
- logger.warning(f"Error setting {key}: {e}")
- else:
- logger.warning(f"Unknown setting for Google LLM service: {key}")
-
def _get_messages_from_openai_context(self, context: OpenAILLMContext) -> List[glm.Content]:
openai_messages = context.get_messages()
google_messages = []
@@ -151,7 +136,7 @@ class GoogleLLMService(LLMService):
context = OpenAILLMContext.from_messages(frame.messages)
elif isinstance(frame, VisionImageRawFrame):
context = OpenAILLMContext.from_image_frame(frame)
- elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "llm":
+ elif isinstance(frame, LLMUpdateSettingsFrame):
await self._update_settings(frame.settings)
else:
await self.push_frame(frame, direction)
@@ -182,8 +167,17 @@ class GoogleTTSService(TTSService):
):
super().__init__(sample_rate=sample_rate, **kwargs)
- self._voice_id: str = voice_id
- self._params = params
+ self._settings = {
+ "sample_rate": sample_rate,
+ "pitch": params.pitch,
+ "rate": params.rate,
+ "volume": params.volume,
+ "emphasis": params.emphasis,
+ "language": params.language,
+ "gender": params.gender,
+ "google_style": params.google_style,
+ }
+ self.set_voice(voice_id)
self._client: texttospeech_v1.TextToSpeechAsyncClient = self._create_client(
credentials, credentials_path
)
@@ -216,38 +210,38 @@ class GoogleTTSService(TTSService):
# Voice tag
voice_attrs = [f"name='{self._voice_id}'"]
- if self._params.language:
- voice_attrs.append(f"language='{self._params.language}'")
- if self._params.gender:
- voice_attrs.append(f"gender='{self._params.gender}'")
+ if self._settings["language"]:
+ voice_attrs.append(f"language='{self._settings['language']}'")
+ if self._settings["gender"]:
+ voice_attrs.append(f"gender='{self._settings['gender']}'")
ssml += f""
# Prosody tag
prosody_attrs = []
- if self._params.pitch:
- prosody_attrs.append(f"pitch='{self._params.pitch}'")
- if self._params.rate:
- prosody_attrs.append(f"rate='{self._params.rate}'")
- if self._params.volume:
- prosody_attrs.append(f"volume='{self._params.volume}'")
+ if self._settings["pitch"]:
+ prosody_attrs.append(f"pitch='{self._settings['pitch']}'")
+ if self._settings["rate"]:
+ prosody_attrs.append(f"rate='{self._settings['rate']}'")
+ if self._settings["volume"]:
+ prosody_attrs.append(f"volume='{self._settings['volume']}'")
if prosody_attrs:
ssml += f""
# Emphasis tag
- if self._params.emphasis:
- ssml += f""
+ if self._settings["emphasis"]:
+ ssml += f""
# Google style tag
- if self._params.google_style:
- ssml += f""
+ if self._settings["google_style"]:
+ ssml += f""
ssml += text
# Close tags
- if self._params.google_style:
+ if self._settings["google_style"]:
ssml += ""
- if self._params.emphasis:
+ if self._settings["emphasis"]:
ssml += ""
if prosody_attrs:
ssml += ""
@@ -255,46 +249,6 @@ class GoogleTTSService(TTSService):
return ssml
- async def set_voice(self, voice: str) -> None:
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice_id = voice
-
- async def set_language(self, language: str) -> None:
- logger.debug(f"Switching TTS language to: [{language}]")
- self._params.language = language
-
- async def set_pitch(self, pitch: str) -> None:
- logger.debug(f"Switching TTS pitch to: [{pitch}]")
- self._params.pitch = pitch
-
- async def set_rate(self, rate: str) -> None:
- logger.debug(f"Switching TTS rate to: [{rate}]")
- self._params.rate = rate
-
- async def set_volume(self, volume: str) -> None:
- logger.debug(f"Switching TTS volume to: [{volume}]")
- self._params.volume = volume
-
- async def set_emphasis(
- self, emphasis: Literal["strong", "moderate", "reduced", "none"]
- ) -> None:
- logger.debug(f"Switching TTS emphasis to: [{emphasis}]")
- self._params.emphasis = emphasis
-
- async def set_gender(self, gender: Literal["male", "female", "neutral"]) -> None:
- logger.debug(f"Switch TTS gender to [{gender}]")
- self._params.gender = gender
-
- async def set_google_style(
- self, google_style: Literal["apologetic", "calm", "empathetic", "firm", "lively"]
- ) -> None:
- logger.debug(f"Switching TTS google style to: [{google_style}]")
- self._params.google_style = google_style
-
- async def set_params(self, params: InputParams) -> None:
- logger.debug(f"Switching TTS params to: [{params}]")
- self._params = params
-
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
@@ -304,11 +258,11 @@ class GoogleTTSService(TTSService):
ssml = self._construct_ssml(text)
synthesis_input = texttospeech_v1.SynthesisInput(ssml=ssml)
voice = texttospeech_v1.VoiceSelectionParams(
- language_code=self._params.language, name=self._voice_id
+ language_code=self._settings["language"], name=self._voice_id
)
audio_config = texttospeech_v1.AudioConfig(
audio_encoding=texttospeech_v1.AudioEncoding.LINEAR16,
- sample_rate_hertz=self.sample_rate,
+ sample_rate_hertz=self._settings["sample_rate"],
)
request = texttospeech_v1.SynthesizeSpeechRequest(
@@ -331,7 +285,7 @@ class GoogleTTSService(TTSService):
if not chunk:
break
await self.stop_ttfb_metrics()
- frame = TTSAudioRawFrame(chunk, self.sample_rate, 1)
+ frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
yield frame
await asyncio.sleep(0) # Allow other tasks to run
diff --git a/src/pipecat/services/lmnt.py b/src/pipecat/services/lmnt.py
index 8f18002c5..c828e7a7a 100644
--- a/src/pipecat/services/lmnt.py
+++ b/src/pipecat/services/lmnt.py
@@ -5,10 +5,10 @@
#
import asyncio
-
from typing import AsyncGenerator
-from pipecat.processors.frame_processor import FrameDirection
+from loguru import logger
+
from pipecat.frames.frames import (
CancelFrame,
EndFrame,
@@ -20,10 +20,9 @@ from pipecat.frames.frames import (
TTSStartedFrame,
TTSStoppedFrame,
)
+from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.ai_services import TTSService
-from loguru import logger
-
# See .env.example for LMNT configuration needed
try:
from lmnt.api import Speech
@@ -50,13 +49,16 @@ class LmntTTSService(TTSService):
super().__init__(push_stop_frames=True, sample_rate=sample_rate, **kwargs)
self._api_key = api_key
- self._voice_id = voice_id
- self._output_format = {
- "container": "raw",
- "encoding": "pcm_s16le",
- "sample_rate": sample_rate,
+ self._settings = {
+ "output_format": {
+ "container": "raw",
+ "encoding": "pcm_s16le",
+ "sample_rate": sample_rate,
+ },
+ "language": language,
}
- self._language = language
+
+ self.set_voice(voice_id)
self._speech = None
self._connection = None
@@ -68,10 +70,6 @@ class LmntTTSService(TTSService):
def can_generate_metrics(self) -> bool:
return True
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice_id = voice
-
async def start(self, frame: StartFrame):
await super().start(frame)
await self._connect()
@@ -93,7 +91,9 @@ class LmntTTSService(TTSService):
try:
self._speech = Speech()
self._connection = await self._speech.synthesize_streaming(
- self._voice_id, format="raw", sample_rate=self._output_format["sample_rate"]
+ self._voice_id,
+ format="raw",
+ sample_rate=self._settings["output_format"]["sample_rate"],
)
self._receive_task = self.get_event_loop().create_task(self._receive_task_handler())
except Exception as e:
@@ -130,7 +130,7 @@ class LmntTTSService(TTSService):
await self.stop_ttfb_metrics()
frame = TTSAudioRawFrame(
audio=msg["audio"],
- sample_rate=self._output_format["sample_rate"],
+ sample_rate=self._settings["output_format"]["sample_rate"],
num_channels=1,
)
await self.push_frame(frame)
diff --git a/src/pipecat/services/openai.py b/src/pipecat/services/openai.py
index 57c1b3399..68b7fa4de 100644
--- a/src/pipecat/services/openai.py
+++ b/src/pipecat/services/openai.py
@@ -24,7 +24,7 @@ from pipecat.frames.frames import (
LLMFullResponseEndFrame,
LLMFullResponseStartFrame,
LLMMessagesFrame,
- ServiceUpdateSettingsFrame,
+ LLMUpdateSettingsFrame,
StartInterruptionFrame,
TextFrame,
TTSAudioRawFrame,
@@ -111,14 +111,16 @@ class BaseOpenAILLMService(LLMService):
**kwargs,
):
super().__init__(**kwargs)
+ self._settings = {
+ "frequency_penalty": params.frequency_penalty,
+ "presence_penalty": params.presence_penalty,
+ "seed": params.seed,
+ "temperature": params.temperature,
+ "top_p": params.top_p,
+ "extra": params.extra if isinstance(params.extra, dict) else {},
+ }
self.set_model_name(model)
self._client = self.create_client(api_key=api_key, base_url=base_url, **kwargs)
- self._frequency_penalty = params.frequency_penalty
- self._presence_penalty = params.presence_penalty
- self._seed = params.seed
- self._temperature = params.temperature
- self._top_p = params.top_p
- self._extra = params.extra if isinstance(params.extra, dict) else {}
def create_client(self, api_key=None, base_url=None, **kwargs):
return AsyncOpenAI(
@@ -134,30 +136,6 @@ class BaseOpenAILLMService(LLMService):
def can_generate_metrics(self) -> bool:
return True
- async def set_frequency_penalty(self, frequency_penalty: float):
- logger.debug(f"Switching LLM frequency_penalty to: [{frequency_penalty}]")
- self._frequency_penalty = frequency_penalty
-
- async def set_presence_penalty(self, presence_penalty: float):
- logger.debug(f"Switching LLM presence_penalty to: [{presence_penalty}]")
- self._presence_penalty = presence_penalty
-
- async def set_seed(self, seed: int):
- logger.debug(f"Switching LLM seed to: [{seed}]")
- self._seed = seed
-
- async def set_temperature(self, temperature: float):
- logger.debug(f"Switching LLM temperature to: [{temperature}]")
- self._temperature = temperature
-
- async def set_top_p(self, top_p: float):
- logger.debug(f"Switching LLM top_p to: [{top_p}]")
- self._top_p = top_p
-
- async def set_extra(self, extra: Dict[str, Any]):
- logger.debug(f"Switching LLM extra to: [{extra}]")
- self._extra = extra
-
async def get_chat_completions(
self, context: OpenAILLMContext, messages: List[ChatCompletionMessageParam]
) -> AsyncStream[ChatCompletionChunk]:
@@ -168,14 +146,14 @@ class BaseOpenAILLMService(LLMService):
"tools": context.tools,
"tool_choice": context.tool_choice,
"stream_options": {"include_usage": True},
- "frequency_penalty": self._frequency_penalty,
- "presence_penalty": self._presence_penalty,
- "seed": self._seed,
- "temperature": self._temperature,
- "top_p": self._top_p,
+ "frequency_penalty": self._settings["frequency_penalty"],
+ "presence_penalty": self._settings["presence_penalty"],
+ "seed": self._settings["seed"],
+ "temperature": self._settings["temperature"],
+ "top_p": self._settings["top_p"],
}
- params.update(self._extra)
+ params.update(self._settings["extra"])
chunks = await self._client.chat.completions.create(**params)
return chunks
@@ -295,17 +273,6 @@ class BaseOpenAILLMService(LLMService):
f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function."
)
- async def _update_settings(self, settings: Dict[str, Any]):
- for key, value in settings.items():
- setter = getattr(self, f"set_{key}", None)
- if setter and callable(setter):
- try:
- await setter(value)
- except Exception as e:
- logger.warning(f"Error setting {key}: {e}")
- else:
- logger.warning(f"Unknown setting for OpenAI LLM service: {key}")
-
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
@@ -316,7 +283,7 @@ class BaseOpenAILLMService(LLMService):
context = OpenAILLMContext.from_messages(frame.messages)
elif isinstance(frame, VisionImageRawFrame):
context = OpenAILLMContext.from_image_frame(frame)
- elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "llm":
+ elif isinstance(frame, LLMUpdateSettingsFrame):
await self._update_settings(frame.settings)
else:
await self.push_frame(frame, direction)
@@ -414,29 +381,27 @@ class OpenAITTSService(TTSService):
self,
*,
api_key: str | None = None,
- voice: str = "alloy",
+ voice_id: str = "alloy",
model: Literal["tts-1", "tts-1-hd"] = "tts-1",
sample_rate: int = 24000,
**kwargs,
):
super().__init__(sample_rate=sample_rate, **kwargs)
- self._voice: ValidVoice = VALID_VOICES.get(voice, "alloy")
+ self._settings = {
+ "sample_rate": sample_rate,
+ }
self.set_model_name(model)
- self._sample_rate = sample_rate
+ self.set_voice(voice_id)
self._client = AsyncOpenAI(api_key=api_key)
def can_generate_metrics(self) -> bool:
return True
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice = VALID_VOICES.get(voice, self._voice)
-
async def set_model(self, model: str):
logger.debug(f"Switching TTS model to: [{model}]")
- self._model = model
+ self.set_model_name(model)
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
@@ -446,7 +411,7 @@ class OpenAITTSService(TTSService):
async with self._client.audio.speech.with_streaming_response.create(
input=text,
model=self.model_name,
- voice=self._voice,
+ voice=VALID_VOICES[self._voice_id],
response_format="pcm",
) as r:
if r.status_code != 200:
@@ -465,7 +430,7 @@ class OpenAITTSService(TTSService):
async for chunk in r.iter_bytes(8192):
if len(chunk) > 0:
await self.stop_ttfb_metrics()
- frame = TTSAudioRawFrame(chunk, self.sample_rate, 1)
+ frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
yield frame
await self.push_frame(TTSStoppedFrame())
except BadRequestError as e:
diff --git a/src/pipecat/services/playht.py b/src/pipecat/services/playht.py
index 2ffa3a419..aea5d8d92 100644
--- a/src/pipecat/services/playht.py
+++ b/src/pipecat/services/playht.py
@@ -6,17 +6,21 @@
import io
import struct
-
from typing import AsyncGenerator
-from pipecat.frames.frames import Frame, TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame
-from pipecat.services.ai_services import TTSService
-
from loguru import logger
+from pipecat.frames.frames import (
+ Frame,
+ TTSAudioRawFrame,
+ TTSStartedFrame,
+ TTSStoppedFrame,
+)
+from pipecat.services.ai_services import TTSService
+
try:
- from pyht.client import TTSOptions
from pyht.async_client import AsyncClient
+ from pyht.client import TTSOptions
from pyht.protos.api_pb2 import Format
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
@@ -28,7 +32,7 @@ except ModuleNotFoundError as e:
class PlayHTTTSService(TTSService):
def __init__(
- self, *, api_key: str, user_id: str, voice_url: str, sample_rate: int = 16000, **kwargs
+ self, *, api_key: str, user_id: str, voice_id: str, sample_rate: int = 16000, **kwargs
):
super().__init__(sample_rate=sample_rate, **kwargs)
@@ -39,17 +43,23 @@ class PlayHTTTSService(TTSService):
user_id=self._user_id,
api_key=self._speech_key,
)
+ self._settings = {
+ "sample_rate": sample_rate,
+ "quality": "higher",
+ "format": Format.FORMAT_WAV,
+ "voice_engine": "PlayHT2.0-turbo",
+ }
+ self.set_voice(voice_id)
self._options = TTSOptions(
- voice=voice_url, sample_rate=sample_rate, quality="higher", format=Format.FORMAT_WAV
+ voice=self._voice_id,
+ sample_rate=self._settings["sample_rate"],
+ quality=self._settings["quality"],
+ format=self._settings["format"],
)
def can_generate_metrics(self) -> bool:
return True
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._options.voice = voice
-
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
@@ -60,7 +70,7 @@ class PlayHTTTSService(TTSService):
await self.start_ttfb_metrics()
playht_gen = self._client.tts(
- text, voice_engine="PlayHT2.0-turbo", options=self._options
+ text, voice_engine=self._settings["voice_engine"], options=self._options
)
await self.start_tts_usage_metrics(text)
@@ -83,7 +93,7 @@ class PlayHTTTSService(TTSService):
else:
if len(chunk):
await self.stop_ttfb_metrics()
- frame = TTSAudioRawFrame(chunk, 16000, 1)
+ frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
yield frame
await self.push_frame(TTSStoppedFrame())
except Exception as e:
diff --git a/src/pipecat/services/together.py b/src/pipecat/services/together.py
index 26a1a99fd..1bf74e508 100644
--- a/src/pipecat/services/together.py
+++ b/src/pipecat/services/together.py
@@ -50,13 +50,15 @@ class TogetherLLMService(OpenAILLMService):
):
super().__init__(api_key=api_key, base_url=base_url, model=model, params=params, **kwargs)
self.set_model_name(model)
- self._max_tokens = params.max_tokens
- self._frequency_penalty = params.frequency_penalty
- self._presence_penalty = params.presence_penalty
- self._temperature = params.temperature
- self._top_k = params.top_k
- self._top_p = params.top_p
- self._extra = params.extra if isinstance(params.extra, dict) else {}
+ self._settings = {
+ "max_tokens": params.max_tokens,
+ "frequency_penalty": params.frequency_penalty,
+ "presence_penalty": params.presence_penalty,
+ "seed": params.seed,
+ "temperature": params.temperature,
+ "top_p": params.top_p,
+ "extra": params.extra if isinstance(params.extra, dict) else {},
+ }
def can_generate_metrics(self) -> bool:
return True
@@ -72,42 +74,3 @@ class TogetherLLMService(OpenAILLMService):
)
),
)
-
- async def set_frequency_penalty(self, frequency_penalty: float):
- logger.debug(f"Switching LLM frequency_penalty to: [{frequency_penalty}]")
- self._frequency_penalty = frequency_penalty
-
- async def set_max_tokens(self, max_tokens: int):
- logger.debug(f"Switching LLM max_tokens to: [{max_tokens}]")
- self._max_tokens = max_tokens
-
- async def set_presence_penalty(self, presence_penalty: float):
- logger.debug(f"Switching LLM presence_penalty to: [{presence_penalty}]")
- self._presence_penalty = presence_penalty
-
- async def set_temperature(self, temperature: float):
- logger.debug(f"Switching LLM temperature to: [{temperature}]")
- self._temperature = temperature
-
- async def set_top_k(self, top_k: float):
- logger.debug(f"Switching LLM top_k to: [{top_k}]")
- self._top_k = top_k
-
- async def set_top_p(self, top_p: float):
- logger.debug(f"Switching LLM top_p to: [{top_p}]")
- self._top_p = top_p
-
- async def set_extra(self, extra: Dict[str, Any]):
- logger.debug(f"Switching LLM extra to: [{extra}]")
- self._extra = extra
-
- async def _update_settings(self, settings: Dict[str, Any]):
- for key, value in settings.items():
- setter = getattr(self, f"set_{key}", None)
- if setter and callable(setter):
- try:
- await setter(value)
- except Exception as e:
- logger.warning(f"Error setting {key}: {e}")
- else:
- logger.warning(f"Unknown setting for Together LLM service: {key}")
diff --git a/src/pipecat/services/xtts.py b/src/pipecat/services/xtts.py
index 2c47d59e8..7826cfcd8 100644
--- a/src/pipecat/services/xtts.py
+++ b/src/pipecat/services/xtts.py
@@ -4,10 +4,12 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
-import aiohttp
-
from typing import Any, AsyncGenerator, Dict
+import aiohttp
+import numpy as np
+from loguru import logger
+
from pipecat.frames.frames import (
ErrorFrame,
Frame,
@@ -18,10 +20,6 @@ from pipecat.frames.frames import (
)
from pipecat.services.ai_services import TTSService
-import numpy as np
-
-from loguru import logger
-
try:
import resampy
except ModuleNotFoundError as e:
@@ -50,9 +48,11 @@ class XTTSService(TTSService):
):
super().__init__(**kwargs)
- self._voice_id = voice_id
- self._language = language
- self._base_url = base_url
+ self._settings = {
+ "language": language,
+ "base_url": base_url,
+ }
+ self.set_voice(voice_id)
self._studio_speakers: Dict[str, Any] | None = None
self._aiohttp_session = aiohttp_session
@@ -61,7 +61,7 @@ class XTTSService(TTSService):
async def start(self, frame: StartFrame):
await super().start(frame)
- async with self._aiohttp_session.get(self._base_url + "/studio_speakers") as r:
+ async with self._aiohttp_session.get(self._settings["base_url"] + "/studio_speakers") as r:
if r.status != 200:
text = await r.text()
logger.error(
@@ -75,10 +75,6 @@ class XTTSService(TTSService):
return
self._studio_speakers = await r.json()
- async def set_voice(self, voice: str):
- logger.debug(f"Switching TTS voice to: [{voice}]")
- self._voice_id = voice
-
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
@@ -88,11 +84,11 @@ class XTTSService(TTSService):
embeddings = self._studio_speakers[self._voice_id]
- url = self._base_url + "/tts_stream"
+ url = self._settings["base_url"] + "/tts_stream"
payload = {
"text": text.replace(".", "").replace("*", ""),
- "language": self._language,
+ "language": self._settings["language"],
"speaker_embedding": embeddings["speaker_embedding"],
"gpt_cond_latent": embeddings["gpt_cond_latent"],
"add_wav_header": False,