Update to use LLM, STT, TTS subclasses and remove setter methods

This commit is contained in:
Mark Backman
2024-10-01 14:37:07 -04:00
parent 88cca7bf68
commit 28643b453d
19 changed files with 395 additions and 685 deletions

View File

@@ -50,7 +50,7 @@ async def main():
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-helios-en")
tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice_id="aura-helios-en")
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")

View File

@@ -4,11 +4,15 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
import aiohttp
import asyncio
import os
import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -17,17 +21,11 @@ from pipecat.processors.aggregators.llm_response import (
LLMAssistantResponseAggregator,
LLMUserResponseAggregator,
)
from pipecat.services.playht import PlayHTTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.services.playht import PlayHTTTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
@@ -54,7 +52,7 @@ async def main():
tts = PlayHTTTSService(
user_id=os.getenv("PLAYHT_USER_ID"),
api_key=os.getenv("PLAYHT_API_KEY"),
voice_url="s3://voice-cloning-zero-shot/801a663f-efd0-4254-98d0-5c175514c3e8/jennifer/manifest.json",
voice_id="s3://voice-cloning-zero-shot/801a663f-efd0-4254-98d0-5c175514c3e8/jennifer/manifest.json",
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")

View File

@@ -4,11 +4,15 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
import aiohttp
import asyncio
import os
import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -17,17 +21,10 @@ from pipecat.processors.aggregators.llm_response import (
LLMAssistantResponseAggregator,
LLMUserResponseAggregator,
)
from pipecat.services.openai import OpenAITTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.services.openai import OpenAILLMService, OpenAITTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
@@ -51,7 +48,7 @@ async def main():
),
)
tts = OpenAITTSService(api_key=os.getenv("OPENAI_API_KEY"), voice="alloy")
tts = OpenAITTSService(api_key=os.getenv("OPENAI_API_KEY"), voice_id="alloy")
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")

View File

@@ -5,10 +5,14 @@
#
import asyncio
import aiohttp
import os
import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -26,12 +30,6 @@ from pipecat.transports.services.daily import (
)
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
@@ -57,7 +55,7 @@ async def main():
tts = DeepgramTTSService(
aiohttp_session=session,
api_key=os.getenv("DEEPGRAM_API_KEY"),
voice="aura-asteria-en",
voice_id="aura-asteria-en",
base_url="http://0.0.0.0:8080/v1/speak",
)

View File

@@ -530,10 +530,24 @@ class UserImageRequestFrame(ControlFrame):
class ServiceUpdateSettingsFrame(ControlFrame):
"""A control frame containing a request to update service settings."""
service_type: str
settings: Dict[str, Any]
@dataclass
class LLMUpdateSettingsFrame(ServiceUpdateSettingsFrame):
pass
@dataclass
class TTSUpdateSettingsFrame(ServiceUpdateSettingsFrame):
pass
@dataclass
class STTUpdateSettingsFrame(ServiceUpdateSettingsFrame):
pass
@dataclass
class FunctionCallInProgressFrame(SystemFrame):
"""A frame signaling that a function call is in progress."""

View File

@@ -8,7 +8,7 @@ import asyncio
import io
import wave
from abc import abstractmethod
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
from loguru import logger
@@ -19,14 +19,15 @@ from pipecat.frames.frames import (
ErrorFrame,
Frame,
LLMFullResponseEndFrame,
ServiceUpdateSettingsFrame,
StartFrame,
StartInterruptionFrame,
STTUpdateSettingsFrame,
TextFrame,
TTSAudioRawFrame,
TTSSpeakFrame,
TTSStartedFrame,
TTSStoppedFrame,
TTSUpdateSettingsFrame,
UserImageRequestFrame,
VisionImageRawFrame,
)
@@ -44,6 +45,7 @@ class AIService(FrameProcessor):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self._model_name: str = ""
self._settings: Dict[str, Any] = {}
@property
def model_name(self) -> str:
@@ -62,6 +64,16 @@ class AIService(FrameProcessor):
async def cancel(self, frame: CancelFrame):
pass
async def _update_settings(self, settings: Dict[str, Any]):
for key, value in settings.items():
if key in self._settings:
logger.debug(f"Updating setting {key} to: [{value}] for {self.name}")
self._settings[key] = value
elif key == "model":
self.set_model_name(value)
else:
logger.warning(f"Unknown setting for {self.name} service: {key}")
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
@@ -168,6 +180,7 @@ class TTSService(AIService):
self._push_stop_frames: bool = push_stop_frames
self._stop_frame_timeout_s: float = stop_frame_timeout_s
self._sample_rate: int = sample_rate
self._voice_id: str = ""
self._settings: Dict[str, Any] = {}
self._stop_frame_task: Optional[asyncio.Task] = None
@@ -184,60 +197,8 @@ class TTSService(AIService):
self.set_model_name(model)
@abstractmethod
async def set_voice(self, voice: str):
pass
@abstractmethod
async def set_language(self, language: Language):
pass
@abstractmethod
async def set_speed(self, speed: Union[str, float]):
pass
@abstractmethod
async def set_emotion(self, emotion: List[str]):
pass
@abstractmethod
async def set_engine(self, engine: str):
pass
@abstractmethod
async def set_pitch(self, pitch: str):
pass
@abstractmethod
async def set_rate(self, rate: str):
pass
@abstractmethod
async def set_volume(self, volume: str):
pass
@abstractmethod
async def set_emphasis(self, emphasis: str):
pass
@abstractmethod
async def set_style(self, style: str):
pass
@abstractmethod
async def set_style_degree(self, style_degree: str):
pass
@abstractmethod
async def set_role(self, role: str):
pass
@abstractmethod
async def set_gender(self, gender: str):
pass
@abstractmethod
async def set_google_style(self, google_style: str):
pass
def set_voice(self, voice: str):
self._voice_id = voice
@abstractmethod
async def flush_audio(self):
@@ -269,20 +230,18 @@ class TTSService(AIService):
async def _update_settings(self, settings: Dict[str, Any]):
for key, value in settings.items():
setter = getattr(self, f"set_{key}", None)
if setter and callable(setter):
try:
if key == "language":
await setter(Language(value))
else:
await setter(value)
except Exception as e:
logger.warning(f"Error setting {key}: {e}")
if key in self._settings:
logger.debug(f"Updating TTS setting {key} to: [{value}]")
self._settings[key] = value
if key == "language":
self._settings[key] = Language(value)
elif key == "model":
self.set_model_name(value)
elif key == "voice":
self.set_voice(value)
else:
logger.warning(f"Unknown setting for TTS service: {key}")
self._settings.update(settings)
async def say(self, text: str):
aggregate_sentences = self._aggregate_sentences
self._aggregate_sentences = False
@@ -309,7 +268,7 @@ class TTSService(AIService):
elif isinstance(frame, TTSSpeakFrame):
await self._push_tts_frames(frame.text)
await self.flush_audio()
elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "tts":
elif isinstance(frame, TTSUpdateSettingsFrame):
await self._update_settings(frame.settings)
else:
await self.push_frame(frame, direction)
@@ -448,31 +407,24 @@ class STTService(AIService):
async def set_model(self, model: str):
self.set_model_name(model)
@abstractmethod
async def set_language(self, language: Language):
pass
@abstractmethod
async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
"""Returns transcript as a string"""
pass
async def _update_settings(self, settings: Dict[str, Any]):
logger.debug(f"Updating STT settings: {self._settings}")
for key, value in settings.items():
setter = getattr(self, f"set_{key}", None)
if setter and callable(setter):
try:
if key == "language":
await setter(Language(value))
else:
await setter(value)
except Exception as e:
logger.warning(f"Error setting {key}: {e}")
if key in self._settings:
logger.debug(f"Updating STT setting {key} to: [{value}]")
self._settings[key] = value
if key == "language":
self._settings[key] = Language(value)
elif key == "model":
self.set_model_name(value)
else:
logger.warning(f"Unknown setting for STT service: {key}")
self._settings.update(settings)
async def process_audio_frame(self, frame: AudioRawFrame):
await self.process_generator(self.run_stt(frame.audio))
@@ -484,7 +436,7 @@ class STTService(AIService):
# In this service we accumulate audio internally and at the end we
# push a TextFrame. We don't really want to push audio frames down.
await self.process_audio_frame(frame)
elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "stt":
elif isinstance(frame, STTUpdateSettingsFrame):
await self._update_settings(frame.settings)
else:
await self.push_frame(frame, direction)

View File

@@ -25,7 +25,7 @@ from pipecat.frames.frames import (
LLMFullResponseEndFrame,
LLMFullResponseStartFrame,
LLMMessagesFrame,
ServiceUpdateSettingsFrame,
LLMUpdateSettingsFrame,
StartInterruptionFrame,
TextFrame,
UserImageRawFrame,
@@ -96,12 +96,14 @@ class AnthropicLLMService(LLMService):
super().__init__(**kwargs)
self._client = AsyncAnthropic(api_key=api_key)
self.set_model_name(model)
self._max_tokens = params.max_tokens
self._enable_prompt_caching_beta: bool = params.enable_prompt_caching_beta or False
self._temperature = params.temperature
self._top_k = params.top_k
self._top_p = params.top_p
self._extra = params.extra if isinstance(params.extra, dict) else {}
self._settings = {
"max_tokens": params.max_tokens,
"enable_prompt_caching_beta": params.enable_prompt_caching_beta or False,
"temperature": params.temperature,
"top_k": params.top_k,
"top_p": params.top_p,
"extra": params.extra if isinstance(params.extra, dict) else {},
}
def can_generate_metrics(self) -> bool:
return True
@@ -120,30 +122,6 @@ class AnthropicLLMService(LLMService):
)
return AnthropicContextAggregatorPair(_user=user, _assistant=assistant)
async def set_enable_prompt_caching_beta(self, enable_prompt_caching_beta: bool):
logger.debug(f"Switching LLM enable_prompt_caching_beta to: [{enable_prompt_caching_beta}]")
self._enable_prompt_caching_beta = enable_prompt_caching_beta
async def set_max_tokens(self, max_tokens: int):
logger.debug(f"Switching LLM max_tokens to: [{max_tokens}]")
self._max_tokens = max_tokens
async def set_temperature(self, temperature: float):
logger.debug(f"Switching LLM temperature to: [{temperature}]")
self._temperature = temperature
async def set_top_k(self, top_k: float):
logger.debug(f"Switching LLM top_k to: [{top_k}]")
self._top_k = top_k
async def set_top_p(self, top_p: float):
logger.debug(f"Switching LLM top_p to: [{top_p}]")
self._top_p = top_p
async def set_extra(self, extra: Dict[str, Any]):
logger.debug(f"Switching LLM extra to: [{extra}]")
self._extra = extra
async def _process_context(self, context: OpenAILLMContext):
# Usage tracking. We track the usage reported by Anthropic in prompt_tokens and
# completion_tokens. We also estimate the completion tokens from output text
@@ -165,11 +143,11 @@ class AnthropicLLMService(LLMService):
)
messages = context.messages
if self._enable_prompt_caching_beta:
if self._settings["enable_prompt_caching_beta"]:
messages = context.get_messages_with_cache_control_markers()
api_call = self._client.messages.create
if self._enable_prompt_caching_beta:
if self._settings["enable_prompt_caching_beta"]:
api_call = self._client.beta.prompt_caching.messages.create
await self.start_ttfb_metrics()
@@ -179,14 +157,14 @@ class AnthropicLLMService(LLMService):
"system": context.system,
"messages": messages,
"model": self.model_name,
"max_tokens": self._max_tokens,
"max_tokens": self._settings["max_tokens"],
"stream": True,
"temperature": self._temperature,
"top_k": self._top_k,
"top_p": self._top_p,
"temperature": self._settings["temperature"],
"top_k": self._settings["top_k"],
"top_p": self._settings["top_p"],
}
params.update(self._extra)
params.update(self._settings["extra"])
response = await api_call(**params)
@@ -284,17 +262,6 @@ class AnthropicLLMService(LLMService):
cache_read_input_tokens=cache_read_input_tokens,
)
async def _update_settings(self, settings: Dict[str, Any]):
for key, value in settings.items():
setter = getattr(self, f"set_{key}", None)
if setter and callable(setter):
try:
await setter(value)
except Exception as e:
logger.warning(f"Error setting {key}: {e}")
else:
logger.warning(f"Unknown setting for Anthropic LLM service: {key}")
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
@@ -309,11 +276,11 @@ class AnthropicLLMService(LLMService):
# UserImageRawFrames coming through the pipeline and add them
# to the context.
context = AnthropicLLMContext.from_image_frame(frame)
elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "llm":
elif isinstance(frame, LLMUpdateSettingsFrame):
await self._update_settings(frame.settings)
elif isinstance(frame, LLMEnablePromptCachingFrame):
logger.debug(f"Setting enable prompt caching to: [{frame.enable}]")
self._enable_prompt_caching_beta = frame.enable
self._settings["enable_prompt_caching_beta"] = frame.enable
else:
await self.push_frame(frame, direction)

View File

@@ -6,6 +6,7 @@
from typing import AsyncGenerator, Optional
from loguru import logger
from pydantic import BaseModel
from pipecat.frames.frames import (
@@ -17,8 +18,6 @@ from pipecat.frames.frames import (
)
from pipecat.services.ai_services import TTSService
from loguru import logger
try:
import boto3
from botocore.exceptions import BotoCoreError, ClientError
@@ -57,9 +56,16 @@ class AWSTTSService(TTSService):
aws_secret_access_key=api_key,
region_name=region,
)
self._voice_id = voice_id
self._sample_rate = sample_rate
self._params = params
self._settings = {
"sample_rate": sample_rate,
"engine": params.engine,
"language": params.language,
"pitch": params.pitch,
"rate": params.rate,
"volume": params.volume,
}
self.set_voice(voice_id)
def can_generate_metrics(self) -> bool:
return True
@@ -67,18 +73,18 @@ class AWSTTSService(TTSService):
def _construct_ssml(self, text: str) -> str:
ssml = "<speak>"
if self._params.language:
ssml += f"<lang xml:lang='{self._params.language}'>"
if self._settings["language"]:
ssml += f"<lang xml:lang='{self._settings["language"]}'>"
prosody_attrs = []
# Prosody tags are only supported for standard and neural engines
if self._params.engine != "generative":
if self._params.rate:
prosody_attrs.append(f"rate='{self._params.rate}'")
if self._params.pitch:
prosody_attrs.append(f"pitch='{self._params.pitch}'")
if self._params.volume:
prosody_attrs.append(f"volume='{self._params.volume}'")
if self._settings["engine"] != "generative":
if self._settings["rate"]:
prosody_attrs.append(f"rate='{self._settings["rate"]}'")
if self._settings["pitch"]:
prosody_attrs.append(f"pitch='{self._settings["pitch"]}'")
if self._settings["volume"]:
prosody_attrs.append(f"volume='{self._settings["volume"]}'")
if prosody_attrs:
ssml += f"<prosody {' '.join(prosody_attrs)}>"
@@ -90,41 +96,13 @@ class AWSTTSService(TTSService):
if prosody_attrs:
ssml += "</prosody>"
if self._params.language:
if self._settings["language"]:
ssml += "</lang>"
ssml += "</speak>"
return ssml
async def set_voice(self, voice: str):
logger.debug(f"Switching TTS voice to: [{voice}]")
self._voice_id = voice
async def set_engine(self, engine: str):
logger.debug(f"Switching TTS engine to: [{engine}]")
self._params.engine = engine
async def set_language(self, language: str):
logger.debug(f"Switching TTS language to: [{language}]")
self._params.language = language
async def set_pitch(self, pitch: str):
logger.debug(f"Switching TTS pitch to: [{pitch}]")
self._params.pitch = pitch
async def set_rate(self, rate: str):
logger.debug(f"Switching TTS rate to: [{rate}]")
self._params.rate = rate
async def set_volume(self, volume: str):
logger.debug(f"Switching TTS volume to: [{volume}]")
self._params.volume = volume
async def set_params(self, params: InputParams):
logger.debug(f"Switching TTS params to: [{params}]")
self._params = params
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
@@ -139,8 +117,8 @@ class AWSTTSService(TTSService):
"TextType": "ssml",
"OutputFormat": "pcm",
"VoiceId": self._voice_id,
"Engine": self._params.engine,
"SampleRate": str(self._sample_rate),
"Engine": self._settings["engine"],
"SampleRate": str(self._settings["sample_rate"]),
}
# Filter out None values
@@ -160,7 +138,7 @@ class AWSTTSService(TTSService):
chunk = audio_data[i : i + chunk_size]
if len(chunk) > 0:
await self.stop_ttfb_metrics()
frame = TTSAudioRawFrame(chunk, self._sample_rate, 1)
frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
yield frame
await self.push_frame(TTSStoppedFrame())

View File

@@ -4,12 +4,13 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
import aiohttp
import asyncio
import io
from typing import AsyncGenerator, Optional
import aiohttp
from loguru import logger
from PIL import Image
from pydantic import BaseModel
from pipecat.frames.frames import (
@@ -28,10 +29,6 @@ from pipecat.services.ai_services import ImageGenService, STTService, TTSService
from pipecat.services.openai import BaseOpenAILLMService
from pipecat.utils.time import time_now_iso8601
from PIL import Image
from loguru import logger
# See .env.example for Azure configuration needed
try:
from azure.cognitiveservices.speech import (
@@ -89,7 +86,7 @@ class AzureTTSService(TTSService):
*,
api_key: str,
region: str,
voice="en-US-SaraNeural",
voice_id="en-US-SaraNeural",
sample_rate: int = 16000,
params: InputParams = InputParams(),
**kwargs,
@@ -99,114 +96,67 @@ class AzureTTSService(TTSService):
speech_config = SpeechConfig(subscription=api_key, region=region)
self._speech_synthesizer = SpeechSynthesizer(speech_config=speech_config, audio_config=None)
self._voice = voice
self._sample_rate = sample_rate
self._params = params
self._settings = {
"sample_rate": sample_rate,
"emphasis": params.emphasis,
"language": params.language,
"pitch": params.pitch,
"rate": params.rate,
"role": params.role,
"style": params.style,
"style_degree": params.style_degree,
"volume": params.volume,
}
self.set_voice(voice_id)
def can_generate_metrics(self) -> bool:
return True
def _construct_ssml(self, text: str) -> str:
ssml = (
f"<speak version='1.0' xml:lang='{self._params.language}' "
f"<speak version='1.0' xml:lang='{self._settings['language']}' "
"xmlns='http://www.w3.org/2001/10/synthesis' "
"xmlns:mstts='http://www.w3.org/2001/mstts'>"
f"<voice name='{self._voice}'>"
f"<voice name='{self._voice_id}'>"
"<mstts:silence type='Sentenceboundary' value='20ms' />"
)
if self._params.style:
ssml += f"<mstts:express-as style='{self._params.style}'"
if self._params.style_degree:
ssml += f" styledegree='{self._params.style_degree}'"
if self._params.role:
ssml += f" role='{self._params.role}'"
if self._settings["style"]:
ssml += f"<mstts:express-as style='{self._settings['style']}'"
if self._settings["style_degree"]:
ssml += f" styledegree='{self._settings['style_degree']}'"
if self._settings["role"]:
ssml += f" role='{self._settings['role']}'"
ssml += ">"
prosody_attrs = []
if self._params.rate:
prosody_attrs.append(f"rate='{self._params.rate}'")
if self._params.pitch:
prosody_attrs.append(f"pitch='{self._params.pitch}'")
if self._params.volume:
prosody_attrs.append(f"volume='{self._params.volume}'")
if self._settings["rate"]:
prosody_attrs.append(f"rate='{self._settings['rate']}'")
if self._settings["pitch"]:
prosody_attrs.append(f"pitch='{self._settings['pitch']}'")
if self._settings["volume"]:
prosody_attrs.append(f"volume='{self._settings['volume']}'")
ssml += f"<prosody {' '.join(prosody_attrs)}>"
if self._params.emphasis:
ssml += f"<emphasis level='{self._params.emphasis}'>"
if self._settings["emphasis"]:
ssml += f"<emphasis level='{self._settings['emphasis']}'>"
ssml += text
if self._params.emphasis:
if self._settings["emphasis"]:
ssml += "</emphasis>"
ssml += "</prosody>"
if self._params.style:
if self._settings["style"]:
ssml += "</mstts:express-as>"
ssml += "</voice></speak>"
return ssml
async def set_voice(self, voice: str):
logger.debug(f"Switching TTS voice to: [{voice}]")
self._voice = voice
async def set_emphasis(self, emphasis: str):
logger.debug(f"Setting TTS emphasis to: [{emphasis}]")
self._params.emphasis = emphasis
async def set_language(self, language: str):
logger.debug(f"Setting TTS language code to: [{language}]")
self._params.language = language
async def set_pitch(self, pitch: str):
logger.debug(f"Setting TTS pitch to: [{pitch}]")
self._params.pitch = pitch
async def set_rate(self, rate: str):
logger.debug(f"Setting TTS rate to: [{rate}]")
self._params.rate = rate
async def set_role(self, role: str):
logger.debug(f"Setting TTS role to: [{role}]")
self._params.role = role
async def set_style(self, style: str):
logger.debug(f"Setting TTS style to: [{style}]")
self._params.style = style
async def set_style_degree(self, style_degree: str):
logger.debug(f"Setting TTS style degree to: [{style_degree}]")
self._params.style_degree = style_degree
async def set_volume(self, volume: str):
logger.debug(f"Setting TTS volume to: [{volume}]")
self._params.volume = volume
async def set_params(self, **kwargs):
valid_params = {
"voice": self.set_voice,
"emphasis": self.set_emphasis,
"language_code": self.set_language,
"pitch": self.set_pitch,
"rate": self.set_rate,
"role": self.set_role,
"style": self.set_style,
"style_degree": self.set_style_degree,
"volume": self.set_volume,
}
for param, value in kwargs.items():
if param in valid_params:
await valid_params[param](value)
else:
logger.warning(f"Ignoring unknown parameter: {param}")
logger.debug(f"Updated TTS parameters: {', '.join(kwargs.keys())}")
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
@@ -222,7 +172,9 @@ class AzureTTSService(TTSService):
await self.push_frame(TTSStartedFrame())
# Azure always sends a 44-byte header. Strip it off.
yield TTSAudioRawFrame(
audio=result.audio_data[44:], sample_rate=self._sample_rate, num_channels=1
audio=result.audio_data[44:],
sample_rate=self._settings["sample_rate"],
num_channels=1,
)
await self.push_frame(TTSStoppedFrame())
elif result.reason == ResultReason.Canceled:

View File

@@ -4,36 +4,35 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import base64
import json
import uuid
import base64
import asyncio
from typing import AsyncGenerator, List, Optional, Union
from typing import AsyncGenerator, Optional, Union, List
from loguru import logger
from pydantic.main import BaseModel
from pipecat.frames.frames import (
CancelFrame,
EndFrame,
ErrorFrame,
Frame,
StartInterruptionFrame,
LLMFullResponseEndFrame,
StartFrame,
EndFrame,
StartInterruptionFrame,
TTSAudioRawFrame,
TTSStartedFrame,
TTSStoppedFrame,
LLMFullResponseEndFrame,
)
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.ai_services import TTSService, WordTTSService
from pipecat.transcriptions.language import Language
from pipecat.services.ai_services import WordTTSService, TTSService
from loguru import logger
# See .env.example for Cartesia configuration needed
try:
from cartesia import AsyncCartesia
import websockets
from cartesia import AsyncCartesia
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error(
@@ -66,7 +65,7 @@ class CartesiaTTSService(WordTTSService):
encoding: Optional[str] = "pcm_s16le"
sample_rate: Optional[int] = 16000
container: Optional[str] = "raw"
language: Optional[str] = "en"
language: Optional[Language] = Language.EN
speed: Optional[Union[str, float]] = ""
emotion: Optional[List[str]] = []
@@ -77,7 +76,7 @@ class CartesiaTTSService(WordTTSService):
voice_id: str,
cartesia_version: str = "2024-06-10",
url: str = "wss://api.cartesia.ai/tts/websocket",
model_id: str = "sonic-english",
model: str = "sonic-english",
params: InputParams = InputParams(),
**kwargs,
):
@@ -101,17 +100,18 @@ class CartesiaTTSService(WordTTSService):
self._api_key = api_key
self._cartesia_version = cartesia_version
self._url = url
self._voice_id = voice_id
self._model_id = model_id
self.set_model_name(model_id)
self._output_format = {
"container": params.container,
"encoding": params.encoding,
"sample_rate": params.sample_rate,
self._settings = {
"output_format": {
"container": params.container,
"encoding": params.encoding,
"sample_rate": params.sample_rate,
},
"language": language_to_cartesia_language(params.language) if params.language else None,
"speed": params.speed,
"emotion": params.emotion,
}
self._language = params.language
self._speed = params.speed
self._emotion = params.emotion
self.set_model_name(model)
self.set_voice(voice_id)
self._websocket = None
self._context_id = None
@@ -125,42 +125,28 @@ class CartesiaTTSService(WordTTSService):
await super().set_model(model)
logger.debug(f"Switching TTS model to: [{model}]")
async def set_voice(self, voice: str):
logger.debug(f"Switching TTS voice to: [{voice}]")
self._voice_id = voice
async def set_speed(self, speed: str):
logger.debug(f"Switching TTS speed to: [{speed}]")
self._speed = speed
async def set_emotion(self, emotion: list[str]):
logger.debug(f"Switching TTS emotion to: [{emotion}]")
self._emotion = emotion
async def set_language(self, language: Language):
logger.debug(f"Switching TTS language to: [{language}]")
self._language = language_to_cartesia_language(language)
def _build_msg(
self, text: str = "", continue_transcript: bool = True, add_timestamps: bool = True
):
voice_config = {"mode": "id", "id": self._voice_id}
voice_config = {}
voice_config["mode"] = "id"
voice_config["id"] = self._voice_id
if self._speed or self._emotion:
if self._settings["speed"] or self._settings["emotion"]:
voice_config["__experimental_controls"] = {}
if self._speed:
voice_config["__experimental_controls"]["speed"] = self._speed
if self._emotion:
voice_config["__experimental_controls"]["emotion"] = self._emotion
if self._settings["speed"]:
voice_config["__experimental_controls"]["speed"] = self._settings["speed"]
if self._settings["emotion"]:
voice_config["__experimental_controls"]["emotion"] = self._settings["emotion"]
msg = {
"transcript": text,
"continue": continue_transcript,
"context_id": self._context_id,
"model_id": self._model_name,
"model_id": self.model_name,
"voice": voice_config,
"output_format": self._output_format,
"language": self._language,
"output_format": self._settings["output_format"],
"language": self._settings["language"],
"add_timestamps": add_timestamps,
}
return json.dumps(msg)
@@ -245,7 +231,7 @@ class CartesiaTTSService(WordTTSService):
self.start_word_timestamps()
frame = TTSAudioRawFrame(
audio=base64.b64decode(msg["data"]),
sample_rate=self._output_format["sample_rate"],
sample_rate=self._settings["output_format"]["sample_rate"],
num_channels=1,
)
await self.push_frame(frame)
@@ -303,7 +289,7 @@ class CartesiaHttpTTSService(TTSService):
*,
api_key: str,
voice_id: str,
model_id: str = "sonic-english",
model: str = "sonic-english",
base_url: str = "https://api.cartesia.ai",
params: InputParams = InputParams(),
**kwargs,
@@ -311,17 +297,18 @@ class CartesiaHttpTTSService(TTSService):
super().__init__(**kwargs)
self._api_key = api_key
self._voice_id = voice_id
self._model_id = model_id
self.set_model_name(model_id)
self._output_format = {
"container": params.container,
"encoding": params.encoding,
"sample_rate": params.sample_rate,
self._settings = {
"output_format": {
"container": params.container,
"encoding": params.encoding,
"sample_rate": params.sample_rate,
},
"language": params.language,
"speed": params.speed,
"emotion": params.emotion,
}
self._language = params.language
self._speed = params.speed
self._emotion = params.emotion
self.set_voice(voice_id)
self.set_model_name(model)
self._client = AsyncCartesia(api_key=api_key, base_url=base_url)
@@ -333,22 +320,6 @@ class CartesiaHttpTTSService(TTSService):
self._model_id = model
await super().set_model(model)
async def set_voice(self, voice: str):
logger.debug(f"Switching TTS voice to: [{voice}]")
self._voice_id = voice
async def set_speed(self, speed: str):
logger.debug(f"Switching TTS speed to: [{speed}]")
self._speed = speed
async def set_emotion(self, emotion: list[str]):
logger.debug(f"Switching TTS emotion to: [{emotion}]")
self._emotion = emotion
async def set_language(self, language: Language):
logger.debug(f"Switching TTS language to: [{language}]")
self._language = language_to_cartesia_language(language)
async def stop(self, frame: EndFrame):
await super().stop(frame)
await self._client.close()
@@ -365,19 +336,19 @@ class CartesiaHttpTTSService(TTSService):
try:
voice_controls = None
if self._speed or self._emotion:
if self._settings["speed"] or self._settings["emotion"]:
voice_controls = {}
if self._speed:
voice_controls["speed"] = self._speed
if self._emotion:
voice_controls["emotion"] = self._emotion
if self._settings["speed"]:
voice_controls["speed"] = self._settings["speed"]
if self._settings["emotion"]:
voice_controls["emotion"] = self._settings["emotion"]
output = await self._client.tts.sse(
model_id=self._model_id,
transcript=text,
voice_id=self._voice_id,
output_format=self._output_format,
language=self._language,
output_format=self._settings["output_format"],
language=self._settings["language"],
stream=False,
_experimental_voice_controls=voice_controls,
)
@@ -386,7 +357,7 @@ class CartesiaHttpTTSService(TTSService):
frame = TTSAudioRawFrame(
audio=output["audio"],
sample_rate=self._output_format["sample_rate"],
sample_rate=self._settings["output_format"]["sample_rate"],
num_channels=1,
)
yield frame

View File

@@ -5,9 +5,10 @@
#
import asyncio
from typing import AsyncGenerator
from loguru import logger
from pipecat.frames.frames import (
CancelFrame,
EndFrame,
@@ -24,8 +25,6 @@ from pipecat.services.ai_services import STTService, TTSService
from pipecat.transcriptions.language import Language
from pipecat.utils.time import time_now_iso8601
from loguru import logger
# See .env.example for Deepgram configuration needed
try:
from deepgram import (
@@ -50,32 +49,30 @@ class DeepgramTTSService(TTSService):
self,
*,
api_key: str,
voice: str = "aura-helios-en",
voice_id: str = "aura-helios-en",
sample_rate: int = 16000,
encoding: str = "linear16",
**kwargs,
):
super().__init__(**kwargs)
self._voice = voice
self._sample_rate = sample_rate
self._encoding = encoding
self._settings = {
"sample_rate": sample_rate,
"encoding": encoding,
}
self.set_voice(voice_id)
self._deepgram_client = DeepgramClient(api_key=api_key)
def can_generate_metrics(self) -> bool:
return True
async def set_voice(self, voice: str):
logger.debug(f"Switching TTS voice to: [{voice}]")
self._voice = voice
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
options = SpeakOptions(
model=self._voice,
encoding=self._encoding,
sample_rate=self._sample_rate,
model=self._voice_id,
encoding=self._settings["encoding"],
sample_rate=self._settings["sample_rate"],
container="none",
)
@@ -103,7 +100,9 @@ class DeepgramTTSService(TTSService):
chunk = audio_buffer.read(chunk_size)
if not chunk:
break
frame = TTSAudioRawFrame(audio=chunk, sample_rate=self._sample_rate, num_channels=1)
frame = TTSAudioRawFrame(
audio=chunk, sample_rate=self._settings["sample_rate"], num_channels=1
)
yield frame
await self.push_frame(TTSStoppedFrame())
@@ -135,7 +134,7 @@ class DeepgramSTTService(STTService):
):
super().__init__(**kwargs)
self._live_options = live_options
self._settings = vars(live_options)
self._client = DeepgramClient(
api_key, config=DeepgramClientOptions(url=url, options={"keepalive": "true"})
@@ -147,7 +146,7 @@ class DeepgramSTTService(STTService):
@property
def vad_enabled(self):
return self._live_options.vad_events
return self._settings["vad_events"]
def can_generate_metrics(self) -> bool:
return self.vad_enabled
@@ -155,13 +154,7 @@ class DeepgramSTTService(STTService):
async def set_model(self, model: str):
await super().set_model(model)
logger.debug(f"Switching STT model to: [{model}]")
self._live_options.model = model
await self._disconnect()
await self._connect()
async def set_language(self, language: Language):
logger.debug(f"Switching STT language to: [{language}]")
self._live_options.language = language
self._settings["model"] = model
await self._disconnect()
await self._connect()
@@ -182,7 +175,7 @@ class DeepgramSTTService(STTService):
yield None
async def _connect(self):
if await self._connection.start(self._live_options):
if await self._connection.start(self._settings):
logger.debug(f"{self}: Connected to Deepgram")
else:
logger.error(f"{self}: Unable to connect to Deepgram")

View File

@@ -24,6 +24,7 @@ from pipecat.frames.frames import (
)
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.ai_services import WordTTSService
from pipecat.transcriptions.language import Language
# See .env.example for ElevenLabs configuration needed
try:
@@ -72,7 +73,7 @@ def calculate_word_times(
class ElevenLabsTTSService(WordTTSService):
class InputParams(BaseModel):
language: Optional[str] = None
language: Optional[Language] = Language.EN
output_format: Literal["pcm_16000", "pcm_22050", "pcm_24000", "pcm_44100"] = "pcm_16000"
optimize_streaming_latency: Optional[str] = None
stability: Optional[float] = None
@@ -124,10 +125,19 @@ class ElevenLabsTTSService(WordTTSService):
)
self._api_key = api_key
self._voice_id = voice_id
self.set_model_name(model)
self._url = url
self._params = params
self._settings = {
"sample_rate": sample_rate_from_output_format(params.output_format),
"language": params.language,
"output_format": params.output_format,
"optimize_streaming_latency": params.optimize_streaming_latency,
"stability": params.stability,
"similarity_boost": params.similarity_boost,
"style": params.style,
"use_speaker_boost": params.use_speaker_boost,
}
self.set_model_name(model)
self.set_voice(voice_id)
self._voice_settings = self._set_voice_settings()
# Websocket connection to ElevenLabs.
@@ -142,19 +152,22 @@ class ElevenLabsTTSService(WordTTSService):
def _set_voice_settings(self):
voice_settings = {}
if self._params.stability is not None and self._params.similarity_boost is not None:
voice_settings["stability"] = self._params.stability
voice_settings["similarity_boost"] = self._params.similarity_boost
if self._params.style is not None:
voice_settings["style"] = self._params.style
if self._params.use_speaker_boost is not None:
voice_settings["use_speaker_boost"] = self._params.use_speaker_boost
if (
self._settings["stability"] is not None
and self._settings["similarity_boost"] is not None
):
voice_settings["stability"] = self._settings["stability"]
voice_settings["similarity_boost"] = self._settings["similarity_boost"]
if self._settings["style"] is not None:
voice_settings["style"] = self._settings["style"]
if self._settings["use_speaker_boost"] is not None:
voice_settings["use_speaker_boost"] = self._settings["use_speaker_boost"]
else:
if self._params.style is not None:
if self._settings["style"] is not None:
logger.warning(
"'style' is set but will not be applied because 'stability' and 'similarity_boost' are not both set."
)
if self._params.use_speaker_boost is not None:
if self._settings["use_speaker_boost"] is not None:
logger.warning(
"'use_speaker_boost' is set but will not be applied because 'stability' and 'similarity_boost' are not both set."
)
@@ -167,33 +180,13 @@ class ElevenLabsTTSService(WordTTSService):
await self._disconnect()
await self._connect()
async def set_voice(self, voice: str):
logger.debug(f"Switching TTS voice to: [{voice}]")
self._voice_id = voice
await self._disconnect()
await self._connect()
async def set_voice_settings(
self,
stability: Optional[float] = None,
similarity_boost: Optional[float] = None,
style: Optional[float] = None,
use_speaker_boost: Optional[bool] = None,
):
self._params.stability = stability if stability is not None else self._params.stability
self._params.similarity_boost = (
similarity_boost if similarity_boost is not None else self._params.similarity_boost
)
self._params.style = style if style is not None else self._params.style
self._params.use_speaker_boost = (
use_speaker_boost if use_speaker_boost is not None else self._params.use_speaker_boost
)
self._set_voice_settings()
if self._websocket:
msg = {"voice_settings": self._voice_settings}
await self._websocket.send(json.dumps(msg))
async def _update_settings(self, settings: Dict[str, Any]):
prev_voice = self._voice_id
await super()._update_settings(settings)
if not prev_voice == self._voice_id:
await self._disconnect()
await self._connect()
logger.debug(f"Switching TTS voice to: [{self._voice_id}]")
async def start(self, frame: StartFrame):
await super().start(frame)
@@ -223,19 +216,19 @@ class ElevenLabsTTSService(WordTTSService):
try:
voice_id = self._voice_id
model = self.model_name
output_format = self._params.output_format
output_format = self._settings["output_format"]
url = f"{self._url}/v1/text-to-speech/{voice_id}/stream-input?model_id={model}&output_format={output_format}"
if self._params.optimize_streaming_latency:
url += f"&optimize_streaming_latency={self._params.optimize_streaming_latency}"
if self._settings["optimize_streaming_latency"]:
url += f"&optimize_streaming_latency={self._settings["optimize_streaming_latency"]}"
# language can only be used with the 'eleven_turbo_v2_5' model
if self._params.language:
if self._settings["language"]:
if model == "eleven_turbo_v2_5":
url += f"&language_code={self._params.language}"
url += f"&language_code={self._settings["language"]}"
else:
logger.debug(
f"Language code [{self._params.language}] not applied. Language codes can only be used with the 'eleven_turbo_v2_5' model."
f"Language code [{self._settings["language"]}] not applied. Language codes can only be used with the 'eleven_turbo_v2_5' model."
)
self._websocket = await websockets.connect(url)
@@ -286,7 +279,7 @@ class ElevenLabsTTSService(WordTTSService):
self.start_word_timestamps()
audio = base64.b64decode(msg["audio"])
frame = TTSAudioRawFrame(audio, self.sample_rate, 1)
frame = TTSAudioRawFrame(audio, self._settings["sample_rate"], 1)
await self.push_frame(frame)
if msg.get("alignment"):

View File

@@ -6,8 +6,9 @@
import base64
import json
from typing import AsyncGenerator, Optional
from loguru import logger
from pydantic.main import BaseModel
from pipecat.frames.frames import (
@@ -21,8 +22,6 @@ from pipecat.frames.frames import (
from pipecat.services.ai_services import STTService
from pipecat.utils.time import time_now_iso8601
from loguru import logger
# See .env.example for Gladia configuration needed
try:
import websockets
@@ -55,7 +54,13 @@ class GladiaSTTService(STTService):
self._api_key = api_key
self._url = url
self._params = params
self._settings = {
"sample_rate": params.sample_rate,
"language": params.language,
"transcription_hint": params.transcription_hint,
"endpointing": params.endpointing,
"prosody": params.prosody,
}
self._confidence = confidence
async def start(self, frame: StartFrame):
@@ -84,7 +89,11 @@ class GladiaSTTService(STTService):
"encoding": "WAV/PCM",
"model_type": "fast",
"language_behaviour": "manual",
**self._params.model_dump(exclude_none=True),
"sample_rate": self._settings["sample_rate"],
"language": self._settings["language"],
"transcription_hint": self._settings["transcription_hint"],
"endpointing": self._settings["endpointing"],
"prosody": self._settings["prosody"],
}
await self._websocket.send(json.dumps(configuration))

View File

@@ -6,7 +6,7 @@
import asyncio
import json
from typing import Any, AsyncGenerator, Dict, List, Literal, Optional
from typing import AsyncGenerator, List, Literal, Optional
from loguru import logger
from pydantic import BaseModel
@@ -17,7 +17,7 @@ from pipecat.frames.frames import (
LLMFullResponseEndFrame,
LLMFullResponseStartFrame,
LLMMessagesFrame,
ServiceUpdateSettingsFrame,
LLMUpdateSettingsFrame,
TextFrame,
TTSAudioRawFrame,
TTSStartedFrame,
@@ -64,21 +64,6 @@ class GoogleLLMService(LLMService):
self.set_model_name(model)
self._client = gai.GenerativeModel(model)
async def set_model(self, model: str):
logger.debug(f"Switching LLM model to: [{model}]")
self._create_client(model)
async def _update_settings(self, settings: Dict[str, Any]):
for key, value in settings.items():
setter = getattr(self, f"set_{key}", None)
if setter and callable(setter):
try:
await setter(value)
except Exception as e:
logger.warning(f"Error setting {key}: {e}")
else:
logger.warning(f"Unknown setting for Google LLM service: {key}")
def _get_messages_from_openai_context(self, context: OpenAILLMContext) -> List[glm.Content]:
openai_messages = context.get_messages()
google_messages = []
@@ -151,7 +136,7 @@ class GoogleLLMService(LLMService):
context = OpenAILLMContext.from_messages(frame.messages)
elif isinstance(frame, VisionImageRawFrame):
context = OpenAILLMContext.from_image_frame(frame)
elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "llm":
elif isinstance(frame, LLMUpdateSettingsFrame):
await self._update_settings(frame.settings)
else:
await self.push_frame(frame, direction)
@@ -182,8 +167,17 @@ class GoogleTTSService(TTSService):
):
super().__init__(sample_rate=sample_rate, **kwargs)
self._voice_id: str = voice_id
self._params = params
self._settings = {
"sample_rate": sample_rate,
"pitch": params.pitch,
"rate": params.rate,
"volume": params.volume,
"emphasis": params.emphasis,
"language": params.language,
"gender": params.gender,
"google_style": params.google_style,
}
self.set_voice(voice_id)
self._client: texttospeech_v1.TextToSpeechAsyncClient = self._create_client(
credentials, credentials_path
)
@@ -216,38 +210,38 @@ class GoogleTTSService(TTSService):
# Voice tag
voice_attrs = [f"name='{self._voice_id}'"]
if self._params.language:
voice_attrs.append(f"language='{self._params.language}'")
if self._params.gender:
voice_attrs.append(f"gender='{self._params.gender}'")
if self._settings["language"]:
voice_attrs.append(f"language='{self._settings['language']}'")
if self._settings["gender"]:
voice_attrs.append(f"gender='{self._settings['gender']}'")
ssml += f"<voice {' '.join(voice_attrs)}>"
# Prosody tag
prosody_attrs = []
if self._params.pitch:
prosody_attrs.append(f"pitch='{self._params.pitch}'")
if self._params.rate:
prosody_attrs.append(f"rate='{self._params.rate}'")
if self._params.volume:
prosody_attrs.append(f"volume='{self._params.volume}'")
if self._settings["pitch"]:
prosody_attrs.append(f"pitch='{self._settings['pitch']}'")
if self._settings["rate"]:
prosody_attrs.append(f"rate='{self._settings['rate']}'")
if self._settings["volume"]:
prosody_attrs.append(f"volume='{self._settings['volume']}'")
if prosody_attrs:
ssml += f"<prosody {' '.join(prosody_attrs)}>"
# Emphasis tag
if self._params.emphasis:
ssml += f"<emphasis level='{self._params.emphasis}'>"
if self._settings["emphasis"]:
ssml += f"<emphasis level='{self._settings['emphasis']}'>"
# Google style tag
if self._params.google_style:
ssml += f"<google:style name='{self._params.google_style}'>"
if self._settings["google_style"]:
ssml += f"<google:style name='{self._settings['google_style']}'>"
ssml += text
# Close tags
if self._params.google_style:
if self._settings["google_style"]:
ssml += "</google:style>"
if self._params.emphasis:
if self._settings["emphasis"]:
ssml += "</emphasis>"
if prosody_attrs:
ssml += "</prosody>"
@@ -255,46 +249,6 @@ class GoogleTTSService(TTSService):
return ssml
async def set_voice(self, voice: str) -> None:
logger.debug(f"Switching TTS voice to: [{voice}]")
self._voice_id = voice
async def set_language(self, language: str) -> None:
logger.debug(f"Switching TTS language to: [{language}]")
self._params.language = language
async def set_pitch(self, pitch: str) -> None:
logger.debug(f"Switching TTS pitch to: [{pitch}]")
self._params.pitch = pitch
async def set_rate(self, rate: str) -> None:
logger.debug(f"Switching TTS rate to: [{rate}]")
self._params.rate = rate
async def set_volume(self, volume: str) -> None:
logger.debug(f"Switching TTS volume to: [{volume}]")
self._params.volume = volume
async def set_emphasis(
self, emphasis: Literal["strong", "moderate", "reduced", "none"]
) -> None:
logger.debug(f"Switching TTS emphasis to: [{emphasis}]")
self._params.emphasis = emphasis
async def set_gender(self, gender: Literal["male", "female", "neutral"]) -> None:
logger.debug(f"Switch TTS gender to [{gender}]")
self._params.gender = gender
async def set_google_style(
self, google_style: Literal["apologetic", "calm", "empathetic", "firm", "lively"]
) -> None:
logger.debug(f"Switching TTS google style to: [{google_style}]")
self._params.google_style = google_style
async def set_params(self, params: InputParams) -> None:
logger.debug(f"Switching TTS params to: [{params}]")
self._params = params
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
@@ -304,11 +258,11 @@ class GoogleTTSService(TTSService):
ssml = self._construct_ssml(text)
synthesis_input = texttospeech_v1.SynthesisInput(ssml=ssml)
voice = texttospeech_v1.VoiceSelectionParams(
language_code=self._params.language, name=self._voice_id
language_code=self._settings["language"], name=self._voice_id
)
audio_config = texttospeech_v1.AudioConfig(
audio_encoding=texttospeech_v1.AudioEncoding.LINEAR16,
sample_rate_hertz=self.sample_rate,
sample_rate_hertz=self._settings["sample_rate"],
)
request = texttospeech_v1.SynthesizeSpeechRequest(
@@ -331,7 +285,7 @@ class GoogleTTSService(TTSService):
if not chunk:
break
await self.stop_ttfb_metrics()
frame = TTSAudioRawFrame(chunk, self.sample_rate, 1)
frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
yield frame
await asyncio.sleep(0) # Allow other tasks to run

View File

@@ -5,10 +5,10 @@
#
import asyncio
from typing import AsyncGenerator
from pipecat.processors.frame_processor import FrameDirection
from loguru import logger
from pipecat.frames.frames import (
CancelFrame,
EndFrame,
@@ -20,10 +20,9 @@ from pipecat.frames.frames import (
TTSStartedFrame,
TTSStoppedFrame,
)
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.ai_services import TTSService
from loguru import logger
# See .env.example for LMNT configuration needed
try:
from lmnt.api import Speech
@@ -50,13 +49,16 @@ class LmntTTSService(TTSService):
super().__init__(push_stop_frames=True, sample_rate=sample_rate, **kwargs)
self._api_key = api_key
self._voice_id = voice_id
self._output_format = {
"container": "raw",
"encoding": "pcm_s16le",
"sample_rate": sample_rate,
self._settings = {
"output_format": {
"container": "raw",
"encoding": "pcm_s16le",
"sample_rate": sample_rate,
},
"language": language,
}
self._language = language
self.set_voice(voice_id)
self._speech = None
self._connection = None
@@ -68,10 +70,6 @@ class LmntTTSService(TTSService):
def can_generate_metrics(self) -> bool:
return True
async def set_voice(self, voice: str):
logger.debug(f"Switching TTS voice to: [{voice}]")
self._voice_id = voice
async def start(self, frame: StartFrame):
await super().start(frame)
await self._connect()
@@ -93,7 +91,9 @@ class LmntTTSService(TTSService):
try:
self._speech = Speech()
self._connection = await self._speech.synthesize_streaming(
self._voice_id, format="raw", sample_rate=self._output_format["sample_rate"]
self._voice_id,
format="raw",
sample_rate=self._settings["output_format"]["sample_rate"],
)
self._receive_task = self.get_event_loop().create_task(self._receive_task_handler())
except Exception as e:
@@ -130,7 +130,7 @@ class LmntTTSService(TTSService):
await self.stop_ttfb_metrics()
frame = TTSAudioRawFrame(
audio=msg["audio"],
sample_rate=self._output_format["sample_rate"],
sample_rate=self._settings["output_format"]["sample_rate"],
num_channels=1,
)
await self.push_frame(frame)

View File

@@ -24,7 +24,7 @@ from pipecat.frames.frames import (
LLMFullResponseEndFrame,
LLMFullResponseStartFrame,
LLMMessagesFrame,
ServiceUpdateSettingsFrame,
LLMUpdateSettingsFrame,
StartInterruptionFrame,
TextFrame,
TTSAudioRawFrame,
@@ -111,14 +111,16 @@ class BaseOpenAILLMService(LLMService):
**kwargs,
):
super().__init__(**kwargs)
self._settings = {
"frequency_penalty": params.frequency_penalty,
"presence_penalty": params.presence_penalty,
"seed": params.seed,
"temperature": params.temperature,
"top_p": params.top_p,
"extra": params.extra if isinstance(params.extra, dict) else {},
}
self.set_model_name(model)
self._client = self.create_client(api_key=api_key, base_url=base_url, **kwargs)
self._frequency_penalty = params.frequency_penalty
self._presence_penalty = params.presence_penalty
self._seed = params.seed
self._temperature = params.temperature
self._top_p = params.top_p
self._extra = params.extra if isinstance(params.extra, dict) else {}
def create_client(self, api_key=None, base_url=None, **kwargs):
return AsyncOpenAI(
@@ -134,30 +136,6 @@ class BaseOpenAILLMService(LLMService):
def can_generate_metrics(self) -> bool:
return True
async def set_frequency_penalty(self, frequency_penalty: float):
logger.debug(f"Switching LLM frequency_penalty to: [{frequency_penalty}]")
self._frequency_penalty = frequency_penalty
async def set_presence_penalty(self, presence_penalty: float):
logger.debug(f"Switching LLM presence_penalty to: [{presence_penalty}]")
self._presence_penalty = presence_penalty
async def set_seed(self, seed: int):
logger.debug(f"Switching LLM seed to: [{seed}]")
self._seed = seed
async def set_temperature(self, temperature: float):
logger.debug(f"Switching LLM temperature to: [{temperature}]")
self._temperature = temperature
async def set_top_p(self, top_p: float):
logger.debug(f"Switching LLM top_p to: [{top_p}]")
self._top_p = top_p
async def set_extra(self, extra: Dict[str, Any]):
logger.debug(f"Switching LLM extra to: [{extra}]")
self._extra = extra
async def get_chat_completions(
self, context: OpenAILLMContext, messages: List[ChatCompletionMessageParam]
) -> AsyncStream[ChatCompletionChunk]:
@@ -168,14 +146,14 @@ class BaseOpenAILLMService(LLMService):
"tools": context.tools,
"tool_choice": context.tool_choice,
"stream_options": {"include_usage": True},
"frequency_penalty": self._frequency_penalty,
"presence_penalty": self._presence_penalty,
"seed": self._seed,
"temperature": self._temperature,
"top_p": self._top_p,
"frequency_penalty": self._settings["frequency_penalty"],
"presence_penalty": self._settings["presence_penalty"],
"seed": self._settings["seed"],
"temperature": self._settings["temperature"],
"top_p": self._settings["top_p"],
}
params.update(self._extra)
params.update(self._settings["extra"])
chunks = await self._client.chat.completions.create(**params)
return chunks
@@ -295,17 +273,6 @@ class BaseOpenAILLMService(LLMService):
f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function."
)
async def _update_settings(self, settings: Dict[str, Any]):
for key, value in settings.items():
setter = getattr(self, f"set_{key}", None)
if setter and callable(setter):
try:
await setter(value)
except Exception as e:
logger.warning(f"Error setting {key}: {e}")
else:
logger.warning(f"Unknown setting for OpenAI LLM service: {key}")
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
@@ -316,7 +283,7 @@ class BaseOpenAILLMService(LLMService):
context = OpenAILLMContext.from_messages(frame.messages)
elif isinstance(frame, VisionImageRawFrame):
context = OpenAILLMContext.from_image_frame(frame)
elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "llm":
elif isinstance(frame, LLMUpdateSettingsFrame):
await self._update_settings(frame.settings)
else:
await self.push_frame(frame, direction)
@@ -414,29 +381,27 @@ class OpenAITTSService(TTSService):
self,
*,
api_key: str | None = None,
voice: str = "alloy",
voice_id: str = "alloy",
model: Literal["tts-1", "tts-1-hd"] = "tts-1",
sample_rate: int = 24000,
**kwargs,
):
super().__init__(sample_rate=sample_rate, **kwargs)
self._voice: ValidVoice = VALID_VOICES.get(voice, "alloy")
self._settings = {
"sample_rate": sample_rate,
}
self.set_model_name(model)
self._sample_rate = sample_rate
self.set_voice(voice_id)
self._client = AsyncOpenAI(api_key=api_key)
def can_generate_metrics(self) -> bool:
return True
async def set_voice(self, voice: str):
logger.debug(f"Switching TTS voice to: [{voice}]")
self._voice = VALID_VOICES.get(voice, self._voice)
async def set_model(self, model: str):
logger.debug(f"Switching TTS model to: [{model}]")
self._model = model
self.set_model_name(model)
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
@@ -446,7 +411,7 @@ class OpenAITTSService(TTSService):
async with self._client.audio.speech.with_streaming_response.create(
input=text,
model=self.model_name,
voice=self._voice,
voice=VALID_VOICES[self._voice_id],
response_format="pcm",
) as r:
if r.status_code != 200:
@@ -465,7 +430,7 @@ class OpenAITTSService(TTSService):
async for chunk in r.iter_bytes(8192):
if len(chunk) > 0:
await self.stop_ttfb_metrics()
frame = TTSAudioRawFrame(chunk, self.sample_rate, 1)
frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
yield frame
await self.push_frame(TTSStoppedFrame())
except BadRequestError as e:

View File

@@ -6,17 +6,21 @@
import io
import struct
from typing import AsyncGenerator
from pipecat.frames.frames import Frame, TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame
from pipecat.services.ai_services import TTSService
from loguru import logger
from pipecat.frames.frames import (
Frame,
TTSAudioRawFrame,
TTSStartedFrame,
TTSStoppedFrame,
)
from pipecat.services.ai_services import TTSService
try:
from pyht.client import TTSOptions
from pyht.async_client import AsyncClient
from pyht.client import TTSOptions
from pyht.protos.api_pb2 import Format
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
@@ -28,7 +32,7 @@ except ModuleNotFoundError as e:
class PlayHTTTSService(TTSService):
def __init__(
self, *, api_key: str, user_id: str, voice_url: str, sample_rate: int = 16000, **kwargs
self, *, api_key: str, user_id: str, voice_id: str, sample_rate: int = 16000, **kwargs
):
super().__init__(sample_rate=sample_rate, **kwargs)
@@ -39,17 +43,23 @@ class PlayHTTTSService(TTSService):
user_id=self._user_id,
api_key=self._speech_key,
)
self._settings = {
"sample_rate": sample_rate,
"quality": "higher",
"format": Format.FORMAT_WAV,
"voice_engine": "PlayHT2.0-turbo",
}
self.set_voice(voice_id)
self._options = TTSOptions(
voice=voice_url, sample_rate=sample_rate, quality="higher", format=Format.FORMAT_WAV
voice=self._voice_id,
sample_rate=self._settings["sample_rate"],
quality=self._settings["quality"],
format=self._settings["format"],
)
def can_generate_metrics(self) -> bool:
return True
async def set_voice(self, voice: str):
logger.debug(f"Switching TTS voice to: [{voice}]")
self._options.voice = voice
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
@@ -60,7 +70,7 @@ class PlayHTTTSService(TTSService):
await self.start_ttfb_metrics()
playht_gen = self._client.tts(
text, voice_engine="PlayHT2.0-turbo", options=self._options
text, voice_engine=self._settings["voice_engine"], options=self._options
)
await self.start_tts_usage_metrics(text)
@@ -83,7 +93,7 @@ class PlayHTTTSService(TTSService):
else:
if len(chunk):
await self.stop_ttfb_metrics()
frame = TTSAudioRawFrame(chunk, 16000, 1)
frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
yield frame
await self.push_frame(TTSStoppedFrame())
except Exception as e:

View File

@@ -50,13 +50,15 @@ class TogetherLLMService(OpenAILLMService):
):
super().__init__(api_key=api_key, base_url=base_url, model=model, params=params, **kwargs)
self.set_model_name(model)
self._max_tokens = params.max_tokens
self._frequency_penalty = params.frequency_penalty
self._presence_penalty = params.presence_penalty
self._temperature = params.temperature
self._top_k = params.top_k
self._top_p = params.top_p
self._extra = params.extra if isinstance(params.extra, dict) else {}
self._settings = {
"max_tokens": params.max_tokens,
"frequency_penalty": params.frequency_penalty,
"presence_penalty": params.presence_penalty,
"seed": params.seed,
"temperature": params.temperature,
"top_p": params.top_p,
"extra": params.extra if isinstance(params.extra, dict) else {},
}
def can_generate_metrics(self) -> bool:
return True
@@ -72,42 +74,3 @@ class TogetherLLMService(OpenAILLMService):
)
),
)
async def set_frequency_penalty(self, frequency_penalty: float):
logger.debug(f"Switching LLM frequency_penalty to: [{frequency_penalty}]")
self._frequency_penalty = frequency_penalty
async def set_max_tokens(self, max_tokens: int):
logger.debug(f"Switching LLM max_tokens to: [{max_tokens}]")
self._max_tokens = max_tokens
async def set_presence_penalty(self, presence_penalty: float):
logger.debug(f"Switching LLM presence_penalty to: [{presence_penalty}]")
self._presence_penalty = presence_penalty
async def set_temperature(self, temperature: float):
logger.debug(f"Switching LLM temperature to: [{temperature}]")
self._temperature = temperature
async def set_top_k(self, top_k: float):
logger.debug(f"Switching LLM top_k to: [{top_k}]")
self._top_k = top_k
async def set_top_p(self, top_p: float):
logger.debug(f"Switching LLM top_p to: [{top_p}]")
self._top_p = top_p
async def set_extra(self, extra: Dict[str, Any]):
logger.debug(f"Switching LLM extra to: [{extra}]")
self._extra = extra
async def _update_settings(self, settings: Dict[str, Any]):
for key, value in settings.items():
setter = getattr(self, f"set_{key}", None)
if setter and callable(setter):
try:
await setter(value)
except Exception as e:
logger.warning(f"Error setting {key}: {e}")
else:
logger.warning(f"Unknown setting for Together LLM service: {key}")

View File

@@ -4,10 +4,12 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
import aiohttp
from typing import Any, AsyncGenerator, Dict
import aiohttp
import numpy as np
from loguru import logger
from pipecat.frames.frames import (
ErrorFrame,
Frame,
@@ -18,10 +20,6 @@ from pipecat.frames.frames import (
)
from pipecat.services.ai_services import TTSService
import numpy as np
from loguru import logger
try:
import resampy
except ModuleNotFoundError as e:
@@ -50,9 +48,11 @@ class XTTSService(TTSService):
):
super().__init__(**kwargs)
self._voice_id = voice_id
self._language = language
self._base_url = base_url
self._settings = {
"language": language,
"base_url": base_url,
}
self.set_voice(voice_id)
self._studio_speakers: Dict[str, Any] | None = None
self._aiohttp_session = aiohttp_session
@@ -61,7 +61,7 @@ class XTTSService(TTSService):
async def start(self, frame: StartFrame):
await super().start(frame)
async with self._aiohttp_session.get(self._base_url + "/studio_speakers") as r:
async with self._aiohttp_session.get(self._settings["base_url"] + "/studio_speakers") as r:
if r.status != 200:
text = await r.text()
logger.error(
@@ -75,10 +75,6 @@ class XTTSService(TTSService):
return
self._studio_speakers = await r.json()
async def set_voice(self, voice: str):
logger.debug(f"Switching TTS voice to: [{voice}]")
self._voice_id = voice
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
@@ -88,11 +84,11 @@ class XTTSService(TTSService):
embeddings = self._studio_speakers[self._voice_id]
url = self._base_url + "/tts_stream"
url = self._settings["base_url"] + "/tts_stream"
payload = {
"text": text.replace(".", "").replace("*", ""),
"language": self._language,
"language": self._settings["language"],
"speaker_embedding": embeddings["speaker_embedding"],
"gpt_cond_latent": embeddings["gpt_cond_latent"],
"add_wav_header": False,