Update to use LLM, STT, TTS subclasses and remove setter methods
This commit is contained in:
@@ -50,7 +50,7 @@ async def main():
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-helios-en")
|
||||
tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice_id="aura-helios-en")
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
|
||||
|
||||
|
||||
@@ -4,11 +4,15 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import aiohttp
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
import aiohttp
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from runner import configure
|
||||
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -17,17 +21,11 @@ from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantResponseAggregator,
|
||||
LLMUserResponseAggregator,
|
||||
)
|
||||
from pipecat.services.playht import PlayHTTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.services.playht import PlayHTTTSService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
@@ -54,7 +52,7 @@ async def main():
|
||||
tts = PlayHTTTSService(
|
||||
user_id=os.getenv("PLAYHT_USER_ID"),
|
||||
api_key=os.getenv("PLAYHT_API_KEY"),
|
||||
voice_url="s3://voice-cloning-zero-shot/801a663f-efd0-4254-98d0-5c175514c3e8/jennifer/manifest.json",
|
||||
voice_id="s3://voice-cloning-zero-shot/801a663f-efd0-4254-98d0-5c175514c3e8/jennifer/manifest.json",
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
|
||||
|
||||
@@ -4,11 +4,15 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import aiohttp
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
import aiohttp
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from runner import configure
|
||||
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -17,17 +21,10 @@ from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantResponseAggregator,
|
||||
LLMUserResponseAggregator,
|
||||
)
|
||||
from pipecat.services.openai import OpenAITTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.services.openai import OpenAILLMService, OpenAITTSService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
@@ -51,7 +48,7 @@ async def main():
|
||||
),
|
||||
)
|
||||
|
||||
tts = OpenAITTSService(api_key=os.getenv("OPENAI_API_KEY"), voice="alloy")
|
||||
tts = OpenAITTSService(api_key=os.getenv("OPENAI_API_KEY"), voice_id="alloy")
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
|
||||
|
||||
|
||||
@@ -5,10 +5,14 @@
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
import aiohttp
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from runner import configure
|
||||
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -26,12 +30,6 @@ from pipecat.transports.services.daily import (
|
||||
)
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
@@ -57,7 +55,7 @@ async def main():
|
||||
tts = DeepgramTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("DEEPGRAM_API_KEY"),
|
||||
voice="aura-asteria-en",
|
||||
voice_id="aura-asteria-en",
|
||||
base_url="http://0.0.0.0:8080/v1/speak",
|
||||
)
|
||||
|
||||
|
||||
@@ -530,10 +530,24 @@ class UserImageRequestFrame(ControlFrame):
|
||||
class ServiceUpdateSettingsFrame(ControlFrame):
|
||||
"""A control frame containing a request to update service settings."""
|
||||
|
||||
service_type: str
|
||||
settings: Dict[str, Any]
|
||||
|
||||
|
||||
@dataclass
|
||||
class LLMUpdateSettingsFrame(ServiceUpdateSettingsFrame):
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class TTSUpdateSettingsFrame(ServiceUpdateSettingsFrame):
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class STTUpdateSettingsFrame(ServiceUpdateSettingsFrame):
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class FunctionCallInProgressFrame(SystemFrame):
|
||||
"""A frame signaling that a function call is in progress."""
|
||||
|
||||
@@ -8,7 +8,7 @@ import asyncio
|
||||
import io
|
||||
import wave
|
||||
from abc import abstractmethod
|
||||
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
|
||||
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
|
||||
|
||||
from loguru import logger
|
||||
|
||||
@@ -19,14 +19,15 @@ from pipecat.frames.frames import (
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
LLMFullResponseEndFrame,
|
||||
ServiceUpdateSettingsFrame,
|
||||
StartFrame,
|
||||
StartInterruptionFrame,
|
||||
STTUpdateSettingsFrame,
|
||||
TextFrame,
|
||||
TTSAudioRawFrame,
|
||||
TTSSpeakFrame,
|
||||
TTSStartedFrame,
|
||||
TTSStoppedFrame,
|
||||
TTSUpdateSettingsFrame,
|
||||
UserImageRequestFrame,
|
||||
VisionImageRawFrame,
|
||||
)
|
||||
@@ -44,6 +45,7 @@ class AIService(FrameProcessor):
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self._model_name: str = ""
|
||||
self._settings: Dict[str, Any] = {}
|
||||
|
||||
@property
|
||||
def model_name(self) -> str:
|
||||
@@ -62,6 +64,16 @@ class AIService(FrameProcessor):
|
||||
async def cancel(self, frame: CancelFrame):
|
||||
pass
|
||||
|
||||
async def _update_settings(self, settings: Dict[str, Any]):
|
||||
for key, value in settings.items():
|
||||
if key in self._settings:
|
||||
logger.debug(f"Updating setting {key} to: [{value}] for {self.name}")
|
||||
self._settings[key] = value
|
||||
elif key == "model":
|
||||
self.set_model_name(value)
|
||||
else:
|
||||
logger.warning(f"Unknown setting for {self.name} service: {key}")
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
@@ -168,6 +180,7 @@ class TTSService(AIService):
|
||||
self._push_stop_frames: bool = push_stop_frames
|
||||
self._stop_frame_timeout_s: float = stop_frame_timeout_s
|
||||
self._sample_rate: int = sample_rate
|
||||
self._voice_id: str = ""
|
||||
self._settings: Dict[str, Any] = {}
|
||||
|
||||
self._stop_frame_task: Optional[asyncio.Task] = None
|
||||
@@ -184,60 +197,8 @@ class TTSService(AIService):
|
||||
self.set_model_name(model)
|
||||
|
||||
@abstractmethod
|
||||
async def set_voice(self, voice: str):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def set_language(self, language: Language):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def set_speed(self, speed: Union[str, float]):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def set_emotion(self, emotion: List[str]):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def set_engine(self, engine: str):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def set_pitch(self, pitch: str):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def set_rate(self, rate: str):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def set_volume(self, volume: str):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def set_emphasis(self, emphasis: str):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def set_style(self, style: str):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def set_style_degree(self, style_degree: str):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def set_role(self, role: str):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def set_gender(self, gender: str):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def set_google_style(self, google_style: str):
|
||||
pass
|
||||
def set_voice(self, voice: str):
|
||||
self._voice_id = voice
|
||||
|
||||
@abstractmethod
|
||||
async def flush_audio(self):
|
||||
@@ -269,20 +230,18 @@ class TTSService(AIService):
|
||||
|
||||
async def _update_settings(self, settings: Dict[str, Any]):
|
||||
for key, value in settings.items():
|
||||
setter = getattr(self, f"set_{key}", None)
|
||||
if setter and callable(setter):
|
||||
try:
|
||||
if key == "language":
|
||||
await setter(Language(value))
|
||||
else:
|
||||
await setter(value)
|
||||
except Exception as e:
|
||||
logger.warning(f"Error setting {key}: {e}")
|
||||
if key in self._settings:
|
||||
logger.debug(f"Updating TTS setting {key} to: [{value}]")
|
||||
self._settings[key] = value
|
||||
if key == "language":
|
||||
self._settings[key] = Language(value)
|
||||
elif key == "model":
|
||||
self.set_model_name(value)
|
||||
elif key == "voice":
|
||||
self.set_voice(value)
|
||||
else:
|
||||
logger.warning(f"Unknown setting for TTS service: {key}")
|
||||
|
||||
self._settings.update(settings)
|
||||
|
||||
async def say(self, text: str):
|
||||
aggregate_sentences = self._aggregate_sentences
|
||||
self._aggregate_sentences = False
|
||||
@@ -309,7 +268,7 @@ class TTSService(AIService):
|
||||
elif isinstance(frame, TTSSpeakFrame):
|
||||
await self._push_tts_frames(frame.text)
|
||||
await self.flush_audio()
|
||||
elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "tts":
|
||||
elif isinstance(frame, TTSUpdateSettingsFrame):
|
||||
await self._update_settings(frame.settings)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
@@ -448,31 +407,24 @@ class STTService(AIService):
|
||||
async def set_model(self, model: str):
|
||||
self.set_model_name(model)
|
||||
|
||||
@abstractmethod
|
||||
async def set_language(self, language: Language):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
|
||||
"""Returns transcript as a string"""
|
||||
pass
|
||||
|
||||
async def _update_settings(self, settings: Dict[str, Any]):
|
||||
logger.debug(f"Updating STT settings: {self._settings}")
|
||||
for key, value in settings.items():
|
||||
setter = getattr(self, f"set_{key}", None)
|
||||
if setter and callable(setter):
|
||||
try:
|
||||
if key == "language":
|
||||
await setter(Language(value))
|
||||
else:
|
||||
await setter(value)
|
||||
except Exception as e:
|
||||
logger.warning(f"Error setting {key}: {e}")
|
||||
if key in self._settings:
|
||||
logger.debug(f"Updating STT setting {key} to: [{value}]")
|
||||
self._settings[key] = value
|
||||
if key == "language":
|
||||
self._settings[key] = Language(value)
|
||||
elif key == "model":
|
||||
self.set_model_name(value)
|
||||
else:
|
||||
logger.warning(f"Unknown setting for STT service: {key}")
|
||||
|
||||
self._settings.update(settings)
|
||||
|
||||
async def process_audio_frame(self, frame: AudioRawFrame):
|
||||
await self.process_generator(self.run_stt(frame.audio))
|
||||
|
||||
@@ -484,7 +436,7 @@ class STTService(AIService):
|
||||
# In this service we accumulate audio internally and at the end we
|
||||
# push a TextFrame. We don't really want to push audio frames down.
|
||||
await self.process_audio_frame(frame)
|
||||
elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "stt":
|
||||
elif isinstance(frame, STTUpdateSettingsFrame):
|
||||
await self._update_settings(frame.settings)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
@@ -25,7 +25,7 @@ from pipecat.frames.frames import (
|
||||
LLMFullResponseEndFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMMessagesFrame,
|
||||
ServiceUpdateSettingsFrame,
|
||||
LLMUpdateSettingsFrame,
|
||||
StartInterruptionFrame,
|
||||
TextFrame,
|
||||
UserImageRawFrame,
|
||||
@@ -96,12 +96,14 @@ class AnthropicLLMService(LLMService):
|
||||
super().__init__(**kwargs)
|
||||
self._client = AsyncAnthropic(api_key=api_key)
|
||||
self.set_model_name(model)
|
||||
self._max_tokens = params.max_tokens
|
||||
self._enable_prompt_caching_beta: bool = params.enable_prompt_caching_beta or False
|
||||
self._temperature = params.temperature
|
||||
self._top_k = params.top_k
|
||||
self._top_p = params.top_p
|
||||
self._extra = params.extra if isinstance(params.extra, dict) else {}
|
||||
self._settings = {
|
||||
"max_tokens": params.max_tokens,
|
||||
"enable_prompt_caching_beta": params.enable_prompt_caching_beta or False,
|
||||
"temperature": params.temperature,
|
||||
"top_k": params.top_k,
|
||||
"top_p": params.top_p,
|
||||
"extra": params.extra if isinstance(params.extra, dict) else {},
|
||||
}
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
return True
|
||||
@@ -120,30 +122,6 @@ class AnthropicLLMService(LLMService):
|
||||
)
|
||||
return AnthropicContextAggregatorPair(_user=user, _assistant=assistant)
|
||||
|
||||
async def set_enable_prompt_caching_beta(self, enable_prompt_caching_beta: bool):
|
||||
logger.debug(f"Switching LLM enable_prompt_caching_beta to: [{enable_prompt_caching_beta}]")
|
||||
self._enable_prompt_caching_beta = enable_prompt_caching_beta
|
||||
|
||||
async def set_max_tokens(self, max_tokens: int):
|
||||
logger.debug(f"Switching LLM max_tokens to: [{max_tokens}]")
|
||||
self._max_tokens = max_tokens
|
||||
|
||||
async def set_temperature(self, temperature: float):
|
||||
logger.debug(f"Switching LLM temperature to: [{temperature}]")
|
||||
self._temperature = temperature
|
||||
|
||||
async def set_top_k(self, top_k: float):
|
||||
logger.debug(f"Switching LLM top_k to: [{top_k}]")
|
||||
self._top_k = top_k
|
||||
|
||||
async def set_top_p(self, top_p: float):
|
||||
logger.debug(f"Switching LLM top_p to: [{top_p}]")
|
||||
self._top_p = top_p
|
||||
|
||||
async def set_extra(self, extra: Dict[str, Any]):
|
||||
logger.debug(f"Switching LLM extra to: [{extra}]")
|
||||
self._extra = extra
|
||||
|
||||
async def _process_context(self, context: OpenAILLMContext):
|
||||
# Usage tracking. We track the usage reported by Anthropic in prompt_tokens and
|
||||
# completion_tokens. We also estimate the completion tokens from output text
|
||||
@@ -165,11 +143,11 @@ class AnthropicLLMService(LLMService):
|
||||
)
|
||||
|
||||
messages = context.messages
|
||||
if self._enable_prompt_caching_beta:
|
||||
if self._settings["enable_prompt_caching_beta"]:
|
||||
messages = context.get_messages_with_cache_control_markers()
|
||||
|
||||
api_call = self._client.messages.create
|
||||
if self._enable_prompt_caching_beta:
|
||||
if self._settings["enable_prompt_caching_beta"]:
|
||||
api_call = self._client.beta.prompt_caching.messages.create
|
||||
|
||||
await self.start_ttfb_metrics()
|
||||
@@ -179,14 +157,14 @@ class AnthropicLLMService(LLMService):
|
||||
"system": context.system,
|
||||
"messages": messages,
|
||||
"model": self.model_name,
|
||||
"max_tokens": self._max_tokens,
|
||||
"max_tokens": self._settings["max_tokens"],
|
||||
"stream": True,
|
||||
"temperature": self._temperature,
|
||||
"top_k": self._top_k,
|
||||
"top_p": self._top_p,
|
||||
"temperature": self._settings["temperature"],
|
||||
"top_k": self._settings["top_k"],
|
||||
"top_p": self._settings["top_p"],
|
||||
}
|
||||
|
||||
params.update(self._extra)
|
||||
params.update(self._settings["extra"])
|
||||
|
||||
response = await api_call(**params)
|
||||
|
||||
@@ -284,17 +262,6 @@ class AnthropicLLMService(LLMService):
|
||||
cache_read_input_tokens=cache_read_input_tokens,
|
||||
)
|
||||
|
||||
async def _update_settings(self, settings: Dict[str, Any]):
|
||||
for key, value in settings.items():
|
||||
setter = getattr(self, f"set_{key}", None)
|
||||
if setter and callable(setter):
|
||||
try:
|
||||
await setter(value)
|
||||
except Exception as e:
|
||||
logger.warning(f"Error setting {key}: {e}")
|
||||
else:
|
||||
logger.warning(f"Unknown setting for Anthropic LLM service: {key}")
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
@@ -309,11 +276,11 @@ class AnthropicLLMService(LLMService):
|
||||
# UserImageRawFrames coming through the pipeline and add them
|
||||
# to the context.
|
||||
context = AnthropicLLMContext.from_image_frame(frame)
|
||||
elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "llm":
|
||||
elif isinstance(frame, LLMUpdateSettingsFrame):
|
||||
await self._update_settings(frame.settings)
|
||||
elif isinstance(frame, LLMEnablePromptCachingFrame):
|
||||
logger.debug(f"Setting enable prompt caching to: [{frame.enable}]")
|
||||
self._enable_prompt_caching_beta = frame.enable
|
||||
self._settings["enable_prompt_caching_beta"] = frame.enable
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
|
||||
from typing import AsyncGenerator, Optional
|
||||
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
@@ -17,8 +18,6 @@ from pipecat.frames.frames import (
|
||||
)
|
||||
from pipecat.services.ai_services import TTSService
|
||||
|
||||
from loguru import logger
|
||||
|
||||
try:
|
||||
import boto3
|
||||
from botocore.exceptions import BotoCoreError, ClientError
|
||||
@@ -57,9 +56,16 @@ class AWSTTSService(TTSService):
|
||||
aws_secret_access_key=api_key,
|
||||
region_name=region,
|
||||
)
|
||||
self._voice_id = voice_id
|
||||
self._sample_rate = sample_rate
|
||||
self._params = params
|
||||
self._settings = {
|
||||
"sample_rate": sample_rate,
|
||||
"engine": params.engine,
|
||||
"language": params.language,
|
||||
"pitch": params.pitch,
|
||||
"rate": params.rate,
|
||||
"volume": params.volume,
|
||||
}
|
||||
|
||||
self.set_voice(voice_id)
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
return True
|
||||
@@ -67,18 +73,18 @@ class AWSTTSService(TTSService):
|
||||
def _construct_ssml(self, text: str) -> str:
|
||||
ssml = "<speak>"
|
||||
|
||||
if self._params.language:
|
||||
ssml += f"<lang xml:lang='{self._params.language}'>"
|
||||
if self._settings["language"]:
|
||||
ssml += f"<lang xml:lang='{self._settings["language"]}'>"
|
||||
|
||||
prosody_attrs = []
|
||||
# Prosody tags are only supported for standard and neural engines
|
||||
if self._params.engine != "generative":
|
||||
if self._params.rate:
|
||||
prosody_attrs.append(f"rate='{self._params.rate}'")
|
||||
if self._params.pitch:
|
||||
prosody_attrs.append(f"pitch='{self._params.pitch}'")
|
||||
if self._params.volume:
|
||||
prosody_attrs.append(f"volume='{self._params.volume}'")
|
||||
if self._settings["engine"] != "generative":
|
||||
if self._settings["rate"]:
|
||||
prosody_attrs.append(f"rate='{self._settings["rate"]}'")
|
||||
if self._settings["pitch"]:
|
||||
prosody_attrs.append(f"pitch='{self._settings["pitch"]}'")
|
||||
if self._settings["volume"]:
|
||||
prosody_attrs.append(f"volume='{self._settings["volume"]}'")
|
||||
|
||||
if prosody_attrs:
|
||||
ssml += f"<prosody {' '.join(prosody_attrs)}>"
|
||||
@@ -90,41 +96,13 @@ class AWSTTSService(TTSService):
|
||||
if prosody_attrs:
|
||||
ssml += "</prosody>"
|
||||
|
||||
if self._params.language:
|
||||
if self._settings["language"]:
|
||||
ssml += "</lang>"
|
||||
|
||||
ssml += "</speak>"
|
||||
|
||||
return ssml
|
||||
|
||||
async def set_voice(self, voice: str):
|
||||
logger.debug(f"Switching TTS voice to: [{voice}]")
|
||||
self._voice_id = voice
|
||||
|
||||
async def set_engine(self, engine: str):
|
||||
logger.debug(f"Switching TTS engine to: [{engine}]")
|
||||
self._params.engine = engine
|
||||
|
||||
async def set_language(self, language: str):
|
||||
logger.debug(f"Switching TTS language to: [{language}]")
|
||||
self._params.language = language
|
||||
|
||||
async def set_pitch(self, pitch: str):
|
||||
logger.debug(f"Switching TTS pitch to: [{pitch}]")
|
||||
self._params.pitch = pitch
|
||||
|
||||
async def set_rate(self, rate: str):
|
||||
logger.debug(f"Switching TTS rate to: [{rate}]")
|
||||
self._params.rate = rate
|
||||
|
||||
async def set_volume(self, volume: str):
|
||||
logger.debug(f"Switching TTS volume to: [{volume}]")
|
||||
self._params.volume = volume
|
||||
|
||||
async def set_params(self, params: InputParams):
|
||||
logger.debug(f"Switching TTS params to: [{params}]")
|
||||
self._params = params
|
||||
|
||||
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
||||
logger.debug(f"Generating TTS: [{text}]")
|
||||
|
||||
@@ -139,8 +117,8 @@ class AWSTTSService(TTSService):
|
||||
"TextType": "ssml",
|
||||
"OutputFormat": "pcm",
|
||||
"VoiceId": self._voice_id,
|
||||
"Engine": self._params.engine,
|
||||
"SampleRate": str(self._sample_rate),
|
||||
"Engine": self._settings["engine"],
|
||||
"SampleRate": str(self._settings["sample_rate"]),
|
||||
}
|
||||
|
||||
# Filter out None values
|
||||
@@ -160,7 +138,7 @@ class AWSTTSService(TTSService):
|
||||
chunk = audio_data[i : i + chunk_size]
|
||||
if len(chunk) > 0:
|
||||
await self.stop_ttfb_metrics()
|
||||
frame = TTSAudioRawFrame(chunk, self._sample_rate, 1)
|
||||
frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
|
||||
yield frame
|
||||
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
|
||||
@@ -4,12 +4,13 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import aiohttp
|
||||
import asyncio
|
||||
import io
|
||||
|
||||
from typing import AsyncGenerator, Optional
|
||||
|
||||
import aiohttp
|
||||
from loguru import logger
|
||||
from PIL import Image
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
@@ -28,10 +29,6 @@ from pipecat.services.ai_services import ImageGenService, STTService, TTSService
|
||||
from pipecat.services.openai import BaseOpenAILLMService
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from loguru import logger
|
||||
|
||||
# See .env.example for Azure configuration needed
|
||||
try:
|
||||
from azure.cognitiveservices.speech import (
|
||||
@@ -89,7 +86,7 @@ class AzureTTSService(TTSService):
|
||||
*,
|
||||
api_key: str,
|
||||
region: str,
|
||||
voice="en-US-SaraNeural",
|
||||
voice_id="en-US-SaraNeural",
|
||||
sample_rate: int = 16000,
|
||||
params: InputParams = InputParams(),
|
||||
**kwargs,
|
||||
@@ -99,114 +96,67 @@ class AzureTTSService(TTSService):
|
||||
speech_config = SpeechConfig(subscription=api_key, region=region)
|
||||
self._speech_synthesizer = SpeechSynthesizer(speech_config=speech_config, audio_config=None)
|
||||
|
||||
self._voice = voice
|
||||
self._sample_rate = sample_rate
|
||||
self._params = params
|
||||
self._settings = {
|
||||
"sample_rate": sample_rate,
|
||||
"emphasis": params.emphasis,
|
||||
"language": params.language,
|
||||
"pitch": params.pitch,
|
||||
"rate": params.rate,
|
||||
"role": params.role,
|
||||
"style": params.style,
|
||||
"style_degree": params.style_degree,
|
||||
"volume": params.volume,
|
||||
}
|
||||
|
||||
self.set_voice(voice_id)
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
return True
|
||||
|
||||
def _construct_ssml(self, text: str) -> str:
|
||||
ssml = (
|
||||
f"<speak version='1.0' xml:lang='{self._params.language}' "
|
||||
f"<speak version='1.0' xml:lang='{self._settings['language']}' "
|
||||
"xmlns='http://www.w3.org/2001/10/synthesis' "
|
||||
"xmlns:mstts='http://www.w3.org/2001/mstts'>"
|
||||
f"<voice name='{self._voice}'>"
|
||||
f"<voice name='{self._voice_id}'>"
|
||||
"<mstts:silence type='Sentenceboundary' value='20ms' />"
|
||||
)
|
||||
|
||||
if self._params.style:
|
||||
ssml += f"<mstts:express-as style='{self._params.style}'"
|
||||
if self._params.style_degree:
|
||||
ssml += f" styledegree='{self._params.style_degree}'"
|
||||
if self._params.role:
|
||||
ssml += f" role='{self._params.role}'"
|
||||
if self._settings["style"]:
|
||||
ssml += f"<mstts:express-as style='{self._settings['style']}'"
|
||||
if self._settings["style_degree"]:
|
||||
ssml += f" styledegree='{self._settings['style_degree']}'"
|
||||
if self._settings["role"]:
|
||||
ssml += f" role='{self._settings['role']}'"
|
||||
ssml += ">"
|
||||
|
||||
prosody_attrs = []
|
||||
if self._params.rate:
|
||||
prosody_attrs.append(f"rate='{self._params.rate}'")
|
||||
if self._params.pitch:
|
||||
prosody_attrs.append(f"pitch='{self._params.pitch}'")
|
||||
if self._params.volume:
|
||||
prosody_attrs.append(f"volume='{self._params.volume}'")
|
||||
if self._settings["rate"]:
|
||||
prosody_attrs.append(f"rate='{self._settings['rate']}'")
|
||||
if self._settings["pitch"]:
|
||||
prosody_attrs.append(f"pitch='{self._settings['pitch']}'")
|
||||
if self._settings["volume"]:
|
||||
prosody_attrs.append(f"volume='{self._settings['volume']}'")
|
||||
|
||||
ssml += f"<prosody {' '.join(prosody_attrs)}>"
|
||||
|
||||
if self._params.emphasis:
|
||||
ssml += f"<emphasis level='{self._params.emphasis}'>"
|
||||
if self._settings["emphasis"]:
|
||||
ssml += f"<emphasis level='{self._settings['emphasis']}'>"
|
||||
|
||||
ssml += text
|
||||
|
||||
if self._params.emphasis:
|
||||
if self._settings["emphasis"]:
|
||||
ssml += "</emphasis>"
|
||||
|
||||
ssml += "</prosody>"
|
||||
|
||||
if self._params.style:
|
||||
if self._settings["style"]:
|
||||
ssml += "</mstts:express-as>"
|
||||
|
||||
ssml += "</voice></speak>"
|
||||
|
||||
return ssml
|
||||
|
||||
async def set_voice(self, voice: str):
|
||||
logger.debug(f"Switching TTS voice to: [{voice}]")
|
||||
self._voice = voice
|
||||
|
||||
async def set_emphasis(self, emphasis: str):
|
||||
logger.debug(f"Setting TTS emphasis to: [{emphasis}]")
|
||||
self._params.emphasis = emphasis
|
||||
|
||||
async def set_language(self, language: str):
|
||||
logger.debug(f"Setting TTS language code to: [{language}]")
|
||||
self._params.language = language
|
||||
|
||||
async def set_pitch(self, pitch: str):
|
||||
logger.debug(f"Setting TTS pitch to: [{pitch}]")
|
||||
self._params.pitch = pitch
|
||||
|
||||
async def set_rate(self, rate: str):
|
||||
logger.debug(f"Setting TTS rate to: [{rate}]")
|
||||
self._params.rate = rate
|
||||
|
||||
async def set_role(self, role: str):
|
||||
logger.debug(f"Setting TTS role to: [{role}]")
|
||||
self._params.role = role
|
||||
|
||||
async def set_style(self, style: str):
|
||||
logger.debug(f"Setting TTS style to: [{style}]")
|
||||
self._params.style = style
|
||||
|
||||
async def set_style_degree(self, style_degree: str):
|
||||
logger.debug(f"Setting TTS style degree to: [{style_degree}]")
|
||||
self._params.style_degree = style_degree
|
||||
|
||||
async def set_volume(self, volume: str):
|
||||
logger.debug(f"Setting TTS volume to: [{volume}]")
|
||||
self._params.volume = volume
|
||||
|
||||
async def set_params(self, **kwargs):
|
||||
valid_params = {
|
||||
"voice": self.set_voice,
|
||||
"emphasis": self.set_emphasis,
|
||||
"language_code": self.set_language,
|
||||
"pitch": self.set_pitch,
|
||||
"rate": self.set_rate,
|
||||
"role": self.set_role,
|
||||
"style": self.set_style,
|
||||
"style_degree": self.set_style_degree,
|
||||
"volume": self.set_volume,
|
||||
}
|
||||
|
||||
for param, value in kwargs.items():
|
||||
if param in valid_params:
|
||||
await valid_params[param](value)
|
||||
else:
|
||||
logger.warning(f"Ignoring unknown parameter: {param}")
|
||||
|
||||
logger.debug(f"Updated TTS parameters: {', '.join(kwargs.keys())}")
|
||||
|
||||
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
||||
logger.debug(f"Generating TTS: [{text}]")
|
||||
|
||||
@@ -222,7 +172,9 @@ class AzureTTSService(TTSService):
|
||||
await self.push_frame(TTSStartedFrame())
|
||||
# Azure always sends a 44-byte header. Strip it off.
|
||||
yield TTSAudioRawFrame(
|
||||
audio=result.audio_data[44:], sample_rate=self._sample_rate, num_channels=1
|
||||
audio=result.audio_data[44:],
|
||||
sample_rate=self._settings["sample_rate"],
|
||||
num_channels=1,
|
||||
)
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
elif result.reason == ResultReason.Canceled:
|
||||
|
||||
@@ -4,36 +4,35 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
import uuid
|
||||
import base64
|
||||
import asyncio
|
||||
from typing import AsyncGenerator, List, Optional, Union
|
||||
|
||||
from typing import AsyncGenerator, Optional, Union, List
|
||||
from loguru import logger
|
||||
from pydantic.main import BaseModel
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
EndFrame,
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
StartInterruptionFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
StartFrame,
|
||||
EndFrame,
|
||||
StartInterruptionFrame,
|
||||
TTSAudioRawFrame,
|
||||
TTSStartedFrame,
|
||||
TTSStoppedFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.ai_services import TTSService, WordTTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.services.ai_services import WordTTSService, TTSService
|
||||
|
||||
from loguru import logger
|
||||
|
||||
# See .env.example for Cartesia configuration needed
|
||||
try:
|
||||
from cartesia import AsyncCartesia
|
||||
import websockets
|
||||
from cartesia import AsyncCartesia
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error(
|
||||
@@ -66,7 +65,7 @@ class CartesiaTTSService(WordTTSService):
|
||||
encoding: Optional[str] = "pcm_s16le"
|
||||
sample_rate: Optional[int] = 16000
|
||||
container: Optional[str] = "raw"
|
||||
language: Optional[str] = "en"
|
||||
language: Optional[Language] = Language.EN
|
||||
speed: Optional[Union[str, float]] = ""
|
||||
emotion: Optional[List[str]] = []
|
||||
|
||||
@@ -77,7 +76,7 @@ class CartesiaTTSService(WordTTSService):
|
||||
voice_id: str,
|
||||
cartesia_version: str = "2024-06-10",
|
||||
url: str = "wss://api.cartesia.ai/tts/websocket",
|
||||
model_id: str = "sonic-english",
|
||||
model: str = "sonic-english",
|
||||
params: InputParams = InputParams(),
|
||||
**kwargs,
|
||||
):
|
||||
@@ -101,17 +100,18 @@ class CartesiaTTSService(WordTTSService):
|
||||
self._api_key = api_key
|
||||
self._cartesia_version = cartesia_version
|
||||
self._url = url
|
||||
self._voice_id = voice_id
|
||||
self._model_id = model_id
|
||||
self.set_model_name(model_id)
|
||||
self._output_format = {
|
||||
"container": params.container,
|
||||
"encoding": params.encoding,
|
||||
"sample_rate": params.sample_rate,
|
||||
self._settings = {
|
||||
"output_format": {
|
||||
"container": params.container,
|
||||
"encoding": params.encoding,
|
||||
"sample_rate": params.sample_rate,
|
||||
},
|
||||
"language": language_to_cartesia_language(params.language) if params.language else None,
|
||||
"speed": params.speed,
|
||||
"emotion": params.emotion,
|
||||
}
|
||||
self._language = params.language
|
||||
self._speed = params.speed
|
||||
self._emotion = params.emotion
|
||||
self.set_model_name(model)
|
||||
self.set_voice(voice_id)
|
||||
|
||||
self._websocket = None
|
||||
self._context_id = None
|
||||
@@ -125,42 +125,28 @@ class CartesiaTTSService(WordTTSService):
|
||||
await super().set_model(model)
|
||||
logger.debug(f"Switching TTS model to: [{model}]")
|
||||
|
||||
async def set_voice(self, voice: str):
|
||||
logger.debug(f"Switching TTS voice to: [{voice}]")
|
||||
self._voice_id = voice
|
||||
|
||||
async def set_speed(self, speed: str):
|
||||
logger.debug(f"Switching TTS speed to: [{speed}]")
|
||||
self._speed = speed
|
||||
|
||||
async def set_emotion(self, emotion: list[str]):
|
||||
logger.debug(f"Switching TTS emotion to: [{emotion}]")
|
||||
self._emotion = emotion
|
||||
|
||||
async def set_language(self, language: Language):
|
||||
logger.debug(f"Switching TTS language to: [{language}]")
|
||||
self._language = language_to_cartesia_language(language)
|
||||
|
||||
def _build_msg(
|
||||
self, text: str = "", continue_transcript: bool = True, add_timestamps: bool = True
|
||||
):
|
||||
voice_config = {"mode": "id", "id": self._voice_id}
|
||||
voice_config = {}
|
||||
voice_config["mode"] = "id"
|
||||
voice_config["id"] = self._voice_id
|
||||
|
||||
if self._speed or self._emotion:
|
||||
if self._settings["speed"] or self._settings["emotion"]:
|
||||
voice_config["__experimental_controls"] = {}
|
||||
if self._speed:
|
||||
voice_config["__experimental_controls"]["speed"] = self._speed
|
||||
if self._emotion:
|
||||
voice_config["__experimental_controls"]["emotion"] = self._emotion
|
||||
if self._settings["speed"]:
|
||||
voice_config["__experimental_controls"]["speed"] = self._settings["speed"]
|
||||
if self._settings["emotion"]:
|
||||
voice_config["__experimental_controls"]["emotion"] = self._settings["emotion"]
|
||||
|
||||
msg = {
|
||||
"transcript": text,
|
||||
"continue": continue_transcript,
|
||||
"context_id": self._context_id,
|
||||
"model_id": self._model_name,
|
||||
"model_id": self.model_name,
|
||||
"voice": voice_config,
|
||||
"output_format": self._output_format,
|
||||
"language": self._language,
|
||||
"output_format": self._settings["output_format"],
|
||||
"language": self._settings["language"],
|
||||
"add_timestamps": add_timestamps,
|
||||
}
|
||||
return json.dumps(msg)
|
||||
@@ -245,7 +231,7 @@ class CartesiaTTSService(WordTTSService):
|
||||
self.start_word_timestamps()
|
||||
frame = TTSAudioRawFrame(
|
||||
audio=base64.b64decode(msg["data"]),
|
||||
sample_rate=self._output_format["sample_rate"],
|
||||
sample_rate=self._settings["output_format"]["sample_rate"],
|
||||
num_channels=1,
|
||||
)
|
||||
await self.push_frame(frame)
|
||||
@@ -303,7 +289,7 @@ class CartesiaHttpTTSService(TTSService):
|
||||
*,
|
||||
api_key: str,
|
||||
voice_id: str,
|
||||
model_id: str = "sonic-english",
|
||||
model: str = "sonic-english",
|
||||
base_url: str = "https://api.cartesia.ai",
|
||||
params: InputParams = InputParams(),
|
||||
**kwargs,
|
||||
@@ -311,17 +297,18 @@ class CartesiaHttpTTSService(TTSService):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self._api_key = api_key
|
||||
self._voice_id = voice_id
|
||||
self._model_id = model_id
|
||||
self.set_model_name(model_id)
|
||||
self._output_format = {
|
||||
"container": params.container,
|
||||
"encoding": params.encoding,
|
||||
"sample_rate": params.sample_rate,
|
||||
self._settings = {
|
||||
"output_format": {
|
||||
"container": params.container,
|
||||
"encoding": params.encoding,
|
||||
"sample_rate": params.sample_rate,
|
||||
},
|
||||
"language": params.language,
|
||||
"speed": params.speed,
|
||||
"emotion": params.emotion,
|
||||
}
|
||||
self._language = params.language
|
||||
self._speed = params.speed
|
||||
self._emotion = params.emotion
|
||||
self.set_voice(voice_id)
|
||||
self.set_model_name(model)
|
||||
|
||||
self._client = AsyncCartesia(api_key=api_key, base_url=base_url)
|
||||
|
||||
@@ -333,22 +320,6 @@ class CartesiaHttpTTSService(TTSService):
|
||||
self._model_id = model
|
||||
await super().set_model(model)
|
||||
|
||||
async def set_voice(self, voice: str):
|
||||
logger.debug(f"Switching TTS voice to: [{voice}]")
|
||||
self._voice_id = voice
|
||||
|
||||
async def set_speed(self, speed: str):
|
||||
logger.debug(f"Switching TTS speed to: [{speed}]")
|
||||
self._speed = speed
|
||||
|
||||
async def set_emotion(self, emotion: list[str]):
|
||||
logger.debug(f"Switching TTS emotion to: [{emotion}]")
|
||||
self._emotion = emotion
|
||||
|
||||
async def set_language(self, language: Language):
|
||||
logger.debug(f"Switching TTS language to: [{language}]")
|
||||
self._language = language_to_cartesia_language(language)
|
||||
|
||||
async def stop(self, frame: EndFrame):
|
||||
await super().stop(frame)
|
||||
await self._client.close()
|
||||
@@ -365,19 +336,19 @@ class CartesiaHttpTTSService(TTSService):
|
||||
|
||||
try:
|
||||
voice_controls = None
|
||||
if self._speed or self._emotion:
|
||||
if self._settings["speed"] or self._settings["emotion"]:
|
||||
voice_controls = {}
|
||||
if self._speed:
|
||||
voice_controls["speed"] = self._speed
|
||||
if self._emotion:
|
||||
voice_controls["emotion"] = self._emotion
|
||||
if self._settings["speed"]:
|
||||
voice_controls["speed"] = self._settings["speed"]
|
||||
if self._settings["emotion"]:
|
||||
voice_controls["emotion"] = self._settings["emotion"]
|
||||
|
||||
output = await self._client.tts.sse(
|
||||
model_id=self._model_id,
|
||||
transcript=text,
|
||||
voice_id=self._voice_id,
|
||||
output_format=self._output_format,
|
||||
language=self._language,
|
||||
output_format=self._settings["output_format"],
|
||||
language=self._settings["language"],
|
||||
stream=False,
|
||||
_experimental_voice_controls=voice_controls,
|
||||
)
|
||||
@@ -386,7 +357,7 @@ class CartesiaHttpTTSService(TTSService):
|
||||
|
||||
frame = TTSAudioRawFrame(
|
||||
audio=output["audio"],
|
||||
sample_rate=self._output_format["sample_rate"],
|
||||
sample_rate=self._settings["output_format"]["sample_rate"],
|
||||
num_channels=1,
|
||||
)
|
||||
yield frame
|
||||
|
||||
@@ -5,9 +5,10 @@
|
||||
#
|
||||
|
||||
import asyncio
|
||||
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
EndFrame,
|
||||
@@ -24,8 +25,6 @@ from pipecat.services.ai_services import STTService, TTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
|
||||
from loguru import logger
|
||||
|
||||
# See .env.example for Deepgram configuration needed
|
||||
try:
|
||||
from deepgram import (
|
||||
@@ -50,32 +49,30 @@ class DeepgramTTSService(TTSService):
|
||||
self,
|
||||
*,
|
||||
api_key: str,
|
||||
voice: str = "aura-helios-en",
|
||||
voice_id: str = "aura-helios-en",
|
||||
sample_rate: int = 16000,
|
||||
encoding: str = "linear16",
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self._voice = voice
|
||||
self._sample_rate = sample_rate
|
||||
self._encoding = encoding
|
||||
self._settings = {
|
||||
"sample_rate": sample_rate,
|
||||
"encoding": encoding,
|
||||
}
|
||||
self.set_voice(voice_id)
|
||||
self._deepgram_client = DeepgramClient(api_key=api_key)
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
return True
|
||||
|
||||
async def set_voice(self, voice: str):
|
||||
logger.debug(f"Switching TTS voice to: [{voice}]")
|
||||
self._voice = voice
|
||||
|
||||
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
||||
logger.debug(f"Generating TTS: [{text}]")
|
||||
|
||||
options = SpeakOptions(
|
||||
model=self._voice,
|
||||
encoding=self._encoding,
|
||||
sample_rate=self._sample_rate,
|
||||
model=self._voice_id,
|
||||
encoding=self._settings["encoding"],
|
||||
sample_rate=self._settings["sample_rate"],
|
||||
container="none",
|
||||
)
|
||||
|
||||
@@ -103,7 +100,9 @@ class DeepgramTTSService(TTSService):
|
||||
chunk = audio_buffer.read(chunk_size)
|
||||
if not chunk:
|
||||
break
|
||||
frame = TTSAudioRawFrame(audio=chunk, sample_rate=self._sample_rate, num_channels=1)
|
||||
frame = TTSAudioRawFrame(
|
||||
audio=chunk, sample_rate=self._settings["sample_rate"], num_channels=1
|
||||
)
|
||||
yield frame
|
||||
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
@@ -135,7 +134,7 @@ class DeepgramSTTService(STTService):
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self._live_options = live_options
|
||||
self._settings = vars(live_options)
|
||||
|
||||
self._client = DeepgramClient(
|
||||
api_key, config=DeepgramClientOptions(url=url, options={"keepalive": "true"})
|
||||
@@ -147,7 +146,7 @@ class DeepgramSTTService(STTService):
|
||||
|
||||
@property
|
||||
def vad_enabled(self):
|
||||
return self._live_options.vad_events
|
||||
return self._settings["vad_events"]
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
return self.vad_enabled
|
||||
@@ -155,13 +154,7 @@ class DeepgramSTTService(STTService):
|
||||
async def set_model(self, model: str):
|
||||
await super().set_model(model)
|
||||
logger.debug(f"Switching STT model to: [{model}]")
|
||||
self._live_options.model = model
|
||||
await self._disconnect()
|
||||
await self._connect()
|
||||
|
||||
async def set_language(self, language: Language):
|
||||
logger.debug(f"Switching STT language to: [{language}]")
|
||||
self._live_options.language = language
|
||||
self._settings["model"] = model
|
||||
await self._disconnect()
|
||||
await self._connect()
|
||||
|
||||
@@ -182,7 +175,7 @@ class DeepgramSTTService(STTService):
|
||||
yield None
|
||||
|
||||
async def _connect(self):
|
||||
if await self._connection.start(self._live_options):
|
||||
if await self._connection.start(self._settings):
|
||||
logger.debug(f"{self}: Connected to Deepgram")
|
||||
else:
|
||||
logger.error(f"{self}: Unable to connect to Deepgram")
|
||||
|
||||
@@ -24,6 +24,7 @@ from pipecat.frames.frames import (
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.ai_services import WordTTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
|
||||
# See .env.example for ElevenLabs configuration needed
|
||||
try:
|
||||
@@ -72,7 +73,7 @@ def calculate_word_times(
|
||||
|
||||
class ElevenLabsTTSService(WordTTSService):
|
||||
class InputParams(BaseModel):
|
||||
language: Optional[str] = None
|
||||
language: Optional[Language] = Language.EN
|
||||
output_format: Literal["pcm_16000", "pcm_22050", "pcm_24000", "pcm_44100"] = "pcm_16000"
|
||||
optimize_streaming_latency: Optional[str] = None
|
||||
stability: Optional[float] = None
|
||||
@@ -124,10 +125,19 @@ class ElevenLabsTTSService(WordTTSService):
|
||||
)
|
||||
|
||||
self._api_key = api_key
|
||||
self._voice_id = voice_id
|
||||
self.set_model_name(model)
|
||||
self._url = url
|
||||
self._params = params
|
||||
self._settings = {
|
||||
"sample_rate": sample_rate_from_output_format(params.output_format),
|
||||
"language": params.language,
|
||||
"output_format": params.output_format,
|
||||
"optimize_streaming_latency": params.optimize_streaming_latency,
|
||||
"stability": params.stability,
|
||||
"similarity_boost": params.similarity_boost,
|
||||
"style": params.style,
|
||||
"use_speaker_boost": params.use_speaker_boost,
|
||||
}
|
||||
self.set_model_name(model)
|
||||
self.set_voice(voice_id)
|
||||
self._voice_settings = self._set_voice_settings()
|
||||
|
||||
# Websocket connection to ElevenLabs.
|
||||
@@ -142,19 +152,22 @@ class ElevenLabsTTSService(WordTTSService):
|
||||
|
||||
def _set_voice_settings(self):
|
||||
voice_settings = {}
|
||||
if self._params.stability is not None and self._params.similarity_boost is not None:
|
||||
voice_settings["stability"] = self._params.stability
|
||||
voice_settings["similarity_boost"] = self._params.similarity_boost
|
||||
if self._params.style is not None:
|
||||
voice_settings["style"] = self._params.style
|
||||
if self._params.use_speaker_boost is not None:
|
||||
voice_settings["use_speaker_boost"] = self._params.use_speaker_boost
|
||||
if (
|
||||
self._settings["stability"] is not None
|
||||
and self._settings["similarity_boost"] is not None
|
||||
):
|
||||
voice_settings["stability"] = self._settings["stability"]
|
||||
voice_settings["similarity_boost"] = self._settings["similarity_boost"]
|
||||
if self._settings["style"] is not None:
|
||||
voice_settings["style"] = self._settings["style"]
|
||||
if self._settings["use_speaker_boost"] is not None:
|
||||
voice_settings["use_speaker_boost"] = self._settings["use_speaker_boost"]
|
||||
else:
|
||||
if self._params.style is not None:
|
||||
if self._settings["style"] is not None:
|
||||
logger.warning(
|
||||
"'style' is set but will not be applied because 'stability' and 'similarity_boost' are not both set."
|
||||
)
|
||||
if self._params.use_speaker_boost is not None:
|
||||
if self._settings["use_speaker_boost"] is not None:
|
||||
logger.warning(
|
||||
"'use_speaker_boost' is set but will not be applied because 'stability' and 'similarity_boost' are not both set."
|
||||
)
|
||||
@@ -167,33 +180,13 @@ class ElevenLabsTTSService(WordTTSService):
|
||||
await self._disconnect()
|
||||
await self._connect()
|
||||
|
||||
async def set_voice(self, voice: str):
|
||||
logger.debug(f"Switching TTS voice to: [{voice}]")
|
||||
self._voice_id = voice
|
||||
await self._disconnect()
|
||||
await self._connect()
|
||||
|
||||
async def set_voice_settings(
|
||||
self,
|
||||
stability: Optional[float] = None,
|
||||
similarity_boost: Optional[float] = None,
|
||||
style: Optional[float] = None,
|
||||
use_speaker_boost: Optional[bool] = None,
|
||||
):
|
||||
self._params.stability = stability if stability is not None else self._params.stability
|
||||
self._params.similarity_boost = (
|
||||
similarity_boost if similarity_boost is not None else self._params.similarity_boost
|
||||
)
|
||||
self._params.style = style if style is not None else self._params.style
|
||||
self._params.use_speaker_boost = (
|
||||
use_speaker_boost if use_speaker_boost is not None else self._params.use_speaker_boost
|
||||
)
|
||||
|
||||
self._set_voice_settings()
|
||||
|
||||
if self._websocket:
|
||||
msg = {"voice_settings": self._voice_settings}
|
||||
await self._websocket.send(json.dumps(msg))
|
||||
async def _update_settings(self, settings: Dict[str, Any]):
|
||||
prev_voice = self._voice_id
|
||||
await super()._update_settings(settings)
|
||||
if not prev_voice == self._voice_id:
|
||||
await self._disconnect()
|
||||
await self._connect()
|
||||
logger.debug(f"Switching TTS voice to: [{self._voice_id}]")
|
||||
|
||||
async def start(self, frame: StartFrame):
|
||||
await super().start(frame)
|
||||
@@ -223,19 +216,19 @@ class ElevenLabsTTSService(WordTTSService):
|
||||
try:
|
||||
voice_id = self._voice_id
|
||||
model = self.model_name
|
||||
output_format = self._params.output_format
|
||||
output_format = self._settings["output_format"]
|
||||
url = f"{self._url}/v1/text-to-speech/{voice_id}/stream-input?model_id={model}&output_format={output_format}"
|
||||
|
||||
if self._params.optimize_streaming_latency:
|
||||
url += f"&optimize_streaming_latency={self._params.optimize_streaming_latency}"
|
||||
if self._settings["optimize_streaming_latency"]:
|
||||
url += f"&optimize_streaming_latency={self._settings["optimize_streaming_latency"]}"
|
||||
|
||||
# language can only be used with the 'eleven_turbo_v2_5' model
|
||||
if self._params.language:
|
||||
if self._settings["language"]:
|
||||
if model == "eleven_turbo_v2_5":
|
||||
url += f"&language_code={self._params.language}"
|
||||
url += f"&language_code={self._settings["language"]}"
|
||||
else:
|
||||
logger.debug(
|
||||
f"Language code [{self._params.language}] not applied. Language codes can only be used with the 'eleven_turbo_v2_5' model."
|
||||
f"Language code [{self._settings["language"]}] not applied. Language codes can only be used with the 'eleven_turbo_v2_5' model."
|
||||
)
|
||||
|
||||
self._websocket = await websockets.connect(url)
|
||||
@@ -286,7 +279,7 @@ class ElevenLabsTTSService(WordTTSService):
|
||||
self.start_word_timestamps()
|
||||
|
||||
audio = base64.b64decode(msg["audio"])
|
||||
frame = TTSAudioRawFrame(audio, self.sample_rate, 1)
|
||||
frame = TTSAudioRawFrame(audio, self._settings["sample_rate"], 1)
|
||||
await self.push_frame(frame)
|
||||
|
||||
if msg.get("alignment"):
|
||||
|
||||
@@ -6,8 +6,9 @@
|
||||
|
||||
import base64
|
||||
import json
|
||||
|
||||
from typing import AsyncGenerator, Optional
|
||||
|
||||
from loguru import logger
|
||||
from pydantic.main import BaseModel
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
@@ -21,8 +22,6 @@ from pipecat.frames.frames import (
|
||||
from pipecat.services.ai_services import STTService
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
|
||||
from loguru import logger
|
||||
|
||||
# See .env.example for Gladia configuration needed
|
||||
try:
|
||||
import websockets
|
||||
@@ -55,7 +54,13 @@ class GladiaSTTService(STTService):
|
||||
|
||||
self._api_key = api_key
|
||||
self._url = url
|
||||
self._params = params
|
||||
self._settings = {
|
||||
"sample_rate": params.sample_rate,
|
||||
"language": params.language,
|
||||
"transcription_hint": params.transcription_hint,
|
||||
"endpointing": params.endpointing,
|
||||
"prosody": params.prosody,
|
||||
}
|
||||
self._confidence = confidence
|
||||
|
||||
async def start(self, frame: StartFrame):
|
||||
@@ -84,7 +89,11 @@ class GladiaSTTService(STTService):
|
||||
"encoding": "WAV/PCM",
|
||||
"model_type": "fast",
|
||||
"language_behaviour": "manual",
|
||||
**self._params.model_dump(exclude_none=True),
|
||||
"sample_rate": self._settings["sample_rate"],
|
||||
"language": self._settings["language"],
|
||||
"transcription_hint": self._settings["transcription_hint"],
|
||||
"endpointing": self._settings["endpointing"],
|
||||
"prosody": self._settings["prosody"],
|
||||
}
|
||||
|
||||
await self._websocket.send(json.dumps(configuration))
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from typing import Any, AsyncGenerator, Dict, List, Literal, Optional
|
||||
from typing import AsyncGenerator, List, Literal, Optional
|
||||
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
@@ -17,7 +17,7 @@ from pipecat.frames.frames import (
|
||||
LLMFullResponseEndFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMMessagesFrame,
|
||||
ServiceUpdateSettingsFrame,
|
||||
LLMUpdateSettingsFrame,
|
||||
TextFrame,
|
||||
TTSAudioRawFrame,
|
||||
TTSStartedFrame,
|
||||
@@ -64,21 +64,6 @@ class GoogleLLMService(LLMService):
|
||||
self.set_model_name(model)
|
||||
self._client = gai.GenerativeModel(model)
|
||||
|
||||
async def set_model(self, model: str):
|
||||
logger.debug(f"Switching LLM model to: [{model}]")
|
||||
self._create_client(model)
|
||||
|
||||
async def _update_settings(self, settings: Dict[str, Any]):
|
||||
for key, value in settings.items():
|
||||
setter = getattr(self, f"set_{key}", None)
|
||||
if setter and callable(setter):
|
||||
try:
|
||||
await setter(value)
|
||||
except Exception as e:
|
||||
logger.warning(f"Error setting {key}: {e}")
|
||||
else:
|
||||
logger.warning(f"Unknown setting for Google LLM service: {key}")
|
||||
|
||||
def _get_messages_from_openai_context(self, context: OpenAILLMContext) -> List[glm.Content]:
|
||||
openai_messages = context.get_messages()
|
||||
google_messages = []
|
||||
@@ -151,7 +136,7 @@ class GoogleLLMService(LLMService):
|
||||
context = OpenAILLMContext.from_messages(frame.messages)
|
||||
elif isinstance(frame, VisionImageRawFrame):
|
||||
context = OpenAILLMContext.from_image_frame(frame)
|
||||
elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "llm":
|
||||
elif isinstance(frame, LLMUpdateSettingsFrame):
|
||||
await self._update_settings(frame.settings)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
@@ -182,8 +167,17 @@ class GoogleTTSService(TTSService):
|
||||
):
|
||||
super().__init__(sample_rate=sample_rate, **kwargs)
|
||||
|
||||
self._voice_id: str = voice_id
|
||||
self._params = params
|
||||
self._settings = {
|
||||
"sample_rate": sample_rate,
|
||||
"pitch": params.pitch,
|
||||
"rate": params.rate,
|
||||
"volume": params.volume,
|
||||
"emphasis": params.emphasis,
|
||||
"language": params.language,
|
||||
"gender": params.gender,
|
||||
"google_style": params.google_style,
|
||||
}
|
||||
self.set_voice(voice_id)
|
||||
self._client: texttospeech_v1.TextToSpeechAsyncClient = self._create_client(
|
||||
credentials, credentials_path
|
||||
)
|
||||
@@ -216,38 +210,38 @@ class GoogleTTSService(TTSService):
|
||||
|
||||
# Voice tag
|
||||
voice_attrs = [f"name='{self._voice_id}'"]
|
||||
if self._params.language:
|
||||
voice_attrs.append(f"language='{self._params.language}'")
|
||||
if self._params.gender:
|
||||
voice_attrs.append(f"gender='{self._params.gender}'")
|
||||
if self._settings["language"]:
|
||||
voice_attrs.append(f"language='{self._settings['language']}'")
|
||||
if self._settings["gender"]:
|
||||
voice_attrs.append(f"gender='{self._settings['gender']}'")
|
||||
ssml += f"<voice {' '.join(voice_attrs)}>"
|
||||
|
||||
# Prosody tag
|
||||
prosody_attrs = []
|
||||
if self._params.pitch:
|
||||
prosody_attrs.append(f"pitch='{self._params.pitch}'")
|
||||
if self._params.rate:
|
||||
prosody_attrs.append(f"rate='{self._params.rate}'")
|
||||
if self._params.volume:
|
||||
prosody_attrs.append(f"volume='{self._params.volume}'")
|
||||
if self._settings["pitch"]:
|
||||
prosody_attrs.append(f"pitch='{self._settings['pitch']}'")
|
||||
if self._settings["rate"]:
|
||||
prosody_attrs.append(f"rate='{self._settings['rate']}'")
|
||||
if self._settings["volume"]:
|
||||
prosody_attrs.append(f"volume='{self._settings['volume']}'")
|
||||
|
||||
if prosody_attrs:
|
||||
ssml += f"<prosody {' '.join(prosody_attrs)}>"
|
||||
|
||||
# Emphasis tag
|
||||
if self._params.emphasis:
|
||||
ssml += f"<emphasis level='{self._params.emphasis}'>"
|
||||
if self._settings["emphasis"]:
|
||||
ssml += f"<emphasis level='{self._settings['emphasis']}'>"
|
||||
|
||||
# Google style tag
|
||||
if self._params.google_style:
|
||||
ssml += f"<google:style name='{self._params.google_style}'>"
|
||||
if self._settings["google_style"]:
|
||||
ssml += f"<google:style name='{self._settings['google_style']}'>"
|
||||
|
||||
ssml += text
|
||||
|
||||
# Close tags
|
||||
if self._params.google_style:
|
||||
if self._settings["google_style"]:
|
||||
ssml += "</google:style>"
|
||||
if self._params.emphasis:
|
||||
if self._settings["emphasis"]:
|
||||
ssml += "</emphasis>"
|
||||
if prosody_attrs:
|
||||
ssml += "</prosody>"
|
||||
@@ -255,46 +249,6 @@ class GoogleTTSService(TTSService):
|
||||
|
||||
return ssml
|
||||
|
||||
async def set_voice(self, voice: str) -> None:
|
||||
logger.debug(f"Switching TTS voice to: [{voice}]")
|
||||
self._voice_id = voice
|
||||
|
||||
async def set_language(self, language: str) -> None:
|
||||
logger.debug(f"Switching TTS language to: [{language}]")
|
||||
self._params.language = language
|
||||
|
||||
async def set_pitch(self, pitch: str) -> None:
|
||||
logger.debug(f"Switching TTS pitch to: [{pitch}]")
|
||||
self._params.pitch = pitch
|
||||
|
||||
async def set_rate(self, rate: str) -> None:
|
||||
logger.debug(f"Switching TTS rate to: [{rate}]")
|
||||
self._params.rate = rate
|
||||
|
||||
async def set_volume(self, volume: str) -> None:
|
||||
logger.debug(f"Switching TTS volume to: [{volume}]")
|
||||
self._params.volume = volume
|
||||
|
||||
async def set_emphasis(
|
||||
self, emphasis: Literal["strong", "moderate", "reduced", "none"]
|
||||
) -> None:
|
||||
logger.debug(f"Switching TTS emphasis to: [{emphasis}]")
|
||||
self._params.emphasis = emphasis
|
||||
|
||||
async def set_gender(self, gender: Literal["male", "female", "neutral"]) -> None:
|
||||
logger.debug(f"Switch TTS gender to [{gender}]")
|
||||
self._params.gender = gender
|
||||
|
||||
async def set_google_style(
|
||||
self, google_style: Literal["apologetic", "calm", "empathetic", "firm", "lively"]
|
||||
) -> None:
|
||||
logger.debug(f"Switching TTS google style to: [{google_style}]")
|
||||
self._params.google_style = google_style
|
||||
|
||||
async def set_params(self, params: InputParams) -> None:
|
||||
logger.debug(f"Switching TTS params to: [{params}]")
|
||||
self._params = params
|
||||
|
||||
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
||||
logger.debug(f"Generating TTS: [{text}]")
|
||||
|
||||
@@ -304,11 +258,11 @@ class GoogleTTSService(TTSService):
|
||||
ssml = self._construct_ssml(text)
|
||||
synthesis_input = texttospeech_v1.SynthesisInput(ssml=ssml)
|
||||
voice = texttospeech_v1.VoiceSelectionParams(
|
||||
language_code=self._params.language, name=self._voice_id
|
||||
language_code=self._settings["language"], name=self._voice_id
|
||||
)
|
||||
audio_config = texttospeech_v1.AudioConfig(
|
||||
audio_encoding=texttospeech_v1.AudioEncoding.LINEAR16,
|
||||
sample_rate_hertz=self.sample_rate,
|
||||
sample_rate_hertz=self._settings["sample_rate"],
|
||||
)
|
||||
|
||||
request = texttospeech_v1.SynthesizeSpeechRequest(
|
||||
@@ -331,7 +285,7 @@ class GoogleTTSService(TTSService):
|
||||
if not chunk:
|
||||
break
|
||||
await self.stop_ttfb_metrics()
|
||||
frame = TTSAudioRawFrame(chunk, self.sample_rate, 1)
|
||||
frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
|
||||
yield frame
|
||||
await asyncio.sleep(0) # Allow other tasks to run
|
||||
|
||||
|
||||
@@ -5,10 +5,10 @@
|
||||
#
|
||||
|
||||
import asyncio
|
||||
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
EndFrame,
|
||||
@@ -20,10 +20,9 @@ from pipecat.frames.frames import (
|
||||
TTSStartedFrame,
|
||||
TTSStoppedFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.ai_services import TTSService
|
||||
|
||||
from loguru import logger
|
||||
|
||||
# See .env.example for LMNT configuration needed
|
||||
try:
|
||||
from lmnt.api import Speech
|
||||
@@ -50,13 +49,16 @@ class LmntTTSService(TTSService):
|
||||
super().__init__(push_stop_frames=True, sample_rate=sample_rate, **kwargs)
|
||||
|
||||
self._api_key = api_key
|
||||
self._voice_id = voice_id
|
||||
self._output_format = {
|
||||
"container": "raw",
|
||||
"encoding": "pcm_s16le",
|
||||
"sample_rate": sample_rate,
|
||||
self._settings = {
|
||||
"output_format": {
|
||||
"container": "raw",
|
||||
"encoding": "pcm_s16le",
|
||||
"sample_rate": sample_rate,
|
||||
},
|
||||
"language": language,
|
||||
}
|
||||
self._language = language
|
||||
|
||||
self.set_voice(voice_id)
|
||||
|
||||
self._speech = None
|
||||
self._connection = None
|
||||
@@ -68,10 +70,6 @@ class LmntTTSService(TTSService):
|
||||
def can_generate_metrics(self) -> bool:
|
||||
return True
|
||||
|
||||
async def set_voice(self, voice: str):
|
||||
logger.debug(f"Switching TTS voice to: [{voice}]")
|
||||
self._voice_id = voice
|
||||
|
||||
async def start(self, frame: StartFrame):
|
||||
await super().start(frame)
|
||||
await self._connect()
|
||||
@@ -93,7 +91,9 @@ class LmntTTSService(TTSService):
|
||||
try:
|
||||
self._speech = Speech()
|
||||
self._connection = await self._speech.synthesize_streaming(
|
||||
self._voice_id, format="raw", sample_rate=self._output_format["sample_rate"]
|
||||
self._voice_id,
|
||||
format="raw",
|
||||
sample_rate=self._settings["output_format"]["sample_rate"],
|
||||
)
|
||||
self._receive_task = self.get_event_loop().create_task(self._receive_task_handler())
|
||||
except Exception as e:
|
||||
@@ -130,7 +130,7 @@ class LmntTTSService(TTSService):
|
||||
await self.stop_ttfb_metrics()
|
||||
frame = TTSAudioRawFrame(
|
||||
audio=msg["audio"],
|
||||
sample_rate=self._output_format["sample_rate"],
|
||||
sample_rate=self._settings["output_format"]["sample_rate"],
|
||||
num_channels=1,
|
||||
)
|
||||
await self.push_frame(frame)
|
||||
|
||||
@@ -24,7 +24,7 @@ from pipecat.frames.frames import (
|
||||
LLMFullResponseEndFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMMessagesFrame,
|
||||
ServiceUpdateSettingsFrame,
|
||||
LLMUpdateSettingsFrame,
|
||||
StartInterruptionFrame,
|
||||
TextFrame,
|
||||
TTSAudioRawFrame,
|
||||
@@ -111,14 +111,16 @@ class BaseOpenAILLMService(LLMService):
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
self._settings = {
|
||||
"frequency_penalty": params.frequency_penalty,
|
||||
"presence_penalty": params.presence_penalty,
|
||||
"seed": params.seed,
|
||||
"temperature": params.temperature,
|
||||
"top_p": params.top_p,
|
||||
"extra": params.extra if isinstance(params.extra, dict) else {},
|
||||
}
|
||||
self.set_model_name(model)
|
||||
self._client = self.create_client(api_key=api_key, base_url=base_url, **kwargs)
|
||||
self._frequency_penalty = params.frequency_penalty
|
||||
self._presence_penalty = params.presence_penalty
|
||||
self._seed = params.seed
|
||||
self._temperature = params.temperature
|
||||
self._top_p = params.top_p
|
||||
self._extra = params.extra if isinstance(params.extra, dict) else {}
|
||||
|
||||
def create_client(self, api_key=None, base_url=None, **kwargs):
|
||||
return AsyncOpenAI(
|
||||
@@ -134,30 +136,6 @@ class BaseOpenAILLMService(LLMService):
|
||||
def can_generate_metrics(self) -> bool:
|
||||
return True
|
||||
|
||||
async def set_frequency_penalty(self, frequency_penalty: float):
|
||||
logger.debug(f"Switching LLM frequency_penalty to: [{frequency_penalty}]")
|
||||
self._frequency_penalty = frequency_penalty
|
||||
|
||||
async def set_presence_penalty(self, presence_penalty: float):
|
||||
logger.debug(f"Switching LLM presence_penalty to: [{presence_penalty}]")
|
||||
self._presence_penalty = presence_penalty
|
||||
|
||||
async def set_seed(self, seed: int):
|
||||
logger.debug(f"Switching LLM seed to: [{seed}]")
|
||||
self._seed = seed
|
||||
|
||||
async def set_temperature(self, temperature: float):
|
||||
logger.debug(f"Switching LLM temperature to: [{temperature}]")
|
||||
self._temperature = temperature
|
||||
|
||||
async def set_top_p(self, top_p: float):
|
||||
logger.debug(f"Switching LLM top_p to: [{top_p}]")
|
||||
self._top_p = top_p
|
||||
|
||||
async def set_extra(self, extra: Dict[str, Any]):
|
||||
logger.debug(f"Switching LLM extra to: [{extra}]")
|
||||
self._extra = extra
|
||||
|
||||
async def get_chat_completions(
|
||||
self, context: OpenAILLMContext, messages: List[ChatCompletionMessageParam]
|
||||
) -> AsyncStream[ChatCompletionChunk]:
|
||||
@@ -168,14 +146,14 @@ class BaseOpenAILLMService(LLMService):
|
||||
"tools": context.tools,
|
||||
"tool_choice": context.tool_choice,
|
||||
"stream_options": {"include_usage": True},
|
||||
"frequency_penalty": self._frequency_penalty,
|
||||
"presence_penalty": self._presence_penalty,
|
||||
"seed": self._seed,
|
||||
"temperature": self._temperature,
|
||||
"top_p": self._top_p,
|
||||
"frequency_penalty": self._settings["frequency_penalty"],
|
||||
"presence_penalty": self._settings["presence_penalty"],
|
||||
"seed": self._settings["seed"],
|
||||
"temperature": self._settings["temperature"],
|
||||
"top_p": self._settings["top_p"],
|
||||
}
|
||||
|
||||
params.update(self._extra)
|
||||
params.update(self._settings["extra"])
|
||||
|
||||
chunks = await self._client.chat.completions.create(**params)
|
||||
return chunks
|
||||
@@ -295,17 +273,6 @@ class BaseOpenAILLMService(LLMService):
|
||||
f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function."
|
||||
)
|
||||
|
||||
async def _update_settings(self, settings: Dict[str, Any]):
|
||||
for key, value in settings.items():
|
||||
setter = getattr(self, f"set_{key}", None)
|
||||
if setter and callable(setter):
|
||||
try:
|
||||
await setter(value)
|
||||
except Exception as e:
|
||||
logger.warning(f"Error setting {key}: {e}")
|
||||
else:
|
||||
logger.warning(f"Unknown setting for OpenAI LLM service: {key}")
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
@@ -316,7 +283,7 @@ class BaseOpenAILLMService(LLMService):
|
||||
context = OpenAILLMContext.from_messages(frame.messages)
|
||||
elif isinstance(frame, VisionImageRawFrame):
|
||||
context = OpenAILLMContext.from_image_frame(frame)
|
||||
elif isinstance(frame, ServiceUpdateSettingsFrame) and frame.service_type == "llm":
|
||||
elif isinstance(frame, LLMUpdateSettingsFrame):
|
||||
await self._update_settings(frame.settings)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
@@ -414,29 +381,27 @@ class OpenAITTSService(TTSService):
|
||||
self,
|
||||
*,
|
||||
api_key: str | None = None,
|
||||
voice: str = "alloy",
|
||||
voice_id: str = "alloy",
|
||||
model: Literal["tts-1", "tts-1-hd"] = "tts-1",
|
||||
sample_rate: int = 24000,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(sample_rate=sample_rate, **kwargs)
|
||||
|
||||
self._voice: ValidVoice = VALID_VOICES.get(voice, "alloy")
|
||||
self._settings = {
|
||||
"sample_rate": sample_rate,
|
||||
}
|
||||
self.set_model_name(model)
|
||||
self._sample_rate = sample_rate
|
||||
self.set_voice(voice_id)
|
||||
|
||||
self._client = AsyncOpenAI(api_key=api_key)
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
return True
|
||||
|
||||
async def set_voice(self, voice: str):
|
||||
logger.debug(f"Switching TTS voice to: [{voice}]")
|
||||
self._voice = VALID_VOICES.get(voice, self._voice)
|
||||
|
||||
async def set_model(self, model: str):
|
||||
logger.debug(f"Switching TTS model to: [{model}]")
|
||||
self._model = model
|
||||
self.set_model_name(model)
|
||||
|
||||
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
||||
logger.debug(f"Generating TTS: [{text}]")
|
||||
@@ -446,7 +411,7 @@ class OpenAITTSService(TTSService):
|
||||
async with self._client.audio.speech.with_streaming_response.create(
|
||||
input=text,
|
||||
model=self.model_name,
|
||||
voice=self._voice,
|
||||
voice=VALID_VOICES[self._voice_id],
|
||||
response_format="pcm",
|
||||
) as r:
|
||||
if r.status_code != 200:
|
||||
@@ -465,7 +430,7 @@ class OpenAITTSService(TTSService):
|
||||
async for chunk in r.iter_bytes(8192):
|
||||
if len(chunk) > 0:
|
||||
await self.stop_ttfb_metrics()
|
||||
frame = TTSAudioRawFrame(chunk, self.sample_rate, 1)
|
||||
frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
|
||||
yield frame
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
except BadRequestError as e:
|
||||
|
||||
@@ -6,17 +6,21 @@
|
||||
|
||||
import io
|
||||
import struct
|
||||
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from pipecat.frames.frames import Frame, TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame
|
||||
from pipecat.services.ai_services import TTSService
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
TTSAudioRawFrame,
|
||||
TTSStartedFrame,
|
||||
TTSStoppedFrame,
|
||||
)
|
||||
from pipecat.services.ai_services import TTSService
|
||||
|
||||
try:
|
||||
from pyht.client import TTSOptions
|
||||
from pyht.async_client import AsyncClient
|
||||
from pyht.client import TTSOptions
|
||||
from pyht.protos.api_pb2 import Format
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
@@ -28,7 +32,7 @@ except ModuleNotFoundError as e:
|
||||
|
||||
class PlayHTTTSService(TTSService):
|
||||
def __init__(
|
||||
self, *, api_key: str, user_id: str, voice_url: str, sample_rate: int = 16000, **kwargs
|
||||
self, *, api_key: str, user_id: str, voice_id: str, sample_rate: int = 16000, **kwargs
|
||||
):
|
||||
super().__init__(sample_rate=sample_rate, **kwargs)
|
||||
|
||||
@@ -39,17 +43,23 @@ class PlayHTTTSService(TTSService):
|
||||
user_id=self._user_id,
|
||||
api_key=self._speech_key,
|
||||
)
|
||||
self._settings = {
|
||||
"sample_rate": sample_rate,
|
||||
"quality": "higher",
|
||||
"format": Format.FORMAT_WAV,
|
||||
"voice_engine": "PlayHT2.0-turbo",
|
||||
}
|
||||
self.set_voice(voice_id)
|
||||
self._options = TTSOptions(
|
||||
voice=voice_url, sample_rate=sample_rate, quality="higher", format=Format.FORMAT_WAV
|
||||
voice=self._voice_id,
|
||||
sample_rate=self._settings["sample_rate"],
|
||||
quality=self._settings["quality"],
|
||||
format=self._settings["format"],
|
||||
)
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
return True
|
||||
|
||||
async def set_voice(self, voice: str):
|
||||
logger.debug(f"Switching TTS voice to: [{voice}]")
|
||||
self._options.voice = voice
|
||||
|
||||
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
||||
logger.debug(f"Generating TTS: [{text}]")
|
||||
|
||||
@@ -60,7 +70,7 @@ class PlayHTTTSService(TTSService):
|
||||
await self.start_ttfb_metrics()
|
||||
|
||||
playht_gen = self._client.tts(
|
||||
text, voice_engine="PlayHT2.0-turbo", options=self._options
|
||||
text, voice_engine=self._settings["voice_engine"], options=self._options
|
||||
)
|
||||
|
||||
await self.start_tts_usage_metrics(text)
|
||||
@@ -83,7 +93,7 @@ class PlayHTTTSService(TTSService):
|
||||
else:
|
||||
if len(chunk):
|
||||
await self.stop_ttfb_metrics()
|
||||
frame = TTSAudioRawFrame(chunk, 16000, 1)
|
||||
frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
|
||||
yield frame
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
except Exception as e:
|
||||
|
||||
@@ -50,13 +50,15 @@ class TogetherLLMService(OpenAILLMService):
|
||||
):
|
||||
super().__init__(api_key=api_key, base_url=base_url, model=model, params=params, **kwargs)
|
||||
self.set_model_name(model)
|
||||
self._max_tokens = params.max_tokens
|
||||
self._frequency_penalty = params.frequency_penalty
|
||||
self._presence_penalty = params.presence_penalty
|
||||
self._temperature = params.temperature
|
||||
self._top_k = params.top_k
|
||||
self._top_p = params.top_p
|
||||
self._extra = params.extra if isinstance(params.extra, dict) else {}
|
||||
self._settings = {
|
||||
"max_tokens": params.max_tokens,
|
||||
"frequency_penalty": params.frequency_penalty,
|
||||
"presence_penalty": params.presence_penalty,
|
||||
"seed": params.seed,
|
||||
"temperature": params.temperature,
|
||||
"top_p": params.top_p,
|
||||
"extra": params.extra if isinstance(params.extra, dict) else {},
|
||||
}
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
return True
|
||||
@@ -72,42 +74,3 @@ class TogetherLLMService(OpenAILLMService):
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
async def set_frequency_penalty(self, frequency_penalty: float):
|
||||
logger.debug(f"Switching LLM frequency_penalty to: [{frequency_penalty}]")
|
||||
self._frequency_penalty = frequency_penalty
|
||||
|
||||
async def set_max_tokens(self, max_tokens: int):
|
||||
logger.debug(f"Switching LLM max_tokens to: [{max_tokens}]")
|
||||
self._max_tokens = max_tokens
|
||||
|
||||
async def set_presence_penalty(self, presence_penalty: float):
|
||||
logger.debug(f"Switching LLM presence_penalty to: [{presence_penalty}]")
|
||||
self._presence_penalty = presence_penalty
|
||||
|
||||
async def set_temperature(self, temperature: float):
|
||||
logger.debug(f"Switching LLM temperature to: [{temperature}]")
|
||||
self._temperature = temperature
|
||||
|
||||
async def set_top_k(self, top_k: float):
|
||||
logger.debug(f"Switching LLM top_k to: [{top_k}]")
|
||||
self._top_k = top_k
|
||||
|
||||
async def set_top_p(self, top_p: float):
|
||||
logger.debug(f"Switching LLM top_p to: [{top_p}]")
|
||||
self._top_p = top_p
|
||||
|
||||
async def set_extra(self, extra: Dict[str, Any]):
|
||||
logger.debug(f"Switching LLM extra to: [{extra}]")
|
||||
self._extra = extra
|
||||
|
||||
async def _update_settings(self, settings: Dict[str, Any]):
|
||||
for key, value in settings.items():
|
||||
setter = getattr(self, f"set_{key}", None)
|
||||
if setter and callable(setter):
|
||||
try:
|
||||
await setter(value)
|
||||
except Exception as e:
|
||||
logger.warning(f"Error setting {key}: {e}")
|
||||
else:
|
||||
logger.warning(f"Unknown setting for Together LLM service: {key}")
|
||||
|
||||
@@ -4,10 +4,12 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import aiohttp
|
||||
|
||||
from typing import Any, AsyncGenerator, Dict
|
||||
|
||||
import aiohttp
|
||||
import numpy as np
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
@@ -18,10 +20,6 @@ from pipecat.frames.frames import (
|
||||
)
|
||||
from pipecat.services.ai_services import TTSService
|
||||
|
||||
import numpy as np
|
||||
|
||||
from loguru import logger
|
||||
|
||||
try:
|
||||
import resampy
|
||||
except ModuleNotFoundError as e:
|
||||
@@ -50,9 +48,11 @@ class XTTSService(TTSService):
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self._voice_id = voice_id
|
||||
self._language = language
|
||||
self._base_url = base_url
|
||||
self._settings = {
|
||||
"language": language,
|
||||
"base_url": base_url,
|
||||
}
|
||||
self.set_voice(voice_id)
|
||||
self._studio_speakers: Dict[str, Any] | None = None
|
||||
self._aiohttp_session = aiohttp_session
|
||||
|
||||
@@ -61,7 +61,7 @@ class XTTSService(TTSService):
|
||||
|
||||
async def start(self, frame: StartFrame):
|
||||
await super().start(frame)
|
||||
async with self._aiohttp_session.get(self._base_url + "/studio_speakers") as r:
|
||||
async with self._aiohttp_session.get(self._settings["base_url"] + "/studio_speakers") as r:
|
||||
if r.status != 200:
|
||||
text = await r.text()
|
||||
logger.error(
|
||||
@@ -75,10 +75,6 @@ class XTTSService(TTSService):
|
||||
return
|
||||
self._studio_speakers = await r.json()
|
||||
|
||||
async def set_voice(self, voice: str):
|
||||
logger.debug(f"Switching TTS voice to: [{voice}]")
|
||||
self._voice_id = voice
|
||||
|
||||
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
||||
logger.debug(f"Generating TTS: [{text}]")
|
||||
|
||||
@@ -88,11 +84,11 @@ class XTTSService(TTSService):
|
||||
|
||||
embeddings = self._studio_speakers[self._voice_id]
|
||||
|
||||
url = self._base_url + "/tts_stream"
|
||||
url = self._settings["base_url"] + "/tts_stream"
|
||||
|
||||
payload = {
|
||||
"text": text.replace(".", "").replace("*", ""),
|
||||
"language": self._language,
|
||||
"language": self._settings["language"],
|
||||
"speaker_embedding": embeddings["speaker_embedding"],
|
||||
"gpt_cond_latent": embeddings["gpt_cond_latent"],
|
||||
"add_wav_header": False,
|
||||
|
||||
Reference in New Issue
Block a user