Merge pull request #3946 from pipecat-ai/mb/tts-settings-review

Review TTS settings
This commit is contained in:
Mark Backman
2026-03-07 07:48:26 -05:00
committed by GitHub
50 changed files with 284 additions and 286 deletions

1
changelog/3946.added.md Normal file
View File

@@ -0,0 +1 @@
- Runtime settings updates (via `STTUpdateSettingsFrame`) now take effect for the AWS Transcribe, Azure, Cartesia, Deepgram, ElevenLabs Realtime, Gradium, and Soniox STT services. Previously, changing settings at runtime only stored the new values without reconnecting, so the active connection kept using the old ones.

View File

@@ -25,7 +25,7 @@ from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.openai.llm import OpenAILLMService, OpenAILLMSettings
from pipecat.services.xtts.tts import XTTSService, XTTSSettings
from pipecat.services.xtts.tts import XTTSService, XTTSTTSSettings
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
@@ -59,7 +59,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
tts = XTTSService(
aiohttp_session=session,
settings=XTTSSettings(
settings=XTTSTTSSettings(
voice="Claribel Dervla",
),
base_url="http://localhost:8000",

View File

@@ -104,7 +104,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
# Create Strands agent processor
try:
agent = build_agent(model_id="us.anthropic.claude-3-5-haiku-20241022-v1:0", max_tokens=8000)
agent = build_agent(model_id="us.anthropic.claude-sonnet-4-6", max_tokens=8000)
llm = StrandsAgentsProcessor(agent=agent)
logger.info("Successfully created Strands agent for NAB customer service coaching")
except Exception as e:
@@ -152,7 +152,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
messages=[
{
"role": "user",
"content": f"Greet the user and introduce yourself.",
"content": f"Greet the user and introduce yourself. Don't use emojis.",
}
],
run_llm=True,

View File

@@ -64,7 +64,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
llm = AWSBedrockLLMService(
aws_region="us-west-2",
settings=AWSBedrockLLMSettings(
model="us.anthropic.claude-haiku-4-5-20251001-v1:0",
model="us.anthropic.claude-sonnet-4-6",
temperature=0.8,
system_instruction="You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.",
),

View File

@@ -72,7 +72,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
stt = GoogleSTTService(
credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
settings=GoogleSTTSettings(
languages=Language.EN_US,
languages=[Language.EN_US],
),
)

View File

@@ -55,7 +55,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
stt = GoogleSTTService(
settings=GoogleSTTSettings(
languages=Language.EN_US,
languages=[Language.EN_US],
),
credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
)

View File

@@ -55,8 +55,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
stt = GoogleSTTService(
settings=GoogleSTTSettings(
languages=Language.EN_US,
model="chirp_3",
languages=[Language.EN_US],
# Uncomment to use a specific model
# model="chirp_3",
),
credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
location="us",

View File

@@ -58,7 +58,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
tts = FishAudioTTSService(
api_key=os.getenv("FISH_API_KEY"),
settings=FishAudioTTSSettings(
model="4ce7e917cedd4bc2bb2e6ff3a46acaa1", # Barack Obama
voice="4ce7e917cedd4bc2bb2e6ff3a46acaa1", # Barack Obama
),
)

View File

@@ -24,7 +24,7 @@ from pipecat.processors.aggregators.llm_response_universal import (
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.openai.llm import OpenAILLMService, OpenAILLMSettings
from pipecat.services.sarvam.stt import SarvamSTTService
from pipecat.services.sarvam.stt import SarvamSTTService, SarvamSTTSettings
from pipecat.services.sarvam.tts import SarvamHttpTTSService, SarvamHttpTTSSettings
from pipecat.transcriptions.language import Language
from pipecat.transports.base_transport import BaseTransport, TransportParams
@@ -59,14 +59,16 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
async with aiohttp.ClientSession() as session:
stt = SarvamSTTService(
api_key=os.getenv("SARVAM_API_KEY"),
model="saarika:v2.5",
settings=SarvamSTTSettings(
model="saarika:v2.5",
),
)
tts = SarvamHttpTTSService(
api_key=os.getenv("SARVAM_API_KEY"),
aiohttp_session=session,
settings=SarvamHttpTTSSettings(
language=Language.EN,
language=Language.EN_IN,
),
)

View File

@@ -61,11 +61,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
llm = AWSBedrockLLMService(
aws_region="us-west-2",
settings=AWSBedrockLLMSettings(
model="us.anthropic.claude-3-7-sonnet-20250219-v1:0",
# Note: usually, prefer providing latency="optimized" param.
# Here we can't because AWS Bedrock doesn't support it for Claude 3.7,
# which we need for image input.
params=AWSBedrockLLMService.InputParams(temperature=0.8),
model="us.anthropic.claude-sonnet-4-6",
temperature=0.8,
system_instruction="You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way. You are also able to describe images.",
),
)

View File

@@ -76,7 +76,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
llm = AWSBedrockLLMService(
aws_region="us-west-2",
settings=AWSBedrockLLMSettings(
model="us.anthropic.claude-haiku-4-5-20251001-v1:0",
model="us.anthropic.claude-sonnet-4-6",
temperature=0.8,
system_instruction="You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.",
),

View File

@@ -7,7 +7,6 @@
import os
from deepgram import LiveOptions
from dotenv import load_dotenv
from loguru import logger
@@ -28,7 +27,7 @@ from pipecat.processors.filters.function_filter import FunctionFilter
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.cartesia.tts import CartesiaTTSService, CartesiaTTSSettings
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.deepgram.stt import DeepgramSTTService, DeepgramSTTSettings
from pipecat.services.llm_service import FunctionCallParams
from pipecat.services.openai.llm import OpenAILLMService, OpenAILLMSettings
from pipecat.transports.base_transport import BaseTransport, TransportParams
@@ -102,7 +101,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")
stt = DeepgramSTTService(
api_key=os.getenv("DEEPGRAM_API_KEY"), live_options=LiveOptions(language="multi")
api_key=os.getenv("DEEPGRAM_API_KEY"),
settings=DeepgramSTTSettings(
language="multi",
),
)
tts = SwitchLanguage()

View File

@@ -7,7 +7,6 @@
import asyncio
import os
from deepgram import LiveOptions
from dotenv import load_dotenv
from loguru import logger
@@ -114,7 +113,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
STTUpdateSettingsFrame(
delta=DeepgramSageMakerSTTSettings(
language=Language.ES,
live_options=LiveOptions(punctuate=False),
punctuate=False,
)
)
)

View File

@@ -7,7 +7,6 @@
import asyncio
import os
from deepgram import LiveOptions
from dotenv import load_dotenv
from loguru import logger
@@ -108,7 +107,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
STTUpdateSettingsFrame(
delta=DeepgramSTTSettings(
language=Language.ES,
live_options=LiveOptions(punctuate=False),
punctuate=False,
)
)
)

View File

@@ -62,7 +62,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
llm = AWSBedrockLLMService(
aws_region="us-west-2",
settings=AWSBedrockLLMSettings(
model="us.anthropic.claude-haiku-4-5-20251001-v1:0",
model="us.anthropic.claude-sonnet-4-6",
temperature=0.8,
system_instruction="You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.",
),

View File

@@ -104,7 +104,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
await asyncio.sleep(10)
# Log the same delay that is queued in the settings update (16 is an allowed Gradium value; 5 is not).
logger.info("Updating Gradium STT settings: delay_in_frames=16")
await task.queue_frame(STTUpdateSettingsFrame(delta=GradiumSTTSettings(delay_in_frames=5)))
await task.queue_frame(STTUpdateSettingsFrame(delta=GradiumSTTSettings(delay_in_frames=16)))
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):

View File

@@ -146,6 +146,7 @@ TESTS_07 = [
("07zg-interruptible-camb.py", EVAL_SIMPLE_MATH),
("07zi-interruptible-piper.py", EVAL_SIMPLE_MATH),
("07zj-interruptible-kokoro.py", EVAL_SIMPLE_MATH),
("07zk-interruptible-resembleai.py", EVAL_SIMPLE_MATH),
# Needs a local XTTS docker instance running.
# ("07i-interruptible-xtts.py", EVAL_SIMPLE_MATH),
]

View File

@@ -74,7 +74,7 @@ def language_to_async_language(language: Language) -> Optional[str]:
@dataclass
class AsyncAITTSSettings(TTSSettings):
"""Settings for Async AI TTS services."""
"""Settings for AsyncAITTSService and AsyncAIHttpTTSService."""
pass

View File

@@ -107,7 +107,7 @@ class AWSTranscribeSTTService(WebsocketSTTService):
_warn_deprecated_param("language", AWSTranscribeSTTSettings, "language")
default_settings.language = self.language_to_service_language(language)
# 3. No params to apply
# 3. (No step 3, as there's no params object to apply)
# 4. Apply settings delta (canonical API, always wins)
if settings is not None:
@@ -158,22 +158,12 @@ class AWSTranscribeSTTService(WebsocketSTTService):
return encoding_map.get(encoding, encoding)
async def _update_settings(self, delta: STTSettings) -> dict[str, Any]:
"""Apply a settings delta.
Settings are stored but not applied to the active connection.
"""
"""Apply a settings delta and, if anything changed while connected, reconnect."""
changed = await super()._update_settings(delta)
if not changed:
return changed
# TODO: someday we could reconnect here to apply updated settings.
# Code might look something like the below:
# if changed and self._websocket:
# await self._disconnect()
# await self._connect()
self._warn_unhandled_updated_settings(changed)
if changed and self._websocket:
await self._disconnect()
await self._connect()
return changed

View File

@@ -123,7 +123,7 @@ def language_to_aws_language(language: Language) -> Optional[str]:
@dataclass
class AWSPollyTTSSettings(TTSSettings):
"""Settings for AWS Polly TTS service.
"""Settings for AWSPollyTTSService.
Parameters:
engine: TTS engine to use ('standard', 'neural', etc.).

View File

@@ -112,7 +112,7 @@ class AzureSTTService(STTService):
_warn_deprecated_param("language", AzureSTTSettings, "language")
default_settings.language = language_to_azure_language(language)
# 3. No params to apply
# 3. (No step 3, as there's no params object to apply)
# 4. Apply settings delta (canonical API, always wins)
if settings is not None:
@@ -159,23 +159,16 @@ class AzureSTTService(STTService):
return language_to_azure_language(language)
async def _update_settings(self, delta: STTSettings) -> dict[str, Any]:
"""Apply a settings delta.
Settings are stored but not applied to the active recognizer.
"""
"""Apply a settings delta and reconnect if language changed."""
changed = await super()._update_settings(delta)
# TODO: someday we could reconnect here to apply updated settings.
# Code might look something like the below:
# if "language" in changed:
# self._speech_config.speech_recognition_language = self._settings.language
# if self._speech_recognizer:
# # Requires refactoring to set up and tear down recognizer, as
# # language is applied at recognizer initialization
# await self._disconnect()
# await self._connect()
self._warn_unhandled_updated_settings(changed)
if "language" in changed:
self._speech_config.speech_recognition_language = (
self._settings.language or language_to_azure_language(Language.EN_US)
)
if self._audio_stream:
await self._disconnect()
await self._connect()
return changed
@@ -202,14 +195,32 @@ class AzureSTTService(STTService):
async def start(self, frame: StartFrame):
"""Start the speech recognition service.
Initializes the Azure speech recognizer with audio stream configuration
and begins continuous speech recognition.
Args:
frame: Frame indicating the start of processing.
"""
await super().start(frame)
await self._connect()
async def stop(self, frame: EndFrame):
"""Stop the speech recognition service.
Args:
frame: Frame indicating the end of processing.
"""
await super().stop(frame)
await self._disconnect()
async def cancel(self, frame: CancelFrame):
"""Cancel the speech recognition service.
Args:
frame: Frame indicating cancellation.
"""
await super().cancel(frame)
await self._disconnect()
async def _connect(self):
"""Initialize the Azure speech recognizer and begin continuous recognition."""
if self._audio_stream:
return
@@ -231,37 +242,15 @@ class AzureSTTService(STTService):
error_msg=f"Uncaught exception during initialization: {e}", exception=e
)
async def stop(self, frame: EndFrame):
"""Stop the speech recognition service.
Cleanly shuts down the Azure speech recognizer and closes audio streams.
Args:
frame: Frame indicating the end of processing.
"""
await super().stop(frame)
if self._speech_recognizer:
self._speech_recognizer.stop_continuous_recognition_async()
if self._audio_stream:
self._audio_stream.close()
async def cancel(self, frame: CancelFrame):
"""Cancel the speech recognition service.
Immediately stops recognition and closes resources.
Args:
frame: Frame indicating cancellation.
"""
await super().cancel(frame)
async def _disconnect(self):
"""Stop recognition and close audio streams."""
if self._speech_recognizer:
self._speech_recognizer.stop_continuous_recognition_async()
self._speech_recognizer = None
if self._audio_stream:
self._audio_stream.close()
self._audio_stream = None
@traced_stt
async def _handle_transcription(

View File

@@ -68,7 +68,7 @@ def sample_rate_to_output_format(sample_rate: int) -> SpeechSynthesisOutputForma
@dataclass
class AzureTTSSettings(TTSSettings):
"""Settings for Azure TTS services.
"""Settings for AzureTTSService and AzureHttpTTSService.
Parameters:
emphasis: Emphasis level for speech ("strong", "moderate", "reduced").

View File

@@ -135,7 +135,7 @@ def _get_aligned_audio(buffer: bytes) -> tuple[bytes, bytes]:
@dataclass
class CambTTSSettings(TTSSettings):
"""Settings for Camb.ai TTS service.
"""Settings for CambTTSService.
Parameters:
user_instructions: Custom instructions for mars-instruct model only.

View File

@@ -23,7 +23,6 @@ from pipecat.frames.frames import (
Frame,
StartFrame,
TTSAudioRawFrame,
TTSStartedFrame,
TTSStoppedFrame,
)
from pipecat.services.settings import NOT_GIVEN, TTSSettings, _NotGiven, _warn_deprecated_param
@@ -188,7 +187,7 @@ class CartesiaEmotion(str, Enum):
@dataclass
class CartesiaTTSSettings(TTSSettings):
"""Settings for Cartesia TTS services.
"""Settings for CartesiaTTSService and CartesiaHttpTTSService.
Parameters:
generation_config: Generation configuration for Sonic-3 models. Includes volume,
@@ -705,7 +704,7 @@ class CartesiaHttpTTSService(TTSService):
voice_id: Optional[str] = None,
model: Optional[str] = None,
base_url: str = "https://api.cartesia.ai",
cartesia_version: str = "2024-11-13",
cartesia_version: str = "2026-03-01",
aiohttp_session: Optional[aiohttp.ClientSession] = None,
sample_rate: Optional[int] = None,
encoding: str = "pcm_s16le",

View File

@@ -40,7 +40,7 @@ from pipecat.utils.tracing.service_decorators import traced_tts
@dataclass
class DeepgramSageMakerTTSSettings(TTSSettings):
"""Settings for Deepgram SageMaker TTS service."""
"""Settings for DeepgramSageMakerTTSService."""
pass

View File

@@ -365,7 +365,9 @@ class DeepgramSTTService(STTService):
vad_events=False,
)
# 2. Apply live_options overrides — only if settings not provided
# 2. (No step 2, as there are no deprecated direct args)
# 3. Apply live_options overrides — only if settings not provided
if live_options is not None:
_warn_deprecated_param("live_options", DeepgramSTTSettings)
if not settings:
@@ -402,7 +404,7 @@ class DeepgramSTTService(STTService):
delta = DeepgramSTTSettings.from_mapping(lo_dict)
default_settings.apply_update(delta)
# 3. Apply settings delta (canonical API, always wins)
# 4. Apply settings delta (canonical API, always wins)
if settings is not None:
default_settings.apply_update(settings)
@@ -494,8 +496,9 @@ class DeepgramSTTService(STTService):
if isinstance(self._settings, DeepgramSTTSettings):
self._settings._sync_extra_to_fields()
await self._disconnect()
await self._connect()
if self._connection:
await self._disconnect()
await self._connect()
return changed
@@ -594,13 +597,16 @@ class DeepgramSTTService(STTService):
return
logger.debug("Disconnecting from Deepgram")
# Ask Deepgram to close the stream gracefully before cancelling the task.
if self._connection:
await self._connection.send_close_stream()
# Clear self._connection first to prevent run_stt from sending audio
# during the close handshake, then close gracefully on the saved ref.
connection = self._connection
self._connection = None
if connection:
await connection.send_close_stream()
await self.cancel_task(self._connection_task)
self._connection_task = None
self._connection = None
async def _connection_handler(self):
"""Manages the full WebSocket lifecycle inside a single async with block.

View File

@@ -45,7 +45,7 @@ except ModuleNotFoundError as e:
@dataclass
class DeepgramTTSSettings(TTSSettings):
"""Settings for Deepgram TTS service."""
"""Settings for DeepgramTTSService and DeepgramHttpTTSService."""
pass
@@ -110,6 +110,8 @@ class DeepgramTTSService(WebsocketTTSService):
default_settings.model = voice
default_settings.voice = voice
# 3. (No step 3, as there's no params object to apply)
# 4. Apply settings delta (canonical API, always wins)
if settings is not None:
default_settings.apply_update(settings)
@@ -423,6 +425,8 @@ class DeepgramHttpTTSService(TTSService):
default_settings.model = voice
default_settings.voice = voice
# 3. (No step 3, as there's no params object to apply)
# 4. Apply settings delta (canonical API, always wins)
if settings is not None:
default_settings.apply_update(settings)

View File

@@ -200,18 +200,12 @@ class ElevenLabsRealtimeSTTSettings(STTSettings):
vad_threshold: VAD sensitivity (0.1-0.9, lower is more sensitive).
min_speech_duration_ms: Minimum speech duration for VAD (50-2000ms).
min_silence_duration_ms: Minimum silence duration for VAD (50-2000ms).
include_timestamps: Whether to include word-level timestamps in transcripts.
enable_logging: Whether to enable logging on ElevenLabs' side.
include_language_detection: Whether to include language detection in transcripts.
"""
vad_silence_threshold_secs: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
vad_threshold: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
min_speech_duration_ms: int | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
min_silence_duration_ms: int | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
include_timestamps: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
enable_logging: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
include_language_detection: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
class ElevenLabsSTTService(SegmentedSTTService):
@@ -496,6 +490,9 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
commit_strategy: CommitStrategy = CommitStrategy.MANUAL,
model: Optional[str] = None,
sample_rate: Optional[int] = None,
include_timestamps: bool = False,
enable_logging: bool = False,
include_language_detection: bool = False,
params: Optional[InputParams] = None,
settings: Optional[ElevenLabsRealtimeSTTSettings] = None,
ttfs_p99_latency: Optional[float] = ELEVENLABS_REALTIME_TTFS_P99,
@@ -515,6 +512,9 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
Use ``settings=ElevenLabsRealtimeSTTSettings(model=...)`` instead.
sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate.
include_timestamps: Whether to include word-level timestamps in transcripts.
enable_logging: Whether to enable logging on ElevenLabs' side.
include_language_detection: Whether to include language detection in transcripts.
params: Configuration parameters for the STT service.
.. deprecated:: 0.0.105
@@ -534,9 +534,6 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
vad_threshold=None,
min_speech_duration_ms=None,
min_silence_duration_ms=None,
include_timestamps=False,
enable_logging=False,
include_language_detection=False,
)
# 2. Apply direct init arg overrides (deprecated)
@@ -555,9 +552,9 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
default_settings.vad_threshold = params.vad_threshold
default_settings.min_speech_duration_ms = params.min_speech_duration_ms
default_settings.min_silence_duration_ms = params.min_silence_duration_ms
default_settings.include_timestamps = params.include_timestamps
default_settings.enable_logging = params.enable_logging
default_settings.include_language_detection = params.include_language_detection
include_timestamps = params.include_timestamps
enable_logging = params.enable_logging
include_language_detection = params.include_language_detection
# 4. Apply settings delta (canonical API, always wins)
if settings is not None:
@@ -579,6 +576,9 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
# Init-only config (not runtime-updatable).
self._commit_strategy = commit_strategy
self._include_timestamps = include_timestamps
self._enable_logging = enable_logging
self._include_language_detection = include_language_detection
self._connected_event = asyncio.Event()
self._connected_event.set()
@@ -605,8 +605,9 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
if not changed:
return changed
await self._disconnect()
await self._connect()
if self._websocket:
await self._disconnect()
await self._connect()
return changed
@@ -762,17 +763,15 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
params.append(f"commit_strategy={self._commit_strategy.value}")
# Add optional parameters
if self._settings.include_timestamps:
params.append(
f"include_timestamps={str(self._settings.include_timestamps).lower()}"
)
if self._include_timestamps:
params.append(f"include_timestamps={str(self._include_timestamps).lower()}")
if self._settings.enable_logging:
params.append(f"enable_logging={str(self._settings.enable_logging).lower()}")
if self._enable_logging:
params.append(f"enable_logging={str(self._enable_logging).lower()}")
if self._settings.include_language_detection:
if self._include_language_detection:
params.append(
f"include_language_detection={str(self._settings.include_language_detection).lower()}"
f"include_language_detection={str(self._include_language_detection).lower()}"
)
# Add VAD parameters if using VAD commit strategy and values are specified
@@ -920,7 +919,7 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
"""
# If timestamps are enabled, skip this message and wait for the
# committed_transcript_with_timestamps message which contains all the data
if self._settings.include_timestamps:
if self._include_timestamps:
return
text = data.get("text", "").strip()

View File

@@ -187,7 +187,7 @@ class PronunciationDictionaryLocator(BaseModel):
@dataclass
class ElevenLabsTTSSettings(TTSSettings):
"""Settings for the ElevenLabs WebSocket TTS service.
"""Settings for ElevenLabsTTSService.
Fields that appear in the WebSocket URL (``voice``, ``model``,
``language``) require a full reconnect when changed. Fields that
@@ -225,7 +225,7 @@ class ElevenLabsTTSSettings(TTSSettings):
@dataclass
class ElevenLabsHttpTTSSettings(TTSSettings):
"""Settings for the ElevenLabs HTTP TTS service.
"""Settings for ElevenLabsHttpTTSService.
Parameters:
optimize_streaming_latency: Latency optimization level (0-4).
@@ -358,6 +358,9 @@ class ElevenLabsTTSService(WebsocketTTSService):
model: Optional[str] = None,
url: str = "wss://api.elevenlabs.io",
sample_rate: Optional[int] = None,
auto_mode: bool = True,
enable_ssml_parsing: Optional[bool] = None,
enable_logging: Optional[bool] = None,
pronunciation_dictionary_locators: Optional[List[PronunciationDictionaryLocator]] = None,
params: Optional[InputParams] = None,
settings: Optional[ElevenLabsTTSSettings] = None,
@@ -381,6 +384,9 @@ class ElevenLabsTTSService(WebsocketTTSService):
url: WebSocket URL for ElevenLabs TTS API.
sample_rate: Audio sample rate. If None, uses default.
auto_mode: Whether to enable automatic mode optimization.
enable_ssml_parsing: Whether to parse SSML tags in text.
enable_logging: Whether to enable ElevenLabs server-side logging.
pronunciation_dictionary_locators: List of pronunciation dictionary
locators to use.
params: Additional input parameters for voice customization.
@@ -428,11 +434,6 @@ class ElevenLabsTTSService(WebsocketTTSService):
apply_text_normalization=None,
)
# Track init-only URL params through the override chain
_auto_mode = True
_enable_ssml_parsing = None
_enable_logging = None
# 2. Apply direct init arg overrides (deprecated)
if voice_id is not None:
_warn_deprecated_param("voice_id", ElevenLabsTTSSettings, "voice")
@@ -459,11 +460,11 @@ class ElevenLabsTTSService(WebsocketTTSService):
if params.speed is not None:
default_settings.speed = params.speed
if params.auto_mode is not None:
_auto_mode = str(params.auto_mode).lower()
auto_mode = params.auto_mode
if params.enable_ssml_parsing is not None:
_enable_ssml_parsing = params.enable_ssml_parsing
enable_ssml_parsing = params.enable_ssml_parsing
if params.enable_logging is not None:
_enable_logging = params.enable_logging
enable_logging = params.enable_logging
if params.apply_text_normalization is not None:
default_settings.apply_text_normalization = params.apply_text_normalization
if _pronunciation_dictionary_locators is None:
@@ -488,9 +489,9 @@ class ElevenLabsTTSService(WebsocketTTSService):
self._url = url
# Init-only WebSocket URL params (not runtime-updatable).
self._auto_mode = _auto_mode
self._enable_ssml_parsing = _enable_ssml_parsing
self._enable_logging = _enable_logging
self._auto_mode = auto_mode
self._enable_ssml_parsing = enable_ssml_parsing
self._enable_logging = enable_logging
self._output_format = "" # initialized in start()
self._voice_settings = self._set_voice_settings()
@@ -664,7 +665,7 @@ class ElevenLabsTTSService(WebsocketTTSService):
voice_id = self._settings.voice
model = self._settings.model
output_format = self._output_format
url = f"{self._url}/v1/text-to-speech/{voice_id}/multi-stream-input?model_id={model}&output_format={output_format}&auto_mode={self._auto_mode}"
url = f"{self._url}/v1/text-to-speech/{voice_id}/multi-stream-input?model_id={model}&output_format={output_format}&auto_mode={str(self._auto_mode).lower()}"
if self._enable_ssml_parsing:
url += f"&enable_ssml_parsing={self._enable_ssml_parsing}"

View File

@@ -10,9 +10,8 @@ This module provides integration with Fish Audio's real-time TTS WebSocket API
for streaming text-to-speech synthesis with customizable voice parameters.
"""
import uuid
from dataclasses import dataclass, field
from typing import Any, AsyncGenerator, ClassVar, Dict, Literal, Mapping, Optional, Self
from typing import Any, AsyncGenerator, Literal, Mapping, Optional, Self
from loguru import logger
from pydantic import BaseModel
@@ -25,7 +24,6 @@ from pipecat.frames.frames import (
InterruptionFrame,
StartFrame,
TTSAudioRawFrame,
TTSStartedFrame,
TTSStoppedFrame,
)
from pipecat.processors.frame_processor import FrameDirection
@@ -49,21 +47,23 @@ FishAudioOutputFormat = Literal["opus", "mp3", "pcm", "wav"]
@dataclass
class FishAudioTTSSettings(TTSSettings):
"""Settings for Fish Audio TTS service.
"""Settings for FishAudioTTSService.
Parameters:
latency: Latency mode ("normal" or "balanced"). Defaults to "normal".
latency: Latency mode ("normal" or "balanced"). Defaults to "balanced".
normalize: Whether to normalize audio output. Defaults to True.
temperature: Controls randomness in speech generation (0.0-1.0).
top_p: Controls diversity via nucleus sampling (0.0-1.0).
prosody_speed: Speech speed multiplier (0.5-2.0). Defaults to 1.0.
prosody_volume: Volume adjustment in dB. Defaults to 0.
reference_id: Reference ID of the voice model.
prosody_volume: Volume adjustment in dB (-20 to 20). Defaults to 0.
"""
latency: str | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
normalize: bool | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
temperature: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
top_p: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
prosody_speed: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
prosody_volume: int | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
reference_id: str | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
@classmethod
def from_mapping(cls, settings: Mapping[str, Any]) -> Self:
@@ -174,18 +174,18 @@ class FishAudioTTSService(InterruptibleTTSService):
model="s1",
voice=None,
language=None,
latency="normal",
latency="balanced",
normalize=True,
temperature=None,
top_p=None,
prosody_speed=1.0,
prosody_volume=0,
reference_id=None,
)
# 2. Apply direct init arg overrides (deprecated)
if reference_id is not None:
_warn_deprecated_param("reference_id", FishAudioTTSSettings, "voice")
default_settings.voice = reference_id
default_settings.reference_id = reference_id
if model_id is not None:
_warn_deprecated_param("model_id", FishAudioTTSSettings, "model")
default_settings.model = model_id
@@ -317,8 +317,12 @@ class FishAudioTTSService(InterruptibleTTSService):
"speed": self._settings.prosody_speed,
"volume": self._settings.prosody_volume,
},
"reference_id": self._settings.reference_id,
"reference_id": self._settings.voice,
}
if self._settings.temperature is not None:
request_settings["temperature"] = self._settings.temperature
if self._settings.top_p is not None:
request_settings["top_p"] = self._settings.top_p
start_message = {"event": "start", "request": {"text": "", **request_settings}}
await self._websocket.send(ormsgpack.packb(start_message))
logger.debug("Sent start message to Fish Audio")
@@ -375,7 +379,14 @@ class FishAudioTTSService(InterruptibleTTSService):
frame = TTSAudioRawFrame(audio_data, self.sample_rate, 1)
await self.push_frame(frame)
await self.stop_ttfb_metrics()
continue
elif event == "finish":
reason = msg.get("reason", "unknown")
if reason == "error":
await self.push_error(
error_msg="Fish Audio server error during synthesis"
)
else:
logger.debug(f"Fish Audio session finished: {reason}")
except Exception as e:
await self.push_error(error_msg=f"Unknown error occurred: {e}", exception=e)

View File

@@ -12,7 +12,7 @@ WebSocket API for streaming audio transcription.
import base64
import json
from dataclasses import dataclass
from dataclasses import dataclass, field
from typing import Any, AsyncGenerator, Optional
from loguru import logger
@@ -28,7 +28,7 @@ from pipecat.frames.frames import (
VADUserStoppedSpeakingFrame,
)
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.settings import STTSettings, _warn_deprecated_param
from pipecat.services.settings import NOT_GIVEN, STTSettings, _NotGiven, _warn_deprecated_param
from pipecat.services.stt_latency import GRADIUM_TTFS_P99
from pipecat.services.stt_service import WebsocketSTTService
from pipecat.transcriptions.language import Language, resolve_language
@@ -68,9 +68,16 @@ def language_to_gradium_language(language: Language) -> Optional[str]:
@dataclass
class GradiumSTTSettings(STTSettings):
"""Settings for GradiumSTTService."""
"""Settings for GradiumSTTService.
pass
Parameters:
delay_in_frames: Delay in audio frames (80ms each) before text is
generated. Higher delays allow more context but increase latency.
Allowed values: 7, 8, 10, 12, 14, 16, 20, 24, 36, 48.
Default is 10 (800ms). Lower values like 7-8 give faster response.
"""
delay_in_frames: Optional[int] | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
class GradiumSTTService(WebsocketSTTService):
@@ -107,7 +114,6 @@ class GradiumSTTService(WebsocketSTTService):
*,
api_key: str,
api_endpoint_base_url: str = "wss://eu.api.gradium.ai/api/speech/asr",
delay_in_frames: Optional[int] = None,
params: Optional[InputParams] = None,
json_config: Optional[str] = None,
settings: Optional[GradiumSTTSettings] = None,
@@ -119,9 +125,6 @@ class GradiumSTTService(WebsocketSTTService):
Args:
api_key: Gradium API key for authentication.
api_endpoint_base_url: WebSocket endpoint URL. Defaults to Gradium's streaming endpoint.
delay_in_frames: Delay in audio frames (80ms each) before text is
generated. Higher delays allow more context but increase latency.
Allowed values: 7, 8, 10, 12, 14, 16, 20, 24, 36, 48.
params: Configuration parameters for language and delay settings.
.. deprecated:: 0.0.105
@@ -151,9 +154,10 @@ class GradiumSTTService(WebsocketSTTService):
default_settings = GradiumSTTSettings(
model=None,
language=None,
delay_in_frames=None,
)
# 2. (no deprecated direct args for this service)
# 2. (No step 2, as there are no deprecated direct args)
# 3. Apply params overrides — only if settings not provided
if params is not None:
@@ -161,7 +165,7 @@ class GradiumSTTService(WebsocketSTTService):
if not settings:
default_settings.language = params.language
if params.delay_in_frames is not None:
delay_in_frames = params.delay_in_frames
default_settings.delay_in_frames = params.delay_in_frames
# 4. Apply settings delta (canonical API, always wins)
if settings is not None:
@@ -178,7 +182,6 @@ class GradiumSTTService(WebsocketSTTService):
self._api_endpoint_base_url = api_endpoint_base_url
self._websocket = None
self._json_config = json_config
self._config_delay_in_frames = delay_in_frames
self._receive_task = None
@@ -212,8 +215,9 @@ class GradiumSTTService(WebsocketSTTService):
if not changed:
return changed
await self._disconnect()
await self._connect()
if self._websocket:
await self._disconnect()
await self._connect()
return changed
async def start(self, frame: StartFrame):
@@ -358,8 +362,8 @@ class GradiumSTTService(WebsocketSTTService):
gradium_language = language_to_gradium_language(self._settings.language)
if gradium_language:
json_config["language"] = gradium_language
if self._config_delay_in_frames:
json_config["delay_in_frames"] = self._config_delay_in_frames
if self._settings.delay_in_frames:
json_config["delay_in_frames"] = self._settings.delay_in_frames
if json_config:
setup_msg["json_config"] = json_config
await self._websocket.send(json.dumps(setup_msg))

View File

@@ -39,7 +39,7 @@ SAMPLE_RATE = 48000
@dataclass
class GradiumTTSSettings(TTSSettings):
"""Settings for the Gradium TTS service."""
"""Settings for GradiumTTSService."""
pass

View File

@@ -34,7 +34,7 @@ except ModuleNotFoundError as e:
@dataclass
class GroqTTSSettings(TTSSettings):
"""Settings for the Groq TTS service.
"""Settings for GroqTTSService.
Parameters:
speed: Speech speed multiplier. Defaults to 1.0.

View File

@@ -50,7 +50,7 @@ DEFAULT_HEADERS = {
@dataclass
class HumeTTSSettings(TTSSettings):
"""Settings for Hume TTS service.
"""Settings for HumeTTSService.
Parameters:
description: Natural-language acting directions (up to 100 characters).

View File

@@ -68,32 +68,19 @@ from pipecat.utils.tracing.service_decorators import traced_tts
@dataclass
class InworldTTSSettings(TTSSettings):
"""Settings for Inworld TTS services.
"""Settings for InworldTTSService and InworldHttpTTSService.
Parameters:
speaking_rate: Speaking rate for speech synthesis.
temperature: Temperature for speech synthesis.
auto_mode: Whether to use auto mode. Recommended when texts are sent
in full sentences/phrases. When enabled, the server controls
flushing of buffered text to achieve minimal latency while
maintaining high quality audio output. If None (default),
automatically set based on aggregate_sentences.
apply_text_normalization: Whether to apply text normalization.
timestamp_transport_strategy: Strategy for timestamp transport ("ASYNC" or "SYNC").
"""
speaking_rate: float | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
temperature: float | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
auto_mode: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
apply_text_normalization: str | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
timestamp_transport_strategy: str | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
_aliases: ClassVar[Dict[str, str]] = {
"voiceId": "voice",
"modelId": "model",
"applyTextNormalization": "apply_text_normalization",
"autoMode": "auto_mode",
"timestampTransportStrategy": "timestamp_transport_strategy",
}
@classmethod
@@ -141,6 +128,7 @@ class InworldHttpTTSService(TTSService):
streaming: bool = True,
sample_rate: Optional[int] = None,
encoding: str = "LINEAR16",
timestamp_transport_strategy: Optional[Literal["ASYNC", "SYNC"]] = "ASYNC",
params: Optional[InputParams] = None,
settings: Optional[InworldTTSSettings] = None,
**kwargs,
@@ -163,6 +151,8 @@ class InworldHttpTTSService(TTSService):
streaming: Whether to use streaming mode.
sample_rate: Audio sample rate in Hz.
encoding: Audio encoding format.
timestamp_transport_strategy: Strategy for timestamp transport
("ASYNC" or "SYNC"). Defaults to "ASYNC".
params: Input parameters for Inworld TTS configuration.
.. deprecated:: 0.0.105
@@ -179,9 +169,6 @@ class InworldHttpTTSService(TTSService):
language=None,
speaking_rate=None,
temperature=None,
timestamp_transport_strategy="ASYNC",
auto_mode=None, # Not applicable for HTTP TTS
apply_text_normalization=None, # Not applicable for HTTP TTS
)
# 2. Apply direct init arg overrides (deprecated)
@@ -201,9 +188,7 @@ class InworldHttpTTSService(TTSService):
if params.temperature is not None:
default_settings.temperature = params.temperature
if params.timestamp_transport_strategy is not None:
default_settings.timestamp_transport_strategy = (
params.timestamp_transport_strategy
)
timestamp_transport_strategy = params.timestamp_transport_strategy
# 4. Apply settings delta (canonical API, always wins)
if settings is not None:
@@ -230,9 +215,10 @@ class InworldHttpTTSService(TTSService):
self._cumulative_time = 0.0
# Init-only audio format config (not runtime-updatable).
# Init-only config (not runtime-updatable).
self._audio_encoding = encoding
self._audio_sample_rate = 0 # Set in start()
self._timestamp_transport_strategy = timestamp_transport_strategy
def can_generate_metrics(self) -> bool:
"""Check if this service can generate processing metrics.
@@ -251,22 +237,6 @@ class InworldHttpTTSService(TTSService):
await super().start(frame)
self._audio_sample_rate = self.sample_rate
async def stop(self, frame: EndFrame):
"""Stop the Inworld TTS service.
Args:
frame: The end frame.
"""
await super().stop(frame)
async def cancel(self, frame: CancelFrame):
"""Cancel the Inworld TTS service.
Args:
frame: The cancel frame.
"""
await super().cancel(frame)
async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
"""Push a frame and handle state changes.
@@ -347,8 +317,8 @@ class InworldHttpTTSService(TTSService):
# Use WORD timestamps for simplicity and correct spacing/capitalization
payload["timestampType"] = self._timestamp_type
if self._settings.timestamp_transport_strategy is not None:
payload["timestampTransportStrategy"] = self._settings.timestamp_transport_strategy
if self._timestamp_transport_strategy is not None:
payload["timestampTransportStrategy"] = self._timestamp_transport_strategy
request_id = str(uuid.uuid4())
headers = {
@@ -556,6 +526,9 @@ class InworldTTSService(WebsocketTTSService):
url: str = "wss://api.inworld.ai/tts/v1/voice:streamBidirectional",
sample_rate: Optional[int] = None,
encoding: str = "LINEAR16",
auto_mode: Optional[bool] = None,
apply_text_normalization: Optional[str] = None,
timestamp_transport_strategy: Optional[Literal["ASYNC", "SYNC"]] = "ASYNC",
params: Optional[InputParams] = None,
settings: Optional[InworldTTSSettings] = None,
aggregate_sentences: Optional[bool] = None,
@@ -580,6 +553,12 @@ class InworldTTSService(WebsocketTTSService):
url: URL of the Inworld WebSocket API.
sample_rate: Audio sample rate in Hz.
encoding: Audio encoding format.
auto_mode: Whether to use auto mode. When enabled, the server
controls flushing of buffered text. If None (default),
automatically set based on ``aggregate_sentences``.
apply_text_normalization: Whether to apply text normalization.
timestamp_transport_strategy: Strategy for timestamp transport
("ASYNC" or "SYNC"). Defaults to "ASYNC".
params: Input parameters for Inworld WebSocket TTS configuration.
.. deprecated:: 0.0.105
@@ -596,6 +575,10 @@ class InworldTTSService(WebsocketTTSService):
append_trailing_space: Whether to append a trailing space to text before sending to TTS.
**kwargs: Additional arguments passed to the parent class.
"""
# Derive auto_mode from aggregate_sentences if not explicitly set
if auto_mode is None:
auto_mode = True if aggregate_sentences is None else aggregate_sentences
# 1. Initialize default_settings with hardcoded defaults
default_settings = InworldTTSSettings(
model="inworld-tts-1.5-max",
@@ -603,9 +586,6 @@ class InworldTTSService(WebsocketTTSService):
language=None,
speaking_rate=None,
temperature=None,
apply_text_normalization=None,
timestamp_transport_strategy="ASYNC",
auto_mode=True if aggregate_sentences is None else aggregate_sentences,
)
# 2. Apply direct init arg overrides (deprecated)
@@ -627,13 +607,11 @@ class InworldTTSService(WebsocketTTSService):
if params.temperature is not None:
default_settings.temperature = params.temperature
if params.apply_text_normalization is not None:
default_settings.apply_text_normalization = params.apply_text_normalization
apply_text_normalization = params.apply_text_normalization
if params.timestamp_transport_strategy is not None:
default_settings.timestamp_transport_strategy = (
params.timestamp_transport_strategy
)
timestamp_transport_strategy = params.timestamp_transport_strategy
if params.auto_mode is not None:
default_settings.auto_mode = params.auto_mode
auto_mode = params.auto_mode
_buffer_max_delay_ms = params.max_buffer_delay_ms
_buffer_char_threshold = params.buffer_char_threshold
@@ -673,9 +651,12 @@ class InworldTTSService(WebsocketTTSService):
# Track the end time of the last word in the current generation
self._generation_end_time = 0.0
# Init-only audio format config (not runtime-updatable).
# Init-only config (not runtime-updatable).
self._audio_encoding = encoding
self._audio_sample_rate = 0 # Set in start()
self._auto_mode = auto_mode
self._apply_text_normalization = apply_text_normalization
self._timestamp_transport_strategy = timestamp_transport_strategy
def can_generate_metrics(self) -> bool:
"""Check if this service can generate processing metrics.
@@ -926,7 +907,7 @@ class InworldTTSService(WebsocketTTSService):
for k in ["contextCreated", "audioChunk", "flushCompleted", "contextClosed"]
if k in result
]
logger.debug(f"{self}: Received message types={msg_types}, ctx_id={ctx_id}")
logger.trace(f"{self}: Received message types={msg_types}, ctx_id={ctx_id}")
# Check for errors
status = result.get("status", {})
@@ -1036,14 +1017,12 @@ class InworldTTSService(WebsocketTTSService):
if self._settings.temperature is not None:
create_config["temperature"] = self._settings.temperature
if self._settings.apply_text_normalization is not None:
create_config["applyTextNormalization"] = self._settings.apply_text_normalization
if self._settings.auto_mode is not None:
create_config["autoMode"] = self._settings.auto_mode
if self._settings.timestamp_transport_strategy is not None:
create_config["timestampTransportStrategy"] = (
self._settings.timestamp_transport_strategy
)
if self._apply_text_normalization is not None:
create_config["applyTextNormalization"] = self._apply_text_normalization
if self._auto_mode is not None:
create_config["autoMode"] = self._auto_mode
if self._timestamp_transport_strategy is not None:
create_config["timestampTransportStrategy"] = self._timestamp_transport_strategy
# Set buffer settings for timely audio generation.
# Use provided values or defaults that work well for streaming LLM output.

View File

@@ -89,7 +89,7 @@ def language_to_kokoro_language(language: Language) -> str:
@dataclass
class KokoroTTSSettings(TTSSettings):
"""Settings for the Kokoro TTS service."""
"""Settings for KokoroTTSService."""
pass

View File

@@ -19,7 +19,6 @@ from pipecat.frames.frames import (
Frame,
StartFrame,
TTSAudioRawFrame,
TTSStartedFrame,
TTSStoppedFrame,
)
from pipecat.processors.frame_processor import FrameDirection
@@ -48,6 +47,7 @@ def language_to_lmnt_language(language: Language) -> Optional[str]:
The corresponding LMNT language code, or None if not supported.
"""
LANGUAGE_MAP = {
Language.AR: "ar",
Language.DE: "de",
Language.EN: "en",
Language.ES: "es",
@@ -65,6 +65,7 @@ def language_to_lmnt_language(language: Language) -> Optional[str]:
Language.TH: "th",
Language.TR: "tr",
Language.UK: "uk",
Language.UR: "ur",
Language.VI: "vi",
Language.ZH: "zh",
}
@@ -74,7 +75,7 @@ def language_to_lmnt_language(language: Language) -> Optional[str]:
@dataclass
class LmntTTSSettings(TTSSettings):
"""Settings for LMNT TTS service."""
"""Settings for LmntTTSService."""
pass
@@ -96,6 +97,7 @@ class LmntTTSService(InterruptibleTTSService):
voice_id: Optional[str] = None,
sample_rate: Optional[int] = None,
language: Language = Language.EN,
output_format: str = "pcm_s16le",
model: Optional[str] = None,
settings: Optional[LmntTTSSettings] = None,
**kwargs,
@@ -111,6 +113,8 @@ class LmntTTSService(InterruptibleTTSService):
sample_rate: Audio sample rate. If None, uses default.
language: Language for synthesis. Defaults to English.
output_format: Audio output format. One of "pcm_s16le", "pcm_f32le",
"mp3", "ulaw", "webm". Defaults to "pcm_s16le".
model: TTS model to use.
.. deprecated:: 0.0.105
@@ -122,7 +126,7 @@ class LmntTTSService(InterruptibleTTSService):
"""
# 1. Initialize default_settings with hardcoded defaults
default_settings = LmntTTSSettings(
model="blizzard",
model="aurora",
voice=None,
language=self.language_to_service_language(language),
)
@@ -135,7 +139,7 @@ class LmntTTSService(InterruptibleTTSService):
_warn_deprecated_param("model", LmntTTSSettings, "model")
default_settings.model = model
# 3. No params for this service
# 3. (No step 3, as there's no params object to apply)
# 4. Apply settings delta (canonical API, always wins)
if settings is not None:
@@ -151,7 +155,7 @@ class LmntTTSService(InterruptibleTTSService):
)
self._api_key = api_key
self._output_format = "raw"
self._output_format = output_format
self._receive_task = None
def can_generate_metrics(self) -> bool:

View File

@@ -87,10 +87,9 @@ def language_to_minimax_language(language: Language) -> Optional[str]:
@dataclass
class MiniMaxTTSSettings(TTSSettings):
"""Settings for MiniMax TTS service.
"""Settings for MiniMaxHttpTTSService.
Parameters:
stream: Whether to use streaming mode.
speed: Speech speed (range: 0.5 to 2.0).
volume: Speech volume (range: 0 to 10).
pitch: Pitch adjustment (range: -12 to 12).
@@ -101,7 +100,6 @@ class MiniMaxTTSSettings(TTSSettings):
language_boost: Language boost string for multilingual support.
"""
stream: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
speed: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
volume: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
pitch: int | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
@@ -189,6 +187,7 @@ class MiniMaxHttpTTSService(TTSService):
voice_id: Optional[str] = None,
aiohttp_session: aiohttp.ClientSession,
sample_rate: Optional[int] = None,
stream: bool = True,
params: Optional[InputParams] = None,
settings: Optional[MiniMaxTTSSettings] = None,
**kwargs,
@@ -217,6 +216,7 @@ class MiniMaxHttpTTSService(TTSService):
aiohttp_session: aiohttp.ClientSession for API communication.
sample_rate: Output audio sample rate in Hz. If None, uses pipeline default.
stream: Whether to use streaming mode. Defaults to True.
params: Additional configuration parameters.
.. deprecated:: 0.0.105
@@ -231,7 +231,6 @@ class MiniMaxHttpTTSService(TTSService):
model="speech-02-turbo",
voice="Calm_Woman",
language=None,
stream=True,
speed=1.0,
volume=1.0,
pitch=0,
@@ -311,6 +310,7 @@ class MiniMaxHttpTTSService(TTSService):
self._api_key = api_key
self._group_id = group_id
self._stream = stream
self._base_url = f"{base_url}?GroupId={group_id}"
self._session = aiohttp_session
@@ -392,7 +392,7 @@ class MiniMaxHttpTTSService(TTSService):
# Create payload from settings
payload = {
"stream": self._settings.stream,
"stream": self._stream,
"voice_setting": voice_setting,
"audio_setting": audio_setting,
"model": self._settings.model,

View File

@@ -26,12 +26,10 @@ from pipecat.frames.frames import (
EndFrame,
ErrorFrame,
Frame,
InterruptionFrame,
LLMFullResponseEndFrame,
StartFrame,
TTSAudioRawFrame,
TTSSpeakFrame,
TTSStartedFrame,
TTSStoppedFrame,
)
from pipecat.processors.frame_processor import FrameDirection
@@ -76,7 +74,7 @@ def language_to_neuphonic_lang_code(language: Language) -> Optional[str]:
@dataclass
class NeuphonicTTSSettings(TTSSettings):
"""Settings for Neuphonic TTS service.
"""Settings for NeuphonicTTSService and NeuphonicHttpTTSService.
Parameters:
speed: Speech speed multiplier. Defaults to 1.0.
@@ -487,7 +485,7 @@ class NeuphonicHttpTTSService(TTSService):
default_settings = NeuphonicTTSSettings(
model=None,
voice=None,
language=self.language_to_service_language(Language.EN) or "en",
language=self.language_to_service_language(Language.EN),
speed=1.0,
)
@@ -501,9 +499,7 @@ class NeuphonicHttpTTSService(TTSService):
_warn_deprecated_param("params", NeuphonicTTSSettings)
if not settings:
if params.language is not None:
default_settings.language = (
self.language_to_service_language(params.language) or "en"
)
default_settings.language = self.language_to_service_language(params.language)
if params.speed is not None:
default_settings.speed = params.speed

View File

@@ -44,7 +44,7 @@ except ModuleNotFoundError as e:
@dataclass
class NvidiaTTSSettings(TTSSettings):
"""Settings for NVIDIA Riva TTS service.
"""Settings for NvidiaTTSService.
Parameters:
quality: Audio quality setting (0-100).

View File

@@ -62,7 +62,7 @@ VALID_VOICES: Dict[str, ValidVoice] = {
@dataclass
class OpenAITTSSettings(TTSSettings):
"""Settings for OpenAI TTS service.
"""Settings for OpenAITTSService.
Parameters:
instructions: Instructions to guide voice synthesis behavior.

View File

@@ -33,7 +33,7 @@ except ModuleNotFoundError as e:
@dataclass
class PiperTTSSettings(TTSSettings):
"""Settings for Piper TTS service."""
"""Settings for PiperTTSService."""
pass
@@ -82,7 +82,7 @@ class PiperTTSService(TTSService):
_warn_deprecated_param("voice_id", PiperTTSSettings, "voice")
default_settings.voice = voice_id
# 3. No params for this service
# 3. (No step 3, as there's no params object to apply)
# 4. Apply settings delta (canonical API, always wins)
if settings is not None:
@@ -186,7 +186,7 @@ class PiperTTSService(TTSService):
#
@dataclass
class PiperHttpTTSSettings(TTSSettings):
"""Settings for Piper HTTP TTS service."""
"""Settings for PiperHttpTTSService."""
pass
@@ -232,7 +232,7 @@ class PiperHttpTTSService(TTSService):
_warn_deprecated_param("voice_id", PiperHttpTTSSettings, "voice")
default_settings.voice = voice_id
# 3. No params for this service
# 3. (No step 3, as there's no params object to apply)
# 4. Apply settings delta (canonical API, always wins)
if settings is not None:

View File

@@ -38,7 +38,7 @@ except ModuleNotFoundError as e:
@dataclass
class ResembleAITTSSettings(TTSSettings):
"""Settings for Resemble AI TTS service."""
"""Settings for ResembleAITTSService."""
pass
@@ -94,7 +94,7 @@ class ResembleAITTSService(WebsocketTTSService):
_warn_deprecated_param("voice_id", ResembleAITTSSettings, "voice")
default_settings.voice = voice_id
# 3. No params for this service
# 3. (No step 3, as there's no params object to apply)
# 4. Apply settings delta (canonical API, always wins)
if settings is not None:

View File

@@ -73,7 +73,7 @@ def language_to_rime_language(language: Language) -> str:
@dataclass
class RimeTTSSettings(TTSSettings):
"""Settings for Rime WS JSON and HTTP TTS services.
"""Settings for RimeTTSService and RimeHttpTTSService.
Parameters:
segment: Text segmentation mode ("immediate", "bySentence", "never").
@@ -106,7 +106,7 @@ class RimeTTSSettings(TTSSettings):
@dataclass
class RimeNonJsonTTSSettings(TTSSettings):
"""Settings for Rime non-JSON WS TTS service.
"""Settings for RimeNonJsonTTSService.
Parameters:
segment: Text segmentation mode ("immediate", "bySentence", "never").

View File

@@ -400,12 +400,13 @@ class SarvamSTTService(STTService):
changed = await super()._update_settings(delta)
# Prompt is a WebSocket connect-time parameter; reconnect to apply.
if "prompt" in changed:
# Language and prompt are WebSocket connect-time parameters; reconnect to apply.
reconnect_fields = {"language", "prompt"}
if changed.keys() & reconnect_fields:
await self._disconnect()
await self._connect()
unhandled = {k: v for k, v in changed.items() if k != "prompt"}
unhandled = {k: v for k, v in changed.items() if k not in reconnect_fields}
if unhandled:
self._warn_unhandled_updated_settings(unhandled)
@@ -483,7 +484,6 @@ class SarvamSTTService(STTService):
Frame: None (transcription results come via WebSocket callbacks).
"""
if not self._socket_client:
logger.warning("WebSocket not connected, cannot process audio")
yield None
return
@@ -636,18 +636,22 @@ class SarvamSTTService(STTService):
await self.cancel_task(self._receive_task)
self._receive_task = None
if self._websocket_context and self._socket_client:
# Clear references first to prevent run_stt from sending audio
# during the close handshake.
socket_client = self._socket_client
websocket_context = self._websocket_context
self._socket_client = None
self._websocket_context = None
if websocket_context and socket_client:
try:
# Exit the async context manager
await self._websocket_context.__aexit__(None, None, None)
await websocket_context.__aexit__(None, None, None)
except Exception as e:
await self.push_error(
error_msg=f"Error closing WebSocket connection: {e}", exception=e
)
finally:
logger.debug("Disconnected from Sarvam WebSocket")
self._socket_client = None
self._websocket_context = None
async def _receive_task_handler(self):
"""Handle incoming messages from Sarvam WebSocket.

View File

@@ -53,11 +53,9 @@ from pipecat.frames.frames import (
EndFrame,
ErrorFrame,
Frame,
InterruptionFrame,
LLMFullResponseEndFrame,
StartFrame,
TTSAudioRawFrame,
TTSStartedFrame,
TTSStoppedFrame,
)
from pipecat.processors.frame_processor import FrameDirection
@@ -230,16 +228,27 @@ def language_to_sarvam_language(language: Language) -> Optional[str]:
"""
LANGUAGE_MAP = {
Language.BN: "bn-IN", # Bengali
Language.BN_IN: "bn-IN",
Language.EN: "en-IN", # English (India)
Language.EN_IN: "en-IN",
Language.GU: "gu-IN", # Gujarati
Language.GU_IN: "gu-IN",
Language.HI: "hi-IN", # Hindi
Language.HI_IN: "hi-IN",
Language.KN: "kn-IN", # Kannada
Language.KN_IN: "kn-IN",
Language.ML: "ml-IN", # Malayalam
Language.ML_IN: "ml-IN",
Language.MR: "mr-IN", # Marathi
Language.MR_IN: "mr-IN",
Language.OR: "od-IN", # Odia
Language.OR_IN: "od-IN",
Language.PA: "pa-IN", # Punjabi
Language.PA_IN: "pa-IN",
Language.TA: "ta-IN", # Tamil
Language.TA_IN: "ta-IN",
Language.TE: "te-IN", # Telugu
Language.TE_IN: "te-IN",
}
return resolve_language(language, LANGUAGE_MAP, use_base_code=False)
@@ -247,7 +256,7 @@ def language_to_sarvam_language(language: Language) -> Optional[str]:
@dataclass
class SarvamHttpTTSSettings(TTSSettings):
"""Settings for Sarvam HTTP TTS service.
"""Settings for SarvamHttpTTSService.
Parameters:
enable_preprocessing: Whether to enable text preprocessing. Defaults to False.
@@ -273,7 +282,7 @@ class SarvamHttpTTSSettings(TTSSettings):
@dataclass
class SarvamTTSSettings(SarvamHttpTTSSettings):
"""Settings for Sarvam WebSocket TTS service.
"""Settings for SarvamTTSService.
Extends :class:`SarvamHttpTTSSettings` with WebSocket-specific buffering parameters.
@@ -481,6 +490,10 @@ class SarvamHttpTTSService(TTSService):
if settings is not None:
default_settings.apply_update(settings)
# Convert Language enum to service-specific string
if isinstance(default_settings.language, Language):
default_settings.language = self.language_to_service_language(default_settings.language)
# Get model configuration (validates model exists)
resolved_model = default_settings.model
if resolved_model not in TTS_MODEL_CONFIGS:
@@ -900,6 +913,10 @@ class SarvamTTSService(InterruptibleTTSService):
if settings is not None:
default_settings.apply_update(settings)
# Convert Language enum to service-specific string
if isinstance(default_settings.language, Language):
default_settings.language = self.language_to_service_language(default_settings.language)
# Get model configuration (validates model exists)
resolved_model = default_settings.model
if resolved_model not in TTS_MODEL_CONFIGS:

View File

@@ -297,9 +297,7 @@ class SonioxSTTService(WebsocketSTTService):
await self._connect()
async def _update_settings(self, delta: SonioxSTTSettings) -> dict[str, Any]:
"""Apply settings delta.
Settings are stored but not applied to the active connection.
"""Apply settings delta and reconnect if anything changed.
Args:
delta: A settings delta.
@@ -309,15 +307,9 @@ class SonioxSTTService(WebsocketSTTService):
"""
changed = await super()._update_settings(delta)
if not changed:
return changed
# TODO: someday we could reconnect here to apply updated settings.
# Code might look something like the below:
# await self._disconnect()
# await self._connect()
self._warn_unhandled_updated_settings(changed)
if changed:
await self._disconnect()
await self._connect()
return changed

View File

@@ -37,7 +37,7 @@ except ModuleNotFoundError as e:
@dataclass
class SpeechmaticsTTSSettings(TTSSettings):
"""Settings for Speechmatics TTS service.
"""Settings for SpeechmaticsTTSService.
Parameters:
max_retries: Maximum number of retries for HTTP requests.

View File

@@ -70,7 +70,7 @@ def language_to_xtts_language(language: Language) -> Optional[str]:
@dataclass
class XTTSTTSSettings(TTSSettings):
"""Settings for XTTS TTS service."""
"""Settings for XTTSService."""
pass
@@ -124,6 +124,8 @@ class XTTSService(TTSService):
_warn_deprecated_param("voice_id", XTTSTTSSettings, "voice")
default_settings.voice = voice_id
# 3. (No step 3, as there's no params object to apply)
# 4. Apply settings delta (canonical API, always wins)
if settings is not None:
default_settings.apply_update(settings)

View File

@@ -34,7 +34,6 @@ new services are covered automatically with no per-service maintenance.
import importlib
import inspect
import pkgutil
import warnings
from dataclasses import fields
import pytest