TTS service and example updates

This commit is contained in:
Mark Backman
2026-03-06 20:39:00 -05:00
parent 4ed3480e4b
commit 671e9a6846
18 changed files with 142 additions and 131 deletions

View File

@@ -25,7 +25,7 @@ from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.openai.llm import OpenAILLMService, OpenAILLMSettings
from pipecat.services.xtts.tts import XTTSService, XTTSSettings
from pipecat.services.xtts.tts import XTTSService, XTTSTTSSettings
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
@@ -59,7 +59,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
tts = XTTSService(
aiohttp_session=session,
settings=XTTSSettings(
settings=XTTSTTSSettings(
voice="Claribel Dervla",
),
base_url="http://localhost:8000",

View File

@@ -72,7 +72,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
stt = GoogleSTTService(
credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
settings=GoogleSTTSettings(
languages=Language.EN_US,
languages=[Language.EN_US],
),
)

View File

@@ -55,7 +55,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
stt = GoogleSTTService(
settings=GoogleSTTSettings(
languages=Language.EN_US,
languages=[Language.EN_US],
),
credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
)

View File

@@ -55,8 +55,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
stt = GoogleSTTService(
settings=GoogleSTTSettings(
languages=Language.EN_US,
model="chirp_3",
languages=[Language.EN_US],
# Add model to use a specific model
# model="chirp_3",
),
credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
location="us",

View File

@@ -58,7 +58,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
tts = FishAudioTTSService(
api_key=os.getenv("FISH_API_KEY"),
settings=FishAudioTTSSettings(
model="4ce7e917cedd4bc2bb2e6ff3a46acaa1", # Barack Obama
voice="4ce7e917cedd4bc2bb2e6ff3a46acaa1", # Barack Obama
),
)

View File

@@ -24,7 +24,7 @@ from pipecat.processors.aggregators.llm_response_universal import (
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.openai.llm import OpenAILLMService, OpenAILLMSettings
from pipecat.services.sarvam.stt import SarvamSTTService
from pipecat.services.sarvam.stt import SarvamSTTService, SarvamSTTSettings
from pipecat.services.sarvam.tts import SarvamHttpTTSService, SarvamHttpTTSSettings
from pipecat.transcriptions.language import Language
from pipecat.transports.base_transport import BaseTransport, TransportParams
@@ -59,14 +59,16 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
async with aiohttp.ClientSession() as session:
stt = SarvamSTTService(
api_key=os.getenv("SARVAM_API_KEY"),
model="saarika:v2.5",
settings=SarvamSTTSettings(
model="saarika:v2.5",
),
)
tts = SarvamHttpTTSService(
api_key=os.getenv("SARVAM_API_KEY"),
aiohttp_session=session,
settings=SarvamHttpTTSSettings(
language=Language.EN,
language=Language.EN_IN,
),
)

View File

@@ -7,7 +7,6 @@
import os
from deepgram import LiveOptions
from dotenv import load_dotenv
from loguru import logger
@@ -28,7 +27,7 @@ from pipecat.processors.filters.function_filter import FunctionFilter
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.cartesia.tts import CartesiaTTSService, CartesiaTTSSettings
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.deepgram.stt import DeepgramSTTService, DeepgramSTTSettings
from pipecat.services.llm_service import FunctionCallParams
from pipecat.services.openai.llm import OpenAILLMService, OpenAILLMSettings
from pipecat.transports.base_transport import BaseTransport, TransportParams
@@ -102,7 +101,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")
stt = DeepgramSTTService(
api_key=os.getenv("DEEPGRAM_API_KEY"), live_options=LiveOptions(language="multi")
api_key=os.getenv("DEEPGRAM_API_KEY"),
settings=DeepgramSTTSettings(
language="multi",
),
)
tts = SwitchLanguage()

View File

@@ -146,6 +146,7 @@ TESTS_07 = [
("07zg-interruptible-camb.py", EVAL_SIMPLE_MATH),
("07zi-interruptible-piper.py", EVAL_SIMPLE_MATH),
("07zj-interruptible-kokoro.py", EVAL_SIMPLE_MATH),
("07zk-interruptible-resembleai.py", EVAL_SIMPLE_MATH),
# Needs a local XTTS docker instance running.
# ("07i-interruptible-xtts.py", EVAL_SIMPLE_MATH),
]

View File

@@ -23,7 +23,6 @@ from pipecat.frames.frames import (
Frame,
StartFrame,
TTSAudioRawFrame,
TTSStartedFrame,
TTSStoppedFrame,
)
from pipecat.services.settings import NOT_GIVEN, TTSSettings, _NotGiven, _warn_deprecated_param
@@ -705,7 +704,7 @@ class CartesiaHttpTTSService(TTSService):
voice_id: Optional[str] = None,
model: Optional[str] = None,
base_url: str = "https://api.cartesia.ai",
cartesia_version: str = "2024-11-13",
cartesia_version: str = "2026-03-01",
aiohttp_session: Optional[aiohttp.ClientSession] = None,
sample_rate: Optional[int] = None,
encoding: str = "pcm_s16le",

View File

@@ -200,18 +200,12 @@ class ElevenLabsRealtimeSTTSettings(STTSettings):
vad_threshold: VAD sensitivity (0.1-0.9, lower is more sensitive).
min_speech_duration_ms: Minimum speech duration for VAD (50-2000ms).
min_silence_duration_ms: Minimum silence duration for VAD (50-2000ms).
include_timestamps: Whether to include word-level timestamps in transcripts.
enable_logging: Whether to enable logging on ElevenLabs' side.
include_language_detection: Whether to include language detection in transcripts.
"""
vad_silence_threshold_secs: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
vad_threshold: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
min_speech_duration_ms: int | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
min_silence_duration_ms: int | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
include_timestamps: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
enable_logging: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
include_language_detection: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
class ElevenLabsSTTService(SegmentedSTTService):
@@ -496,6 +490,9 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
commit_strategy: CommitStrategy = CommitStrategy.MANUAL,
model: Optional[str] = None,
sample_rate: Optional[int] = None,
include_timestamps: bool = False,
enable_logging: bool = False,
include_language_detection: bool = False,
params: Optional[InputParams] = None,
settings: Optional[ElevenLabsRealtimeSTTSettings] = None,
ttfs_p99_latency: Optional[float] = ELEVENLABS_REALTIME_TTFS_P99,
@@ -515,6 +512,9 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
Use ``settings=ElevenLabsRealtimeSTTSettings(model=...)`` instead.
sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate.
include_timestamps: Whether to include word-level timestamps in transcripts.
enable_logging: Whether to enable logging on ElevenLabs' side.
include_language_detection: Whether to include language detection in transcripts.
params: Configuration parameters for the STT service.
.. deprecated:: 0.0.105
@@ -534,9 +534,6 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
vad_threshold=None,
min_speech_duration_ms=None,
min_silence_duration_ms=None,
include_timestamps=False,
enable_logging=False,
include_language_detection=False,
)
# 2. Apply direct init arg overrides (deprecated)
@@ -555,9 +552,9 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
default_settings.vad_threshold = params.vad_threshold
default_settings.min_speech_duration_ms = params.min_speech_duration_ms
default_settings.min_silence_duration_ms = params.min_silence_duration_ms
default_settings.include_timestamps = params.include_timestamps
default_settings.enable_logging = params.enable_logging
default_settings.include_language_detection = params.include_language_detection
include_timestamps = params.include_timestamps
enable_logging = params.enable_logging
include_language_detection = params.include_language_detection
# 4. Apply settings delta (canonical API, always wins)
if settings is not None:
@@ -579,6 +576,9 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
# Init-only config (not runtime-updatable).
self._commit_strategy = commit_strategy
self._include_timestamps = include_timestamps
self._enable_logging = enable_logging
self._include_language_detection = include_language_detection
self._connected_event = asyncio.Event()
self._connected_event.set()
@@ -762,17 +762,15 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
params.append(f"commit_strategy={self._commit_strategy.value}")
# Add optional parameters
if self._settings.include_timestamps:
params.append(
f"include_timestamps={str(self._settings.include_timestamps).lower()}"
)
if self._include_timestamps:
params.append(f"include_timestamps={str(self._include_timestamps).lower()}")
if self._settings.enable_logging:
params.append(f"enable_logging={str(self._settings.enable_logging).lower()}")
if self._enable_logging:
params.append(f"enable_logging={str(self._enable_logging).lower()}")
if self._settings.include_language_detection:
if self._include_language_detection:
params.append(
f"include_language_detection={str(self._settings.include_language_detection).lower()}"
f"include_language_detection={str(self._include_language_detection).lower()}"
)
# Add VAD parameters if using VAD commit strategy and values are specified
@@ -920,7 +918,7 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
"""
# If timestamps are enabled, skip this message and wait for the
# committed_transcript_with_timestamps message which contains all the data
if self._settings.include_timestamps:
if self._include_timestamps:
return
text = data.get("text", "").strip()

View File

@@ -358,6 +358,9 @@ class ElevenLabsTTSService(WebsocketTTSService):
model: Optional[str] = None,
url: str = "wss://api.elevenlabs.io",
sample_rate: Optional[int] = None,
auto_mode: bool = True,
enable_ssml_parsing: Optional[bool] = None,
enable_logging: Optional[bool] = None,
pronunciation_dictionary_locators: Optional[List[PronunciationDictionaryLocator]] = None,
params: Optional[InputParams] = None,
settings: Optional[ElevenLabsTTSSettings] = None,
@@ -381,6 +384,9 @@ class ElevenLabsTTSService(WebsocketTTSService):
url: WebSocket URL for ElevenLabs TTS API.
sample_rate: Audio sample rate. If None, uses default.
auto_mode: Whether to enable automatic mode optimization.
enable_ssml_parsing: Whether to parse SSML tags in text.
enable_logging: Whether to enable ElevenLabs server-side logging.
pronunciation_dictionary_locators: List of pronunciation dictionary
locators to use.
params: Additional input parameters for voice customization.
@@ -428,11 +434,6 @@ class ElevenLabsTTSService(WebsocketTTSService):
apply_text_normalization=None,
)
# Track init-only URL params through the override chain
_auto_mode = True
_enable_ssml_parsing = None
_enable_logging = None
# 2. Apply direct init arg overrides (deprecated)
if voice_id is not None:
_warn_deprecated_param("voice_id", ElevenLabsTTSSettings, "voice")
@@ -459,11 +460,11 @@ class ElevenLabsTTSService(WebsocketTTSService):
if params.speed is not None:
default_settings.speed = params.speed
if params.auto_mode is not None:
_auto_mode = str(params.auto_mode).lower()
auto_mode = params.auto_mode
if params.enable_ssml_parsing is not None:
_enable_ssml_parsing = params.enable_ssml_parsing
enable_ssml_parsing = params.enable_ssml_parsing
if params.enable_logging is not None:
_enable_logging = params.enable_logging
enable_logging = params.enable_logging
if params.apply_text_normalization is not None:
default_settings.apply_text_normalization = params.apply_text_normalization
if _pronunciation_dictionary_locators is None:
@@ -488,9 +489,9 @@ class ElevenLabsTTSService(WebsocketTTSService):
self._url = url
# Init-only WebSocket URL params (not runtime-updatable).
self._auto_mode = _auto_mode
self._enable_ssml_parsing = _enable_ssml_parsing
self._enable_logging = _enable_logging
self._auto_mode = auto_mode
self._enable_ssml_parsing = enable_ssml_parsing
self._enable_logging = enable_logging
self._output_format = "" # initialized in start()
self._voice_settings = self._set_voice_settings()
@@ -664,7 +665,7 @@ class ElevenLabsTTSService(WebsocketTTSService):
voice_id = self._settings.voice
model = self._settings.model
output_format = self._output_format
url = f"{self._url}/v1/text-to-speech/{voice_id}/multi-stream-input?model_id={model}&output_format={output_format}&auto_mode={self._auto_mode}"
url = f"{self._url}/v1/text-to-speech/{voice_id}/multi-stream-input?model_id={model}&output_format={output_format}&auto_mode={str(self._auto_mode).lower()}"
if self._enable_ssml_parsing:
url += f"&enable_ssml_parsing={self._enable_ssml_parsing}"

View File

@@ -10,9 +10,8 @@ This module provides integration with Fish Audio's real-time TTS WebSocket API
for streaming text-to-speech synthesis with customizable voice parameters.
"""
import uuid
from dataclasses import dataclass, field
from typing import Any, AsyncGenerator, ClassVar, Dict, Literal, Mapping, Optional, Self
from typing import Any, AsyncGenerator, Literal, Mapping, Optional, Self
from loguru import logger
from pydantic import BaseModel
@@ -25,7 +24,6 @@ from pipecat.frames.frames import (
InterruptionFrame,
StartFrame,
TTSAudioRawFrame,
TTSStartedFrame,
TTSStoppedFrame,
)
from pipecat.processors.frame_processor import FrameDirection
@@ -52,18 +50,20 @@ class FishAudioTTSSettings(TTSSettings):
"""Settings for FishAudioTTSService.
Parameters:
latency: Latency mode ("normal" or "balanced"). Defaults to "normal".
latency: Latency mode ("normal" or "balanced"). Defaults to "balanced".
normalize: Whether to normalize audio output. Defaults to True.
temperature: Controls randomness in speech generation (0.0-1.0).
top_p: Controls diversity via nucleus sampling (0.0-1.0).
prosody_speed: Speech speed multiplier (0.5-2.0). Defaults to 1.0.
prosody_volume: Volume adjustment in dB. Defaults to 0.
reference_id: Reference ID of the voice model.
prosody_volume: Volume adjustment in dB (-20 to 20). Defaults to 0.
"""
latency: str | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
normalize: bool | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
temperature: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
top_p: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
prosody_speed: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
prosody_volume: int | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
reference_id: str | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
@classmethod
def from_mapping(cls, settings: Mapping[str, Any]) -> Self:
@@ -174,18 +174,18 @@ class FishAudioTTSService(InterruptibleTTSService):
model="s1",
voice=None,
language=None,
latency="normal",
latency="balanced",
normalize=True,
temperature=None,
top_p=None,
prosody_speed=1.0,
prosody_volume=0,
reference_id=None,
)
# 2. Apply direct init arg overrides (deprecated)
if reference_id is not None:
_warn_deprecated_param("reference_id", FishAudioTTSSettings, "voice")
default_settings.voice = reference_id
default_settings.reference_id = reference_id
if model_id is not None:
_warn_deprecated_param("model_id", FishAudioTTSSettings, "model")
default_settings.model = model_id
@@ -317,8 +317,12 @@ class FishAudioTTSService(InterruptibleTTSService):
"speed": self._settings.prosody_speed,
"volume": self._settings.prosody_volume,
},
"reference_id": self._settings.reference_id,
"reference_id": self._settings.voice,
}
if self._settings.temperature is not None:
request_settings["temperature"] = self._settings.temperature
if self._settings.top_p is not None:
request_settings["top_p"] = self._settings.top_p
start_message = {"event": "start", "request": {"text": "", **request_settings}}
await self._websocket.send(ormsgpack.packb(start_message))
logger.debug("Sent start message to Fish Audio")
@@ -375,7 +379,14 @@ class FishAudioTTSService(InterruptibleTTSService):
frame = TTSAudioRawFrame(audio_data, self.sample_rate, 1)
await self.push_frame(frame)
await self.stop_ttfb_metrics()
continue
elif event == "finish":
reason = msg.get("reason", "unknown")
if reason == "error":
await self.push_error(
error_msg="Fish Audio server error during synthesis"
)
else:
logger.debug(f"Fish Audio session finished: {reason}")
except Exception as e:
await self.push_error(error_msg=f"Unknown error occurred: {e}", exception=e)

View File

@@ -73,27 +73,14 @@ class InworldTTSSettings(TTSSettings):
Parameters:
speaking_rate: Speaking rate for speech synthesis.
temperature: Temperature for speech synthesis.
auto_mode: Whether to use auto mode. Recommended when texts are sent
in full sentences/phrases. When enabled, the server controls
flushing of buffered text to achieve minimal latency while
maintaining high quality audio output. If None (default),
automatically set based on aggregate_sentences.
apply_text_normalization: Whether to apply text normalization.
timestamp_transport_strategy: Strategy for timestamp transport ("ASYNC" or "SYNC").
"""
speaking_rate: float | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
temperature: float | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
auto_mode: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
apply_text_normalization: str | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
timestamp_transport_strategy: str | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
_aliases: ClassVar[Dict[str, str]] = {
"voiceId": "voice",
"modelId": "model",
"applyTextNormalization": "apply_text_normalization",
"autoMode": "auto_mode",
"timestampTransportStrategy": "timestamp_transport_strategy",
}
@classmethod
@@ -141,6 +128,7 @@ class InworldHttpTTSService(TTSService):
streaming: bool = True,
sample_rate: Optional[int] = None,
encoding: str = "LINEAR16",
timestamp_transport_strategy: Optional[Literal["ASYNC", "SYNC"]] = "ASYNC",
params: Optional[InputParams] = None,
settings: Optional[InworldTTSSettings] = None,
**kwargs,
@@ -163,6 +151,8 @@ class InworldHttpTTSService(TTSService):
streaming: Whether to use streaming mode.
sample_rate: Audio sample rate in Hz.
encoding: Audio encoding format.
timestamp_transport_strategy: Strategy for timestamp transport
("ASYNC" or "SYNC"). Defaults to "ASYNC".
params: Input parameters for Inworld TTS configuration.
.. deprecated:: 0.0.105
@@ -179,9 +169,6 @@ class InworldHttpTTSService(TTSService):
language=None,
speaking_rate=None,
temperature=None,
timestamp_transport_strategy="ASYNC",
auto_mode=None, # Not applicable for HTTP TTS
apply_text_normalization=None, # Not applicable for HTTP TTS
)
# 2. Apply direct init arg overrides (deprecated)
@@ -201,9 +188,7 @@ class InworldHttpTTSService(TTSService):
if params.temperature is not None:
default_settings.temperature = params.temperature
if params.timestamp_transport_strategy is not None:
default_settings.timestamp_transport_strategy = (
params.timestamp_transport_strategy
)
timestamp_transport_strategy = params.timestamp_transport_strategy
# 4. Apply settings delta (canonical API, always wins)
if settings is not None:
@@ -230,9 +215,10 @@ class InworldHttpTTSService(TTSService):
self._cumulative_time = 0.0
# Init-only audio format config (not runtime-updatable).
# Init-only config (not runtime-updatable).
self._audio_encoding = encoding
self._audio_sample_rate = 0 # Set in start()
self._timestamp_transport_strategy = timestamp_transport_strategy
def can_generate_metrics(self) -> bool:
"""Check if this service can generate processing metrics.
@@ -251,22 +237,6 @@ class InworldHttpTTSService(TTSService):
await super().start(frame)
self._audio_sample_rate = self.sample_rate
async def stop(self, frame: EndFrame):
"""Stop the Inworld TTS service.
Args:
frame: The end frame.
"""
await super().stop(frame)
async def cancel(self, frame: CancelFrame):
"""Cancel the Inworld TTS service.
Args:
frame: The cancel frame.
"""
await super().cancel(frame)
async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
"""Push a frame and handle state changes.
@@ -347,8 +317,8 @@ class InworldHttpTTSService(TTSService):
# Use WORD timestamps for simplicity and correct spacing/capitalization
payload["timestampType"] = self._timestamp_type
if self._settings.timestamp_transport_strategy is not None:
payload["timestampTransportStrategy"] = self._settings.timestamp_transport_strategy
if self._timestamp_transport_strategy is not None:
payload["timestampTransportStrategy"] = self._timestamp_transport_strategy
request_id = str(uuid.uuid4())
headers = {
@@ -556,6 +526,9 @@ class InworldTTSService(WebsocketTTSService):
url: str = "wss://api.inworld.ai/tts/v1/voice:streamBidirectional",
sample_rate: Optional[int] = None,
encoding: str = "LINEAR16",
auto_mode: Optional[bool] = None,
apply_text_normalization: Optional[str] = None,
timestamp_transport_strategy: Optional[Literal["ASYNC", "SYNC"]] = "ASYNC",
params: Optional[InputParams] = None,
settings: Optional[InworldTTSSettings] = None,
aggregate_sentences: Optional[bool] = None,
@@ -580,6 +553,12 @@ class InworldTTSService(WebsocketTTSService):
url: URL of the Inworld WebSocket API.
sample_rate: Audio sample rate in Hz.
encoding: Audio encoding format.
auto_mode: Whether to use auto mode. When enabled, the server
controls flushing of buffered text. If None (default),
automatically set based on ``aggregate_sentences``.
apply_text_normalization: Whether to apply text normalization.
timestamp_transport_strategy: Strategy for timestamp transport
("ASYNC" or "SYNC"). Defaults to "ASYNC".
params: Input parameters for Inworld WebSocket TTS configuration.
.. deprecated:: 0.0.105
@@ -596,6 +575,10 @@ class InworldTTSService(WebsocketTTSService):
append_trailing_space: Whether to append a trailing space to text before sending to TTS.
**kwargs: Additional arguments passed to the parent class.
"""
# Derive auto_mode from aggregate_sentences if not explicitly set
if auto_mode is None:
auto_mode = True if aggregate_sentences is None else aggregate_sentences
# 1. Initialize default_settings with hardcoded defaults
default_settings = InworldTTSSettings(
model="inworld-tts-1.5-max",
@@ -603,9 +586,6 @@ class InworldTTSService(WebsocketTTSService):
language=None,
speaking_rate=None,
temperature=None,
apply_text_normalization=None,
timestamp_transport_strategy="ASYNC",
auto_mode=True if aggregate_sentences is None else aggregate_sentences,
)
# 2. Apply direct init arg overrides (deprecated)
@@ -627,13 +607,11 @@ class InworldTTSService(WebsocketTTSService):
if params.temperature is not None:
default_settings.temperature = params.temperature
if params.apply_text_normalization is not None:
default_settings.apply_text_normalization = params.apply_text_normalization
apply_text_normalization = params.apply_text_normalization
if params.timestamp_transport_strategy is not None:
default_settings.timestamp_transport_strategy = (
params.timestamp_transport_strategy
)
timestamp_transport_strategy = params.timestamp_transport_strategy
if params.auto_mode is not None:
default_settings.auto_mode = params.auto_mode
auto_mode = params.auto_mode
_buffer_max_delay_ms = params.max_buffer_delay_ms
_buffer_char_threshold = params.buffer_char_threshold
@@ -673,9 +651,12 @@ class InworldTTSService(WebsocketTTSService):
# Track the end time of the last word in the current generation
self._generation_end_time = 0.0
# Init-only audio format config (not runtime-updatable).
# Init-only config (not runtime-updatable).
self._audio_encoding = encoding
self._audio_sample_rate = 0 # Set in start()
self._auto_mode = auto_mode
self._apply_text_normalization = apply_text_normalization
self._timestamp_transport_strategy = timestamp_transport_strategy
def can_generate_metrics(self) -> bool:
"""Check if this service can generate processing metrics.
@@ -1036,14 +1017,12 @@ class InworldTTSService(WebsocketTTSService):
if self._settings.temperature is not None:
create_config["temperature"] = self._settings.temperature
if self._settings.apply_text_normalization is not None:
create_config["applyTextNormalization"] = self._settings.apply_text_normalization
if self._settings.auto_mode is not None:
create_config["autoMode"] = self._settings.auto_mode
if self._settings.timestamp_transport_strategy is not None:
create_config["timestampTransportStrategy"] = (
self._settings.timestamp_transport_strategy
)
if self._apply_text_normalization is not None:
create_config["applyTextNormalization"] = self._apply_text_normalization
if self._auto_mode is not None:
create_config["autoMode"] = self._auto_mode
if self._timestamp_transport_strategy is not None:
create_config["timestampTransportStrategy"] = self._timestamp_transport_strategy
# Set buffer settings for timely audio generation.
# Use provided values or defaults that work well for streaming LLM output.

View File

@@ -19,7 +19,6 @@ from pipecat.frames.frames import (
Frame,
StartFrame,
TTSAudioRawFrame,
TTSStartedFrame,
TTSStoppedFrame,
)
from pipecat.processors.frame_processor import FrameDirection
@@ -48,6 +47,7 @@ def language_to_lmnt_language(language: Language) -> Optional[str]:
The corresponding LMNT language code, or None if not supported.
"""
LANGUAGE_MAP = {
Language.AR: "ar",
Language.DE: "de",
Language.EN: "en",
Language.ES: "es",
@@ -65,6 +65,7 @@ def language_to_lmnt_language(language: Language) -> Optional[str]:
Language.TH: "th",
Language.TR: "tr",
Language.UK: "uk",
Language.UR: "ur",
Language.VI: "vi",
Language.ZH: "zh",
}
@@ -96,6 +97,7 @@ class LmntTTSService(InterruptibleTTSService):
voice_id: Optional[str] = None,
sample_rate: Optional[int] = None,
language: Language = Language.EN,
output_format: str = "pcm_s16le",
model: Optional[str] = None,
settings: Optional[LmntTTSSettings] = None,
**kwargs,
@@ -111,6 +113,8 @@ class LmntTTSService(InterruptibleTTSService):
sample_rate: Audio sample rate. If None, uses default.
language: Language for synthesis. Defaults to English.
output_format: Audio output format. One of "pcm_s16le", "pcm_f32le",
"mp3", "ulaw", "webm". Defaults to "pcm_s16le".
model: TTS model to use.
.. deprecated:: 0.0.105
@@ -122,7 +126,7 @@ class LmntTTSService(InterruptibleTTSService):
"""
# 1. Initialize default_settings with hardcoded defaults
default_settings = LmntTTSSettings(
model="blizzard",
model="aurora",
voice=None,
language=self.language_to_service_language(language),
)
@@ -151,7 +155,7 @@ class LmntTTSService(InterruptibleTTSService):
)
self._api_key = api_key
self._output_format = "raw"
self._output_format = output_format
self._receive_task = None
def can_generate_metrics(self) -> bool:

View File

@@ -90,7 +90,6 @@ class MiniMaxTTSSettings(TTSSettings):
"""Settings for MiniMaxHttpTTSService.
Parameters:
stream: Whether to use streaming mode.
speed: Speech speed (range: 0.5 to 2.0).
volume: Speech volume (range: 0 to 10).
pitch: Pitch adjustment (range: -12 to 12).
@@ -101,7 +100,6 @@ class MiniMaxTTSSettings(TTSSettings):
language_boost: Language boost string for multilingual support.
"""
stream: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
speed: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
volume: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
pitch: int | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
@@ -189,6 +187,7 @@ class MiniMaxHttpTTSService(TTSService):
voice_id: Optional[str] = None,
aiohttp_session: aiohttp.ClientSession,
sample_rate: Optional[int] = None,
stream: bool = True,
params: Optional[InputParams] = None,
settings: Optional[MiniMaxTTSSettings] = None,
**kwargs,
@@ -217,6 +216,7 @@ class MiniMaxHttpTTSService(TTSService):
aiohttp_session: aiohttp.ClientSession for API communication.
sample_rate: Output audio sample rate in Hz. If None, uses pipeline default.
stream: Whether to use streaming mode. Defaults to True.
params: Additional configuration parameters.
.. deprecated:: 0.0.105
@@ -231,7 +231,6 @@ class MiniMaxHttpTTSService(TTSService):
model="speech-02-turbo",
voice="Calm_Woman",
language=None,
stream=True,
speed=1.0,
volume=1.0,
pitch=0,
@@ -311,6 +310,7 @@ class MiniMaxHttpTTSService(TTSService):
self._api_key = api_key
self._group_id = group_id
self._stream = stream
self._base_url = f"{base_url}?GroupId={group_id}"
self._session = aiohttp_session
@@ -392,7 +392,7 @@ class MiniMaxHttpTTSService(TTSService):
# Create payload from settings
payload = {
"stream": self._settings.stream,
"stream": self._stream,
"voice_setting": voice_setting,
"audio_setting": audio_setting,
"model": self._settings.model,

View File

@@ -26,12 +26,10 @@ from pipecat.frames.frames import (
EndFrame,
ErrorFrame,
Frame,
InterruptionFrame,
LLMFullResponseEndFrame,
StartFrame,
TTSAudioRawFrame,
TTSSpeakFrame,
TTSStartedFrame,
TTSStoppedFrame,
)
from pipecat.processors.frame_processor import FrameDirection
@@ -487,7 +485,7 @@ class NeuphonicHttpTTSService(TTSService):
default_settings = NeuphonicTTSSettings(
model=None,
voice=None,
language=self.language_to_service_language(Language.EN) or "en",
language=self.language_to_service_language(Language.EN),
speed=1.0,
)
@@ -501,9 +499,7 @@ class NeuphonicHttpTTSService(TTSService):
_warn_deprecated_param("params", NeuphonicTTSSettings)
if not settings:
if params.language is not None:
default_settings.language = (
self.language_to_service_language(params.language) or "en"
)
default_settings.language = self.language_to_service_language(params.language)
if params.speed is not None:
default_settings.speed = params.speed

View File

@@ -53,11 +53,9 @@ from pipecat.frames.frames import (
EndFrame,
ErrorFrame,
Frame,
InterruptionFrame,
LLMFullResponseEndFrame,
StartFrame,
TTSAudioRawFrame,
TTSStartedFrame,
TTSStoppedFrame,
)
from pipecat.processors.frame_processor import FrameDirection
@@ -230,16 +228,27 @@ def language_to_sarvam_language(language: Language) -> Optional[str]:
"""
LANGUAGE_MAP = {
Language.BN: "bn-IN", # Bengali
Language.BN_IN: "bn-IN",
Language.EN: "en-IN", # English (India)
Language.EN_IN: "en-IN",
Language.GU: "gu-IN", # Gujarati
Language.GU_IN: "gu-IN",
Language.HI: "hi-IN", # Hindi
Language.HI_IN: "hi-IN",
Language.KN: "kn-IN", # Kannada
Language.KN_IN: "kn-IN",
Language.ML: "ml-IN", # Malayalam
Language.ML_IN: "ml-IN",
Language.MR: "mr-IN", # Marathi
Language.MR_IN: "mr-IN",
Language.OR: "od-IN", # Odia
Language.OR_IN: "od-IN",
Language.PA: "pa-IN", # Punjabi
Language.PA_IN: "pa-IN",
Language.TA: "ta-IN", # Tamil
Language.TA_IN: "ta-IN",
Language.TE: "te-IN", # Telugu
Language.TE_IN: "te-IN",
}
return resolve_language(language, LANGUAGE_MAP, use_base_code=False)
@@ -481,6 +490,10 @@ class SarvamHttpTTSService(TTSService):
if settings is not None:
default_settings.apply_update(settings)
# Convert Language enum to service-specific string
if isinstance(default_settings.language, Language):
default_settings.language = self.language_to_service_language(default_settings.language)
# Get model configuration (validates model exists)
resolved_model = default_settings.model
if resolved_model not in TTS_MODEL_CONFIGS:
@@ -900,6 +913,10 @@ class SarvamTTSService(InterruptibleTTSService):
if settings is not None:
default_settings.apply_update(settings)
# Convert Language enum to service-specific string
if isinstance(default_settings.language, Language):
default_settings.language = self.language_to_service_language(default_settings.language)
# Get model configuration (validates model exists)
resolved_model = default_settings.model
if resolved_model not in TTS_MODEL_CONFIGS:

View File

@@ -70,7 +70,7 @@ def language_to_xtts_language(language: Language) -> Optional[str]:
@dataclass
class XTTSTTSSettings(TTSSettings):
"""Settings for XTTSTTSService."""
"""Settings for XTTSService."""
pass