From ce51df677c7a3aebfd5e0505f4633a9b41bf4bed Mon Sep 17 00:00:00 2001
From: Paul Kompfner
Date: Tue, 17 Feb 2026 17:07:14 -0500
Subject: [PATCH] Add backward-compat `_aliases` and `from_mapping` overrides to TTS settings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The migration from plain-dict `self._settings` to typed dataclasses
renamed keys and flattened nested dicts. The deprecated dict-based
`TTSUpdateSettingsFrame(settings={...})` code path calls `from_mapping`,
which silently dropped the old keys into `extra` instead of mapping them
to the new fields.

- Add `_aliases` so renamed flat keys (e.g. `sample_rate` →
  `fish_sample_rate`, camelCase Inworld keys) resolve correctly.
- Override `from_mapping` to destructure nested dicts (`output_format`,
  `prosody`, `audioConfig`, `voice_setting`, `audio_setting`) into their
  flat field equivalents.
- Fix the AsyncAI constructor, which passed an `output_format={...}` dict
  instead of the individual
  `output_container`/`output_encoding`/`output_sample_rate` fields.
---
 src/pipecat/services/asyncai/tts.py    | 21 ++++++++++++-----
 src/pipecat/services/cartesia/tts.py   | 13 ++++++++++-
 src/pipecat/services/fish/tts.py       | 14 +++++++++++-
 src/pipecat/services/groq/tts.py       |  4 +++-
 src/pipecat/services/inworld/tts.py    | 21 ++++++++++++++++-
 src/pipecat/services/minimax/tts.py    | 31 +++++++++++++++++++++++++-
 src/pipecat/services/resembleai/tts.py |  7 +++++-
 7 files changed, 99 insertions(+), 12 deletions(-)

diff --git a/src/pipecat/services/asyncai/tts.py b/src/pipecat/services/asyncai/tts.py
index 05ba14113..489d7cbff 100644
--- a/src/pipecat/services/asyncai/tts.py
+++ b/src/pipecat/services/asyncai/tts.py
@@ -10,7 +10,7 @@ import asyncio
 import base64
 import json
 from dataclasses import dataclass, field
-from typing import AsyncGenerator, Optional
+from typing import Any, AsyncGenerator, ClassVar, Dict, Mapping, Optional
 
 import aiohttp
 from loguru import logger
@@ -88,6 +88,17 @@ class AsyncAITTSSettings(TTSSettings):
     output_encoding: str = field(default_factory=lambda: NOT_GIVEN)
     output_sample_rate: int = field(default_factory=lambda: NOT_GIVEN)
 
+    @classmethod
+    def from_mapping(cls, settings: Mapping[str, Any]) -> "AsyncAITTSSettings":
+        """Construct settings from a plain dict, destructuring legacy nested ``output_format``."""
+        flat = dict(settings)
+        nested = flat.pop("output_format", None)
+        if isinstance(nested, dict):  # missing legacy keys map to NOT_GIVEN, not None
+            flat.setdefault("output_container", nested.get("container", NOT_GIVEN))
+            flat.setdefault("output_encoding", nested.get("encoding", NOT_GIVEN))
+            flat.setdefault("output_sample_rate", nested.get("sample_rate", NOT_GIVEN))
+        return super().from_mapping(flat)
+
 
 class AsyncAITTSService(AudioContextTTSService):
     """Async TTS service with WebSocket streaming.
@@ -153,11 +164,9 @@ class AsyncAITTSService(AudioContextTTSService):
         self._settings = AsyncAITTSSettings(
             model=model,
             voice=voice_id,
-            output_format={
-                "container": container,
-                "encoding": encoding,
-                "sample_rate": 0,
-            },
+            output_container=container,
+            output_encoding=encoding,
+            output_sample_rate=0,
             language=self.language_to_service_language(params.language)
             if params.language
             else None,
diff --git a/src/pipecat/services/cartesia/tts.py b/src/pipecat/services/cartesia/tts.py
index 2544d3b98..edee9e2ea 100644
--- a/src/pipecat/services/cartesia/tts.py
+++ b/src/pipecat/services/cartesia/tts.py
@@ -11,7 +11,7 @@ import json
 import warnings
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import Any, AsyncGenerator, List, Literal, Optional
+from typing import Any, AsyncGenerator, ClassVar, Dict, List, Literal, Mapping, Optional
 
 from loguru import logger
 from pydantic import BaseModel, Field
@@ -217,6 +217,17 @@ class CartesiaTTSSettings(TTSSettings):
     generation_config: GenerationConfig = field(default_factory=lambda: NOT_GIVEN)
     pronunciation_dict_id: str = field(default_factory=lambda: NOT_GIVEN)
 
+    @classmethod
+    def from_mapping(cls, settings: Mapping[str, Any]) -> "CartesiaTTSSettings":
+        """Construct settings from a plain dict, destructuring legacy nested ``output_format``."""
+        flat = dict(settings)
+        nested = flat.pop("output_format", None)
+        if isinstance(nested, dict):  # missing legacy keys map to NOT_GIVEN, not None
+            flat.setdefault("output_container", nested.get("container", NOT_GIVEN))
+            flat.setdefault("output_encoding", nested.get("encoding", NOT_GIVEN))
+            flat.setdefault("output_sample_rate", nested.get("sample_rate", NOT_GIVEN))
+        return super().from_mapping(flat)
+
 
 class CartesiaTTSService(AudioContextWordTTSService):
     """Cartesia TTS service with WebSocket streaming and word timestamps.
diff --git a/src/pipecat/services/fish/tts.py b/src/pipecat/services/fish/tts.py
index 4da4b6673..7dd06d705 100644
--- a/src/pipecat/services/fish/tts.py
+++ b/src/pipecat/services/fish/tts.py
@@ -12,7 +12,7 @@ for streaming text-to-speech synthesis with customizable voice parameters.
 
 import uuid
 from dataclasses import dataclass, field
-from typing import Any, AsyncGenerator, Literal, Optional
+from typing import Any, AsyncGenerator, ClassVar, Dict, Literal, Mapping, Optional
 
 from loguru import logger
 from pydantic import BaseModel
@@ -69,6 +69,18 @@ class FishAudioTTSSettings(TTSSettings):
     prosody_volume: int = field(default_factory=lambda: NOT_GIVEN)
     reference_id: str = field(default_factory=lambda: NOT_GIVEN)
 
+    _aliases: ClassVar[Dict[str, str]] = {"voice_id": "voice", "sample_rate": "fish_sample_rate"}
+
+    @classmethod
+    def from_mapping(cls, settings: Mapping[str, Any]) -> "FishAudioTTSSettings":
+        """Construct settings from a plain dict, destructuring legacy nested ``prosody``."""
+        flat = dict(settings)
+        nested = flat.pop("prosody", None)
+        if isinstance(nested, dict):  # missing legacy keys map to NOT_GIVEN, not None
+            flat.setdefault("prosody_speed", nested.get("speed", NOT_GIVEN))
+            flat.setdefault("prosody_volume", nested.get("volume", NOT_GIVEN))
+        return super().from_mapping(flat)
+
 
 class FishAudioTTSService(InterruptibleTTSService):
     """Fish Audio text-to-speech service with WebSocket streaming.
diff --git a/src/pipecat/services/groq/tts.py b/src/pipecat/services/groq/tts.py
index d0b5fbd7c..e4c10f2e9 100644
--- a/src/pipecat/services/groq/tts.py
+++ b/src/pipecat/services/groq/tts.py
@@ -9,7 +9,7 @@
 import io
 import wave
 from dataclasses import dataclass, field
-from typing import AsyncGenerator, Optional
+from typing import AsyncGenerator, ClassVar, Dict, Optional
 
 from loguru import logger
 from pydantic import BaseModel
@@ -48,6 +48,8 @@ class GroqTTSSettings(TTSSettings):
     speed: float = field(default_factory=lambda: NOT_GIVEN)
     groq_sample_rate: int = field(default_factory=lambda: NOT_GIVEN)
 
+    _aliases: ClassVar[Dict[str, str]] = {"voice_id": "voice", "sample_rate": "groq_sample_rate"}
+
 
 class GroqTTSService(TTSService):
     """Groq text-to-speech service implementation.
diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
index c291f3156..acc6187cb 100644
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -17,7 +17,7 @@ import asyncio
 import base64
 import json
 from dataclasses import dataclass, field
-from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
+from typing import Any, AsyncGenerator, ClassVar, Dict, List, Mapping, Optional, Tuple
 
 import aiohttp
 import websockets
@@ -74,6 +74,25 @@ class InworldTTSSettings(TTSSettings):
     auto_mode: bool = field(default_factory=lambda: NOT_GIVEN)
     apply_text_normalization: str = field(default_factory=lambda: NOT_GIVEN)
 
+    _aliases: ClassVar[Dict[str, str]] = {
+        "voice_id": "voice",
+        "voiceId": "voice",
+        "modelId": "model",
+        "applyTextNormalization": "apply_text_normalization",
+        "autoMode": "auto_mode",
+    }
+
+    @classmethod
+    def from_mapping(cls, settings: Mapping[str, Any]) -> "InworldTTSSettings":
+        """Construct settings from a plain dict, destructuring legacy nested ``audioConfig``."""
+        flat = dict(settings)
+        nested = flat.pop("audioConfig", None)
+        if isinstance(nested, dict):  # missing legacy keys map to NOT_GIVEN, not None
+            flat.setdefault("audio_encoding", nested.get("audioEncoding", NOT_GIVEN))
+            flat.setdefault("audio_sample_rate", nested.get("sampleRateHertz", NOT_GIVEN))
+            flat.setdefault("speaking_rate", nested.get("speakingRate", NOT_GIVEN))
+        return super().from_mapping(flat)
+
 
 class InworldHttpTTSService(WordTTSService):
     """Inworld AI HTTP-based TTS service.
diff --git a/src/pipecat/services/minimax/tts.py b/src/pipecat/services/minimax/tts.py
index ab04925f3..6a107d950 100644
--- a/src/pipecat/services/minimax/tts.py
+++ b/src/pipecat/services/minimax/tts.py
@@ -12,7 +12,7 @@ for streaming text-to-speech synthesis.
 
 import json
 from dataclasses import dataclass, field
-from typing import AsyncGenerator, Optional
+from typing import Any, AsyncGenerator, ClassVar, Dict, Mapping, Optional
 
 import aiohttp
 from loguru import logger
@@ -120,6 +120,35 @@ class MiniMaxTTSSettings(TTSSettings):
     audio_sample_rate: int = field(default_factory=lambda: NOT_GIVEN)
     language_boost: str = field(default_factory=lambda: NOT_GIVEN)
 
+    _aliases: ClassVar[Dict[str, str]] = {"voice_id": "voice"}
+
+    @classmethod
+    def from_mapping(cls, settings: Mapping[str, Any]) -> "MiniMaxTTSSettings":
+        """Construct settings from a plain dict, destructuring legacy nested dicts.
+
+        Handles ``voice_setting`` (with ``vol`` → ``volume`` rename) and
+        ``audio_setting`` (with prefixed field mapping).
+        """
+        flat = dict(settings)
+
+        voice = flat.pop("voice_setting", None)
+        if isinstance(voice, dict):  # missing legacy keys map to NOT_GIVEN, not None
+            flat.setdefault("speed", voice.get("speed", NOT_GIVEN))
+            flat.setdefault("volume", voice.get("vol", NOT_GIVEN))
+            flat.setdefault("pitch", voice.get("pitch", NOT_GIVEN))
+            flat.setdefault("emotion", voice.get("emotion", NOT_GIVEN))
+            flat.setdefault("text_normalization", voice.get("text_normalization", NOT_GIVEN))
+            flat.setdefault("latex_read", voice.get("latex_read", NOT_GIVEN))
+
+        audio = flat.pop("audio_setting", None)
+        if isinstance(audio, dict):  # missing legacy keys map to NOT_GIVEN, not None
+            flat.setdefault("audio_bitrate", audio.get("bitrate", NOT_GIVEN))
+            flat.setdefault("audio_format", audio.get("format", NOT_GIVEN))
+            flat.setdefault("audio_channel", audio.get("channel", NOT_GIVEN))
+            flat.setdefault("audio_sample_rate", audio.get("sample_rate", NOT_GIVEN))
+
+        return super().from_mapping(flat)
+
 
 class MiniMaxHttpTTSService(TTSService):
     """Text-to-speech service using MiniMax's T2A (Text-to-Audio) API.
diff --git a/src/pipecat/services/resembleai/tts.py b/src/pipecat/services/resembleai/tts.py
index 08df23abe..acba883e4 100644
--- a/src/pipecat/services/resembleai/tts.py
+++ b/src/pipecat/services/resembleai/tts.py
@@ -9,7 +9,7 @@
 import base64
 import json
 from dataclasses import dataclass, field
-from typing import AsyncGenerator, Optional
+from typing import AsyncGenerator, ClassVar, Dict, Optional
 
 from loguru import logger
 
@@ -54,6 +54,11 @@ class ResembleAITTSSettings(TTSSettings):
     output_format: str = field(default_factory=lambda: NOT_GIVEN)
     resemble_sample_rate: int = field(default_factory=lambda: NOT_GIVEN)
 
+    _aliases: ClassVar[Dict[str, str]] = {
+        "voice_id": "voice",
+        "sample_rate": "resemble_sample_rate",
+    }
+
 
 class ResembleAITTSService(AudioContextWordTTSService):
     """Resemble AI TTS service with WebSocket streaming and word timestamps.