Add backward-compat _aliases and from_mapping overrides to TTS settings

The migration from plain-dict `self._settings` to typed dataclasses renamed keys and flattened nested dicts. The deprecated dict-based `TTSUpdateSettingsFrame(settings={...})` code path calls `from_mapping`, which silently dropped old keys into `extra`.

- Add `_aliases` so renamed flat keys (e.g. `sample_rate` → `fish_sample_rate`, camelCase Inworld keys) resolve correctly.
- Override `from_mapping` to destructure nested dicts (`output_format`, `prosody`, `audioConfig`, `voice_setting`, `audio_setting`) into their flat field equivalents.
- Fix AsyncAI constructor bug passing `output_format={...}` dict instead of individual `output_container`/`output_encoding`/`output_sample_rate` fields.
This commit is contained in:
Paul Kompfner
2026-02-17 17:07:14 -05:00
parent 68ebd3d063
commit ce51df677c
7 changed files with 99 additions and 12 deletions

View File

@@ -10,7 +10,7 @@ import asyncio
import base64
import json
from dataclasses import dataclass, field
from typing import AsyncGenerator, Optional
from typing import Any, AsyncGenerator, ClassVar, Dict, Mapping, Optional
import aiohttp
from loguru import logger
@@ -88,6 +88,17 @@ class AsyncAITTSSettings(TTSSettings):
output_encoding: str = field(default_factory=lambda: NOT_GIVEN)
output_sample_rate: int = field(default_factory=lambda: NOT_GIVEN)
@classmethod
def from_mapping(cls, settings: Mapping[str, Any]) -> "AsyncAITTSSettings":
    """Construct settings from a plain dict, destructuring legacy nested ``output_format``.

    A legacy ``output_format`` dict is removed from the mapping and its
    ``container``, ``encoding`` and ``sample_rate`` entries are copied to the
    flat ``output_container``, ``output_encoding`` and ``output_sample_rate``
    keys. A flat key supplied explicitly by the caller always wins over the
    nested value. Nested entries that are absent are left unset instead of
    being forwarded as an explicit ``None``.

    Args:
        settings: Mapping of setting names to values. It is copied, never
            mutated.

    Returns:
        Whatever the parent ``from_mapping`` builds from the flattened dict.
    """
    flat = dict(settings)  # work on a copy so the caller's mapping is untouched
    # A non-dict ``output_format`` value is discarded here, as before.
    nested = flat.pop("output_format", None)
    if isinstance(nested, dict):
        for legacy_key, field_name in (
            ("container", "output_container"),
            ("encoding", "output_encoding"),
            ("sample_rate", "output_sample_rate"),
        ):
            # Only forward keys that were actually provided: a partial legacy
            # dict must not turn the missing fields into explicit ``None``s.
            if legacy_key in nested:
                flat.setdefault(field_name, nested[legacy_key])
    return super().from_mapping(flat)
class AsyncAITTSService(AudioContextTTSService):
"""Async TTS service with WebSocket streaming.
@@ -153,11 +164,9 @@ class AsyncAITTSService(AudioContextTTSService):
self._settings = AsyncAITTSSettings(
model=model,
voice=voice_id,
output_format={
"container": container,
"encoding": encoding,
"sample_rate": 0,
},
output_container=container,
output_encoding=encoding,
output_sample_rate=0,
language=self.language_to_service_language(params.language)
if params.language
else None,

View File

@@ -11,7 +11,7 @@ import json
import warnings
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, AsyncGenerator, List, Literal, Optional
from typing import Any, AsyncGenerator, ClassVar, Dict, List, Literal, Mapping, Optional
from loguru import logger
from pydantic import BaseModel, Field
@@ -217,6 +217,17 @@ class CartesiaTTSSettings(TTSSettings):
generation_config: GenerationConfig = field(default_factory=lambda: NOT_GIVEN)
pronunciation_dict_id: str = field(default_factory=lambda: NOT_GIVEN)
@classmethod
def from_mapping(cls, settings: Mapping[str, Any]) -> "CartesiaTTSSettings":
    """Construct settings from a plain dict, destructuring legacy nested ``output_format``.

    A legacy ``output_format`` dict is removed from the mapping and its
    ``container``, ``encoding`` and ``sample_rate`` entries are copied to the
    flat ``output_container``, ``output_encoding`` and ``output_sample_rate``
    keys. A flat key supplied explicitly by the caller always wins over the
    nested value. Nested entries that are absent are left unset instead of
    being forwarded as an explicit ``None``.

    Args:
        settings: Mapping of setting names to values. It is copied, never
            mutated.

    Returns:
        Whatever the parent ``from_mapping`` builds from the flattened dict.
    """
    flat = dict(settings)  # work on a copy so the caller's mapping is untouched
    # A non-dict ``output_format`` value is discarded here, as before.
    nested = flat.pop("output_format", None)
    if isinstance(nested, dict):
        for legacy_key, field_name in (
            ("container", "output_container"),
            ("encoding", "output_encoding"),
            ("sample_rate", "output_sample_rate"),
        ):
            # Only forward keys that were actually provided: a partial legacy
            # dict must not turn the missing fields into explicit ``None``s.
            if legacy_key in nested:
                flat.setdefault(field_name, nested[legacy_key])
    return super().from_mapping(flat)
class CartesiaTTSService(AudioContextWordTTSService):
"""Cartesia TTS service with WebSocket streaming and word timestamps.

View File

@@ -12,7 +12,7 @@ for streaming text-to-speech synthesis with customizable voice parameters.
import uuid
from dataclasses import dataclass, field
from typing import Any, AsyncGenerator, Literal, Optional
from typing import Any, AsyncGenerator, ClassVar, Dict, Literal, Mapping, Optional
from loguru import logger
from pydantic import BaseModel
@@ -69,6 +69,18 @@ class FishAudioTTSSettings(TTSSettings):
prosody_volume: int = field(default_factory=lambda: NOT_GIVEN)
reference_id: str = field(default_factory=lambda: NOT_GIVEN)
# Legacy flat key names kept for backward compatibility, mapped to the
# current field names (``sample_rate`` was renamed to ``fish_sample_rate``).
# NOTE(review): presumably consulted by the base-class ``from_mapping`` — confirm.
_aliases: ClassVar[Dict[str, str]] = {"voice_id": "voice", "sample_rate": "fish_sample_rate"}
@classmethod
def from_mapping(cls, settings: Mapping[str, Any]) -> "FishAudioTTSSettings":
    """Construct settings from a plain dict, destructuring legacy nested ``prosody``.

    A legacy ``prosody`` dict is removed from the mapping and its ``speed``
    and ``volume`` entries are copied to the flat ``prosody_speed`` and
    ``prosody_volume`` keys. A flat key supplied explicitly by the caller
    always wins over the nested value. Nested entries that are absent are
    left unset instead of being forwarded as an explicit ``None``.

    Args:
        settings: Mapping of setting names to values. It is copied, never
            mutated.

    Returns:
        Whatever the parent ``from_mapping`` builds from the flattened dict.
    """
    flat = dict(settings)  # work on a copy so the caller's mapping is untouched
    # A non-dict ``prosody`` value is discarded here, as before.
    nested = flat.pop("prosody", None)
    if isinstance(nested, dict):
        for legacy_key, field_name in (
            ("speed", "prosody_speed"),
            ("volume", "prosody_volume"),
        ):
            # Only forward keys that were actually provided: a partial legacy
            # dict must not turn the missing field into an explicit ``None``.
            if legacy_key in nested:
                flat.setdefault(field_name, nested[legacy_key])
    return super().from_mapping(flat)
class FishAudioTTSService(InterruptibleTTSService):
"""Fish Audio text-to-speech service with WebSocket streaming.

View File

@@ -9,7 +9,7 @@
import io
import wave
from dataclasses import dataclass, field
from typing import AsyncGenerator, Optional
from typing import AsyncGenerator, ClassVar, Dict, Optional
from loguru import logger
from pydantic import BaseModel
@@ -48,6 +48,8 @@ class GroqTTSSettings(TTSSettings):
speed: float = field(default_factory=lambda: NOT_GIVEN)
groq_sample_rate: int = field(default_factory=lambda: NOT_GIVEN)
# Legacy flat key names kept for backward compatibility, mapped to the
# current field names (``sample_rate`` was renamed to ``groq_sample_rate``).
# NOTE(review): presumably consulted by the base-class ``from_mapping`` — confirm.
_aliases: ClassVar[Dict[str, str]] = {"voice_id": "voice", "sample_rate": "groq_sample_rate"}
class GroqTTSService(TTSService):
"""Groq text-to-speech service implementation.

View File

@@ -17,7 +17,7 @@ import asyncio
import base64
import json
from dataclasses import dataclass, field
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
from typing import Any, AsyncGenerator, ClassVar, Dict, List, Mapping, Optional, Tuple
import aiohttp
import websockets
@@ -74,6 +74,25 @@ class InworldTTSSettings(TTSSettings):
auto_mode: bool = field(default_factory=lambda: NOT_GIVEN)
apply_text_normalization: str = field(default_factory=lambda: NOT_GIVEN)
# Legacy key names kept for backward compatibility, mapped to the current
# snake_case field names. Covers both the old ``voice_id`` spelling and the
# camelCase keys.
# NOTE(review): presumably consulted by the base-class ``from_mapping`` — confirm.
_aliases: ClassVar[Dict[str, str]] = {
"voice_id": "voice",
"voiceId": "voice",
"modelId": "model",
"applyTextNormalization": "apply_text_normalization",
"autoMode": "auto_mode",
}
@classmethod
def from_mapping(cls, settings: Mapping[str, Any]) -> "InworldTTSSettings":
    """Construct settings from a plain dict, destructuring legacy nested ``audioConfig``.

    A legacy ``audioConfig`` dict is removed from the mapping and its
    ``audioEncoding``, ``sampleRateHertz`` and ``speakingRate`` entries are
    copied to the flat ``audio_encoding``, ``audio_sample_rate`` and
    ``speaking_rate`` keys. A flat key supplied explicitly by the caller
    always wins over the nested value. Nested entries that are absent are
    left unset instead of being forwarded as an explicit ``None``.

    Args:
        settings: Mapping of setting names to values. It is copied, never
            mutated.

    Returns:
        Whatever the parent ``from_mapping`` builds from the flattened dict.
    """
    flat = dict(settings)  # work on a copy so the caller's mapping is untouched
    # A non-dict ``audioConfig`` value is discarded here, as before.
    nested = flat.pop("audioConfig", None)
    if isinstance(nested, dict):
        for legacy_key, field_name in (
            ("audioEncoding", "audio_encoding"),
            ("sampleRateHertz", "audio_sample_rate"),
            ("speakingRate", "speaking_rate"),
        ):
            # Only forward keys that were actually provided: a partial legacy
            # dict must not turn the missing fields into explicit ``None``s.
            if legacy_key in nested:
                flat.setdefault(field_name, nested[legacy_key])
    return super().from_mapping(flat)
class InworldHttpTTSService(WordTTSService):
"""Inworld AI HTTP-based TTS service.

View File

@@ -12,7 +12,7 @@ for streaming text-to-speech synthesis.
import json
from dataclasses import dataclass, field
from typing import AsyncGenerator, Optional
from typing import Any, AsyncGenerator, ClassVar, Dict, Mapping, Optional
import aiohttp
from loguru import logger
@@ -120,6 +120,35 @@ class MiniMaxTTSSettings(TTSSettings):
audio_sample_rate: int = field(default_factory=lambda: NOT_GIVEN)
language_boost: str = field(default_factory=lambda: NOT_GIVEN)
# Legacy flat key name kept for backward compatibility, mapped to the
# current field name.
# NOTE(review): presumably consulted by the base-class ``from_mapping`` — confirm.
_aliases: ClassVar[Dict[str, str]] = {"voice_id": "voice"}
@classmethod
def from_mapping(cls, settings: Mapping[str, Any]) -> "MiniMaxTTSSettings":
    """Construct settings from a plain dict, destructuring legacy nested dicts.

    Handles ``voice_setting`` (with ``vol`` → ``volume`` rename) and
    ``audio_setting`` (with prefixed field mapping). Both nested dicts are
    removed from the mapping and their entries copied to the flat field
    names. A flat key supplied explicitly by the caller always wins over the
    nested value. Nested entries that are absent are left unset instead of
    being forwarded as an explicit ``None``.

    Args:
        settings: Mapping of setting names to values. It is copied, never
            mutated.

    Returns:
        Whatever the parent ``from_mapping`` builds from the flattened dict.
    """
    flat = dict(settings)  # work on a copy so the caller's mapping is untouched
    # Each legacy nested key paired with its (nested key, flat field) renames.
    legacy_groups = (
        (
            "voice_setting",
            (
                ("speed", "speed"),
                ("vol", "volume"),
                ("pitch", "pitch"),
                ("emotion", "emotion"),
                ("text_normalization", "text_normalization"),
                ("latex_read", "latex_read"),
            ),
        ),
        (
            "audio_setting",
            (
                ("bitrate", "audio_bitrate"),
                ("format", "audio_format"),
                ("channel", "audio_channel"),
                ("sample_rate", "audio_sample_rate"),
            ),
        ),
    )
    for group_key, renames in legacy_groups:
        # The nested key is always removed; non-dict values are discarded.
        nested = flat.pop(group_key, None)
        if not isinstance(nested, dict):
            continue
        for legacy_key, field_name in renames:
            # Only forward keys that were actually provided: a partial legacy
            # dict must not turn the missing fields into explicit ``None``s.
            if legacy_key in nested:
                flat.setdefault(field_name, nested[legacy_key])
    return super().from_mapping(flat)
class MiniMaxHttpTTSService(TTSService):
"""Text-to-speech service using MiniMax's T2A (Text-to-Audio) API.

View File

@@ -9,7 +9,7 @@
import base64
import json
from dataclasses import dataclass, field
from typing import AsyncGenerator, Optional
from typing import AsyncGenerator, ClassVar, Dict, Optional
from loguru import logger
@@ -54,6 +54,11 @@ class ResembleAITTSSettings(TTSSettings):
output_format: str = field(default_factory=lambda: NOT_GIVEN)
resemble_sample_rate: int = field(default_factory=lambda: NOT_GIVEN)
# Legacy flat key names kept for backward compatibility, mapped to the
# current field names (``sample_rate`` was renamed to ``resemble_sample_rate``).
# NOTE(review): presumably consulted by the base-class ``from_mapping`` — confirm.
_aliases: ClassVar[Dict[str, str]] = {
"voice_id": "voice",
"sample_rate": "resemble_sample_rate",
}
class ResembleAITTSService(AudioContextWordTTSService):
"""Resemble AI TTS service with WebSocket streaming and word timestamps.