Add backward-compat _aliases and from_mapping overrides to TTS settings
The migration from plain-dict `self._settings` to typed dataclasses renamed keys and flattened nested dicts. The deprecated dict-based `TTSUpdateSettingsFrame(settings={...})` code path calls `from_mapping`, which silently dropped old keys into `extra`.
- Add `_aliases` so renamed flat keys (e.g. `sample_rate` → `fish_sample_rate`, camelCase Inworld keys) resolve correctly.
- Override `from_mapping` to destructure nested dicts (`output_format`, `prosody`, `audioConfig`, `voice_setting`, `audio_setting`) into their flat field equivalents.
- Fix AsyncAI constructor bug passing `output_format={...}` dict instead of individual `output_container`/`output_encoding`/`output_sample_rate` fields.
This commit is contained in:
@@ -10,7 +10,7 @@ import asyncio
|
||||
import base64
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from typing import AsyncGenerator, Optional
|
||||
from typing import Any, AsyncGenerator, ClassVar, Dict, Mapping, Optional
|
||||
|
||||
import aiohttp
|
||||
from loguru import logger
|
||||
@@ -88,6 +88,17 @@ class AsyncAITTSSettings(TTSSettings):
|
||||
output_encoding: str = field(default_factory=lambda: NOT_GIVEN)
|
||||
output_sample_rate: int = field(default_factory=lambda: NOT_GIVEN)
|
||||
|
||||
@classmethod
def from_mapping(cls, settings: Mapping[str, Any]) -> "AsyncAITTSSettings":
    """Construct settings from a plain dict, destructuring legacy nested ``output_format``.

    Args:
        settings: Mapping of setting names to values. It may carry a legacy
            ``output_format`` dict with ``container``, ``encoding`` and
            ``sample_rate`` keys.

    Returns:
        The result of the parent ``from_mapping`` applied to the flattened
        mapping.
    """
    # Work on a copy so the caller's mapping is never mutated.
    flat = dict(settings)
    # The nested key is always removed; a non-dict value is simply discarded.
    nested = flat.pop("output_format", None)
    if isinstance(nested, dict):
        for legacy_key, field_name in (
            ("container", "output_container"),
            ("encoding", "output_encoding"),
            ("sample_rate", "output_sample_rate"),
        ):
            # Forward only the keys the caller actually supplied: a key that
            # is missing from the nested dict must stay absent instead of
            # being turned into an explicit ``None`` value.
            if legacy_key in nested:
                # setdefault keeps an explicitly supplied flat key in charge.
                flat.setdefault(field_name, nested[legacy_key])
    return super().from_mapping(flat)
|
||||
|
||||
|
||||
class AsyncAITTSService(AudioContextTTSService):
|
||||
"""Async TTS service with WebSocket streaming.
|
||||
@@ -153,11 +164,9 @@ class AsyncAITTSService(AudioContextTTSService):
|
||||
self._settings = AsyncAITTSSettings(
|
||||
model=model,
|
||||
voice=voice_id,
|
||||
output_format={
|
||||
"container": container,
|
||||
"encoding": encoding,
|
||||
"sample_rate": 0,
|
||||
},
|
||||
output_container=container,
|
||||
output_encoding=encoding,
|
||||
output_sample_rate=0,
|
||||
language=self.language_to_service_language(params.language)
|
||||
if params.language
|
||||
else None,
|
||||
|
||||
@@ -11,7 +11,7 @@ import json
|
||||
import warnings
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from typing import Any, AsyncGenerator, List, Literal, Optional
|
||||
from typing import Any, AsyncGenerator, ClassVar, Dict, List, Literal, Mapping, Optional
|
||||
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel, Field
|
||||
@@ -217,6 +217,17 @@ class CartesiaTTSSettings(TTSSettings):
|
||||
generation_config: GenerationConfig = field(default_factory=lambda: NOT_GIVEN)
|
||||
pronunciation_dict_id: str = field(default_factory=lambda: NOT_GIVEN)
|
||||
|
||||
@classmethod
def from_mapping(cls, settings: Mapping[str, Any]) -> "CartesiaTTSSettings":
    """Construct settings from a plain dict, destructuring legacy nested ``output_format``.

    Args:
        settings: Mapping of setting names to values. It may carry a legacy
            ``output_format`` dict with ``container``, ``encoding`` and
            ``sample_rate`` keys.

    Returns:
        The result of the parent ``from_mapping`` applied to the flattened
        mapping.
    """
    # Work on a copy so the caller's mapping is never mutated.
    flat = dict(settings)
    # The nested key is always removed; a non-dict value is simply discarded.
    nested = flat.pop("output_format", None)
    if isinstance(nested, dict):
        for legacy_key, field_name in (
            ("container", "output_container"),
            ("encoding", "output_encoding"),
            ("sample_rate", "output_sample_rate"),
        ):
            # Forward only the keys the caller actually supplied: a key that
            # is missing from the nested dict must stay absent instead of
            # being turned into an explicit ``None`` value.
            if legacy_key in nested:
                # setdefault keeps an explicitly supplied flat key in charge.
                flat.setdefault(field_name, nested[legacy_key])
    return super().from_mapping(flat)
|
||||
|
||||
|
||||
class CartesiaTTSService(AudioContextWordTTSService):
|
||||
"""Cartesia TTS service with WebSocket streaming and word timestamps.
|
||||
|
||||
@@ -12,7 +12,7 @@ for streaming text-to-speech synthesis with customizable voice parameters.
|
||||
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, AsyncGenerator, Literal, Optional
|
||||
from typing import Any, AsyncGenerator, ClassVar, Dict, Literal, Mapping, Optional
|
||||
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
@@ -69,6 +69,18 @@ class FishAudioTTSSettings(TTSSettings):
|
||||
prosody_volume: int = field(default_factory=lambda: NOT_GIVEN)
|
||||
reference_id: str = field(default_factory=lambda: NOT_GIVEN)
|
||||
|
||||
# Legacy key names from the old dict-based settings, mapped to the current
# field names (``voice_id`` -> ``voice``, ``sample_rate`` -> ``fish_sample_rate``).
# NOTE(review): presumably consulted by the base ``from_mapping`` — confirm.
_aliases: ClassVar[Dict[str, str]] = {"voice_id": "voice", "sample_rate": "fish_sample_rate"}
|
||||
|
||||
@classmethod
def from_mapping(cls, settings: Mapping[str, Any]) -> "FishAudioTTSSettings":
    """Construct settings from a plain dict, destructuring legacy nested ``prosody``.

    Args:
        settings: Mapping of setting names to values. It may carry a legacy
            ``prosody`` dict with ``speed`` and ``volume`` keys.

    Returns:
        The result of the parent ``from_mapping`` applied to the flattened
        mapping.
    """
    # Work on a copy so the caller's mapping is never mutated.
    flat = dict(settings)
    # The nested key is always removed; a non-dict value is simply discarded.
    nested = flat.pop("prosody", None)
    if isinstance(nested, dict):
        for legacy_key, field_name in (
            ("speed", "prosody_speed"),
            ("volume", "prosody_volume"),
        ):
            # Forward only the keys the caller actually supplied: a key that
            # is missing from the nested dict must stay absent instead of
            # being turned into an explicit ``None`` value.
            if legacy_key in nested:
                # setdefault keeps an explicitly supplied flat key in charge.
                flat.setdefault(field_name, nested[legacy_key])
    return super().from_mapping(flat)
|
||||
|
||||
|
||||
class FishAudioTTSService(InterruptibleTTSService):
|
||||
"""Fish Audio text-to-speech service with WebSocket streaming.
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
import io
|
||||
import wave
|
||||
from dataclasses import dataclass, field
|
||||
from typing import AsyncGenerator, Optional
|
||||
from typing import AsyncGenerator, ClassVar, Dict, Optional
|
||||
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
@@ -48,6 +48,8 @@ class GroqTTSSettings(TTSSettings):
|
||||
speed: float = field(default_factory=lambda: NOT_GIVEN)
|
||||
groq_sample_rate: int = field(default_factory=lambda: NOT_GIVEN)
|
||||
|
||||
# Legacy key names from the old dict-based settings, mapped to the current
# field names (``voice_id`` -> ``voice``, ``sample_rate`` -> ``groq_sample_rate``).
# NOTE(review): presumably consulted by the base ``from_mapping`` — confirm.
_aliases: ClassVar[Dict[str, str]] = {"voice_id": "voice", "sample_rate": "groq_sample_rate"}
|
||||
|
||||
|
||||
class GroqTTSService(TTSService):
|
||||
"""Groq text-to-speech service implementation.
|
||||
|
||||
@@ -17,7 +17,7 @@ import asyncio
|
||||
import base64
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
|
||||
from typing import Any, AsyncGenerator, ClassVar, Dict, List, Mapping, Optional, Tuple
|
||||
|
||||
import aiohttp
|
||||
import websockets
|
||||
@@ -74,6 +74,25 @@ class InworldTTSSettings(TTSSettings):
|
||||
auto_mode: bool = field(default_factory=lambda: NOT_GIVEN)
|
||||
apply_text_normalization: str = field(default_factory=lambda: NOT_GIVEN)
|
||||
|
||||
# Legacy key names from the old dict-based settings, mapped to the current
# field names. Covers both the snake_case ``voice_id`` and the camelCase keys
# (``voiceId``, ``modelId``, ``applyTextNormalization``, ``autoMode``).
# NOTE(review): presumably consulted by the base ``from_mapping`` — confirm.
_aliases: ClassVar[Dict[str, str]] = {
|
||||
"voice_id": "voice",
|
||||
"voiceId": "voice",
|
||||
"modelId": "model",
|
||||
"applyTextNormalization": "apply_text_normalization",
|
||||
"autoMode": "auto_mode",
|
||||
}
|
||||
|
||||
@classmethod
def from_mapping(cls, settings: Mapping[str, Any]) -> "InworldTTSSettings":
    """Construct settings from a plain dict, destructuring legacy nested ``audioConfig``.

    Args:
        settings: Mapping of setting names to values. It may carry a legacy
            ``audioConfig`` dict with ``audioEncoding``, ``sampleRateHertz``
            and ``speakingRate`` keys.

    Returns:
        The result of the parent ``from_mapping`` applied to the flattened
        mapping.
    """
    # Work on a copy so the caller's mapping is never mutated.
    flat = dict(settings)
    # The nested key is always removed; a non-dict value is simply discarded.
    nested = flat.pop("audioConfig", None)
    if isinstance(nested, dict):
        for legacy_key, field_name in (
            ("audioEncoding", "audio_encoding"),
            ("sampleRateHertz", "audio_sample_rate"),
            ("speakingRate", "speaking_rate"),
        ):
            # Forward only the keys the caller actually supplied: a key that
            # is missing from the nested dict must stay absent instead of
            # being turned into an explicit ``None`` value.
            if legacy_key in nested:
                # setdefault keeps an explicitly supplied flat key in charge.
                flat.setdefault(field_name, nested[legacy_key])
    return super().from_mapping(flat)
|
||||
|
||||
|
||||
class InworldHttpTTSService(WordTTSService):
|
||||
"""Inworld AI HTTP-based TTS service.
|
||||
|
||||
@@ -12,7 +12,7 @@ for streaming text-to-speech synthesis.
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from typing import AsyncGenerator, Optional
|
||||
from typing import Any, AsyncGenerator, ClassVar, Dict, Mapping, Optional
|
||||
|
||||
import aiohttp
|
||||
from loguru import logger
|
||||
@@ -120,6 +120,35 @@ class MiniMaxTTSSettings(TTSSettings):
|
||||
audio_sample_rate: int = field(default_factory=lambda: NOT_GIVEN)
|
||||
language_boost: str = field(default_factory=lambda: NOT_GIVEN)
|
||||
|
||||
# Legacy key name from the old dict-based settings, mapped to the current
# field name (``voice_id`` -> ``voice``).
# NOTE(review): presumably consulted by the base ``from_mapping`` — confirm.
_aliases: ClassVar[Dict[str, str]] = {"voice_id": "voice"}
|
||||
|
||||
@classmethod
def from_mapping(cls, settings: Mapping[str, Any]) -> "MiniMaxTTSSettings":
    """Construct settings from a plain dict, destructuring legacy nested dicts.

    Handles ``voice_setting`` (with ``vol`` → ``volume`` rename) and
    ``audio_setting`` (with prefixed field mapping).

    Args:
        settings: Mapping of setting names to values. It may carry the legacy
            ``voice_setting`` and/or ``audio_setting`` dicts.

    Returns:
        The result of the parent ``from_mapping`` applied to the flattened
        mapping.
    """
    # Work on a copy so the caller's mapping is never mutated.
    flat = dict(settings)

    # Each nested key is always removed; a non-dict value is simply discarded.
    voice = flat.pop("voice_setting", None)
    if isinstance(voice, dict):
        for legacy_key, field_name in (
            ("speed", "speed"),
            ("vol", "volume"),
            ("pitch", "pitch"),
            ("emotion", "emotion"),
            ("text_normalization", "text_normalization"),
            ("latex_read", "latex_read"),
        ):
            # Forward only the keys the caller actually supplied: a key that
            # is missing from the nested dict must stay absent instead of
            # being turned into an explicit ``None`` value.
            if legacy_key in voice:
                # setdefault keeps an explicitly supplied flat key in charge.
                flat.setdefault(field_name, voice[legacy_key])

    audio = flat.pop("audio_setting", None)
    if isinstance(audio, dict):
        for legacy_key, field_name in (
            ("bitrate", "audio_bitrate"),
            ("format", "audio_format"),
            ("channel", "audio_channel"),
            ("sample_rate", "audio_sample_rate"),
        ):
            # Same rule as above: absent nested keys are not forwarded.
            if legacy_key in audio:
                flat.setdefault(field_name, audio[legacy_key])

    return super().from_mapping(flat)
|
||||
|
||||
|
||||
class MiniMaxHttpTTSService(TTSService):
|
||||
"""Text-to-speech service using MiniMax's T2A (Text-to-Audio) API.
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
import base64
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from typing import AsyncGenerator, Optional
|
||||
from typing import AsyncGenerator, ClassVar, Dict, Optional
|
||||
|
||||
from loguru import logger
|
||||
|
||||
@@ -54,6 +54,11 @@ class ResembleAITTSSettings(TTSSettings):
|
||||
output_format: str = field(default_factory=lambda: NOT_GIVEN)
|
||||
resemble_sample_rate: int = field(default_factory=lambda: NOT_GIVEN)
|
||||
|
||||
# Legacy key names from the old dict-based settings, mapped to the current
# field names (``voice_id`` -> ``voice``, ``sample_rate`` -> ``resemble_sample_rate``).
# NOTE(review): presumably consulted by the base ``from_mapping`` — confirm.
_aliases: ClassVar[Dict[str, str]] = {
|
||||
"voice_id": "voice",
|
||||
"sample_rate": "resemble_sample_rate",
|
||||
}
|
||||
|
||||
|
||||
class ResembleAITTSService(AudioContextWordTTSService):
|
||||
"""Resemble AI TTS service with WebSocket streaming and word timestamps.
|
||||
|
||||
Reference in New Issue
Block a user