TTS/STT service and example updates

2026-03-06 20:39:00 -05:00
parent 4ed3480e4b
commit 671e9a6846
18 changed files with 142 additions and 131 deletions
--- a/examples/foundational/07i-interruptible-xtts.py
+++ b/examples/foundational/07i-interruptible-xtts.py
@@ -25,7 +25,7 @@ from pipecat.runner.types import RunnerArguments
 from pipecat.runner.utils import create_transport
 from pipecat.services.deepgram.stt import DeepgramSTTService
 from pipecat.services.openai.llm import OpenAILLMService, OpenAILLMSettings
-from pipecat.services.xtts.tts import XTTSService, XTTSSettings
+from pipecat.services.xtts.tts import XTTSService, XTTSTTSSettings
 from pipecat.transports.base_transport import BaseTransport, TransportParams
 from pipecat.transports.daily.transport import DailyParams
 from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
@@ -59,7 +59,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):

        tts = XTTSService(
            aiohttp_session=session,
-            settings=XTTSSettings(
+            settings=XTTSTTSSettings(
                voice="Claribel Dervla",
            ),
            base_url="http://localhost:8000",
--- a/examples/foundational/07n-interruptible-gemini-image.py
+++ b/examples/foundational/07n-interruptible-gemini-image.py
@@ -72,7 +72,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    stt = GoogleSTTService(
        credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
        settings=GoogleSTTSettings(
-            languages=Language.EN_US,
+            languages=[Language.EN_US],
        ),
    )

--- a/examples/foundational/07n-interruptible-gemini.py
+++ b/examples/foundational/07n-interruptible-gemini.py
@@ -55,7 +55,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):

    stt = GoogleSTTService(
        settings=GoogleSTTSettings(
-            languages=Language.EN_US,
+            languages=[Language.EN_US],
        ),
        credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
    )
--- a/examples/foundational/07n-interruptible-google-http.py
+++ b/examples/foundational/07n-interruptible-google-http.py
@@ -55,8 +55,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):

    stt = GoogleSTTService(
        settings=GoogleSTTSettings(
-            languages=Language.EN_US,
-            model="chirp_3",
+            languages=[Language.EN_US],
+            # Uncomment the line below to use a specific model
+            # model="chirp_3",
        ),
        credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
        location="us",
--- a/examples/foundational/07t-interruptible-fish.py
+++ b/examples/foundational/07t-interruptible-fish.py
@@ -58,7 +58,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    tts = FishAudioTTSService(
        api_key=os.getenv("FISH_API_KEY"),
        settings=FishAudioTTSSettings(
-            model="4ce7e917cedd4bc2bb2e6ff3a46acaa1",  # Barack Obama
+            voice="4ce7e917cedd4bc2bb2e6ff3a46acaa1",  # Barack Obama
        ),
    )

--- a/examples/foundational/07z-interruptible-sarvam-http.py
+++ b/examples/foundational/07z-interruptible-sarvam-http.py
@@ -24,7 +24,7 @@ from pipecat.processors.aggregators.llm_response_universal import (
 from pipecat.runner.types import RunnerArguments
 from pipecat.runner.utils import create_transport
 from pipecat.services.openai.llm import OpenAILLMService, OpenAILLMSettings
-from pipecat.services.sarvam.stt import SarvamSTTService
+from pipecat.services.sarvam.stt import SarvamSTTService, SarvamSTTSettings
 from pipecat.services.sarvam.tts import SarvamHttpTTSService, SarvamHttpTTSSettings
 from pipecat.transcriptions.language import Language
 from pipecat.transports.base_transport import BaseTransport, TransportParams
@@ -59,14 +59,16 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    async with aiohttp.ClientSession() as session:
        stt = SarvamSTTService(
            api_key=os.getenv("SARVAM_API_KEY"),
-            model="saarika:v2.5",
+            settings=SarvamSTTSettings(
+                model="saarika:v2.5",
+            ),
        )

        tts = SarvamHttpTTSService(
            api_key=os.getenv("SARVAM_API_KEY"),
            aiohttp_session=session,
            settings=SarvamHttpTTSSettings(
-                language=Language.EN,
+                language=Language.EN_IN,
            ),
        )

--- a/examples/foundational/15a-switch-languages.py
+++ b/examples/foundational/15a-switch-languages.py
@@ -7,7 +7,6 @@

 import os

-from deepgram import LiveOptions
 from dotenv import load_dotenv
 from loguru import logger

@@ -28,7 +27,7 @@ from pipecat.processors.filters.function_filter import FunctionFilter
 from pipecat.runner.types import RunnerArguments
 from pipecat.runner.utils import create_transport
 from pipecat.services.cartesia.tts import CartesiaTTSService, CartesiaTTSSettings
-from pipecat.services.deepgram.stt import DeepgramSTTService
+from pipecat.services.deepgram.stt import DeepgramSTTService, DeepgramSTTSettings
 from pipecat.services.llm_service import FunctionCallParams
 from pipecat.services.openai.llm import OpenAILLMService, OpenAILLMSettings
 from pipecat.transports.base_transport import BaseTransport, TransportParams
@@ -102,7 +101,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    logger.info(f"Starting bot")

    stt = DeepgramSTTService(
-        api_key=os.getenv("DEEPGRAM_API_KEY"), live_options=LiveOptions(language="multi")
+        api_key=os.getenv("DEEPGRAM_API_KEY"),
+        settings=DeepgramSTTSettings(
+            language="multi",
+        ),
    )

    tts = SwitchLanguage()
--- a/scripts/evals/run-release-evals.py
+++ b/scripts/evals/run-release-evals.py
@@ -146,6 +146,7 @@ TESTS_07 = [
    ("07zg-interruptible-camb.py", EVAL_SIMPLE_MATH),
    ("07zi-interruptible-piper.py", EVAL_SIMPLE_MATH),
    ("07zj-interruptible-kokoro.py", EVAL_SIMPLE_MATH),
+    ("07zk-interruptible-resembleai.py", EVAL_SIMPLE_MATH),
    # Needs a local XTTS docker instance running.
    # ("07i-interruptible-xtts.py", EVAL_SIMPLE_MATH),
 ]
--- a/src/pipecat/services/cartesia/tts.py
+++ b/src/pipecat/services/cartesia/tts.py
@@ -23,7 +23,6 @@ from pipecat.frames.frames import (
    Frame,
    StartFrame,
    TTSAudioRawFrame,
-    TTSStartedFrame,
    TTSStoppedFrame,
 )
 from pipecat.services.settings import NOT_GIVEN, TTSSettings, _NotGiven, _warn_deprecated_param
@@ -705,7 +704,7 @@ class CartesiaHttpTTSService(TTSService):
        voice_id: Optional[str] = None,
        model: Optional[str] = None,
        base_url: str = "https://api.cartesia.ai",
-        cartesia_version: str = "2024-11-13",
+        cartesia_version: str = "2026-03-01",
        aiohttp_session: Optional[aiohttp.ClientSession] = None,
        sample_rate: Optional[int] = None,
        encoding: str = "pcm_s16le",
--- a/src/pipecat/services/elevenlabs/stt.py
+++ b/src/pipecat/services/elevenlabs/stt.py
@@ -200,18 +200,12 @@ class ElevenLabsRealtimeSTTSettings(STTSettings):
        vad_threshold: VAD sensitivity (0.1-0.9, lower is more sensitive).
        min_speech_duration_ms: Minimum speech duration for VAD (50-2000ms).
        min_silence_duration_ms: Minimum silence duration for VAD (50-2000ms).
-        include_timestamps: Whether to include word-level timestamps in transcripts.
-        enable_logging: Whether to enable logging on ElevenLabs' side.
-        include_language_detection: Whether to include language detection in transcripts.
    """

    vad_silence_threshold_secs: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
    vad_threshold: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
    min_speech_duration_ms: int | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
    min_silence_duration_ms: int | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
-    include_timestamps: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
-    enable_logging: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
-    include_language_detection: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)


 class ElevenLabsSTTService(SegmentedSTTService):
@@ -496,6 +490,9 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
        commit_strategy: CommitStrategy = CommitStrategy.MANUAL,
        model: Optional[str] = None,
        sample_rate: Optional[int] = None,
+        include_timestamps: bool = False,
+        enable_logging: bool = False,
+        include_language_detection: bool = False,
        params: Optional[InputParams] = None,
        settings: Optional[ElevenLabsRealtimeSTTSettings] = None,
        ttfs_p99_latency: Optional[float] = ELEVENLABS_REALTIME_TTFS_P99,
@@ -515,6 +512,9 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
                    Use ``settings=ElevenLabsRealtimeSTTSettings(model=...)`` instead.

            sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate.
+            include_timestamps: Whether to include word-level timestamps in transcripts.
+            enable_logging: Whether to enable logging on ElevenLabs' side.
+            include_language_detection: Whether to include language detection in transcripts.
            params: Configuration parameters for the STT service.

                .. deprecated:: 0.0.105
@@ -534,9 +534,6 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
            vad_threshold=None,
            min_speech_duration_ms=None,
            min_silence_duration_ms=None,
-            include_timestamps=False,
-            enable_logging=False,
-            include_language_detection=False,
        )

        # 2. Apply direct init arg overrides (deprecated)
@@ -555,9 +552,9 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
                default_settings.vad_threshold = params.vad_threshold
                default_settings.min_speech_duration_ms = params.min_speech_duration_ms
                default_settings.min_silence_duration_ms = params.min_silence_duration_ms
-                default_settings.include_timestamps = params.include_timestamps
-                default_settings.enable_logging = params.enable_logging
-                default_settings.include_language_detection = params.include_language_detection
+                include_timestamps = params.include_timestamps
+                enable_logging = params.enable_logging
+                include_language_detection = params.include_language_detection

        # 4. Apply settings delta (canonical API, always wins)
        if settings is not None:
@@ -579,6 +576,9 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):

        # Init-only config (not runtime-updatable).
        self._commit_strategy = commit_strategy
+        self._include_timestamps = include_timestamps
+        self._enable_logging = enable_logging
+        self._include_language_detection = include_language_detection

        self._connected_event = asyncio.Event()
        self._connected_event.set()
@@ -762,17 +762,15 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
            params.append(f"commit_strategy={self._commit_strategy.value}")

            # Add optional parameters
-            if self._settings.include_timestamps:
-                params.append(
-                    f"include_timestamps={str(self._settings.include_timestamps).lower()}"
-                )
+            if self._include_timestamps:
+                params.append(f"include_timestamps={str(self._include_timestamps).lower()}")

-            if self._settings.enable_logging:
-                params.append(f"enable_logging={str(self._settings.enable_logging).lower()}")
+            if self._enable_logging:
+                params.append(f"enable_logging={str(self._enable_logging).lower()}")

-            if self._settings.include_language_detection:
+            if self._include_language_detection:
                params.append(
-                    f"include_language_detection={str(self._settings.include_language_detection).lower()}"
+                    f"include_language_detection={str(self._include_language_detection).lower()}"
                )

            # Add VAD parameters if using VAD commit strategy and values are specified
@@ -920,7 +918,7 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
        """
        # If timestamps are enabled, skip this message and wait for the
        # committed_transcript_with_timestamps message which contains all the data
-        if self._settings.include_timestamps:
+        if self._include_timestamps:
            return

        text = data.get("text", "").strip()
--- a/src/pipecat/services/elevenlabs/tts.py
+++ b/src/pipecat/services/elevenlabs/tts.py
@@ -358,6 +358,9 @@ class ElevenLabsTTSService(WebsocketTTSService):
        model: Optional[str] = None,
        url: str = "wss://api.elevenlabs.io",
        sample_rate: Optional[int] = None,
+        auto_mode: bool = True,
+        enable_ssml_parsing: Optional[bool] = None,
+        enable_logging: Optional[bool] = None,
        pronunciation_dictionary_locators: Optional[List[PronunciationDictionaryLocator]] = None,
        params: Optional[InputParams] = None,
        settings: Optional[ElevenLabsTTSSettings] = None,
@@ -381,6 +384,9 @@ class ElevenLabsTTSService(WebsocketTTSService):

            url: WebSocket URL for ElevenLabs TTS API.
            sample_rate: Audio sample rate. If None, uses default.
+            auto_mode: Whether to enable automatic mode optimization.
+            enable_ssml_parsing: Whether to parse SSML tags in text.
+            enable_logging: Whether to enable ElevenLabs server-side logging.
            pronunciation_dictionary_locators: List of pronunciation dictionary
                locators to use.
            params: Additional input parameters for voice customization.
@@ -428,11 +434,6 @@ class ElevenLabsTTSService(WebsocketTTSService):
            apply_text_normalization=None,
        )

-        # Track init-only URL params through the override chain
-        _auto_mode = True
-        _enable_ssml_parsing = None
-        _enable_logging = None
-
        # 2. Apply direct init arg overrides (deprecated)
        if voice_id is not None:
            _warn_deprecated_param("voice_id", ElevenLabsTTSSettings, "voice")
@@ -459,11 +460,11 @@ class ElevenLabsTTSService(WebsocketTTSService):
                if params.speed is not None:
                    default_settings.speed = params.speed
                if params.auto_mode is not None:
-                    _auto_mode = str(params.auto_mode).lower()
+                    auto_mode = params.auto_mode
                if params.enable_ssml_parsing is not None:
-                    _enable_ssml_parsing = params.enable_ssml_parsing
+                    enable_ssml_parsing = params.enable_ssml_parsing
                if params.enable_logging is not None:
-                    _enable_logging = params.enable_logging
+                    enable_logging = params.enable_logging
                if params.apply_text_normalization is not None:
                    default_settings.apply_text_normalization = params.apply_text_normalization
                if _pronunciation_dictionary_locators is None:
@@ -488,9 +489,9 @@ class ElevenLabsTTSService(WebsocketTTSService):
        self._url = url

        # Init-only WebSocket URL params (not runtime-updatable).
-        self._auto_mode = _auto_mode
-        self._enable_ssml_parsing = _enable_ssml_parsing
-        self._enable_logging = _enable_logging
+        self._auto_mode = auto_mode
+        self._enable_ssml_parsing = enable_ssml_parsing
+        self._enable_logging = enable_logging

        self._output_format = ""  # initialized in start()
        self._voice_settings = self._set_voice_settings()
@@ -664,7 +665,7 @@ class ElevenLabsTTSService(WebsocketTTSService):
            voice_id = self._settings.voice
            model = self._settings.model
            output_format = self._output_format
-            url = f"{self._url}/v1/text-to-speech/{voice_id}/multi-stream-input?model_id={model}&output_format={output_format}&auto_mode={self._auto_mode}"
+            url = f"{self._url}/v1/text-to-speech/{voice_id}/multi-stream-input?model_id={model}&output_format={output_format}&auto_mode={str(self._auto_mode).lower()}"

            if self._enable_ssml_parsing:
                url += f"&enable_ssml_parsing={self._enable_ssml_parsing}"
--- a/src/pipecat/services/fish/tts.py
+++ b/src/pipecat/services/fish/tts.py
@@ -10,9 +10,8 @@ This module provides integration with Fish Audio's real-time TTS WebSocket API
 for streaming text-to-speech synthesis with customizable voice parameters.
 """

-import uuid
 from dataclasses import dataclass, field
-from typing import Any, AsyncGenerator, ClassVar, Dict, Literal, Mapping, Optional, Self
+from typing import Any, AsyncGenerator, Literal, Mapping, Optional, Self

 from loguru import logger
 from pydantic import BaseModel
@@ -25,7 +24,6 @@ from pipecat.frames.frames import (
    InterruptionFrame,
    StartFrame,
    TTSAudioRawFrame,
-    TTSStartedFrame,
    TTSStoppedFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection
@@ -52,18 +50,20 @@ class FishAudioTTSSettings(TTSSettings):
    """Settings for FishAudioTTSService.

    Parameters:
-        latency: Latency mode ("normal" or "balanced"). Defaults to "normal".
+        latency: Latency mode ("normal" or "balanced"). Defaults to "balanced".
        normalize: Whether to normalize audio output. Defaults to True.
+        temperature: Controls randomness in speech generation (0.0-1.0).
+        top_p: Controls diversity via nucleus sampling (0.0-1.0).
        prosody_speed: Speech speed multiplier (0.5-2.0). Defaults to 1.0.
-        prosody_volume: Volume adjustment in dB. Defaults to 0.
-        reference_id: Reference ID of the voice model.
+        prosody_volume: Volume adjustment in dB (-20 to 20). Defaults to 0.
    """

    latency: str | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
    normalize: bool | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
+    temperature: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
+    top_p: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
    prosody_speed: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
    prosody_volume: int | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
-    reference_id: str | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)

    @classmethod
    def from_mapping(cls, settings: Mapping[str, Any]) -> Self:
@@ -174,18 +174,18 @@ class FishAudioTTSService(InterruptibleTTSService):
            model="s1",
            voice=None,
            language=None,
-            latency="normal",
+            latency="balanced",
            normalize=True,
+            temperature=None,
+            top_p=None,
            prosody_speed=1.0,
            prosody_volume=0,
-            reference_id=None,
        )

        # 2. Apply direct init arg overrides (deprecated)
        if reference_id is not None:
            _warn_deprecated_param("reference_id", FishAudioTTSSettings, "voice")
            default_settings.voice = reference_id
-            default_settings.reference_id = reference_id
        if model_id is not None:
            _warn_deprecated_param("model_id", FishAudioTTSSettings, "model")
            default_settings.model = model_id
@@ -317,8 +317,12 @@ class FishAudioTTSService(InterruptibleTTSService):
                    "speed": self._settings.prosody_speed,
                    "volume": self._settings.prosody_volume,
                },
-                "reference_id": self._settings.reference_id,
+                "reference_id": self._settings.voice,
            }
+            if self._settings.temperature is not None:
+                request_settings["temperature"] = self._settings.temperature
+            if self._settings.top_p is not None:
+                request_settings["top_p"] = self._settings.top_p
            start_message = {"event": "start", "request": {"text": "", **request_settings}}
            await self._websocket.send(ormsgpack.packb(start_message))
            logger.debug("Sent start message to Fish Audio")
@@ -375,7 +379,14 @@ class FishAudioTTSService(InterruptibleTTSService):
                                frame = TTSAudioRawFrame(audio_data, self.sample_rate, 1)
                                await self.push_frame(frame)
                                await self.stop_ttfb_metrics()
-                                continue
+                        elif event == "finish":
+                            reason = msg.get("reason", "unknown")
+                            if reason == "error":
+                                await self.push_error(
+                                    error_msg="Fish Audio server error during synthesis"
+                                )
+                            else:
+                                logger.debug(f"Fish Audio session finished: {reason}")

            except Exception as e:
                await self.push_error(error_msg=f"Unknown error occurred: {e}", exception=e)
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -73,27 +73,14 @@ class InworldTTSSettings(TTSSettings):
    Parameters:
        speaking_rate: Speaking rate for speech synthesis.
        temperature: Temperature for speech synthesis.
-        auto_mode: Whether to use auto mode. Recommended when texts are sent
-            in full sentences/phrases. When enabled, the server controls
-            flushing of buffered text to achieve minimal latency while
-            maintaining high quality audio output. If None (default),
-            automatically set based on aggregate_sentences.
-        apply_text_normalization: Whether to apply text normalization.
-        timestamp_transport_strategy: Strategy for timestamp transport ("ASYNC" or "SYNC").
    """

    speaking_rate: float | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
    temperature: float | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
-    auto_mode: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
-    apply_text_normalization: str | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
-    timestamp_transport_strategy: str | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)

    _aliases: ClassVar[Dict[str, str]] = {
        "voiceId": "voice",
        "modelId": "model",
-        "applyTextNormalization": "apply_text_normalization",
-        "autoMode": "auto_mode",
-        "timestampTransportStrategy": "timestamp_transport_strategy",
    }

    @classmethod
@@ -141,6 +128,7 @@ class InworldHttpTTSService(TTSService):
        streaming: bool = True,
        sample_rate: Optional[int] = None,
        encoding: str = "LINEAR16",
+        timestamp_transport_strategy: Optional[Literal["ASYNC", "SYNC"]] = "ASYNC",
        params: Optional[InputParams] = None,
        settings: Optional[InworldTTSSettings] = None,
        **kwargs,
@@ -163,6 +151,8 @@ class InworldHttpTTSService(TTSService):
            streaming: Whether to use streaming mode.
            sample_rate: Audio sample rate in Hz.
            encoding: Audio encoding format.
+            timestamp_transport_strategy: Strategy for timestamp transport
+                ("ASYNC" or "SYNC"). Defaults to "ASYNC".
            params: Input parameters for Inworld TTS configuration.

                .. deprecated:: 0.0.105
@@ -179,9 +169,6 @@ class InworldHttpTTSService(TTSService):
            language=None,
            speaking_rate=None,
            temperature=None,
-            timestamp_transport_strategy="ASYNC",
-            auto_mode=None,  # Not applicable for HTTP TTS
-            apply_text_normalization=None,  # Not applicable for HTTP TTS
        )

        # 2. Apply direct init arg overrides (deprecated)
@@ -201,9 +188,7 @@ class InworldHttpTTSService(TTSService):
                if params.temperature is not None:
                    default_settings.temperature = params.temperature
                if params.timestamp_transport_strategy is not None:
-                    default_settings.timestamp_transport_strategy = (
-                        params.timestamp_transport_strategy
-                    )
+                    timestamp_transport_strategy = params.timestamp_transport_strategy

        # 4. Apply settings delta (canonical API, always wins)
        if settings is not None:
@@ -230,9 +215,10 @@ class InworldHttpTTSService(TTSService):

        self._cumulative_time = 0.0

-        # Init-only audio format config (not runtime-updatable).
+        # Init-only config (not runtime-updatable).
        self._audio_encoding = encoding
        self._audio_sample_rate = 0  # Set in start()
+        self._timestamp_transport_strategy = timestamp_transport_strategy

    def can_generate_metrics(self) -> bool:
        """Check if this service can generate processing metrics.
@@ -251,22 +237,6 @@ class InworldHttpTTSService(TTSService):
        await super().start(frame)
        self._audio_sample_rate = self.sample_rate

-    async def stop(self, frame: EndFrame):
-        """Stop the Inworld TTS service.
-
-        Args:
-            frame: The end frame.
-        """
-        await super().stop(frame)
-
-    async def cancel(self, frame: CancelFrame):
-        """Cancel the Inworld TTS service.
-
-        Args:
-            frame: The cancel frame.
-        """
-        await super().cancel(frame)
-
    async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
        """Push a frame and handle state changes.

@@ -347,8 +317,8 @@ class InworldHttpTTSService(TTSService):

        # Use WORD timestamps for simplicity and correct spacing/capitalization
        payload["timestampType"] = self._timestamp_type
-        if self._settings.timestamp_transport_strategy is not None:
-            payload["timestampTransportStrategy"] = self._settings.timestamp_transport_strategy
+        if self._timestamp_transport_strategy is not None:
+            payload["timestampTransportStrategy"] = self._timestamp_transport_strategy

        request_id = str(uuid.uuid4())
        headers = {
@@ -556,6 +526,9 @@ class InworldTTSService(WebsocketTTSService):
        url: str = "wss://api.inworld.ai/tts/v1/voice:streamBidirectional",
        sample_rate: Optional[int] = None,
        encoding: str = "LINEAR16",
+        auto_mode: Optional[bool] = None,
+        apply_text_normalization: Optional[str] = None,
+        timestamp_transport_strategy: Optional[Literal["ASYNC", "SYNC"]] = "ASYNC",
        params: Optional[InputParams] = None,
        settings: Optional[InworldTTSSettings] = None,
        aggregate_sentences: Optional[bool] = None,
@@ -580,6 +553,12 @@ class InworldTTSService(WebsocketTTSService):
            url: URL of the Inworld WebSocket API.
            sample_rate: Audio sample rate in Hz.
            encoding: Audio encoding format.
+            auto_mode: Whether to use auto mode. When enabled, the server
+                controls flushing of buffered text. If None (default),
+                automatically set based on ``aggregate_sentences``.
+            apply_text_normalization: Text normalization mode string; not sent when None.
+            timestamp_transport_strategy: Strategy for timestamp transport
+                ("ASYNC" or "SYNC"). Defaults to "ASYNC".
            params: Input parameters for Inworld WebSocket TTS configuration.

                .. deprecated:: 0.0.105
@@ -596,6 +575,10 @@ class InworldTTSService(WebsocketTTSService):
            append_trailing_space: Whether to append a trailing space to text before sending to TTS.
            **kwargs: Additional arguments passed to the parent class.
        """
+        # Derive auto_mode from aggregate_sentences if not explicitly set
+        if auto_mode is None:
+            auto_mode = True if aggregate_sentences is None else aggregate_sentences
+
        # 1. Initialize default_settings with hardcoded defaults
        default_settings = InworldTTSSettings(
            model="inworld-tts-1.5-max",
@@ -603,9 +586,6 @@ class InworldTTSService(WebsocketTTSService):
            language=None,
            speaking_rate=None,
            temperature=None,
-            apply_text_normalization=None,
-            timestamp_transport_strategy="ASYNC",
-            auto_mode=True if aggregate_sentences is None else aggregate_sentences,
        )

        # 2. Apply direct init arg overrides (deprecated)
@@ -627,13 +607,11 @@ class InworldTTSService(WebsocketTTSService):
                if params.temperature is not None:
                    default_settings.temperature = params.temperature
                if params.apply_text_normalization is not None:
-                    default_settings.apply_text_normalization = params.apply_text_normalization
+                    apply_text_normalization = params.apply_text_normalization
                if params.timestamp_transport_strategy is not None:
-                    default_settings.timestamp_transport_strategy = (
-                        params.timestamp_transport_strategy
-                    )
+                    timestamp_transport_strategy = params.timestamp_transport_strategy
                if params.auto_mode is not None:
-                    default_settings.auto_mode = params.auto_mode
+                    auto_mode = params.auto_mode
            _buffer_max_delay_ms = params.max_buffer_delay_ms
            _buffer_char_threshold = params.buffer_char_threshold

@@ -673,9 +651,12 @@ class InworldTTSService(WebsocketTTSService):
        # Track the end time of the last word in the current generation
        self._generation_end_time = 0.0

-        # Init-only audio format config (not runtime-updatable).
+        # Init-only config (not runtime-updatable).
        self._audio_encoding = encoding
        self._audio_sample_rate = 0  # Set in start()
+        self._auto_mode = auto_mode
+        self._apply_text_normalization = apply_text_normalization
+        self._timestamp_transport_strategy = timestamp_transport_strategy

    def can_generate_metrics(self) -> bool:
        """Check if this service can generate processing metrics.
@@ -1036,14 +1017,12 @@ class InworldTTSService(WebsocketTTSService):

        if self._settings.temperature is not None:
            create_config["temperature"] = self._settings.temperature
-        if self._settings.apply_text_normalization is not None:
-            create_config["applyTextNormalization"] = self._settings.apply_text_normalization
-        if self._settings.auto_mode is not None:
-            create_config["autoMode"] = self._settings.auto_mode
-        if self._settings.timestamp_transport_strategy is not None:
-            create_config["timestampTransportStrategy"] = (
-                self._settings.timestamp_transport_strategy
-            )
+        if self._apply_text_normalization is not None:
+            create_config["applyTextNormalization"] = self._apply_text_normalization
+        if self._auto_mode is not None:
+            create_config["autoMode"] = self._auto_mode
+        if self._timestamp_transport_strategy is not None:
+            create_config["timestampTransportStrategy"] = self._timestamp_transport_strategy

        # Set buffer settings for timely audio generation.
        # Use provided values or defaults that work well for streaming LLM output.
--- a/src/pipecat/services/lmnt/tts.py
+++ b/src/pipecat/services/lmnt/tts.py
@@ -19,7 +19,6 @@ from pipecat.frames.frames import (
    Frame,
    StartFrame,
    TTSAudioRawFrame,
-    TTSStartedFrame,
    TTSStoppedFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection
@@ -48,6 +47,7 @@ def language_to_lmnt_language(language: Language) -> Optional[str]:
        The corresponding LMNT language code, or None if not supported.
    """
    LANGUAGE_MAP = {
+        Language.AR: "ar",
        Language.DE: "de",
        Language.EN: "en",
        Language.ES: "es",
@@ -65,6 +65,7 @@ def language_to_lmnt_language(language: Language) -> Optional[str]:
        Language.TH: "th",
        Language.TR: "tr",
        Language.UK: "uk",
+        Language.UR: "ur",
        Language.VI: "vi",
        Language.ZH: "zh",
    }
@@ -96,6 +97,7 @@ class LmntTTSService(InterruptibleTTSService):
        voice_id: Optional[str] = None,
        sample_rate: Optional[int] = None,
        language: Language = Language.EN,
+        output_format: str = "pcm_s16le",
        model: Optional[str] = None,
        settings: Optional[LmntTTSSettings] = None,
        **kwargs,
@@ -111,6 +113,8 @@ class LmntTTSService(InterruptibleTTSService):

            sample_rate: Audio sample rate. If None, uses default.
            language: Language for synthesis. Defaults to English.
+            output_format: Audio output format. One of "pcm_s16le", "pcm_f32le",
+                "mp3", "ulaw", "webm". Defaults to "pcm_s16le".
            model: TTS model to use.

                .. deprecated:: 0.0.105
@@ -122,7 +126,7 @@ class LmntTTSService(InterruptibleTTSService):
        """
        # 1. Initialize default_settings with hardcoded defaults
        default_settings = LmntTTSSettings(
-            model="blizzard",
+            model="aurora",
            voice=None,
            language=self.language_to_service_language(language),
        )
@@ -151,7 +155,7 @@ class LmntTTSService(InterruptibleTTSService):
        )

        self._api_key = api_key
-        self._output_format = "raw"
+        self._output_format = output_format
        self._receive_task = None

    def can_generate_metrics(self) -> bool:
--- a/src/pipecat/services/minimax/tts.py
+++ b/src/pipecat/services/minimax/tts.py
@@ -90,7 +90,6 @@ class MiniMaxTTSSettings(TTSSettings):
    """Settings for MiniMaxHttpTTSService.

    Parameters:
-        stream: Whether to use streaming mode.
        speed: Speech speed (range: 0.5 to 2.0).
        volume: Speech volume (range: 0 to 10).
        pitch: Pitch adjustment (range: -12 to 12).
@@ -101,7 +100,6 @@ class MiniMaxTTSSettings(TTSSettings):
        language_boost: Language boost string for multilingual support.
    """

-    stream: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
    speed: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
    volume: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
    pitch: int | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
@@ -189,6 +187,7 @@ class MiniMaxHttpTTSService(TTSService):
        voice_id: Optional[str] = None,
        aiohttp_session: aiohttp.ClientSession,
        sample_rate: Optional[int] = None,
+        stream: bool = True,
        params: Optional[InputParams] = None,
        settings: Optional[MiniMaxTTSSettings] = None,
        **kwargs,
@@ -217,6 +216,7 @@ class MiniMaxHttpTTSService(TTSService):

            aiohttp_session: aiohttp.ClientSession for API communication.
            sample_rate: Output audio sample rate in Hz. If None, uses pipeline default.
+            stream: Whether to use streaming mode. Defaults to True.
            params: Additional configuration parameters.

                .. deprecated:: 0.0.105
@@ -231,7 +231,6 @@ class MiniMaxHttpTTSService(TTSService):
            model="speech-02-turbo",
            voice="Calm_Woman",
            language=None,
-            stream=True,
            speed=1.0,
            volume=1.0,
            pitch=0,
@@ -311,6 +310,7 @@ class MiniMaxHttpTTSService(TTSService):

        self._api_key = api_key
        self._group_id = group_id
+        self._stream = stream
        self._base_url = f"{base_url}?GroupId={group_id}"
        self._session = aiohttp_session

@@ -392,7 +392,7 @@ class MiniMaxHttpTTSService(TTSService):

        # Create payload from settings
        payload = {
-            "stream": self._settings.stream,
+            "stream": self._stream,
            "voice_setting": voice_setting,
            "audio_setting": audio_setting,
            "model": self._settings.model,
--- a/src/pipecat/services/neuphonic/tts.py
+++ b/src/pipecat/services/neuphonic/tts.py
@@ -26,12 +26,10 @@ from pipecat.frames.frames import (
    EndFrame,
    ErrorFrame,
    Frame,
-    InterruptionFrame,
    LLMFullResponseEndFrame,
    StartFrame,
    TTSAudioRawFrame,
    TTSSpeakFrame,
-    TTSStartedFrame,
    TTSStoppedFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection
@@ -487,7 +485,7 @@ class NeuphonicHttpTTSService(TTSService):
        default_settings = NeuphonicTTSSettings(
            model=None,
            voice=None,
-            language=self.language_to_service_language(Language.EN) or "en",
+            language=self.language_to_service_language(Language.EN),
            speed=1.0,
        )

@@ -501,9 +499,7 @@ class NeuphonicHttpTTSService(TTSService):
            _warn_deprecated_param("params", NeuphonicTTSSettings)
            if not settings:
                if params.language is not None:
-                    default_settings.language = (
-                        self.language_to_service_language(params.language) or "en"
-                    )
+                    default_settings.language = self.language_to_service_language(params.language)
                if params.speed is not None:
                    default_settings.speed = params.speed

--- a/src/pipecat/services/sarvam/tts.py
+++ b/src/pipecat/services/sarvam/tts.py
@@ -53,11 +53,9 @@ from pipecat.frames.frames import (
    EndFrame,
    ErrorFrame,
    Frame,
-    InterruptionFrame,
    LLMFullResponseEndFrame,
    StartFrame,
    TTSAudioRawFrame,
-    TTSStartedFrame,
    TTSStoppedFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection
@@ -230,16 +228,27 @@ def language_to_sarvam_language(language: Language) -> Optional[str]:
    """
    LANGUAGE_MAP = {
        Language.BN: "bn-IN",  # Bengali
+        Language.BN_IN: "bn-IN",
        Language.EN: "en-IN",  # English (India)
+        Language.EN_IN: "en-IN",
        Language.GU: "gu-IN",  # Gujarati
+        Language.GU_IN: "gu-IN",
        Language.HI: "hi-IN",  # Hindi
+        Language.HI_IN: "hi-IN",
        Language.KN: "kn-IN",  # Kannada
+        Language.KN_IN: "kn-IN",
        Language.ML: "ml-IN",  # Malayalam
+        Language.ML_IN: "ml-IN",
        Language.MR: "mr-IN",  # Marathi
+        Language.MR_IN: "mr-IN",
        Language.OR: "od-IN",  # Odia
+        Language.OR_IN: "od-IN",
        Language.PA: "pa-IN",  # Punjabi
+        Language.PA_IN: "pa-IN",
        Language.TA: "ta-IN",  # Tamil
+        Language.TA_IN: "ta-IN",
        Language.TE: "te-IN",  # Telugu
+        Language.TE_IN: "te-IN",
    }

    return resolve_language(language, LANGUAGE_MAP, use_base_code=False)
@@ -481,6 +490,10 @@ class SarvamHttpTTSService(TTSService):
        if settings is not None:
            default_settings.apply_update(settings)

+        # Convert Language enum to service-specific string
+        if isinstance(default_settings.language, Language):
+            default_settings.language = self.language_to_service_language(default_settings.language)
+
        # Get model configuration (validates model exists)
        resolved_model = default_settings.model
        if resolved_model not in TTS_MODEL_CONFIGS:
@@ -900,6 +913,10 @@ class SarvamTTSService(InterruptibleTTSService):
        if settings is not None:
            default_settings.apply_update(settings)

+        # Convert Language enum to service-specific string
+        if isinstance(default_settings.language, Language):
+            default_settings.language = self.language_to_service_language(default_settings.language)
+
        # Get model configuration (validates model exists)
        resolved_model = default_settings.model
        if resolved_model not in TTS_MODEL_CONFIGS:
--- a/src/pipecat/services/xtts/tts.py
+++ b/src/pipecat/services/xtts/tts.py
@@ -70,7 +70,7 @@ def language_to_xtts_language(language: Language) -> Optional[str]:

@dataclass
 class XTTSTTSSettings(TTSSettings):
-    """Settings for XTTSTTSService."""
+    """Settings for XTTSService."""

    pass