From 7861b911c0dace27240505efce69a47c0083c50b Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Tue, 15 Jul 2025 16:50:50 -0700
Subject: [PATCH 01/38] inworld: first commit of __init__ and tts.py files

---
 src/pipecat/services/inworld/__init__.py |  13 ++
 src/pipecat/services/inworld/tts.py      | 265 +++++++++++++++++++++++
 2 files changed, 278 insertions(+)
 create mode 100644 src/pipecat/services/inworld/__init__.py
 create mode 100644 src/pipecat/services/inworld/tts.py

diff --git a/src/pipecat/services/inworld/__init__.py b/src/pipecat/services/inworld/__init__.py
new file mode 100644
index 000000000..9717eb163
--- /dev/null
+++ b/src/pipecat/services/inworld/__init__.py
@@ -0,0 +1,13 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import sys
+
+from pipecat.services import DeprecatedModuleProxy
+
+from .tts import *
+
+sys.modules[__name__] = DeprecatedModuleProxy(globals(), "inworld", "inworld.tts")
diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
new file mode 100644
index 000000000..b938fb2a0
--- /dev/null
+++ b/src/pipecat/services/inworld/tts.py
@@ -0,0 +1,265 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+"""Inworld's text-to-speech service implementations."""
+
+import base64
+import json
+import uuid
+import warnings
+from typing import AsyncGenerator, List, Optional, Union
+
+import aiohttp
+from loguru import logger
+from pydantic import BaseModel, Field
+import io, json, base64
+
+from pipecat.frames.frames import (
+    CancelFrame,
+    EndFrame,
+    ErrorFrame,
+    Frame,
+    StartFrame,
+    StartInterruptionFrame,
+    TTSAudioRawFrame,
+    TTSStartedFrame,
+    TTSStoppedFrame,
+)
+from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.tts_service import AudioContextWordTTSService, TTSService
+from pipecat.transcriptions.language import Language
+from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator
+from pipecat.utils.text.base_text_aggregator import BaseTextAggregator
+from pipecat.utils.text.skip_tags_aggregator import SkipTagsAggregator
+from pipecat.utils.tracing.service_decorators import traced_tts
+
+
+def language_to_inworld_language(language: Language) -> Optional[str]:
+    """Convert Pipecat's Language enum to Inworld's language code.
+
+    Args:
+        language: The Language enum value to convert.
+
+    Returns:
+        The corresponding Inworld language code, or None if not supported.
+    """
+    BASE_LANGUAGES = {
+        Language.EN: "en",
+        Language.ES: "es",
+        Language.FR: "fr",
+        Language.KO: "ko",
+        Language.NL: "nl",
+        Language.ZH: "zh",
+    }
+
+    result = BASE_LANGUAGES.get(language)
+
+    # If not found in base languages, try to find the base language from a variant
+    if not result:
+        # Convert enum value to string and get the base language part (e.g. es-ES -> es)
+        lang_str = str(language.value)
+        base_code = lang_str.split("-")[0].lower()
+        # Look up the base code in our supported languages
+        result = base_code if base_code in BASE_LANGUAGES.values() else None
+
+    return result
+
+
+class InworldTTSService(TTSService):
+    """Inworld HTTP-based TTS service.
+
+    Provides text-to-speech using Inworld's HTTP API for simpler, non-streaming
+    synthesis. Suitable for use cases where streaming is not required and simpler
+    integration is preferred.
+    """
+
+    class InputParams(BaseModel):
+        """Input parameters for Inworld HTTP TTS configuration.
+
+        Parameters:
+            language: Language to use for synthesis.
+            speed: Voice speed control (string or float).
+            emotion: List of emotion controls.
+
+                .. deprecated:: 0.0.68
+                        The `emotion` parameter is deprecated and will be removed in a future version.
+        """
+
+        language: Optional[Language] = Language.EN
+        voice_id: str = "Ashley"
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        aiohttp_session: aiohttp.ClientSession,
+        model: str = "inworld-tts-1",
+        base_url: str = "https://api.inworld.ai/tts/v1/voice:stream",
+        sample_rate: Optional[int] = 48000,
+        encoding: str = "LINEAR16",
+        params: Optional[InputParams] = None,
+        **kwargs,
+    ):
+        """Initialize the Inworld HTTP TTS service.
+
+        Args:
+            api_key: Inworld API key for authentication.
+            aiohttp_session: Shared aiohttp session for HTTP requests.
+            voice_id: ID of the voice to use for synthesis.
+            model: TTS model to use (e.g., "sonic-2").
+            endpoint_url: Base URL for Inworld HTTP API.
+            sample_rate: Audio sample rate. If None, uses default.
+            encoding: Audio encoding format.
+            params: Additional input parameters for voice customization.
+            **kwargs: Additional arguments passed to the parent TTSService.
+        """
+        super().__init__(sample_rate=sample_rate, **kwargs)
+
+        params = params or InworldTTSService.InputParams()
+
+        self._api_key = api_key
+        self._session = aiohttp_session
+        self._base_url = base_url
+        self._settings = {
+            "voiceId": params.voice_id,
+            "modelId": model,
+            "audio_config": {
+                "audio_encoding": encoding,
+                "sample_rate_hertz": sample_rate,
+            },
+            "language": self.language_to_service_language(params.language)
+            if params.language
+            else "en",
+        }
+        self.set_voice(params.voice_id)
+        self.set_model_name(model)
+
+
+    def can_generate_metrics(self) -> bool:
+        """Check if this service can generate processing metrics.
+
+        Returns:
+            True, as Inworld HTTP service supports metrics generation.
+        """
+        return True
+
+    def language_to_service_language(self, language: Language) -> Optional[str]:
+        """Convert a Language enum to Inworld language format.
+
+        Args:
+            language: The language to convert.
+
+        Returns:
+            The Inworld-specific language code, or None if not supported.
+        """
+        return language_to_inworld_language(language)
+
+    async def start(self, frame: StartFrame):
+        """Start the Inworld HTTP TTS service.
+
+        Args:
+            frame: The start frame containing initialization parameters.
+        """
+        await super().start(frame)
+        self._settings["audio_config"]["sample_rate_hertz"] = self.sample_rate
+
+    async def stop(self, frame: EndFrame):
+        """Stop the Inworld HTTP TTS service.
+
+        Args:
+            frame: The end frame.
+        """
+        await super().stop(frame)
+        # await self._client.close()
+
+    async def cancel(self, frame: CancelFrame):
+        """Cancel the Inworld HTTP TTS service.
+
+        Args:
+            frame: The cancel frame.
+        """
+        await super().cancel(frame)
+        # await self._client.close()
+
+    @traced_tts
+    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        """Generate speech from text using Inworld's HTTP API.
+
+        Args:
+            text: The text to synthesize into speech.
+
+        Yields:
+            Frame: Audio frames containing the synthesized speech.
+        """
+        logger.debug(f"{self}: Generating TTS [{text}]")
+
+        payload = {
+            "text": text,
+            "voiceId": self._settings["voiceId"],
+            "modelId": self._settings["modelId"],
+            "audio_config": self._settings["audio_config"],
+            "language": self._settings["language"],
+        }
+
+        headers = {
+            "Authorization": f"Basic {self._api_key}",
+            "Content-Type": "application/json",
+        }
+
+        try:
+            await self.start_ttfb_metrics()
+
+            yield TTSStartedFrame()
+
+            async with self._session.post(self._base_url, json=payload, headers=headers) as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    logger.error(f"Inworld API error: {error_text}")
+                    await self.push_error(ErrorFrame(f"Inworld API error: {error_text}"))
+                    return
+
+                raw_audio_data = io.BytesIO()
+
+                async for line in response.content.iter_lines():
+                    line_str = line.decode('utf-8').strip()
+                    if not line_str:
+                        continue
+                    
+                    try:
+                        chunk = json.loads(line_str)
+                        if "result" in chunk and "audioContent" in chunk["result"]:
+                            audio_chunk = base64.b64decode(chunk["result"]["audioContent"])
+                            # Skip WAV header if present (first 44 bytes)
+                            if len(audio_chunk) > 44 and audio_chunk.startswith(b"RIFF"):
+                                audio_data = audio_chunk[44:]
+                            else:
+                                audio_data = audio_chunk
+                            raw_audio_data.write(audio_data)
+                    except json.JSONDecodeError:
+                        continue
+
+            await self.start_tts_usage_metrics(text)
+
+            audio_bytes = raw_audio_data.getvalue()
+            if not audio_bytes:
+                logger.error("No audio data received from Inworld API")
+                await self.push_error(ErrorFrame("No audio data received"))
+                return
+
+            frame = TTSAudioRawFrame(
+                audio=audio_bytes,
+                sample_rate=self.sample_rate,
+                num_channels=1,
+            )
+
+            yield frame
+
+        except Exception as e:
+            logger.error(f"{self} exception: {e}")
+            await self.push_error(ErrorFrame(f"Error generating TTS: {e}"))
+        finally:
+            await self.stop_ttfb_metrics()
+            yield TTSStoppedFrame()

From 384838147adfece71b273c5010d6e770b7d64a5a Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Tue, 15 Jul 2025 16:56:18 -0700
Subject: [PATCH 02/38] inworld: removed commented-out code from stop() and
 cancel()

---
 src/pipecat/services/inworld/tts.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
index b938fb2a0..b05134a5d 100644
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -173,7 +173,6 @@ class InworldTTSService(TTSService):
             frame: The end frame.
         """
         await super().stop(frame)
-        # await self._client.close()
 
     async def cancel(self, frame: CancelFrame):
         """Cancel the Inworld HTTP TTS service.
@@ -182,7 +181,6 @@ class InworldTTSService(TTSService):
             frame: The cancel frame.
         """
         await super().cancel(frame)
-        # await self._client.close()
 
     @traced_tts
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:

From 913dba3b74d42eec7c14b67c6ed3231f83478696 Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Tue, 15 Jul 2025 17:15:57 -0700
Subject: [PATCH 03/38] inworld: class name change

---
 src/pipecat/services/inworld/tts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
index b05134a5d..89f6c5e6c 100644
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -68,7 +68,7 @@ def language_to_inworld_language(language: Language) -> Optional[str]:
     return result
 
 
-class InworldTTSService(TTSService):
+class InworldHttpTTSService(TTSService):
     """Inworld HTTP-based TTS service.
 
     Provides text-to-speech using Inworld's HTTP API for simpler, non-streaming

From c67b779b9178f6b638ef94437fa71b72ee7e7839 Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Tue, 15 Jul 2025 17:21:16 -0700
Subject: [PATCH 04/38] inworld: first commit of Inworld example file for TTS

---
 .../07aa-interruptible-inworld-http.py        | 117 ++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100644 examples/foundational/07aa-interruptible-inworld-http.py

diff --git a/examples/foundational/07aa-interruptible-inworld-http.py b/examples/foundational/07aa-interruptible-inworld-http.py
new file mode 100644
index 000000000..2b8b1612d
--- /dev/null
+++ b/examples/foundational/07aa-interruptible-inworld-http.py
@@ -0,0 +1,117 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import argparse
+import os
+
+import aiohttp
+from dotenv import load_dotenv
+from loguru import logger
+
+from pipecat.audio.vad.silero import SileroVADAnalyzer
+from pipecat.pipeline.pipeline import Pipeline
+from pipecat.pipeline.runner import PipelineRunner
+from pipecat.pipeline.task import PipelineParams, PipelineTask
+from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
+from pipecat.services.deepgram.stt import DeepgramSTTService
+from pipecat.services.openai.llm import OpenAILLMService
+from pipecat.services.inworld.tts import InworldHttpTTSService
+from pipecat.transports.base_transport import BaseTransport, TransportParams
+from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
+from pipecat.transports.services.daily import DailyParams
+
+load_dotenv(override=True)
+
+
+# We store functions so objects (e.g. SileroVADAnalyzer) don't get
+# instantiated. The function will be called when the desired transport gets
+# selected.
+transport_params = {
+    "daily": lambda: DailyParams(
+        audio_in_enabled=True,
+        audio_out_enabled=True,
+        vad_analyzer=SileroVADAnalyzer(),
+    ),
+    "twilio": lambda: FastAPIWebsocketParams(
+        audio_in_enabled=True,
+        audio_out_enabled=True,
+        vad_analyzer=SileroVADAnalyzer(),
+    ),
+    "webrtc": lambda: TransportParams(
+        audio_in_enabled=True,
+        audio_out_enabled=True,
+        vad_analyzer=SileroVADAnalyzer(),
+    ),
+}
+
+
+async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool):
+    logger.info(f"Starting bot")
+
+    # Create an HTTP session
+    async with aiohttp.ClientSession() as session:
+        stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
+
+        tts = InworldHttpTTSService(
+            api_key=os.getenv("INWORLD_API_KEY", ""),
+            voice_id="Ashley",
+            model="inworld-tts-1",
+            aiohttp_session=session,
+        )
+
+        llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
+
+        messages = [
+            {
+                "role": "system",
+                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
+            },
+        ]
+
+        context = OpenAILLMContext(messages)
+        context_aggregator = llm.create_context_aggregator(context)
+
+        pipeline = Pipeline(
+            [
+                transport.input(),  # Transport user input
+                stt,
+                context_aggregator.user(),  # User responses
+                llm,  # LLM
+                tts,  # TTS
+                transport.output(),  # Transport bot output
+                context_aggregator.assistant(),  # Assistant spoken responses
+            ]
+        )
+
+        task = PipelineTask(
+            pipeline,
+            params=PipelineParams(
+                enable_metrics=True,
+                enable_usage_metrics=True,
+            ),
+        )
+
+        @transport.event_handler("on_client_connected")
+        async def on_client_connected(transport, client):
+            logger.info(f"Client connected")
+            # Kick off the conversation.
+            messages.append({"role": "system", "content": "Please introduce yourself to the user."})
+            await task.queue_frames([context_aggregator.user().get_context_frame()])
+
+        @transport.event_handler("on_client_disconnected")
+        async def on_client_disconnected(transport, client):
+            logger.info(f"Client disconnected")
+            await task.cancel()
+
+        runner = PipelineRunner(handle_sigint=handle_sigint)
+
+        await runner.run(task)
+
+
+if __name__ == "__main__":
+    from pipecat.examples.run import main
+
+    main(run_example, transport_params=transport_params)

From ca936bd56966c4787903a13cdd72ecf9cb9a5eba Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Tue, 15 Jul 2025 18:11:50 -0700
Subject: [PATCH 05/38] inworld: added Inworld to list of needed credentials

---
 dot-env.template | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/dot-env.template b/dot-env.template
index ab085757f..d79b67c63 100644
--- a/dot-env.template
+++ b/dot-env.template
@@ -76,6 +76,9 @@ GROQ_API_KEY=...
 # Grok
 GROK_API_KEY=...
 
+# Inworld
+INWORLD_API_KEY=...
+
 # Together.ai
 TOGETHER_API_KEY=...
 

From 2b76823b017d5d8a5299660ad4480b0c5842ab5e Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Tue, 15 Jul 2025 18:17:30 -0700
Subject: [PATCH 06/38] inworld: added comments to track a few things to
 confirm

---
 src/pipecat/services/inworld/tts.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
index 89f6c5e6c..620a4b702 100644
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -89,7 +89,8 @@ class InworldHttpTTSService(TTSService):
         """
 
         language: Optional[Language] = Language.EN
-        voice_id: str = "Ashley"
+        voice_id: str = "Ashley" ## QUESTION: How to make this modifyable/how to modify?
+        # QUESTION: What about speed, pitch, and temperature??
 
     def __init__(
         self,

From f3984aec33462fcfc8387132c1265df3d21c7c88 Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Wed, 16 Jul 2025 13:21:32 -0700
Subject: [PATCH 07/38] inworld: added (empty) requirements for Inworld to be
 explicit regarding dependencies

---
 docs/api/requirements.txt | 1 +
 pyproject.toml            | 1 +
 2 files changed, 2 insertions(+)

diff --git a/docs/api/requirements.txt b/docs/api/requirements.txt
index c9e8e2ce9..4c68da84b 100644
--- a/docs/api/requirements.txt
+++ b/docs/api/requirements.txt
@@ -23,6 +23,7 @@ pipecat-ai[gladia]
 pipecat-ai[google]
 pipecat-ai[grok]
 pipecat-ai[groq]
+pipecat-ai[inworld]
 # pipecat-ai[krisp] # Mocked
 pipecat-ai[koala]
 # pipecat-ai[langchain] # Mocked
diff --git a/pyproject.toml b/pyproject.toml
index 39a16231f..0ab9109a8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -59,6 +59,7 @@ google = [ "google-cloud-speech~=2.32.0", "google-cloud-texttospeech~=2.26.0", "
 grok = []
 groq = [ "groq~=0.23.0" ]
 gstreamer = [ "pygobject~=3.50.0" ]
+inworld = []
 krisp = [ "pipecat-ai-krisp~=0.4.0" ]
 koala = [ "pvkoala~=2.0.3" ]
 langchain = [ "langchain~=0.3.20", "langchain-community~=0.3.20", "langchain-openai~=0.3.9" ]

From 1bc442e3292d47313def7511c7415fe6b7d7c080 Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Fri, 18 Jul 2025 15:13:19 -0700
Subject: [PATCH 08/38] inworld: docstring fix and chunk-by-chunk audio
 streaming in run_tts

---
 src/pipecat/services/inworld/tts.py | 142 ++++++++++++++++++++++------
 1 file changed, 112 insertions(+), 30 deletions(-)

diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
index 620a4b702..bf9a5d6ee 100644
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -7,6 +7,7 @@
 """Inworld's text-to-speech service implementations."""
 
 import base64
+import io
 import json
 import uuid
 import warnings
@@ -15,7 +16,6 @@ from typing import AsyncGenerator, List, Optional, Union
 import aiohttp
 from loguru import logger
 from pydantic import BaseModel, Field
-import io, json, base64
 
 from pipecat.frames.frames import (
     CancelFrame,
@@ -89,7 +89,7 @@ class InworldHttpTTSService(TTSService):
         """
 
         language: Optional[Language] = Language.EN
-        voice_id: str = "Ashley" ## QUESTION: How to make this modifyable/how to modify?
+        voice_id: str = "Ashley"  ## QUESTION: How to make this modifyable/how to modify?
         # QUESTION: What about speed, pitch, and temperature??
 
     def __init__(
@@ -109,9 +109,8 @@ class InworldHttpTTSService(TTSService):
         Args:
             api_key: Inworld API key for authentication.
             aiohttp_session: Shared aiohttp session for HTTP requests.
-            voice_id: ID of the voice to use for synthesis.
-            model: TTS model to use (e.g., "sonic-2").
-            endpoint_url: Base URL for Inworld HTTP API.
+            model: TTS model to use (e.g., "inworld-tts-1").
+            base_url: Base URL for Inworld HTTP API.
             sample_rate: Audio sample rate. If None, uses default.
             encoding: Audio encoding format.
             params: Additional input parameters for voice customization.
@@ -138,7 +137,6 @@ class InworldHttpTTSService(TTSService):
         self.set_voice(params.voice_id)
         self.set_model_name(model)
 
-
     def can_generate_metrics(self) -> bool:
         """Check if this service can generate processing metrics.
 
@@ -187,6 +185,8 @@ class InworldHttpTTSService(TTSService):
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
         """Generate speech from text using Inworld's HTTP API.
 
+        This implementation streams audio chunk by chunk as it's received.
+
         Args:
             text: The text to synthesize into speech.
 
@@ -213,52 +213,134 @@ class InworldHttpTTSService(TTSService):
 
             yield TTSStartedFrame()
 
-            async with self._session.post(self._base_url, json=payload, headers=headers) as response:
+            # A flag to ensure we only strip the header from the very first chunk.
+            is_first_chunk = True
+
+            async with self._session.post(
+                self._base_url, json=payload, headers=headers
+            ) as response:
                 if response.status != 200:
                     error_text = await response.text()
                     logger.error(f"Inworld API error: {error_text}")
                     await self.push_error(ErrorFrame(f"Inworld API error: {error_text}"))
                     return
 
-                raw_audio_data = io.BytesIO()
-
+                # Process the stream line by line.
                 async for line in response.content.iter_lines():
-                    line_str = line.decode('utf-8').strip()
+                    line_str = line.decode("utf-8").strip()
                     if not line_str:
                         continue
-                    
+
                     try:
                         chunk = json.loads(line_str)
                         if "result" in chunk and "audioContent" in chunk["result"]:
                             audio_chunk = base64.b64decode(chunk["result"]["audioContent"])
-                            # Skip WAV header if present (first 44 bytes)
-                            if len(audio_chunk) > 44 and audio_chunk.startswith(b"RIFF"):
+                            audio_data = audio_chunk
+
+                            # Correctly strip the header only from the first chunk.
+                            if (
+                                is_first_chunk
+                                and len(audio_chunk) > 44
+                                and audio_chunk.startswith(b"RIFF")
+                            ):
                                 audio_data = audio_chunk[44:]
-                            else:
-                                audio_data = audio_chunk
-                            raw_audio_data.write(audio_data)
+                                is_first_chunk = False  # Unset the flag.
+
+                            # Yield each audio frame as it's processed.
+                            yield TTSAudioRawFrame(
+                                audio=audio_data,
+                                sample_rate=self.sample_rate,
+                                num_channels=1,
+                            )
+
                     except json.JSONDecodeError:
                         continue
 
             await self.start_tts_usage_metrics(text)
 
-            audio_bytes = raw_audio_data.getvalue()
-            if not audio_bytes:
-                logger.error("No audio data received from Inworld API")
-                await self.push_error(ErrorFrame("No audio data received"))
-                return
-
-            frame = TTSAudioRawFrame(
-                audio=audio_bytes,
-                sample_rate=self.sample_rate,
-                num_channels=1,
-            )
-
-            yield frame
-
         except Exception as e:
             logger.error(f"{self} exception: {e}")
             await self.push_error(ErrorFrame(f"Error generating TTS: {e}"))
         finally:
             await self.stop_ttfb_metrics()
             yield TTSStoppedFrame()
+
+    # @traced_tts
+    # async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+    #     """Generate speech from text using Inworld's HTTP API.
+
+    #     Args:
+    #         text: The text to synthesize into speech.
+
+    #     Yields:
+    #         Frame: Audio frames containing the synthesized speech.
+    #     """
+    #     logger.debug(f"{self}: Generating TTS [{text}]")
+
+    #     payload = {
+    #         "text": text,
+    #         "voiceId": self._settings["voiceId"],
+    #         "modelId": self._settings["modelId"],
+    #         "audio_config": self._settings["audio_config"],
+    #         "language": self._settings["language"],
+    #     }
+
+    #     headers = {
+    #         "Authorization": f"Basic {self._api_key}",
+    #         "Content-Type": "application/json",
+    #     }
+
+    #     try:
+    #         await self.start_ttfb_metrics()
+
+    #         yield TTSStartedFrame()
+
+    #         async with self._session.post(self._base_url, json=payload, headers=headers) as response:
+    #             if response.status != 200:
+    #                 error_text = await response.text()
+    #                 logger.error(f"Inworld API error: {error_text}")
+    #                 await self.push_error(ErrorFrame(f"Inworld API error: {error_text}"))
+    #                 return
+
+    #             raw_audio_data = io.BytesIO()
+
+    #             async for line in response.content.iter_lines():
+    #                 line_str = line.decode('utf-8').strip()
+    #                 if not line_str:
+    #                     continue
+
+    #                 try:
+    #                     chunk = json.loads(line_str)
+    #                     if "result" in chunk and "audioContent" in chunk["result"]:
+    #                         audio_chunk = base64.b64decode(chunk["result"]["audioContent"])
+    #                         # Skip WAV header if present (first 44 bytes)
+    #                         if len(audio_chunk) > 44 and audio_chunk.startswith(b"RIFF"):
+    #                             audio_data = audio_chunk[44:]
+    #                         else:
+    #                             audio_data = audio_chunk
+    #                         raw_audio_data.write(audio_data)
+    #                 except json.JSONDecodeError:
+    #                     continue
+
+    #         await self.start_tts_usage_metrics(text)
+
+    #         audio_bytes = raw_audio_data.getvalue()
+    #         if not audio_bytes:
+    #             logger.error("No audio data received from Inworld API")
+    #             await self.push_error(ErrorFrame("No audio data received"))
+    #             return
+
+    #         frame = TTSAudioRawFrame(
+    #             audio=audio_bytes,
+    #             sample_rate=self.sample_rate,
+    #             num_channels=1,
+    #         )
+
+    #         yield frame
+
+    #     except Exception as e:
+    #         logger.error(f"{self} exception: {e}")
+    #         await self.push_error(ErrorFrame(f"Error generating TTS: {e}"))
+    #     finally:
+    #         await self.stop_ttfb_metrics()
+    #         yield TTSStoppedFrame()

From 5d8c184d99f95fe0761036bbb691fc9ac03e0b6f Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Fri, 18 Jul 2025 16:30:03 -0700
Subject: [PATCH 09/38] inworld: commit of a copy of the original example file
 and changes that mirror OpenAI's example, with Inworld TTS as only change

---
 .../07aa-interruptible-inworld-http.py        |  16 ++-
 .../07aa-interruptible-inworld-http_copy.py   | 117 ++++++++++++++++++
 2 files changed, 127 insertions(+), 6 deletions(-)
 create mode 100644 examples/foundational/07aa-interruptible-inworld-http_copy.py

diff --git a/examples/foundational/07aa-interruptible-inworld-http.py b/examples/foundational/07aa-interruptible-inworld-http.py
index 2b8b1612d..03622a2d4 100644
--- a/examples/foundational/07aa-interruptible-inworld-http.py
+++ b/examples/foundational/07aa-interruptible-inworld-http.py
@@ -16,16 +16,15 @@ from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
-from pipecat.services.deepgram.stt import DeepgramSTTService
-from pipecat.services.openai.llm import OpenAILLMService
 from pipecat.services.inworld.tts import InworldHttpTTSService
+from pipecat.services.openai.llm import OpenAILLMService
+from pipecat.services.openai.stt import OpenAISTTService
 from pipecat.transports.base_transport import BaseTransport, TransportParams
 from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
 from pipecat.transports.services.daily import DailyParams
 
 load_dotenv(override=True)
 
-
 # We store functions so objects (e.g. SileroVADAnalyzer) don't get
 # instantiated. The function will be called when the desired transport gets
 # selected.
@@ -53,7 +52,11 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si
 
     # Create an HTTP session
     async with aiohttp.ClientSession() as session:
-        stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
+        stt = OpenAISTTService(
+            api_key=os.getenv("OPENAI_API_KEY"),
+            model="gpt-4o-transcribe",
+            prompt="Expect words related to dogs, such as breed names.",
+        )
 
         tts = InworldHttpTTSService(
             api_key=os.getenv("INWORLD_API_KEY", ""),
@@ -67,7 +70,7 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si
         messages = [
             {
                 "role": "system",
-                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
+                "content": "You are very knowledgable about dogs. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
             },
         ]
 
@@ -77,7 +80,7 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si
         pipeline = Pipeline(
             [
                 transport.input(),  # Transport user input
-                stt,
+                stt,  # STT
                 context_aggregator.user(),  # User responses
                 llm,  # LLM
                 tts,  # TTS
@@ -89,6 +92,7 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si
         task = PipelineTask(
             pipeline,
             params=PipelineParams(
+                audio_out_sample_rate=24000,
                 enable_metrics=True,
                 enable_usage_metrics=True,
             ),
diff --git a/examples/foundational/07aa-interruptible-inworld-http_copy.py b/examples/foundational/07aa-interruptible-inworld-http_copy.py
new file mode 100644
index 000000000..0121865ab
--- /dev/null
+++ b/examples/foundational/07aa-interruptible-inworld-http_copy.py
@@ -0,0 +1,117 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import argparse
+import os
+
+import aiohttp
+from dotenv import load_dotenv
+from loguru import logger
+
+from pipecat.audio.vad.silero import SileroVADAnalyzer
+from pipecat.pipeline.pipeline import Pipeline
+from pipecat.pipeline.runner import PipelineRunner
+from pipecat.pipeline.task import PipelineParams, PipelineTask
+from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
+from pipecat.services.deepgram.stt import DeepgramSTTService
+from pipecat.services.inworld.tts import InworldHttpTTSService
+from pipecat.services.openai.llm import OpenAILLMService
+from pipecat.transports.base_transport import BaseTransport, TransportParams
+from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
+from pipecat.transports.services.daily import DailyParams
+
+load_dotenv(override=True)
+
+
+# We store functions so objects (e.g. SileroVADAnalyzer) don't get
+# instantiated. The function will be called when the desired transport gets
+# selected.
+transport_params = {
+    "daily": lambda: DailyParams(
+        audio_in_enabled=True,
+        audio_out_enabled=True,
+        vad_analyzer=SileroVADAnalyzer(),
+    ),
+    "twilio": lambda: FastAPIWebsocketParams(
+        audio_in_enabled=True,
+        audio_out_enabled=True,
+        vad_analyzer=SileroVADAnalyzer(),
+    ),
+    "webrtc": lambda: TransportParams(
+        audio_in_enabled=True,
+        audio_out_enabled=True,
+        vad_analyzer=SileroVADAnalyzer(),
+    ),
+}
+
+
+async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool):
+    logger.info(f"Starting bot")
+
+    # Create an HTTP session
+    async with aiohttp.ClientSession() as session:
+        stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
+
+        tts = InworldHttpTTSService(
+            api_key=os.getenv("INWORLD_API_KEY", ""),
+            voice_id="Ashley",
+            model="inworld-tts-1",
+            aiohttp_session=session,
+        )
+
+        llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
+
+        messages = [
+            {
+                "role": "system",
+                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
+            },
+        ]
+
+        context = OpenAILLMContext(messages)
+        context_aggregator = llm.create_context_aggregator(context)
+
+        pipeline = Pipeline(
+            [
+                transport.input(),  # Transport user input
+                stt,
+                context_aggregator.user(),  # User responses
+                llm,  # LLM
+                tts,  # TTS
+                transport.output(),  # Transport bot output
+                context_aggregator.assistant(),  # Assistant spoken responses
+            ]
+        )
+
+        task = PipelineTask(
+            pipeline,
+            params=PipelineParams(
+                enable_metrics=True,
+                enable_usage_metrics=True,
+            ),
+        )
+
+        @transport.event_handler("on_client_connected")
+        async def on_client_connected(transport, client):
+            logger.info(f"Client connected")
+            # Kick off the conversation.
+            messages.append({"role": "system", "content": "Please introduce yourself to the user."})
+            await task.queue_frames([context_aggregator.user().get_context_frame()])
+
+        @transport.event_handler("on_client_disconnected")
+        async def on_client_disconnected(transport, client):
+            logger.info(f"Client disconnected")
+            await task.cancel()
+
+        runner = PipelineRunner(handle_sigint=handle_sigint)
+
+        await runner.run(task)
+
+
+if __name__ == "__main__":
+    from pipecat.examples.run import main
+
+    main(run_example, transport_params=transport_params)

From e3711f96a31ad11bf26b68c397e9971f3c08927b Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Sun, 20 Jul 2025 17:06:35 -0700
Subject: [PATCH 10/38] inworld: added detailed comments

---
 src/pipecat/services/inworld/tts.py | 440 +++++++++++++++++++---------
 1 file changed, 297 insertions(+), 143 deletions(-)

diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
index bf9a5d6ee..9d328fa5a 100644
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -4,7 +4,33 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
-"""Inworld's text-to-speech service implementations."""
+"""Inworld AI Text-to-Speech Service Implementation.
+
+This module provides integration with Inworld AI's HTTP-based TTS API, enabling
+real-time text-to-speech synthesis with high-quality, natural-sounding voices.
+
+Key Features:
+- HTTP streaming API support for low-latency audio generation
+- Multiple voice options (Ashley, Hades, etc.)
+- Real-time audio chunk processing with proper buffering
+- WAV header handling and audio format conversion
+- Comprehensive error handling and metrics tracking
+
+Technical Implementation:
+- Uses aiohttp for HTTP streaming connections
+- Implements JSON line-by-line parsing for streaming responses
+- Handles base64-encoded audio data with proper decoding
+- Manages audio continuity to prevent clicks and artifacts
+- Integrates with Pipecat's frame-based pipeline architecture
+
+Usage:
+    tts = InworldHttpTTSService(
+        api_key=os.getenv("INWORLD_API_KEY"),
+        voice_id="Ashley",
+        model="inworld-tts-1",
+        aiohttp_session=session
+    )
+"""
 
 import base64
 import io
@@ -40,11 +66,35 @@ from pipecat.utils.tracing.service_decorators import traced_tts
 def language_to_inworld_language(language: Language) -> Optional[str]:
     """Convert Pipecat's Language enum to Inworld's language code.
 
+    Inworld AI supports a specific set of language codes for TTS synthesis.
+    This function maps Pipecat's standardized Language enum values to the
+    corresponding language codes expected by Inworld's API.
+
+    Supported Languages:
+    - EN (English) -> "en"
+    - ES (Spanish) -> "es"
+    - FR (French) -> "fr"
+    - KO (Korean) -> "ko"
+    - NL (Dutch) -> "nl"
+    - ZH (Chinese) -> "zh"
+
+    The function also handles language variants (e.g., es-ES, en-US) by
+    extracting the base language code and mapping it if supported.
+
     Args:
-        language: The Language enum value to convert.
+        language: The Language enum value to convert (e.g., Language.EN).
 
     Returns:
-        The corresponding Inworld language code, or None if not supported.
+        The corresponding Inworld language code string (e.g., "en"),
+        or None if the language is not supported by Inworld's API.
+
+    Example:
+        >>> language_to_inworld_language(Language.EN)
+        "en"
+        >>> language_to_inworld_language(Language.ES)
+        "es"
+        >>> language_to_inworld_language(Language.DE)  # Not supported
+        None
     """
     BASE_LANGUAGES = {
         Language.EN: "en",
@@ -69,11 +119,42 @@ def language_to_inworld_language(language: Language) -> Optional[str]:
 
 
 class InworldHttpTTSService(TTSService):
-    """Inworld HTTP-based TTS service.
+    """Inworld AI HTTP-based Text-to-Speech Service.
 
-    Provides text-to-speech using Inworld's HTTP API for simpler, non-streaming
-    synthesis. Suitable for use cases where streaming is not required and simpler
-    integration is preferred.
+    This service integrates Inworld AI's high-quality TTS API with Pipecat's pipeline
+    architecture. It provides real-time speech synthesis with natural-sounding voices
+    and low-latency streaming audio delivery.
+
+    Key Features:
+    - Real-time HTTP streaming for minimal latency
+    - Multiple voice options (Ashley, Hades, etc.)
+    - High-quality audio output (48kHz LINEAR16 PCM)
+    - Automatic audio format handling and header stripping
+    - Comprehensive error handling and recovery
+    - Built-in performance metrics and monitoring
+
+    Technical Architecture:
+    - Uses aiohttp for non-blocking HTTP requests
+    - Implements JSON line-by-line streaming protocol
+    - Processes base64-encoded audio chunks in real-time
+    - Manages audio continuity to prevent artifacts
+    - Integrates with Pipecat's frame-based pipeline system
+
+    Supported Configuration:
+    - Voice Selection: Ashley, Hades, and other Inworld voices
+    - Models: inworld-tts-1 and other available models
+    - Audio Formats: LINEAR16 PCM at various sample rates
+    - Languages: English, Spanish, French, Korean, Dutch, Chinese
+
+    Example Usage:
+        async with aiohttp.ClientSession() as session:
+            tts = InworldHttpTTSService(
+                api_key=os.getenv("INWORLD_API_KEY"),
+                voice_id="Ashley",                    # Voice selection
+                model="inworld-tts-1",               # TTS model
+                aiohttp_session=session,             # Required HTTP session
+                sample_rate=48000,                   # Audio quality
+            )
     """
 
     class InputParams(BaseModel):
@@ -89,7 +170,7 @@ class InworldHttpTTSService(TTSService):
         """
 
         language: Optional[Language] = Language.EN
-        voice_id: str = "Ashley"  ## QUESTION: How to make this modifyable/how to modify?
+        voice_id: str = "Hades"  ## QUESTION: How to make this modifyable/how to modify?
         # QUESTION: What about speed, pitch, and temperature??
 
     def __init__(
@@ -97,6 +178,7 @@ class InworldHttpTTSService(TTSService):
         *,
         api_key: str,
         aiohttp_session: aiohttp.ClientSession,
+        voice_id: str = "Ashley",
         model: str = "inworld-tts-1",
         base_url: str = "https://api.inworld.ai/tts/v1/voice:stream",
         sample_rate: Optional[int] = 48000,
@@ -106,36 +188,67 @@ class InworldHttpTTSService(TTSService):
     ):
         """Initialize the Inworld HTTP TTS service.
 
+        Sets up the TTS service with Inworld AI's streaming API configuration.
+        This constructor prepares all necessary parameters for real-time speech synthesis.
+
         Args:
-            api_key: Inworld API key for authentication.
-            aiohttp_session: Shared aiohttp session for HTTP requests.
-            model: TTS model to use (e.g., "inworld-tts-1").
-            base_url: Base URL for Inworld HTTP API.
-            sample_rate: Audio sample rate. If None, uses default.
-            encoding: Audio encoding format.
-            params: Additional input parameters for voice customization.
-            **kwargs: Additional arguments passed to the parent TTSService.
+            api_key: Inworld API key for authentication (base64-encoded from Inworld Portal).
+                    Get this from: Inworld Portal > Settings > API Keys > Runtime API Key
+            aiohttp_session: Shared aiohttp session for HTTP requests. Must be provided
+                           for proper connection pooling and resource management.
+            voice_id: Voice to use for synthesis. Available options include:
+                     - "Ashley" (default) - Natural female voice
+                     - "Hades" - Distinctive character voice
+                     - Other voices available through Inworld's voice catalog
+            model: TTS model to use. Currently supported:
+                  - "inworld-tts-1" (default) - Latest high-quality model
+                  - Other models as available in Inworld's API
+            base_url: Base URL for Inworld HTTP API. Uses streaming endpoint by default.
+                     Should normally not be changed unless using a different environment.
+            sample_rate: Audio sample rate in Hz. Common values:
+                        - 48000 (default) - High quality, suitable for most applications
+                        - 24000 - Good quality, lower bandwidth
+                        - 16000 - Basic quality, minimal bandwidth
+            encoding: Audio encoding format. Supported options:
+                     - "LINEAR16" (default) - Uncompressed PCM, best quality
+                     - Other formats as supported by Inworld API
+            params: Additional input parameters for advanced voice customization.
+                   Usually None for standard usage.
+            **kwargs: Additional arguments passed to the parent TTSService class.
+
+        Note:
+            The aiohttp_session parameter is required because Inworld's HTTP API
+            benefits from connection reuse and proper async session management.
         """
+        # Initialize parent TTSService with audio configuration
         super().__init__(sample_rate=sample_rate, **kwargs)
 
-        params = params or InworldTTSService.InputParams()
+        # Use provided params or create default configuration
+        params = params or InworldHttpTTSService.InputParams()
 
-        self._api_key = api_key
-        self._session = aiohttp_session
-        self._base_url = base_url
+        # Store core configuration for API requests
+        self._api_key = api_key  # Authentication credentials
+        self._session = aiohttp_session  # HTTP session for requests
+        self._base_url = base_url  # API endpoint URL
+
+        # Build settings dictionary that matches Inworld's API expectations
+        # This will be sent as JSON payload in each TTS request
         self._settings = {
-            "voiceId": params.voice_id,
-            "modelId": model,
-            "audio_config": {
-                "audio_encoding": encoding,
-                "sample_rate_hertz": sample_rate,
+            "voiceId": voice_id,  # Voice selection (fixes bug where this was ignored)
+            "modelId": model,  # TTS model selection
+            "audio_config": {  # Audio format configuration
+                "audio_encoding": encoding,  # Format: LINEAR16, MP3, etc.
+                "sample_rate_hertz": sample_rate,  # Sample rate: 48000, 24000, etc.
             },
+            # Language configuration with fallback to English
             "language": self.language_to_service_language(params.language)
             if params.language
             else "en",
         }
-        self.set_voice(params.voice_id)
-        self.set_model_name(model)
+
+        # Register voice and model with parent service for metrics and tracking
+        self.set_voice(voice_id)  # Used for logging and metrics
+        self.set_model_name(model)  # Used for performance tracking
 
     def can_generate_metrics(self) -> bool:
         """Check if this service can generate processing metrics.
@@ -183,164 +296,205 @@ class InworldHttpTTSService(TTSService):
 
     @traced_tts
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
-        """Generate speech from text using Inworld's HTTP API.
+        """Generate speech from text using Inworld's streaming HTTP API.
 
-        This implementation streams audio chunk by chunk as it's received.
+        This is the core TTS processing function that:
+        1. Sends text to Inworld's streaming TTS endpoint
+        2. Receives JSON-streamed audio chunks in real-time
+        3. Processes and cleans audio data (removes WAV headers, validates content)
+        4. Yields audio frames for immediate playback in the pipeline
+
+        Technical Details:
+        - Uses HTTP streaming with JSON line-by-line responses
+        - Each JSON line contains base64-encoded audio data
+        - Implements buffering to handle partial JSON lines
+        - Strips WAV headers to prevent audio artifacts/clicks
+        - Provides real-time audio streaming for low latency
 
         Args:
             text: The text to synthesize into speech.
 
         Yields:
-            Frame: Audio frames containing the synthesized speech.
+            Frame: Audio frames containing the synthesized speech, plus control frames.
+
+        Note:
+            Errors are not raised; they are reported via push_error() as an ErrorFrame.
         """
         logger.debug(f"{self}: Generating TTS [{text}]")
 
+        # ================================================================================
+        # STEP 1: PREPARE API REQUEST
+        # ================================================================================
+        # Build the JSON payload according to Inworld's API specification
+        # This matches the format shown in their documentation examples
         payload = {
-            "text": text,
-            "voiceId": self._settings["voiceId"],
-            "modelId": self._settings["modelId"],
-            "audio_config": self._settings["audio_config"],
-            "language": self._settings["language"],
+            "text": text,  # Text to synthesize
+            "voiceId": self._settings["voiceId"],  # Voice selection (Ashley, Hades, etc.)
+            "modelId": self._settings["modelId"],  # TTS model (inworld-tts-1)
+            "audio_config": self._settings[
+                "audio_config"
+            ],  # Audio format settings (LINEAR16, 48kHz)
+            "language": self._settings["language"],  # Language code (en, es, etc.)
         }
 
+        # Set up HTTP headers for authentication and content type
+        # Inworld requires Basic auth with base64-encoded API key
         headers = {
-            "Authorization": f"Basic {self._api_key}",
-            "Content-Type": "application/json",
+            "Authorization": f"Basic {self._api_key}",  # Base64 API key from Inworld Portal
+            "Content-Type": "application/json",  # JSON request body
         }
 
         try:
+            # ================================================================================
+            # STEP 2: INITIALIZE METRICS AND STREAMING
+            # ================================================================================
+            # Start measuring Time To First Byte (TTFB) for performance tracking
             await self.start_ttfb_metrics()
 
+            # Signal to the pipeline that TTS generation has started
+            # This allows downstream processors to prepare for incoming audio
             yield TTSStartedFrame()
 
-            # A flag to ensure we only strip the header from the very first chunk.
+            # Flag to track if we're processing the first audio chunk
+            # Only cleared once a RIFF header is seen; it no longer gates header stripping
             is_first_chunk = True
 
+            # ================================================================================
+            # STEP 3: MAKE HTTP STREAMING REQUEST
+            # ================================================================================
+            # Use aiohttp's streaming POST to Inworld's streaming endpoint
+            # The endpoint returns JSON lines with audio chunks as they're generated
             async with self._session.post(
                 self._base_url, json=payload, headers=headers
             ) as response:
+                # ================================================================================
+                # STEP 4: HANDLE HTTP ERRORS
+                # ================================================================================
+                # Check for API errors (expired keys, invalid requests, etc.)
                 if response.status != 200:
                     error_text = await response.text()
                     logger.error(f"Inworld API error: {error_text}")
                     await self.push_error(ErrorFrame(f"Inworld API error: {error_text}"))
                     return
 
-                # Process the stream line by line.
-                async for line in response.content.iter_lines():
-                    line_str = line.decode("utf-8").strip()
-                    if not line_str:
+                # ================================================================================
+                # STEP 5: PROCESS STREAMING JSON RESPONSE
+                # ================================================================================
+                # Inworld streams JSON lines where each line contains audio data
+                # We need to buffer incoming data and process complete lines
+
+                # Buffer to accumulate incoming text data
+                # This handles cases where JSON lines are split across HTTP chunks
+                buffer = ""
+
+                # Read HTTP response in manageable chunks (1KB each)
+                # This prevents memory issues with large responses
+                async for chunk in response.content.iter_chunked(1024):
+                    if not chunk:
                         continue
 
-                    try:
-                        chunk = json.loads(line_str)
-                        if "result" in chunk and "audioContent" in chunk["result"]:
-                            audio_chunk = base64.b64decode(chunk["result"]["audioContent"])
-                            audio_data = audio_chunk
+                    # ============================================================================
+                    # STEP 6: BUFFER MANAGEMENT
+                    # ============================================================================
+                    # Decode binary chunk to text and add to our line buffer
+                    # Each chunk may contain partial JSON lines, so we need to accumulate
+                    buffer += chunk.decode("utf-8")
 
-                            # Correctly strip the header only from the first chunk.
-                            if (
-                                is_first_chunk
-                                and len(audio_chunk) > 44
-                                and audio_chunk.startswith(b"RIFF")
-                            ):
-                                audio_data = audio_chunk[44:]
-                                is_first_chunk = False  # Unset the flag.
+                    # ============================================================================
+                    # STEP 7: LINE-BY-LINE JSON PROCESSING
+                    # ============================================================================
+                    # Process all complete lines in the buffer (lines ending with \n)
+                    # Leave partial lines in buffer for next iteration
+                    while "\n" in buffer:
+                        # Split on first newline, keeping remainder in buffer
+                        line, buffer = buffer.split("\n", 1)
+                        line_str = line.strip()
 
-                            # Yield each audio frame as it's processed.
-                            yield TTSAudioRawFrame(
-                                audio=audio_data,
-                                sample_rate=self.sample_rate,
-                                num_channels=1,
-                            )
+                        # Skip empty lines (common in streaming responses)
+                        if not line_str:
+                            continue
 
-                    except json.JSONDecodeError:
-                        continue
+                        try:
+                            # ================================================================
+                            # STEP 8: PARSE JSON AND EXTRACT AUDIO
+                            # ================================================================
+                            # Parse the JSON line - should contain audio data
+                            chunk_data = json.loads(line_str)
 
+                            # Check if this line contains audio content
+                            # Inworld's response format: {"result": {"audioContent": "base64data"}}
+                            if "result" in chunk_data and "audioContent" in chunk_data["result"]:
+                                # Decode base64 audio data to binary
+                                audio_chunk = base64.b64decode(chunk_data["result"]["audioContent"])
+
+                                # ========================================================
+                                # STEP 9: AUDIO DATA VALIDATION
+                                # ========================================================
+                                # Skip empty audio chunks that could cause discontinuities
+                                # Empty chunks can create gaps or clicks in audio playback
+                                if not audio_chunk:
+                                    continue
+
+                                # Start with the raw audio data
+                                audio_data = audio_chunk
+
+                                # ========================================================
+                                # STEP 10: WAV HEADER REMOVAL (CRITICAL FOR AUDIO QUALITY)
+                                # ========================================================
+                                # Each audio chunk may have its own WAV header (44 bytes)
+                                # These headers contain metadata and will sound like clicks if played
+                                # We must strip them from EVERY chunk, not just the first one
+                                if (
+                                    len(audio_chunk) > 44  # Ensure chunk is large enough
+                                    and audio_chunk.startswith(
+                                        b"RIFF"
+                                    )  # Check for WAV header magic bytes
+                                ):
+                                    # Remove the 44-byte WAV header to get pure audio data
+                                    audio_data = audio_chunk[44:]
+
+                                    # Track that we've seen our first chunk (for debugging)
+                                    if is_first_chunk:
+                                        is_first_chunk = False
+
+                                # ========================================================
+                                # STEP 11: YIELD AUDIO FRAME TO PIPELINE
+                                # ========================================================
+                                # Only yield frames with actual audio content
+                                # Empty frames can cause pipeline issues
+                                if len(audio_data) > 0:
+                                    # Create Pipecat audio frame with processed audio data
+                                    yield TTSAudioRawFrame(
+                                        audio=audio_data,  # Clean audio without headers
+                                        sample_rate=self.sample_rate,  # Configured sample rate (48kHz)
+                                        num_channels=1,  # Mono audio
+                                    )
+
+                        except json.JSONDecodeError:
+                            # Skip lines that are not valid JSON. Partial lines never reach
+                            # this point; they wait in `buffer` until their newline arrives.
+                            continue
+
+            # ================================================================================
+            # STEP 12: FINALIZE METRICS AND CLEANUP
+            # ================================================================================
+            # Start usage metrics tracking after successful completion
             await self.start_tts_usage_metrics(text)
 
         except Exception as e:
+            # ================================================================================
+            # STEP 13: ERROR HANDLING
+            # ================================================================================
+            # Log any unexpected errors and notify the pipeline
             logger.error(f"{self} exception: {e}")
             await self.push_error(ErrorFrame(f"Error generating TTS: {e}"))
         finally:
+            # ================================================================================
+            # STEP 14: CLEANUP AND COMPLETION
+            # ================================================================================
+            # Always stop metrics tracking, even if errors occurred
             await self.stop_ttfb_metrics()
+
+            # Signal to pipeline that TTS generation is complete
+            # This allows downstream processors to finalize audio processing
             yield TTSStoppedFrame()
-
-    # @traced_tts
-    # async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
-    #     """Generate speech from text using Inworld's HTTP API.
-
-    #     Args:
-    #         text: The text to synthesize into speech.
-
-    #     Yields:
-    #         Frame: Audio frames containing the synthesized speech.
-    #     """
-    #     logger.debug(f"{self}: Generating TTS [{text}]")
-
-    #     payload = {
-    #         "text": text,
-    #         "voiceId": self._settings["voiceId"],
-    #         "modelId": self._settings["modelId"],
-    #         "audio_config": self._settings["audio_config"],
-    #         "language": self._settings["language"],
-    #     }
-
-    #     headers = {
-    #         "Authorization": f"Basic {self._api_key}",
-    #         "Content-Type": "application/json",
-    #     }
-
-    #     try:
-    #         await self.start_ttfb_metrics()
-
-    #         yield TTSStartedFrame()
-
-    #         async with self._session.post(self._base_url, json=payload, headers=headers) as response:
-    #             if response.status != 200:
-    #                 error_text = await response.text()
-    #                 logger.error(f"Inworld API error: {error_text}")
-    #                 await self.push_error(ErrorFrame(f"Inworld API error: {error_text}"))
-    #                 return
-
-    #             raw_audio_data = io.BytesIO()
-
-    #             async for line in response.content.iter_lines():
-    #                 line_str = line.decode('utf-8').strip()
-    #                 if not line_str:
-    #                     continue
-
-    #                 try:
-    #                     chunk = json.loads(line_str)
-    #                     if "result" in chunk and "audioContent" in chunk["result"]:
-    #                         audio_chunk = base64.b64decode(chunk["result"]["audioContent"])
-    #                         # Skip WAV header if present (first 44 bytes)
-    #                         if len(audio_chunk) > 44 and audio_chunk.startswith(b"RIFF"):
-    #                             audio_data = audio_chunk[44:]
-    #                         else:
-    #                             audio_data = audio_chunk
-    #                         raw_audio_data.write(audio_data)
-    #                 except json.JSONDecodeError:
-    #                     continue
-
-    #         await self.start_tts_usage_metrics(text)
-
-    #         audio_bytes = raw_audio_data.getvalue()
-    #         if not audio_bytes:
-    #             logger.error("No audio data received from Inworld API")
-    #             await self.push_error(ErrorFrame("No audio data received"))
-    #             return
-
-    #         frame = TTSAudioRawFrame(
-    #             audio=audio_bytes,
-    #             sample_rate=self.sample_rate,
-    #             num_channels=1,
-    #         )
-
-    #         yield frame
-
-    #     except Exception as e:
-    #         logger.error(f"{self} exception: {e}")
-    #         await self.push_error(ErrorFrame(f"Error generating TTS: {e}"))
-    #     finally:
-    #         await self.stop_ttfb_metrics()
-    #         yield TTSStoppedFrame()

From 4250aa6616f7a471585202b512a41b0813e319cc Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Mon, 21 Jul 2025 10:11:50 -0700
Subject: [PATCH 11/38] inworld: removal of backup copy, no longer needed

---
 .../07aa-interruptible-inworld-http_copy.py   | 117 ------------------
 1 file changed, 117 deletions(-)
 delete mode 100644 examples/foundational/07aa-interruptible-inworld-http_copy.py

diff --git a/examples/foundational/07aa-interruptible-inworld-http_copy.py b/examples/foundational/07aa-interruptible-inworld-http_copy.py
deleted file mode 100644
index 0121865ab..000000000
--- a/examples/foundational/07aa-interruptible-inworld-http_copy.py
+++ /dev/null
@@ -1,117 +0,0 @@
-#
-# Copyright (c) 2024–2025, Daily
-#
-# SPDX-License-Identifier: BSD 2-Clause License
-#
-
-import argparse
-import os
-
-import aiohttp
-from dotenv import load_dotenv
-from loguru import logger
-
-from pipecat.audio.vad.silero import SileroVADAnalyzer
-from pipecat.pipeline.pipeline import Pipeline
-from pipecat.pipeline.runner import PipelineRunner
-from pipecat.pipeline.task import PipelineParams, PipelineTask
-from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
-from pipecat.services.deepgram.stt import DeepgramSTTService
-from pipecat.services.inworld.tts import InworldHttpTTSService
-from pipecat.services.openai.llm import OpenAILLMService
-from pipecat.transports.base_transport import BaseTransport, TransportParams
-from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
-from pipecat.transports.services.daily import DailyParams
-
-load_dotenv(override=True)
-
-
-# We store functions so objects (e.g. SileroVADAnalyzer) don't get
-# instantiated. The function will be called when the desired transport gets
-# selected.
-transport_params = {
-    "daily": lambda: DailyParams(
-        audio_in_enabled=True,
-        audio_out_enabled=True,
-        vad_analyzer=SileroVADAnalyzer(),
-    ),
-    "twilio": lambda: FastAPIWebsocketParams(
-        audio_in_enabled=True,
-        audio_out_enabled=True,
-        vad_analyzer=SileroVADAnalyzer(),
-    ),
-    "webrtc": lambda: TransportParams(
-        audio_in_enabled=True,
-        audio_out_enabled=True,
-        vad_analyzer=SileroVADAnalyzer(),
-    ),
-}
-
-
-async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool):
-    logger.info(f"Starting bot")
-
-    # Create an HTTP session
-    async with aiohttp.ClientSession() as session:
-        stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
-
-        tts = InworldHttpTTSService(
-            api_key=os.getenv("INWORLD_API_KEY", ""),
-            voice_id="Ashley",
-            model="inworld-tts-1",
-            aiohttp_session=session,
-        )
-
-        llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
-
-        messages = [
-            {
-                "role": "system",
-                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
-            },
-        ]
-
-        context = OpenAILLMContext(messages)
-        context_aggregator = llm.create_context_aggregator(context)
-
-        pipeline = Pipeline(
-            [
-                transport.input(),  # Transport user input
-                stt,
-                context_aggregator.user(),  # User responses
-                llm,  # LLM
-                tts,  # TTS
-                transport.output(),  # Transport bot output
-                context_aggregator.assistant(),  # Assistant spoken responses
-            ]
-        )
-
-        task = PipelineTask(
-            pipeline,
-            params=PipelineParams(
-                enable_metrics=True,
-                enable_usage_metrics=True,
-            ),
-        )
-
-        @transport.event_handler("on_client_connected")
-        async def on_client_connected(transport, client):
-            logger.info(f"Client connected")
-            # Kick off the conversation.
-            messages.append({"role": "system", "content": "Please introduce yourself to the user."})
-            await task.queue_frames([context_aggregator.user().get_context_frame()])
-
-        @transport.event_handler("on_client_disconnected")
-        async def on_client_disconnected(transport, client):
-            logger.info(f"Client disconnected")
-            await task.cancel()
-
-        runner = PipelineRunner(handle_sigint=handle_sigint)
-
-        await runner.run(task)
-
-
-if __name__ == "__main__":
-    from pipecat.examples.run import main
-
-    main(run_example, transport_params=transport_params)

From aadd088b5077b014d78f8e8e0930e9e6785e42b2 Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Mon, 21 Jul 2025 10:52:55 -0700
Subject: [PATCH 12/38] inworld: commented out contents as per Pipecat guidance
 that this pattern is being retired

---
 src/pipecat/services/inworld/__init__.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/pipecat/services/inworld/__init__.py b/src/pipecat/services/inworld/__init__.py
index 9717eb163..910364d1b 100644
--- a/src/pipecat/services/inworld/__init__.py
+++ b/src/pipecat/services/inworld/__init__.py
@@ -4,10 +4,10 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
-import sys
+# import sys
 
-from pipecat.services import DeprecatedModuleProxy
+# from pipecat.services import DeprecatedModuleProxy
 
-from .tts import *
+# from .tts import *
 
-sys.modules[__name__] = DeprecatedModuleProxy(globals(), "inworld", "inworld.tts")
+# sys.modules[__name__] = DeprecatedModuleProxy(globals(), "inworld", "inworld.tts")

From 54ff946976776be1d95d5a7ab0a9c5e9fb8af8f7 Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Mon, 21 Jul 2025 12:07:58 -0700
Subject: [PATCH 13/38] inworld: largely adjustments for docstring
 compatibility

---
 src/pipecat/services/inworld/tts.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
index 9d328fa5a..51245ea09 100644
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -10,6 +10,7 @@ This module provides integration with Inworld AI's HTTP-based TTS API, enabling
 real-time text-to-speech synthesis with high-quality, natural-sounding voices.
 
 Key Features:
+
 - HTTP streaming API support for low-latency audio generation
 - Multiple voice options (Ashley, Hades, etc.)
 - Real-time audio chunk processing with proper buffering
@@ -17,13 +18,14 @@ Key Features:
 - Comprehensive error handling and metrics tracking
 
 Technical Implementation:
+
 - Uses aiohttp for HTTP streaming connections
 - Implements JSON line-by-line parsing for streaming responses
 - Handles base64-encoded audio data with proper decoding
 - Manages audio continuity to prevent clicks and artifacts
 - Integrates with Pipecat's frame-based pipeline architecture
 
-Usage:
+Usage::
     tts = InworldHttpTTSService(
         api_key=os.getenv("INWORLD_API_KEY"),
         voice_id="Ashley",
@@ -71,6 +73,7 @@ def language_to_inworld_language(language: Language) -> Optional[str]:
     corresponding language codes expected by Inworld's API.
 
     Supported Languages:
+
     - EN (English) -> "en"
     - ES (Spanish) -> "es"
     - FR (French) -> "fr"
@@ -126,6 +129,7 @@ class InworldHttpTTSService(TTSService):
     and low-latency streaming audio delivery.
 
     Key Features:
+
     - Real-time HTTP streaming for minimal latency
     - Multiple voice options (Ashley, Hades, etc.)
     - High-quality audio output (48kHz LINEAR16 PCM)
@@ -134,6 +138,7 @@ class InworldHttpTTSService(TTSService):
     - Built-in performance metrics and monitoring
 
     Technical Architecture:
+
     - Uses aiohttp for non-blocking HTTP requests
     - Implements JSON line-by-line streaming protocol
     - Processes base64-encoded audio chunks in real-time
@@ -141,16 +146,17 @@ class InworldHttpTTSService(TTSService):
     - Integrates with Pipecat's frame-based pipeline system
 
     Supported Configuration:
+
     - Voice Selection: Ashley, Hades, and other Inworld voices
     - Models: inworld-tts-1 and other available models
     - Audio Formats: LINEAR16 PCM at various sample rates
     - Languages: English, Spanish, French, Korean, Dutch, Chinese
 
-    Example Usage:
+    Example Usage::
         async with aiohttp.ClientSession() as session:
             tts = InworldHttpTTSService(
                 api_key=os.getenv("INWORLD_API_KEY"),
-                voice_id="Ashley",                    # Voice selection
+                voice_id="Ashley",                   # Voice selection
                 model="inworld-tts-1",               # TTS model
                 aiohttp_session=session,             # Required HTTP session
                 sample_rate=48000,                   # Audio quality
@@ -162,16 +168,9 @@ class InworldHttpTTSService(TTSService):
 
         Parameters:
             language: Language to use for synthesis.
-            speed: Voice speed control (string or float).
-            emotion: List of emotion controls.
-
-                .. deprecated:: 0.0.68
-                        The `emotion` parameter is deprecated and will be removed in a future version.
         """
 
         language: Optional[Language] = Language.EN
-        voice_id: str = "Hades"  ## QUESTION: How to make this modifyable/how to modify?
-        # QUESTION: What about speed, pitch, and temperature??
 
     def __init__(
         self,
@@ -179,6 +178,7 @@ class InworldHttpTTSService(TTSService):
         api_key: str,
         aiohttp_session: aiohttp.ClientSession,
         voice_id: str = "Ashley",
+        # language: Optional[Language] = Language.EN,
         model: str = "inworld-tts-1",
         base_url: str = "https://api.inworld.ai/tts/v1/voice:stream",
         sample_rate: Optional[int] = 48000,
@@ -305,6 +305,7 @@ class InworldHttpTTSService(TTSService):
         4. Yields audio frames for immediate playback in the pipeline
 
         Technical Details:
+
         - Uses HTTP streaming with JSON line-by-line responses
         - Each JSON line contains base64-encoded audio data
         - Implements buffering to handle partial JSON lines
@@ -334,7 +335,7 @@ class InworldHttpTTSService(TTSService):
             "audio_config": self._settings[
                 "audio_config"
             ],  # Audio format settings (LINEAR16, 48kHz)
-            "language": self._settings["language"],  # Language code (en, es, etc.)
+            # "language": self._settings["language"],  # Language code (en, es, etc.)
         }
 
         # Set up HTTP headers for authentication and content type

From 8eda2435a2b523af70d95d5c6647baed1db7c7b1 Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Mon, 21 Jul 2025 13:24:10 -0700
Subject: [PATCH 14/38] inworld: removed explicit references to language since
 our models currently infer that from the text.

---
 src/pipecat/services/inworld/tts.py | 147 +++++++++-------------------
 1 file changed, 45 insertions(+), 102 deletions(-)

diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
index 51245ea09..e585943d8 100644
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -13,6 +13,7 @@ Key Features:
 
 - HTTP streaming API support for low-latency audio generation
 - Multiple voice options (Ashley, Hades, etc.)
+- Automatic language detection from input text (no manual language setting required)
 - Real-time audio chunk processing with proper buffering
 - WAV header handling and audio format conversion
 - Comprehensive error handling and metrics tracking
@@ -26,12 +27,16 @@ Technical Implementation:
 - Integrates with Pipecat's frame-based pipeline architecture
 
 Usage::
-    tts = InworldHttpTTSService(
-        api_key=os.getenv("INWORLD_API_KEY"),
-        voice_id="Ashley",
-        model="inworld-tts-1",
-        aiohttp_session=session
-    )
+
+    async with aiohttp.ClientSession() as session:
+        tts = InworldHttpTTSService(
+            api_key=os.getenv("INWORLD_API_KEY"),
+            aiohttp_session=session,
+            params=InworldHttpTTSService.InputParams(
+                voice_id="Ashley",
+                model="inworld-tts-1",
+            ),
+        )
 """
 
 import base64
@@ -58,69 +63,12 @@ from pipecat.frames.frames import (
 )
 from pipecat.processors.frame_processor import FrameDirection
 from pipecat.services.tts_service import AudioContextWordTTSService, TTSService
-from pipecat.transcriptions.language import Language
 from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator
 from pipecat.utils.text.base_text_aggregator import BaseTextAggregator
 from pipecat.utils.text.skip_tags_aggregator import SkipTagsAggregator
 from pipecat.utils.tracing.service_decorators import traced_tts
 
 
-def language_to_inworld_language(language: Language) -> Optional[str]:
-    """Convert Pipecat's Language enum to Inworld's language code.
-
-    Inworld AI supports a specific set of language codes for TTS synthesis.
-    This function maps Pipecat's standardized Language enum values to the
-    corresponding language codes expected by Inworld's API.
-
-    Supported Languages:
-
-    - EN (English) -> "en"
-    - ES (Spanish) -> "es"
-    - FR (French) -> "fr"
-    - KO (Korean) -> "ko"
-    - NL (Dutch) -> "nl"
-    - ZH (Chinese) -> "zh"
-
-    The function also handles language variants (e.g., es-ES, en-US) by
-    extracting the base language code and mapping it if supported.
-
-    Args:
-        language: The Language enum value to convert (e.g., Language.EN).
-
-    Returns:
-        The corresponding Inworld language code string (e.g., "en"),
-        or None if the language is not supported by Inworld's API.
-
-    Example:
-        >>> language_to_inworld_language(Language.EN)
-        "en"
-        >>> language_to_inworld_language(Language.ES)
-        "es"
-        >>> language_to_inworld_language(Language.DE)  # Not supported
-        None
-    """
-    BASE_LANGUAGES = {
-        Language.EN: "en",
-        Language.ES: "es",
-        Language.FR: "fr",
-        Language.KO: "ko",
-        Language.NL: "nl",
-        Language.ZH: "zh",
-    }
-
-    result = BASE_LANGUAGES.get(language)
-
-    # If not found in base languages, try to find the base language from a variant
-    if not result:
-        # Convert enum value to string and get the base language part (e.g. es-ES -> es)
-        lang_str = str(language.value)
-        base_code = lang_str.split("-")[0].lower()
-        # Look up the base code in our supported languages
-        result = base_code if base_code in BASE_LANGUAGES.values() else None
-
-    return result
-
-
 class InworldHttpTTSService(TTSService):
     """Inworld AI HTTP-based Text-to-Speech Service.
 
@@ -150,16 +98,26 @@ class InworldHttpTTSService(TTSService):
     - Voice Selection: Ashley, Hades, and other Inworld voices
     - Models: inworld-tts-1 and other available models
     - Audio Formats: LINEAR16 PCM at various sample rates
-    - Languages: English, Spanish, French, Korean, Dutch, Chinese
+    - Language Detection: Automatically inferred from input text (no explicit language setting required)
 
     Example Usage::
+
         async with aiohttp.ClientSession() as session:
+            # Using default settings (Ashley voice, inworld-tts-1 model)
             tts = InworldHttpTTSService(
                 api_key=os.getenv("INWORLD_API_KEY"),
-                voice_id="Ashley",                   # Voice selection
-                model="inworld-tts-1",               # TTS model
-                aiohttp_session=session,             # Required HTTP session
-                sample_rate=48000,                   # Audio quality
+                aiohttp_session=session,
+            )
+
+            # Or with custom voice and model via params
+            params = InworldHttpTTSService.InputParams(
+                voice_id="Hades",
+                model="inworld-tts-1-max",
+            )
+            tts = InworldHttpTTSService(
+                api_key=os.getenv("INWORLD_API_KEY"),
+                aiohttp_session=session,
+                params=params,
             )
     """
 
@@ -167,19 +125,22 @@ class InworldHttpTTSService(TTSService):
         """Input parameters for Inworld HTTP TTS configuration.
 
         Parameters:
-            language: Language to use for synthesis.
+            voice_id: Voice selection for speech synthesis (e.g., "Ashley", "Hades").
+            model: TTS model to use (e.g., "inworld-tts-1", "inworld-tts-1-max").
+
+        Note:
+            Language is automatically inferred from the input text by Inworld's TTS models,
+            so no explicit language parameter is required.
         """
 
-        language: Optional[Language] = Language.EN
+        voice_id: Optional[str] = "Ashley"  # defaults to the Ashley voice
+        model: Optional[str] = "inworld-tts-1"  # defaults to the inworld-tts-1 model
 
     def __init__(
         self,
         *,
         api_key: str,
         aiohttp_session: aiohttp.ClientSession,
-        voice_id: str = "Ashley",
-        # language: Optional[Language] = Language.EN,
-        model: str = "inworld-tts-1",
         base_url: str = "https://api.inworld.ai/tts/v1/voice:stream",
         sample_rate: Optional[int] = 48000,
         encoding: str = "LINEAR16",
@@ -196,13 +157,6 @@ class InworldHttpTTSService(TTSService):
                     Get this from: Inworld Portal > Settings > API Keys > Runtime API Key
             aiohttp_session: Shared aiohttp session for HTTP requests. Must be provided
                            for proper connection pooling and resource management.
-            voice_id: Voice to use for synthesis. Available options include:
-                     - "Ashley" (default) - Natural female voice
-                     - "Hades" - Distinctive character voice
-                     - Other voices available through Inworld's voice catalog
-            model: TTS model to use. Currently supported:
-                  - "inworld-tts-1" (default) - Latest high-quality model
-                  - Other models as available in Inworld's API
             base_url: Base URL for Inworld HTTP API. Uses streaming endpoint by default.
                      Should normally not be changed unless using a different environment.
             sample_rate: Audio sample rate in Hz. Common values:
@@ -212,8 +166,11 @@ class InworldHttpTTSService(TTSService):
             encoding: Audio encoding format. Supported options:
                      - "LINEAR16" (default) - Uncompressed PCM, best quality
                      - Other formats as supported by Inworld API
-            params: Additional input parameters for advanced voice customization.
-                   Usually None for standard usage.
+            params: Input parameters for voice and model configuration. Use this to specify:
+                   - voice_id: Voice selection ("Ashley", "Hades", etc.)
+                   - model: TTS model ("inworld-tts-1", "inworld-tts-1-max", etc.)
+                   If None, uses default values (Ashley voice, inworld-tts-1 model).
+                   Note: Language is automatically inferred from input text.
             **kwargs: Additional arguments passed to the parent TTSService class.
 
         Note:
@@ -233,22 +190,19 @@ class InworldHttpTTSService(TTSService):
 
         # Build settings dictionary that matches Inworld's API expectations
         # This will be sent as JSON payload in each TTS request
+        # Note: Language is automatically inferred from text by Inworld's models
         self._settings = {
-            "voiceId": voice_id,  # Voice selection (fixes bug where this was ignored)
-            "modelId": model,  # TTS model selection
+            "voiceId": params.voice_id or "Ashley",  # Voice selection from params
+            "modelId": params.model or "inworld-tts-1",  # TTS model selection from params
             "audio_config": {  # Audio format configuration
                 "audio_encoding": encoding,  # Format: LINEAR16, MP3, etc.
                 "sample_rate_hertz": sample_rate,  # Sample rate: 48000, 24000, etc.
             },
-            # Language configuration with fallback to English
-            "language": self.language_to_service_language(params.language)
-            if params.language
-            else "en",
         }
 
         # Register voice and model with parent service for metrics and tracking
-        self.set_voice(voice_id)  # Used for logging and metrics
-        self.set_model_name(model)  # Used for performance tracking
+        self.set_voice(params.voice_id or "Ashley")  # Used for logging and metrics
+        self.set_model_name(params.model or "inworld-tts-1")  # Used for performance tracking
 
     def can_generate_metrics(self) -> bool:
         """Check if this service can generate processing metrics.
@@ -258,17 +212,6 @@ class InworldHttpTTSService(TTSService):
         """
         return True
 
-    def language_to_service_language(self, language: Language) -> Optional[str]:
-        """Convert a Language enum to Inworld language format.
-
-        Args:
-            language: The language to convert.
-
-        Returns:
-            The Inworld-specific language code, or None if not supported.
-        """
-        return language_to_inworld_language(language)
-
     async def start(self, frame: StartFrame):
         """Start the Inworld HTTP TTS service.
 
@@ -328,6 +271,7 @@ class InworldHttpTTSService(TTSService):
         # ================================================================================
         # Build the JSON payload according to Inworld's API specification
         # This matches the format shown in their documentation examples
+        # Note: Language is automatically inferred from the input text by Inworld's models
         payload = {
             "text": text,  # Text to synthesize
             "voiceId": self._settings["voiceId"],  # Voice selection (Ashley, Hades, etc.)
@@ -335,7 +279,6 @@ class InworldHttpTTSService(TTSService):
             "audio_config": self._settings[
                 "audio_config"
             ],  # Audio format settings (LINEAR16, 48kHz)
-            # "language": self._settings["language"],  # Language code (en, es, etc.)
         }
 
         # Set up HTTP headers for authentication and content type

From 4853d5d55cb0df19ba57ceecdd37f5e42b8acf04 Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Mon, 21 Jul 2025 13:27:25 -0700
Subject: [PATCH 15/38] inworld: updated InworldHttpTTSService initialization

---
 examples/foundational/07aa-interruptible-inworld-http.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/foundational/07aa-interruptible-inworld-http.py b/examples/foundational/07aa-interruptible-inworld-http.py
index 03622a2d4..c1c509ade 100644
--- a/examples/foundational/07aa-interruptible-inworld-http.py
+++ b/examples/foundational/07aa-interruptible-inworld-http.py
@@ -60,9 +60,11 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si
 
         tts = InworldHttpTTSService(
             api_key=os.getenv("INWORLD_API_KEY", ""),
-            voice_id="Ashley",
-            model="inworld-tts-1",
             aiohttp_session=session,
+            params=InworldHttpTTSService.InputParams(
+                voice_id="Ashley",
+                model="inworld-tts-1-max",
+            ),
         )
 
         llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))

From 0d5292c4efe43571aceefebab00565eaa40db237 Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Mon, 21 Jul 2025 13:48:13 -0700
Subject: [PATCH 16/38] inworld: typo fix in model name

---
 examples/foundational/07aa-interruptible-inworld-http.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/foundational/07aa-interruptible-inworld-http.py b/examples/foundational/07aa-interruptible-inworld-http.py
index c1c509ade..887334b7d 100644
--- a/examples/foundational/07aa-interruptible-inworld-http.py
+++ b/examples/foundational/07aa-interruptible-inworld-http.py
@@ -63,7 +63,7 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si
             aiohttp_session=session,
             params=InworldHttpTTSService.InputParams(
                 voice_id="Ashley",
-                model="inworld-tts-1-max",
+                model="inworld-tts-1",
             ),
         )
 

From 076a675a757eadb1ace6d265e9cb5f6093a62e7b Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Mon, 21 Jul 2025 13:50:36 -0700
Subject: [PATCH 17/38] inworld: fix: set sample_rate=None in
 InworldHttpTTSService to match Cartesia pattern

---
 src/pipecat/services/inworld/tts.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
index e585943d8..6e1552744 100644
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -142,7 +142,7 @@ class InworldHttpTTSService(TTSService):
         api_key: str,
         aiohttp_session: aiohttp.ClientSession,
         base_url: str = "https://api.inworld.ai/tts/v1/voice:stream",
-        sample_rate: Optional[int] = 48000,
+        sample_rate: Optional[int] = None,
         encoding: str = "LINEAR16",
         params: Optional[InputParams] = None,
         **kwargs,
@@ -159,10 +159,8 @@ class InworldHttpTTSService(TTSService):
                            for proper connection pooling and resource management.
             base_url: Base URL for Inworld HTTP API. Uses streaming endpoint by default.
                      Should normally not be changed unless using a different environment.
-            sample_rate: Audio sample rate in Hz. Common values:
-                        - 48000 (default) - High quality, suitable for most applications
-                        - 24000 - Good quality, lower bandwidth
-                        - 16000 - Basic quality, minimal bandwidth
+            sample_rate: Audio sample rate in Hz. If None, uses default from StartFrame.
+                        Common values: 48000 (high quality), 24000 (good quality), 16000 (basic)
             encoding: Audio encoding format. Supported options:
                      - "LINEAR16" (default) - Uncompressed PCM, best quality
                      - Other formats as supported by Inworld API
@@ -196,7 +194,7 @@ class InworldHttpTTSService(TTSService):
             "modelId": params.model or "inworld-tts-1",  # TTS model selection from params
             "audio_config": {  # Audio format configuration
                 "audio_encoding": encoding,  # Format: LINEAR16, MP3, etc.
-                "sample_rate_hertz": sample_rate,  # Sample rate: 48000, 24000, etc.
+                "sample_rate_hertz": 0,  # Will be set in start() from parent service
             },
         }
 

From 1915407ff7ab215abe14180dbf856d8e1190d1e9 Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Mon, 21 Jul 2025 15:30:48 -0700
Subject: [PATCH 18/38] inworld: removed unreferenced is_first_chunk variable

---
 src/pipecat/services/inworld/tts.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
index 6e1552744..af90ad116 100644
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -297,10 +297,6 @@ class InworldHttpTTSService(TTSService):
             # This allows downstream processors to prepare for incoming audio
             yield TTSStartedFrame()
 
-            # Flag to track if we're processing the first audio chunk
-            # Used for WAV header handling and debugging
-            is_first_chunk = True
-
             # ================================================================================
             # STEP 3: MAKE HTTP STREAMING REQUEST
             # ================================================================================
@@ -395,10 +391,6 @@ class InworldHttpTTSService(TTSService):
                                     # Remove the 44-byte WAV header to get pure audio data
                                     audio_data = audio_chunk[44:]
 
-                                    # Track that we've seen our first chunk (for debugging)
-                                    if is_first_chunk:
-                                        is_first_chunk = False
-
                                 # ========================================================
                                 # STEP 11: YIELD AUDIO FRAME TO PIPELINE
                                 # ========================================================

From f29024bcc01899d067c13e5edfe89a4977727571 Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Wed, 23 Jul 2025 11:47:26 -0700
Subject: [PATCH 19/38] mtpadilla: update comments regarding temperature
 parameter

---
 src/pipecat/services/inworld/tts.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
index af90ad116..b1a020b8b 100644
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -35,6 +35,7 @@ Usage::
             params=InworldHttpTTSService.InputParams(
                 voice_id="Ashley",
                 model="inworld-tts-1",
+                temperature=0.8,  # Optional: control synthesis variability (range: [0, 2])
             ),
         )
 """
@@ -109,10 +110,11 @@ class InworldHttpTTSService(TTSService):
                 aiohttp_session=session,
             )
 
-            # Or with custom voice and model via params
+            # Or with custom voice, model, and temperature via params
             params = InworldHttpTTSService.InputParams(
                 voice_id="Hades",
                 model="inworld-tts-1-max",
+                temperature=0.8,  # Add variability to speech synthesis (range: [0, 2])
             )
             tts = InworldHttpTTSService(
                 api_key=os.getenv("INWORLD_API_KEY"),
@@ -124,9 +126,11 @@ class InworldHttpTTSService(TTSService):
     class InputParams(BaseModel):
         """Input parameters for Inworld HTTP TTS configuration.
 
-        Parameters:
+                Parameters:
             voice_id: Voice selection for speech synthesis (e.g., "Ashley", "Hades").
             model: TTS model to use (e.g., "inworld-tts-1", "inworld-tts-1-max").
+            temperature: Voice temperature control for synthesis variability (e.g., 0.8).
+                        Valid range: [0, 2]. Higher values increase variability.
 
         Note:
             Language is automatically inferred from the input text by Inworld's TTS models,
@@ -135,6 +139,7 @@ class InworldHttpTTSService(TTSService):
 
         voice_id: Optional[str] = "Ashley"  # defaults to the Ashley voice
         model: Optional[str] = "inworld-tts-1"  # defaults to the inworld-tts-1 model
+        temperature: Optional[float] = None  # optional temperature control (range: [0, 2])
 
     def __init__(
         self,
@@ -167,6 +172,7 @@ class InworldHttpTTSService(TTSService):
             params: Input parameters for voice and model configuration. Use this to specify:
                    - voice_id: Voice selection ("Ashley", "Hades", etc.)
                    - model: TTS model ("inworld-tts-1", "inworld-tts-1-max", etc.)
+                   - temperature: Voice temperature control for variability (range: [0, 2], e.g., 0.8, optional)
                    If None, uses default values (Ashley voice, inworld-tts-1 model).
                    Note: Language is automatically inferred from input text.
             **kwargs: Additional arguments passed to the parent TTSService class.
@@ -198,6 +204,10 @@ class InworldHttpTTSService(TTSService):
             },
         }
 
+        # Add optional temperature parameter if provided (valid range: [0, 2])
+        if params.temperature is not None:
+            self._settings["temperature"] = params.temperature
+
         # Register voice and model with parent service for metrics and tracking
         self.set_voice(params.voice_id or "Ashley")  # Used for logging and metrics
         self.set_model_name(params.model or "inworld-tts-1")  # Used for performance tracking
@@ -279,6 +289,10 @@ class InworldHttpTTSService(TTSService):
             ],  # Audio format settings (LINEAR16, 48kHz)
         }
 
+        # Add optional temperature parameter if configured (valid range: [0, 2])
+        if "temperature" in self._settings:
+            payload["temperature"] = self._settings["temperature"]
+
         # Set up HTTP headers for authentication and content type
         # Inworld requires Basic auth with base64-encoded API key
         headers = {

From a5d353030ec83a79e40c647af7bcb4f46863e57b Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Wed, 23 Jul 2025 12:02:58 -0700
Subject: [PATCH 20/38] mtpadilla: fix docstring indentation in InputParams

---
 src/pipecat/services/inworld/tts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
index b1a020b8b..1fd8e9a55 100644
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -126,7 +126,7 @@ class InworldHttpTTSService(TTSService):
     class InputParams(BaseModel):
         """Input parameters for Inworld HTTP TTS configuration.
 
-                Parameters:
+        Parameters:
             voice_id: Voice selection for speech synthesis (e.g., "Ashley", "Hades").
             model: TTS model to use (e.g., "inworld-tts-1", "inworld-tts-1-max").
             temperature: Voice temperature control for synthesis variability (e.g., 0.8).

From 147bf9cfe852e644ca35dacf02032db99348a4d9 Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Wed, 23 Jul 2025 15:28:43 -0700
Subject: [PATCH 21/38] mtpadilla: add non-streaming option as its own
 dedicated class, plus a streaming/non-streaming toggle in the example

---
 .../07aa-interruptible-inworld-http.py        |  34 +-
 src/pipecat/services/inworld/tts.py           | 375 +++++++++++++++++-
 2 files changed, 393 insertions(+), 16 deletions(-)

diff --git a/examples/foundational/07aa-interruptible-inworld-http.py b/examples/foundational/07aa-interruptible-inworld-http.py
index 887334b7d..65d7babca 100644
--- a/examples/foundational/07aa-interruptible-inworld-http.py
+++ b/examples/foundational/07aa-interruptible-inworld-http.py
@@ -16,7 +16,7 @@ from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
-from pipecat.services.inworld.tts import InworldHttpTTSService
+from pipecat.services.inworld.tts import InworldHttpNonStreamingService, InworldHttpStreamingService
 from pipecat.services.openai.llm import OpenAILLMService
 from pipecat.services.openai.stt import OpenAISTTService
 from pipecat.transports.base_transport import BaseTransport, TransportParams
@@ -58,14 +58,30 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si
             prompt="Expect words related to dogs, such as breed names.",
         )
 
-        tts = InworldHttpTTSService(
-            api_key=os.getenv("INWORLD_API_KEY", ""),
-            aiohttp_session=session,
-            params=InworldHttpTTSService.InputParams(
-                voice_id="Ashley",
-                model="inworld-tts-1",
-            ),
-        )
+        streaming = True
+
+        if streaming:
+            # Streaming TTS - Real-time audio generation as text is processed
+            tts = InworldHttpStreamingService(
+                api_key=os.getenv("INWORLD_API_KEY", ""),
+                aiohttp_session=session,
+                params=InworldHttpStreamingService.InputParams(
+                    voice_id="Ashley",
+                    model="inworld-tts-1",
+                    temperature=0.8,
+                ),
+            )
+        else:
+            # Non-streaming TTS - Complete audio generation then playback
+            tts = InworldHttpNonStreamingService(
+                api_key=os.getenv("INWORLD_API_KEY", ""),
+                aiohttp_session=session,
+                params=InworldHttpNonStreamingService.InputParams(
+                    voice_id="Ashley",
+                    model="inworld-tts-1",
+                    temperature=0.8,
+                ),
+            )
 
         llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
 
diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
index 1fd8e9a55..3a70b7499 100644
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -29,10 +29,10 @@ Technical Implementation:
 Usage::
 
     async with aiohttp.ClientSession() as session:
-        tts = InworldHttpTTSService(
+        tts = InworldHttpStreamingService(
             api_key=os.getenv("INWORLD_API_KEY"),
             aiohttp_session=session,
-            params=InworldHttpTTSService.InputParams(
+            params=InworldHttpStreamingService.InputParams(
                 voice_id="Ashley",
                 model="inworld-tts-1",
                 temperature=0.8,  # Optional: control synthesis variability (range: [0, 2])
@@ -70,7 +70,7 @@ from pipecat.utils.text.skip_tags_aggregator import SkipTagsAggregator
 from pipecat.utils.tracing.service_decorators import traced_tts
 
 
-class InworldHttpTTSService(TTSService):
+class InworldHttpStreamingService(TTSService):
     """Inworld AI HTTP-based Text-to-Speech Service.
 
     This service integrates Inworld AI's high-quality TTS API with Pipecat's pipeline
@@ -105,18 +105,18 @@ class InworldHttpTTSService(TTSService):
 
         async with aiohttp.ClientSession() as session:
             # Using default settings (Ashley voice, inworld-tts-1 model)
-            tts = InworldHttpTTSService(
+            tts = InworldHttpStreamingService(
                 api_key=os.getenv("INWORLD_API_KEY"),
                 aiohttp_session=session,
             )
 
             # Or with custom voice, model, and temperature via params
-            params = InworldHttpTTSService.InputParams(
+            params = InworldHttpStreamingService.InputParams(
                 voice_id="Hades",
                 model="inworld-tts-1-max",
                 temperature=0.8,  # Add variability to speech synthesis (range: [0, 2])
             )
-            tts = InworldHttpTTSService(
+            tts = InworldHttpStreamingService(
                 api_key=os.getenv("INWORLD_API_KEY"),
                 aiohttp_session=session,
                 params=params,
@@ -185,7 +185,7 @@ class InworldHttpTTSService(TTSService):
         super().__init__(sample_rate=sample_rate, **kwargs)
 
         # Use provided params or create default configuration
-        params = params or InworldHttpTTSService.InputParams()
+        params = params or InworldHttpStreamingService.InputParams()
 
         # Store core configuration for API requests
         self._api_key = api_key  # Authentication credentials
@@ -446,3 +446,364 @@ class InworldHttpTTSService(TTSService):
             # Signal to pipeline that TTS generation is complete
             # This allows downstream processors to finalize audio processing
             yield TTSStoppedFrame()
+
+
+class InworldHttpNonStreamingService(TTSService):
+    """Inworld AI HTTP-based Text-to-Speech Service (Non-Streaming).
+
+    This service integrates with Inworld AI's non-streaming TTS API for simpler,
+    complete audio synthesis. Suitable for use cases where streaming is not required
+    and you prefer to receive the complete audio file at once.
+
+    Key Features:
+
+    - Simple HTTP request/response for complete audio synthesis
+    - Same voice options as streaming version (Ashley, Hades, etc.)
+    - High-quality audio output (48kHz LINEAR16 PCM)
+    - Automatic language detection from input text
+    - Support for temperature parameter for synthesis variability
+    - Lower complexity compared to streaming implementation
+
+    Technical Architecture:
+
+    - Uses aiohttp for single HTTP POST request
+    - Downloads complete audio as base64-encoded data
+    - Processes entire audio file and chunks for playback
+    - Integrates with Pipecat's frame-based pipeline system
+
+    Usage::
+
+        async with aiohttp.ClientSession() as session:
+            # Using default settings (Ashley voice, inworld-tts-1 model)
+            tts = InworldHttpNonStreamingService(
+                api_key=os.getenv("INWORLD_API_KEY"),
+                aiohttp_session=session,
+            )
+
+            # Or with custom voice, model, and temperature
+            params = InworldHttpNonStreamingService.InputParams(
+                voice_id="Hades",
+                model="inworld-tts-1-max",
+                temperature=0.8,  # Control synthesis variability (range: [0, 2])
+            )
+            tts = InworldHttpNonStreamingService(
+                api_key=os.getenv("INWORLD_API_KEY"),
+                aiohttp_session=session,
+                params=params,
+            )
+    """
+
+    class InputParams(BaseModel):
+        """Input parameters for Inworld non-streaming TTS configuration.
+
+        Parameters:
+            voice_id: Voice selection for speech synthesis (e.g., "Ashley", "Hades").
+            model: TTS model to use (e.g., "inworld-tts-1", "inworld-tts-1-max").
+            temperature: Voice temperature control for synthesis variability (e.g., 0.8).
+                        Valid range: [0, 2]. Higher values increase variability.
+
+        Note:
+            Language is automatically inferred from the input text by Inworld's TTS models,
+            so no explicit language parameter is required.
+        """
+
+        voice_id: Optional[str] = "Ashley"  # defaults to the Ashley voice
+        model: Optional[str] = "inworld-tts-1"  # defaults to the inworld-tts-1 model
+        temperature: Optional[float] = None  # optional temperature control (range: [0, 2])
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        aiohttp_session: Optional[aiohttp.ClientSession] = None,
+        base_url: str = "https://api.inworld.ai/tts/v1/voice",  # Non-streaming endpoint
+        sample_rate: Optional[int] = None,
+        encoding: str = "LINEAR16",
+        params: Optional[InputParams] = None,
+        **kwargs,
+    ):
+        """Initialize the Inworld non-streaming TTS service.
+
+        Sets up the TTS service with Inworld AI's non-streaming API configuration.
+        This constructor prepares all necessary parameters for complete audio synthesis.
+
+        Args:
+            api_key: Inworld API key for authentication (base64-encoded from Inworld Portal).
+                    Get this from: Inworld Portal > Settings > API Keys > Runtime API Key
+            aiohttp_session: Shared aiohttp session for HTTP requests. Must be provided
+                           for proper connection pooling and resource management.
+            base_url: Base URL for Inworld non-streaming HTTP API. Uses non-streaming endpoint by default.
+                     Should normally not be changed unless using a different environment.
+            sample_rate: Audio sample rate in Hz. If None, uses default from StartFrame.
+                        Common values: 48000 (high quality), 24000 (good quality), 16000 (basic)
+            encoding: Audio encoding format. Supported options:
+                     - "LINEAR16" (default) - Uncompressed PCM, best quality
+                     - Other formats as supported by Inworld API
+            params: Input parameters for voice and model configuration. Use this to specify:
+                   - voice_id: Voice selection ("Ashley", "Hades", etc.)
+                   - model: TTS model ("inworld-tts-1", "inworld-tts-1-max", etc.)
+                   - temperature: Voice temperature control for variability (range: [0, 2], e.g., 0.8, optional)
+                   If None, uses default values (Ashley voice, inworld-tts-1 model).
+                   Note: Language is automatically inferred from input text.
+            **kwargs: Additional arguments passed to the parent TTSService class.
+
+        Note:
+            The aiohttp_session parameter is required because Inworld's HTTP API
+            benefits from connection reuse and proper async session management.
+        """
+        # Initialize parent TTSService with audio configuration
+        super().__init__(sample_rate=sample_rate, **kwargs)
+
+        # Use provided params or create default configuration
+        params = params or InworldHttpNonStreamingService.InputParams()
+
+        # Store core configuration for API requests
+        self._api_key = api_key  # Authentication credentials
+        self._session = aiohttp_session  # HTTP session for requests (optional)
+        self._base_url = base_url  # API endpoint URL
+
+        # Build settings dictionary that matches Inworld's API expectations
+        # This will be sent as JSON payload in the TTS request
+        # Note: Language is automatically inferred from text by Inworld's models
+        self._settings = {
+            "voiceId": params.voice_id or "Ashley",  # Voice selection from params
+            "modelId": params.model or "inworld-tts-1",  # TTS model selection from params
+            "audio_config": {  # Audio format configuration
+                "audio_encoding": encoding,  # Format: LINEAR16, MP3, etc.
+                "sample_rate_hertz": 0,  # Will be set in start() from parent service
+            },
+        }
+
+        # Add optional temperature parameter if provided (valid range: [0, 2])
+        if params.temperature is not None:
+            self._settings["temperature"] = params.temperature
+
+        # Register voice and model with parent service for metrics and tracking
+        self.set_voice(params.voice_id or "Ashley")  # Used for logging and metrics
+        self.set_model_name(params.model or "inworld-tts-1")  # Used for performance tracking
+
+    def can_generate_metrics(self) -> bool:
+        """Check if this service can generate processing metrics.
+
+        Returns:
+            True, as Inworld non-streaming service supports metrics generation.
+        """
+        return True
+
+    async def start(self, frame: StartFrame):
+        """Start the Inworld non-streaming TTS service.
+
+        Args:
+            frame: The start frame containing initialization parameters.
+        """
+        await super().start(frame)
+        self._settings["audio_config"]["sample_rate_hertz"] = self.sample_rate
+
+    async def stop(self, frame: EndFrame):
+        """Stop the Inworld non-streaming TTS service.
+
+        Args:
+            frame: The end frame.
+        """
+        await super().stop(frame)
+
+    async def cancel(self, frame: CancelFrame):
+        """Cancel the Inworld non-streaming TTS service.
+
+        Args:
+            frame: The cancel frame.
+        """
+        await super().cancel(frame)
+
+    @traced_tts
+    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        """Generate speech from text using Inworld's non-streaming HTTP API.
+
+        This method sends text to Inworld's non-streaming TTS endpoint and receives
+        the complete audio file as a base64-encoded response. The audio is then
+        chunked and yielded for playback in the pipeline.
+
+        Args:
+            text: The text to synthesize into speech.
+
+        Yields:
+            Frame: Audio frames containing the synthesized speech, plus control frames.
+
+        Raises:
+            ErrorFrame: If API errors occur or audio processing fails.
+        """
+        logger.debug(f"{self}: Generating TTS [{text}]")
+
+        # ================================================================================
+        # STEP 1: PREPARE API REQUEST
+        # ================================================================================
+        # Build the JSON payload according to Inworld's non-streaming API specification
+        # This matches the format shown in their documentation examples
+        # Note: Language is automatically inferred from the input text by Inworld's models
+        payload = {
+            "text": text,  # Text to synthesize
+            "voiceId": self._settings["voiceId"],  # Voice selection (Ashley, Hades, etc.)
+            "modelId": self._settings["modelId"],  # TTS model (inworld-tts-1)
+            "audio_config": self._settings["audio_config"],  # Audio format settings
+        }
+
+        # Add optional temperature parameter if configured (valid range: [0, 2])
+        if "temperature" in self._settings:
+            payload["temperature"] = self._settings["temperature"]
+
+        # Set up HTTP headers for authentication and content type
+        # Inworld requires Basic auth with base64-encoded API key
+        headers = {
+            "Authorization": f"Basic {self._api_key}",  # Base64 API key from Inworld Portal
+            "Content-Type": "application/json",  # JSON request body
+        }
+
+        try:
+            # ================================================================================
+            # STEP 2: INITIALIZE METRICS AND STREAMING
+            # ================================================================================
+            # Start measuring Time To First Byte (TTFB) for performance tracking
+            await self.start_ttfb_metrics()
+
+            # Signal to the pipeline that TTS generation has started
+            # This allows downstream processors to prepare for incoming audio
+            yield TTSStartedFrame()
+
+            # ================================================================================
+            # STEP 3: MAKE HTTP NON-STREAMING REQUEST
+            # ================================================================================
+            # Make single HTTP POST request to Inworld's non-streaming endpoint
+            # This endpoint returns complete audio as base64-encoded data
+            # Create session if none was provided
+            if self._session:
+                session = self._session
+            else:
+                session = aiohttp.ClientSession()
+
+            async with (
+                session
+                if not self._session
+                else session.post(
+                    self._base_url, json=payload, headers=headers
+                ) as context_or_response
+            ):
+                if self._session:
+                    response = context_or_response
+                else:
+                    async with context_or_response.post(
+                        self._base_url, json=payload, headers=headers
+                    ) as response:
+                        # ================================================================
+                        # STEP 4: HANDLE HTTP ERRORS
+                        # ================================================================
+                        # Check for API errors (expired keys, invalid requests, etc.)
+                        if response.status != 200:
+                            error_text = await response.text()
+                            logger.error(f"Inworld API error: {error_text}")
+                            await self.push_error(ErrorFrame(f"Inworld API error: {error_text}"))
+                            return
+
+                        # ================================================================
+                        # STEP 5: PARSE COMPLETE JSON RESPONSE
+                        # ================================================================
+                        # Parse the complete JSON response containing base64 audio data
+                        response_data = await response.json()
+
+                        # ================================================================
+                        # STEP 6: EXTRACT AND VALIDATE AUDIO CONTENT
+                        # ================================================================
+                        # Extract the base64-encoded audio content from response
+                        if "audioContent" not in response_data:
+                            logger.error("No audioContent in Inworld API response")
+                            await self.push_error(ErrorFrame("No audioContent in response"))
+                            return
+
+                        # ================================================================
+                        # STEP 7: DECODE AND PROCESS AUDIO DATA
+                        # ================================================================
+                        # Decode the base64 audio data to binary
+                        audio_data = base64.b64decode(response_data["audioContent"])
+
+                        # Strip WAV header if present (Inworld may include WAV header)
+                        # This prevents audio clicks and ensures clean audio playback
+                        if len(audio_data) > 44 and audio_data.startswith(b"RIFF"):
+                            audio_data = audio_data[44:]
+
+                        # ================================================================
+                        # STEP 8: START USAGE METRICS TRACKING
+                        # ================================================================
+                        await self.start_tts_usage_metrics(text)
+
+                        # ================================================================
+                        # STEP 9: CHUNK AND YIELD AUDIO FOR PLAYBACK
+                        # ================================================================
+                        # Chunk the complete audio for streaming playback
+                        # This allows the pipeline to process audio in manageable pieces
+                        CHUNK_SIZE = self.chunk_size
+
+                        for i in range(0, len(audio_data), CHUNK_SIZE):
+                            chunk = audio_data[i : i + CHUNK_SIZE]
+                            if len(chunk) > 0:
+                                await self.stop_ttfb_metrics()
+                                yield TTSAudioRawFrame(
+                                    audio=chunk,
+                                    sample_rate=self.sample_rate,
+                                    num_channels=1,
+                                )
+
+                if self._session:
+                    # Handle HTTP errors
+                    if response.status != 200:
+                        error_text = await response.text()
+                        logger.error(f"Inworld API error: {error_text}")
+                        await self.push_error(ErrorFrame(f"Inworld API error: {error_text}"))
+                        return
+
+                    # Parse the complete JSON response
+                    response_data = await response.json()
+
+                    # Extract the base64-encoded audio content
+                    if "audioContent" not in response_data:
+                        logger.error("No audioContent in Inworld API response")
+                        await self.push_error(ErrorFrame("No audioContent in response"))
+                        return
+
+                    # Decode the base64 audio data
+                    audio_data = base64.b64decode(response_data["audioContent"])
+
+                    # Strip WAV header if present (Inworld may include WAV header)
+                    if len(audio_data) > 44 and audio_data.startswith(b"RIFF"):
+                        audio_data = audio_data[44:]
+
+                    await self.start_tts_usage_metrics(text)
+
+                    # Chunk the complete audio for streaming playback
+                    CHUNK_SIZE = self.chunk_size
+
+                    for i in range(0, len(audio_data), CHUNK_SIZE):
+                        chunk = audio_data[i : i + CHUNK_SIZE]
+                        if len(chunk) > 0:
+                            await self.stop_ttfb_metrics()
+                            yield TTSAudioRawFrame(
+                                audio=chunk,
+                                sample_rate=self.sample_rate,
+                                num_channels=1,
+                            )
+
+        except Exception as e:
+            # ================================================================================
+            # STEP 10: ERROR HANDLING
+            # ================================================================================
+            # Log any unexpected errors and notify the pipeline
+            logger.error(f"{self} exception: {e}")
+            await self.push_error(ErrorFrame(f"Error generating TTS: {e}"))
+        finally:
+            # ================================================================================
+            # STEP 11: CLEANUP AND COMPLETION
+            # ================================================================================
+            # Always stop metrics tracking, even if errors occurred
+            await self.stop_ttfb_metrics()
+
+            # Signal to pipeline that TTS generation is complete
+            # This allows downstream processors to finalize audio processing
+            yield TTSStoppedFrame()

From b6367965cbfb7de1f70e6ceeb4ca6eddc0c253d4 Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Wed, 23 Jul 2025 16:50:32 -0700
Subject: [PATCH 22/38] mtpadilla: consolidate streaming and non-streaming
 services into a single InworldTTSService class with a common API, selected
 by a new boolean `streaming` argument

---
 .../07aa-interruptible-inworld-http.py        |  38 +-
 src/pipecat/services/inworld/tts.py           | 749 +++++++-----------
 2 files changed, 284 insertions(+), 503 deletions(-)

diff --git a/examples/foundational/07aa-interruptible-inworld-http.py b/examples/foundational/07aa-interruptible-inworld-http.py
index 65d7babca..dbfbcc878 100644
--- a/examples/foundational/07aa-interruptible-inworld-http.py
+++ b/examples/foundational/07aa-interruptible-inworld-http.py
@@ -16,7 +16,7 @@ from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
-from pipecat.services.inworld.tts import InworldHttpNonStreamingService, InworldHttpStreamingService
+from pipecat.services.inworld.tts import InworldTTSService
 from pipecat.services.openai.llm import OpenAILLMService
 from pipecat.services.openai.stt import OpenAISTTService
 from pipecat.transports.base_transport import BaseTransport, TransportParams
@@ -58,30 +58,20 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si
             prompt="Expect words related to dogs, such as breed names.",
         )
 
-        streaming = True
+        # Inworld TTS Service - Unified streaming and non-streaming
+        # Set streaming=True for real-time audio, streaming=False for complete audio generation
+        streaming = False  # Toggle this to switch between modes
 
-        if streaming:
-            # Streaming TTS - Real-time audio generation as text is processed
-            tts = InworldHttpStreamingService(
-                api_key=os.getenv("INWORLD_API_KEY", ""),
-                aiohttp_session=session,
-                params=InworldHttpStreamingService.InputParams(
-                    voice_id="Ashley",
-                    model="inworld-tts-1",
-                    temperature=0.8,
-                ),
-            )
-        else:
-            # Non-streaming TTS - Complete audio generation then playback
-            tts = InworldHttpNonStreamingService(
-                api_key=os.getenv("INWORLD_API_KEY", ""),
-                aiohttp_session=session,
-                params=InworldHttpNonStreamingService.InputParams(
-                    voice_id="Ashley",
-                    model="inworld-tts-1",
-                    temperature=0.8,
-                ),
-            )
+        tts = InworldTTSService(
+            api_key=os.getenv("INWORLD_API_KEY", ""),
+            aiohttp_session=session,
+            streaming=streaming,  # True: real-time chunks, False: complete audio then playback
+            params=InworldTTSService.InputParams(
+                voice_id="Ashley",
+                model="inworld-tts-1",
+                temperature=0.8,
+            ),
+        )
 
         llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
 
diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
index 3a70b7499..94ef5aa32 100644
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -7,11 +7,12 @@
 """Inworld AI Text-to-Speech Service Implementation.
 
 This module provides integration with Inworld AI's HTTP-based TTS API, enabling
-real-time text-to-speech synthesis with high-quality, natural-sounding voices.
+both streaming and non-streaming text-to-speech synthesis with high-quality,
+natural-sounding voices.
 
 Key Features:
 
-- HTTP streaming API support for low-latency audio generation
+- HTTP streaming and non-streaming API support for flexible audio generation
 - Multiple voice options (Ashley, Hades, etc.)
 - Automatic language detection from input text (no manual language setting required)
 - Real-time audio chunk processing with proper buffering
@@ -20,8 +21,8 @@ Key Features:
 
 Technical Implementation:
 
-- Uses aiohttp for HTTP streaming connections
-- Implements JSON line-by-line parsing for streaming responses
+- Uses aiohttp for HTTP connections
+- Implements both JSON line-by-line parsing (streaming) and complete response (non-streaming)
 - Handles base64-encoded audio data with proper decoding
 - Manages audio continuity to prevent clicks and artifacts
 - Integrates with Pipecat's frame-based pipeline architecture
@@ -29,15 +30,29 @@ Technical Implementation:
 Usage::
 
     async with aiohttp.ClientSession() as session:
-        tts = InworldHttpStreamingService(
+        # Streaming mode (default) - real-time audio generation
+        tts = InworldTTSService(
             api_key=os.getenv("INWORLD_API_KEY"),
             aiohttp_session=session,
-            params=InworldHttpStreamingService.InputParams(
+            streaming=True,  # Default
+            params=InworldTTSService.InputParams(
                 voice_id="Ashley",
                 model="inworld-tts-1",
                 temperature=0.8,  # Optional: control synthesis variability (range: [0, 2])
             ),
         )
+
+        # Non-streaming mode - complete audio generation then playback
+        tts = InworldTTSService(
+            api_key=os.getenv("INWORLD_API_KEY"),
+            aiohttp_session=session,
+            streaming=False,
+            params=InworldTTSService.InputParams(
+                voice_id="Ashley",
+                model="inworld-tts-1",
+                temperature=0.8,
+            ),
+        )
 """
 
 import base64
@@ -70,27 +85,30 @@ from pipecat.utils.text.skip_tags_aggregator import SkipTagsAggregator
 from pipecat.utils.tracing.service_decorators import traced_tts
 
 
-class InworldHttpStreamingService(TTSService):
+class InworldTTSService(TTSService):
     """Inworld AI HTTP-based Text-to-Speech Service.
 
-    This service integrates Inworld AI's high-quality TTS API with Pipecat's pipeline
-    architecture. It provides real-time speech synthesis with natural-sounding voices
-    and low-latency streaming audio delivery.
+    This unified service integrates Inworld AI's high-quality TTS API with Pipecat's pipeline
+    architecture. It supports both streaming and non-streaming modes, providing flexible
+    speech synthesis with natural-sounding voices.
 
     Key Features:
 
-    - Real-time HTTP streaming for minimal latency
+    - **Streaming Mode**: Real-time HTTP streaming for minimal latency
+    - **Non-Streaming Mode**: Complete audio synthesis then chunked playback
     - Multiple voice options (Ashley, Hades, etc.)
     - High-quality audio output (48kHz LINEAR16 PCM)
     - Automatic audio format handling and header stripping
     - Comprehensive error handling and recovery
     - Built-in performance metrics and monitoring
+    - Unified interface for both modes
 
     Technical Architecture:
 
     - Uses aiohttp for non-blocking HTTP requests
-    - Implements JSON line-by-line streaming protocol
-    - Processes base64-encoded audio chunks in real-time
+    - **Streaming**: Implements JSON line-by-line streaming protocol
+    - **Non-Streaming**: Single HTTP POST with complete response
+    - Processes base64-encoded audio chunks in real-time or batch
     - Manages audio continuity to prevent artifacts
     - Integrates with Pipecat's frame-based pipeline system
 
@@ -100,31 +118,38 @@ class InworldHttpStreamingService(TTSService):
     - Models: inworld-tts-1 and other available models
     - Audio Formats: LINEAR16 PCM at various sample rates
     - Language Detection: Automatically inferred from input text (no explicit language setting required)
+    - Mode Selection: streaming=True for real-time, streaming=False for complete synthesis
 
     Example Usage::
 
         async with aiohttp.ClientSession() as session:
-            # Using default settings (Ashley voice, inworld-tts-1 model)
-            tts = InworldHttpStreamingService(
+            # Streaming mode (default) - Real-time audio generation
+            tts_streaming = InworldTTSService(
                 api_key=os.getenv("INWORLD_API_KEY"),
                 aiohttp_session=session,
+                streaming=True,  # Default behavior
+                params=InworldTTSService.InputParams(
+                    voice_id="Ashley",
+                    model="inworld-tts-1",
+                    temperature=0.8,  # Add variability to speech synthesis (range: [0, 2])
+                ),
             )
 
-            # Or with custom voice, model, and temperature via params
-            params = InworldHttpStreamingService.InputParams(
-                voice_id="Hades",
-                model="inworld-tts-1-max",
-                temperature=0.8,  # Add variability to speech synthesis (range: [0, 2])
-            )
-            tts = InworldHttpStreamingService(
+            # Non-streaming mode - Complete audio then playback
+            tts_complete = InworldTTSService(
                 api_key=os.getenv("INWORLD_API_KEY"),
                 aiohttp_session=session,
-                params=params,
+                streaming=False,
+                params=InworldTTSService.InputParams(
+                    voice_id="Hades",
+                    model="inworld-tts-1-max",
+                    temperature=0.8,
+                ),
             )
     """
 
     class InputParams(BaseModel):
-        """Input parameters for Inworld HTTP TTS configuration.
+        """Input parameters for Inworld TTS configuration.
 
         Parameters:
             voice_id: Voice selection for speech synthesis (e.g., "Ashley", "Hades").
@@ -146,23 +171,29 @@ class InworldHttpStreamingService(TTSService):
         *,
         api_key: str,
         aiohttp_session: aiohttp.ClientSession,
-        base_url: str = "https://api.inworld.ai/tts/v1/voice:stream",
+        streaming: bool = True,
+        base_url: Optional[str] = None,
         sample_rate: Optional[int] = None,
         encoding: str = "LINEAR16",
         params: Optional[InputParams] = None,
         **kwargs,
     ):
-        """Initialize the Inworld HTTP TTS service.
+        """Initialize the Inworld TTS service.
 
-        Sets up the TTS service with Inworld AI's streaming API configuration.
-        This constructor prepares all necessary parameters for real-time speech synthesis.
+        Sets up the TTS service with Inworld AI's API configuration.
+        This constructor prepares all necessary parameters for speech synthesis.
 
         Args:
             api_key: Inworld API key for authentication (base64-encoded from Inworld Portal).
                     Get this from: Inworld Portal > Settings > API Keys > Runtime API Key
             aiohttp_session: Shared aiohttp session for HTTP requests. Must be provided
                            for proper connection pooling and resource management.
-            base_url: Base URL for Inworld HTTP API. Uses streaming endpoint by default.
+            streaming: Whether to use streaming mode (True) or non-streaming mode (False).
+                      - True: Real-time audio chunks as they're generated (lower latency)
+                      - False: Complete audio file generated first, then chunked for playback (simpler)
+            base_url: Base URL for Inworld HTTP API. If None, automatically selected based on streaming mode:
+                     - Streaming: "https://api.inworld.ai/tts/v1/voice:stream"
+                     - Non-streaming: "https://api.inworld.ai/tts/v1/voice"
                      Should normally not be changed unless using a different environment.
             sample_rate: Audio sample rate in Hz. If None, uses default from StartFrame.
                         Common values: 48000 (high quality), 24000 (good quality), 16000 (basic)
@@ -185,11 +216,19 @@ class InworldHttpStreamingService(TTSService):
         super().__init__(sample_rate=sample_rate, **kwargs)
 
         # Use provided params or create default configuration
-        params = params or InworldHttpStreamingService.InputParams()
+        params = params or InworldTTSService.InputParams()
 
         # Store core configuration for API requests
         self._api_key = api_key  # Authentication credentials
         self._session = aiohttp_session  # HTTP session for requests
+        self._streaming = streaming  # Streaming mode selection
+
+        # Set base URL based on streaming mode if not provided
+        if base_url is None:
+            if streaming:
+                base_url = "https://api.inworld.ai/tts/v1/voice:stream"  # Streaming endpoint
+            else:
+                base_url = "https://api.inworld.ai/tts/v1/voice"  # Non-streaming endpoint
         self._base_url = base_url  # API endpoint URL
 
         # Build settings dictionary that matches Inworld's API expectations
@@ -216,12 +255,12 @@ class InworldHttpStreamingService(TTSService):
         """Check if this service can generate processing metrics.
 
         Returns:
-            True, as Inworld HTTP service supports metrics generation.
+            True, as Inworld TTS service supports metrics generation.
         """
         return True
 
     async def start(self, frame: StartFrame):
-        """Start the Inworld HTTP TTS service.
+        """Start the Inworld TTS service.
 
         Args:
             frame: The start frame containing initialization parameters.
@@ -230,7 +269,7 @@ class InworldHttpStreamingService(TTSService):
         self._settings["audio_config"]["sample_rate_hertz"] = self.sample_rate
 
     async def stop(self, frame: EndFrame):
-        """Stop the Inworld HTTP TTS service.
+        """Stop the Inworld TTS service.
 
         Args:
             frame: The end frame.
@@ -238,7 +277,7 @@ class InworldHttpStreamingService(TTSService):
         await super().stop(frame)
 
     async def cancel(self, frame: CancelFrame):
-        """Cancel the Inworld HTTP TTS service.
+        """Cancel the Inworld TTS service.
 
         Args:
             frame: The cancel frame.
@@ -247,21 +286,30 @@ class InworldHttpStreamingService(TTSService):
 
     @traced_tts
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
-        """Generate speech from text using Inworld's streaming HTTP API.
+        """Generate speech from text using Inworld's HTTP API.
 
-        This is the core TTS processing function that:
+        This is the core TTS processing function that adapts its behavior based on the streaming mode:
+
+        **Streaming Mode (streaming=True)**:
         1. Sends text to Inworld's streaming TTS endpoint
         2. Receives JSON-streamed audio chunks in real-time
         3. Processes and cleans audio data (removes WAV headers, validates content)
         4. Yields audio frames for immediate playback in the pipeline
 
+        **Non-Streaming Mode (streaming=False)**:
+        1. Sends text to Inworld's non-streaming TTS endpoint
+        2. Receives complete audio file as base64-encoded response
+        3. Processes entire audio and chunks for playback
+        4. Yields audio frames in manageable pieces
+
         Technical Details:
 
-        - Uses HTTP streaming with JSON line-by-line responses
-        - Each JSON line contains base64-encoded audio data
-        - Implements buffering to handle partial JSON lines
+        - **Streaming**: Uses HTTP streaming with JSON line-by-line responses
+        - **Non-Streaming**: Single HTTP POST with complete JSON response
+        - Each audio chunk contains base64-encoded audio data
+        - Implements buffering to handle partial data (streaming mode)
         - Strips WAV headers to prevent audio artifacts/clicks
-        - Provides real-time audio streaming for low latency
+        - Provides optimized audio delivery for each mode
 
         Args:
             text: The text to synthesize into speech.
@@ -272,7 +320,7 @@ class InworldHttpStreamingService(TTSService):
         Raises:
             ErrorFrame: If API errors occur or audio processing fails.
         """
-        logger.debug(f"{self}: Generating TTS [{text}]")
+        logger.debug(f"{self}: Generating TTS [{text}] (streaming={self._streaming})")
 
         # ================================================================================
         # STEP 1: PREPARE API REQUEST
@@ -302,7 +350,7 @@ class InworldHttpStreamingService(TTSService):
 
         try:
             # ================================================================================
-            # STEP 2: INITIALIZE METRICS AND STREAMING
+            # STEP 2: INITIALIZE METRICS AND PROCESSING
             # ================================================================================
             # Start measuring Time To First Byte (TTFB) for performance tracking
             await self.start_ttfb_metrics()
@@ -312,10 +360,10 @@ class InworldHttpStreamingService(TTSService):
             yield TTSStartedFrame()
 
             # ================================================================================
-            # STEP 3: MAKE HTTP STREAMING REQUEST
+            # STEP 3: MAKE HTTP REQUEST (MODE-SPECIFIC)
             # ================================================================================
-            # Use aiohttp's streaming POST to Inworld's streaming endpoint
-            # The endpoint returns JSON lines with audio chunks as they're generated
+            # Use aiohttp to make request to Inworld's endpoint
+            # Behavior differs based on streaming mode
             async with self._session.post(
                 self._base_url, json=payload, headers=headers
             ) as response:
@@ -330,115 +378,34 @@ class InworldHttpStreamingService(TTSService):
                     return
 
                 # ================================================================================
-                # STEP 5: PROCESS STREAMING JSON RESPONSE
+                # STEP 5: PROCESS RESPONSE (MODE-SPECIFIC)
                 # ================================================================================
-                # Inworld streams JSON lines where each line contains audio data
-                # We need to buffer incoming data and process complete lines
-
-                # Buffer to accumulate incoming text data
-                # This handles cases where JSON lines are split across HTTP chunks
-                buffer = ""
-
-                # Read HTTP response in manageable chunks (1KB each)
-                # This prevents memory issues with large responses
-                async for chunk in response.content.iter_chunked(1024):
-                    if not chunk:
-                        continue
-
-                    # ============================================================================
-                    # STEP 6: BUFFER MANAGEMENT
-                    # ============================================================================
-                    # Decode binary chunk to text and add to our line buffer
-                    # Each chunk may contain partial JSON lines, so we need to accumulate
-                    buffer += chunk.decode("utf-8")
-
-                    # ============================================================================
-                    # STEP 7: LINE-BY-LINE JSON PROCESSING
-                    # ============================================================================
-                    # Process all complete lines in the buffer (lines ending with \n)
-                    # Leave partial lines in buffer for next iteration
-                    while "\n" in buffer:
-                        # Split on first newline, keeping remainder in buffer
-                        line, buffer = buffer.split("\n", 1)
-                        line_str = line.strip()
-
-                        # Skip empty lines (common in streaming responses)
-                        if not line_str:
-                            continue
-
-                        try:
-                            # ================================================================
-                            # STEP 8: PARSE JSON AND EXTRACT AUDIO
-                            # ================================================================
-                            # Parse the JSON line - should contain audio data
-                            chunk_data = json.loads(line_str)
-
-                            # Check if this line contains audio content
-                            # Inworld's response format: {"result": {"audioContent": "base64data"}}
-                            if "result" in chunk_data and "audioContent" in chunk_data["result"]:
-                                # Decode base64 audio data to binary
-                                audio_chunk = base64.b64decode(chunk_data["result"]["audioContent"])
-
-                                # ========================================================
-                                # STEP 9: AUDIO DATA VALIDATION
-                                # ========================================================
-                                # Skip empty audio chunks that could cause discontinuities
-                                # Empty chunks can create gaps or clicks in audio playback
-                                if not audio_chunk:
-                                    continue
-
-                                # Start with the raw audio data
-                                audio_data = audio_chunk
-
-                                # ========================================================
-                                # STEP 10: WAV HEADER REMOVAL (CRITICAL FOR AUDIO QUALITY)
-                                # ========================================================
-                                # Each audio chunk may have its own WAV header (44 bytes)
-                                # These headers contain metadata and will sound like clicks if played
-                                # We must strip them from EVERY chunk, not just the first one
-                                if (
-                                    len(audio_chunk) > 44  # Ensure chunk is large enough
-                                    and audio_chunk.startswith(
-                                        b"RIFF"
-                                    )  # Check for WAV header magic bytes
-                                ):
-                                    # Remove the 44-byte WAV header to get pure audio data
-                                    audio_data = audio_chunk[44:]
-
-                                # ========================================================
-                                # STEP 11: YIELD AUDIO FRAME TO PIPELINE
-                                # ========================================================
-                                # Only yield frames with actual audio content
-                                # Empty frames can cause pipeline issues
-                                if len(audio_data) > 0:
-                                    # Create Pipecat audio frame with processed audio data
-                                    yield TTSAudioRawFrame(
-                                        audio=audio_data,  # Clean audio without headers
-                                        sample_rate=self.sample_rate,  # Configured sample rate (48kHz)
-                                        num_channels=1,  # Mono audio
-                                    )
-
-                        except json.JSONDecodeError:
-                            # Ignore malformed JSON lines - streaming can have partial data
-                            # This is normal in HTTP streaming scenarios
-                            continue
+                # Choose processing method based on streaming mode
+                if self._streaming:
+                    # Stream processing: JSON line-by-line with real-time audio
+                    async for frame in self._process_streaming_response(response):
+                        yield frame
+                else:
+                    # Non-stream processing: Complete JSON response with batch audio
+                    async for frame in self._process_non_streaming_response(response):
+                        yield frame
 
             # ================================================================================
-            # STEP 12: FINALIZE METRICS AND CLEANUP
+            # STEP 6: FINALIZE METRICS AND CLEANUP
             # ================================================================================
             # Start usage metrics tracking after successful completion
             await self.start_tts_usage_metrics(text)
 
         except Exception as e:
             # ================================================================================
-            # STEP 13: ERROR HANDLING
+            # STEP 7: ERROR HANDLING
             # ================================================================================
             # Log any unexpected errors and notify the pipeline
             logger.error(f"{self} exception: {e}")
             await self.push_error(ErrorFrame(f"Error generating TTS: {e}"))
         finally:
             # ================================================================================
-            # STEP 14: CLEANUP AND COMPLETION
+            # STEP 8: CLEANUP AND COMPLETION
             # ================================================================================
             # Always stop metrics tracking, even if errors occurred
             await self.stop_ttfb_metrics()
@@ -447,363 +414,187 @@ class InworldHttpStreamingService(TTSService):
             # This allows downstream processors to finalize audio processing
             yield TTSStoppedFrame()
 
+    async def _process_streaming_response(
+        self, response: aiohttp.ClientResponse
+    ) -> AsyncGenerator[Frame, None]:
+        """Process streaming JSON response with real-time audio chunks.
 
-class InworldHttpNonStreamingService(TTSService):
-    """Inworld AI HTTP-based Text-to-Speech Service (Non-Streaming).
-
-    This service integrates with Inworld AI's non-streaming TTS API for simpler,
-    complete audio synthesis. Suitable for use cases where streaming is not required
-    and you prefer to receive the complete audio file at once.
-
-    Key Features:
-
-    - Simple HTTP request/response for complete audio synthesis
-    - Same voice options as streaming version (Ashley, Hades, etc.)
-    - High-quality audio output (48kHz LINEAR16 PCM)
-    - Automatic language detection from input text
-    - Support for temperature parameter for synthesis variability
-    - Lower complexity compared to streaming implementation
-
-    Technical Architecture:
-
-    - Uses aiohttp for single HTTP POST request
-    - Downloads complete audio as base64-encoded data
-    - Processes entire audio file and chunks for playback
-    - Integrates with Pipecat's frame-based pipeline system
-
-    Usage::
-
-        async with aiohttp.ClientSession() as session:
-            # Using default settings (Ashley voice, inworld-tts-1 model)
-            tts = InworldHttpNonStreamingService(
-                api_key=os.getenv("INWORLD_API_KEY"),
-                aiohttp_session=session,
-            )
-
-            # Or with custom voice, model, and temperature
-            params = InworldHttpNonStreamingService.InputParams(
-                voice_id="Hades",
-                model="inworld-tts-1-max",
-                temperature=0.8,  # Control synthesis variability (range: [0, 2])
-            )
-            tts = InworldHttpNonStreamingService(
-                api_key=os.getenv("INWORLD_API_KEY"),
-                aiohttp_session=session,
-                params=params,
-            )
-    """
-
-    class InputParams(BaseModel):
-        """Input parameters for Inworld non-streaming TTS configuration.
-
-        Parameters:
-            voice_id: Voice selection for speech synthesis (e.g., "Ashley", "Hades").
-            model: TTS model to use (e.g., "inworld-tts-1", "inworld-tts-1-max").
-            temperature: Voice temperature control for synthesis variability (e.g., 0.8).
-                        Valid range: [0, 2]. Higher values increase variability.
-
-        Note:
-            Language is automatically inferred from the input text by Inworld's TTS models,
-            so no explicit language parameter is required.
-        """
-
-        voice_id: Optional[str] = "Ashley"  # defaults to the Ashley voice
-        model: Optional[str] = "inworld-tts-1"  # defaults to the inworld-tts-1 model
-        temperature: Optional[float] = None  # optional temperature control (range: [0, 2])
-
-    def __init__(
-        self,
-        *,
-        api_key: str,
-        aiohttp_session: Optional[aiohttp.ClientSession] = None,
-        base_url: str = "https://api.inworld.ai/tts/v1/voice",  # Non-streaming endpoint
-        sample_rate: Optional[int] = None,
-        encoding: str = "LINEAR16",
-        params: Optional[InputParams] = None,
-        **kwargs,
-    ):
-        """Initialize the Inworld non-streaming TTS service.
-
-        Sets up the TTS service with Inworld AI's non-streaming API configuration.
-        This constructor prepares all necessary parameters for complete audio synthesis.
+        This method handles Inworld's streaming endpoint response format:
+        - JSON lines containing base64-encoded audio chunks
+        - Real-time processing as data arrives
+        - Byte-level line buffering, so lines split across HTTP chunks are reassembled
 
         Args:
-            api_key: Inworld API key for authentication (base64-encoded from Inworld Portal).
-                    Get this from: Inworld Portal > Settings > API Keys > Runtime API Key
-            aiohttp_session: Shared aiohttp session for HTTP requests. Must be provided
-                           for proper connection pooling and resource management.
-            base_url: Base URL for Inworld non-streaming HTTP API. Uses non-streaming endpoint by default.
-                     Should normally not be changed unless using a different environment.
-            sample_rate: Audio sample rate in Hz. If None, uses default from StartFrame.
-                        Common values: 48000 (high quality), 24000 (good quality), 16000 (basic)
-            encoding: Audio encoding format. Supported options:
-                     - "LINEAR16" (default) - Uncompressed PCM, best quality
-                     - Other formats as supported by Inworld API
-            params: Input parameters for voice and model configuration. Use this to specify:
-                   - voice_id: Voice selection ("Ashley", "Hades", etc.)
-                   - model: TTS model ("inworld-tts-1", "inworld-tts-1-max", etc.)
-                   - temperature: Voice temperature control for variability (range: [0, 2], e.g., 0.8, optional)
-                   If None, uses default values (Ashley voice, inworld-tts-1 model).
-                   Note: Language is automatically inferred from input text.
-            **kwargs: Additional arguments passed to the parent TTSService class.
-
-        Note:
-            The aiohttp_session parameter is required because Inworld's HTTP API
-            benefits from connection reuse and proper async session management.
-        """
-        # Initialize parent TTSService with audio configuration
-        super().__init__(sample_rate=sample_rate, **kwargs)
-
-        # Use provided params or create default configuration
-        params = params or InworldHttpNonStreamingService.InputParams()
-
-        # Store core configuration for API requests
-        self._api_key = api_key  # Authentication credentials
-        self._session = aiohttp_session  # HTTP session for requests (optional)
-        self._base_url = base_url  # API endpoint URL
-
-        # Build settings dictionary that matches Inworld's API expectations
-        # This will be sent as JSON payload in the TTS request
-        # Note: Language is automatically inferred from text by Inworld's models
-        self._settings = {
-            "voiceId": params.voice_id or "Ashley",  # Voice selection from params
-            "modelId": params.model or "inworld-tts-1",  # TTS model selection from params
-            "audio_config": {  # Audio format configuration
-                "audio_encoding": encoding,  # Format: LINEAR16, MP3, etc.
-                "sample_rate_hertz": 0,  # Will be set in start() from parent service
-            },
-        }
-
-        # Add optional temperature parameter if provided (valid range: [0, 2])
-        if params.temperature is not None:
-            self._settings["temperature"] = params.temperature
-
-        # Register voice and model with parent service for metrics and tracking
-        self.set_voice(params.voice_id or "Ashley")  # Used for logging and metrics
-        self.set_model_name(params.model or "inworld-tts-1")  # Used for performance tracking
-
-    def can_generate_metrics(self) -> bool:
-        """Check if this service can generate processing metrics.
-
-        Returns:
-            True, as Inworld non-streaming service supports metrics generation.
-        """
-        return True
-
-    async def start(self, frame: StartFrame):
-        """Start the Inworld non-streaming TTS service.
-
-        Args:
-            frame: The start frame containing initialization parameters.
-        """
-        await super().start(frame)
-        self._settings["audio_config"]["sample_rate_hertz"] = self.sample_rate
-
-    async def stop(self, frame: EndFrame):
-        """Stop the Inworld non-streaming TTS service.
-
-        Args:
-            frame: The end frame.
-        """
-        await super().stop(frame)
-
-    async def cancel(self, frame: CancelFrame):
-        """Cancel the Inworld non-streaming TTS service.
-
-        Args:
-            frame: The cancel frame.
-        """
-        await super().cancel(frame)
-
-    @traced_tts
-    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
-        """Generate speech from text using Inworld's non-streaming HTTP API.
-
-        This method sends text to Inworld's non-streaming TTS endpoint and receives
-        the complete audio file as a base64-encoded response. The audio is then
-        chunked and yielded for playback in the pipeline.
-
-        Args:
-            text: The text to synthesize into speech.
+            response: The aiohttp response object from streaming endpoint.
 
         Yields:
-            Frame: Audio frames containing the synthesized speech, plus control frames.
-
-        Raises:
-            ErrorFrame: If API errors occur or audio processing fails.
+            Frame: Audio frames as they're processed from the stream.
         """
-        logger.debug(f"{self}: Generating TTS [{text}]")
+        # ================================================================================
+        # STREAMING: PROCESS JSON LINE-BY-LINE RESPONSE
+        # ================================================================================
+        # Inworld streams newline-delimited JSON; each line is expected to look like
+        # {"result": {"audioContent": "<base64 audio>"}}.
+
+        def _audio_from_line(raw_line: bytes) -> Optional[bytes]:
+            """Return the decoded audio carried by one JSON line, or None if it has none."""
+            raw_line = raw_line.strip()
+            # Skip empty lines (common in streaming responses)
+            if not raw_line:
+                return None
+
+            try:
+                # json.loads accepts UTF-8 bytes directly, so a line is only decoded
+                # once it is complete.
+                chunk_data = json.loads(raw_line)
+            except json.JSONDecodeError:
+                # Partial lines never reach this point (they stay buffered), so a
+                # parse failure means the server sent something unexpected. Keep the
+                # stream alive, but leave a trace instead of dropping it silently.
+                logger.warning(f"{self}: Skipping malformed JSON line in streaming response")
+                return None
+
+            # Lines without an audioContent payload are ignored
+            if "result" in chunk_data and "audioContent" in chunk_data["result"]:
+                return base64.b64decode(chunk_data["result"]["audioContent"])
+            return None
+
+        # Accumulate raw bytes rather than decoded text: a 1KB network chunk can end in
+        # the middle of a multi-byte UTF-8 character, and decoding each chunk on its
+        # own would then raise UnicodeDecodeError. The newline byte never occurs
+        # inside a multi-byte sequence, so splitting on it is always safe.
+        buffer = b""
+
+        # Read HTTP response in manageable chunks (1KB each)
+        async for chunk in response.content.iter_chunked(1024):
+            if not chunk:
+                continue
+
+            buffer += chunk
+
+            # Everything before the last newline is a complete line; whatever follows
+            # it is a partial line that stays buffered for the next chunk.
+            *complete_lines, buffer = buffer.split(b"\n")
+            for raw_line in complete_lines:
+                audio_chunk = _audio_from_line(raw_line)
+                if audio_chunk is not None:
+                    async for frame in self._process_audio_chunk(audio_chunk):
+                        yield frame
+
+        # The final line may arrive without a trailing newline; process it too so the
+        # last audio chunk is not lost.
+        audio_chunk = _audio_from_line(buffer)
+        if audio_chunk is not None:
+            async for frame in self._process_audio_chunk(audio_chunk):
+                yield frame
+
+    async def _process_non_streaming_response(
+        self, response: aiohttp.ClientResponse
+    ) -> AsyncGenerator[Frame, None]:
+        """Process complete JSON response with full audio content.
+
+        This method handles Inworld's non-streaming endpoint response format:
+        - Single JSON response with complete base64-encoded audio
+        - Full audio download then chunked playback
+        - Simpler processing without line buffering
+
+        Args:
+            response: The aiohttp response object from non-streaming endpoint.
+
+        Yields:
+            Frame: Audio frames chunked from the complete audio.
+        """
+        # ================================================================================
+        # NON-STREAMING: PARSE COMPLETE JSON RESPONSE
+        # ================================================================================
+        # Parse the complete JSON response containing base64 audio data
+        response_data = await response.json()
 
         # ================================================================================
-        # STEP 1: PREPARE API REQUEST
+        # EXTRACT AND VALIDATE AUDIO CONTENT
         # ================================================================================
-        # Build the JSON payload according to Inworld's non-streaming API specification
-        # This matches the format shown in their documentation examples
-        # Note: Language is automatically inferred from the input text by Inworld's models
-        payload = {
-            "text": text,  # Text to synthesize
-            "voiceId": self._settings["voiceId"],  # Voice selection (Ashley, Hades, etc.)
-            "modelId": self._settings["modelId"],  # TTS model (inworld-tts-1)
-            "audio_config": self._settings["audio_config"],  # Audio format settings
-        }
+        # Extract the base64-encoded audio content from response
+        if "audioContent" not in response_data:
+            logger.error("No audioContent in Inworld API response")
+            await self.push_error(ErrorFrame("No audioContent in response"))
+            return
 
-        # Add optional temperature parameter if configured (valid range: [0, 2])
-        if "temperature" in self._settings:
-            payload["temperature"] = self._settings["temperature"]
+        # ================================================================================
+        # DECODE AND PROCESS COMPLETE AUDIO DATA
+        # ================================================================================
+        # Decode the base64 audio data to binary
+        audio_data = base64.b64decode(response_data["audioContent"])
 
-        # Set up HTTP headers for authentication and content type
-        # Inworld requires Basic auth with base64-encoded API key
-        headers = {
-            "Authorization": f"Basic {self._api_key}",  # Base64 API key from Inworld Portal
-            "Content-Type": "application/json",  # JSON request body
-        }
+        # Strip WAV header if present (Inworld may include WAV header)
+        # This prevents audio clicks and ensures clean audio playback
+        if len(audio_data) > 44 and audio_data.startswith(b"RIFF"):
+            audio_data = audio_data[44:]
 
-        try:
-            # ================================================================================
-            # STEP 2: INITIALIZE METRICS AND STREAMING
-            # ================================================================================
-            # Start measuring Time To First Byte (TTFB) for performance tracking
-            await self.start_ttfb_metrics()
+        # ================================================================================
+        # CHUNK AND YIELD COMPLETE AUDIO FOR PLAYBACK
+        # ================================================================================
+        # Chunk the complete audio for streaming playback
+        # This allows the pipeline to process audio in manageable pieces
+        CHUNK_SIZE = self.chunk_size
 
-            # Signal to the pipeline that TTS generation has started
-            # This allows downstream processors to prepare for incoming audio
-            yield TTSStartedFrame()
+        for i in range(0, len(audio_data), CHUNK_SIZE):
+            chunk = audio_data[i : i + CHUNK_SIZE]
+            if len(chunk) > 0:
+                await self.stop_ttfb_metrics()
+                yield TTSAudioRawFrame(
+                    audio=chunk,
+                    sample_rate=self.sample_rate,
+                    num_channels=1,
+                )
 
-            # ================================================================================
-            # STEP 3: MAKE HTTP NON-STREAMING REQUEST
-            # ================================================================================
-            # Make single HTTP POST request to Inworld's non-streaming endpoint
-            # This endpoint returns complete audio as base64-encoded data
-            # Create session if none was provided
-            if self._session:
-                session = self._session
-            else:
-                session = aiohttp.ClientSession()
+    async def _process_audio_chunk(self, audio_chunk: bytes) -> AsyncGenerator[Frame, None]:
+        """Process a single audio chunk (common logic for both modes).
 
-            async with (
-                session
-                if not self._session
-                else session.post(
-                    self._base_url, json=payload, headers=headers
-                ) as context_or_response
-            ):
-                if self._session:
-                    response = context_or_response
-                else:
-                    async with context_or_response.post(
-                        self._base_url, json=payload, headers=headers
-                    ) as response:
-                        # ================================================================
-                        # STEP 4: HANDLE HTTP ERRORS
-                        # ================================================================
-                        # Check for API errors (expired keys, invalid requests, etc.)
-                        if response.status != 200:
-                            error_text = await response.text()
-                            logger.error(f"Inworld API error: {error_text}")
-                            await self.push_error(ErrorFrame(f"Inworld API error: {error_text}"))
-                            return
+        This method handles audio chunk processing that's common to both streaming
+        and non-streaming modes:
+        - WAV header removal
+        - Audio validation
+        - Frame creation and yielding
 
-                        # ================================================================
-                        # STEP 5: PARSE COMPLETE JSON RESPONSE
-                        # ================================================================
-                        # Parse the complete JSON response containing base64 audio data
-                        response_data = await response.json()
+        Args:
+            audio_chunk: Raw audio data bytes to process.
 
-                        # ================================================================
-                        # STEP 6: EXTRACT AND VALIDATE AUDIO CONTENT
-                        # ================================================================
-                        # Extract the base64-encoded audio content from response
-                        if "audioContent" not in response_data:
-                            logger.error("No audioContent in Inworld API response")
-                            await self.push_error(ErrorFrame("No audioContent in response"))
-                            return
+        Yields:
+            Frame: Audio frame if chunk contains valid audio data.
+        """
+        # ========================================================
+        # AUDIO DATA VALIDATION
+        # ========================================================
+        # Skip empty audio chunks that could cause discontinuities
+        # Empty chunks can create gaps or clicks in audio playback
+        if not audio_chunk:
+            return
 
-                        # ================================================================
-                        # STEP 7: DECODE AND PROCESS AUDIO DATA
-                        # ================================================================
-                        # Decode the base64 audio data to binary
-                        audio_data = base64.b64decode(response_data["audioContent"])
+        # Start with the raw audio data
+        audio_data = audio_chunk
 
-                        # Strip WAV header if present (Inworld may include WAV header)
-                        # This prevents audio clicks and ensures clean audio playback
-                        if len(audio_data) > 44 and audio_data.startswith(b"RIFF"):
-                            audio_data = audio_data[44:]
+        # ========================================================
+        # WAV HEADER REMOVAL (CRITICAL FOR AUDIO QUALITY)
+        # ========================================================
+        # Each audio chunk may have its own WAV header (44 bytes)
+        # These headers contain metadata and will sound like clicks if played
+        # We must strip them from EVERY chunk, not just the first one
+        if (
+            len(audio_chunk) > 44  # Ensure chunk is large enough
+            and audio_chunk.startswith(b"RIFF")  # Check for WAV header magic bytes
+        ):
+            # Remove the 44-byte WAV header to get pure audio data
+            audio_data = audio_chunk[44:]
 
-                        # ================================================================
-                        # STEP 8: START USAGE METRICS TRACKING
-                        # ================================================================
-                        await self.start_tts_usage_metrics(text)
-
-                        # ================================================================
-                        # STEP 9: CHUNK AND YIELD AUDIO FOR PLAYBACK
-                        # ================================================================
-                        # Chunk the complete audio for streaming playback
-                        # This allows the pipeline to process audio in manageable pieces
-                        CHUNK_SIZE = self.chunk_size
-
-                        for i in range(0, len(audio_data), CHUNK_SIZE):
-                            chunk = audio_data[i : i + CHUNK_SIZE]
-                            if len(chunk) > 0:
-                                await self.stop_ttfb_metrics()
-                                yield TTSAudioRawFrame(
-                                    audio=chunk,
-                                    sample_rate=self.sample_rate,
-                                    num_channels=1,
-                                )
-
-                if self._session:
-                    # Handle HTTP errors
-                    if response.status != 200:
-                        error_text = await response.text()
-                        logger.error(f"Inworld API error: {error_text}")
-                        await self.push_error(ErrorFrame(f"Inworld API error: {error_text}"))
-                        return
-
-                    # Parse the complete JSON response
-                    response_data = await response.json()
-
-                    # Extract the base64-encoded audio content
-                    if "audioContent" not in response_data:
-                        logger.error("No audioContent in Inworld API response")
-                        await self.push_error(ErrorFrame("No audioContent in response"))
-                        return
-
-                    # Decode the base64 audio data
-                    audio_data = base64.b64decode(response_data["audioContent"])
-
-                    # Strip WAV header if present (Inworld may include WAV header)
-                    if len(audio_data) > 44 and audio_data.startswith(b"RIFF"):
-                        audio_data = audio_data[44:]
-
-                    await self.start_tts_usage_metrics(text)
-
-                    # Chunk the complete audio for streaming playback
-                    CHUNK_SIZE = self.chunk_size
-
-                    for i in range(0, len(audio_data), CHUNK_SIZE):
-                        chunk = audio_data[i : i + CHUNK_SIZE]
-                        if len(chunk) > 0:
-                            await self.stop_ttfb_metrics()
-                            yield TTSAudioRawFrame(
-                                audio=chunk,
-                                sample_rate=self.sample_rate,
-                                num_channels=1,
-                            )
-
-        except Exception as e:
-            # ================================================================================
-            # STEP 10: ERROR HANDLING
-            # ================================================================================
-            # Log any unexpected errors and notify the pipeline
-            logger.error(f"{self} exception: {e}")
-            await self.push_error(ErrorFrame(f"Error generating TTS: {e}"))
-        finally:
-            # ================================================================================
-            # STEP 11: CLEANUP AND COMPLETION
-            # ================================================================================
-            # Always stop metrics tracking, even if errors occurred
-            await self.stop_ttfb_metrics()
-
-            # Signal to pipeline that TTS generation is complete
-            # This allows downstream processors to finalize audio processing
-            yield TTSStoppedFrame()
+        # ========================================================
+        # YIELD AUDIO FRAME TO PIPELINE
+        # ========================================================
+        # Only yield frames with actual audio content
+        # Empty frames can cause pipeline issues
+        if len(audio_data) > 0:
+            # Create Pipecat audio frame with processed audio data
+            yield TTSAudioRawFrame(
+                audio=audio_data,  # Clean audio without headers
+                sample_rate=self.sample_rate,  # Configured sample rate (48kHz)
+                num_channels=1,  # Mono audio
+            )

From da8c67114acd8a79f1a1f298f1a685c26ff5bad2 Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Thu, 24 Jul 2025 13:35:29 -0700
Subject: [PATCH 23/38] mtpadilla: make streaming the default for example

---
 examples/foundational/07aa-interruptible-inworld-http.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/foundational/07aa-interruptible-inworld-http.py b/examples/foundational/07aa-interruptible-inworld-http.py
index dbfbcc878..53e3c14b3 100644
--- a/examples/foundational/07aa-interruptible-inworld-http.py
+++ b/examples/foundational/07aa-interruptible-inworld-http.py
@@ -60,7 +60,7 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si
 
         # Inworld TTS Service - Unified streaming and non-streaming
         # Set streaming=True for real-time audio, streaming=False for complete audio generation
-        streaming = False  # Toggle this to switch between modes
+        streaming = True  # Toggle this to switch between modes
 
         tts = InworldTTSService(
             api_key=os.getenv("INWORLD_API_KEY", ""),

From f6440ee6e174fc4eb9d60ff7ed1cb8ec21bccbac Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Thu, 24 Jul 2025 13:36:40 -0700
Subject: [PATCH 24/38] mtpadilla: correct Examples header in comments

---
 src/pipecat/services/inworld/tts.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
index 94ef5aa32..64d5d9fe9 100644
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -27,7 +27,7 @@ Technical Implementation:
 - Manages audio continuity to prevent clicks and artifacts
 - Integrates with Pipecat's frame-based pipeline architecture
 
-Usage::
+Examples::
 
     async with aiohttp.ClientSession() as session:
         # Streaming mode (default) - real-time audio generation
@@ -120,7 +120,7 @@ class InworldTTSService(TTSService):
     - Language Detection: Automatically inferred from input text (no explicit language setting required)
     - Mode Selection: streaming=True for real-time, streaming=False for complete synthesis
 
-    Example Usage::
+    Examples::
 
         async with aiohttp.ClientSession() as session:
             # Streaming mode (default) - Real-time audio generation

From 81048ce43a78ee112b44d1a23ac9823f7976057e Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Thu, 24 Jul 2025 20:42:29 -0700
Subject: [PATCH 25/38] mtpadilla: rename 07aa-interruptible-inworld-http.py to
 07ab-interruptible-inworld-http.py

---
 ...uptible-inworld-http.py => 07ab-interruptible-inworld-http.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename examples/foundational/{07aa-interruptible-inworld-http.py => 07ab-interruptible-inworld-http.py} (100%)

diff --git a/examples/foundational/07aa-interruptible-inworld-http.py b/examples/foundational/07ab-interruptible-inworld-http.py
similarity index 100%
rename from examples/foundational/07aa-interruptible-inworld-http.py
rename to examples/foundational/07ab-interruptible-inworld-http.py

From 067f64389bef74a23d15904a4dc53d1b41b57cff Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Thu, 24 Jul 2025 20:44:27 -0700
Subject: [PATCH 26/38] mtpadilla: no longer needed so making empty

---
 src/pipecat/services/inworld/__init__.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/src/pipecat/services/inworld/__init__.py b/src/pipecat/services/inworld/__init__.py
index 910364d1b..8b1378917 100644
--- a/src/pipecat/services/inworld/__init__.py
+++ b/src/pipecat/services/inworld/__init__.py
@@ -1,13 +1 @@
-#
-# Copyright (c) 2024–2025, Daily
-#
-# SPDX-License-Identifier: BSD 2-Clause License
-#
 
-# import sys
-
-# from pipecat.services import DeprecatedModuleProxy
-
-# from .tts import *
-
-# sys.modules[__name__] = DeprecatedModuleProxy(globals(), "inworld", "inworld.tts")

From 662550cc5ea939c4864f0be2c88fcdcda335ffbe Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Thu, 24 Jul 2025 21:05:22 -0700
Subject: [PATCH 27/38] mtpadilla: remove unused imports

---
 src/pipecat/services/inworld/tts.py | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
index 64d5d9fe9..6eac58e49 100644
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -56,15 +56,12 @@ Examples::
 """
 
 import base64
-import io
 import json
-import uuid
-import warnings
-from typing import AsyncGenerator, List, Optional, Union
+from typing import AsyncGenerator, Optional
 
 import aiohttp
 from loguru import logger
-from pydantic import BaseModel, Field
+from pydantic import BaseModel
 
 from pipecat.frames.frames import (
     CancelFrame,
@@ -72,16 +69,11 @@ from pipecat.frames.frames import (
     ErrorFrame,
     Frame,
     StartFrame,
-    StartInterruptionFrame,
     TTSAudioRawFrame,
     TTSStartedFrame,
     TTSStoppedFrame,
 )
-from pipecat.processors.frame_processor import FrameDirection
-from pipecat.services.tts_service import AudioContextWordTTSService, TTSService
-from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator
-from pipecat.utils.text.base_text_aggregator import BaseTextAggregator
-from pipecat.utils.text.skip_tags_aggregator import SkipTagsAggregator
+from pipecat.services.tts_service import TTSService
 from pipecat.utils.tracing.service_decorators import traced_tts
 
 

From d248c102c873d5933c4048cb1a12c384ec65b2ba Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Thu, 24 Jul 2025 21:15:20 -0700
Subject: [PATCH 28/38] inworld: removal of unnecessary default assignment
 since already done

Co-authored-by: Mark Backman <m.backman@gmail.com>
---
 src/pipecat/services/inworld/tts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
index 6eac58e49..b89b21d4a 100644
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -227,7 +227,7 @@ class InworldTTSService(TTSService):
         # This will be sent as JSON payload in each TTS request
         # Note: Language is automatically inferred from text by Inworld's models
         self._settings = {
-            "voiceId": params.voice_id or "Ashley",  # Voice selection from params
+            "voiceId": params.voice_id,  # Voice selection from params
             "modelId": params.model or "inworld-tts-1",  # TTS model selection from params
             "audio_config": {  # Audio format configuration
                 "audio_encoding": encoding,  # Format: LINEAR16, MP3, etc.

From 16c20f3a997b48c98ed83570543f2071412f08c4 Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Thu, 24 Jul 2025 21:15:34 -0700
Subject: [PATCH 29/38] inworld: removal of unnecessary default assignment
 since already done

Co-authored-by: Mark Backman <m.backman@gmail.com>
---
 src/pipecat/services/inworld/tts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
index b89b21d4a..ce7e7592d 100644
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -228,7 +228,7 @@ class InworldTTSService(TTSService):
         # Note: Language is automatically inferred from text by Inworld's models
         self._settings = {
             "voiceId": params.voice_id,  # Voice selection from params
-            "modelId": params.model or "inworld-tts-1",  # TTS model selection from params
+            "modelId": params.model,  # TTS model selection from params
             "audio_config": {  # Audio format configuration
                 "audio_encoding": encoding,  # Format: LINEAR16, MP3, etc.
                 "sample_rate_hertz": 0,  # Will be set in start() from parent service

From 7483422bd9dace032b4fa1fb6af713e4d4f0942e Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Thu, 24 Jul 2025 21:23:03 -0700
Subject: [PATCH 30/38] inworld: change set_voice to use self._settings

Co-authored-by: Mark Backman <m.backman@gmail.com>
---
 src/pipecat/services/inworld/tts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
index ce7e7592d..f664479b8 100644
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -240,7 +240,7 @@ class InworldTTSService(TTSService):
             self._settings["temperature"] = params.temperature
 
         # Register voice and model with parent service for metrics and tracking
-        self.set_voice(params.voice_id or "Ashley")  # Used for logging and metrics
+        self.set_voice(self._settings["voice_id"])  # Used for logging and metrics
         self.set_model_name(params.model or "inworld-tts-1")  # Used for performance tracking
 
     def can_generate_metrics(self) -> bool:

From 5fb1899aeb54c2567ba17440a4689ec2ac79272f Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Thu, 24 Jul 2025 21:42:42 -0700
Subject: [PATCH 31/38] inworld: removal of unnecessary default assignment as
 already handled

---
 src/pipecat/services/inworld/tts.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
index f664479b8..c26b0b15c 100644
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -240,8 +240,8 @@ class InworldTTSService(TTSService):
             self._settings["temperature"] = params.temperature
 
         # Register voice and model with parent service for metrics and tracking
-        self.set_voice(self._settings["voice_id"])  # Used for logging and metrics
-        self.set_model_name(params.model or "inworld-tts-1")  # Used for performance tracking
+        self.set_voice(self._settings["voiceId"])  # Used for logging and metrics
+        self.set_model_name(self._settings["modelId"])  # Used for performance tracking
 
     def can_generate_metrics(self) -> bool:
         """Check if this service can generate processing metrics.

From f982ace4c5ad1e4d86a222997eeead189c1e1289 Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Thu, 24 Jul 2025 21:56:01 -0700
Subject: [PATCH 32/38] inworld: removal of unnecessary setting of sampling
 rate since matches default

---
 examples/foundational/07ab-interruptible-inworld-http.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/foundational/07ab-interruptible-inworld-http.py b/examples/foundational/07ab-interruptible-inworld-http.py
index 53e3c14b3..25d05cefe 100644
--- a/examples/foundational/07ab-interruptible-inworld-http.py
+++ b/examples/foundational/07ab-interruptible-inworld-http.py
@@ -100,7 +100,6 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si
         task = PipelineTask(
             pipeline,
             params=PipelineParams(
-                audio_out_sample_rate=24000,
                 enable_metrics=True,
                 enable_usage_metrics=True,
             ),

From acc5b9f2102c85dcd6aebc43796918cf2e7685fa Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Thu, 24 Jul 2025 22:07:15 -0700
Subject: [PATCH 33/38] inworld: change to function that stops all processing
 metrics

Co-authored-by: Mark Backman <m.backman@gmail.com>
---
 src/pipecat/services/inworld/tts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
index c26b0b15c..68ada9720 100644
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -400,7 +400,7 @@ class InworldTTSService(TTSService):
             # STEP 8: CLEANUP AND COMPLETION
             # ================================================================================
             # Always stop metrics tracking, even if errors occurred
-            await self.stop_ttfb_metrics()
+            await self.stop_all_metrics()
 
             # Signal to pipeline that TTS generation is complete
             # This allows downstream processors to finalize audio processing

From 8e6679475962f9cca38a836968050c1421bbb26a Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Thu, 24 Jul 2025 22:22:36 -0700
Subject: [PATCH 34/38] mtpadilla: switch to Deepgram ASR for lower latency

---
 examples/foundational/07ab-interruptible-inworld-http.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/examples/foundational/07ab-interruptible-inworld-http.py b/examples/foundational/07ab-interruptible-inworld-http.py
index 25d05cefe..e0ae9f15c 100644
--- a/examples/foundational/07ab-interruptible-inworld-http.py
+++ b/examples/foundational/07ab-interruptible-inworld-http.py
@@ -16,9 +16,9 @@ from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
+from pipecat.services.deepgram.stt import DeepgramSTTService
 from pipecat.services.inworld.tts import InworldTTSService
 from pipecat.services.openai.llm import OpenAILLMService
-from pipecat.services.openai.stt import OpenAISTTService
 from pipecat.transports.base_transport import BaseTransport, TransportParams
 from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
 from pipecat.transports.services.daily import DailyParams
@@ -52,11 +52,7 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si
 
     # Create an HTTP session
     async with aiohttp.ClientSession() as session:
-        stt = OpenAISTTService(
-            api_key=os.getenv("OPENAI_API_KEY"),
-            model="gpt-4o-transcribe",
-            prompt="Expect words related to dogs, such as breed names.",
-        )
+        stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
 
         # Inworld TTS Service - Unified streaming and non-streaming
         # Set streaming=True for real-time audio, streaming=False for complete audio generation

From 37361391d9b896bc47941fd284b7ec7079e60597 Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Fri, 25 Jul 2025 09:16:56 -0700
Subject: [PATCH 35/38] mtpadilla: removed ability to set base_url via
 constructor, set internally based on streaming variable

---
 src/pipecat/services/inworld/tts.py | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
index 68ada9720..6a3d6aa46 100644
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -164,7 +164,6 @@ class InworldTTSService(TTSService):
         api_key: str,
         aiohttp_session: aiohttp.ClientSession,
         streaming: bool = True,
-        base_url: Optional[str] = None,
         sample_rate: Optional[int] = None,
         encoding: str = "LINEAR16",
         params: Optional[InputParams] = None,
@@ -183,10 +182,9 @@ class InworldTTSService(TTSService):
             streaming: Whether to use streaming mode (True) or non-streaming mode (False).
                       - True: Real-time audio chunks as they're generated (lower latency)
                       - False: Complete audio file generated first, then chunked for playback (simpler)
-            base_url: Base URL for Inworld HTTP API. If None, automatically selected based on streaming mode:
-                     - Streaming: "https://api.inworld.ai/tts/v1/voice:stream"
-                     - Non-streaming: "https://api.inworld.ai/tts/v1/voice"
-                     Should normally not be changed unless using a different environment.
+                      The base URL is automatically selected based on this mode:
+                      - Streaming: "https://api.inworld.ai/tts/v1/voice:stream"
+                      - Non-streaming: "https://api.inworld.ai/tts/v1/voice"
             sample_rate: Audio sample rate in Hz. If None, uses default from StartFrame.
                         Common values: 48000 (high quality), 24000 (good quality), 16000 (basic)
             encoding: Audio encoding format. Supported options:
@@ -215,13 +213,11 @@ class InworldTTSService(TTSService):
         self._session = aiohttp_session  # HTTP session for requests
         self._streaming = streaming  # Streaming mode selection
 
-        # Set base URL based on streaming mode if not provided
-        if base_url is None:
-            if streaming:
-                base_url = "https://api.inworld.ai/tts/v1/voice:stream"  # Streaming endpoint
-            else:
-                base_url = "https://api.inworld.ai/tts/v1/voice"  # Non-streaming endpoint
-        self._base_url = base_url  # API endpoint URL
+        # Set base URL based on streaming mode
+        if streaming:
+            self._base_url = "https://api.inworld.ai/tts/v1/voice:stream"  # Streaming endpoint
+        else:
+            self._base_url = "https://api.inworld.ai/tts/v1/voice"  # Non-streaming endpoint
 
         # Build settings dictionary that matches Inworld's API expectations
         # This will be sent as JSON payload in each TTS request

From 4a9bec5b353c9194a7deb3a58ef69da44a920a37 Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Fri, 25 Jul 2025 11:14:20 -0700
Subject: [PATCH 36/38] mtpadilla: stop TTFB metrics at result chunk

---
 src/pipecat/services/inworld/tts.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
index 6a3d6aa46..3e3004c5a 100644
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -466,6 +466,7 @@ class InworldTTSService(TTSService):
                     # Inworld's response format: {"result": {"audioContent": "base64data"}}
                     if "result" in chunk_data and "audioContent" in chunk_data["result"]:
                         # Process the audio chunk
+                        await self.stop_ttfb_metrics()
                         async for frame in self._process_audio_chunk(
                             base64.b64decode(chunk_data["result"]["audioContent"])
                         ):

From e140bd6960f49fe7261fd06467097bf762e99eab Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Fri, 25 Jul 2025 14:04:49 -0700
Subject: [PATCH 37/38] mtpadilla: moved model and voice id setting into the
 class constructor

---
 src/pipecat/services/inworld/tts.py | 49 +++++++++++++++--------------
 1 file changed, 26 insertions(+), 23 deletions(-)

diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
index 3e3004c5a..fdd0d1a5c 100644
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -34,10 +34,10 @@ Examples::
         tts = InworldTTSService(
             api_key=os.getenv("INWORLD_API_KEY"),
             aiohttp_session=session,
+            voice_id="Ashley",
+            model="inworld-tts-1",
             streaming=True,  # Default
             params=InworldTTSService.InputParams(
-                voice_id="Ashley",
-                model="inworld-tts-1",
                 temperature=0.8,  # Optional: control synthesis variability (range: [0, 2])
             ),
         )
@@ -46,10 +46,10 @@ Examples::
         tts = InworldTTSService(
             api_key=os.getenv("INWORLD_API_KEY"),
             aiohttp_session=session,
+            voice_id="Ashley",
+            model="inworld-tts-1",
             streaming=False,
             params=InworldTTSService.InputParams(
-                voice_id="Ashley",
-                model="inworld-tts-1",
                 temperature=0.8,
             ),
         )
@@ -119,10 +119,10 @@ class InworldTTSService(TTSService):
             tts_streaming = InworldTTSService(
                 api_key=os.getenv("INWORLD_API_KEY"),
                 aiohttp_session=session,
+                voice_id="Ashley",
+                model="inworld-tts-1",
                 streaming=True,  # Default behavior
                 params=InworldTTSService.InputParams(
-                    voice_id="Ashley",
-                    model="inworld-tts-1",
                     temperature=0.8,  # Add variability to speech synthesis (range: [0, 2])
                 ),
             )
@@ -131,21 +131,19 @@ class InworldTTSService(TTSService):
             tts_complete = InworldTTSService(
                 api_key=os.getenv("INWORLD_API_KEY"),
                 aiohttp_session=session,
+                voice_id="Hades",
+                model="inworld-tts-1-max",
                 streaming=False,
                 params=InworldTTSService.InputParams(
-                    voice_id="Hades",
-                    model="inworld-tts-1-max",
                     temperature=0.8,
                 ),
             )
     """
 
     class InputParams(BaseModel):
-        """Input parameters for Inworld TTS configuration.
+        """Optional input parameters for Inworld TTS configuration.
 
         Parameters:
-            voice_id: Voice selection for speech synthesis (e.g., "Ashley", "Hades").
-            model: TTS model to use (e.g., "inworld-tts-1", "inworld-tts-1-max").
             temperature: Voice temperature control for synthesis variability (e.g., 0.8).
                         Valid range: [0, 2]. Higher values increase variability.
 
@@ -154,8 +152,6 @@ class InworldTTSService(TTSService):
             so no explicit language parameter is required.
         """
 
-        voice_id: Optional[str] = "Ashley"  # defaults to the Ashley voice
-        model: Optional[str] = "inworld-tts-1"  # defaults to the inworld-tts-1 model
         temperature: Optional[float] = None  # optional temperature control (range: [0, 2])
 
     def __init__(
@@ -163,6 +159,8 @@ class InworldTTSService(TTSService):
         *,
         api_key: str,
         aiohttp_session: aiohttp.ClientSession,
+        voice_id: str = "Ashley",
+        model: str = "inworld-tts-1",
         streaming: bool = True,
         sample_rate: Optional[int] = None,
         encoding: str = "LINEAR16",
@@ -179,6 +177,14 @@ class InworldTTSService(TTSService):
                     Get this from: Inworld Portal > Settings > API Keys > Runtime API Key
             aiohttp_session: Shared aiohttp session for HTTP requests. Must be provided
                            for proper connection pooling and resource management.
+            voice_id: Voice selection for speech synthesis. Common options include:
+                     - "Ashley": Clear, professional female voice (default)
+                     - "Hades": Deep, authoritative male voice
+                     - And many more available in your Inworld account
+            model: TTS model to use for speech synthesis:
+                  - "inworld-tts-1": Standard quality model (default)
+                  - "inworld-tts-1-max": Higher quality model
+                  - Other models as available in your Inworld account
             streaming: Whether to use streaming mode (True) or non-streaming mode (False).
                       - True: Real-time audio chunks as they're generated (lower latency)
                       - False: Complete audio file generated first, then chunked for playback (simpler)
@@ -190,12 +196,9 @@ class InworldTTSService(TTSService):
             encoding: Audio encoding format. Supported options:
                      - "LINEAR16" (default) - Uncompressed PCM, best quality
                      - Other formats as supported by Inworld API
-            params: Input parameters for voice and model configuration. Use this to specify:
-                   - voice_id: Voice selection ("Ashley", "Hades", etc.)
-                   - model: TTS model ("inworld-tts-1", "inworld-tts-1-max", etc.)
+            params: Optional input parameters for additional configuration. Use this to specify:
                    - temperature: Voice temperature control for variability (range: [0, 2], e.g., 0.8, optional)
-                   If None, uses default values (Ashley voice, inworld-tts-1 model).
-                   Note: Language is automatically inferred from input text.
+                   Language is automatically inferred from input text.
             **kwargs: Additional arguments passed to the parent TTSService class.
 
         Note:
@@ -223,8 +226,8 @@ class InworldTTSService(TTSService):
         # This will be sent as JSON payload in each TTS request
         # Note: Language is automatically inferred from text by Inworld's models
         self._settings = {
-            "voiceId": params.voice_id,  # Voice selection from params
-            "modelId": params.model,  # TTS model selection from params
+            "voiceId": voice_id,  # Voice selection from direct parameter
+            "modelId": model,  # TTS model selection from direct parameter
             "audio_config": {  # Audio format configuration
                 "audio_encoding": encoding,  # Format: LINEAR16, MP3, etc.
                 "sample_rate_hertz": 0,  # Will be set in start() from parent service
@@ -232,12 +235,12 @@ class InworldTTSService(TTSService):
         }
 
         # Add optional temperature parameter if provided (valid range: [0, 2])
-        if params.temperature is not None:
+        if params and params.temperature is not None:
             self._settings["temperature"] = params.temperature
 
         # Register voice and model with parent service for metrics and tracking
-        self.set_voice(self._settings["voiceId"])  # Used for logging and metrics
-        self.set_model_name(self._settings["modelId"])  # Used for performance tracking
+        self.set_voice(voice_id)  # Used for logging and metrics
+        self.set_model_name(model)  # Used for performance tracking
 
     def can_generate_metrics(self) -> bool:
         """Check if this service can generate processing metrics.

From b68f044ef7d73733aa057074ffa4cc2ff2c2c751 Mon Sep 17 00:00:00 2001
From: padillamt <michael.padilla@inworld.ai>
Date: Fri, 25 Jul 2025 15:13:43 -0700
Subject: [PATCH 38/38] mtpadilla: updated example to reflect parameter
 placement changes in base Inworld TTS class

---
 examples/foundational/07ab-interruptible-inworld-http.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/foundational/07ab-interruptible-inworld-http.py b/examples/foundational/07ab-interruptible-inworld-http.py
index e0ae9f15c..5d559ba5a 100644
--- a/examples/foundational/07ab-interruptible-inworld-http.py
+++ b/examples/foundational/07ab-interruptible-inworld-http.py
@@ -61,10 +61,10 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si
         tts = InworldTTSService(
             api_key=os.getenv("INWORLD_API_KEY", ""),
             aiohttp_session=session,
+            voice_id="Ashley",
+            model="inworld-tts-1",
             streaming=streaming,  # True: real-time chunks, False: complete audio then playback
             params=InworldTTSService.InputParams(
-                voice_id="Ashley",
-                model="inworld-tts-1",
                 temperature=0.8,
             ),
         )