From 7861b911c0dace27240505efce69a47c0083c50b Mon Sep 17 00:00:00 2001 From: padillamt Date: Tue, 15 Jul 2025 16:50:50 -0700 Subject: [PATCH 01/38] inworld: first commit of __init__ and tts.py files --- src/pipecat/services/inworld/__init__.py | 13 ++ src/pipecat/services/inworld/tts.py | 265 +++++++++++++++++++++++ 2 files changed, 278 insertions(+) create mode 100644 src/pipecat/services/inworld/__init__.py create mode 100644 src/pipecat/services/inworld/tts.py diff --git a/src/pipecat/services/inworld/__init__.py b/src/pipecat/services/inworld/__init__.py new file mode 100644 index 000000000..9717eb163 --- /dev/null +++ b/src/pipecat/services/inworld/__init__.py @@ -0,0 +1,13 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import sys + +from pipecat.services import DeprecatedModuleProxy + +from .tts import * + +sys.modules[__name__] = DeprecatedModuleProxy(globals(), "inworld", "inworld.tts") diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py new file mode 100644 index 000000000..b938fb2a0 --- /dev/null +++ b/src/pipecat/services/inworld/tts.py @@ -0,0 +1,265 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +"""Inworld's text-to-speech service implementations.""" + +import base64 +import json +import uuid +import warnings +from typing import AsyncGenerator, List, Optional, Union + +import aiohttp +from loguru import logger +from pydantic import BaseModel, Field +import io, json, base64 + +from pipecat.frames.frames import ( + CancelFrame, + EndFrame, + ErrorFrame, + Frame, + StartFrame, + StartInterruptionFrame, + TTSAudioRawFrame, + TTSStartedFrame, + TTSStoppedFrame, +) +from pipecat.processors.frame_processor import FrameDirection +from pipecat.services.tts_service import AudioContextWordTTSService, TTSService +from pipecat.transcriptions.language import Language +from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator +from pipecat.utils.text.base_text_aggregator import BaseTextAggregator +from pipecat.utils.text.skip_tags_aggregator import SkipTagsAggregator +from pipecat.utils.tracing.service_decorators import traced_tts + + +def language_to_inworld_language(language: Language) -> Optional[str]: + """Convert Pipecat's Language enum to Inworld's language code. + + Args: + language: The Language enum value to convert. + + Returns: + The corresponding Inworld language code, or None if not supported. + """ + BASE_LANGUAGES = { + Language.EN: "en", + Language.ES: "es", + Language.FR: "fr", + Language.KO: "ko", + Language.NL: "nl", + Language.ZH: "zh", + } + + result = BASE_LANGUAGES.get(language) + + # If not found in base languages, try to find the base language from a variant + if not result: + # Convert enum value to string and get the base language part (e.g. es-ES -> es) + lang_str = str(language.value) + base_code = lang_str.split("-")[0].lower() + # Look up the base code in our supported languages + result = base_code if base_code in BASE_LANGUAGES.values() else None + + return result + + +class InworldTTSService(TTSService): + """Inworld HTTP-based TTS service. + + Provides text-to-speech using Inworld's HTTP API for simpler, non-streaming + synthesis. Suitable for use cases where streaming is not required and simpler + integration is preferred. + """ + + class InputParams(BaseModel): + """Input parameters for Inworld HTTP TTS configuration. + + Parameters: + language: Language to use for synthesis. + speed: Voice speed control (string or float). + emotion: List of emotion controls. + + .. deprecated:: 0.0.68 + The `emotion` parameter is deprecated and will be removed in a future version. + """ + + language: Optional[Language] = Language.EN + voice_id: str = "Ashley" + + def __init__( + self, + *, + api_key: str, + aiohttp_session: aiohttp.ClientSession, + model: str = "inworld-tts-1", + base_url: str = "https://api.inworld.ai/tts/v1/voice:stream", + sample_rate: Optional[int] = 48000, + encoding: str = "LINEAR16", + params: Optional[InputParams] = None, + **kwargs, + ): + """Initialize the Inworld HTTP TTS service. + + Args: + api_key: Inworld API key for authentication. + aiohttp_session: Shared aiohttp session for HTTP requests. + voice_id: ID of the voice to use for synthesis. + model: TTS model to use (e.g., "sonic-2"). + endpoint_url: Base URL for Inworld HTTP API. + sample_rate: Audio sample rate. If None, uses default. + encoding: Audio encoding format. + params: Additional input parameters for voice customization. + **kwargs: Additional arguments passed to the parent TTSService. + """ + super().__init__(sample_rate=sample_rate, **kwargs) + + params = params or InworldTTSService.InputParams() + + self._api_key = api_key + self._session = aiohttp_session + self._base_url = base_url + self._settings = { + "voiceId": params.voice_id, + "modelId": model, + "audio_config": { + "audio_encoding": encoding, + "sample_rate_hertz": sample_rate, + }, + "language": self.language_to_service_language(params.language) + if params.language + else "en", + } + self.set_voice(params.voice_id) + self.set_model_name(model) + + + def can_generate_metrics(self) -> bool: + """Check if this service can generate processing metrics. + + Returns: + True, as Inworld HTTP service supports metrics generation. + """ + return True + + def language_to_service_language(self, language: Language) -> Optional[str]: + """Convert a Language enum to Inworld language format. + + Args: + language: The language to convert. + + Returns: + The Inworld-specific language code, or None if not supported. + """ + return language_to_inworld_language(language) + + async def start(self, frame: StartFrame): + """Start the Inworld HTTP TTS service. + + Args: + frame: The start frame containing initialization parameters. + """ + await super().start(frame) + self._settings["audio_config"]["sample_rate_hertz"] = self.sample_rate + + async def stop(self, frame: EndFrame): + """Stop the Inworld HTTP TTS service. + + Args: + frame: The end frame. + """ + await super().stop(frame) + # await self._client.close() + + async def cancel(self, frame: CancelFrame): + """Cancel the Inworld HTTP TTS service. + + Args: + frame: The cancel frame. + """ + await super().cancel(frame) + # await self._client.close() + + @traced_tts + async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: + """Generate speech from text using Inworld's HTTP API. + + Args: + text: The text to synthesize into speech. + + Yields: + Frame: Audio frames containing the synthesized speech. + """ + logger.debug(f"{self}: Generating TTS [{text}]") + + payload = { + "text": text, + "voiceId": self._settings["voiceId"], + "modelId": self._settings["modelId"], + "audio_config": self._settings["audio_config"], + "language": self._settings["language"], + } + + headers = { + "Authorization": f"Basic {self._api_key}", + "Content-Type": "application/json", + } + + try: + await self.start_ttfb_metrics() + + yield TTSStartedFrame() + + async with self._session.post(self._base_url, json=payload, headers=headers) as response: + if response.status != 200: + error_text = await response.text() + logger.error(f"Inworld API error: {error_text}") + await self.push_error(ErrorFrame(f"Inworld API error: {error_text}")) + return + + raw_audio_data = io.BytesIO() + + async for line in response.content.iter_lines(): + line_str = line.decode('utf-8').strip() + if not line_str: + continue + + try: + chunk = json.loads(line_str) + if "result" in chunk and "audioContent" in chunk["result"]: + audio_chunk = base64.b64decode(chunk["result"]["audioContent"]) + # Skip WAV header if present (first 44 bytes) + if len(audio_chunk) > 44 and audio_chunk.startswith(b"RIFF"): + audio_data = audio_chunk[44:] + else: + audio_data = audio_chunk + raw_audio_data.write(audio_data) + except json.JSONDecodeError: + continue + + await self.start_tts_usage_metrics(text) + + audio_bytes = raw_audio_data.getvalue() + if not audio_bytes: + logger.error("No audio data received from Inworld API") + await self.push_error(ErrorFrame("No audio data received")) + return + + frame = TTSAudioRawFrame( + audio=audio_bytes, + sample_rate=self.sample_rate, + num_channels=1, + ) + + yield frame + + except Exception as e: + logger.error(f"{self} exception: {e}") + await self.push_error(ErrorFrame(f"Error generating TTS: {e}")) + finally: + await self.stop_ttfb_metrics() + yield TTSStoppedFrame() From 384838147adfece71b273c5010d6e770b7d64a5a Mon Sep 17 00:00:00 2001 From: padillamt Date: Tue, 15 Jul 2025 16:56:18 -0700 Subject: [PATCH 02/38] inworld: removed unnecessary code from stop() and cancel() --- src/pipecat/services/inworld/tts.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py index b938fb2a0..b05134a5d 100644 --- a/src/pipecat/services/inworld/tts.py +++ b/src/pipecat/services/inworld/tts.py @@ -173,7 +173,6 @@ class InworldTTSService(TTSService): frame: The end frame. """ await super().stop(frame) - # await self._client.close() async def cancel(self, frame: CancelFrame): """Cancel the Inworld HTTP TTS service. @@ -182,7 +181,6 @@ class InworldTTSService(TTSService): frame: The cancel frame. """ await super().cancel(frame) - # await self._client.close() @traced_tts async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: From 913dba3b74d42eec7c14b67c6ed3231f83478696 Mon Sep 17 00:00:00 2001 From: padillamt Date: Tue, 15 Jul 2025 17:15:57 -0700 Subject: [PATCH 03/38] inworld: class name change --- src/pipecat/services/inworld/tts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py index b05134a5d..89f6c5e6c 100644 --- a/src/pipecat/services/inworld/tts.py +++ b/src/pipecat/services/inworld/tts.py @@ -68,7 +68,7 @@ def language_to_inworld_language(language: Language) -> Optional[str]: return result -class InworldTTSService(TTSService): +class InworldHttpTTSService(TTSService): """Inworld HTTP-based TTS service. Provides text-to-speech using Inworld's HTTP API for simpler, non-streaming From c67b779b9178f6b638ef94437fa71b72ee7e7839 Mon Sep 17 00:00:00 2001 From: padillamt Date: Tue, 15 Jul 2025 17:21:16 -0700 Subject: [PATCH 04/38] inworld: first commit of Inworld example file for TTS --- .../07aa-interruptible-inworld-http.py | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 examples/foundational/07aa-interruptible-inworld-http.py diff --git a/examples/foundational/07aa-interruptible-inworld-http.py b/examples/foundational/07aa-interruptible-inworld-http.py new file mode 100644 index 000000000..2b8b1612d --- /dev/null +++ b/examples/foundational/07aa-interruptible-inworld-http.py @@ -0,0 +1,117 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import argparse +import os + +import aiohttp +from dotenv import load_dotenv +from loguru import logger + +from pipecat.audio.vad.silero import SileroVADAnalyzer +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext +from pipecat.services.deepgram.stt import DeepgramSTTService +from pipecat.services.openai.llm import OpenAILLMService +from pipecat.services.inworld.tts import InworldHttpTTSService +from pipecat.transports.base_transport import BaseTransport, TransportParams +from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams +from pipecat.transports.services.daily import DailyParams + +load_dotenv(override=True) + + +# We store functions so objects (e.g. SileroVADAnalyzer) don't get +# instantiated. The function will be called when the desired transport gets +# selected. +transport_params = { + "daily": lambda: DailyParams( + audio_in_enabled=True, + audio_out_enabled=True, + vad_analyzer=SileroVADAnalyzer(), + ), + "twilio": lambda: FastAPIWebsocketParams( + audio_in_enabled=True, + audio_out_enabled=True, + vad_analyzer=SileroVADAnalyzer(), + ), + "webrtc": lambda: TransportParams( + audio_in_enabled=True, + audio_out_enabled=True, + vad_analyzer=SileroVADAnalyzer(), + ), +} + + +async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool): + logger.info(f"Starting bot") + + # Create an HTTP session + async with aiohttp.ClientSession() as session: + stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) + + tts = InworldHttpTTSService( + api_key=os.getenv("INWORLD_API_KEY", ""), + voice_id="Ashley", + model="inworld-tts-1", + aiohttp_session=session, + ) + + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY")) + + messages = [ + { + "role": "system", + "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.", + }, + ] + + context = OpenAILLMContext(messages) + context_aggregator = llm.create_context_aggregator(context) + + pipeline = Pipeline( + [ + transport.input(), # Transport user input + stt, + context_aggregator.user(), # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + context_aggregator.assistant(), # Assistant spoken responses + ] + ) + + task = PipelineTask( + pipeline, + params=PipelineParams( + enable_metrics=True, + enable_usage_metrics=True, + ), + ) + + @transport.event_handler("on_client_connected") + async def on_client_connected(transport, client): + logger.info(f"Client connected") + # Kick off the conversation. + messages.append({"role": "system", "content": "Please introduce yourself to the user."}) + await task.queue_frames([context_aggregator.user().get_context_frame()]) + + @transport.event_handler("on_client_disconnected") + async def on_client_disconnected(transport, client): + logger.info(f"Client disconnected") + await task.cancel() + + runner = PipelineRunner(handle_sigint=handle_sigint) + + await runner.run(task) + + +if __name__ == "__main__": + from pipecat.examples.run import main + + main(run_example, transport_params=transport_params) From ca936bd56966c4787903a13cdd72ecf9cb9a5eba Mon Sep 17 00:00:00 2001 From: padillamt Date: Tue, 15 Jul 2025 18:11:50 -0700 Subject: [PATCH 05/38] inworld: added Inworld to list of needed credentials --- dot-env.template | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dot-env.template b/dot-env.template index ab085757f..d79b67c63 100644 --- a/dot-env.template +++ b/dot-env.template @@ -76,6 +76,9 @@ GROQ_API_KEY=... # Grok GROK_API_KEY=... +# Inworld +INWORLD_API_KEY=... + # Together.ai TOGETHER_API_KEY=... From 2b76823b017d5d8a5299660ad4480b0c5842ab5e Mon Sep 17 00:00:00 2001 From: padillamt Date: Tue, 15 Jul 2025 18:17:30 -0700 Subject: [PATCH 06/38] inworld: added comments to track a few things to confirm --- src/pipecat/services/inworld/tts.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py index 89f6c5e6c..620a4b702 100644 --- a/src/pipecat/services/inworld/tts.py +++ b/src/pipecat/services/inworld/tts.py @@ -89,7 +89,8 @@ class InworldHttpTTSService(TTSService): """ language: Optional[Language] = Language.EN - voice_id: str = "Ashley" + voice_id: str = "Ashley" ## QUESTION: How to make this modifyable/how to modify? + # QUESTION: What about speed, pitch, and temperature?? def __init__( self, From f3984aec33462fcfc8387132c1265df3d21c7c88 Mon Sep 17 00:00:00 2001 From: padillamt Date: Wed, 16 Jul 2025 13:21:32 -0700 Subject: [PATCH 07/38] inworld: added (empty) requirements for Inworld to be explicit reg dependencies --- docs/api/requirements.txt | 1 + pyproject.toml | 1 + 2 files changed, 2 insertions(+) diff --git a/docs/api/requirements.txt b/docs/api/requirements.txt index c9e8e2ce9..4c68da84b 100644 --- a/docs/api/requirements.txt +++ b/docs/api/requirements.txt @@ -23,6 +23,7 @@ pipecat-ai[gladia] pipecat-ai[google] pipecat-ai[grok] pipecat-ai[groq] +pipecat-ai[inworld] # pipecat-ai[krisp] # Mocked pipecat-ai[koala] # pipecat-ai[langchain] # Mocked diff --git a/pyproject.toml b/pyproject.toml index 39a16231f..0ab9109a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,6 +59,7 @@ google = [ "google-cloud-speech~=2.32.0", "google-cloud-texttospeech~=2.26.0", " grok = [] groq = [ "groq~=0.23.0" ] gstreamer = [ "pygobject~=3.50.0" ] +inworld = [] krisp = [ "pipecat-ai-krisp~=0.4.0" ] koala = [ "pvkoala~=2.0.3" ] langchain = [ "langchain~=0.3.20", "langchain-community~=0.3.20", "langchain-openai~=0.3.9" ] From 1bc442e3292d47313def7511c7415fe6b7d7c080 Mon Sep 17 00:00:00 2001 From: padillamt Date: Fri, 18 Jul 2025 15:13:19 -0700 Subject: [PATCH 08/38] inworld: docstring fix --- src/pipecat/services/inworld/tts.py | 142 ++++++++++++++++++++++------ 1 file changed, 112 insertions(+), 30 deletions(-) diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py index 620a4b702..bf9a5d6ee 100644 --- a/src/pipecat/services/inworld/tts.py +++ b/src/pipecat/services/inworld/tts.py @@ -7,6 +7,7 @@ """Inworld's text-to-speech service implementations.""" import base64 +import io import json import uuid import warnings @@ -15,7 +16,6 @@ from typing import AsyncGenerator, List, Optional, Union import aiohttp from loguru import logger from pydantic import BaseModel, Field -import io, json, base64 from pipecat.frames.frames import ( CancelFrame, @@ -89,7 +89,7 @@ class InworldHttpTTSService(TTSService): """ language: Optional[Language] = Language.EN - voice_id: str = "Ashley" ## QUESTION: How to make this modifyable/how to modify? + voice_id: str = "Ashley" ## QUESTION: How to make this modifyable/how to modify? # QUESTION: What about speed, pitch, and temperature?? def __init__( @@ -109,9 +109,8 @@ class InworldHttpTTSService(TTSService): Args: api_key: Inworld API key for authentication. aiohttp_session: Shared aiohttp session for HTTP requests. - voice_id: ID of the voice to use for synthesis. - model: TTS model to use (e.g., "sonic-2"). - endpoint_url: Base URL for Inworld HTTP API. + model: TTS model to use (e.g., "inworld-tts-1"). + base_url: Base URL for Inworld HTTP API. sample_rate: Audio sample rate. If None, uses default. encoding: Audio encoding format. params: Additional input parameters for voice customization. @@ -138,7 +137,6 @@ class InworldHttpTTSService(TTSService): self.set_voice(params.voice_id) self.set_model_name(model) - def can_generate_metrics(self) -> bool: """Check if this service can generate processing metrics. @@ -187,6 +185,8 @@ class InworldHttpTTSService(TTSService): async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: """Generate speech from text using Inworld's HTTP API. + This implementation streams audio chunk by chunk as it's received. + Args: text: The text to synthesize into speech. @@ -213,52 +213,134 @@ class InworldHttpTTSService(TTSService): yield TTSStartedFrame() - async with self._session.post(self._base_url, json=payload, headers=headers) as response: + # A flag to ensure we only strip the header from the very first chunk. + is_first_chunk = True + + async with self._session.post( + self._base_url, json=payload, headers=headers + ) as response: if response.status != 200: error_text = await response.text() logger.error(f"Inworld API error: {error_text}") await self.push_error(ErrorFrame(f"Inworld API error: {error_text}")) return - raw_audio_data = io.BytesIO() - + # Process the stream line by line. async for line in response.content.iter_lines(): - line_str = line.decode('utf-8').strip() + line_str = line.decode("utf-8").strip() if not line_str: continue - + try: chunk = json.loads(line_str) if "result" in chunk and "audioContent" in chunk["result"]: audio_chunk = base64.b64decode(chunk["result"]["audioContent"]) - # Skip WAV header if present (first 44 bytes) - if len(audio_chunk) > 44 and audio_chunk.startswith(b"RIFF"): + audio_data = audio_chunk + + # Correctly strip the header only from the first chunk. + if ( + is_first_chunk + and len(audio_chunk) > 44 + and audio_chunk.startswith(b"RIFF") + ): audio_data = audio_chunk[44:] - else: - audio_data = audio_chunk - raw_audio_data.write(audio_data) + is_first_chunk = False # Unset the flag. + + # Yield each audio frame as it's processed. + yield TTSAudioRawFrame( + audio=audio_data, + sample_rate=self.sample_rate, + num_channels=1, + ) + except json.JSONDecodeError: continue await self.start_tts_usage_metrics(text) - audio_bytes = raw_audio_data.getvalue() - if not audio_bytes: - logger.error("No audio data received from Inworld API") - await self.push_error(ErrorFrame("No audio data received")) - return - - frame = TTSAudioRawFrame( - audio=audio_bytes, - sample_rate=self.sample_rate, - num_channels=1, - ) - - yield frame - except Exception as e: logger.error(f"{self} exception: {e}") await self.push_error(ErrorFrame(f"Error generating TTS: {e}")) finally: await self.stop_ttfb_metrics() yield TTSStoppedFrame() + + # @traced_tts + # async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: + # """Generate speech from text using Inworld's HTTP API. + + # Args: + # text: The text to synthesize into speech. + + # Yields: + # Frame: Audio frames containing the synthesized speech. + # """ + # logger.debug(f"{self}: Generating TTS [{text}]") + + # payload = { + # "text": text, + # "voiceId": self._settings["voiceId"], + # "modelId": self._settings["modelId"], + # "audio_config": self._settings["audio_config"], + # "language": self._settings["language"], + # } + + # headers = { + # "Authorization": f"Basic {self._api_key}", + # "Content-Type": "application/json", + # } + + # try: + # await self.start_ttfb_metrics() + + # yield TTSStartedFrame() + + # async with self._session.post(self._base_url, json=payload, headers=headers) as response: + # if response.status != 200: + # error_text = await response.text() + # logger.error(f"Inworld API error: {error_text}") + # await self.push_error(ErrorFrame(f"Inworld API error: {error_text}")) + # return + + # raw_audio_data = io.BytesIO() + + # async for line in response.content.iter_lines(): + # line_str = line.decode('utf-8').strip() + # if not line_str: + # continue + + # try: + # chunk = json.loads(line_str) + # if "result" in chunk and "audioContent" in chunk["result"]: + # audio_chunk = base64.b64decode(chunk["result"]["audioContent"]) + # # Skip WAV header if present (first 44 bytes) + # if len(audio_chunk) > 44 and audio_chunk.startswith(b"RIFF"): + # audio_data = audio_chunk[44:] + # else: + # audio_data = audio_chunk + # raw_audio_data.write(audio_data) + # except json.JSONDecodeError: + # continue + + # await self.start_tts_usage_metrics(text) + + # audio_bytes = raw_audio_data.getvalue() + # if not audio_bytes: + # logger.error("No audio data received from Inworld API") + # await self.push_error(ErrorFrame("No audio data received")) + # return + + # frame = TTSAudioRawFrame( + # audio=audio_bytes, + # sample_rate=self.sample_rate, + # num_channels=1, + # ) + + # yield frame + + # except Exception as e: + # logger.error(f"{self} exception: {e}") + # await self.push_error(ErrorFrame(f"Error generating TTS: {e}")) + # finally: + # await self.stop_ttfb_metrics() + # yield TTSStoppedFrame() From 5d8c184d99f95fe0761036bbb691fc9ac03e0b6f Mon Sep 17 00:00:00 2001 From: padillamt Date: Fri, 18 Jul 2025 16:30:03 -0700 Subject: [PATCH 09/38] inworld: commit of original text file and changes that copy openai's with Inworld TTS as only change --- .../07aa-interruptible-inworld-http.py | 16 ++- .../07aa-interruptible-inworld-http_copy.py | 117 ++++++++++++++++++ 2 files changed, 127 insertions(+), 6 deletions(-) create mode 100644 examples/foundational/07aa-interruptible-inworld-http_copy.py diff --git a/examples/foundational/07aa-interruptible-inworld-http.py b/examples/foundational/07aa-interruptible-inworld-http.py index 2b8b1612d..03622a2d4 100644 --- a/examples/foundational/07aa-interruptible-inworld-http.py +++ b/examples/foundational/07aa-interruptible-inworld-http.py @@ -16,16 +16,15 @@ from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext -from pipecat.services.deepgram.stt import DeepgramSTTService -from pipecat.services.openai.llm import OpenAILLMService from pipecat.services.inworld.tts import InworldHttpTTSService +from pipecat.services.openai.llm import OpenAILLMService +from pipecat.services.openai.stt import OpenAISTTService from pipecat.transports.base_transport import BaseTransport, TransportParams from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams from pipecat.transports.services.daily import DailyParams load_dotenv(override=True) - # We store functions so objects (e.g. SileroVADAnalyzer) don't get # instantiated. The function will be called when the desired transport gets # selected. @@ -53,7 +52,11 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si # Create an HTTP session async with aiohttp.ClientSession() as session: - stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) + stt = OpenAISTTService( + api_key=os.getenv("OPENAI_API_KEY"), + model="gpt-4o-transcribe", + prompt="Expect words related to dogs, such as breed names.", + ) tts = InworldHttpTTSService( api_key=os.getenv("INWORLD_API_KEY", ""), @@ -67,7 +70,7 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si messages = [ { "role": "system", - "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.", + "content": "You are very knowledgable about dogs. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.", }, ] @@ -77,7 +80,7 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si pipeline = Pipeline( [ transport.input(), # Transport user input - stt, + stt, # STT context_aggregator.user(), # User responses llm, # LLM tts, # TTS @@ -89,6 +92,7 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si task = PipelineTask( pipeline, params=PipelineParams( + audio_out_sample_rate=24000, enable_metrics=True, enable_usage_metrics=True, ), diff --git a/examples/foundational/07aa-interruptible-inworld-http_copy.py b/examples/foundational/07aa-interruptible-inworld-http_copy.py new file mode 100644 index 000000000..0121865ab --- /dev/null +++ b/examples/foundational/07aa-interruptible-inworld-http_copy.py @@ -0,0 +1,117 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import argparse +import os + +import aiohttp +from dotenv import load_dotenv +from loguru import logger + +from pipecat.audio.vad.silero import SileroVADAnalyzer +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext +from pipecat.services.deepgram.stt import DeepgramSTTService +from pipecat.services.inworld.tts import InworldHttpTTSService +from pipecat.services.openai.llm import OpenAILLMService +from pipecat.transports.base_transport import BaseTransport, TransportParams +from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams +from pipecat.transports.services.daily import DailyParams + +load_dotenv(override=True) + + +# We store functions so objects (e.g. SileroVADAnalyzer) don't get +# instantiated. The function will be called when the desired transport gets +# selected. +transport_params = { + "daily": lambda: DailyParams( + audio_in_enabled=True, + audio_out_enabled=True, + vad_analyzer=SileroVADAnalyzer(), + ), + "twilio": lambda: FastAPIWebsocketParams( + audio_in_enabled=True, + audio_out_enabled=True, + vad_analyzer=SileroVADAnalyzer(), + ), + "webrtc": lambda: TransportParams( + audio_in_enabled=True, + audio_out_enabled=True, + vad_analyzer=SileroVADAnalyzer(), + ), +} + + +async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool): + logger.info(f"Starting bot") + + # Create an HTTP session + async with aiohttp.ClientSession() as session: + stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) + + tts = InworldHttpTTSService( + api_key=os.getenv("INWORLD_API_KEY", ""), + voice_id="Ashley", + model="inworld-tts-1", + aiohttp_session=session, + ) + + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY")) + + messages = [ + { + "role": "system", + "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.", + }, + ] + + context = OpenAILLMContext(messages) + context_aggregator = llm.create_context_aggregator(context) + + pipeline = Pipeline( + [ + transport.input(), # Transport user input + stt, + context_aggregator.user(), # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + context_aggregator.assistant(), # Assistant spoken responses + ] + ) + + task = PipelineTask( + pipeline, + params=PipelineParams( + enable_metrics=True, + enable_usage_metrics=True, + ), + ) + + @transport.event_handler("on_client_connected") + async def on_client_connected(transport, client): + logger.info(f"Client connected") + # Kick off the conversation. + messages.append({"role": "system", "content": "Please introduce yourself to the user."}) + await task.queue_frames([context_aggregator.user().get_context_frame()]) + + @transport.event_handler("on_client_disconnected") + async def on_client_disconnected(transport, client): + logger.info(f"Client disconnected") + await task.cancel() + + runner = PipelineRunner(handle_sigint=handle_sigint) + + await runner.run(task) + + +if __name__ == "__main__": + from pipecat.examples.run import main + + main(run_example, transport_params=transport_params) From e3711f96a31ad11bf26b68c397e9971f3c08927b Mon Sep 17 00:00:00 2001 From: padillamt Date: Sun, 20 Jul 2025 17:06:35 -0700 Subject: [PATCH 10/38] inworld: added detailed comments --- src/pipecat/services/inworld/tts.py | 440 +++++++++++++++++++--------- 1 file changed, 297 insertions(+), 143 deletions(-) diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py index bf9a5d6ee..9d328fa5a 100644 --- a/src/pipecat/services/inworld/tts.py +++ b/src/pipecat/services/inworld/tts.py @@ -4,7 +4,33 @@ # SPDX-License-Identifier: BSD 2-Clause License # -"""Inworld's text-to-speech service implementations.""" +"""Inworld AI Text-to-Speech Service Implementation. + +This module provides integration with Inworld AI's HTTP-based TTS API, enabling +real-time text-to-speech synthesis with high-quality, natural-sounding voices. + +Key Features: +- HTTP streaming API support for low-latency audio generation +- Multiple voice options (Ashley, Hades, etc.) +- Real-time audio chunk processing with proper buffering +- WAV header handling and audio format conversion +- Comprehensive error handling and metrics tracking + +Technical Implementation: +- Uses aiohttp for HTTP streaming connections +- Implements JSON line-by-line parsing for streaming responses +- Handles base64-encoded audio data with proper decoding +- Manages audio continuity to prevent clicks and artifacts +- Integrates with Pipecat's frame-based pipeline architecture + +Usage: + tts = InworldHttpTTSService( + api_key=os.getenv("INWORLD_API_KEY"), + voice_id="Ashley", + model="inworld-tts-1", + aiohttp_session=session + ) +""" import base64 import io @@ -40,11 +66,35 @@ from pipecat.utils.tracing.service_decorators import traced_tts def language_to_inworld_language(language: Language) -> Optional[str]: """Convert Pipecat's Language enum to Inworld's language code. + Inworld AI supports a specific set of language codes for TTS synthesis. + This function maps Pipecat's standardized Language enum values to the + corresponding language codes expected by Inworld's API. + + Supported Languages: + - EN (English) -> "en" + - ES (Spanish) -> "es" + - FR (French) -> "fr" + - KO (Korean) -> "ko" + - NL (Dutch) -> "nl" + - ZH (Chinese) -> "zh" + + The function also handles language variants (e.g., es-ES, en-US) by + extracting the base language code and mapping it if supported. + Args: - language: The Language enum value to convert. + language: The Language enum value to convert (e.g., Language.EN). Returns: - The corresponding Inworld language code, or None if not supported. + The corresponding Inworld language code string (e.g., "en"), + or None if the language is not supported by Inworld's API. + + Example: + >>> language_to_inworld_language(Language.EN) + "en" + >>> language_to_inworld_language(Language.ES) + "es" + >>> language_to_inworld_language(Language.DE) # Not supported + None """ BASE_LANGUAGES = { Language.EN: "en", @@ -69,11 +119,42 @@ def language_to_inworld_language(language: Language) -> Optional[str]: class InworldHttpTTSService(TTSService): - """Inworld HTTP-based TTS service. + """Inworld AI HTTP-based Text-to-Speech Service. - Provides text-to-speech using Inworld's HTTP API for simpler, non-streaming - synthesis. Suitable for use cases where streaming is not required and simpler - integration is preferred. + This service integrates Inworld AI's high-quality TTS API with Pipecat's pipeline + architecture. It provides real-time speech synthesis with natural-sounding voices + and low-latency streaming audio delivery. + + Key Features: + - Real-time HTTP streaming for minimal latency + - Multiple voice options (Ashley, Hades, etc.) + - High-quality audio output (48kHz LINEAR16 PCM) + - Automatic audio format handling and header stripping + - Comprehensive error handling and recovery + - Built-in performance metrics and monitoring + + Technical Architecture: + - Uses aiohttp for non-blocking HTTP requests + - Implements JSON line-by-line streaming protocol + - Processes base64-encoded audio chunks in real-time + - Manages audio continuity to prevent artifacts + - Integrates with Pipecat's frame-based pipeline system + + Supported Configuration: + - Voice Selection: Ashley, Hades, and other Inworld voices + - Models: inworld-tts-1 and other available models + - Audio Formats: LINEAR16 PCM at various sample rates + - Languages: English, Spanish, French, Korean, Dutch, Chinese + + Example Usage: + async with aiohttp.ClientSession() as session: + tts = InworldHttpTTSService( + api_key=os.getenv("INWORLD_API_KEY"), + voice_id="Ashley", # Voice selection + model="inworld-tts-1", # TTS model + aiohttp_session=session, # Required HTTP session + sample_rate=48000, # Audio quality + ) """ class InputParams(BaseModel): @@ -89,7 +170,7 @@ class InworldHttpTTSService(TTSService): """ language: Optional[Language] = Language.EN - voice_id: str = "Ashley" ## QUESTION: How to make this modifyable/how to modify? + voice_id: str = "Hades" ## QUESTION: How to make this modifyable/how to modify? # QUESTION: What about speed, pitch, and temperature?? def __init__( @@ -97,6 +178,7 @@ class InworldHttpTTSService(TTSService): *, api_key: str, aiohttp_session: aiohttp.ClientSession, + voice_id: str = "Ashley", model: str = "inworld-tts-1", base_url: str = "https://api.inworld.ai/tts/v1/voice:stream", sample_rate: Optional[int] = 48000, @@ -106,36 +188,67 @@ class InworldHttpTTSService(TTSService): ): """Initialize the Inworld HTTP TTS service. + Sets up the TTS service with Inworld AI's streaming API configuration. + This constructor prepares all necessary parameters for real-time speech synthesis. + Args: - api_key: Inworld API key for authentication. - aiohttp_session: Shared aiohttp session for HTTP requests. - model: TTS model to use (e.g., "inworld-tts-1"). - base_url: Base URL for Inworld HTTP API. - sample_rate: Audio sample rate. If None, uses default. - encoding: Audio encoding format. - params: Additional input parameters for voice customization. - **kwargs: Additional arguments passed to the parent TTSService. + api_key: Inworld API key for authentication (base64-encoded from Inworld Portal). + Get this from: Inworld Portal > Settings > API Keys > Runtime API Key + aiohttp_session: Shared aiohttp session for HTTP requests. Must be provided + for proper connection pooling and resource management. + voice_id: Voice to use for synthesis. Available options include: + - "Ashley" (default) - Natural female voice + - "Hades" - Distinctive character voice + - Other voices available through Inworld's voice catalog + model: TTS model to use. Currently supported: + - "inworld-tts-1" (default) - Latest high-quality model + - Other models as available in Inworld's API + base_url: Base URL for Inworld HTTP API. Uses streaming endpoint by default. + Should normally not be changed unless using a different environment. + sample_rate: Audio sample rate in Hz. Common values: + - 48000 (default) - High quality, suitable for most applications + - 24000 - Good quality, lower bandwidth + - 16000 - Basic quality, minimal bandwidth + encoding: Audio encoding format. Supported options: + - "LINEAR16" (default) - Uncompressed PCM, best quality + - Other formats as supported by Inworld API + params: Additional input parameters for advanced voice customization. + Usually None for standard usage. + **kwargs: Additional arguments passed to the parent TTSService class. + + Note: + The aiohttp_session parameter is required because Inworld's HTTP API + benefits from connection reuse and proper async session management. """ + # Initialize parent TTSService with audio configuration super().__init__(sample_rate=sample_rate, **kwargs) - params = params or InworldTTSService.InputParams() + # Use provided params or create default configuration + params = params or InworldHttpTTSService.InputParams() - self._api_key = api_key - self._session = aiohttp_session - self._base_url = base_url + # Store core configuration for API requests + self._api_key = api_key # Authentication credentials + self._session = aiohttp_session # HTTP session for requests + self._base_url = base_url # API endpoint URL + + # Build settings dictionary that matches Inworld's API expectations + # This will be sent as JSON payload in each TTS request self._settings = { - "voiceId": params.voice_id, - "modelId": model, - "audio_config": { - "audio_encoding": encoding, - "sample_rate_hertz": sample_rate, + "voiceId": voice_id, # Voice selection (fixes bug where this was ignored) + "modelId": model, # TTS model selection + "audio_config": { # Audio format configuration + "audio_encoding": encoding, # Format: LINEAR16, MP3, etc. + "sample_rate_hertz": sample_rate, # Sample rate: 48000, 24000, etc. }, + # Language configuration with fallback to English "language": self.language_to_service_language(params.language) if params.language else "en", } - self.set_voice(params.voice_id) - self.set_model_name(model) + + # Register voice and model with parent service for metrics and tracking + self.set_voice(voice_id) # Used for logging and metrics + self.set_model_name(model) # Used for performance tracking def can_generate_metrics(self) -> bool: """Check if this service can generate processing metrics. @@ -183,164 +296,205 @@ class InworldHttpTTSService(TTSService): @traced_tts async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: - """Generate speech from text using Inworld's HTTP API. + """Generate speech from text using Inworld's streaming HTTP API. - This implementation streams audio chunk by chunk as it's received. + This is the core TTS processing function that: + 1. Sends text to Inworld's streaming TTS endpoint + 2. Receives JSON-streamed audio chunks in real-time + 3. Processes and cleans audio data (removes WAV headers, validates content) + 4. Yields audio frames for immediate playback in the pipeline + + Technical Details: + - Uses HTTP streaming with JSON line-by-line responses + - Each JSON line contains base64-encoded audio data + - Implements buffering to handle partial JSON lines + - Strips WAV headers to prevent audio artifacts/clicks + - Provides real-time audio streaming for low latency Args: text: The text to synthesize into speech. Yields: - Frame: Audio frames containing the synthesized speech. + Frame: Audio frames containing the synthesized speech, plus control frames. + + Raises: + ErrorFrame: If API errors occur or audio processing fails. """ logger.debug(f"{self}: Generating TTS [{text}]") + # ================================================================================ + # STEP 1: PREPARE API REQUEST + # ================================================================================ + # Build the JSON payload according to Inworld's API specification + # This matches the format shown in their documentation examples payload = { - "text": text, - "voiceId": self._settings["voiceId"], - "modelId": self._settings["modelId"], - "audio_config": self._settings["audio_config"], - "language": self._settings["language"], + "text": text, # Text to synthesize + "voiceId": self._settings["voiceId"], # Voice selection (Ashley, Hades, etc.) + "modelId": self._settings["modelId"], # TTS model (inworld-tts-1) + "audio_config": self._settings[ + "audio_config" + ], # Audio format settings (LINEAR16, 48kHz) + "language": self._settings["language"], # Language code (en, es, etc.) } + # Set up HTTP headers for authentication and content type + # Inworld requires Basic auth with base64-encoded API key headers = { - "Authorization": f"Basic {self._api_key}", - "Content-Type": "application/json", + "Authorization": f"Basic {self._api_key}", # Base64 API key from Inworld Portal + "Content-Type": "application/json", # JSON request body } try: + # ================================================================================ + # STEP 2: INITIALIZE METRICS AND STREAMING + # ================================================================================ + # Start measuring Time To First Byte (TTFB) for performance tracking await self.start_ttfb_metrics() + # Signal to the pipeline that TTS generation has started + # This allows downstream processors to prepare for incoming audio yield TTSStartedFrame() - # A flag to ensure we only strip the header from the very first chunk. + # Flag to track if we're processing the first audio chunk + # Used for WAV header handling and debugging is_first_chunk = True + # ================================================================================ + # STEP 3: MAKE HTTP STREAMING REQUEST + # ================================================================================ + # Use aiohttp's streaming POST to Inworld's streaming endpoint + # The endpoint returns JSON lines with audio chunks as they're generated async with self._session.post( self._base_url, json=payload, headers=headers ) as response: + # ================================================================================ + # STEP 4: HANDLE HTTP ERRORS + # ================================================================================ + # Check for API errors (expired keys, invalid requests, etc.) if response.status != 200: error_text = await response.text() logger.error(f"Inworld API error: {error_text}") await self.push_error(ErrorFrame(f"Inworld API error: {error_text}")) return - # Process the stream line by line. - async for line in response.content.iter_lines(): - line_str = line.decode("utf-8").strip() - if not line_str: + # ================================================================================ + # STEP 5: PROCESS STREAMING JSON RESPONSE + # ================================================================================ + # Inworld streams JSON lines where each line contains audio data + # We need to buffer incoming data and process complete lines + + # Buffer to accumulate incoming text data + # This handles cases where JSON lines are split across HTTP chunks + buffer = "" + + # Read HTTP response in manageable chunks (1KB each) + # This prevents memory issues with large responses + async for chunk in response.content.iter_chunked(1024): + if not chunk: continue - try: - chunk = json.loads(line_str) - if "result" in chunk and "audioContent" in chunk["result"]: - audio_chunk = base64.b64decode(chunk["result"]["audioContent"]) - audio_data = audio_chunk + # ============================================================================ + # STEP 6: BUFFER MANAGEMENT + # ============================================================================ + # Decode binary chunk to text and add to our line buffer + # Each chunk may contain partial JSON lines, so we need to accumulate + buffer += chunk.decode("utf-8") - # Correctly strip the header only from the first chunk. - if ( - is_first_chunk - and len(audio_chunk) > 44 - and audio_chunk.startswith(b"RIFF") - ): - audio_data = audio_chunk[44:] - is_first_chunk = False # Unset the flag. + # ============================================================================ + # STEP 7: LINE-BY-LINE JSON PROCESSING + # ============================================================================ + # Process all complete lines in the buffer (lines ending with \n) + # Leave partial lines in buffer for next iteration + while "\n" in buffer: + # Split on first newline, keeping remainder in buffer + line, buffer = buffer.split("\n", 1) + line_str = line.strip() - # Yield each audio frame as it's processed. - yield TTSAudioRawFrame( - audio=audio_data, - sample_rate=self.sample_rate, - num_channels=1, - ) + # Skip empty lines (common in streaming responses) + if not line_str: + continue - except json.JSONDecodeError: - continue + try: + # ================================================================ + # STEP 8: PARSE JSON AND EXTRACT AUDIO + # ================================================================ + # Parse the JSON line - should contain audio data + chunk_data = json.loads(line_str) + # Check if this line contains audio content + # Inworld's response format: {"result": {"audioContent": "base64data"}} + if "result" in chunk_data and "audioContent" in chunk_data["result"]: + # Decode base64 audio data to binary + audio_chunk = base64.b64decode(chunk_data["result"]["audioContent"]) + + # ======================================================== + # STEP 9: AUDIO DATA VALIDATION + # ======================================================== + # Skip empty audio chunks that could cause discontinuities + # Empty chunks can create gaps or clicks in audio playback + if not audio_chunk: + continue + + # Start with the raw audio data + audio_data = audio_chunk + + # ======================================================== + # STEP 10: WAV HEADER REMOVAL (CRITICAL FOR AUDIO QUALITY) + # ======================================================== + # Each audio chunk may have its own WAV header (44 bytes) + # These headers contain metadata and will sound like clicks if played + # We must strip them from EVERY chunk, not just the first one + if ( + len(audio_chunk) > 44 # Ensure chunk is large enough + and audio_chunk.startswith( + b"RIFF" + ) # Check for WAV header magic bytes + ): + # Remove the 44-byte WAV header to get pure audio data + audio_data = audio_chunk[44:] + + # Track that we've seen our first chunk (for debugging) + if is_first_chunk: + is_first_chunk = False + + # ======================================================== + # STEP 11: YIELD AUDIO FRAME TO PIPELINE + # ======================================================== + # Only yield frames with actual audio content + # Empty frames can cause pipeline issues + if len(audio_data) > 0: + # Create Pipecat audio frame with processed audio data + yield TTSAudioRawFrame( + audio=audio_data, # Clean audio without headers + sample_rate=self.sample_rate, # Configured sample rate (48kHz) + num_channels=1, # Mono audio + ) + + except json.JSONDecodeError: + # Ignore malformed JSON lines - streaming can have partial data + # This is normal in HTTP streaming scenarios + continue + + # ================================================================================ + # STEP 12: FINALIZE METRICS AND CLEANUP + # ================================================================================ + # Start usage metrics tracking after successful completion await self.start_tts_usage_metrics(text) except Exception as e: + # ================================================================================ + # STEP 13: ERROR HANDLING + # ================================================================================ + # Log any unexpected errors and notify the pipeline logger.error(f"{self} exception: {e}") await self.push_error(ErrorFrame(f"Error generating TTS: {e}")) finally: + # ================================================================================ + # STEP 14: CLEANUP AND COMPLETION + # ================================================================================ + # Always stop metrics tracking, even if errors occurred await self.stop_ttfb_metrics() + + # Signal to pipeline that TTS generation is complete + # This allows downstream processors to finalize audio processing yield TTSStoppedFrame() - - # @traced_tts - # async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: - # """Generate speech from text using Inworld's HTTP API. - - # Args: - # text: The text to synthesize into speech. - - # Yields: - # Frame: Audio frames containing the synthesized speech. - # """ - # logger.debug(f"{self}: Generating TTS [{text}]") - - # payload = { - # "text": text, - # "voiceId": self._settings["voiceId"], - # "modelId": self._settings["modelId"], - # "audio_config": self._settings["audio_config"], - # "language": self._settings["language"], - # } - - # headers = { - # "Authorization": f"Basic {self._api_key}", - # "Content-Type": "application/json", - # } - - # try: - # await self.start_ttfb_metrics() - - # yield TTSStartedFrame() - - # async with self._session.post(self._base_url, json=payload, headers=headers) as response: - # if response.status != 200: - # error_text = await response.text() - # logger.error(f"Inworld API error: {error_text}") - # await self.push_error(ErrorFrame(f"Inworld API error: {error_text}")) - # return - - # raw_audio_data = io.BytesIO() - - # async for line in response.content.iter_lines(): - # line_str = line.decode('utf-8').strip() - # if not line_str: - # continue - - # try: - # chunk = json.loads(line_str) - # if "result" in chunk and "audioContent" in chunk["result"]: - # audio_chunk = base64.b64decode(chunk["result"]["audioContent"]) - # # Skip WAV header if present (first 44 bytes) - # if len(audio_chunk) > 44 and audio_chunk.startswith(b"RIFF"): - # audio_data = audio_chunk[44:] - # else: - # audio_data = audio_chunk - # raw_audio_data.write(audio_data) - # except json.JSONDecodeError: - # continue - - # await self.start_tts_usage_metrics(text) - - # audio_bytes = raw_audio_data.getvalue() - # if not audio_bytes: - # logger.error("No audio data received from Inworld API") - # await self.push_error(ErrorFrame("No audio data received")) - # return - - # frame = TTSAudioRawFrame( - # audio=audio_bytes, - # sample_rate=self.sample_rate, - # num_channels=1, - # ) - - # yield frame - - # except Exception as e: - # logger.error(f"{self} exception: {e}") - # await self.push_error(ErrorFrame(f"Error generating TTS: {e}")) - # finally: - # await self.stop_ttfb_metrics() - # yield TTSStoppedFrame() From 4250aa6616f7a471585202b512a41b0813e319cc Mon Sep 17 00:00:00 2001 From: padillamt Date: Mon, 21 Jul 2025 10:11:50 -0700 Subject: [PATCH 11/38] inworld: removal of backup copy, no longer needed --- .../07aa-interruptible-inworld-http_copy.py | 117 ------------------ 1 file changed, 117 deletions(-) delete mode 100644 examples/foundational/07aa-interruptible-inworld-http_copy.py diff --git a/examples/foundational/07aa-interruptible-inworld-http_copy.py b/examples/foundational/07aa-interruptible-inworld-http_copy.py deleted file mode 100644 index 0121865ab..000000000 --- a/examples/foundational/07aa-interruptible-inworld-http_copy.py +++ /dev/null @@ -1,117 +0,0 @@ -# -# Copyright (c) 2024–2025, Daily -# -# SPDX-License-Identifier: BSD 2-Clause License -# - -import argparse -import os - -import aiohttp -from dotenv import load_dotenv -from loguru import logger - -from pipecat.audio.vad.silero import SileroVADAnalyzer -from pipecat.pipeline.pipeline import Pipeline -from pipecat.pipeline.runner import PipelineRunner -from pipecat.pipeline.task import PipelineParams, PipelineTask -from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext -from pipecat.services.deepgram.stt import DeepgramSTTService -from pipecat.services.inworld.tts import InworldHttpTTSService -from pipecat.services.openai.llm import OpenAILLMService -from pipecat.transports.base_transport import BaseTransport, TransportParams -from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams -from pipecat.transports.services.daily import DailyParams - -load_dotenv(override=True) - - -# We store functions so objects (e.g. SileroVADAnalyzer) don't get -# instantiated. The function will be called when the desired transport gets -# selected. -transport_params = { - "daily": lambda: DailyParams( - audio_in_enabled=True, - audio_out_enabled=True, - vad_analyzer=SileroVADAnalyzer(), - ), - "twilio": lambda: FastAPIWebsocketParams( - audio_in_enabled=True, - audio_out_enabled=True, - vad_analyzer=SileroVADAnalyzer(), - ), - "webrtc": lambda: TransportParams( - audio_in_enabled=True, - audio_out_enabled=True, - vad_analyzer=SileroVADAnalyzer(), - ), -} - - -async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool): - logger.info(f"Starting bot") - - # Create an HTTP session - async with aiohttp.ClientSession() as session: - stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) - - tts = InworldHttpTTSService( - api_key=os.getenv("INWORLD_API_KEY", ""), - voice_id="Ashley", - model="inworld-tts-1", - aiohttp_session=session, - ) - - llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY")) - - messages = [ - { - "role": "system", - "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.", - }, - ] - - context = OpenAILLMContext(messages) - context_aggregator = llm.create_context_aggregator(context) - - pipeline = Pipeline( - [ - transport.input(), # Transport user input - stt, - context_aggregator.user(), # User responses - llm, # LLM - tts, # TTS - transport.output(), # Transport bot output - context_aggregator.assistant(), # Assistant spoken responses - ] - ) - - task = PipelineTask( - pipeline, - params=PipelineParams( - enable_metrics=True, - enable_usage_metrics=True, - ), - ) - - @transport.event_handler("on_client_connected") - async def on_client_connected(transport, client): - logger.info(f"Client connected") - # Kick off the conversation. - messages.append({"role": "system", "content": "Please introduce yourself to the user."}) - await task.queue_frames([context_aggregator.user().get_context_frame()]) - - @transport.event_handler("on_client_disconnected") - async def on_client_disconnected(transport, client): - logger.info(f"Client disconnected") - await task.cancel() - - runner = PipelineRunner(handle_sigint=handle_sigint) - - await runner.run(task) - - -if __name__ == "__main__": - from pipecat.examples.run import main - - main(run_example, transport_params=transport_params) From aadd088b5077b014d78f8e8e0930e9e6785e42b2 Mon Sep 17 00:00:00 2001 From: padillamt Date: Mon, 21 Jul 2025 10:52:55 -0700 Subject: [PATCH 12/38] inworld: commented out contents as per Pipecat guidance that this pattern is being retired --- src/pipecat/services/inworld/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/pipecat/services/inworld/__init__.py b/src/pipecat/services/inworld/__init__.py index 9717eb163..910364d1b 100644 --- a/src/pipecat/services/inworld/__init__.py +++ b/src/pipecat/services/inworld/__init__.py @@ -4,10 +4,10 @@ # SPDX-License-Identifier: BSD 2-Clause License # -import sys +# import sys -from pipecat.services import DeprecatedModuleProxy +# from pipecat.services import DeprecatedModuleProxy -from .tts import * +# from .tts import * -sys.modules[__name__] = DeprecatedModuleProxy(globals(), "inworld", "inworld.tts") +# sys.modules[__name__] = DeprecatedModuleProxy(globals(), "inworld", "inworld.tts") From 54ff946976776be1d95d5a7ab0a9c5e9fb8af8f7 Mon Sep 17 00:00:00 2001 From: padillamt Date: Mon, 21 Jul 2025 12:07:58 -0700 Subject: [PATCH 13/38] inworld: largely adjustments for docstring compatibility --- src/pipecat/services/inworld/tts.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py index 9d328fa5a..51245ea09 100644 --- a/src/pipecat/services/inworld/tts.py +++ b/src/pipecat/services/inworld/tts.py @@ -10,6 +10,7 @@ This module provides integration with Inworld AI's HTTP-based TTS API, enabling real-time text-to-speech synthesis with high-quality, natural-sounding voices. Key Features: + - HTTP streaming API support for low-latency audio generation - Multiple voice options (Ashley, Hades, etc.) - Real-time audio chunk processing with proper buffering @@ -17,13 +18,14 @@ Key Features: - Comprehensive error handling and metrics tracking Technical Implementation: + - Uses aiohttp for HTTP streaming connections - Implements JSON line-by-line parsing for streaming responses - Handles base64-encoded audio data with proper decoding - Manages audio continuity to prevent clicks and artifacts - Integrates with Pipecat's frame-based pipeline architecture -Usage: +Usage:: tts = InworldHttpTTSService( api_key=os.getenv("INWORLD_API_KEY"), voice_id="Ashley", @@ -71,6 +73,7 @@ def language_to_inworld_language(language: Language) -> Optional[str]: corresponding language codes expected by Inworld's API. Supported Languages: + - EN (English) -> "en" - ES (Spanish) -> "es" - FR (French) -> "fr" @@ -126,6 +129,7 @@ class InworldHttpTTSService(TTSService): and low-latency streaming audio delivery. Key Features: + - Real-time HTTP streaming for minimal latency - Multiple voice options (Ashley, Hades, etc.) - High-quality audio output (48kHz LINEAR16 PCM) @@ -134,6 +138,7 @@ class InworldHttpTTSService(TTSService): - Built-in performance metrics and monitoring Technical Architecture: + - Uses aiohttp for non-blocking HTTP requests - Implements JSON line-by-line streaming protocol - Processes base64-encoded audio chunks in real-time @@ -141,16 +146,17 @@ class InworldHttpTTSService(TTSService): - Integrates with Pipecat's frame-based pipeline system Supported Configuration: + - Voice Selection: Ashley, Hades, and other Inworld voices - Models: inworld-tts-1 and other available models - Audio Formats: LINEAR16 PCM at various sample rates - Languages: English, Spanish, French, Korean, Dutch, Chinese - Example Usage: + Example Usage:: async with aiohttp.ClientSession() as session: tts = InworldHttpTTSService( api_key=os.getenv("INWORLD_API_KEY"), - voice_id="Ashley", # Voice selection + voice_id="Ashley", # Voice selection model="inworld-tts-1", # TTS model aiohttp_session=session, # Required HTTP session sample_rate=48000, # Audio quality @@ -162,16 +168,9 @@ class InworldHttpTTSService(TTSService): Parameters: language: Language to use for synthesis. - speed: Voice speed control (string or float). - emotion: List of emotion controls. - - .. deprecated:: 0.0.68 - The `emotion` parameter is deprecated and will be removed in a future version. """ language: Optional[Language] = Language.EN - voice_id: str = "Hades" ## QUESTION: How to make this modifyable/how to modify? - # QUESTION: What about speed, pitch, and temperature?? def __init__( self, @@ -179,6 +178,7 @@ class InworldHttpTTSService(TTSService): api_key: str, aiohttp_session: aiohttp.ClientSession, voice_id: str = "Ashley", + # language: Optional[Language] = Language.EN, model: str = "inworld-tts-1", base_url: str = "https://api.inworld.ai/tts/v1/voice:stream", sample_rate: Optional[int] = 48000, @@ -305,6 +305,7 @@ class InworldHttpTTSService(TTSService): 4. Yields audio frames for immediate playback in the pipeline Technical Details: + - Uses HTTP streaming with JSON line-by-line responses - Each JSON line contains base64-encoded audio data - Implements buffering to handle partial JSON lines @@ -334,7 +335,7 @@ class InworldHttpTTSService(TTSService): "audio_config": self._settings[ "audio_config" ], # Audio format settings (LINEAR16, 48kHz) - "language": self._settings["language"], # Language code (en, es, etc.) + # "language": self._settings["language"], # Language code (en, es, etc.) } # Set up HTTP headers for authentication and content type From 8eda2435a2b523af70d95d5c6647baed1db7c7b1 Mon Sep 17 00:00:00 2001 From: padillamt Date: Mon, 21 Jul 2025 13:24:10 -0700 Subject: [PATCH 14/38] inworld: removed explicit references to language since our models currently infer that from the text. --- src/pipecat/services/inworld/tts.py | 147 +++++++++------------------- 1 file changed, 45 insertions(+), 102 deletions(-) diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py index 51245ea09..e585943d8 100644 --- a/src/pipecat/services/inworld/tts.py +++ b/src/pipecat/services/inworld/tts.py @@ -13,6 +13,7 @@ Key Features: - HTTP streaming API support for low-latency audio generation - Multiple voice options (Ashley, Hades, etc.) +- Automatic language detection from input text (no manual language setting required) - Real-time audio chunk processing with proper buffering - WAV header handling and audio format conversion - Comprehensive error handling and metrics tracking @@ -26,12 +27,16 @@ Technical Implementation: - Integrates with Pipecat's frame-based pipeline architecture Usage:: - tts = InworldHttpTTSService( - api_key=os.getenv("INWORLD_API_KEY"), - voice_id="Ashley", - model="inworld-tts-1", - aiohttp_session=session - ) + + async with aiohttp.ClientSession() as session: + tts = InworldHttpTTSService( + api_key=os.getenv("INWORLD_API_KEY"), + aiohttp_session=session, + params=InworldHttpTTSService.InputParams( + voice_id="Ashley", + model="inworld-tts-1", + ), + ) """ import base64 @@ -58,69 +63,12 @@ from pipecat.frames.frames import ( ) from pipecat.processors.frame_processor import FrameDirection from pipecat.services.tts_service import AudioContextWordTTSService, TTSService -from pipecat.transcriptions.language import Language from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator from pipecat.utils.text.base_text_aggregator import BaseTextAggregator from pipecat.utils.text.skip_tags_aggregator import SkipTagsAggregator from pipecat.utils.tracing.service_decorators import traced_tts -def language_to_inworld_language(language: Language) -> Optional[str]: - """Convert Pipecat's Language enum to Inworld's language code. - - Inworld AI supports a specific set of language codes for TTS synthesis. - This function maps Pipecat's standardized Language enum values to the - corresponding language codes expected by Inworld's API. - - Supported Languages: - - - EN (English) -> "en" - - ES (Spanish) -> "es" - - FR (French) -> "fr" - - KO (Korean) -> "ko" - - NL (Dutch) -> "nl" - - ZH (Chinese) -> "zh" - - The function also handles language variants (e.g., es-ES, en-US) by - extracting the base language code and mapping it if supported. - - Args: - language: The Language enum value to convert (e.g., Language.EN). - - Returns: - The corresponding Inworld language code string (e.g., "en"), - or None if the language is not supported by Inworld's API. - - Example: - >>> language_to_inworld_language(Language.EN) - "en" - >>> language_to_inworld_language(Language.ES) - "es" - >>> language_to_inworld_language(Language.DE) # Not supported - None - """ - BASE_LANGUAGES = { - Language.EN: "en", - Language.ES: "es", - Language.FR: "fr", - Language.KO: "ko", - Language.NL: "nl", - Language.ZH: "zh", - } - - result = BASE_LANGUAGES.get(language) - - # If not found in base languages, try to find the base language from a variant - if not result: - # Convert enum value to string and get the base language part (e.g. es-ES -> es) - lang_str = str(language.value) - base_code = lang_str.split("-")[0].lower() - # Look up the base code in our supported languages - result = base_code if base_code in BASE_LANGUAGES.values() else None - - return result - - class InworldHttpTTSService(TTSService): """Inworld AI HTTP-based Text-to-Speech Service. @@ -150,16 +98,26 @@ class InworldHttpTTSService(TTSService): - Voice Selection: Ashley, Hades, and other Inworld voices - Models: inworld-tts-1 and other available models - Audio Formats: LINEAR16 PCM at various sample rates - - Languages: English, Spanish, French, Korean, Dutch, Chinese + - Language Detection: Automatically inferred from input text (no explicit language setting required) Example Usage:: + async with aiohttp.ClientSession() as session: + # Using default settings (Ashley voice, inworld-tts-1 model) tts = InworldHttpTTSService( api_key=os.getenv("INWORLD_API_KEY"), - voice_id="Ashley", # Voice selection - model="inworld-tts-1", # TTS model - aiohttp_session=session, # Required HTTP session - sample_rate=48000, # Audio quality + aiohttp_session=session, + ) + + # Or with custom voice and model via params + params = InworldHttpTTSService.InputParams( + voice_id="Hades", + model="inworld-tts-1-max", + ) + tts = InworldHttpTTSService( + api_key=os.getenv("INWORLD_API_KEY"), + aiohttp_session=session, + params=params, ) """ @@ -167,19 +125,22 @@ class InworldHttpTTSService(TTSService): """Input parameters for Inworld HTTP TTS configuration. Parameters: - language: Language to use for synthesis. + voice_id: Voice selection for speech synthesis (e.g., "Ashley", "Hades"). + model: TTS model to use (e.g., "inworld-tts-1", "inworld-tts-1-max"). + + Note: + Language is automatically inferred from the input text by Inworld's TTS models, + so no explicit language parameter is required. """ - language: Optional[Language] = Language.EN + voice_id: Optional[str] = "Ashley" # defaults to the Ashley voice + model: Optional[str] = "inworld-tts-1" # defaults to the inworld-tts-1 model def __init__( self, *, api_key: str, aiohttp_session: aiohttp.ClientSession, - voice_id: str = "Ashley", - # language: Optional[Language] = Language.EN, - model: str = "inworld-tts-1", base_url: str = "https://api.inworld.ai/tts/v1/voice:stream", sample_rate: Optional[int] = 48000, encoding: str = "LINEAR16", @@ -196,13 +157,6 @@ class InworldHttpTTSService(TTSService): Get this from: Inworld Portal > Settings > API Keys > Runtime API Key aiohttp_session: Shared aiohttp session for HTTP requests. Must be provided for proper connection pooling and resource management. - voice_id: Voice to use for synthesis. Available options include: - - "Ashley" (default) - Natural female voice - - "Hades" - Distinctive character voice - - Other voices available through Inworld's voice catalog - model: TTS model to use. Currently supported: - - "inworld-tts-1" (default) - Latest high-quality model - - Other models as available in Inworld's API base_url: Base URL for Inworld HTTP API. Uses streaming endpoint by default. Should normally not be changed unless using a different environment. sample_rate: Audio sample rate in Hz. Common values: @@ -212,8 +166,11 @@ class InworldHttpTTSService(TTSService): encoding: Audio encoding format. Supported options: - "LINEAR16" (default) - Uncompressed PCM, best quality - Other formats as supported by Inworld API - params: Additional input parameters for advanced voice customization. - Usually None for standard usage. + params: Input parameters for voice and model configuration. Use this to specify: + - voice_id: Voice selection ("Ashley", "Hades", etc.) + - model: TTS model ("inworld-tts-1", "inworld-tts-1-max", etc.) + If None, uses default values (Ashley voice, inworld-tts-1 model). + Note: Language is automatically inferred from input text. **kwargs: Additional arguments passed to the parent TTSService class. Note: @@ -233,22 +190,19 @@ class InworldHttpTTSService(TTSService): # Build settings dictionary that matches Inworld's API expectations # This will be sent as JSON payload in each TTS request + # Note: Language is automatically inferred from text by Inworld's models self._settings = { - "voiceId": voice_id, # Voice selection (fixes bug where this was ignored) - "modelId": model, # TTS model selection + "voiceId": params.voice_id or "Ashley", # Voice selection from params + "modelId": params.model or "inworld-tts-1", # TTS model selection from params "audio_config": { # Audio format configuration "audio_encoding": encoding, # Format: LINEAR16, MP3, etc. "sample_rate_hertz": sample_rate, # Sample rate: 48000, 24000, etc. }, - # Language configuration with fallback to English - "language": self.language_to_service_language(params.language) - if params.language - else "en", } # Register voice and model with parent service for metrics and tracking - self.set_voice(voice_id) # Used for logging and metrics - self.set_model_name(model) # Used for performance tracking + self.set_voice(params.voice_id or "Ashley") # Used for logging and metrics + self.set_model_name(params.model or "inworld-tts-1") # Used for performance tracking def can_generate_metrics(self) -> bool: """Check if this service can generate processing metrics. @@ -258,17 +212,6 @@ class InworldHttpTTSService(TTSService): """ return True - def language_to_service_language(self, language: Language) -> Optional[str]: - """Convert a Language enum to Inworld language format. - - Args: - language: The language to convert. - - Returns: - The Inworld-specific language code, or None if not supported. - """ - return language_to_inworld_language(language) - async def start(self, frame: StartFrame): """Start the Inworld HTTP TTS service. @@ -328,6 +271,7 @@ class InworldHttpTTSService(TTSService): # ================================================================================ # Build the JSON payload according to Inworld's API specification # This matches the format shown in their documentation examples + # Note: Language is automatically inferred from the input text by Inworld's models payload = { "text": text, # Text to synthesize "voiceId": self._settings["voiceId"], # Voice selection (Ashley, Hades, etc.) @@ -335,7 +279,6 @@ class InworldHttpTTSService(TTSService): "audio_config": self._settings[ "audio_config" ], # Audio format settings (LINEAR16, 48kHz) - # "language": self._settings["language"], # Language code (en, es, etc.) } # Set up HTTP headers for authentication and content type From 4853d5d55cb0df19ba57ceecdd37f5e42b8acf04 Mon Sep 17 00:00:00 2001 From: padillamt Date: Mon, 21 Jul 2025 13:27:25 -0700 Subject: [PATCH 15/38] inworld: updated InworldHttpTTSService initialization --- examples/foundational/07aa-interruptible-inworld-http.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/foundational/07aa-interruptible-inworld-http.py b/examples/foundational/07aa-interruptible-inworld-http.py index 03622a2d4..c1c509ade 100644 --- a/examples/foundational/07aa-interruptible-inworld-http.py +++ b/examples/foundational/07aa-interruptible-inworld-http.py @@ -60,9 +60,11 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si tts = InworldHttpTTSService( api_key=os.getenv("INWORLD_API_KEY", ""), - voice_id="Ashley", - model="inworld-tts-1", aiohttp_session=session, + params=InworldHttpTTSService.InputParams( + voice_id="Ashley", + model="inworld-tts-1-max", + ), ) llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY")) From 0d5292c4efe43571aceefebab00565eaa40db237 Mon Sep 17 00:00:00 2001 From: padillamt Date: Mon, 21 Jul 2025 13:48:13 -0700 Subject: [PATCH 16/38] inworld: typo fix in voice name --- examples/foundational/07aa-interruptible-inworld-http.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/foundational/07aa-interruptible-inworld-http.py b/examples/foundational/07aa-interruptible-inworld-http.py index c1c509ade..887334b7d 100644 --- a/examples/foundational/07aa-interruptible-inworld-http.py +++ b/examples/foundational/07aa-interruptible-inworld-http.py @@ -63,7 +63,7 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si aiohttp_session=session, params=InworldHttpTTSService.InputParams( voice_id="Ashley", - model="inworld-tts-1-max", + model="inworld-tts-1", ), ) From 076a675a757eadb1ace6d265e9cb5f6093a62e7b Mon Sep 17 00:00:00 2001 From: padillamt Date: Mon, 21 Jul 2025 13:50:36 -0700 Subject: [PATCH 17/38] inworld: Fix...Set sample_rate=None in InworldHttpTTSService to match Cartesia pattern --- src/pipecat/services/inworld/tts.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py index e585943d8..6e1552744 100644 --- a/src/pipecat/services/inworld/tts.py +++ b/src/pipecat/services/inworld/tts.py @@ -142,7 +142,7 @@ class InworldHttpTTSService(TTSService): api_key: str, aiohttp_session: aiohttp.ClientSession, base_url: str = "https://api.inworld.ai/tts/v1/voice:stream", - sample_rate: Optional[int] = 48000, + sample_rate: Optional[int] = None, encoding: str = "LINEAR16", params: Optional[InputParams] = None, **kwargs, @@ -159,10 +159,8 @@ class InworldHttpTTSService(TTSService): for proper connection pooling and resource management. base_url: Base URL for Inworld HTTP API. Uses streaming endpoint by default. Should normally not be changed unless using a different environment. - sample_rate: Audio sample rate in Hz. Common values: - - 48000 (default) - High quality, suitable for most applications - - 24000 - Good quality, lower bandwidth - - 16000 - Basic quality, minimal bandwidth + sample_rate: Audio sample rate in Hz. If None, uses default from StartFrame. + Common values: 48000 (high quality), 24000 (good quality), 16000 (basic) encoding: Audio encoding format. Supported options: - "LINEAR16" (default) - Uncompressed PCM, best quality - Other formats as supported by Inworld API @@ -196,7 +194,7 @@ class InworldHttpTTSService(TTSService): "modelId": params.model or "inworld-tts-1", # TTS model selection from params "audio_config": { # Audio format configuration "audio_encoding": encoding, # Format: LINEAR16, MP3, etc. - "sample_rate_hertz": sample_rate, # Sample rate: 48000, 24000, etc. + "sample_rate_hertz": 0, # Will be set in start() from parent service }, } From 1915407ff7ab215abe14180dbf856d8e1190d1e9 Mon Sep 17 00:00:00 2001 From: padillamt Date: Mon, 21 Jul 2025 15:30:48 -0700 Subject: [PATCH 18/38] inworld: removed unreferenced is_first_chunk variable --- src/pipecat/services/inworld/tts.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py index 6e1552744..af90ad116 100644 --- a/src/pipecat/services/inworld/tts.py +++ b/src/pipecat/services/inworld/tts.py @@ -297,10 +297,6 @@ class InworldHttpTTSService(TTSService): # This allows downstream processors to prepare for incoming audio yield TTSStartedFrame() - # Flag to track if we're processing the first audio chunk - # Used for WAV header handling and debugging - is_first_chunk = True - # ================================================================================ # STEP 3: MAKE HTTP STREAMING REQUEST # ================================================================================ @@ -395,10 +391,6 @@ class InworldHttpTTSService(TTSService): # Remove the 44-byte WAV header to get pure audio data audio_data = audio_chunk[44:] - # Track that we've seen our first chunk (for debugging) - if is_first_chunk: - is_first_chunk = False - # ======================================================== # STEP 11: YIELD AUDIO FRAME TO PIPELINE # ======================================================== From f29024bcc01899d067c13e5edfe89a4977727571 Mon Sep 17 00:00:00 2001 From: padillamt Date: Wed, 23 Jul 2025 11:47:26 -0700 Subject: [PATCH 19/38] mtpadilla: update coments regarding temperature parameter --- src/pipecat/services/inworld/tts.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py index af90ad116..b1a020b8b 100644 --- a/src/pipecat/services/inworld/tts.py +++ b/src/pipecat/services/inworld/tts.py @@ -35,6 +35,7 @@ Usage:: params=InworldHttpTTSService.InputParams( voice_id="Ashley", model="inworld-tts-1", + temperature=0.8, # Optional: control synthesis variability (range: [0, 2]) ), ) """ @@ -109,10 +110,11 @@ class InworldHttpTTSService(TTSService): aiohttp_session=session, ) - # Or with custom voice and model via params + # Or with custom voice, model, and temperature via params params = InworldHttpTTSService.InputParams( voice_id="Hades", model="inworld-tts-1-max", + temperature=0.8, # Add variability to speech synthesis (range: [0, 2]) ) tts = InworldHttpTTSService( api_key=os.getenv("INWORLD_API_KEY"), @@ -124,9 +126,11 @@ class InworldHttpTTSService(TTSService): class InputParams(BaseModel): """Input parameters for Inworld HTTP TTS configuration. - Parameters: + Parameters: voice_id: Voice selection for speech synthesis (e.g., "Ashley", "Hades"). model: TTS model to use (e.g., "inworld-tts-1", "inworld-tts-1-max"). + temperature: Voice temperature control for synthesis variability (e.g., 0.8). + Valid range: [0, 2]. Higher values increase variability. Note: Language is automatically inferred from the input text by Inworld's TTS models, @@ -135,6 +139,7 @@ class InworldHttpTTSService(TTSService): voice_id: Optional[str] = "Ashley" # defaults to the Ashley voice model: Optional[str] = "inworld-tts-1" # defaults to the inworld-tts-1 model + temperature: Optional[float] = None # optional temperature control (range: [0, 2]) def __init__( self, @@ -167,6 +172,7 @@ class InworldHttpTTSService(TTSService): params: Input parameters for voice and model configuration. Use this to specify: - voice_id: Voice selection ("Ashley", "Hades", etc.) - model: TTS model ("inworld-tts-1", "inworld-tts-1-max", etc.) + - temperature: Voice temperature control for variability (range: [0, 2], e.g., 0.8, optional) If None, uses default values (Ashley voice, inworld-tts-1 model). Note: Language is automatically inferred from input text. **kwargs: Additional arguments passed to the parent TTSService class. @@ -198,6 +204,10 @@ class InworldHttpTTSService(TTSService): }, } + # Add optional temperature parameter if provided (valid range: [0, 2]) + if params.temperature is not None: + self._settings["temperature"] = params.temperature + # Register voice and model with parent service for metrics and tracking self.set_voice(params.voice_id or "Ashley") # Used for logging and metrics self.set_model_name(params.model or "inworld-tts-1") # Used for performance tracking @@ -279,6 +289,10 @@ class InworldHttpTTSService(TTSService): ], # Audio format settings (LINEAR16, 48kHz) } + # Add optional temperature parameter if configured (valid range: [0, 2]) + if "temperature" in self._settings: + payload["temperature"] = self._settings["temperature"] + # Set up HTTP headers for authentication and content type # Inworld requires Basic auth with base64-encoded API key headers = { From a5d353030ec83a79e40c647af7bcb4f46863e57b Mon Sep 17 00:00:00 2001 From: padillamt Date: Wed, 23 Jul 2025 12:02:58 -0700 Subject: [PATCH 20/38] mtpadilla: small formatting fix to comments --- src/pipecat/services/inworld/tts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py index b1a020b8b..1fd8e9a55 100644 --- a/src/pipecat/services/inworld/tts.py +++ b/src/pipecat/services/inworld/tts.py @@ -126,7 +126,7 @@ class InworldHttpTTSService(TTSService): class InputParams(BaseModel): """Input parameters for Inworld HTTP TTS configuration. - Parameters: + Parameters: voice_id: Voice selection for speech synthesis (e.g., "Ashley", "Hades"). model: TTS model to use (e.g., "inworld-tts-1", "inworld-tts-1-max"). temperature: Voice temperature control for synthesis variability (e.g., 0.8). From 147bf9cfe852e644ca35dacf02032db99348a4d9 Mon Sep 17 00:00:00 2001 From: padillamt Date: Wed, 23 Jul 2025 15:28:43 -0700 Subject: [PATCH 21/38] mtpadilla: addition of non-streaming option with own dedicated class, and related additional non-streaming test option --- .../07aa-interruptible-inworld-http.py | 34 +- src/pipecat/services/inworld/tts.py | 375 +++++++++++++++++- 2 files changed, 393 insertions(+), 16 deletions(-) diff --git a/examples/foundational/07aa-interruptible-inworld-http.py b/examples/foundational/07aa-interruptible-inworld-http.py index 887334b7d..65d7babca 100644 --- a/examples/foundational/07aa-interruptible-inworld-http.py +++ b/examples/foundational/07aa-interruptible-inworld-http.py @@ -16,7 +16,7 @@ from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext -from pipecat.services.inworld.tts import InworldHttpTTSService +from pipecat.services.inworld.tts import InworldHttpNonStreamingService, InworldHttpStreamingService from pipecat.services.openai.llm import OpenAILLMService from pipecat.services.openai.stt import OpenAISTTService from pipecat.transports.base_transport import BaseTransport, TransportParams @@ -58,14 +58,30 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si prompt="Expect words related to dogs, such as breed names.", ) - tts = InworldHttpTTSService( - api_key=os.getenv("INWORLD_API_KEY", ""), - aiohttp_session=session, - params=InworldHttpTTSService.InputParams( - voice_id="Ashley", - model="inworld-tts-1", - ), - ) + streaming = True + + if streaming: + # Streaming TTS - Real-time audio generation as text is processed + tts = InworldHttpStreamingService( + api_key=os.getenv("INWORLD_API_KEY", ""), + aiohttp_session=session, + params=InworldHttpStreamingService.InputParams( + voice_id="Ashley", + model="inworld-tts-1", + temperature=0.8, + ), + ) + else: + # Non-streaming TTS - Complete audio generation then playback + tts = InworldHttpNonStreamingService( + api_key=os.getenv("INWORLD_API_KEY", ""), + aiohttp_session=session, + params=InworldHttpNonStreamingService.InputParams( + voice_id="Ashley", + model="inworld-tts-1", + temperature=0.8, + ), + ) llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY")) diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py index 1fd8e9a55..3a70b7499 100644 --- a/src/pipecat/services/inworld/tts.py +++ b/src/pipecat/services/inworld/tts.py @@ -29,10 +29,10 @@ Technical Implementation: Usage:: async with aiohttp.ClientSession() as session: - tts = InworldHttpTTSService( + tts = InworldHttpStreamingService( api_key=os.getenv("INWORLD_API_KEY"), aiohttp_session=session, - params=InworldHttpTTSService.InputParams( + params=InworldHttpStreamingService.InputParams( voice_id="Ashley", model="inworld-tts-1", temperature=0.8, # Optional: control synthesis variability (range: [0, 2]) @@ -70,7 +70,7 @@ from pipecat.utils.text.skip_tags_aggregator import SkipTagsAggregator from pipecat.utils.tracing.service_decorators import traced_tts -class InworldHttpTTSService(TTSService): +class InworldHttpStreamingService(TTSService): """Inworld AI HTTP-based Text-to-Speech Service. This service integrates Inworld AI's high-quality TTS API with Pipecat's pipeline @@ -105,18 +105,18 @@ class InworldHttpTTSService(TTSService): async with aiohttp.ClientSession() as session: # Using default settings (Ashley voice, inworld-tts-1 model) - tts = InworldHttpTTSService( + tts = InworldHttpStreamingService( api_key=os.getenv("INWORLD_API_KEY"), aiohttp_session=session, ) # Or with custom voice, model, and temperature via params - params = InworldHttpTTSService.InputParams( + params = InworldHttpStreamingService.InputParams( voice_id="Hades", model="inworld-tts-1-max", temperature=0.8, # Add variability to speech synthesis (range: [0, 2]) ) - tts = InworldHttpTTSService( + tts = InworldHttpStreamingService( api_key=os.getenv("INWORLD_API_KEY"), aiohttp_session=session, params=params, @@ -185,7 +185,7 @@ class InworldHttpTTSService(TTSService): super().__init__(sample_rate=sample_rate, **kwargs) # Use provided params or create default configuration - params = params or InworldHttpTTSService.InputParams() + params = params or InworldHttpStreamingService.InputParams() # Store core configuration for API requests self._api_key = api_key # Authentication credentials @@ -446,3 +446,364 @@ class InworldHttpTTSService(TTSService): # Signal to pipeline that TTS generation is complete # This allows downstream processors to finalize audio processing yield TTSStoppedFrame() + + +class InworldHttpNonStreamingService(TTSService): + """Inworld AI HTTP-based Text-to-Speech Service (Non-Streaming). + + This service integrates with Inworld AI's non-streaming TTS API for simpler, + complete audio synthesis. Suitable for use cases where streaming is not required + and you prefer to receive the complete audio file at once. + + Key Features: + + - Simple HTTP request/response for complete audio synthesis + - Same voice options as streaming version (Ashley, Hades, etc.) + - High-quality audio output (48kHz LINEAR16 PCM) + - Automatic language detection from input text + - Support for temperature parameter for synthesis variability + - Lower complexity compared to streaming implementation + + Technical Architecture: + + - Uses aiohttp for single HTTP POST request + - Downloads complete audio as base64-encoded data + - Processes entire audio file and chunks for playback + - Integrates with Pipecat's frame-based pipeline system + + Usage:: + + async with aiohttp.ClientSession() as session: + # Using default settings (Ashley voice, inworld-tts-1 model) + tts = InworldHttpNonStreamingService( + api_key=os.getenv("INWORLD_API_KEY"), + aiohttp_session=session, + ) + + # Or with custom voice, model, and temperature + params = InworldHttpNonStreamingService.InputParams( + voice_id="Hades", + model="inworld-tts-1-max", + temperature=0.8, # Control synthesis variability (range: [0, 2]) + ) + tts = InworldHttpNonStreamingService( + api_key=os.getenv("INWORLD_API_KEY"), + aiohttp_session=session, + params=params, + ) + """ + + class InputParams(BaseModel): + """Input parameters for Inworld non-streaming TTS configuration. + + Parameters: + voice_id: Voice selection for speech synthesis (e.g., "Ashley", "Hades"). + model: TTS model to use (e.g., "inworld-tts-1", "inworld-tts-1-max"). + temperature: Voice temperature control for synthesis variability (e.g., 0.8). + Valid range: [0, 2]. Higher values increase variability. + + Note: + Language is automatically inferred from the input text by Inworld's TTS models, + so no explicit language parameter is required. + """ + + voice_id: Optional[str] = "Ashley" # defaults to the Ashley voice + model: Optional[str] = "inworld-tts-1" # defaults to the inworld-tts-1 model + temperature: Optional[float] = None # optional temperature control (range: [0, 2]) + + def __init__( + self, + *, + api_key: str, + aiohttp_session: Optional[aiohttp.ClientSession] = None, + base_url: str = "https://api.inworld.ai/tts/v1/voice", # Non-streaming endpoint + sample_rate: Optional[int] = None, + encoding: str = "LINEAR16", + params: Optional[InputParams] = None, + **kwargs, + ): + """Initialize the Inworld non-streaming TTS service. + + Sets up the TTS service with Inworld AI's non-streaming API configuration. + This constructor prepares all necessary parameters for complete audio synthesis. + + Args: + api_key: Inworld API key for authentication (base64-encoded from Inworld Portal). + Get this from: Inworld Portal > Settings > API Keys > Runtime API Key + aiohttp_session: Shared aiohttp session for HTTP requests. Must be provided + for proper connection pooling and resource management. + base_url: Base URL for Inworld non-streaming HTTP API. Uses non-streaming endpoint by default. + Should normally not be changed unless using a different environment. + sample_rate: Audio sample rate in Hz. If None, uses default from StartFrame. + Common values: 48000 (high quality), 24000 (good quality), 16000 (basic) + encoding: Audio encoding format. Supported options: + - "LINEAR16" (default) - Uncompressed PCM, best quality + - Other formats as supported by Inworld API + params: Input parameters for voice and model configuration. Use this to specify: + - voice_id: Voice selection ("Ashley", "Hades", etc.) + - model: TTS model ("inworld-tts-1", "inworld-tts-1-max", etc.) + - temperature: Voice temperature control for variability (range: [0, 2], e.g., 0.8, optional) + If None, uses default values (Ashley voice, inworld-tts-1 model). + Note: Language is automatically inferred from input text. + **kwargs: Additional arguments passed to the parent TTSService class. + + Note: + The aiohttp_session parameter is required because Inworld's HTTP API + benefits from connection reuse and proper async session management. + """ + # Initialize parent TTSService with audio configuration + super().__init__(sample_rate=sample_rate, **kwargs) + + # Use provided params or create default configuration + params = params or InworldHttpNonStreamingService.InputParams() + + # Store core configuration for API requests + self._api_key = api_key # Authentication credentials + self._session = aiohttp_session # HTTP session for requests (optional) + self._base_url = base_url # API endpoint URL + + # Build settings dictionary that matches Inworld's API expectations + # This will be sent as JSON payload in the TTS request + # Note: Language is automatically inferred from text by Inworld's models + self._settings = { + "voiceId": params.voice_id or "Ashley", # Voice selection from params + "modelId": params.model or "inworld-tts-1", # TTS model selection from params + "audio_config": { # Audio format configuration + "audio_encoding": encoding, # Format: LINEAR16, MP3, etc. + "sample_rate_hertz": 0, # Will be set in start() from parent service + }, + } + + # Add optional temperature parameter if provided (valid range: [0, 2]) + if params.temperature is not None: + self._settings["temperature"] = params.temperature + + # Register voice and model with parent service for metrics and tracking + self.set_voice(params.voice_id or "Ashley") # Used for logging and metrics + self.set_model_name(params.model or "inworld-tts-1") # Used for performance tracking + + def can_generate_metrics(self) -> bool: + """Check if this service can generate processing metrics. + + Returns: + True, as Inworld non-streaming service supports metrics generation. + """ + return True + + async def start(self, frame: StartFrame): + """Start the Inworld non-streaming TTS service. + + Args: + frame: The start frame containing initialization parameters. + """ + await super().start(frame) + self._settings["audio_config"]["sample_rate_hertz"] = self.sample_rate + + async def stop(self, frame: EndFrame): + """Stop the Inworld non-streaming TTS service. + + Args: + frame: The end frame. + """ + await super().stop(frame) + + async def cancel(self, frame: CancelFrame): + """Cancel the Inworld non-streaming TTS service. + + Args: + frame: The cancel frame. + """ + await super().cancel(frame) + + @traced_tts + async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: + """Generate speech from text using Inworld's non-streaming HTTP API. + + This method sends text to Inworld's non-streaming TTS endpoint and receives + the complete audio file as a base64-encoded response. The audio is then + chunked and yielded for playback in the pipeline. + + Args: + text: The text to synthesize into speech. + + Yields: + Frame: Audio frames containing the synthesized speech, plus control frames. + + Raises: + ErrorFrame: If API errors occur or audio processing fails. + """ + logger.debug(f"{self}: Generating TTS [{text}]") + + # ================================================================================ + # STEP 1: PREPARE API REQUEST + # ================================================================================ + # Build the JSON payload according to Inworld's non-streaming API specification + # This matches the format shown in their documentation examples + # Note: Language is automatically inferred from the input text by Inworld's models + payload = { + "text": text, # Text to synthesize + "voiceId": self._settings["voiceId"], # Voice selection (Ashley, Hades, etc.) + "modelId": self._settings["modelId"], # TTS model (inworld-tts-1) + "audio_config": self._settings["audio_config"], # Audio format settings + } + + # Add optional temperature parameter if configured (valid range: [0, 2]) + if "temperature" in self._settings: + payload["temperature"] = self._settings["temperature"] + + # Set up HTTP headers for authentication and content type + # Inworld requires Basic auth with base64-encoded API key + headers = { + "Authorization": f"Basic {self._api_key}", # Base64 API key from Inworld Portal + "Content-Type": "application/json", # JSON request body + } + + try: + # ================================================================================ + # STEP 2: INITIALIZE METRICS AND STREAMING + # ================================================================================ + # Start measuring Time To First Byte (TTFB) for performance tracking + await self.start_ttfb_metrics() + + # Signal to the pipeline that TTS generation has started + # This allows downstream processors to prepare for incoming audio + yield TTSStartedFrame() + + # ================================================================================ + # STEP 3: MAKE HTTP NON-STREAMING REQUEST + # ================================================================================ + # Make single HTTP POST request to Inworld's non-streaming endpoint + # This endpoint returns complete audio as base64-encoded data + # Create session if none was provided + if self._session: + session = self._session + else: + session = aiohttp.ClientSession() + + async with ( + session + if not self._session + else session.post( + self._base_url, json=payload, headers=headers + ) as context_or_response + ): + if self._session: + response = context_or_response + else: + async with context_or_response.post( + self._base_url, json=payload, headers=headers + ) as response: + # ================================================================ + # STEP 4: HANDLE HTTP ERRORS + # ================================================================ + # Check for API errors (expired keys, invalid requests, etc.) + if response.status != 200: + error_text = await response.text() + logger.error(f"Inworld API error: {error_text}") + await self.push_error(ErrorFrame(f"Inworld API error: {error_text}")) + return + + # ================================================================ + # STEP 5: PARSE COMPLETE JSON RESPONSE + # ================================================================ + # Parse the complete JSON response containing base64 audio data + response_data = await response.json() + + # ================================================================ + # STEP 6: EXTRACT AND VALIDATE AUDIO CONTENT + # ================================================================ + # Extract the base64-encoded audio content from response + if "audioContent" not in response_data: + logger.error("No audioContent in Inworld API response") + await self.push_error(ErrorFrame("No audioContent in response")) + return + + # ================================================================ + # STEP 7: DECODE AND PROCESS AUDIO DATA + # ================================================================ + # Decode the base64 audio data to binary + audio_data = base64.b64decode(response_data["audioContent"]) + + # Strip WAV header if present (Inworld may include WAV header) + # This prevents audio clicks and ensures clean audio playback + if len(audio_data) > 44 and audio_data.startswith(b"RIFF"): + audio_data = audio_data[44:] + + # ================================================================ + # STEP 8: START USAGE METRICS TRACKING + # ================================================================ + await self.start_tts_usage_metrics(text) + + # ================================================================ + # STEP 9: CHUNK AND YIELD AUDIO FOR PLAYBACK + # ================================================================ + # Chunk the complete audio for streaming playback + # This allows the pipeline to process audio in manageable pieces + CHUNK_SIZE = self.chunk_size + + for i in range(0, len(audio_data), CHUNK_SIZE): + chunk = audio_data[i : i + CHUNK_SIZE] + if len(chunk) > 0: + await self.stop_ttfb_metrics() + yield TTSAudioRawFrame( + audio=chunk, + sample_rate=self.sample_rate, + num_channels=1, + ) + + if self._session: + # Handle HTTP errors + if response.status != 200: + error_text = await response.text() + logger.error(f"Inworld API error: {error_text}") + await self.push_error(ErrorFrame(f"Inworld API error: {error_text}")) + return + + # Parse the complete JSON response + response_data = await response.json() + + # Extract the base64-encoded audio content + if "audioContent" not in response_data: + logger.error("No audioContent in Inworld API response") + await self.push_error(ErrorFrame("No audioContent in response")) + return + + # Decode the base64 audio data + audio_data = base64.b64decode(response_data["audioContent"]) + + # Strip WAV header if present (Inworld may include WAV header) + if len(audio_data) > 44 and audio_data.startswith(b"RIFF"): + audio_data = audio_data[44:] + + await self.start_tts_usage_metrics(text) + + # Chunk the complete audio for streaming playback + CHUNK_SIZE = self.chunk_size + + for i in range(0, len(audio_data), CHUNK_SIZE): + chunk = audio_data[i : i + CHUNK_SIZE] + if len(chunk) > 0: + await self.stop_ttfb_metrics() + yield TTSAudioRawFrame( + audio=chunk, + sample_rate=self.sample_rate, + num_channels=1, + ) + + except Exception as e: + # ================================================================================ + # STEP 10: ERROR HANDLING + # ================================================================================ + # Log any unexpected errors and notify the pipeline + logger.error(f"{self} exception: {e}") + await self.push_error(ErrorFrame(f"Error generating TTS: {e}")) + finally: + # ================================================================================ + # STEP 11: CLEANUP AND COMPLETION + # ================================================================================ + # Always stop metrics tracking, even if errors occurred + await self.stop_ttfb_metrics() + + # Signal to pipeline that TTS generation is complete + # This allows downstream processors to finalize audio processing + yield TTSStoppedFrame() From b6367965cbfb7de1f70e6ceeb4ca6eddc0c253d4 Mon Sep 17 00:00:00 2001 From: padillamt Date: Wed, 23 Jul 2025 16:50:32 -0700 Subject: [PATCH 22/38] mtpadilla: consolidate streaming and non-streaming options into a single class with common API, with boolean switch variable added (streaming) --- .../07aa-interruptible-inworld-http.py | 38 +- src/pipecat/services/inworld/tts.py | 749 +++++++----------- 2 files changed, 284 insertions(+), 503 deletions(-) diff --git a/examples/foundational/07aa-interruptible-inworld-http.py b/examples/foundational/07aa-interruptible-inworld-http.py index 65d7babca..dbfbcc878 100644 --- a/examples/foundational/07aa-interruptible-inworld-http.py +++ b/examples/foundational/07aa-interruptible-inworld-http.py @@ -16,7 +16,7 @@ from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext -from pipecat.services.inworld.tts import InworldHttpNonStreamingService, InworldHttpStreamingService +from pipecat.services.inworld.tts import InworldTTSService from pipecat.services.openai.llm import OpenAILLMService from pipecat.services.openai.stt import OpenAISTTService from pipecat.transports.base_transport import BaseTransport, TransportParams @@ -58,30 +58,20 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si prompt="Expect words related to dogs, such as breed names.", ) - streaming = True + # Inworld TTS Service - Unified streaming and non-streaming + # Set streaming=True for real-time audio, streaming=False for complete audio generation + streaming = False # Toggle this to switch between modes - if streaming: - # Streaming TTS - Real-time audio generation as text is processed - tts = InworldHttpStreamingService( - api_key=os.getenv("INWORLD_API_KEY", ""), - aiohttp_session=session, - params=InworldHttpStreamingService.InputParams( - voice_id="Ashley", - model="inworld-tts-1", - temperature=0.8, - ), - ) - else: - # Non-streaming TTS - Complete audio generation then playback - tts = InworldHttpNonStreamingService( - api_key=os.getenv("INWORLD_API_KEY", ""), - aiohttp_session=session, - params=InworldHttpNonStreamingService.InputParams( - voice_id="Ashley", - model="inworld-tts-1", - temperature=0.8, - ), - ) + tts = InworldTTSService( + api_key=os.getenv("INWORLD_API_KEY", ""), + aiohttp_session=session, + streaming=streaming, # True: real-time chunks, False: complete audio then playback + params=InworldTTSService.InputParams( + voice_id="Ashley", + model="inworld-tts-1", + temperature=0.8, + ), + ) llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY")) diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py index 3a70b7499..94ef5aa32 100644 --- a/src/pipecat/services/inworld/tts.py +++ b/src/pipecat/services/inworld/tts.py @@ -7,11 +7,12 @@ """Inworld AI Text-to-Speech Service Implementation. This module provides integration with Inworld AI's HTTP-based TTS API, enabling -real-time text-to-speech synthesis with high-quality, natural-sounding voices. +both streaming and non-streaming text-to-speech synthesis with high-quality, +natural-sounding voices. Key Features: -- HTTP streaming API support for low-latency audio generation +- HTTP streaming and non-streaming API support for flexible audio generation - Multiple voice options (Ashley, Hades, etc.) - Automatic language detection from input text (no manual language setting required) - Real-time audio chunk processing with proper buffering @@ -20,8 +21,8 @@ Key Features: Technical Implementation: -- Uses aiohttp for HTTP streaming connections -- Implements JSON line-by-line parsing for streaming responses +- Uses aiohttp for HTTP connections +- Implements both JSON line-by-line parsing (streaming) and complete response (non-streaming) - Handles base64-encoded audio data with proper decoding - Manages audio continuity to prevent clicks and artifacts - Integrates with Pipecat's frame-based pipeline architecture @@ -29,15 +30,29 @@ Technical Implementation: Usage:: async with aiohttp.ClientSession() as session: - tts = InworldHttpStreamingService( + # Streaming mode (default) - real-time audio generation + tts = InworldTTSService( api_key=os.getenv("INWORLD_API_KEY"), aiohttp_session=session, - params=InworldHttpStreamingService.InputParams( + streaming=True, # Default + params=InworldTTSService.InputParams( voice_id="Ashley", model="inworld-tts-1", temperature=0.8, # Optional: control synthesis variability (range: [0, 2]) ), ) + + # Non-streaming mode - complete audio generation then playback + tts = InworldTTSService( + api_key=os.getenv("INWORLD_API_KEY"), + aiohttp_session=session, + streaming=False, + params=InworldTTSService.InputParams( + voice_id="Ashley", + model="inworld-tts-1", + temperature=0.8, + ), + ) """ import base64 @@ -70,27 +85,30 @@ from pipecat.utils.text.skip_tags_aggregator import SkipTagsAggregator from pipecat.utils.tracing.service_decorators import traced_tts -class InworldHttpStreamingService(TTSService): +class InworldTTSService(TTSService): """Inworld AI HTTP-based Text-to-Speech Service. - This service integrates Inworld AI's high-quality TTS API with Pipecat's pipeline - architecture. It provides real-time speech synthesis with natural-sounding voices - and low-latency streaming audio delivery. + This unified service integrates Inworld AI's high-quality TTS API with Pipecat's pipeline + architecture. It supports both streaming and non-streaming modes, providing flexible + speech synthesis with natural-sounding voices. Key Features: - - Real-time HTTP streaming for minimal latency + - **Streaming Mode**: Real-time HTTP streaming for minimal latency + - **Non-Streaming Mode**: Complete audio synthesis then chunked playback - Multiple voice options (Ashley, Hades, etc.) - High-quality audio output (48kHz LINEAR16 PCM) - Automatic audio format handling and header stripping - Comprehensive error handling and recovery - Built-in performance metrics and monitoring + - Unified interface for both modes Technical Architecture: - Uses aiohttp for non-blocking HTTP requests - - Implements JSON line-by-line streaming protocol - - Processes base64-encoded audio chunks in real-time + - **Streaming**: Implements JSON line-by-line streaming protocol + - **Non-Streaming**: Single HTTP POST with complete response + - Processes base64-encoded audio chunks in real-time or batch - Manages audio continuity to prevent artifacts - Integrates with Pipecat's frame-based pipeline system @@ -100,31 +118,38 @@ class InworldHttpStreamingService(TTSService): - Models: inworld-tts-1 and other available models - Audio Formats: LINEAR16 PCM at various sample rates - Language Detection: Automatically inferred from input text (no explicit language setting required) + - Mode Selection: streaming=True for real-time, streaming=False for complete synthesis Example Usage:: async with aiohttp.ClientSession() as session: - # Using default settings (Ashley voice, inworld-tts-1 model) - tts = InworldHttpStreamingService( + # Streaming mode (default) - Real-time audio generation + tts_streaming = InworldTTSService( api_key=os.getenv("INWORLD_API_KEY"), aiohttp_session=session, + streaming=True, # Default behavior + params=InworldTTSService.InputParams( + voice_id="Ashley", + model="inworld-tts-1", + temperature=0.8, # Add variability to speech synthesis (range: [0, 2]) + ), ) - # Or with custom voice, model, and temperature via params - params = InworldHttpStreamingService.InputParams( - voice_id="Hades", - model="inworld-tts-1-max", - temperature=0.8, # Add variability to speech synthesis (range: [0, 2]) - ) - tts = InworldHttpStreamingService( + # Non-streaming mode - Complete audio then playback + tts_complete = InworldTTSService( api_key=os.getenv("INWORLD_API_KEY"), aiohttp_session=session, - params=params, + streaming=False, + params=InworldTTSService.InputParams( + voice_id="Hades", + model="inworld-tts-1-max", + temperature=0.8, + ), ) """ class InputParams(BaseModel): - """Input parameters for Inworld HTTP TTS configuration. + """Input parameters for Inworld TTS configuration. Parameters: voice_id: Voice selection for speech synthesis (e.g., "Ashley", "Hades"). @@ -146,23 +171,29 @@ class InworldHttpStreamingService(TTSService): *, api_key: str, aiohttp_session: aiohttp.ClientSession, - base_url: str = "https://api.inworld.ai/tts/v1/voice:stream", + streaming: bool = True, + base_url: Optional[str] = None, sample_rate: Optional[int] = None, encoding: str = "LINEAR16", params: Optional[InputParams] = None, **kwargs, ): - """Initialize the Inworld HTTP TTS service. + """Initialize the Inworld TTS service. - Sets up the TTS service with Inworld AI's streaming API configuration. - This constructor prepares all necessary parameters for real-time speech synthesis. + Sets up the TTS service with Inworld AI's API configuration. + This constructor prepares all necessary parameters for speech synthesis. Args: api_key: Inworld API key for authentication (base64-encoded from Inworld Portal). Get this from: Inworld Portal > Settings > API Keys > Runtime API Key aiohttp_session: Shared aiohttp session for HTTP requests. Must be provided for proper connection pooling and resource management. - base_url: Base URL for Inworld HTTP API. Uses streaming endpoint by default. + streaming: Whether to use streaming mode (True) or non-streaming mode (False). + - True: Real-time audio chunks as they're generated (lower latency) + - False: Complete audio file generated first, then chunked for playback (simpler) + base_url: Base URL for Inworld HTTP API. If None, automatically selected based on streaming mode: + - Streaming: "https://api.inworld.ai/tts/v1/voice:stream" + - Non-streaming: "https://api.inworld.ai/tts/v1/voice" Should normally not be changed unless using a different environment. sample_rate: Audio sample rate in Hz. If None, uses default from StartFrame. Common values: 48000 (high quality), 24000 (good quality), 16000 (basic) @@ -185,11 +216,19 @@ class InworldHttpStreamingService(TTSService): super().__init__(sample_rate=sample_rate, **kwargs) # Use provided params or create default configuration - params = params or InworldHttpStreamingService.InputParams() + params = params or InworldTTSService.InputParams() # Store core configuration for API requests self._api_key = api_key # Authentication credentials self._session = aiohttp_session # HTTP session for requests + self._streaming = streaming # Streaming mode selection + + # Set base URL based on streaming mode if not provided + if base_url is None: + if streaming: + base_url = "https://api.inworld.ai/tts/v1/voice:stream" # Streaming endpoint + else: + base_url = "https://api.inworld.ai/tts/v1/voice" # Non-streaming endpoint self._base_url = base_url # API endpoint URL # Build settings dictionary that matches Inworld's API expectations @@ -216,12 +255,12 @@ class InworldHttpStreamingService(TTSService): """Check if this service can generate processing metrics. Returns: - True, as Inworld HTTP service supports metrics generation. + True, as Inworld TTS service supports metrics generation. """ return True async def start(self, frame: StartFrame): - """Start the Inworld HTTP TTS service. + """Start the Inworld TTS service. Args: frame: The start frame containing initialization parameters. @@ -230,7 +269,7 @@ class InworldHttpStreamingService(TTSService): self._settings["audio_config"]["sample_rate_hertz"] = self.sample_rate async def stop(self, frame: EndFrame): - """Stop the Inworld HTTP TTS service. + """Stop the Inworld TTS service. Args: frame: The end frame. @@ -238,7 +277,7 @@ class InworldHttpStreamingService(TTSService): await super().stop(frame) async def cancel(self, frame: CancelFrame): - """Cancel the Inworld HTTP TTS service. + """Cancel the Inworld TTS service. Args: frame: The cancel frame. @@ -247,21 +286,30 @@ class InworldHttpStreamingService(TTSService): @traced_tts async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: - """Generate speech from text using Inworld's streaming HTTP API. + """Generate speech from text using Inworld's HTTP API. - This is the core TTS processing function that: + This is the core TTS processing function that adapts its behavior based on the streaming mode: + + **Streaming Mode (streaming=True)**: 1. Sends text to Inworld's streaming TTS endpoint 2. Receives JSON-streamed audio chunks in real-time 3. Processes and cleans audio data (removes WAV headers, validates content) 4. Yields audio frames for immediate playback in the pipeline + **Non-Streaming Mode (streaming=False)**: + 1. Sends text to Inworld's non-streaming TTS endpoint + 2. Receives complete audio file as base64-encoded response + 3. Processes entire audio and chunks for playback + 4. Yields audio frames in manageable pieces + Technical Details: - - Uses HTTP streaming with JSON line-by-line responses - - Each JSON line contains base64-encoded audio data - - Implements buffering to handle partial JSON lines + - **Streaming**: Uses HTTP streaming with JSON line-by-line responses + - **Non-Streaming**: Single HTTP POST with complete JSON response + - Each audio chunk contains base64-encoded audio data + - Implements buffering to handle partial data (streaming mode) - Strips WAV headers to prevent audio artifacts/clicks - - Provides real-time audio streaming for low latency + - Provides optimized audio delivery for each mode Args: text: The text to synthesize into speech. @@ -272,7 +320,7 @@ class InworldHttpStreamingService(TTSService): Raises: ErrorFrame: If API errors occur or audio processing fails. """ - logger.debug(f"{self}: Generating TTS [{text}]") + logger.debug(f"{self}: Generating TTS [{text}] (streaming={self._streaming})") # ================================================================================ # STEP 1: PREPARE API REQUEST @@ -302,7 +350,7 @@ class InworldHttpStreamingService(TTSService): try: # ================================================================================ - # STEP 2: INITIALIZE METRICS AND STREAMING + # STEP 2: INITIALIZE METRICS AND PROCESSING # ================================================================================ # Start measuring Time To First Byte (TTFB) for performance tracking await self.start_ttfb_metrics() @@ -312,10 +360,10 @@ class InworldHttpStreamingService(TTSService): yield TTSStartedFrame() # ================================================================================ - # STEP 3: MAKE HTTP STREAMING REQUEST + # STEP 3: MAKE HTTP REQUEST (MODE-SPECIFIC) # ================================================================================ - # Use aiohttp's streaming POST to Inworld's streaming endpoint - # The endpoint returns JSON lines with audio chunks as they're generated + # Use aiohttp to make request to Inworld's endpoint + # Behavior differs based on streaming mode async with self._session.post( self._base_url, json=payload, headers=headers ) as response: @@ -330,115 +378,34 @@ class InworldHttpStreamingService(TTSService): return # ================================================================================ - # STEP 5: PROCESS STREAMING JSON RESPONSE + # STEP 5: PROCESS RESPONSE (MODE-SPECIFIC) # ================================================================================ - # Inworld streams JSON lines where each line contains audio data - # We need to buffer incoming data and process complete lines - - # Buffer to accumulate incoming text data - # This handles cases where JSON lines are split across HTTP chunks - buffer = "" - - # Read HTTP response in manageable chunks (1KB each) - # This prevents memory issues with large responses - async for chunk in response.content.iter_chunked(1024): - if not chunk: - continue - - # ============================================================================ - # STEP 6: BUFFER MANAGEMENT - # ============================================================================ - # Decode binary chunk to text and add to our line buffer - # Each chunk may contain partial JSON lines, so we need to accumulate - buffer += chunk.decode("utf-8") - - # ============================================================================ - # STEP 7: LINE-BY-LINE JSON PROCESSING - # ============================================================================ - # Process all complete lines in the buffer (lines ending with \n) - # Leave partial lines in buffer for next iteration - while "\n" in buffer: - # Split on first newline, keeping remainder in buffer - line, buffer = buffer.split("\n", 1) - line_str = line.strip() - - # Skip empty lines (common in streaming responses) - if not line_str: - continue - - try: - # ================================================================ - # STEP 8: PARSE JSON AND EXTRACT AUDIO - # ================================================================ - # Parse the JSON line - should contain audio data - chunk_data = json.loads(line_str) - - # Check if this line contains audio content - # Inworld's response format: {"result": {"audioContent": "base64data"}} - if "result" in chunk_data and "audioContent" in chunk_data["result"]: - # Decode base64 audio data to binary - audio_chunk = base64.b64decode(chunk_data["result"]["audioContent"]) - - # ======================================================== - # STEP 9: AUDIO DATA VALIDATION - # ======================================================== - # Skip empty audio chunks that could cause discontinuities - # Empty chunks can create gaps or clicks in audio playback - if not audio_chunk: - continue - - # Start with the raw audio data - audio_data = audio_chunk - - # ======================================================== - # STEP 10: WAV HEADER REMOVAL (CRITICAL FOR AUDIO QUALITY) - # ======================================================== - # Each audio chunk may have its own WAV header (44 bytes) - # These headers contain metadata and will sound like clicks if played - # We must strip them from EVERY chunk, not just the first one - if ( - len(audio_chunk) > 44 # Ensure chunk is large enough - and audio_chunk.startswith( - b"RIFF" - ) # Check for WAV header magic bytes - ): - # Remove the 44-byte WAV header to get pure audio data - audio_data = audio_chunk[44:] - - # ======================================================== - # STEP 11: YIELD AUDIO FRAME TO PIPELINE - # ======================================================== - # Only yield frames with actual audio content - # Empty frames can cause pipeline issues - if len(audio_data) > 0: - # Create Pipecat audio frame with processed audio data - yield TTSAudioRawFrame( - audio=audio_data, # Clean audio without headers - sample_rate=self.sample_rate, # Configured sample rate (48kHz) - num_channels=1, # Mono audio - ) - - except json.JSONDecodeError: - # Ignore malformed JSON lines - streaming can have partial data - # This is normal in HTTP streaming scenarios - continue + # Choose processing method based on streaming mode + if self._streaming: + # Stream processing: JSON line-by-line with real-time audio + async for frame in self._process_streaming_response(response): + yield frame + else: + # Non-stream processing: Complete JSON response with batch audio + async for frame in self._process_non_streaming_response(response): + yield frame # ================================================================================ - # STEP 12: FINALIZE METRICS AND CLEANUP + # STEP 6: FINALIZE METRICS AND CLEANUP # ================================================================================ # Start usage metrics tracking after successful completion await self.start_tts_usage_metrics(text) except Exception as e: # ================================================================================ - # STEP 13: ERROR HANDLING + # STEP 7: ERROR HANDLING # ================================================================================ # Log any unexpected errors and notify the pipeline logger.error(f"{self} exception: {e}") await self.push_error(ErrorFrame(f"Error generating TTS: {e}")) finally: # ================================================================================ - # STEP 14: CLEANUP AND COMPLETION + # STEP 8: CLEANUP AND COMPLETION # ================================================================================ # Always stop metrics tracking, even if errors occurred await self.stop_ttfb_metrics() @@ -447,363 +414,187 @@ class InworldHttpStreamingService(TTSService): # This allows downstream processors to finalize audio processing yield TTSStoppedFrame() + async def _process_streaming_response( + self, response: aiohttp.ClientResponse + ) -> AsyncGenerator[Frame, None]: + """Process streaming JSON response with real-time audio chunks. -class InworldHttpNonStreamingService(TTSService): - """Inworld AI HTTP-based Text-to-Speech Service (Non-Streaming). - - This service integrates with Inworld AI's non-streaming TTS API for simpler, - complete audio synthesis. Suitable for use cases where streaming is not required - and you prefer to receive the complete audio file at once. - - Key Features: - - - Simple HTTP request/response for complete audio synthesis - - Same voice options as streaming version (Ashley, Hades, etc.) - - High-quality audio output (48kHz LINEAR16 PCM) - - Automatic language detection from input text - - Support for temperature parameter for synthesis variability - - Lower complexity compared to streaming implementation - - Technical Architecture: - - - Uses aiohttp for single HTTP POST request - - Downloads complete audio as base64-encoded data - - Processes entire audio file and chunks for playback - - Integrates with Pipecat's frame-based pipeline system - - Usage:: - - async with aiohttp.ClientSession() as session: - # Using default settings (Ashley voice, inworld-tts-1 model) - tts = InworldHttpNonStreamingService( - api_key=os.getenv("INWORLD_API_KEY"), - aiohttp_session=session, - ) - - # Or with custom voice, model, and temperature - params = InworldHttpNonStreamingService.InputParams( - voice_id="Hades", - model="inworld-tts-1-max", - temperature=0.8, # Control synthesis variability (range: [0, 2]) - ) - tts = InworldHttpNonStreamingService( - api_key=os.getenv("INWORLD_API_KEY"), - aiohttp_session=session, - params=params, - ) - """ - - class InputParams(BaseModel): - """Input parameters for Inworld non-streaming TTS configuration. - - Parameters: - voice_id: Voice selection for speech synthesis (e.g., "Ashley", "Hades"). - model: TTS model to use (e.g., "inworld-tts-1", "inworld-tts-1-max"). - temperature: Voice temperature control for synthesis variability (e.g., 0.8). - Valid range: [0, 2]. Higher values increase variability. - - Note: - Language is automatically inferred from the input text by Inworld's TTS models, - so no explicit language parameter is required. - """ - - voice_id: Optional[str] = "Ashley" # defaults to the Ashley voice - model: Optional[str] = "inworld-tts-1" # defaults to the inworld-tts-1 model - temperature: Optional[float] = None # optional temperature control (range: [0, 2]) - - def __init__( - self, - *, - api_key: str, - aiohttp_session: Optional[aiohttp.ClientSession] = None, - base_url: str = "https://api.inworld.ai/tts/v1/voice", # Non-streaming endpoint - sample_rate: Optional[int] = None, - encoding: str = "LINEAR16", - params: Optional[InputParams] = None, - **kwargs, - ): - """Initialize the Inworld non-streaming TTS service. - - Sets up the TTS service with Inworld AI's non-streaming API configuration. - This constructor prepares all necessary parameters for complete audio synthesis. + This method handles Inworld's streaming endpoint response format: + - JSON lines containing base64-encoded audio chunks + - Real-time processing as data arrives + - Line buffering to handle partial JSON data Args: - api_key: Inworld API key for authentication (base64-encoded from Inworld Portal). - Get this from: Inworld Portal > Settings > API Keys > Runtime API Key - aiohttp_session: Shared aiohttp session for HTTP requests. Must be provided - for proper connection pooling and resource management. - base_url: Base URL for Inworld non-streaming HTTP API. Uses non-streaming endpoint by default. - Should normally not be changed unless using a different environment. - sample_rate: Audio sample rate in Hz. If None, uses default from StartFrame. - Common values: 48000 (high quality), 24000 (good quality), 16000 (basic) - encoding: Audio encoding format. Supported options: - - "LINEAR16" (default) - Uncompressed PCM, best quality - - Other formats as supported by Inworld API - params: Input parameters for voice and model configuration. Use this to specify: - - voice_id: Voice selection ("Ashley", "Hades", etc.) - - model: TTS model ("inworld-tts-1", "inworld-tts-1-max", etc.) - - temperature: Voice temperature control for variability (range: [0, 2], e.g., 0.8, optional) - If None, uses default values (Ashley voice, inworld-tts-1 model). - Note: Language is automatically inferred from input text. - **kwargs: Additional arguments passed to the parent TTSService class. - - Note: - The aiohttp_session parameter is required because Inworld's HTTP API - benefits from connection reuse and proper async session management. - """ - # Initialize parent TTSService with audio configuration - super().__init__(sample_rate=sample_rate, **kwargs) - - # Use provided params or create default configuration - params = params or InworldHttpNonStreamingService.InputParams() - - # Store core configuration for API requests - self._api_key = api_key # Authentication credentials - self._session = aiohttp_session # HTTP session for requests (optional) - self._base_url = base_url # API endpoint URL - - # Build settings dictionary that matches Inworld's API expectations - # This will be sent as JSON payload in the TTS request - # Note: Language is automatically inferred from text by Inworld's models - self._settings = { - "voiceId": params.voice_id or "Ashley", # Voice selection from params - "modelId": params.model or "inworld-tts-1", # TTS model selection from params - "audio_config": { # Audio format configuration - "audio_encoding": encoding, # Format: LINEAR16, MP3, etc. - "sample_rate_hertz": 0, # Will be set in start() from parent service - }, - } - - # Add optional temperature parameter if provided (valid range: [0, 2]) - if params.temperature is not None: - self._settings["temperature"] = params.temperature - - # Register voice and model with parent service for metrics and tracking - self.set_voice(params.voice_id or "Ashley") # Used for logging and metrics - self.set_model_name(params.model or "inworld-tts-1") # Used for performance tracking - - def can_generate_metrics(self) -> bool: - """Check if this service can generate processing metrics. - - Returns: - True, as Inworld non-streaming service supports metrics generation. - """ - return True - - async def start(self, frame: StartFrame): - """Start the Inworld non-streaming TTS service. - - Args: - frame: The start frame containing initialization parameters. - """ - await super().start(frame) - self._settings["audio_config"]["sample_rate_hertz"] = self.sample_rate - - async def stop(self, frame: EndFrame): - """Stop the Inworld non-streaming TTS service. - - Args: - frame: The end frame. - """ - await super().stop(frame) - - async def cancel(self, frame: CancelFrame): - """Cancel the Inworld non-streaming TTS service. - - Args: - frame: The cancel frame. - """ - await super().cancel(frame) - - @traced_tts - async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: - """Generate speech from text using Inworld's non-streaming HTTP API. - - This method sends text to Inworld's non-streaming TTS endpoint and receives - the complete audio file as a base64-encoded response. The audio is then - chunked and yielded for playback in the pipeline. - - Args: - text: The text to synthesize into speech. + response: The aiohttp response object from streaming endpoint. Yields: - Frame: Audio frames containing the synthesized speech, plus control frames. - - Raises: - ErrorFrame: If API errors occur or audio processing fails. + Frame: Audio frames as they're processed from the stream. """ - logger.debug(f"{self}: Generating TTS [{text}]") + # ================================================================================ + # STREAMING: PROCESS JSON LINE-BY-LINE RESPONSE + # ================================================================================ + # Inworld streams JSON lines where each line contains audio data + # We need to buffer incoming data and process complete lines + + # Buffer to accumulate incoming text data + # This handles cases where JSON lines are split across HTTP chunks + buffer = "" + + # Read HTTP response in manageable chunks (1KB each) + # This prevents memory issues with large responses + async for chunk in response.content.iter_chunked(1024): + if not chunk: + continue + + # ============================================================================ + # BUFFER MANAGEMENT + # ============================================================================ + # Decode binary chunk to text and add to our line buffer + # Each chunk may contain partial JSON lines, so we need to accumulate + buffer += chunk.decode("utf-8") + + # ============================================================================ + # LINE-BY-LINE JSON PROCESSING + # ============================================================================ + # Process all complete lines in the buffer (lines ending with \n) + # Leave partial lines in buffer for next iteration + while "\n" in buffer: + # Split on first newline, keeping remainder in buffer + line, buffer = buffer.split("\n", 1) + line_str = line.strip() + + # Skip empty lines (common in streaming responses) + if not line_str: + continue + + try: + # ================================================================ + # PARSE JSON AND EXTRACT AUDIO + # ================================================================ + # Parse the JSON line - should contain audio data + chunk_data = json.loads(line_str) + + # Check if this line contains audio content + # Inworld's response format: {"result": {"audioContent": "base64data"}} + if "result" in chunk_data and "audioContent" in chunk_data["result"]: + # Process the audio chunk + async for frame in self._process_audio_chunk( + base64.b64decode(chunk_data["result"]["audioContent"]) + ): + yield frame + + except json.JSONDecodeError: + # Ignore malformed JSON lines - streaming can have partial data + # This is normal in HTTP streaming scenarios + continue + + async def _process_non_streaming_response( + self, response: aiohttp.ClientResponse + ) -> AsyncGenerator[Frame, None]: + """Process complete JSON response with full audio content. + + This method handles Inworld's non-streaming endpoint response format: + - Single JSON response with complete base64-encoded audio + - Full audio download then chunked playback + - Simpler processing without line buffering + + Args: + response: The aiohttp response object from non-streaming endpoint. + + Yields: + Frame: Audio frames chunked from the complete audio. + """ + # ================================================================================ + # NON-STREAMING: PARSE COMPLETE JSON RESPONSE + # ================================================================================ + # Parse the complete JSON response containing base64 audio data + response_data = await response.json() # ================================================================================ - # STEP 1: PREPARE API REQUEST + # EXTRACT AND VALIDATE AUDIO CONTENT # ================================================================================ - # Build the JSON payload according to Inworld's non-streaming API specification - # This matches the format shown in their documentation examples - # Note: Language is automatically inferred from the input text by Inworld's models - payload = { - "text": text, # Text to synthesize - "voiceId": self._settings["voiceId"], # Voice selection (Ashley, Hades, etc.) - "modelId": self._settings["modelId"], # TTS model (inworld-tts-1) - "audio_config": self._settings["audio_config"], # Audio format settings - } + # Extract the base64-encoded audio content from response + if "audioContent" not in response_data: + logger.error("No audioContent in Inworld API response") + await self.push_error(ErrorFrame("No audioContent in response")) + return - # Add optional temperature parameter if configured (valid range: [0, 2]) - if "temperature" in self._settings: - payload["temperature"] = self._settings["temperature"] + # ================================================================================ + # DECODE AND PROCESS COMPLETE AUDIO DATA + # ================================================================================ + # Decode the base64 audio data to binary + audio_data = base64.b64decode(response_data["audioContent"]) - # Set up HTTP headers for authentication and content type - # Inworld requires Basic auth with base64-encoded API key - headers = { - "Authorization": f"Basic {self._api_key}", # Base64 API key from Inworld Portal - "Content-Type": "application/json", # JSON request body - } + # Strip WAV header if present (Inworld may include WAV header) + # This prevents audio clicks and ensures clean audio playback + if len(audio_data) > 44 and audio_data.startswith(b"RIFF"): + audio_data = audio_data[44:] - try: - # ================================================================================ - # STEP 2: INITIALIZE METRICS AND STREAMING - # ================================================================================ - # Start measuring Time To First Byte (TTFB) for performance tracking - await self.start_ttfb_metrics() + # ================================================================================ + # CHUNK AND YIELD COMPLETE AUDIO FOR PLAYBACK + # ================================================================================ + # Chunk the complete audio for streaming playback + # This allows the pipeline to process audio in manageable pieces + CHUNK_SIZE = self.chunk_size - # Signal to the pipeline that TTS generation has started - # This allows downstream processors to prepare for incoming audio - yield TTSStartedFrame() + for i in range(0, len(audio_data), CHUNK_SIZE): + chunk = audio_data[i : i + CHUNK_SIZE] + if len(chunk) > 0: + await self.stop_ttfb_metrics() + yield TTSAudioRawFrame( + audio=chunk, + sample_rate=self.sample_rate, + num_channels=1, + ) - # ================================================================================ - # STEP 3: MAKE HTTP NON-STREAMING REQUEST - # ================================================================================ - # Make single HTTP POST request to Inworld's non-streaming endpoint - # This endpoint returns complete audio as base64-encoded data - # Create session if none was provided - if self._session: - session = self._session - else: - session = aiohttp.ClientSession() + async def _process_audio_chunk(self, audio_chunk: bytes) -> AsyncGenerator[Frame, None]: + """Process a single audio chunk (common logic for both modes). - async with ( - session - if not self._session - else session.post( - self._base_url, json=payload, headers=headers - ) as context_or_response - ): - if self._session: - response = context_or_response - else: - async with context_or_response.post( - self._base_url, json=payload, headers=headers - ) as response: - # ================================================================ - # STEP 4: HANDLE HTTP ERRORS - # ================================================================ - # Check for API errors (expired keys, invalid requests, etc.) - if response.status != 200: - error_text = await response.text() - logger.error(f"Inworld API error: {error_text}") - await self.push_error(ErrorFrame(f"Inworld API error: {error_text}")) - return + This method handles audio chunk processing that's common to both streaming + and non-streaming modes: + - WAV header removal + - Audio validation + - Frame creation and yielding - # ================================================================ - # STEP 5: PARSE COMPLETE JSON RESPONSE - # ================================================================ - # Parse the complete JSON response containing base64 audio data - response_data = await response.json() + Args: + audio_chunk: Raw audio data bytes to process. - # ================================================================ - # STEP 6: EXTRACT AND VALIDATE AUDIO CONTENT - # ================================================================ - # Extract the base64-encoded audio content from response - if "audioContent" not in response_data: - logger.error("No audioContent in Inworld API response") - await self.push_error(ErrorFrame("No audioContent in response")) - return + Yields: + Frame: Audio frame if chunk contains valid audio data. + """ + # ======================================================== + # AUDIO DATA VALIDATION + # ======================================================== + # Skip empty audio chunks that could cause discontinuities + # Empty chunks can create gaps or clicks in audio playback + if not audio_chunk: + return - # ================================================================ - # STEP 7: DECODE AND PROCESS AUDIO DATA - # ================================================================ - # Decode the base64 audio data to binary - audio_data = base64.b64decode(response_data["audioContent"]) + # Start with the raw audio data + audio_data = audio_chunk - # Strip WAV header if present (Inworld may include WAV header) - # This prevents audio clicks and ensures clean audio playback - if len(audio_data) > 44 and audio_data.startswith(b"RIFF"): - audio_data = audio_data[44:] + # ======================================================== + # WAV HEADER REMOVAL (CRITICAL FOR AUDIO QUALITY) + # ======================================================== + # Each audio chunk may have its own WAV header (44 bytes) + # These headers contain metadata and will sound like clicks if played + # We must strip them from EVERY chunk, not just the first one + if ( + len(audio_chunk) > 44 # Ensure chunk is large enough + and audio_chunk.startswith(b"RIFF") # Check for WAV header magic bytes + ): + # Remove the 44-byte WAV header to get pure audio data + audio_data = audio_chunk[44:] - # ================================================================ - # STEP 8: START USAGE METRICS TRACKING - # ================================================================ - await self.start_tts_usage_metrics(text) - - # ================================================================ - # STEP 9: CHUNK AND YIELD AUDIO FOR PLAYBACK - # ================================================================ - # Chunk the complete audio for streaming playback - # This allows the pipeline to process audio in manageable pieces - CHUNK_SIZE = self.chunk_size - - for i in range(0, len(audio_data), CHUNK_SIZE): - chunk = audio_data[i : i + CHUNK_SIZE] - if len(chunk) > 0: - await self.stop_ttfb_metrics() - yield TTSAudioRawFrame( - audio=chunk, - sample_rate=self.sample_rate, - num_channels=1, - ) - - if self._session: - # Handle HTTP errors - if response.status != 200: - error_text = await response.text() - logger.error(f"Inworld API error: {error_text}") - await self.push_error(ErrorFrame(f"Inworld API error: {error_text}")) - return - - # Parse the complete JSON response - response_data = await response.json() - - # Extract the base64-encoded audio content - if "audioContent" not in response_data: - logger.error("No audioContent in Inworld API response") - await self.push_error(ErrorFrame("No audioContent in response")) - return - - # Decode the base64 audio data - audio_data = base64.b64decode(response_data["audioContent"]) - - # Strip WAV header if present (Inworld may include WAV header) - if len(audio_data) > 44 and audio_data.startswith(b"RIFF"): - audio_data = audio_data[44:] - - await self.start_tts_usage_metrics(text) - - # Chunk the complete audio for streaming playback - CHUNK_SIZE = self.chunk_size - - for i in range(0, len(audio_data), CHUNK_SIZE): - chunk = audio_data[i : i + CHUNK_SIZE] - if len(chunk) > 0: - await self.stop_ttfb_metrics() - yield TTSAudioRawFrame( - audio=chunk, - sample_rate=self.sample_rate, - num_channels=1, - ) - - except Exception as e: - # ================================================================================ - # STEP 10: ERROR HANDLING - # ================================================================================ - # Log any unexpected errors and notify the pipeline - logger.error(f"{self} exception: {e}") - await self.push_error(ErrorFrame(f"Error generating TTS: {e}")) - finally: - # ================================================================================ - # STEP 11: CLEANUP AND COMPLETION - # ================================================================================ - # Always stop metrics tracking, even if errors occurred - await self.stop_ttfb_metrics() - - # Signal to pipeline that TTS generation is complete - # This allows downstream processors to finalize audio processing - yield TTSStoppedFrame() + # ======================================================== + # YIELD AUDIO FRAME TO PIPELINE + # ======================================================== + # Only yield frames with actual audio content + # Empty frames can cause pipeline issues + if len(audio_data) > 0: + # Create Pipecat audio frame with processed audio data + yield TTSAudioRawFrame( + audio=audio_data, # Clean audio without headers + sample_rate=self.sample_rate, # Configured sample rate (48kHz) + num_channels=1, # Mono audio + ) From da8c67114acd8a79f1a1f298f1a685c26ff5bad2 Mon Sep 17 00:00:00 2001 From: padillamt Date: Thu, 24 Jul 2025 13:35:29 -0700 Subject: [PATCH 23/38] mtpadilla: make streaming the default for example --- examples/foundational/07aa-interruptible-inworld-http.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/foundational/07aa-interruptible-inworld-http.py b/examples/foundational/07aa-interruptible-inworld-http.py index dbfbcc878..53e3c14b3 100644 --- a/examples/foundational/07aa-interruptible-inworld-http.py +++ b/examples/foundational/07aa-interruptible-inworld-http.py @@ -60,7 +60,7 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si # Inworld TTS Service - Unified streaming and non-streaming # Set streaming=True for real-time audio, streaming=False for complete audio generation - streaming = False # Toggle this to switch between modes + streaming = True # Toggle this to switch between modes tts = InworldTTSService( api_key=os.getenv("INWORLD_API_KEY", ""), From f6440ee6e174fc4eb9d60ff7ed1cb8ec21bccbac Mon Sep 17 00:00:00 2001 From: padillamt Date: Thu, 24 Jul 2025 13:36:40 -0700 Subject: [PATCH 24/38] mtpadilla: correct Examples header in comments --- src/pipecat/services/inworld/tts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py index 94ef5aa32..64d5d9fe9 100644 --- a/src/pipecat/services/inworld/tts.py +++ b/src/pipecat/services/inworld/tts.py @@ -27,7 +27,7 @@ Technical Implementation: - Manages audio continuity to prevent clicks and artifacts - Integrates with Pipecat's frame-based pipeline architecture -Usage:: +Examples:: async with aiohttp.ClientSession() as session: # Streaming mode (default) - real-time audio generation @@ -120,7 +120,7 @@ class InworldTTSService(TTSService): - Language Detection: Automatically inferred from input text (no explicit language setting required) - Mode Selection: streaming=True for real-time, streaming=False for complete synthesis - Example Usage:: + Examples:: async with aiohttp.ClientSession() as session: # Streaming mode (default) - Real-time audio generation From 81048ce43a78ee112b44d1a23ac9823f7976057e Mon Sep 17 00:00:00 2001 From: padillamt Date: Thu, 24 Jul 2025 20:42:29 -0700 Subject: [PATCH 25/38] mtpadilla: rename 07aa-interruptible-inworld-http.py to 07ab-interruptible-inworld-http.py --- ...uptible-inworld-http.py => 07ab-interruptible-inworld-http.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/foundational/{07aa-interruptible-inworld-http.py => 07ab-interruptible-inworld-http.py} (100%) diff --git a/examples/foundational/07aa-interruptible-inworld-http.py b/examples/foundational/07ab-interruptible-inworld-http.py similarity index 100% rename from examples/foundational/07aa-interruptible-inworld-http.py rename to examples/foundational/07ab-interruptible-inworld-http.py From 067f64389bef74a23d15904a4dc53d1b41b57cff Mon Sep 17 00:00:00 2001 From: padillamt Date: Thu, 24 Jul 2025 20:44:27 -0700 Subject: [PATCH 26/38] mtpadilla: no longer needed so making empty --- src/pipecat/services/inworld/__init__.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/pipecat/services/inworld/__init__.py b/src/pipecat/services/inworld/__init__.py index 910364d1b..8b1378917 100644 --- a/src/pipecat/services/inworld/__init__.py +++ b/src/pipecat/services/inworld/__init__.py @@ -1,13 +1 @@ -# -# Copyright (c) 2024–2025, Daily -# -# SPDX-License-Identifier: BSD 2-Clause License -# -# import sys - -# from pipecat.services import DeprecatedModuleProxy - -# from .tts import * - -# sys.modules[__name__] = DeprecatedModuleProxy(globals(), "inworld", "inworld.tts") From 662550cc5ea939c4864f0be2c88fcdcda335ffbe Mon Sep 17 00:00:00 2001 From: padillamt Date: Thu, 24 Jul 2025 21:05:22 -0700 Subject: [PATCH 27/38] mtpadilla: remove unused imports --- src/pipecat/services/inworld/tts.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py index 64d5d9fe9..6eac58e49 100644 --- a/src/pipecat/services/inworld/tts.py +++ b/src/pipecat/services/inworld/tts.py @@ -56,15 +56,12 @@ Examples:: """ import base64 -import io import json -import uuid -import warnings -from typing import AsyncGenerator, List, Optional, Union +from typing import AsyncGenerator, Optional import aiohttp from loguru import logger -from pydantic import BaseModel, Field +from pydantic import BaseModel from pipecat.frames.frames import ( CancelFrame, @@ -72,16 +69,11 @@ from pipecat.frames.frames import ( ErrorFrame, Frame, StartFrame, - StartInterruptionFrame, TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame, ) -from pipecat.processors.frame_processor import FrameDirection -from pipecat.services.tts_service import AudioContextWordTTSService, TTSService -from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator -from pipecat.utils.text.base_text_aggregator import BaseTextAggregator -from pipecat.utils.text.skip_tags_aggregator import SkipTagsAggregator +from pipecat.services.tts_service import TTSService from pipecat.utils.tracing.service_decorators import traced_tts From d248c102c873d5933c4048cb1a12c384ec65b2ba Mon Sep 17 00:00:00 2001 From: padillamt Date: Thu, 24 Jul 2025 21:15:20 -0700 Subject: [PATCH 28/38] inworld: removal of unnecessary default assignment since already done Co-authored-by: Mark Backman --- src/pipecat/services/inworld/tts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py index 6eac58e49..b89b21d4a 100644 --- a/src/pipecat/services/inworld/tts.py +++ b/src/pipecat/services/inworld/tts.py @@ -227,7 +227,7 @@ class InworldTTSService(TTSService): # This will be sent as JSON payload in each TTS request # Note: Language is automatically inferred from text by Inworld's models self._settings = { - "voiceId": params.voice_id or "Ashley", # Voice selection from params + "voiceId": params.voice_id, # Voice selection from params "modelId": params.model or "inworld-tts-1", # TTS model selection from params "audio_config": { # Audio format configuration "audio_encoding": encoding, # Format: LINEAR16, MP3, etc. From 16c20f3a997b48c98ed83570543f2071412f08c4 Mon Sep 17 00:00:00 2001 From: padillamt Date: Thu, 24 Jul 2025 21:15:34 -0700 Subject: [PATCH 29/38] inworld: removal of unnecessary default assignment since already done Co-authored-by: Mark Backman --- src/pipecat/services/inworld/tts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py index b89b21d4a..ce7e7592d 100644 --- a/src/pipecat/services/inworld/tts.py +++ b/src/pipecat/services/inworld/tts.py @@ -228,7 +228,7 @@ class InworldTTSService(TTSService): # Note: Language is automatically inferred from text by Inworld's models self._settings = { "voiceId": params.voice_id, # Voice selection from params - "modelId": params.model or "inworld-tts-1", # TTS model selection from params + "modelId": params.model, # TTS model selection from params "audio_config": { # Audio format configuration "audio_encoding": encoding, # Format: LINEAR16, MP3, etc. "sample_rate_hertz": 0, # Will be set in start() from parent service From 7483422bd9dace032b4fa1fb6af713e4d4f0942e Mon Sep 17 00:00:00 2001 From: padillamt Date: Thu, 24 Jul 2025 21:23:03 -0700 Subject: [PATCH 30/38] inworld: change set_voice uto use self._settings Co-authored-by: Mark Backman --- src/pipecat/services/inworld/tts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py index ce7e7592d..f664479b8 100644 --- a/src/pipecat/services/inworld/tts.py +++ b/src/pipecat/services/inworld/tts.py @@ -240,7 +240,7 @@ class InworldTTSService(TTSService): self._settings["temperature"] = params.temperature # Register voice and model with parent service for metrics and tracking - self.set_voice(params.voice_id or "Ashley") # Used for logging and metrics + self.set_voice(self._settings["voice_id"]) # Used for logging and metrics self.set_model_name(params.model or "inworld-tts-1") # Used for performance tracking def can_generate_metrics(self) -> bool: From 5fb1899aeb54c2567ba17440a4689ec2ac79272f Mon Sep 17 00:00:00 2001 From: padillamt Date: Thu, 24 Jul 2025 21:42:42 -0700 Subject: [PATCH 31/38] inworld: removal of unnecessary default assignment as already handled --- src/pipecat/services/inworld/tts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py index f664479b8..c26b0b15c 100644 --- a/src/pipecat/services/inworld/tts.py +++ b/src/pipecat/services/inworld/tts.py @@ -240,8 +240,8 @@ class InworldTTSService(TTSService): self._settings["temperature"] = params.temperature # Register voice and model with parent service for metrics and tracking - self.set_voice(self._settings["voice_id"]) # Used for logging and metrics - self.set_model_name(params.model or "inworld-tts-1") # Used for performance tracking + self.set_voice(self._settings["voiceId"]) # Used for logging and metrics + self.set_model_name(self._settings["modelId"]) # Used for performance tracking def can_generate_metrics(self) -> bool: """Check if this service can generate processing metrics. From f982ace4c5ad1e4d86a222997eeead189c1e1289 Mon Sep 17 00:00:00 2001 From: padillamt Date: Thu, 24 Jul 2025 21:56:01 -0700 Subject: [PATCH 32/38] inworld: removal of unnecessary setting of ssampling rate since matches default --- examples/foundational/07ab-interruptible-inworld-http.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/foundational/07ab-interruptible-inworld-http.py b/examples/foundational/07ab-interruptible-inworld-http.py index 53e3c14b3..25d05cefe 100644 --- a/examples/foundational/07ab-interruptible-inworld-http.py +++ b/examples/foundational/07ab-interruptible-inworld-http.py @@ -100,7 +100,6 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si task = PipelineTask( pipeline, params=PipelineParams( - audio_out_sample_rate=24000, enable_metrics=True, enable_usage_metrics=True, ), From acc5b9f2102c85dcd6aebc43796918cf2e7685fa Mon Sep 17 00:00:00 2001 From: padillamt Date: Thu, 24 Jul 2025 22:07:15 -0700 Subject: [PATCH 33/38] inworld: change to function that stops all processing metrics Co-authored-by: Mark Backman --- src/pipecat/services/inworld/tts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py index c26b0b15c..68ada9720 100644 --- a/src/pipecat/services/inworld/tts.py +++ b/src/pipecat/services/inworld/tts.py @@ -400,7 +400,7 @@ class InworldTTSService(TTSService): # STEP 8: CLEANUP AND COMPLETION # ================================================================================ # Always stop metrics tracking, even if errors occurred - await self.stop_ttfb_metrics() + await self.stop_all_metrics() # Signal to pipeline that TTS generation is complete # This allows downstream processors to finalize audio processing From 8e6679475962f9cca38a836968050c1421bbb26a Mon Sep 17 00:00:00 2001 From: padillamt Date: Thu, 24 Jul 2025 22:22:36 -0700 Subject: [PATCH 34/38] mtpadilla: switch to Deepgram ASR for lower latency --- examples/foundational/07ab-interruptible-inworld-http.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/examples/foundational/07ab-interruptible-inworld-http.py b/examples/foundational/07ab-interruptible-inworld-http.py index 25d05cefe..e0ae9f15c 100644 --- a/examples/foundational/07ab-interruptible-inworld-http.py +++ b/examples/foundational/07ab-interruptible-inworld-http.py @@ -16,9 +16,9 @@ from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext +from pipecat.services.deepgram.stt import DeepgramSTTService from pipecat.services.inworld.tts import InworldTTSService from pipecat.services.openai.llm import OpenAILLMService -from pipecat.services.openai.stt import OpenAISTTService from pipecat.transports.base_transport import BaseTransport, TransportParams from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams from pipecat.transports.services.daily import DailyParams @@ -52,11 +52,7 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si # Create an HTTP session async with aiohttp.ClientSession() as session: - stt = OpenAISTTService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4o-transcribe", - prompt="Expect words related to dogs, such as breed names.", - ) + stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) # Inworld TTS Service - Unified streaming and non-streaming # Set streaming=True for real-time audio, streaming=False for complete audio generation From 37361391d9b896bc47941fd284b7ec7079e60597 Mon Sep 17 00:00:00 2001 From: padillamt Date: Fri, 25 Jul 2025 09:16:56 -0700 Subject: [PATCH 35/38] mtpadilla: removed ability to set base_url via constructor, set internally based on streaming variable --- src/pipecat/services/inworld/tts.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py index 68ada9720..6a3d6aa46 100644 --- a/src/pipecat/services/inworld/tts.py +++ b/src/pipecat/services/inworld/tts.py @@ -164,7 +164,6 @@ class InworldTTSService(TTSService): api_key: str, aiohttp_session: aiohttp.ClientSession, streaming: bool = True, - base_url: Optional[str] = None, sample_rate: Optional[int] = None, encoding: str = "LINEAR16", params: Optional[InputParams] = None, @@ -183,10 +182,9 @@ class InworldTTSService(TTSService): streaming: Whether to use streaming mode (True) or non-streaming mode (False). - True: Real-time audio chunks as they're generated (lower latency) - False: Complete audio file generated first, then chunked for playback (simpler) - base_url: Base URL for Inworld HTTP API. If None, automatically selected based on streaming mode: - - Streaming: "https://api.inworld.ai/tts/v1/voice:stream" - - Non-streaming: "https://api.inworld.ai/tts/v1/voice" - Should normally not be changed unless using a different environment. + The base URL is automatically selected based on this mode: + - Streaming: "https://api.inworld.ai/tts/v1/voice:stream" + - Non-streaming: "https://api.inworld.ai/tts/v1/voice" sample_rate: Audio sample rate in Hz. If None, uses default from StartFrame. Common values: 48000 (high quality), 24000 (good quality), 16000 (basic) encoding: Audio encoding format. Supported options: @@ -215,13 +213,11 @@ class InworldTTSService(TTSService): self._session = aiohttp_session # HTTP session for requests self._streaming = streaming # Streaming mode selection - # Set base URL based on streaming mode if not provided - if base_url is None: - if streaming: - base_url = "https://api.inworld.ai/tts/v1/voice:stream" # Streaming endpoint - else: - base_url = "https://api.inworld.ai/tts/v1/voice" # Non-streaming endpoint - self._base_url = base_url # API endpoint URL + # Set base URL based on streaming mode + if streaming: + self._base_url = "https://api.inworld.ai/tts/v1/voice:stream" # Streaming endpoint + else: + self._base_url = "https://api.inworld.ai/tts/v1/voice" # Non-streaming endpoint # Build settings dictionary that matches Inworld's API expectations # This will be sent as JSON payload in each TTS request From 4a9bec5b353c9194a7deb3a58ef69da44a920a37 Mon Sep 17 00:00:00 2001 From: padillamt Date: Fri, 25 Jul 2025 11:14:20 -0700 Subject: [PATCH 36/38] mtpadilla: stop metrics at result chunk --- src/pipecat/services/inworld/tts.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py index 6a3d6aa46..3e3004c5a 100644 --- a/src/pipecat/services/inworld/tts.py +++ b/src/pipecat/services/inworld/tts.py @@ -466,6 +466,7 @@ class InworldTTSService(TTSService): # Inworld's response format: {"result": {"audioContent": "base64data"}} if "result" in chunk_data and "audioContent" in chunk_data["result"]: # Process the audio chunk + await self.stop_ttfb_metrics() async for frame in self._process_audio_chunk( base64.b64decode(chunk_data["result"]["audioContent"]) ): From e140bd6960f49fe7261fd06467097bf762e99eab Mon Sep 17 00:00:00 2001 From: padillamt Date: Fri, 25 Jul 2025 14:04:49 -0700 Subject: [PATCH 37/38] mtpadilla: moved model and voice id setting into the class constructor --- src/pipecat/services/inworld/tts.py | 49 +++++++++++++++-------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py index 3e3004c5a..fdd0d1a5c 100644 --- a/src/pipecat/services/inworld/tts.py +++ b/src/pipecat/services/inworld/tts.py @@ -34,10 +34,10 @@ Examples:: tts = InworldTTSService( api_key=os.getenv("INWORLD_API_KEY"), aiohttp_session=session, + voice_id="Ashley", + model="inworld-tts-1", streaming=True, # Default params=InworldTTSService.InputParams( - voice_id="Ashley", - model="inworld-tts-1", temperature=0.8, # Optional: control synthesis variability (range: [0, 2]) ), ) @@ -46,10 +46,10 @@ Examples:: tts = InworldTTSService( api_key=os.getenv("INWORLD_API_KEY"), aiohttp_session=session, + voice_id="Ashley", + model="inworld-tts-1", streaming=False, params=InworldTTSService.InputParams( - voice_id="Ashley", - model="inworld-tts-1", temperature=0.8, ), ) @@ -119,10 +119,10 @@ class InworldTTSService(TTSService): tts_streaming = InworldTTSService( api_key=os.getenv("INWORLD_API_KEY"), aiohttp_session=session, + voice_id="Ashley", + model="inworld-tts-1", streaming=True, # Default behavior params=InworldTTSService.InputParams( - voice_id="Ashley", - model="inworld-tts-1", temperature=0.8, # Add variability to speech synthesis (range: [0, 2]) ), ) @@ -131,21 +131,19 @@ class InworldTTSService(TTSService): tts_complete = InworldTTSService( api_key=os.getenv("INWORLD_API_KEY"), aiohttp_session=session, + voice_id="Hades", + model="inworld-tts-1-max", streaming=False, params=InworldTTSService.InputParams( - voice_id="Hades", - model="inworld-tts-1-max", temperature=0.8, ), ) """ class InputParams(BaseModel): - """Input parameters for Inworld TTS configuration. + """Optional input parameters for Inworld TTS configuration. Parameters: - voice_id: Voice selection for speech synthesis (e.g., "Ashley", "Hades"). - model: TTS model to use (e.g., "inworld-tts-1", "inworld-tts-1-max"). temperature: Voice temperature control for synthesis variability (e.g., 0.8). Valid range: [0, 2]. Higher values increase variability. @@ -154,8 +152,6 @@ class InworldTTSService(TTSService): so no explicit language parameter is required. """ - voice_id: Optional[str] = "Ashley" # defaults to the Ashley voice - model: Optional[str] = "inworld-tts-1" # defaults to the inworld-tts-1 model temperature: Optional[float] = None # optional temperature control (range: [0, 2]) def __init__( @@ -163,6 +159,8 @@ class InworldTTSService(TTSService): *, api_key: str, aiohttp_session: aiohttp.ClientSession, + voice_id: str = "Ashley", + model: str = "inworld-tts-1", streaming: bool = True, sample_rate: Optional[int] = None, encoding: str = "LINEAR16", @@ -179,6 +177,14 @@ class InworldTTSService(TTSService): Get this from: Inworld Portal > Settings > API Keys > Runtime API Key aiohttp_session: Shared aiohttp session for HTTP requests. Must be provided for proper connection pooling and resource management. + voice_id: Voice selection for speech synthesis. Common options include: + - "Ashley": Clear, professional female voice (default) + - "Hades": Deep, authoritative male voice + - And many more available in your Inworld account + model: TTS model to use for speech synthesis: + - "inworld-tts-1": Standard quality model (default) + - "inworld-tts-1-max": Higher quality model + - Other models as available in your Inworld account streaming: Whether to use streaming mode (True) or non-streaming mode (False). - True: Real-time audio chunks as they're generated (lower latency) - False: Complete audio file generated first, then chunked for playback (simpler) @@ -190,12 +196,9 @@ class InworldTTSService(TTSService): encoding: Audio encoding format. Supported options: - "LINEAR16" (default) - Uncompressed PCM, best quality - Other formats as supported by Inworld API - params: Input parameters for voice and model configuration. Use this to specify: - - voice_id: Voice selection ("Ashley", "Hades", etc.) - - model: TTS model ("inworld-tts-1", "inworld-tts-1-max", etc.) + params: Optional input parameters for additional configuration. Use this to specify: - temperature: Voice temperature control for variability (range: [0, 2], e.g., 0.8, optional) - If None, uses default values (Ashley voice, inworld-tts-1 model). - Note: Language is automatically inferred from input text. + Language is automatically inferred from input text. **kwargs: Additional arguments passed to the parent TTSService class. Note: @@ -223,8 +226,8 @@ class InworldTTSService(TTSService): # This will be sent as JSON payload in each TTS request # Note: Language is automatically inferred from text by Inworld's models self._settings = { - "voiceId": params.voice_id, # Voice selection from params - "modelId": params.model, # TTS model selection from params + "voiceId": voice_id, # Voice selection from direct parameter + "modelId": model, # TTS model selection from direct parameter "audio_config": { # Audio format configuration "audio_encoding": encoding, # Format: LINEAR16, MP3, etc. "sample_rate_hertz": 0, # Will be set in start() from parent service @@ -232,12 +235,12 @@ class InworldTTSService(TTSService): } # Add optional temperature parameter if provided (valid range: [0, 2]) - if params.temperature is not None: + if params and params.temperature is not None: self._settings["temperature"] = params.temperature # Register voice and model with parent service for metrics and tracking - self.set_voice(self._settings["voiceId"]) # Used for logging and metrics - self.set_model_name(self._settings["modelId"]) # Used for performance tracking + self.set_voice(voice_id) # Used for logging and metrics + self.set_model_name(model) # Used for performance tracking def can_generate_metrics(self) -> bool: """Check if this service can generate processing metrics. From b68f044ef7d73733aa057074ffa4cc2ff2c2c751 Mon Sep 17 00:00:00 2001 From: padillamt Date: Fri, 25 Jul 2025 15:13:43 -0700 Subject: [PATCH 38/38] mtpadilla: updated example to reflect parameter placement changes in base Inworld TTS class --- examples/foundational/07ab-interruptible-inworld-http.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/foundational/07ab-interruptible-inworld-http.py b/examples/foundational/07ab-interruptible-inworld-http.py index e0ae9f15c..5d559ba5a 100644 --- a/examples/foundational/07ab-interruptible-inworld-http.py +++ b/examples/foundational/07ab-interruptible-inworld-http.py @@ -61,10 +61,10 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si tts = InworldTTSService( api_key=os.getenv("INWORLD_API_KEY", ""), aiohttp_session=session, + voice_id="Ashley", + model="inworld-tts-1", streaming=streaming, # True: real-time chunks, False: complete audio then playback params=InworldTTSService.InputParams( - voice_id="Ashley", - model="inworld-tts-1", temperature=0.8, ), )