From 406f8b730b2e6ea61adf891e5fec2534069def02 Mon Sep 17 00:00:00 2001
From: Ian Lee <ian.lee@berkeley.edu>
Date: Thu, 7 May 2026 14:49:57 -0700
Subject: [PATCH] [inworld] default to using PCM encoding

* with PCM encoding, the server returns raw audio bytes without headers
---
 changelog/4446.change.md            | 1 +
 src/pipecat/services/inworld/tts.py | 8 ++++----
 2 files changed, 5 insertions(+), 4 deletions(-)
 create mode 100644 changelog/4446.change.md

diff --git a/changelog/4446.change.md b/changelog/4446.change.md
new file mode 100644
index 000000000..20efc50d8
--- /dev/null
+++ b/changelog/4446.change.md
@@ -0,0 +1 @@
+- Changed the default `encoding` of `InworldHttpTTSService` and `InworldTTSService` from `LINEAR16` to `PCM`, so the server returns raw audio bytes without headers by default.
\ No newline at end of file
diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
index 5c0194ff1..fc6b32f89 100644
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -94,7 +94,7 @@ class InworldHttpTTSService(TTSService):
     """Inworld AI HTTP-based TTS service.
 
     Supports both streaming and non-streaming modes via the `streaming` parameter.
-    Outputs LINEAR16 audio at configurable sample rates with word-level timestamps.
+    Outputs PCM audio at configurable sample rates with word-level timestamps.
     """
 
     Settings = InworldTTSSettings
@@ -125,7 +125,7 @@ class InworldHttpTTSService(TTSService):
         model: str | None = None,
         streaming: bool = True,
         sample_rate: int | None = None,
-        encoding: str = "LINEAR16",
+        encoding: str = "PCM",
         timestamp_transport_strategy: Literal["ASYNC", "SYNC"] | None = "ASYNC",
         params: InputParams | None = None,
         settings: Settings | None = None,
@@ -505,7 +505,7 @@ class InworldTTSService(WebsocketTTSService):
     """Inworld AI WebSocket-based TTS service.
 
     Uses bidirectional WebSocket for lower latency streaming. Supports multiple
-    independent audio contexts per connection (max 5). Outputs LINEAR16 audio
+    independent audio contexts per connection (max 5). Outputs PCM audio
     with word-level timestamps.
     """
 
@@ -548,7 +548,7 @@ class InworldTTSService(WebsocketTTSService):
         model: str | None = None,
         url: str = "wss://api.inworld.ai/tts/v1/voice:streamBidirectional",
         sample_rate: int | None = None,
-        encoding: str = "LINEAR16",
+        encoding: str = "PCM",
         auto_mode: bool | None = None,
         apply_text_normalization: str | None = None,
         timestamp_transport_strategy: Literal["ASYNC", "SYNC"] | None = "ASYNC",