From dfe5fec8f964ec21dd9dd543154c06b18cd6e22c Mon Sep 17 00:00:00 2001
From: Ian Lee <ian.lee@berkeley.edu>
Date: Thu, 12 Mar 2026 14:42:56 -0700
Subject: [PATCH] [inworld] prewarm context on llm response start

---
 changelog/4013.changed.md           |  1 +
 src/pipecat/services/inworld/tts.py | 37 +++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)
 create mode 100644 changelog/4013.changed.md

diff --git a/changelog/4013.changed.md b/changelog/4013.changed.md
new file mode 100644
index 000000000..4f8a37680
--- /dev/null
+++ b/changelog/4013.changed.md
@@ -0,0 +1 @@
+- Added context prewarming path for `InworldTTSService` to improve first audio latency
\ No newline at end of file
diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
index 334c3c617..f6d0bf466 100644
--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -56,6 +56,7 @@ from pipecat.frames.frames import (
     ErrorFrame,
     Frame,
     InterruptionFrame,
+    LLMFullResponseStartFrame,
     StartFrame,
     TTSAudioRawFrame,
     TTSStartedFrame,
@@ -653,6 +654,11 @@ class InworldTTSService(WebsocketTTSService):
         # Track the end time of the last word in the current generation
         self._generation_end_time = 0.0
 
+        # Context ID that was pre-opened on the server during process_frame
+        # (LLMFullResponseStartFrame) to avoid context creation latency when
+        # enough context for TTS is available.
+        self._prewarmed_context_id: Optional[str] = None
+
         # Init-only config (not runtime-updatable).
         self._audio_encoding = encoding
         self._audio_sample_rate = 0  # Set in start()
@@ -726,6 +732,29 @@ class InworldTTSService(WebsocketTTSService):
             if isinstance(frame, TTSStoppedFrame):
                 await self.add_word_timestamps([("Reset", 0)])
 
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Process incoming frames and pre-open context on LLM response start.
+
+        Eagerly sends the context configuration to the server when
+        LLMFullResponseStartFrame arrives, so the context is ready by the time
+        enough context for TTS is available. The base class assigns ``_turn_context_id`` before
+        this runs, which is reused for all ``run_tts`` calls within the turn.
+        """
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, LLMFullResponseStartFrame):
+            if self._prewarmed_context_id:
+                try:
+                    await self._send_close_context(self._prewarmed_context_id)
+                except Exception as e:
+                    logger.warning(f"{self}: Failed to close previous prewarmed context: {e}")
+                self._prewarmed_context_id = None
+            try:
+                await self._send_context(self._turn_context_id)
+                self._prewarmed_context_id = self._turn_context_id
+            except Exception as e:
+                logger.warning(f"{self}: Failed to pre-open context: {e}")
+
     def _calculate_word_times(self, timestamp_info: Dict[str, Any]) -> List[Tuple[str, float]]:
         """Calculate word timestamps from Inworld WebSocket API response.
 
@@ -887,6 +916,7 @@ class InworldTTSService(WebsocketTTSService):
         finally:
             await self.remove_active_audio_context()
             self._websocket = None
+            self._prewarmed_context_id = None
             self._cumulative_time = 0.0
             self._generation_end_time = 0.0
             await self._call_event_handler("on_disconnected")
@@ -1001,9 +1031,16 @@ class InworldTTSService(WebsocketTTSService):
     async def _send_context(self, context_id: str):
         """Send a context to the Inworld WebSocket TTS service.
 
+        Skips the send if this context was already pre-opened on the server
+        (prewarmed during process_frame).
+
         Args:
             context_id: The context ID.
         """
+        if context_id == self._prewarmed_context_id:
+            self._prewarmed_context_id = None
+            return
+
         audio_config = {
             "audioEncoding": self._audio_encoding,
             "sampleRateHertz": self._audio_sample_rate,