From 4ef5ac6f0c0290fc3b2c30d8e5de3b8cd4185695 Mon Sep 17 00:00:00 2001 From: filipi87 Date: Fri, 27 Mar 2026 11:33:32 -0300 Subject: [PATCH 1/4] InworldTTSService improvements. --- changelog/4167.fixed.md | 1 + src/pipecat/services/inworld/tts.py | 33 +++++++---------------------- 2 files changed, 9 insertions(+), 25 deletions(-) create mode 100644 changelog/4167.fixed.md diff --git a/changelog/4167.fixed.md b/changelog/4167.fixed.md new file mode 100644 index 000000000..2784d4a41 --- /dev/null +++ b/changelog/4167.fixed.md @@ -0,0 +1 @@ +- Fixed a word timestamp interleaving issue in `InworldTTSService` when processing multiple sentences. diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py index cc0350abf..be4ff050a 100644 --- a/src/pipecat/services/inworld/tts.py +++ b/src/pipecat/services/inworld/tts.py @@ -646,7 +646,7 @@ class InworldTTSService(WebsocketTTSService): super().__init__( push_text_frames=False, - push_stop_frames=True, + push_stop_frames=False, pause_frame_processing=True, sample_rate=sample_rate, aggregate_sentences=aggregate_sentences, @@ -742,21 +742,10 @@ class InworldTTSService(WebsocketTTSService): logger.trace(f"Flushing audio for context {flush_id}") await self._send_flush(flush_id) - async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM): - """Push a frame and handle state changes. - - Args: - frame: The frame to push. - direction: The direction to push the frame. - """ - await super().push_frame(frame, direction) - if isinstance(frame, (TTSStoppedFrame, InterruptionFrame)): - logger.trace( - f"{self}: Resetting timestamp tracking due to {type(frame).__name__} - " - f"cumulative_time was {self._cumulative_time}" - ) - self._cumulative_time = 0.0 - self._generation_end_time = 0.0 + def _reset_generation_timing(self): + """Reset the cumulative time and generation end time for a new generation.""" + self._cumulative_time = 0.0 + self._generation_end_time = 0.0 async def on_turn_context_created(self, context_id: str): """Eagerly open the context on the server when a new turn starts. @@ -815,8 +804,6 @@ class InworldTTSService(WebsocketTTSService): except Exception as e: await self.push_error(error_msg=f"Unknown error occurred: {e}", exception=e) self._sent_context_ids.discard(context_id) - self._cumulative_time = 0.0 - self._generation_end_time = 0.0 async def on_turn_context_completed(self): """Close the server-side context at end of turn. @@ -834,10 +821,6 @@ class InworldTTSService(WebsocketTTSService): await self._close_context(context_id) await super().on_audio_context_interrupted(context_id) - async def on_audio_context_completed(self, context_id: str): - """Callback invoked when an audio context has been completed.""" - await self._close_context(context_id) - async def _maybe_push_fallback_text(self, context_id: str): """Push the full text as fallback when no timestamps were received. @@ -966,8 +949,7 @@ class InworldTTSService(WebsocketTTSService): await self.remove_active_audio_context() self._websocket = None self._sent_context_ids.clear() - self._cumulative_time = 0.0 - self._generation_end_time = 0.0 + self._reset_generation_timing() self._context_texts.clear() self._contexts_with_timestamps.clear() await self._call_event_handler("on_disconnected") @@ -1053,7 +1035,7 @@ class InworldTTSService(WebsocketTTSService): # Handle context closed - context no longer exists on server if "contextClosed" in result: - logger.trace(f"{self}: Context closed on server: {ctx_id}") + logger.debug(f"{self}: Context closed on server: {ctx_id}") await self._maybe_push_fallback_text(ctx_id) await self.stop_ttfb_metrics() await self.append_to_audio_context(ctx_id, TTSStoppedFrame(context_id=ctx_id)) @@ -1174,6 +1156,7 @@ class InworldTTSService(WebsocketTTSService): try: if not self.audio_context_available(context_id): + self._reset_generation_timing() await self.create_audio_context(context_id) await self.start_ttfb_metrics() yield TTSStartedFrame(context_id=context_id) From b31bece617ab7699f327432a6fd2d69b4eca9683 Mon Sep 17 00:00:00 2001 From: filipi87 Date: Fri, 27 Mar 2026 12:06:21 -0300 Subject: [PATCH 2/4] Not trying to recreate the context. --- src/pipecat/services/inworld/tts.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py index be4ff050a..157130442 100644 --- a/src/pipecat/services/inworld/tts.py +++ b/src/pipecat/services/inworld/tts.py @@ -997,10 +997,6 @@ class InworldTTSService(WebsocketTTSService): # Handle context created confirmation if "contextCreated" in result: logger.trace(f"{self}: Context created on server: {ctx_id}") - # If the context isn't available recreate it (handles race conditions during interruption recovery). - elif ctx_id and not self.audio_context_available(ctx_id): - logger.trace(f"{self}: Recreating audio context for current context: {ctx_id}") - await self.create_audio_context(ctx_id) # Process audio chunk audio_chunk = result.get("audioChunk", {}) @@ -1148,7 +1144,7 @@ class InworldTTSService(WebsocketTTSService): Returns: An asynchronous generator of frames. """ - logger.debug(f"{self}: Generating WebSocket TTS [{text}]") + logger.debug(f"{self}: Generating WebSocket TTS [{text}, for context: {context_id}]") try: if not self._websocket or self._websocket.state is State.CLOSED: From e851f8c1d542c7948d7ea8bdafec769a23e68e44 Mon Sep 17 00:00:00 2001 From: filipi87 Date: Fri, 27 Mar 2026 12:11:35 -0300 Subject: [PATCH 3/4] Adding changelog entry for the fix. --- changelog/4167.fixed.2.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog/4167.fixed.2.md diff --git a/changelog/4167.fixed.2.md b/changelog/4167.fixed.2.md new file mode 100644 index 000000000..6b894cc7f --- /dev/null +++ b/changelog/4167.fixed.2.md @@ -0,0 +1 @@ +- Fixed an issue in `InworldTTSService` where, in cases of fast interruption, we would continue receiving audio from the previous context. \ No newline at end of file From e2870fc2ac08b2ac9cb04fac9a4459b72114f358 Mon Sep 17 00:00:00 2001 From: filipi87 Date: Fri, 27 Mar 2026 12:12:16 -0300 Subject: [PATCH 4/4] Changing to debug the log when we are not able to append audio to the context. --- src/pipecat/services/tts_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipecat/services/tts_service.py b/src/pipecat/services/tts_service.py index a65a5e244..1cd68540d 100644 --- a/src/pipecat/services/tts_service.py +++ b/src/pipecat/services/tts_service.py @@ -1223,7 +1223,7 @@ class TTSService(AIService): logger.trace(f"{self} appending audio {frame} to audio context {context_id}") await self._audio_contexts[context_id].put(frame) else: - logger.warning(f"{self} unable to append audio to context {context_id}") + logger.debug(f"{self} unable to append audio to context {context_id}") async def remove_audio_context(self, context_id: str): """Remove an existing audio context.