From e8783f6a33fe4049aaedff2e2940cc9b0ba5b011 Mon Sep 17 00:00:00 2001
From: Adithya Suresh <adxthya@amazon.com>
Date: Mon, 24 Mar 2025 15:40:26 -0700
Subject: [PATCH 01/97] Handle cache token counts being none

---
 src/pipecat/services/anthropic/llm.py | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/src/pipecat/services/anthropic/llm.py b/src/pipecat/services/anthropic/llm.py
index 3e369075a..fd646b21c 100644
--- a/src/pipecat/services/anthropic/llm.py
+++ b/src/pipecat/services/anthropic/llm.py
@@ -253,14 +253,24 @@ class AnthropicLLMService(LLMService):
                         if hasattr(event.message.usage, "output_tokens")
                         else 0
                     )
-                    if hasattr(event.message.usage, "cache_creation_input_tokens"):
-                        cache_creation_input_tokens += (
-                            event.message.usage.cache_creation_input_tokens
+                    cache_creation_input_tokens += (
+                        event.message.usage.cache_creation_input_tokens
+                        if (
+                            hasattr(event.message.usage, "cache_creation_input_tokens")
+                            and event.message.usage.cache_creation_input_tokens is not None
                         )
-                        logger.debug(f"Cache creation input tokens: {cache_creation_input_tokens}")
-                    if hasattr(event.message.usage, "cache_read_input_tokens"):
-                        cache_read_input_tokens += event.message.usage.cache_read_input_tokens
-                        logger.debug(f"Cache read input tokens: {cache_read_input_tokens}")
+                        else 0
+                    )
+                    logger.debug(f"Cache creation input tokens: {cache_creation_input_tokens}")
+                    cache_read_input_tokens += (
+                        event.message.usage.cache_read_input_tokens
+                        if (
+                            hasattr(event.message.usage, "cache_read_input_tokens")
+                            and event.message.usage.cache_read_input_tokens is not None
+                        )
+                        else 0
+                    )
+                    logger.debug(f"Cache read input tokens: {cache_read_input_tokens}")
                     total_input_tokens = (
                         prompt_tokens + cache_creation_input_tokens + cache_read_input_tokens
                     )

From 855d567b1ef87c7e7df1bd8036f6ab446b81ff86 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= <aleix@daily.co>
Date: Mon, 5 May 2025 14:06:58 -0700
Subject: [PATCH 02/97] only send data to transports after they are really
 ready

---
 CHANGELOG.md                                  |  3 ++
 src/pipecat/transports/base_input.py          | 12 ++++---
 src/pipecat/transports/base_output.py         | 18 ++++++-----
 src/pipecat/transports/local/audio.py         |  4 +++
 src/pipecat/transports/local/tk.py            |  4 +++
 .../transports/network/fastapi_websocket.py   |  2 ++
 .../transports/network/small_webrtc.py        |  2 ++
 .../transports/network/websocket_client.py    |  2 ++
 .../transports/network/websocket_server.py    |  2 ++
 src/pipecat/transports/services/daily.py      | 31 ++++++++++++-------
 src/pipecat/transports/services/livekit.py    |  2 ++
 11 files changed, 58 insertions(+), 24 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 54cbd3cd6..95350157f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+- Fixed an issue that could cause data to be sent to the transports when they
+  were still not ready.
+
 - Remove custom audio tracks from `DailyTransport` before leaving.
 
 ## [0.0.66] - 2025-05-02
diff --git a/src/pipecat/transports/base_input.py b/src/pipecat/transports/base_input.py
index 51ebdb677..f9a27a6d3 100644
--- a/src/pipecat/transports/base_input.py
+++ b/src/pipecat/transports/base_input.py
@@ -122,6 +122,7 @@ class BaseInputTransport(FrameProcessor):
         # Configure VAD analyzer.
         if self._params.vad_analyzer:
             self._params.vad_analyzer.set_sample_rate(self._sample_rate)
+
         # Configure End of turn analyzer.
         if self._params.turn_analyzer:
             self._params.turn_analyzer.set_sample_rate(self._sample_rate)
@@ -129,10 +130,6 @@ class BaseInputTransport(FrameProcessor):
         # Start audio filter.
         if self._params.audio_in_filter:
             await self._params.audio_in_filter.start(self._sample_rate)
-        # Create audio input queue and task if needed.
-        if not self._audio_task and self._params.audio_in_enabled:
-            self._audio_in_queue = asyncio.Queue()
-            self._audio_task = self.create_task(self._audio_task_handler())
 
     async def stop(self, frame: EndFrame):
         # Cancel and wait for the audio input task to finish.
@@ -149,6 +146,13 @@ class BaseInputTransport(FrameProcessor):
             await self.cancel_task(self._audio_task)
             self._audio_task = None
 
+    async def set_transport_ready(self, frame: StartFrame):
+        """To be called when the transport is ready to stream."""
+        # Create audio input queue and task if needed.
+        if not self._audio_task and self._params.audio_in_enabled:
+            self._audio_in_queue = asyncio.Queue()
+            self._audio_task = self.create_task(self._audio_task_handler())
+
     async def push_audio_frame(self, frame: InputAudioRawFrame):
         if self._params.audio_in_enabled:
             await self._audio_in_queue.put(frame)
diff --git a/src/pipecat/transports/base_output.py b/src/pipecat/transports/base_output.py
index fa5d5e1c4..81492b84d 100644
--- a/src/pipecat/transports/base_output.py
+++ b/src/pipecat/transports/base_output.py
@@ -78,6 +78,16 @@ class BaseOutputTransport(FrameProcessor):
         audio_bytes_10ms = int(self._sample_rate / 100) * self._params.audio_out_channels * 2
         self._audio_chunk_size = audio_bytes_10ms * self._params.audio_out_10ms_chunks
 
+    async def stop(self, frame: EndFrame):
+        for _, sender in self._media_senders.items():
+            await sender.stop(frame)
+
+    async def cancel(self, frame: CancelFrame):
+        for _, sender in self._media_senders.items():
+            await sender.cancel(frame)
+
+    async def set_transport_ready(self, frame: StartFrame):
+        """To be called when the transport is ready to stream."""
         # Register destinations.
         for destination in self._params.audio_out_destinations:
             await self.register_audio_destination(destination)
@@ -112,14 +122,6 @@ class BaseOutputTransport(FrameProcessor):
             )
             await self._media_senders[destination].start(frame)
 
-    async def stop(self, frame: EndFrame):
-        for _, sender in self._media_senders.items():
-            await sender.stop(frame)
-
-    async def cancel(self, frame: CancelFrame):
-        for _, sender in self._media_senders.items():
-            await sender.cancel(frame)
-
     async def send_message(self, frame: TransportMessageFrame | TransportMessageUrgentFrame):
         pass
 
diff --git a/src/pipecat/transports/local/audio.py b/src/pipecat/transports/local/audio.py
index 8bfd7ee34..ba554c9e3 100644
--- a/src/pipecat/transports/local/audio.py
+++ b/src/pipecat/transports/local/audio.py
@@ -61,6 +61,8 @@ class LocalAudioInputTransport(BaseInputTransport):
         )
         self._in_stream.start_stream()
 
+        await self.set_transport_ready(frame)
+
     async def cleanup(self):
         await super().cleanup()
         if self._in_stream:
@@ -111,6 +113,8 @@ class LocalAudioOutputTransport(BaseOutputTransport):
         )
         self._out_stream.start_stream()
 
+        await self.set_transport_ready(frame)
+
     async def cleanup(self):
         await super().cleanup()
         if self._out_stream:
diff --git a/src/pipecat/transports/local/tk.py b/src/pipecat/transports/local/tk.py
index bed6371c2..4086497cb 100644
--- a/src/pipecat/transports/local/tk.py
+++ b/src/pipecat/transports/local/tk.py
@@ -68,6 +68,8 @@ class TkInputTransport(BaseInputTransport):
         )
         self._in_stream.start_stream()
 
+        await self.set_transport_ready(frame)
+
     async def cleanup(self):
         await super().cleanup()
         if self._in_stream:
@@ -124,6 +126,8 @@ class TkOutputTransport(BaseOutputTransport):
         )
         self._out_stream.start_stream()
 
+        await self.set_transport_ready(frame)
+
     async def cleanup(self):
         await super().cleanup()
         if self._out_stream:
diff --git a/src/pipecat/transports/network/fastapi_websocket.py b/src/pipecat/transports/network/fastapi_websocket.py
index 4a20bc49b..f04d56b0d 100644
--- a/src/pipecat/transports/network/fastapi_websocket.py
+++ b/src/pipecat/transports/network/fastapi_websocket.py
@@ -131,6 +131,7 @@ class FastAPIWebsocketInputTransport(BaseInputTransport):
         await self._client.trigger_client_connected()
         if not self._receive_task:
             self._receive_task = self.create_task(self._receive_messages())
+        await self.set_transport_ready(frame)
 
     async def _stop_tasks(self):
         if self._monitor_websocket_task:
@@ -204,6 +205,7 @@ class FastAPIWebsocketOutputTransport(BaseOutputTransport):
         await self._client.setup(frame)
         await self._params.serializer.setup(frame)
         self._send_interval = (self.audio_chunk_size / self.sample_rate) / 2
+        await self.set_transport_ready(frame)
 
     async def stop(self, frame: EndFrame):
         await super().stop(frame)
diff --git a/src/pipecat/transports/network/small_webrtc.py b/src/pipecat/transports/network/small_webrtc.py
index fdd501299..ffa3f441a 100644
--- a/src/pipecat/transports/network/small_webrtc.py
+++ b/src/pipecat/transports/network/small_webrtc.py
@@ -395,6 +395,7 @@ class SmallWebRTCInputTransport(BaseInputTransport):
             self._receive_audio_task = self.create_task(self._receive_audio())
         if not self._receive_video_task and self._params.video_in_enabled:
             self._receive_video_task = self.create_task(self._receive_video())
+        await self.set_transport_ready(frame)
 
     async def _stop_tasks(self):
         if self._receive_audio_task:
@@ -487,6 +488,7 @@ class SmallWebRTCOutputTransport(BaseOutputTransport):
         await super().start(frame)
         await self._client.setup(self._params, frame)
         await self._client.connect()
+        await self.set_transport_ready(frame)
 
     async def stop(self, frame: EndFrame):
         await super().stop(frame)
diff --git a/src/pipecat/transports/network/websocket_client.py b/src/pipecat/transports/network/websocket_client.py
index 7e9725a76..535a0ab21 100644
--- a/src/pipecat/transports/network/websocket_client.py
+++ b/src/pipecat/transports/network/websocket_client.py
@@ -136,6 +136,7 @@ class WebsocketClientInputTransport(BaseInputTransport):
         await self._params.serializer.setup(frame)
         await self._session.setup(frame)
         await self._session.connect()
+        await self.set_transport_ready(frame)
 
     async def stop(self, frame: EndFrame):
         await super().stop(frame)
@@ -186,6 +187,7 @@ class WebsocketClientOutputTransport(BaseOutputTransport):
         await self._params.serializer.setup(frame)
         await self._session.setup(frame)
         await self._session.connect()
+        await self.set_transport_ready(frame)
 
     async def stop(self, frame: EndFrame):
         await super().stop(frame)
diff --git a/src/pipecat/transports/network/websocket_server.py b/src/pipecat/transports/network/websocket_server.py
index b930f9fd6..7c8738871 100644
--- a/src/pipecat/transports/network/websocket_server.py
+++ b/src/pipecat/transports/network/websocket_server.py
@@ -83,6 +83,7 @@ class WebsocketServerInputTransport(BaseInputTransport):
         await self._params.serializer.setup(frame)
         if not self._server_task:
             self._server_task = self.create_task(self._server_task_handler())
+        await self.set_transport_ready(frame)
 
     async def stop(self, frame: EndFrame):
         await super().stop(frame)
@@ -195,6 +196,7 @@ class WebsocketServerOutputTransport(BaseOutputTransport):
         await super().start(frame)
         await self._params.serializer.setup(frame)
         self._send_interval = (self.audio_chunk_size / self.sample_rate) / 2
+        await self.set_transport_ready(frame)
 
     async def stop(self, frame: EndFrame):
         await super().stop(frame)
diff --git a/src/pipecat/transports/services/daily.py b/src/pipecat/transports/services/daily.py
index 3e43ddee1..9909d9336 100644
--- a/src/pipecat/transports/services/daily.py
+++ b/src/pipecat/transports/services/daily.py
@@ -944,19 +944,23 @@ class DailyInputTransport(BaseInputTransport):
             self._audio_in_task = self.create_task(self._audio_in_task_handler())
 
     async def start(self, frame: StartFrame):
-        # Setup client.
-        await self._client.setup(frame)
-
-        # Parent start.
-        await super().start(frame)
-
         if self._initialized:
             return
 
         self._initialized = True
 
+        # Parent start.
+        await super().start(frame)
+
+        # Setup client.
+        await self._client.setup(frame)
+
         # Join the room.
         await self._client.join()
+
+        # Indicate to the transport that we are connected.
+        await self.set_transport_ready(frame)
+
         if self._params.audio_in_stream_on_start:
             self.start_audio_in_streaming()
 
@@ -1125,20 +1129,23 @@ class DailyOutputTransport(BaseOutputTransport):
         self._initialized = False
 
     async def start(self, frame: StartFrame):
-        # Setup client.
-        await self._client.setup(frame)
-
-        # Parent start.
-        await super().start(frame)
-
         if self._initialized:
             return
 
         self._initialized = True
 
+        # Parent start.
+        await super().start(frame)
+
+        # Setup client.
+        await self._client.setup(frame)
+
         # Join the room.
         await self._client.join()
 
+        # Indicate to the transport that we are connected.
+        await self.set_transport_ready(frame)
+
     async def stop(self, frame: EndFrame):
         # Parent stop.
         await super().stop(frame)
diff --git a/src/pipecat/transports/services/livekit.py b/src/pipecat/transports/services/livekit.py
index 456a70ea6..36cc5d604 100644
--- a/src/pipecat/transports/services/livekit.py
+++ b/src/pipecat/transports/services/livekit.py
@@ -370,6 +370,7 @@ class LiveKitInputTransport(BaseInputTransport):
         await self._client.connect()
         if not self._audio_in_task and self._params.audio_in_enabled:
             self._audio_in_task = self.create_task(self._audio_in_task_handler())
+        await self.set_transport_ready(frame)
         logger.info("LiveKitInputTransport started")
 
     async def stop(self, frame: EndFrame):
@@ -441,6 +442,7 @@ class LiveKitOutputTransport(BaseOutputTransport):
         await super().start(frame)
         await self._client.setup(frame)
         await self._client.connect()
+        await self.set_transport_ready(frame)
         logger.info("LiveKitOutputTransport started")
 
     async def stop(self, frame: EndFrame):

From 9cc498b1fa17e2571092084d5dc6d3762fbe0bfd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= <aleix@daily.co>
Date: Mon, 5 May 2025 21:27:49 -0700
Subject: [PATCH 03/97] TaskManager: use a dictionary instead of a set to store
 tasks

---
 CHANGELOG.md                 |  2 ++
 src/pipecat/utils/asyncio.py | 15 ++++++++-------
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 95350157f..ac4e2db11 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+- Fixed a `TaskManager` that was causing dangling tasks to be reported.
+
 - Fixed an issue that could cause data to be sent to the transports when they
   were still not ready.
 
diff --git a/src/pipecat/utils/asyncio.py b/src/pipecat/utils/asyncio.py
index acc4acec8..cea447329 100644
--- a/src/pipecat/utils/asyncio.py
+++ b/src/pipecat/utils/asyncio.py
@@ -6,7 +6,7 @@
 
 import asyncio
 from abc import ABC, abstractmethod
-from typing import Coroutine, Optional, Set
+from typing import Coroutine, Dict, Optional, Sequence, Set
 
 from loguru import logger
 
@@ -69,14 +69,14 @@ class BaseTaskManager(ABC):
         pass
 
     @abstractmethod
-    def current_tasks(self) -> Set[asyncio.Task]:
+    def current_tasks(self) -> Sequence[asyncio.Task]:
         """Returns the list of currently created/registered tasks."""
         pass
 
 
 class TaskManager(BaseTaskManager):
     def __init__(self) -> None:
-        self._tasks: Set[asyncio.Task] = set()
+        self._tasks: Dict[str, asyncio.Task] = {}
         self._loop: Optional[asyncio.AbstractEventLoop] = None
 
     def set_event_loop(self, loop: asyncio.AbstractEventLoop):
@@ -179,16 +179,17 @@ class TaskManager(BaseTaskManager):
         finally:
             self._remove_task(task)
 
-    def current_tasks(self) -> Set[asyncio.Task]:
+    def current_tasks(self) -> Sequence[asyncio.Task]:
         """Returns the list of currently created/registered tasks."""
-        return self._tasks
+        return list(self._tasks.values())
 
     def _add_task(self, task: asyncio.Task):
-        self._tasks.add(task)
+        name = task.get_name()
+        self._tasks[name] = task
 
     def _remove_task(self, task: asyncio.Task):
         name = task.get_name()
         try:
-            self._tasks.remove(task)
+            del self._tasks[name]
         except KeyError as e:
             logger.trace(f"{name}: unable to remove task (already removed?): {e}")

From 45839053135f8f6dac67ea448fdd4f1909d17e95 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= <aleix@daily.co>
Date: Mon, 5 May 2025 21:33:21 -0700
Subject: [PATCH 04/97] PipelineTask: cleanup if task is cancelled from outside
 Pipecat

---
 CHANGELOG.md                 |  3 +++
 src/pipecat/pipeline/task.py | 31 ++++++++++++++++++++-----------
 2 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ac4e2db11..d8be39cdb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+- Fixed a `PipelineTask` issue that would cause tasks to not be cancelled if
+  task was cancelled from outside of Pipecat.
+
 - Fixed a `TaskManager` that was causing dangling tasks to be reported.
 
 - Fixed an issue that could cause data to be sent to the transports when they
diff --git a/src/pipecat/pipeline/task.py b/src/pipecat/pipeline/task.py
index 8279373cb..c40173899 100644
--- a/src/pipecat/pipeline/task.py
+++ b/src/pipecat/pipeline/task.py
@@ -286,12 +286,7 @@ class PipelineTask(BaseTask):
     async def cancel(self):
         """Stops the running pipeline immediately."""
         logger.debug(f"Canceling pipeline task {self}")
-        # Make sure everything is cleaned up downstream. This is sent
-        # out-of-band from the main streaming task which is what we want since
-        # we want to cancel right away.
-        await self._source.push_frame(CancelFrame())
-        # Only cancel the push task. Everything else will be cancelled in run().
-        await self._task_manager.cancel_task(self._process_push_task)
+        await self._cancel()
 
     async def run(self):
         """Starts and manages the pipeline execution until completion or cancellation."""
@@ -309,11 +304,17 @@ class PipelineTask(BaseTask):
             # well, because you get a CancelledError in every place you are
             # awaiting a task.
             pass
-        await self._cancel_tasks()
-        await self._cleanup(cleanup_pipeline)
-        if self._check_dangling_tasks:
-            self._print_dangling_tasks()
-        self._finished = True
+        finally:
+            # It's possible that we get an asyncio.CancelledError from the
+            # outside, if so we need to make sure everything gets cancelled
+            # properly.
+            if cleanup_pipeline:
+                await self._cancel()
+            await self._cancel_tasks()
+            await self._cleanup(cleanup_pipeline)
+            if self._check_dangling_tasks:
+                self._print_dangling_tasks()
+            self._finished = True
 
     async def queue_frame(self, frame: Frame):
         """Queue a single frame to be pushed down the pipeline.
@@ -336,6 +337,14 @@ class PipelineTask(BaseTask):
             for frame in frames:
                 await self.queue_frame(frame)
 
+    async def _cancel(self):
+        # Make sure everything is cleaned up downstream. This is sent
+        # out-of-band from the main streaming task which is what we want since
+        # we want to cancel right away.
+        await self._source.push_frame(CancelFrame())
+        # Only cancel the push task. Everything else will be cancelled in run().
+        await self._task_manager.cancel_task(self._process_push_task)
+
     async def _create_tasks(self):
         self._process_up_task = self._task_manager.create_task(
             self._process_up_queue(), f"{self}::_process_up_queue"

From e06146c23770c3611a436e12c3dfb5528fb7afee Mon Sep 17 00:00:00 2001
From: Mark Backman <mark@daily.co>
Date: Tue, 6 May 2025 11:06:57 -0400
Subject: [PATCH 05/97] Add enable_ssml_parsing to ElevenLabsTTSService

---
 CHANGELOG.md                           | 2 ++
 src/pipecat/services/elevenlabs/tts.py | 5 +++++
 2 files changed, 7 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 95350157f..1bbc5210d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Added `enable_ssml_parsing` to `InputParams` in `ElevenLabsTTSService`.
+
 - Added support to `RimeHttpTTSService` for the `arcana` model.
 
 ### Fixed
diff --git a/src/pipecat/services/elevenlabs/tts.py b/src/pipecat/services/elevenlabs/tts.py
index 4362fcdc9..ea89d1378 100644
--- a/src/pipecat/services/elevenlabs/tts.py
+++ b/src/pipecat/services/elevenlabs/tts.py
@@ -169,6 +169,7 @@ class ElevenLabsTTSService(InterruptibleWordTTSService):
         use_speaker_boost: Optional[bool] = None
         speed: Optional[float] = None
         auto_mode: Optional[bool] = True
+        enable_ssml_parsing: Optional[bool] = None
 
         @model_validator(mode="after")
         def validate_voice_settings(self):
@@ -227,6 +228,7 @@ class ElevenLabsTTSService(InterruptibleWordTTSService):
             "use_speaker_boost": params.use_speaker_boost,
             "speed": params.speed,
             "auto_mode": str(params.auto_mode).lower(),
+            "enable_ssml_parsing": params.enable_ssml_parsing,
         }
         self.set_model_name(model)
         self.set_voice(voice_id)
@@ -324,6 +326,9 @@ class ElevenLabsTTSService(InterruptibleWordTTSService):
             if self._settings["optimize_streaming_latency"]:
                 url += f"&optimize_streaming_latency={self._settings['optimize_streaming_latency']}"
 
+            if self._settings["enable_ssml_parsing"]:
+                url += f"&enable_ssml_parsing={self._settings['enable_ssml_parsing']}"
+
             # Language can only be used with the ELEVENLABS_MULTILINGUAL_MODELS
             language = self._settings["language"]
             if model in ELEVENLABS_MULTILINGUAL_MODELS and language is not None:

From 8691870bcb138ec971c01501486e09d4d14fa118 Mon Sep 17 00:00:00 2001
From: Mark Backman <mark@daily.co>
Date: Tue, 6 May 2025 11:29:32 -0400
Subject: [PATCH 06/97] Update Deepgram TTS default voice to Aura 2 voice

---
 CHANGELOG.md                                            | 4 ++++
 examples/foundational/07c-interruptible-deepgram-vad.py | 2 +-
 examples/foundational/07c-interruptible-deepgram.py     | 2 +-
 src/pipecat/services/deepgram/tts.py                    | 2 +-
 4 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d8be39cdb..f663976fa 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Added support to `RimeHttpTTSService` for the `arcana` model.
 
+### Changed
+
+- Updated the default voice for `DeepgramTTSService` to `aura-2-helena-en`.
+
 ### Fixed
 
 - Fixed a `PipelineTask` issue that would cause tasks to not be cancelled if
diff --git a/examples/foundational/07c-interruptible-deepgram-vad.py b/examples/foundational/07c-interruptible-deepgram-vad.py
index a6d6ab4bb..945cdc447 100644
--- a/examples/foundational/07c-interruptible-deepgram-vad.py
+++ b/examples/foundational/07c-interruptible-deepgram-vad.py
@@ -47,7 +47,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac
         live_options=LiveOptions(vad_events=True, utterance_end_ms="1000"),
     )
 
-    tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-helios-en")
+    tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-2-andromeda-en")
 
     llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
 
diff --git a/examples/foundational/07c-interruptible-deepgram.py b/examples/foundational/07c-interruptible-deepgram.py
index 3e02d8d77..2a707da4a 100644
--- a/examples/foundational/07c-interruptible-deepgram.py
+++ b/examples/foundational/07c-interruptible-deepgram.py
@@ -39,7 +39,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac
 
     stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
 
-    tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-helios-en")
+    tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-2-andromeda-en")
 
     llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
 
diff --git a/src/pipecat/services/deepgram/tts.py b/src/pipecat/services/deepgram/tts.py
index ec8a755a0..93c710f9e 100644
--- a/src/pipecat/services/deepgram/tts.py
+++ b/src/pipecat/services/deepgram/tts.py
@@ -30,7 +30,7 @@ class DeepgramTTSService(TTSService):
         self,
         *,
         api_key: str,
-        voice: str = "aura-helios-en",
+        voice: str = "aura-2-helena-en",
         base_url: str = "",
         sample_rate: Optional[int] = None,
         encoding: str = "linear16",

From 288f8865c8471adf5b4ec1706445099259b61af2 Mon Sep 17 00:00:00 2001
From: Mark Backman <mark@daily.co>
Date: Tue, 6 May 2025 12:13:26 -0400
Subject: [PATCH 07/97] Add enable_logging to ElevenLabsTTSService

---
 CHANGELOG.md                           | 3 ++-
 src/pipecat/services/elevenlabs/tts.py | 5 +++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1bbc5210d..013a84421 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,7 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
-- Added `enable_ssml_parsing` to `InputParams` in `ElevenLabsTTSService`.
+- Added `enable_ssml_parsing` and `enable_logging` to `InputParams` in
+  `ElevenLabsTTSService`.
 
 - Added support to `RimeHttpTTSService` for the `arcana` model.
 
diff --git a/src/pipecat/services/elevenlabs/tts.py b/src/pipecat/services/elevenlabs/tts.py
index ea89d1378..0a3d5d0d1 100644
--- a/src/pipecat/services/elevenlabs/tts.py
+++ b/src/pipecat/services/elevenlabs/tts.py
@@ -170,6 +170,7 @@ class ElevenLabsTTSService(InterruptibleWordTTSService):
         speed: Optional[float] = None
         auto_mode: Optional[bool] = True
         enable_ssml_parsing: Optional[bool] = None
+        enable_logging: Optional[bool] = None
 
         @model_validator(mode="after")
         def validate_voice_settings(self):
@@ -229,6 +230,7 @@ class ElevenLabsTTSService(InterruptibleWordTTSService):
             "speed": params.speed,
             "auto_mode": str(params.auto_mode).lower(),
             "enable_ssml_parsing": params.enable_ssml_parsing,
+            "enable_logging": params.enable_logging,
         }
         self.set_model_name(model)
         self.set_voice(voice_id)
@@ -329,6 +331,9 @@ class ElevenLabsTTSService(InterruptibleWordTTSService):
             if self._settings["enable_ssml_parsing"]:
                 url += f"&enable_ssml_parsing={self._settings['enable_ssml_parsing']}"
 
+            if self._settings["enable_logging"] is not None:
+                url += f"&enable_logging={str(self._settings['enable_logging']).lower()}"
+
             # Language can only be used with the ELEVENLABS_MULTILINGUAL_MODELS
             language = self._settings["language"]
             if model in ELEVENLABS_MULTILINGUAL_MODELS and language is not None:

From 0d30b000af2e20102b4286f7d540a9336e838e9f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= <aleix@daily.co>
Date: Mon, 5 May 2025 11:48:55 -0700
Subject: [PATCH 08/97] BaseObserver: add FramePushed class and deprecate
 multiple arguments

---
 CHANGELOG.md                                  |  8 +++
 examples/foundational/30-observer.py          | 20 +++----
 src/pipecat/observers/base_observer.py        | 56 ++++++++++++------
 .../observers/loggers/llm_log_observer.py     | 17 +++---
 .../loggers/transcription_log_observer.py     | 15 ++---
 src/pipecat/pipeline/task_observer.py         | 58 ++++++++-----------
 src/pipecat/processors/frame_processor.py     | 20 +++++--
 src/pipecat/processors/frameworks/rtvi.py     | 15 ++---
 src/pipecat/services/google/rtvi.py           | 18 +++---
 src/pipecat/tests/utils.py                    | 14 ++---
 10 files changed, 127 insertions(+), 114 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4e5130d7a..3ae2db0ae 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,8 +16,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changed
 
+- Observers `on_push_frame()` now take a single argument `FramePushed` instead
+  of multiple arguments.
+
 - Updated the default voice for `DeepgramTTSService` to `aura-2-helena-en`.
 
+### Deprecated
+
+- Observer `on_push_frame(src, dst, frame, direction, timestamp)` is now
+  deprecated, use `on_push_frame(data: FramePushed)` instead.
+
 ### Fixed
 
 - Fixed a `PipelineTask` issue that would cause tasks to not be cancelled if
diff --git a/examples/foundational/30-observer.py b/examples/foundational/30-observer.py
index d8c2ec100..46bd96e53 100644
--- a/examples/foundational/30-observer.py
+++ b/examples/foundational/30-observer.py
@@ -14,16 +14,15 @@ from pipecat.audio.vad.silero import SileroVADAnalyzer
 from pipecat.frames.frames import (
     BotStartedSpeakingFrame,
     BotStoppedSpeakingFrame,
-    Frame,
     StartInterruptionFrame,
 )
-from pipecat.observers.base_observer import BaseObserver
+from pipecat.observers.base_observer import BaseObserver, FramePushed
 from pipecat.observers.loggers.llm_log_observer import LLMLogObserver
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
-from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
+from pipecat.processors.frame_processor import FrameDirection
 from pipecat.services.cartesia.tts import CartesiaTTSService
 from pipecat.services.deepgram.stt import DeepgramSTTService
 from pipecat.services.openai.llm import OpenAILLMService
@@ -46,14 +45,13 @@ class DebugObserver(BaseObserver):
     Log format: [EVENT TYPE]: [source processor] → [destination processor] at [timestamp]s
     """
 
-    async def on_push_frame(
-        self,
-        src: FrameProcessor,
-        dst: FrameProcessor,
-        frame: Frame,
-        direction: FrameDirection,
-        timestamp: int,
-    ):
+    async def on_push_frame(self, data: FramePushed):
+        src = data.source
+        dst = data.destination
+        frame = data.frame
+        direction = data.direction
+        timestamp = data.timestamp
+
         # Convert timestamp to seconds for readability
         time_sec = timestamp / 1_000_000_000
 
diff --git a/src/pipecat/observers/base_observer.py b/src/pipecat/observers/base_observer.py
index 46f746946..f1a0c2a1b 100644
--- a/src/pipecat/observers/base_observer.py
+++ b/src/pipecat/observers/base_observer.py
@@ -5,9 +5,38 @@
 #
 
 from abc import ABC, abstractmethod
+from dataclasses import dataclass
+
+from typing_extensions import TYPE_CHECKING
 
 from pipecat.frames.frames import Frame
-from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
+
+if TYPE_CHECKING:
+    from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
+
+
+@dataclass
+class FramePushed:
+    """Represents an event where a frame is pushed from one processor to another
+    within the pipeline.
+
+    This data structure is typically used by observers to track the flow of
+    frames through the pipeline for logging, debugging, or analytics purposes.
+
+    Attributes:
+        source (FrameProcessor): The processor sending the frame.
+        destination (FrameProcessor): The processor receiving the frame.
+        frame (Frame): The frame being transferred.
+        direction (FrameDirection): The direction of the transfer (e.g., downstream or upstream).
+        timestamp (int): The time when the frame was pushed, based on the pipeline clock.
+
+    """
+
+    source: "FrameProcessor"
+    destination: "FrameProcessor"
+    frame: Frame
+    direction: "FrameDirection"
+    timestamp: int
 
 
 class BaseObserver(ABC):
@@ -19,26 +48,15 @@ class BaseObserver(ABC):
     """
 
     @abstractmethod
-    async def on_push_frame(
-        self,
-        src: FrameProcessor,
-        dst: FrameProcessor,
-        frame: Frame,
-        direction: FrameDirection,
-        timestamp: int,
-    ):
-        """Abstract method to handle the event when a frame is pushed from one
-        processor to another.
+    async def on_push_frame(self, data: FramePushed):
+        """Handle the event when a frame is pushed from one processor to another.
+
+        This method should be implemented by subclasses to define specific
+        behavior (e.g., logging, monitoring, debugging) when a frame is
+        transferred through the pipeline.
 
         Args:
-            src (FrameProcessor): The source frame processor that is sending the frame.
-            dst (FrameProcessor): The destination frame processor that will receive the frame.
-            frame (Frame): The frame being transferred between processors.
-            direction (FrameDirection): The direction of the frame transfer.
-            timestamp (int): The timestamp when the frame was pushed (based on the pipeline clock).
-
-        This method should be implemented by subclasses to define specific behavior
-        when a frame is pushed.
+            data (FramePushed): The event data containing details about the frame transfer.
 
         """
         pass
diff --git a/src/pipecat/observers/loggers/llm_log_observer.py b/src/pipecat/observers/loggers/llm_log_observer.py
index dd270abf5..9e4d53b28 100644
--- a/src/pipecat/observers/loggers/llm_log_observer.py
+++ b/src/pipecat/observers/loggers/llm_log_observer.py
@@ -15,7 +15,7 @@ from pipecat.frames.frames import (
     LLMMessagesFrame,
     LLMTextFrame,
 )
-from pipecat.observers.base_observer import BaseObserver
+from pipecat.observers.base_observer import BaseObserver, FramePushed
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContextFrame
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 from pipecat.services.llm_service import LLMService
@@ -38,14 +38,13 @@ class LLMLogObserver(BaseObserver):
 
     """
 
-    async def on_push_frame(
-        self,
-        src: FrameProcessor,
-        dst: FrameProcessor,
-        frame: Frame,
-        direction: FrameDirection,
-        timestamp: int,
-    ):
+    async def on_push_frame(self, data: FramePushed):
+        src = data.source
+        dst = data.destination
+        frame = data.frame
+        direction = data.direction
+        timestamp = data.timestamp
+
         if not isinstance(src, LLMService) and not isinstance(dst, LLMService):
             return
 
diff --git a/src/pipecat/observers/loggers/transcription_log_observer.py b/src/pipecat/observers/loggers/transcription_log_observer.py
index 4547ee54f..57e38c952 100644
--- a/src/pipecat/observers/loggers/transcription_log_observer.py
+++ b/src/pipecat/observers/loggers/transcription_log_observer.py
@@ -11,7 +11,7 @@ from pipecat.frames.frames import (
     InterimTranscriptionFrame,
     TranscriptionFrame,
 )
-from pipecat.observers.base_observer import BaseObserver
+from pipecat.observers.base_observer import BaseObserver, FramePushed
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 from pipecat.services.stt_service import STTService
 
@@ -29,14 +29,11 @@ class TranscriptionLogObserver(BaseObserver):
 
     """
 
-    async def on_push_frame(
-        self,
-        src: FrameProcessor,
-        dst: FrameProcessor,
-        frame: Frame,
-        direction: FrameDirection,
-        timestamp: int,
-    ):
+    async def on_push_frame(self, data: FramePushed):
+        src = data.source
+        frame = data.frame
+        timestamp = data.timestamp
+
         if not isinstance(src, STTService):
             return
 
diff --git a/src/pipecat/pipeline/task_observer.py b/src/pipecat/pipeline/task_observer.py
index dd805032c..252708f8c 100644
--- a/src/pipecat/pipeline/task_observer.py
+++ b/src/pipecat/pipeline/task_observer.py
@@ -5,13 +5,12 @@
 #
 
 import asyncio
+import inspect
 from typing import List
 
 from attr import dataclass
 
-from pipecat.frames.frames import Frame
-from pipecat.observers.base_observer import BaseObserver
-from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
+from pipecat.observers.base_observer import BaseObserver, FramePushed
 from pipecat.utils.asyncio import BaseTaskManager
 
 
@@ -27,20 +26,6 @@ class Proxy:
     observer: BaseObserver
 
 
-@dataclass
-class ObserverData:
-    """This is the data we receive from the main observer and that we put into a
-    proxy queue for later processing.
-
-    """
-
-    src: FrameProcessor
-    dst: FrameProcessor
-    frame: Frame
-    direction: FrameDirection
-    timestamp: int
-
-
 class TaskObserver(BaseObserver):
     """This is a pipeline frame observer that is meant to be used as a proxy to
     the user provided observers. That is, this is the observer that should be
@@ -68,20 +53,9 @@ class TaskObserver(BaseObserver):
         for proxy in self._proxies:
             await self._task_manager.cancel_task(proxy.task)
 
-    async def on_push_frame(
-        self,
-        src: FrameProcessor,
-        dst: FrameProcessor,
-        frame: Frame,
-        direction: FrameDirection,
-        timestamp: int,
-    ):
+    async def on_push_frame(self, data: FramePushed):
         for proxy in self._proxies:
-            await proxy.queue.put(
-                ObserverData(
-                    src=src, dst=dst, frame=frame, direction=direction, timestamp=timestamp
-                )
-            )
+            await proxy.queue.put(data)
 
     def _create_proxies(self, observers) -> List[Proxy]:
         proxies = []
@@ -96,8 +70,26 @@ class TaskObserver(BaseObserver):
         return proxies
 
     async def _proxy_task_handler(self, queue: asyncio.Queue, observer: BaseObserver):
+        warning_reported = False
         while True:
             data = await queue.get()
-            await observer.on_push_frame(
-                data.src, data.dst, data.frame, data.direction, data.timestamp
-            )
+
+            signature = inspect.signature(observer.on_push_frame)
+            if len(signature.parameters) > 1:
+                if not warning_reported:
+                    import warnings
+
+                    with warnings.catch_warnings():
+                        warnings.simplefilter("always")
+                        warnings.warn(
+                            "Observer `on_push_frame(source, destination, frame, direction, timestamp)` is deprecated, use `on_push_frame(data: FramePushed)` instead.",
+                            DeprecationWarning,
+                        )
+                    warning_reported = True
+                await observer.on_push_frame(
+                    data.source, data.destination, data.frame, data.direction, data.timestamp
+                )
+            else:
+                await observer.on_push_frame(data)
+
+            queue.task_done()
diff --git a/src/pipecat/processors/frame_processor.py b/src/pipecat/processors/frame_processor.py
index 590698e7f..97cc24378 100644
--- a/src/pipecat/processors/frame_processor.py
+++ b/src/pipecat/processors/frame_processor.py
@@ -21,6 +21,7 @@ from pipecat.frames.frames import (
     SystemFrame,
 )
 from pipecat.metrics.metrics import LLMTokenUsage, MetricsData
+from pipecat.observers.base_observer import FramePushed
 from pipecat.processors.metrics.frame_processor_metrics import FrameProcessorMetrics
 from pipecat.utils.asyncio import BaseTaskManager
 from pipecat.utils.base_object import BaseObject
@@ -294,17 +295,28 @@ class FrameProcessor(BaseObject):
             timestamp = self._clock.get_time() if self._clock else 0
             if direction == FrameDirection.DOWNSTREAM and self._next:
                 logger.trace(f"Pushing {frame} from {self} to {self._next}")
+
                 if self._observer:
-                    await self._observer.on_push_frame(
-                        self, self._next, frame, direction, timestamp
+                    data = FramePushed(
+                        source=self,
+                        destination=self._next,
+                        frame=frame,
+                        direction=direction,
+                        timestamp=timestamp,
                     )
+                    await self._observer.on_push_frame(data)
                 await self._next.queue_frame(frame, direction)
             elif direction == FrameDirection.UPSTREAM and self._prev:
                 logger.trace(f"Pushing {frame} upstream from {self} to {self._prev}")
                 if self._observer:
-                    await self._observer.on_push_frame(
-                        self, self._prev, frame, direction, timestamp
+                    data = FramePushed(
+                        source=self,
+                        destination=self._prev,
+                        frame=frame,
+                        direction=direction,
+                        timestamp=timestamp,
                     )
+                    await self._observer.on_push_frame(data)
                 await self._prev.queue_frame(frame, direction)
         except Exception as e:
             logger.exception(f"Uncaught exception in {self}: {e}")
diff --git a/src/pipecat/processors/frameworks/rtvi.py b/src/pipecat/processors/frameworks/rtvi.py
index 55e91d7ff..ee0cced87 100644
--- a/src/pipecat/processors/frameworks/rtvi.py
+++ b/src/pipecat/processors/frameworks/rtvi.py
@@ -55,7 +55,7 @@ from pipecat.metrics.metrics import (
     TTFBMetricsData,
     TTSUsageMetricsData,
 )
-from pipecat.observers.base_observer import BaseObserver
+from pipecat.observers.base_observer import BaseObserver, FramePushed
 from pipecat.processors.aggregators.openai_llm_context import (
     OpenAILLMContext,
     OpenAILLMContextFrame,
@@ -445,14 +445,7 @@ class RTVIObserver(BaseObserver):
         self._frames_seen = set()
         rtvi.set_errors_enabled(self._params.errors_enabled)
 
-    async def on_push_frame(
-        self,
-        src: FrameProcessor,
-        dst: FrameProcessor,
-        frame: Frame,
-        direction: FrameDirection,
-        timestamp: int,
-    ):
+    async def on_push_frame(self, data: FramePushed):
         """Process a frame being pushed through the pipeline.
 
         Args:
@@ -462,6 +455,10 @@ class RTVIObserver(BaseObserver):
             direction: Direction of frame flow in pipeline
             timestamp: Time when frame was pushed
         """
+        src = data.source
+        frame = data.frame
+        direction = data.direction
+
         # If we have already seen this frame, let's skip it.
         if frame.id in self._frames_seen:
             return
diff --git a/src/pipecat/services/google/rtvi.py b/src/pipecat/services/google/rtvi.py
index 88e67e6c6..cd60f6f1f 100644
--- a/src/pipecat/services/google/rtvi.py
+++ b/src/pipecat/services/google/rtvi.py
@@ -9,8 +9,9 @@ from typing import List, Literal, Optional
 from pydantic import BaseModel
 
 from pipecat.frames.frames import Frame
+from pipecat.observers.base_observer import FramePushed
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
-from pipecat.processors.frameworks.rtvi import RTVIObserver
+from pipecat.processors.frameworks.rtvi import RTVIObserver, RTVIProcessor
 from pipecat.services.google.frames import LLMSearchOrigin, LLMSearchResponseFrame
 
 
@@ -27,18 +28,13 @@ class RTVIBotLLMSearchResponseMessage(BaseModel):
 
 
 class GoogleRTVIObserver(RTVIObserver):
-    def __init__(self, rtvi: FrameProcessor):
+    def __init__(self, rtvi: RTVIProcessor):
         super().__init__(rtvi)
 
-    async def on_push_frame(
-        self,
-        src: FrameProcessor,
-        dst: FrameProcessor,
-        frame: Frame,
-        direction: FrameDirection,
-        timestamp: int,
-    ):
-        await super().on_push_frame(src, dst, frame, direction, timestamp)
+    async def on_push_frame(self, data: FramePushed):
+        await super().on_push_frame(data)
+
+        frame = data.frame
 
         if isinstance(frame, LLMSearchResponseFrame):
             await self._handle_llm_search_response_frame(frame)
diff --git a/src/pipecat/tests/utils.py b/src/pipecat/tests/utils.py
index e2368ba09..b5dfc5de1 100644
--- a/src/pipecat/tests/utils.py
+++ b/src/pipecat/tests/utils.py
@@ -15,7 +15,7 @@ from pipecat.frames.frames import (
     StartFrame,
     SystemFrame,
 )
-from pipecat.observers.base_observer import BaseObserver
+from pipecat.observers.base_observer import BaseObserver, FramePushed
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -42,14 +42,10 @@ class HeartbeatsObserver(BaseObserver):
         self._target = target
         self._callback = heartbeat_callback
 
-    async def on_push_frame(
-        self,
-        src: FrameProcessor,
-        dst: FrameProcessor,
-        frame: Frame,
-        direction: FrameDirection,
-        timestamp: int,
-    ):
+    async def on_push_frame(self, data: FramePushed):
+        src = data.source
+        frame = data.frame
+
         if src == self._target and isinstance(frame, HeartbeatFrame):
             await self._callback(self._target, frame)
 

From d69fa5dba507ade3146828d2082d8e329ec30b42 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= <aleix@daily.co>
Date: Mon, 5 May 2025 11:51:08 -0700
Subject: [PATCH 09/97] update CHANGELOG with UltravoxSTTService fix

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3ae2db0ae..1a6274667 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -28,6 +28,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+- Fixed a `UltravoxSTTService` issue that would cause the service to generate
+  all tokens as one word.
+
 - Fixed a `PipelineTask` issue that would cause tasks to not be cancelled if
   task was cancelled from outside of Pipecat.
 

From a1d46cb26bbb489d9ddcc414c59dced7fe6983b8 Mon Sep 17 00:00:00 2001
From: Mark Backman <mark@daily.co>
Date: Tue, 6 May 2025 21:23:23 -0400
Subject: [PATCH 10/97] Removing CanonicalMetricsService

---
 CHANGELOG.md                                |   4 +
 README.md                                   |   2 +-
 docs/api/requirements.txt                   |   1 -
 examples/canonical-metrics/.gitignore       | 161 --------------
 examples/canonical-metrics/Dockerfile       |  10 -
 examples/canonical-metrics/README.md        |  66 ------
 examples/canonical-metrics/bot.py           | 146 -------------
 examples/canonical-metrics/env.example      |   6 -
 examples/canonical-metrics/requirements.txt |   5 -
 examples/canonical-metrics/runner.py        |  55 -----
 examples/canonical-metrics/server.py        | 139 ------------
 pyproject.toml                              |   1 -
 src/pipecat/services/canonical/__init__.py  |  13 --
 src/pipecat/services/canonical/metrics.py   | 230 --------------------
 14 files changed, 5 insertions(+), 834 deletions(-)
 delete mode 100644 examples/canonical-metrics/.gitignore
 delete mode 100644 examples/canonical-metrics/Dockerfile
 delete mode 100644 examples/canonical-metrics/README.md
 delete mode 100644 examples/canonical-metrics/bot.py
 delete mode 100644 examples/canonical-metrics/env.example
 delete mode 100644 examples/canonical-metrics/requirements.txt
 delete mode 100644 examples/canonical-metrics/runner.py
 delete mode 100644 examples/canonical-metrics/server.py
 delete mode 100644 src/pipecat/services/canonical/__init__.py
 delete mode 100644 src/pipecat/services/canonical/metrics.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1a6274667..101cc7f58 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -41,6 +41,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Remove custom audio tracks from `DailyTransport` before leaving.
 
+### Removed
+
+- Removed `CanonicalMetricsService` as it's no longer maintained.
+
 ## [0.0.66] - 2025-05-02
 
 ### Added
diff --git a/README.md b/README.md
index ac2444f87..7f95bb664 100644
--- a/README.md
+++ b/README.md
@@ -60,7 +60,7 @@ You can connect to Pipecat from any platform using our official SDKs:
 | Memory              | [mem0](https://docs.pipecat.ai/server/services/memory/mem0)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
 | Vision & Image      | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/fal), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
 | Audio Processing    | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [Noisereduce](https://docs.pipecat.ai/server/utilities/audio/noisereduce-filter)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
-| Analytics & Metrics | [Canonical AI](https://docs.pipecat.ai/server/services/analytics/canonical), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
+| Analytics & Metrics | [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
 
 📚 [View full services documentation →](https://docs.pipecat.ai/server/services/supported-services)
 
diff --git a/docs/api/requirements.txt b/docs/api/requirements.txt
index 9badccd8f..a77ff1084 100644
--- a/docs/api/requirements.txt
+++ b/docs/api/requirements.txt
@@ -10,7 +10,6 @@ pipecat-ai[anthropic]
 pipecat-ai[assemblyai]
 pipecat-ai[aws]
 pipecat-ai[azure]
-pipecat-ai[canonical]
 pipecat-ai[cartesia]
 pipecat-ai[cerebras]
 pipecat-ai[deepseek]
diff --git a/examples/canonical-metrics/.gitignore b/examples/canonical-metrics/.gitignore
deleted file mode 100644
index 50d9d205e..000000000
--- a/examples/canonical-metrics/.gitignore
+++ /dev/null
@@ -1,161 +0,0 @@
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-recordings/
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-
-# PyInstaller
-#  Usually these files are written by a python script from a template
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-cover/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-.pybuilder/
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
-# pyenv
-#   For a library or package, you might want to ignore these files since the code is
-#   intended to run in multiple environments; otherwise, check them in:
-# .python-version
-
-# pipenv
-#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-#   However, in case of collaboration, if having platform-specific dependencies or dependencies
-#   having no cross-platform support, pipenv may install dependencies that don't work, or not
-#   install all needed dependencies.
-#Pipfile.lock
-
-# poetry
-#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-#   This is especially recommended for binary packages to ensure reproducibility, and is more
-#   commonly ignored for libraries.
-#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-#poetry.lock
-
-# pdm
-#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-#pdm.lock
-#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
-#   in version control.
-#   https://pdm.fming.dev/#use-with-ide
-.pdm.toml
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
-
-# pytype static type analyzer
-.pytype/
-
-# Cython debug symbols
-cython_debug/
-
-# PyCharm
-#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-#  and can be added to the global gitignore or merged into this file.  For a more nuclear
-#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
-runpod.toml
diff --git a/examples/canonical-metrics/Dockerfile b/examples/canonical-metrics/Dockerfile
deleted file mode 100644
index a5b4668c6..000000000
--- a/examples/canonical-metrics/Dockerfile
+++ /dev/null
@@ -1,10 +0,0 @@
-FROM python:3.10-bullseye
-RUN mkdir /app
-COPY *.py /app/
-COPY requirements.txt /app/
-WORKDIR /app
-RUN pip3 install -r requirements.txt
-
-EXPOSE 7860
-
-CMD ["python3", "server.py"]
diff --git a/examples/canonical-metrics/README.md b/examples/canonical-metrics/README.md
deleted file mode 100644
index 068655d2b..000000000
--- a/examples/canonical-metrics/README.md
+++ /dev/null
@@ -1,66 +0,0 @@
-# Chatbot with canonical-metrics
-
-This project implements a chatbot using a pipeline architecture that integrates audio processing, transcription, and a language model for conversational interactions. The chatbot operates within a daily communication environment, utilizing various services for text-to-speech and language model responses.
-
-## Features
-
-- **Audio Input and Output**: Captures microphone input and plays back audio responses.
-- **Voice Activity Detection**: Utilizes Silero VAD to manage audio input intelligently.
-- **Text-to-Speech**: Integrates ElevenLabs TTS service to convert text responses into audio.
-- **Language Model Interaction**: Uses OpenAI's GPT-4 model to generate responses based on user input.
-- **Transcription Services**: Captures and transcribes participant speech for analytics.
-- **Metrics Collection**: Sends audio data for analysis via Canonical Metrics Service.
-
-## Requirements
-
-- Python 3.10+
-- `python-dotenv`
-- Additional libraries from the `pipecat` package.
-
-## Setup
-
-1. Clone the repository.
-2. Install the required packages.
-3. Set up environment variables for API keys:
-   - `OPENAI_API_KEY`
-   - `ELEVENLABS_API_KEY`
-   - `CANONICAL_API_KEY`
-   - `CANONICAL_API_URL`
-4. Run the script.
-
-## Usage
-
-The chatbot introduces itself and engages in conversations, providing brief and creative responses. Designed for flexibility, it can support multiple languages with appropriate configuration.
-
-## Events
-
-- Participants joining or leaving the call are handled dynamically, adjusting the chatbot's behavior accordingly.
-
-
-ℹ️ The first time, things might take extra time to get started since VAD (Voice Activity Detection) model needs to be downloaded.
-
-## Get started
-
-```python
-python3 -m venv venv
-source venv/bin/activate
-pip install -r requirements.txt
-
-cp env.example .env # and add your credentials
-
-```
-
-## Run the server
-
-```bash
-python server.py
-```
-
-Then, visit `http://localhost:7860/` in your browser to start a chatbot session.
-
-## Build and test the Docker image
-
-```
-docker build -t chatbot .
-docker run --env-file .env -p 7860:7860 chatbot
-```
diff --git a/examples/canonical-metrics/bot.py b/examples/canonical-metrics/bot.py
deleted file mode 100644
index 871d0542d..000000000
--- a/examples/canonical-metrics/bot.py
+++ /dev/null
@@ -1,146 +0,0 @@
-#
-# Copyright (c) 2024–2025, Daily
-#
-# SPDX-License-Identifier: BSD 2-Clause License
-#
-
-import asyncio
-import os
-import sys
-import uuid
-
-import aiohttp
-from dotenv import load_dotenv
-from loguru import logger
-from runner import configure
-
-from pipecat.audio.vad.silero import SileroVADAnalyzer
-from pipecat.frames.frames import EndFrame
-from pipecat.pipeline.pipeline import Pipeline
-from pipecat.pipeline.runner import PipelineRunner
-from pipecat.pipeline.task import PipelineParams, PipelineTask
-from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
-from pipecat.processors.audio.audio_buffer_processor import AudioBufferProcessor
-from pipecat.services.canonical.metrics import CanonicalMetricsService
-from pipecat.services.elevenlabs.tts import ElevenLabsTTSService
-from pipecat.services.openai.llm import OpenAILLMService
-from pipecat.transports.services.daily import DailyParams, DailyTransport
-
-load_dotenv(override=True)
-
-logger.remove(0)
-logger.add(sys.stderr, level="DEBUG")
-
-
-async def main():
-    async with aiohttp.ClientSession() as session:
-        (room_url, token) = await configure(session)
-
-        transport = DailyTransport(
-            room_url,
-            token,
-            "Chatbot",
-            DailyParams(
-                audio_out_enabled=True,
-                audio_in_enabled=True,
-                video_out_enabled=False,
-                vad_analyzer=SileroVADAnalyzer(),
-                transcription_enabled=True,
-                #
-                # Spanish
-                #
-                # transcription_settings=DailyTranscriptionSettings(
-                #     language="es",
-                #     tier="nova",
-                #     model="2-general"
-                # )
-            ),
-        )
-
-        tts = ElevenLabsTTSService(
-            api_key=os.getenv("ELEVENLABS_API_KEY"),
-            #
-            # English
-            #
-            voice_id="cgSgspJ2msm6clMCkdW9",
-            #
-            # Spanish
-            #
-            # model="eleven_multilingual_v2",
-            # voice_id="gD1IexrzCvsXPHUuT0s3",
-        )
-
-        llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
-
-        messages = [
-            {
-                "role": "system",
-                #
-                # English
-                #
-                "content": "You are Chatbot, a friendly, helpful robot. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way, but keep your responses brief. Start by introducing yourself. Keep all your responses to 12 words or fewer.",
-                #
-                # Spanish
-                #
-                # "content": "Eres Chatbot, un amigable y útil robot. Tu objetivo es demostrar tus capacidades de una manera breve. Tus respuestas se convertiran a audio así que nunca no debes incluir caracteres especiales. Contesta a lo que el usuario pregunte de una manera creativa, útil y breve. Empieza por presentarte a ti mismo.",
-            },
-        ]
-
-        context = OpenAILLMContext(messages)
-        context_aggregator = llm.create_context_aggregator(context)
-
-        """
-        CanonicalMetrics uses AudioBufferProcessor under the hood to buffer the audio. On
-        call completion, CanonicalMetrics will send the audio buffer to Canonical for
-        analysis. Visit https://voice.canonical.chat to learn more.
-        """
-        audio_buffer_processor = AudioBufferProcessor(num_channels=2)
-        canonical = CanonicalMetricsService(
-            audio_buffer_processor=audio_buffer_processor,
-            aiohttp_session=session,
-            api_key=os.getenv("CANONICAL_API_KEY"),
-            call_id=str(uuid.uuid4()),
-            assistant="pipecat-chatbot",
-            assistant_speaks_first=True,
-            context=context,
-        )
-        pipeline = Pipeline(
-            [
-                transport.input(),  # microphone
-                context_aggregator.user(),
-                llm,
-                tts,
-                transport.output(),
-                canonical,  # uploads audio buffer to Canonical AI for metrics
-                audio_buffer_processor,  # captures audio into a buffer
-                context_aggregator.assistant(),
-            ]
-        )
-
-        task = PipelineTask(pipeline, params=PipelineParams(allow_interruptions=True))
-
-        @transport.event_handler("on_first_participant_joined")
-        async def on_first_participant_joined(transport, participant):
-            await audio_buffer_processor.start_recording()
-            await transport.capture_participant_transcription(participant["id"])
-            await task.queue_frames([context_aggregator.user().get_context_frame()])
-
-        @transport.event_handler("on_participant_left")
-        async def on_participant_left(transport, participant, reason):
-            print(f"Participant left: {participant}")
-            await task.cancel()
-
-        @transport.event_handler("on_call_state_updated")
-        async def on_call_state_updated(transport, state):
-            if state == "left":
-                # Here we don't want to cancel, we just want to finish sending
-                # whatever is queued, so we use an EndFrame().
-                await task.queue_frame(EndFrame())
-
-        runner = PipelineRunner()
-
-        await runner.run(task)
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
diff --git a/examples/canonical-metrics/env.example b/examples/canonical-metrics/env.example
deleted file mode 100644
index 6b865401a..000000000
--- a/examples/canonical-metrics/env.example
+++ /dev/null
@@ -1,6 +0,0 @@
-DAILY_SAMPLE_ROOM_URL=https://yourdomain.daily.co/yourroom # (for joining the bot to the same room repeatedly for local dev)
-DAILY_API_KEY=7df...
-OPENAI_API_KEY=sk-PL...
-ELEVENLABS_API_KEY=aeb...
-CANONICAL_API_KEY=can...
-CANONICAL_API_URL=
diff --git a/examples/canonical-metrics/requirements.txt b/examples/canonical-metrics/requirements.txt
deleted file mode 100644
index 7e53edc6b..000000000
--- a/examples/canonical-metrics/requirements.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-python-dotenv
-fastapi[all]
-uvicorn
-pipecat-ai[daily,openai,silero,elevenlabs,canonical]
-
diff --git a/examples/canonical-metrics/runner.py b/examples/canonical-metrics/runner.py
deleted file mode 100644
index ad39a3ac4..000000000
--- a/examples/canonical-metrics/runner.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#
-# Copyright (c) 2024–2025, Daily
-#
-# SPDX-License-Identifier: BSD 2-Clause License
-#
-
-import argparse
-import os
-
-import aiohttp
-
-from pipecat.transports.services.helpers.daily_rest import DailyRESTHelper
-
-
-async def configure(aiohttp_session: aiohttp.ClientSession):
-    parser = argparse.ArgumentParser(description="Daily AI SDK Bot Sample")
-    parser.add_argument(
-        "-u", "--url", type=str, required=False, help="URL of the Daily room to join"
-    )
-    parser.add_argument(
-        "-k",
-        "--apikey",
-        type=str,
-        required=False,
-        help="Daily API Key (needed to create an owner token for the room)",
-    )
-
-    args, unknown = parser.parse_known_args()
-
-    url = args.url or os.getenv("DAILY_SAMPLE_ROOM_URL")
-    key = args.apikey or os.getenv("DAILY_API_KEY")
-
-    if not url:
-        raise Exception(
-            "No Daily room specified. use the -u/--url option from the command line, or set DAILY_SAMPLE_ROOM_URL in your environment to specify a Daily room URL."
-        )
-
-    if not key:
-        raise Exception(
-            "No Daily API key specified. use the -k/--apikey option from the command line, or set DAILY_API_KEY in your environment to specify a Daily API key, available from https://dashboard.daily.co/developers."
-        )
-
-    daily_rest_helper = DailyRESTHelper(
-        daily_api_key=key,
-        daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"),
-        aiohttp_session=aiohttp_session,
-    )
-
-    # Create a meeting token for the given room with an expiration 1 hour in
-    # the future.
-    expiry_time: float = 60 * 60
-
-    token = await daily_rest_helper.get_token(url, expiry_time)
-
-    return (url, token)
diff --git a/examples/canonical-metrics/server.py b/examples/canonical-metrics/server.py
deleted file mode 100644
index a0f38854c..000000000
--- a/examples/canonical-metrics/server.py
+++ /dev/null
@@ -1,139 +0,0 @@
-#
-# Copyright (c) 2024–2025, Daily
-#
-# SPDX-License-Identifier: BSD 2-Clause License
-#
-
-import argparse
-import os
-import subprocess
-from contextlib import asynccontextmanager
-
-import aiohttp
-from dotenv import load_dotenv
-from fastapi import FastAPI, HTTPException, Request
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse, RedirectResponse
-
-from pipecat.transports.services.helpers.daily_rest import DailyRESTHelper, DailyRoomParams
-
-MAX_BOTS_PER_ROOM = 1
-
-# Bot sub-process dict for status reporting and concurrency control
-bot_procs = {}
-
-daily_helpers = {}
-
-load_dotenv(override=True)
-
-
-def cleanup():
-    # Clean up function, just to be extra safe
-    for entry in bot_procs.values():
-        proc = entry[0]
-        proc.terminate()
-        proc.wait()
-
-
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    aiohttp_session = aiohttp.ClientSession()
-    daily_helpers["rest"] = DailyRESTHelper(
-        daily_api_key=os.getenv("DAILY_API_KEY", ""),
-        daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"),
-        aiohttp_session=aiohttp_session,
-    )
-    yield
-    await aiohttp_session.close()
-    cleanup()
-
-
-app = FastAPI(lifespan=lifespan)
-
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-
-
-@app.get("/")
-async def start_agent(request: Request):
-    print(f"!!! Creating room")
-    room = await daily_helpers["rest"].create_room(DailyRoomParams())
-    print(f"!!! Room URL: {room.url}")
-    # Ensure the room property is present
-    if not room.url:
-        raise HTTPException(
-            status_code=500,
-            detail="Missing 'room' property in request data. Cannot start agent without a target room!",
-        )
-
-    # Check if there is already an existing process running in this room
-    num_bots_in_room = sum(
-        1 for proc in bot_procs.values() if proc[1] == room.url and proc[0].poll() is None
-    )
-    if num_bots_in_room >= MAX_BOTS_PER_ROOM:
-        raise HTTPException(status_code=500, detail=f"Max bot limited reach for room: {room.url}")
-
-    # Get the token for the room
-    token = await daily_helpers["rest"].get_token(room.url)
-
-    if not token:
-        raise HTTPException(status_code=500, detail=f"Failed to get token for room: {room.url}")
-
-    # Spawn a new agent, and join the user session
-    # Note: this is mostly for demonstration purposes (refer to 'deployment' in README)
-    try:
-        proc = subprocess.Popen(
-            [f"python3 -m bot -u {room.url} -t {token}"],
-            shell=True,
-            bufsize=1,
-            cwd=os.path.dirname(os.path.abspath(__file__)),
-        )
-        bot_procs[proc.pid] = (proc, room.url)
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Failed to start subprocess: {e}")
-
-    return RedirectResponse(room.url)
-
-
-@app.get("/status/{pid}")
-def get_status(pid: int):
-    # Look up the subprocess
-    proc = bot_procs.get(pid)
-
-    # If the subprocess doesn't exist, return an error
-    if not proc:
-        raise HTTPException(status_code=404, detail=f"Bot with process id: {pid} not found")
-
-    # Check the status of the subprocess
-    if proc[0].poll() is None:
-        status = "running"
-    else:
-        status = "finished"
-
-    return JSONResponse({"bot_id": pid, "status": status})
-
-
-if __name__ == "__main__":
-    import uvicorn
-
-    default_host = os.getenv("HOST", "0.0.0.0")
-    default_port = int(os.getenv("FAST_API_PORT", "7860"))
-
-    parser = argparse.ArgumentParser(description="Daily Storyteller FastAPI server")
-    parser.add_argument("--host", type=str, default=default_host, help="Host address")
-    parser.add_argument("--port", type=int, default=default_port, help="Port number")
-    parser.add_argument("--reload", action="store_true", help="Reload code on change")
-
-    config = parser.parse_args()
-
-    uvicorn.run(
-        "server:app",
-        host=config.host,
-        port=config.port,
-        reload=config.reload,
-    )
diff --git a/pyproject.toml b/pyproject.toml
index ecddb0902..910c8d066 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -43,7 +43,6 @@ anthropic = [ "anthropic~=0.49.0" ]
 assemblyai = [ "assemblyai~=0.37.0" ]
 aws = [ "boto3~=1.37.16" ]
 azure = [ "azure-cognitiveservices-speech~=1.42.0"]
-canonical = [ "aiofiles~=24.1.0" ]
 cartesia = [ "cartesia~=1.4.0", "websockets~=13.1" ]
 cerebras = []
 deepseek = []
diff --git a/src/pipecat/services/canonical/__init__.py b/src/pipecat/services/canonical/__init__.py
deleted file mode 100644
index f47b99c4e..000000000
--- a/src/pipecat/services/canonical/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-#
-# Copyright (c) 2024–2025, Daily
-#
-# SPDX-License-Identifier: BSD 2-Clause License
-#
-
-import sys
-
-from pipecat.services import DeprecatedModuleProxy
-
-from .metrics import *
-
-sys.modules[__name__] = DeprecatedModuleProxy(globals(), "canonical", "canonical.metrics")
diff --git a/src/pipecat/services/canonical/metrics.py b/src/pipecat/services/canonical/metrics.py
deleted file mode 100644
index 012cd4ab7..000000000
--- a/src/pipecat/services/canonical/metrics.py
+++ /dev/null
@@ -1,230 +0,0 @@
-#
-# Copyright (c) 2024–2025, Daily
-#
-# SPDX-License-Identifier: BSD 2-Clause License
-#
-
-import io
-import os
-import uuid
-import wave
-from datetime import datetime
-from typing import Dict, List, Optional, Tuple
-
-import aiohttp
-from loguru import logger
-
-from pipecat.frames.frames import CancelFrame, EndFrame, Frame
-from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
-from pipecat.processors.audio.audio_buffer_processor import AudioBufferProcessor
-from pipecat.processors.frame_processor import FrameDirection
-from pipecat.services.ai_service import AIService
-
-try:
-    import aiofiles
-    import aiofiles.os
-except ModuleNotFoundError as e:
-    logger.error(f"Exception: {e}")
-    logger.error(
-        "In order to use Canonical Metrics, you need to `pip install pipecat-ai[canonical]`. "
-        + "Also, set the `CANONICAL_API_KEY` environment variable."
-    )
-    raise Exception(f"Missing module: {e}")
-
-
-# Multipart upload part size in bytes, cannot be smaller than 5MB
-PART_SIZE = 1024 * 1024 * 5
-
-
-class CanonicalMetricsService(AIService):
-    """Initialize a CanonicalAudioProcessor instance.
-
-    This class uses an AudioBufferProcessor to get the conversation audio and
-    uploads it to Canonical Voice API for audio processing.
-
-    Args:
-        call_id (str): Your unique identifier for the call. This is used to match the call in the Canonical Voice system to the call in your system.
-        assistant (str): Identifier for the AI assistant. This can be whatever you want, it's intended for you convenience so you can distinguish
-        between different assistants and a grouping mechanism for calls.
-        assistant_speaks_first (bool, optional): Indicates if the assistant speaks first in the conversation. Defaults to True.
-        output_dir (str, optional): Directory to save temporary audio files. Defaults to "recordings".
-
-    Attributes:
-        call_id (str): Stores the unique call identifier.
-        assistant (str): Stores the assistant identifier.
-        assistant_speaks_first (bool): Indicates whether the assistant speaks first.
-        output_dir (str): Directory path for saving temporary audio files.
-
-    The constructor also ensures that the output directory exists.
-    """
-
-    def __init__(
-        self,
-        *,
-        aiohttp_session: aiohttp.ClientSession,
-        call_id: str,
-        assistant: str,
-        api_key: str,
-        api_url: str = "https://voiceapp.canonical.chat/api/v1",
-        assistant_speaks_first: bool = True,
-        output_dir: str = "recordings",
-        audio_buffer_processor: Optional[AudioBufferProcessor] = None,
-        context: Optional[OpenAILLMContext] = None,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-        # Validate that at least one of audio_buffer_processor or context is provided
-        if audio_buffer_processor is None and context is None:
-            raise ValueError("At least one of audio_buffer_processor or context must be specified")
-
-        self._aiohttp_session = aiohttp_session
-        self._audio_buffer_processor = audio_buffer_processor
-        self._api_key = api_key
-        self._api_url = api_url
-        self._call_id = call_id
-        self._assistant = assistant
-        self._assistant_speaks_first = assistant_speaks_first
-        self._output_dir = output_dir
-        self._context = context
-
-    async def stop(self, frame: EndFrame):
-        await super().stop(frame)
-        await self._process_completion()
-
-    async def cancel(self, frame: CancelFrame):
-        await super().cancel(frame)
-        await self._process_completion()
-
-    async def process_frame(self, frame: Frame, direction: FrameDirection):
-        await super().process_frame(frame, direction)
-        await self.push_frame(frame, direction)
-
-    async def _process_completion(self):
-        if self._audio_buffer_processor is not None:
-            await self._process_audio()
-        elif self._context is not None:
-            await self._process_transcript()
-
-    async def _process_transcript(self):
-        params = {
-            "callId": self._call_id,
-            "assistant": {"id": self._assistant, "speaksFirst": self._assistant_speaks_first},
-            "transcript": self._context.messages,
-        }
-        response = await self._aiohttp_session.post(
-            f"{self._api_url}/call",
-            headers=self._request_headers(),
-            json=params,
-        )
-        if not response.ok:
-            logger.error(f"Failed to process transcript: {await response.text()}")
-
-    async def _process_audio(self):
-        audio_buffer_processor = self._audio_buffer_processor
-
-        if not audio_buffer_processor.has_audio():
-            return
-
-        os.makedirs(self._output_dir, exist_ok=True)
-        filename = self._get_output_filename()
-        audio = audio_buffer_processor.merge_audio_buffers()
-
-        with io.BytesIO() as buffer:
-            with wave.open(buffer, "wb") as wf:
-                wf.setsampwidth(2)
-                wf.setnchannels(audio_buffer_processor.num_channels)
-                wf.setframerate(audio_buffer_processor.sample_rate)
-                wf.writeframes(audio)
-            async with aiofiles.open(filename, "wb") as file:
-                await file.write(buffer.getvalue())
-
-        try:
-            await self._multipart_upload(filename)
-            await aiofiles.os.remove(filename)
-        except FileNotFoundError:
-            pass
-        except Exception as e:
-            logger.error(f"Failed to upload recording: {e}")
-
-    def _get_output_filename(self):
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        return f"{self._output_dir}/{timestamp}-{uuid.uuid4().hex}.wav"
-
-    def _request_headers(self):
-        return {"Content-Type": "application/json", "X-Canonical-Api-Key": self._api_key}
-
-    async def _multipart_upload(self, file_path: str):
-        upload_request, upload_response = await self._request_upload(file_path)
-        if upload_request is None or upload_response is None:
-            return
-        parts = await self._upload_parts(file_path, upload_response)
-        if parts is None:
-            return
-        await self._upload_complete(parts, upload_request, upload_response)
-
-    async def _request_upload(self, file_path: str) -> Tuple[Dict, Dict]:
-        filename = os.path.basename(file_path)
-        filesize = os.path.getsize(file_path)
-        numparts = int((filesize + PART_SIZE - 1) / PART_SIZE)
-
-        params = {
-            "filename": filename,
-            "parts": numparts,
-            "callId": self._call_id,
-            "assistant": {"id": self._assistant, "speaksFirst": self._assistant_speaks_first},
-        }
-        logger.debug(f"Requesting presigned URLs for {numparts} parts")
-        response = await self._aiohttp_session.post(
-            f"{self._api_url}/recording/uploadRequest", headers=self._request_headers(), json=params
-        )
-        if not response.ok:
-            logger.error(f"Failed to get presigned URLs: {await response.text()}")
-            return None, None
-        response_json = await response.json()
-        return params, response_json
-
-    async def _upload_parts(self, file_path: str, upload_response: Dict) -> List[Dict]:
-        urls = upload_response["urls"]
-        parts = []
-        try:
-            async with aiofiles.open(file_path, "rb") as file:
-                for partnum, upload_url in enumerate(urls, start=1):
-                    data = await file.read(PART_SIZE)
-                    if not data:
-                        break
-
-                    response = await self._aiohttp_session.put(upload_url, data=data)
-                    if not response.ok:
-                        logger.error(f"Failed to upload part {partnum}: {await response.text()}")
-                        return None
-
-                    etag = response.headers["ETag"]
-                    parts.append({"partnum": str(partnum), "etag": etag})
-
-        except Exception as e:
-            logger.error(f"Multipart upload aborted, an error occurred: {str(e)}")
-        return parts
-
-    async def _upload_complete(
-        self, parts: List[Dict], upload_request: Dict, upload_response: Dict
-    ):
-        params = {
-            "filename": upload_request["filename"],
-            "parts": parts,
-            "slug": upload_response["slug"],
-            "callId": self._call_id,
-            "assistant": {"id": self._assistant, "speaksFirst": self._assistant_speaks_first},
-        }
-        if self._context is not None:
-            params["transcript"] = self._context.messages
-
-        logger.debug(f"Completing upload for {params['filename']}")
-        logger.debug(f"Slug: {params['slug']}")
-        response = await self._aiohttp_session.post(
-            f"{self._api_url}/recording/uploadComplete",
-            headers=self._request_headers(),
-            json=params,
-        )
-        if not response.ok:
-            logger.error(f"Failed to complete upload: {await response.text()}")
-            return

From a4447019293e5e2fe11eade1ac492605aecf9782 Mon Sep 17 00:00:00 2001
From: Mark Backman <mark@daily.co>
Date: Wed, 7 May 2025 09:02:08 -0400
Subject: [PATCH 11/97] Update README with Riva services

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 7f95bb664..47be9b6e1 100644
--- a/README.md
+++ b/README.md
@@ -51,9 +51,9 @@ You can connect to Pipecat from any platform using our official SDKs:
 
 | Category            | Services                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
 | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| Speech-to-Text      | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [Parakeet (NVIDIA)](https://docs.pipecat.ai/server/services/stt/parakeet), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper)                                                                                                                                                                                                                                            |
+| Speech-to-Text      | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper)                                                                                                                                                                                                                                                      |
 | LLMs                | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
-| Text-to-Speech      | [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [FastPitch (NVIDIA)](https://docs.pipecat.ai/server/services/tts/fastpitch), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts)                       |
+| Text-to-Speech      | [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts)                                   |
 | Speech-to-Speech    | [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
 | Transport           | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
 | Video               | [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |

From 5e39c0cfeb8f8b162c0c27443a1a3417ccc5a338 Mon Sep 17 00:00:00 2001
From: Dan Berg <dan@webinargeek.com>
Date: Wed, 7 May 2025 14:30:39 +0200
Subject: [PATCH 12/97] DailyTransport: added on_active_speaker_changed event
 handler

---
 CHANGELOG.md                             |  2 ++
 src/pipecat/transports/services/daily.py | 10 ++++++++++
 2 files changed, 12 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 101cc7f58..3e330b9c9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Added `on_active_speaker_changed` event handler to the `DailyTransport` class.
+
 - Added `enable_ssml_parsing` and `enable_logging` to `InputParams` in
   `ElevenLabsTTSService`.
 
diff --git a/src/pipecat/transports/services/daily.py b/src/pipecat/transports/services/daily.py
index 9909d9336..5d00e76bc 100644
--- a/src/pipecat/transports/services/daily.py
+++ b/src/pipecat/transports/services/daily.py
@@ -175,6 +175,7 @@ class DailyCallbacks(BaseModel):
     """Callback handlers for Daily events.
 
     Attributes:
+        on_active_speaker_changed: Called when the active speaker of the call has changed.
         on_joined: Called when bot successfully joined a room.
         on_left: Called when bot left a room.
         on_error: Called when an error occurs.
@@ -201,6 +202,7 @@ class DailyCallbacks(BaseModel):
         on_recording_error: Called when recording encounters an error.
     """
 
+    on_active_speaker_changed: Callable[[Mapping[str, Any]], Awaitable[None]]
     on_joined: Callable[[Mapping[str, Any]], Awaitable[None]]
     on_left: Callable[[], Awaitable[None]]
     on_error: Callable[[str], Awaitable[None]]
@@ -789,6 +791,9 @@ class DailyTransportClient(EventHandler):
     # Daily (EventHandler)
     #
 
+    def on_active_speaker_changed(self, participant):
+        self._call_async_callback(self._callbacks.on_active_speaker_changed, participant)
+
     def on_app_message(self, message: Any, sender: str):
         self._call_async_callback(self._callbacks.on_app_message, message, sender)
 
@@ -1208,6 +1213,7 @@ class DailyTransport(BaseTransport):
         super().__init__(input_name=input_name, output_name=output_name)
 
         callbacks = DailyCallbacks(
+            on_active_speaker_changed=self._on_active_speaker_changed,
             on_joined=self._on_joined,
             on_left=self._on_left,
             on_error=self._on_error,
@@ -1243,6 +1249,7 @@ class DailyTransport(BaseTransport):
 
         # Register supported handlers. The user will only be able to register
         # these handlers.
+        self._register_event_handler("on_active_speaker_changed")
         self._register_event_handler("on_joined")
         self._register_event_handler("on_left")
         self._register_event_handler("on_error")
@@ -1377,6 +1384,9 @@ class DailyTransport(BaseTransport):
     async def update_remote_participants(self, remote_participants: Mapping[str, Any]):
         await self._client.update_remote_participants(remote_participants=remote_participants)
 
+    async def _on_active_speaker_changed(self, participant: Any):
+        await self._call_event_handler("on_active_speaker_changed", participant)
+
     async def _on_joined(self, data):
         await self._call_event_handler("on_joined", data)
 

From 5b66133a6cf27d9087e14897d7e2eced3e852cbe Mon Sep 17 00:00:00 2001
From: mattie ruth backman <mattieruth@gmail.com>
Date: Wed, 7 May 2025 12:08:28 -0400
Subject: [PATCH 13/97] Revert breaking change in RTVI protocol for function
 calling

---
 src/pipecat/processors/frameworks/rtvi.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pipecat/processors/frameworks/rtvi.py b/src/pipecat/processors/frameworks/rtvi.py
index ee0cced87..909dd15b7 100644
--- a/src/pipecat/processors/frameworks/rtvi.py
+++ b/src/pipecat/processors/frameworks/rtvi.py
@@ -254,7 +254,7 @@ class RTVIBotReady(BaseModel):
 class RTVILLMFunctionCallMessageData(BaseModel):
     function_name: str
     tool_call_id: str
-    arguments: Mapping[str, Any]
+    args: Mapping[str, Any]
 
 
 class RTVILLMFunctionCallMessage(BaseModel):
@@ -700,7 +700,7 @@ class RTVIProcessor(FrameProcessor):
         fn = RTVILLMFunctionCallMessageData(
             function_name=params.function_name,
             tool_call_id=params.tool_call_id,
-            arguments=params.arguments,
+            args=params.arguments,
         )
         message = RTVILLMFunctionCallMessage(data=fn)
         await self._push_transport_message(message, exclude_none=False)

From 2b18f60261adfb6252f8385ade3614f26f926714 Mon Sep 17 00:00:00 2001
From: Tico Ballagas <tb@sidekickwellness.com>
Date: Sat, 8 Feb 2025 18:46:59 -0800
Subject: [PATCH 14/97] Initial implementation of AWS Transcribe STT

---
 .../foundational/07m-interruptible-polly.py   |  13 +-
 src/pipecat/services/aws/tts.py               | 624 +++++++++++++++++-
 2 files changed, 626 insertions(+), 11 deletions(-)

diff --git a/examples/foundational/07m-interruptible-polly.py b/examples/foundational/07m-interruptible-polly.py
index 286fe5128..63349360e 100644
--- a/examples/foundational/07m-interruptible-polly.py
+++ b/examples/foundational/07m-interruptible-polly.py
@@ -15,9 +15,9 @@ from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
-from pipecat.services.aws.tts import PollyTTSService
-from pipecat.services.deepgram.stt import DeepgramSTTService
-from pipecat.services.openai.llm import OpenAILLMService
+from pipecat.services.aws import PollyTTSService, TranscribeSTTService
+from pipecat.services.openai import OpenAILLMService
+from pipecat.transcriptions.language import Language
 from pipecat.transports.base_transport import TransportParams
 from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
 from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
@@ -37,14 +37,11 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac
         ),
     )
 
-    stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
+    stt = TranscribeSTTService()
 
     tts = PollyTTSService(
-        api_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
-        aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
-        region=os.getenv("AWS_REGION"),
         voice_id="Amy",
-        params=PollyTTSService.InputParams(engine="neural", language="en-GB", rate="1.05"),
+        params=PollyTTSService.InputParams(engine="standard", language=Language.EN_GB, rate="1.05"),
     )
 
     llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
diff --git a/src/pipecat/services/aws/tts.py b/src/pipecat/services/aws/tts.py
index db6e168ab..e90ea9220 100644
--- a/src/pipecat/services/aws/tts.py
+++ b/src/pipecat/services/aws/tts.py
@@ -5,7 +5,21 @@
 #
 
 import asyncio
-from typing import AsyncGenerator, Optional
+from typing import AsyncGenerator, Optional, Dict
+import os
+import datetime
+import time
+from urllib.parse import urlencode
+import json
+import struct
+from io import BytesIO
+import urllib.parse
+import hashlib
+import hmac
+import random
+import string
+import binascii
+import numpy as np
 
 from loguru import logger
 from pydantic import BaseModel
@@ -17,17 +31,27 @@ from pipecat.frames.frames import (
     TTSAudioRawFrame,
     TTSStartedFrame,
     TTSStoppedFrame,
+    TranscriptionFrame,
+    InterimTranscriptionFrame,
+    StartFrame,
+    EndFrame,
+    CancelFrame,
 )
-from pipecat.services.tts_service import TTSService
+from pipecat.services.ai_services import TTSService, STTService
 from pipecat.transcriptions.language import Language
+from pipecat.utils.time import time_now_iso8601
 
 try:
     import boto3
     from botocore.exceptions import BotoCoreError, ClientError
+    import websockets
+    from botocore.auth import SigV4Auth
+    from botocore.awsrequest import AWSRequest
+    from botocore.credentials import Credentials
 except ModuleNotFoundError as e:
     logger.error(f"Exception: {e}")
     logger.error(
-        "In order to use Deepgram, you need to `pip install pipecat-ai[aws]`. Also, set `AWS_SECRET_ACCESS_KEY`, `AWS_ACCESS_KEY_ID`, and `AWS_REGION` environment variable."
+        "In order to use AWS services, you need to `pip install pipecat-ai[aws]`. Also, set `AWS_SECRET_ACCESS_KEY`, `AWS_ACCESS_KEY_ID`, and `AWS_REGION` environment variable."
     )
     raise Exception(f"Missing module: {e}")
 
@@ -151,6 +175,24 @@ class PollyTTSService(TTSService):
 
         self.set_voice(voice_id)
 
+        # Get credentials from environment variables if not provided
+        self._credentials = {
+            "aws_access_key_id": aws_access_key_id or os.getenv("AWS_ACCESS_KEY_ID"),
+            "aws_secret_access_key": api_key or os.getenv("AWS_SECRET_ACCESS_KEY"),
+            "aws_session_token": aws_session_token or os.getenv("AWS_SESSION_TOKEN"),
+            "region": region or os.getenv("AWS_REGION", "us-east-1"),
+        }
+
+        # Validate that we have the required credentials
+        if (
+            not self._credentials["aws_access_key_id"]
+            or not self._credentials["aws_secret_access_key"]
+        ):
+            raise ValueError(
+                "AWS credentials not found. Please provide them either through constructor parameters "
+                "or set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables."
+            )
+
     def can_generate_metrics(self) -> bool:
         return True
 
@@ -248,3 +290,579 @@ class PollyTTSService(TTSService):
 
         finally:
             yield TTSStoppedFrame()
+
+
+class AWSTTSService(PollyTTSService):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        import warnings
+
+        with warnings.catch_warnings():
+            warnings.simplefilter("always")
+            warnings.warn(
+                "'AWSTTSService' is deprecated, use 'PollyTTSService' instead.", DeprecationWarning
+            )
+
+
+def get_presigned_url(
+    *,
+    region: str,
+    credentials: Dict[str, Optional[str]],
+    language_code: str,
+    media_encoding: str = "pcm",
+    sample_rate: int = 16000,
+    number_of_channels: int = 1,
+    enable_partial_results_stabilization: bool = True,
+    partial_results_stability: str = "high",
+    vocabulary_name: Optional[str] = None,
+    vocabulary_filter_name: Optional[str] = None,
+    show_speaker_label: bool = False,
+    enable_channel_identification: bool = False,
+) -> str:
+    """Create a presigned URL for AWS Transcribe streaming."""
+    access_key = credentials.get("access_key")
+    secret_key = credentials.get("secret_key")
+    session_token = credentials.get("session_token")
+
+    if not access_key or not secret_key:
+        raise ValueError("AWS credentials are required")
+
+    # Initialize the URL generator
+    url_generator = AWSTranscribePresignedURL(
+        access_key=access_key, secret_key=secret_key, session_token=session_token, region=region
+    )
+
+    # Get the presigned URL
+    return url_generator.get_request_url(
+        sample_rate=sample_rate,
+        language_code=language_code,
+        media_encoding=media_encoding,
+        vocabulary_name=vocabulary_name,
+        vocabulary_filter_name=vocabulary_filter_name,
+        show_speaker_label=show_speaker_label,
+        enable_channel_identification=enable_channel_identification,
+        number_of_channels=number_of_channels,
+        enable_partial_results_stabilization=enable_partial_results_stabilization,
+        partial_results_stability=partial_results_stability,
+    )
+
+
+class AWSTranscribePresignedURL:
+    def __init__(
+        self, access_key: str, secret_key: str, session_token: str, region: str = "us-east-1"
+    ):
+        self.access_key = access_key
+        self.secret_key = secret_key
+        self.session_token = session_token
+        self.method = "GET"
+        self.service = "transcribe"
+        self.region = region
+        self.endpoint = ""
+        self.host = ""
+        self.amz_date = ""
+        self.datestamp = ""
+        self.canonical_uri = "/stream-transcription-websocket"
+        self.canonical_headers = ""
+        self.signed_headers = "host"
+        self.algorithm = "AWS4-HMAC-SHA256"
+        self.credential_scope = ""
+        self.canonical_querystring = ""
+        self.payload_hash = ""
+        self.canonical_request = ""
+        self.string_to_sign = ""
+        self.signature = ""
+        self.request_url = ""
+
+    def get_request_url(
+        self,
+        sample_rate: int,
+        language_code: str = "",
+        media_encoding: str = "pcm",
+        vocabulary_name: str = "",
+        vocabulary_filter_name: str = "",
+        show_speaker_label: bool = False,
+        enable_channel_identification: bool = False,
+        number_of_channels: int = 1,
+        enable_partial_results_stabilization: bool = False,
+        partial_results_stability: str = "",
+    ) -> str:
+        self.endpoint = f"wss://transcribestreaming.{self.region}.amazonaws.com:8443"
+        self.host = f"transcribestreaming.{self.region}.amazonaws.com:8443"
+
+        now = datetime.datetime.utcnow()
+        self.amz_date = now.strftime("%Y%m%dT%H%M%SZ")
+        self.datestamp = now.strftime("%Y%m%d")
+        self.canonical_headers = f"host:{self.host}\n"
+        self.credential_scope = f"{self.datestamp}%2F{self.region}%2F{self.service}%2Faws4_request"
+
+        # Create canonical querystring
+        self.canonical_querystring = "X-Amz-Algorithm=" + self.algorithm
+        self.canonical_querystring += (
+            "&X-Amz-Credential=" + self.access_key + "%2F" + self.credential_scope
+        )
+        self.canonical_querystring += "&X-Amz-Date=" + self.amz_date
+        self.canonical_querystring += "&X-Amz-Expires=300"
+        if self.session_token:
+            self.canonical_querystring += "&X-Amz-Security-Token=" + urllib.parse.quote(
+                self.session_token, safe=""
+            )
+        self.canonical_querystring += "&X-Amz-SignedHeaders=" + self.signed_headers
+
+        if enable_channel_identification:
+            self.canonical_querystring += "&enable-channel-identification=true"
+        if enable_partial_results_stabilization:
+            self.canonical_querystring += "&enable-partial-results-stabilization=true"
+        if language_code:
+            self.canonical_querystring += "&language-code=" + language_code
+        if media_encoding:
+            self.canonical_querystring += "&media-encoding=" + media_encoding
+        if number_of_channels > 1:
+            self.canonical_querystring += "&number-of-channels=" + str(number_of_channels)
+        if partial_results_stability:
+            self.canonical_querystring += "&partial-results-stability=" + partial_results_stability
+        if sample_rate:
+            self.canonical_querystring += "&sample-rate=" + str(sample_rate)
+        if show_speaker_label:
+            self.canonical_querystring += "&show-speaker-label=true"
+        if vocabulary_filter_name:
+            self.canonical_querystring += "&vocabulary-filter-name=" + vocabulary_filter_name
+        if vocabulary_name:
+            self.canonical_querystring += "&vocabulary-name=" + vocabulary_name
+
+        # Create payload hash
+        self.payload_hash = hashlib.sha256("".encode("utf-8")).hexdigest()
+
+        # Create canonical request
+        self.canonical_request = f"{self.method}\n{self.canonical_uri}\n{self.canonical_querystring}\n{self.canonical_headers}\n{self.signed_headers}\n{self.payload_hash}"
+
+        # Create string to sign
+        credential_scope = f"{self.datestamp}/{self.region}/{self.service}/aws4_request"
+        string_to_sign = (
+            f"{self.algorithm}\n{self.amz_date}\n{credential_scope}\n"
+            + hashlib.sha256(self.canonical_request.encode("utf-8")).hexdigest()
+        )
+
+        # Calculate signature
+        k_date = hmac.new(
+            f"AWS4{self.secret_key}".encode("utf-8"), self.datestamp.encode("utf-8"), hashlib.sha256
+        ).digest()
+        k_region = hmac.new(k_date, self.region.encode("utf-8"), hashlib.sha256).digest()
+        k_service = hmac.new(k_region, self.service.encode("utf-8"), hashlib.sha256).digest()
+        k_signing = hmac.new(k_service, b"aws4_request", hashlib.sha256).digest()
+        self.signature = hmac.new(
+            k_signing, string_to_sign.encode("utf-8"), hashlib.sha256
+        ).hexdigest()
+
+        # Add signature to query string
+        self.canonical_querystring += "&X-Amz-Signature=" + self.signature
+
+        # Create request URL
+        self.request_url = self.endpoint + self.canonical_uri + "?" + self.canonical_querystring
+        return self.request_url
+
+
+def get_headers(header_name: str, header_value: str) -> bytearray:
+    """Build a header following AWS event stream format."""
+    name = header_name.encode("utf-8")
+    name_byte_length = bytes([len(name)])
+    value_type = bytes([7])  # 7 represents a string
+    value = header_value.encode("utf-8")
+    value_byte_length = struct.pack(">H", len(value))
+
+    # Construct the header
+    header_list = bytearray()
+    header_list.extend(name_byte_length)
+    header_list.extend(name)
+    header_list.extend(value_type)
+    header_list.extend(value_byte_length)
+    header_list.extend(value)
+    return header_list
+
+
+def build_event_message(payload: bytes) -> bytes:
+    """
+    Build an event message for AWS Transcribe streaming.
+    Matches AWS sample: https://github.com/aws-samples/amazon-transcribe-streaming-python-websockets/blob/main/eventstream.py
+    """
+    # Build headers
+    content_type_header = get_headers(":content-type", "application/octet-stream")
+    event_type_header = get_headers(":event-type", "AudioEvent")
+    message_type_header = get_headers(":message-type", "event")
+
+    headers = bytearray()
+    headers.extend(content_type_header)
+    headers.extend(event_type_header)
+    headers.extend(message_type_header)
+
+    # Calculate total byte length and headers byte length
+    # 16 accounts for 8 byte prelude, 2x 4 byte CRCs
+    total_byte_length = struct.pack(">I", len(headers) + len(payload) + 16)
+    headers_byte_length = struct.pack(">I", len(headers))
+
+    # Build the prelude
+    prelude = bytearray([0] * 8)
+    prelude[:4] = total_byte_length
+    prelude[4:] = headers_byte_length
+
+    # Calculate checksum for prelude
+    prelude_crc = struct.pack(">I", binascii.crc32(prelude) & 0xFFFFFFFF)
+
+    # Construct the message
+    message_as_list = bytearray()
+    message_as_list.extend(prelude)
+    message_as_list.extend(prelude_crc)
+    message_as_list.extend(headers)
+    message_as_list.extend(payload)
+
+    # Calculate checksum for message
+    message = bytes(message_as_list)
+    message_crc = struct.pack(">I", binascii.crc32(message) & 0xFFFFFFFF)
+
+    # Add message checksum
+    message_as_list.extend(message_crc)
+
+    return bytes(message_as_list)
+
+
+def decode_event(message):
+    # Extract the prelude, headers, payload and CRC
+    prelude = message[:8]
+    total_length, headers_length = struct.unpack(">II", prelude)
+    prelude_crc = struct.unpack(">I", message[8:12])[0]
+    headers = message[12 : 12 + headers_length]
+    payload = message[12 + headers_length : -4]
+    message_crc = struct.unpack(">I", message[-4:])[0]
+
+    # Check the CRCs
+    assert prelude_crc == binascii.crc32(prelude) & 0xFFFFFFFF, "Prelude CRC check failed"
+    assert message_crc == binascii.crc32(message[:-4]) & 0xFFFFFFFF, "Message CRC check failed"
+
+    # Parse the headers
+    headers_dict = {}
+    while headers:
+        name_len = headers[0]
+        name = headers[1 : 1 + name_len].decode("utf-8")
+        value_type = headers[1 + name_len]
+        value_len = struct.unpack(">H", headers[2 + name_len : 4 + name_len])[0]
+        value = headers[4 + name_len : 4 + name_len + value_len].decode("utf-8")
+        headers_dict[name] = value
+        headers = headers[4 + name_len + value_len :]
+
+    return headers_dict, json.loads(payload)
+
+
+class TranscribeSTTService(STTService):
+    def __init__(
+        self,
+        *,
+        api_key: Optional[str] = None,
+        aws_access_key_id: Optional[str] = None,
+        aws_session_token: Optional[str] = None,
+        region: Optional[str] = "us-east-1",
+        sample_rate: int = 16000,
+        language: Language = Language.EN,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self._settings = {
+            "sample_rate": sample_rate,
+            "language": language,
+            "media_encoding": "linear16",  # AWS expects raw PCM
+            "number_of_channels": 1,
+            "show_speaker_label": False,
+            "enable_channel_identification": False,
+        }
+
+        # Validate sample rate - AWS Transcribe only supports 8000 Hz or 16000 Hz
+        if sample_rate not in [8000, 16000]:
+            logger.warning(
+                f"AWS Transcribe only supports 8000 Hz or 16000 Hz sample rates. Converting from {sample_rate} Hz to 16000 Hz."
+            )
+            self._settings["sample_rate"] = 16000
+
+        self._credentials = {
+            "aws_access_key_id": aws_access_key_id or os.getenv("AWS_ACCESS_KEY_ID"),
+            "aws_secret_access_key": api_key or os.getenv("AWS_SECRET_ACCESS_KEY"),
+            "aws_session_token": aws_session_token or os.getenv("AWS_SESSION_TOKEN"),
+            "region": region or os.getenv("AWS_REGION", "us-east-1"),
+        }
+
+        self._ws_client = None
+        self._connection_lock = asyncio.Lock()
+        self._connecting = False
+        self._receive_task = None
+
+    def get_service_encoding(self, encoding: str) -> str:
+        """Translate an internal encoding name into the one AWS Transcribe expects."""
+        # Only "linear16" is renamed (to "pcm"); every other name passes through unchanged.
+        if encoding == "linear16":
+            return "pcm"
+        return encoding
+
+    async def start(self, frame: StartFrame):
+        """Connect to AWS Transcribe, trying up to three times; raises RuntimeError if all fail."""
+        await super().start(frame)
+        logger.info("Starting AWS Transcribe service...")
+        max_retries = 3
+
+        # Every attempt is counted, including a connect that returns without an open
+        # socket, so a silent failure cannot keep this loop spinning forever.
+        for attempt in range(1, max_retries + 1):
+            try:
+                await self._connect()
+                if self._ws_client and self._ws_client.open:
+                    logger.info("Successfully established WebSocket connection")
+                    return
+                logger.warning("WebSocket connection not established after connect")
+            except Exception as e:
+                logger.error(f"Failed to connect (attempt {attempt}/{max_retries}): {e}")
+            if attempt < max_retries:
+                await asyncio.sleep(1)  # Wait before retrying
+
+        raise RuntimeError("Failed to establish WebSocket connection after multiple attempts")
+
+    async def run_stt(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        """Send one audio chunk to AWS Transcribe; the only frames yielded here are ErrorFrames."""
+        try:
+            # Skip if no speech detected
+            if hasattr(frame, "is_speech") and not frame.is_speech:
+                logger.debug("Skipping non-speech frame")
+                return
+
+            # Ensure WebSocket is connected
+            if not self._ws_client or not self._ws_client.open:
+                logger.info("WebSocket not connected, attempting to reconnect...")
+                try:
+                    await self._connect()  # may return without connecting if another attempt is in progress
+                except Exception as e:
+                    logger.error(f"Failed to reconnect: {e}")
+                    yield ErrorFrame("Failed to reconnect to AWS Transcribe", fatal=False)
+                    return
+
+            # Get the audio data - if frame is bytes, use directly, otherwise get audio attribute
+            audio_data = frame if isinstance(frame, bytes) else frame.audio
+
+            # Format the audio data according to AWS event stream format
+            event_message = build_event_message(audio_data)
+            # logger.debug(f"Sending audio chunk of size {len(audio_data)} bytes")
+
+            # Send the formatted event message
+            try:
+                await self._ws_client.send(event_message)
+                # Start metrics after first chunk sent
+                await self.start_processing_metrics()  # called after every successful send, not only the first
+                await self.start_ttfb_metrics()
+            except websockets.exceptions.ConnectionClosed as e:
+                logger.warning(f"Connection closed while sending: {e}")
+                await self._disconnect()
+                # Don't yield error here - we'll retry on next frame
+            except Exception as e:
+                logger.error(f"Error sending audio: {e}")
+                yield ErrorFrame(f"AWS Transcribe error: {str(e)}", fatal=False)
+                await self._disconnect()
+
+        except Exception as e:  # catch-all for anything raised outside the send step above
+            logger.error(f"Error in run_stt: {e}")
+            yield ErrorFrame(f"AWS Transcribe error: {str(e)}", fatal=False)
+            await self._disconnect()
+
+    async def _connect(self):
+        """Open the WebSocket to AWS Transcribe and start the receive task; no-op if already connected."""
+        if (
+            self._ws_client
+            and self._ws_client.open
+            and self._receive_task
+            and not self._receive_task.done()
+        ):
+            logger.debug("Already connected")
+            return
+
+        async with self._connection_lock:  # NOTE(review): the "already connected" test above is not repeated under the lock
+            if self._connecting:
+                logger.debug("Connection already in progress")
+                return
+
+            try:
+                self._connecting = True
+                logger.debug("Starting connection process...")
+
+                if self._ws_client:  # drop any existing client (open or not) before reconnecting
+                    await self._disconnect()
+
+                language_code = self.language_to_service_language(
+                    Language(self._settings["language"])
+                )
+                if not language_code:
+                    raise ValueError(f"Unsupported language: {self._settings['language']}")
+
+                # Generate random websocket key (20 ASCII letters/digits)
+                websocket_key = "".join(
+                    random.choices(
+                        string.ascii_uppercase + string.ascii_lowercase + string.digits, k=20
+                    )
+                )
+
+                # Add required headers
+                # NOTE(review): RFC 6455 defines Sec-WebSocket-Key as base64 of a 16-byte nonce — confirm this value is accepted
+                extra_headers = {
+                    "Origin": "https://localhost",
+                    "Sec-WebSocket-Key": websocket_key,
+                    "Sec-WebSocket-Version": "13",
+                    "Connection": "keep-alive",
+                }
+
+                # Get presigned URL
+                presigned_url = get_presigned_url(
+                    region=self._credentials["region"],
+                    credentials={
+                        "access_key": self._credentials["aws_access_key_id"],
+                        "secret_key": self._credentials["aws_secret_access_key"],
+                        "session_token": self._credentials["aws_session_token"],
+                    },
+                    language_code=language_code,
+                    media_encoding=self.get_service_encoding(
+                        self._settings["media_encoding"]
+                    ),  # Convert to AWS format
+                    sample_rate=self._settings["sample_rate"],
+                    number_of_channels=self._settings["number_of_channels"],
+                    enable_partial_results_stabilization=True,
+                    partial_results_stability="high",
+                    show_speaker_label=self._settings["show_speaker_label"],
+                    enable_channel_identification=self._settings["enable_channel_identification"],
+                )
+
+                logger.debug(f"Connecting to WebSocket with URL: {presigned_url[:100]}...")  # only the first 100 characters are logged
+
+                # Connect with the required headers and settings
+                self._ws_client = await websockets.connect(
+                    presigned_url,
+                    extra_headers=extra_headers,
+                    subprotocols=["mqtt"],
+                    ping_interval=None,
+                    ping_timeout=None,
+                    compression=None,
+                )
+                logger.debug("WebSocket connected, starting receive task...")
+
+                # Start receive task
+                self._receive_task = asyncio.create_task(self._receive_loop())
+
+                logger.info("Successfully connected to AWS Transcribe")
+
+            except Exception as e:
+                logger.error(f"Failed to connect to AWS Transcribe: {e}")
+                await self._disconnect()  # leave no half-open client or receive task behind
+                raise
+
+            finally:
+                self._connecting = False
+
+    async def _disconnect(self):
+        """Cancel the receive task, then close the WebSocket; close errors are logged, not raised."""
+        if self._receive_task:
+            self._receive_task.cancel()
+            try:
+                await self._receive_task  # wait for the cancelled task to finish
+            except asyncio.CancelledError:
+                pass
+            self._receive_task = None
+
+        if self._ws_client:
+            try:
+                if self._ws_client.open:
+                    # Send end-stream message
+                    # NOTE(review): sent as JSON text, whereas audio goes through build_event_message — confirm AWS accepts it
+                    end_stream = {"message-type": "event", "event": "end"}
+                    await self._ws_client.send(json.dumps(end_stream))
+                await self._ws_client.close()
+            except Exception as e:
+                logger.warning(f"Error closing WebSocket connection: {e}")
+            finally:
+                self._ws_client = None  # always forget the client, even if closing failed
+
+    def language_to_service_language(self, language: Language) -> str | None:
+        """Return the AWS Transcribe locale code for ``language``, or None when it is not mapped."""
+        # One fixed regional variant is chosen per supported language.
+        return {
+            Language.EN: "en-US",
+            Language.ES: "es-US",
+            Language.FR: "fr-FR",
+            Language.DE: "de-DE",
+            Language.IT: "it-IT",
+            Language.PT: "pt-BR",
+            Language.JA: "ja-JP",
+            Language.KO: "ko-KR",
+            Language.ZH: "zh-CN",
+        }.get(language)
+
+    async def _receive_loop(self):
+        """Background task: read messages until the socket closes or an error occurs, pushing transcript frames."""
+        try:
+            logger.debug("Receive loop started")
+            while True:
+                if not self._ws_client or not self._ws_client.open:
+                    logger.warning("WebSocket closed in receive loop")
+                    break
+
+                try:
+                    response = await self._ws_client.recv()
+                    headers, payload = decode_event(response)
+
+                    # logger.debug(f"Received message type: {headers.get(':message-type')}")
+
+                    if headers.get(":message-type") == "event":
+                        # Process transcription results
+                        results = payload.get("Transcript", {}).get("Results", [])
+                        if results:
+                            result = results[0]  # only the first result of each event is used
+                            alternatives = result.get("Alternatives", [])
+                            if alternatives:
+                                transcript = alternatives[0].get("Transcript", "")  # first alternative only
+                                is_final = not result.get("IsPartial", True)  # a missing IsPartial counts as partial
+
+                                if transcript:
+                                    await self.stop_ttfb_metrics()
+                                    if is_final:
+                                        await self.push_frame(
+                                            TranscriptionFrame(
+                                                transcript,
+                                                "",
+                                                time_now_iso8601(),
+                                                self._settings["language"],
+                                            )
+                                        )
+                                        await self.stop_processing_metrics()
+                                    else:
+                                        await self.push_frame(
+                                            InterimTranscriptionFrame(
+                                                transcript,
+                                                "",
+                                                time_now_iso8601(),
+                                                self._settings["language"],
+                                            )
+                                        )
+                    elif headers.get(":message-type") == "exception":
+                        error_msg = payload.get("Message", "Unknown error")
+                        logger.error(f"Exception from AWS: {error_msg}")
+                        await self.push_frame(
+                            ErrorFrame(f"AWS Transcribe error: {error_msg}", fatal=False)
+                        )
+                    else:
+                        logger.debug(f"Other message type received: {headers}")
+                        logger.debug(f"Payload: {payload}")
+
+                except websockets.exceptions.ConnectionClosed as e:
+                    logger.error(
+                        f"WebSocket connection closed in receive loop with code {e.code}: {e.reason}"
+                    )
+                    break
+                except Exception as e:
+                    logger.error(f"Error in receive loop: {e}")
+                    break  # any decode or processing error also ends the loop
+
+        except asyncio.CancelledError:
+            logger.debug("Receive loop cancelled")  # the cancellation is absorbed here, not re-raised
+        except Exception as e:
+            logger.error(f"Unexpected error in receive loop: {e}")
+        finally:
+            logger.debug("Receive loop ended")

From acb7d597cb037919272778a9cfa97c8bf1bc67f7 Mon Sep 17 00:00:00 2001
From: Tico Ballagas <tb@sidekickwellness.com>
Date: Thu, 13 Feb 2025 15:00:50 -0800
Subject: [PATCH 15/97] Change example to use generative voices

---
 examples/foundational/07m-interruptible-polly.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/examples/foundational/07m-interruptible-polly.py b/examples/foundational/07m-interruptible-polly.py
index 63349360e..b3bd08061 100644
--- a/examples/foundational/07m-interruptible-polly.py
+++ b/examples/foundational/07m-interruptible-polly.py
@@ -40,8 +40,11 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac
     stt = TranscribeSTTService()
 
     tts = PollyTTSService(
-        voice_id="Amy",
-        params=PollyTTSService.InputParams(engine="standard", language=Language.EN_GB, rate="1.05"),
+        region="us-west-2",  # only specific regions support generative TTS
+        voice_id="Joanna",
+        params=PollyTTSService.InputParams(
+            engine="generative", language=Language.EN_GB, rate="1.05"
+        ),
     )
 
     llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))

From 844f61dfeac533fd1f61f436fba0b36886352394 Mon Sep 17 00:00:00 2001
From: Adithya Suresh <adxthya@amazon.com>
Date: Thu, 3 Apr 2025 08:41:34 +0000
Subject: [PATCH 16/97] Initial implementation

---
 .../adapters/services/bedrock_adapter.py      |  38 +
 src/pipecat/services/aws/llm.py               | 803 ++++++++++++++++++
 2 files changed, 841 insertions(+)
 create mode 100644 src/pipecat/adapters/services/bedrock_adapter.py
 create mode 100644 src/pipecat/services/aws/llm.py

diff --git a/src/pipecat/adapters/services/bedrock_adapter.py b/src/pipecat/adapters/services/bedrock_adapter.py
new file mode 100644
index 000000000..0aba6aba2
--- /dev/null
+++ b/src/pipecat/adapters/services/bedrock_adapter.py
@@ -0,0 +1,38 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+from typing import Any, Dict, List, Union
+
+from pipecat.adapters.base_llm_adapter import BaseLLMAdapter
+from pipecat.adapters.schemas.function_schema import FunctionSchema
+from pipecat.adapters.schemas.tools_schema import ToolsSchema
+
+
+class BedrockLLMAdapter(BaseLLMAdapter):
+    """Adapter that renders standard tool schemas as Bedrock ``toolSpec`` entries."""
+
+    @staticmethod
+    def _to_bedrock_function_format(function: FunctionSchema) -> Dict[str, Any]:
+        """Wrap a single FunctionSchema in Bedrock's ``toolSpec`` envelope."""
+        input_schema = {
+            "type": "object",
+            "properties": function.properties,
+            "required": function.required,
+        }
+        return {
+            "toolSpec": {
+                "name": function.name,
+                "description": function.description,
+                "inputSchema": {"json": input_schema},
+            }
+        }
+
+    def to_provider_tools_format(self, tools_schema: ToolsSchema) -> List[Dict[str, Any]]:
+        """Converts function schemas to Bedrock's function-calling format.
+
+        :return: Bedrock formatted function call definition.
+        """
+        return [self._to_bedrock_function_format(schema) for schema in tools_schema.standard_tools]
diff --git a/src/pipecat/services/aws/llm.py b/src/pipecat/services/aws/llm.py
new file mode 100644
index 000000000..3b476e03b
--- /dev/null
+++ b/src/pipecat/services/aws/llm.py
@@ -0,0 +1,803 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import asyncio
+import base64
+import copy
+import io
+import json
+import re
+from dataclasses import dataclass
+from typing import Any, Dict, List, Mapping, Optional, Union
+
+import boto3
+from botocore.config import Config
+import httpx
+from loguru import logger
+from PIL import Image
+from pydantic import BaseModel, Field
+
+from pipecat.adapters.services.anthropic_adapter import AnthropicLLMAdapter
+from pipecat.frames.frames import (
+    Frame,
+    FunctionCallCancelFrame,
+    FunctionCallInProgressFrame,
+    FunctionCallResultFrame,
+    LLMFullResponseEndFrame,
+    LLMFullResponseStartFrame,
+    LLMMessagesFrame,
+    LLMTextFrame,
+    LLMUpdateSettingsFrame,
+    UserImageRawFrame,
+    VisionImageRawFrame,
+)
+from pipecat.metrics.metrics import LLMTokenUsage
+from pipecat.processors.aggregators.llm_response import (
+    LLMAssistantContextAggregator,
+    LLMUserContextAggregator,
+)
+from pipecat.processors.aggregators.openai_llm_context import (
+    OpenAILLMContext,
+    OpenAILLMContextFrame,
+)
+from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.ai_services import LLMService
+
+try:
+    from anthropic import NOT_GIVEN, NotGiven
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error(
+        "In order to use Anthropic, you need to `pip install pipecat-ai[anthropic]`. "
+        + "Also, set `ANTHROPIC_API_KEY` environment variable."
+    )
+    raise Exception(f"Missing module: {e}")
+
+
+@dataclass
+class BedrockContextAggregatorPair:  # holds the user and assistant aggregators built for one context
+    _user: "BedrockUserContextAggregator"  # exposed through user()
+    _assistant: "BedrockAssistantContextAggregator"  # exposed through assistant()
+
+    def user(self) -> "BedrockUserContextAggregator":  # accessor for the user-side aggregator
+        return self._user
+
+    def assistant(self) -> "BedrockAssistantContextAggregator":  # accessor for the assistant-side aggregator
+        return self._assistant
+
+
+class BedrockLLMService(LLMService):
+    """This class implements inference with AWS Bedrock models including Amazon Nova and Anthropic Claude.
+    
+    AWS credentials (access key, secret key, optional session token) are passed to the constructor.
+    """
+    class InputParams(BaseModel):
+        max_tokens: Optional[int] = Field(default_factory=lambda: 4096, ge=1)  # sent as inferenceConfig.maxTokens
+        temperature: Optional[float] = Field(default_factory=lambda: 0.7, ge=0.0, le=1.0)  # inferenceConfig.temperature
+        top_p: Optional[float] = Field(default_factory=lambda: 0.999, ge=0.0, le=1.0)  # inferenceConfig.topP
+        stop_sequences: Optional[List[str]] = Field(default_factory=lambda: [])  # NOTE(review): never copied into self._settings — confirm intent
+        latency: Optional[str] = Field(default_factory=lambda: "standard")  # sent as performanceConfig.latency when "standard"/"optimized"
+        additional_model_request_fields: Optional[Dict[str, Any]] = Field(default_factory=dict)  # passed as additionalModelRequestFields
+
+    def __init__(
+        self,
+        *,
+        aws_access_key: str,
+        aws_secret_key: str,
+        aws_session_token: Optional[str] = None,
+        aws_region: str = "us-east-1",
+        model: str,  # Bedrock model ID; must contain "anthropic." or "amazon." (see _get_model_provider)
+        params: InputParams = InputParams(),  # default instance is created once at definition time and only read here
+        client_config: Optional[Config] = None,  # when omitted, a Config with 300 s timeouts and 3 attempts is built
+        **kwargs,  # forwarded unchanged to the parent constructor
+    ):
+        super().__init__(**kwargs)
+        
+        # Initialize the Bedrock client
+        if not client_config:
+            client_config = Config(
+                connect_timeout=300,  # 5 minutes
+                read_timeout=300,     # 5 minutes
+                retries={'max_attempts': 3}
+            )
+        session = boto3.Session(
+            aws_access_key_id=aws_access_key,
+            aws_secret_access_key=aws_secret_key,
+            aws_session_token=aws_session_token,
+            region_name=aws_region
+        )
+        self._client = session.client(
+            service_name='bedrock-runtime',
+            config=client_config
+        )
+        
+        self.set_model_name(model)
+        self._settings = {
+            "max_tokens": params.max_tokens,
+            "temperature": params.temperature,
+            "top_p": params.top_p,
+            "latency": params.latency,
+            "additional_model_request_fields": params.additional_model_request_fields if isinstance(params.additional_model_request_fields, dict) else {},
+        }
+        
+        # Determine model provider from model ID
+        self.model_provider = self._get_model_provider(model)  # raises ValueError for unsupported model IDs
+        logger.info(f"Using AWS Bedrock model: {model} from provider: {self.model_provider}")
+
+    def _get_model_provider(self, model: str) -> str:
+        """Return "anthropic" or "amazon" depending on which marker the model ID contains."""
+        # "anthropic." is tested before "amazon."; the first marker found wins.
+        for provider in ("anthropic", "amazon"):
+            if f"{provider}." in model:
+                return provider
+        # Neither marker is present: refuse rather than guess.
+        raise ValueError(f"Unsupported model: {model}. Only Anthropic Claude and Amazon Nova model families are supported.")
+
+    def can_generate_metrics(self) -> bool:  # this service starts TTFB, processing and token-usage metrics itself
+        return True
+
+    def create_context_aggregator(
+        self,
+        context: OpenAILLMContext,
+        *,
+        user_kwargs: Mapping[str, Any] = {},
+        assistant_kwargs: Mapping[str, Any] = {},
+    ) -> BedrockContextAggregatorPair:
+        """Create an instance of BedrockContextAggregatorPair from an
+        OpenAILLMContext. Constructor keyword arguments for both the user and
+        assistant aggregators can be provided.
+
+        Args:
+            context (OpenAILLMContext): The LLM context.
+            user_kwargs (Mapping[str, Any], optional): Additional keyword
+                arguments for the user context aggregator constructor. Defaults
+                to an empty mapping.
+            assistant_kwargs (Mapping[str, Any], optional): Additional keyword
+                arguments for the assistant context aggregator
+                constructor. Defaults to an empty mapping.
+
+        Returns:
+            BedrockContextAggregatorPair: A pair of context aggregators, one
+            for the user and one for the assistant, encapsulated in a
+            BedrockContextAggregatorPair.
+        """
+        context.set_llm_adapter(self.get_llm_adapter())  # the adapter is set on the caller's context object first
+
+        if isinstance(context, OpenAILLMContext):  # also true for BedrockLLMContext, which subclasses it
+            context = BedrockLLMContext.from_openai_context(context)
+                
+        user = BedrockUserContextAggregator(context, **user_kwargs)
+        assistant = BedrockAssistantContextAggregator(context, **assistant_kwargs)
+        return BedrockContextAggregatorPair(_user=user, _assistant=assistant)
+
+    async def _process_context(self, context: "BedrockLLMContext"):
+        """Stream one Bedrock Converse completion for ``context`` and push it as frames.
+
+        Text deltas are pushed as LLMTextFrames, a requested tool call is dispatched through
+        ``call_function`` and token usage is reported when the stream ends. Events are read in
+        the shape boto3 documents for ``converse_stream`` (``contentBlockStart.start.toolUse``,
+        ``messageStop.stopReason == "tool_use"``, ``metadata.usage``); the key names this method
+        first shipped with are still accepted as a fallback.
+        """
+        # Usage tracking
+        prompt_tokens = 0
+        completion_tokens = 0
+        completion_tokens_estimate = 0
+        use_completion_tokens_estimate = False
+
+        try:
+            await self.push_frame(LLMFullResponseStartFrame())
+            await self.start_processing_metrics()
+            await self.start_ttfb_metrics()
+
+            # Set up inference config
+            inference_config = {
+                "maxTokens": self._settings["max_tokens"],
+                "temperature": self._settings["temperature"],
+                "topP": self._settings["top_p"],
+            }
+
+            # Prepare request parameters
+            request_params = {
+                "modelId": self.model_name,
+                "messages": context.messages,
+                "inferenceConfig": inference_config,
+                "additionalModelRequestFields": self._settings["additional_model_request_fields"],
+            }
+
+            # Only send a system prompt when one is actually set: the default is the
+            # NOT_GIVEN sentinel, which is not a string and is not valid as the text value.
+            if isinstance(context.system, str) and context.system:
+                request_params["system"] = [{"text": context.system}]
+
+            # Add tools if present
+            if context.tools:
+                logger.debug(f"Bedrock tools: {context.tools}")
+                tool_config = {"tools": context.tools}
+
+                # Map the OpenAI-style tool_choice; "none" (and anything unrecognized)
+                # sends no toolChoice at all.
+                if context.tool_choice == "auto":
+                    tool_config["toolChoice"] = {"auto": {}}
+                elif isinstance(context.tool_choice, dict) and "function" in context.tool_choice:
+                    tool_name = context.tool_choice["function"]["name"]
+                    tool_config["toolChoice"] = {"tool": {"name": tool_name}}
+
+                request_params["toolConfig"] = tool_config
+
+            # Add performance config if latency is specified
+            if self._settings["latency"] in ["standard", "optimized"]:
+                request_params["performanceConfig"] = {"latency": self._settings["latency"]}
+
+            logger.debug(f"Calling Bedrock model with: {request_params}")
+
+            # Call Bedrock with streaming. NOTE(review): this call and the loop below are synchronous.
+            response = self._client.converse_stream(**request_params)
+
+            await self.stop_ttfb_metrics()
+
+            # Process the streaming response
+            tool_use_block = None
+            json_accumulator = ""
+
+            for event in response["stream"]:
+                # Handle text content
+                if "contentBlockDelta" in event:
+                    delta = event["contentBlockDelta"]["delta"]
+                    if "text" in delta:
+                        await self.push_frame(LLMTextFrame(delta["text"]))
+                        completion_tokens_estimate += self._estimate_tokens(delta["text"])
+                    elif "toolUse" in delta and "input" in delta["toolUse"]:
+                        # The input arrives as string fragments of one JSON document, so
+                        # they are concatenated as-is; json.dumps on a string fragment
+                        # would quote it and corrupt the accumulated document.
+                        fragment = delta["toolUse"]["input"]
+                        if not isinstance(fragment, str):
+                            fragment = json.dumps(fragment)
+                        json_accumulator += fragment
+                        completion_tokens_estimate += self._estimate_tokens(fragment)
+
+                # Handle tool use start
+                elif "contentBlockStart" in event:
+                    content_block = event["contentBlockStart"]
+                    # boto3 nests the block under "start"; fall back to the flat form.
+                    tool_use = content_block.get("start", content_block).get("toolUse")
+                    if tool_use:
+                        tool_use_block = {
+                            "id": tool_use.get("toolUseId", ""),
+                            "name": tool_use.get("name", ""),
+                        }
+                        json_accumulator = ""
+
+                # Handle message completion with tool use
+                elif "messageStop" in event or "messageDelta" in event:
+                    stop_info = event.get("messageStop") or event.get("messageDelta") or {}
+                    if stop_info.get("stopReason") in ("tool_use", "toolUse") and tool_use_block:
+                        try:
+                            arguments = json.loads(json_accumulator) if json_accumulator else {}
+                            await self.call_function(
+                                context=context,
+                                tool_call_id=tool_use_block["id"],
+                                function_name=tool_use_block["name"],
+                                arguments=arguments,
+                            )
+                        except json.JSONDecodeError:
+                            logger.error(f"Failed to parse tool arguments: {json_accumulator}")
+
+                # Handle usage metrics if available (boto3 reports them in the "metadata" event)
+                usage = event.get("metadata", {}).get("usage") or event.get("usage")
+                if usage:
+                    prompt_tokens += usage.get("inputTokens", 0)
+                    completion_tokens += usage.get("outputTokens", 0)
+
+        except asyncio.CancelledError:
+            # If we're interrupted, we won't get a complete usage report. So set our flag to use the
+            # token estimate. Then reraise the exception so all the processors running in this task
+            # also get cancelled.
+            use_completion_tokens_estimate = True
+            raise
+        except httpx.TimeoutException:
+            await self._call_event_handler("on_completion_timeout")
+        except Exception as e:
+            logger.exception(f"{self} exception: {e}")
+        finally:
+            await self.stop_processing_metrics()
+            await self.push_frame(LLMFullResponseEndFrame())
+            comp_tokens = completion_tokens
+            if use_completion_tokens_estimate:
+                comp_tokens = completion_tokens_estimate
+            await self._report_usage_metrics(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=comp_tokens,
+            )
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        await super().process_frame(frame, direction)
+
+        context = None  # set below only for frames that should trigger a completion
+        if isinstance(frame, OpenAILLMContextFrame):
+            context = BedrockLLMContext.upgrade_to_bedrock(frame.context)
+        elif isinstance(frame, LLMMessagesFrame):
+            context = BedrockLLMContext.from_messages(frame.messages)  # NOTE(review): from_messages stores the messages as given
+        elif isinstance(frame, VisionImageRawFrame):
+            # This is only useful in very simple pipelines because it creates
+            # a new context. Generally we want a context manager to catch
+            # UserImageRawFrames coming through the pipeline and add them
+            # to the context.
+            context = BedrockLLMContext.from_image_frame(frame)
+        elif isinstance(frame, LLMUpdateSettingsFrame):
+            await self._update_settings(frame.settings)
+        else:
+            await self.push_frame(frame, direction)  # every other frame type is passed along unchanged
+
+        if context:
+            await self._process_context(context)
+
+    def _estimate_tokens(self, text: str) -> int:  # heuristic: 1.3 x the number of pieces left after splitting on non-word runs
+        return int(len(re.split(r"[^\w]+", text)) * 1.3)
+
+    async def _report_usage_metrics(self, prompt_tokens: int, completion_tokens: int):
+        """Emit an LLMTokenUsage metric for one completion."""
+
+        # Skip reporting entirely when both counts are zero.
+        if not (prompt_tokens or completion_tokens):
+            return
+        usage = LLMTokenUsage(
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=prompt_tokens + completion_tokens,
+        )
+        await self.start_llm_usage_metrics(usage)
+
+
+class BedrockLLMContext(OpenAILLMContext):
+    def __init__(
+        self,
+        messages: Optional[List[dict]] = None,
+        tools: Optional[List[dict]] = None,
+        tool_choice: Optional[dict] = None,  # NOTE(review): _process_context also compares this with the strings "auto"/"none"
+        *,
+        system: Union[str, NotGiven] = NOT_GIVEN,  # system prompt text; NOT_GIVEN when none was supplied
+    ):
+        super().__init__(messages=messages, tools=tools, tool_choice=tool_choice)
+        self.system = system
+
+    @staticmethod
+    def upgrade_to_bedrock(obj: OpenAILLMContext) -> "BedrockLLMContext":  # returns the same object it was given
+        logger.debug(f"Upgrading to Bedrock: {obj}")
+        if isinstance(obj, OpenAILLMContext) and not isinstance(obj, BedrockLLMContext):
+            obj.__class__ = BedrockLLMContext  # re-type in place; BedrockLLMContext.__init__ is not run
+            obj._restructure_from_openai_messages()
+        else:  # already a BedrockLLMContext (or not an OpenAILLMContext at all)
+            obj._restructure_from_bedrock_messages()
+        return obj
+
+    @classmethod
+    def from_openai_context(cls, openai_context: OpenAILLMContext):
+        """Build a BedrockLLMContext that carries over an OpenAI context's state."""
+
+        src = openai_context
+        ctx = cls(messages=src.messages, tools=src.tools, tool_choice=src.tool_choice)
+        ctx.set_llm_adapter(src.get_llm_adapter())
+        # Restructure the carried-over, OpenAI-style messages for Bedrock.
+        ctx._restructure_from_openai_messages()
+        return ctx
+
+    @classmethod
+    def from_messages(cls, messages: List[dict]) -> "BedrockLLMContext":
+        self = cls(messages=messages)  # tools, tool_choice and system keep their defaults
+        # self._restructure_from_openai_messages()  # NOTE(review): disabled, so messages are stored as given
+        return self
+
+    @classmethod
+    def from_image_frame(cls, frame: VisionImageRawFrame) -> "BedrockLLMContext":
+        context = cls()  # fresh context built with all-default arguments
+        context.add_image_frame_message(  # the frame's image and text become the context's message
+            format=frame.format, size=frame.size, image=frame.image, text=frame.text
+        )
+        return context
+
+    def set_messages(self, messages: List):
+        self._messages[:] = messages  # slice assignment: the existing list object is updated in place
+        # self._restructure_from_openai_messages()  # NOTE(review): disabled, so messages are stored as given
+
+    # convert a message in Bedrock format into one or more messages in OpenAI format
+    def to_standard_messages(self, obj):
+        """Convert one Bedrock-format message into standard (OpenAI-style) messages.
+
+        Handles text content and function calls for both user and assistant messages.
+
+        Args:
+            obj: Message in Bedrock format:
+                {
+                    "role": "user/assistant",
+                    "content": str | [{"text": str} | {"toolUse": {...}} | {"toolResult": {...}}]
+                }
+
+        Returns:
+            List of messages in standard format. Text blocks become a single
+            message ``{"role": role, "content": [{"type": "text", "text": str}]}``;
+            assistant ``toolUse`` blocks become one ``{"role": "assistant",
+            "tool_calls": [...]}`` message; each user ``toolResult`` block becomes
+            its own ``{"role": "tool", ...}`` message. A message with any other
+            role or content type is returned unchanged as ``[obj]``.
+        """
+        role = obj.get("role")
+        content = obj.get("content")
+
+        # Anything this method does not know how to convert is passed through
+        # unchanged so that callers always receive a list, never None.
+        if role not in ("assistant", "user") or not isinstance(content, (str, list)):
+            return [obj]
+
+        if isinstance(content, str):
+            return [{"role": role, "content": [{"type": "text", "text": content}]}]
+
+        text_items = []
+        tool_calls = []  # filled for assistant messages only
+        tool_results = []  # filled for user messages only
+        for item in content:
+            if "text" in item:
+                text_items.append({"type": "text", "text": item["text"]})
+            elif role == "assistant" and "toolUse" in item:
+                tool_use = item["toolUse"]
+                tool_calls.append(
+                    {
+                        "type": "function",
+                        "id": tool_use["toolUseId"],
+                        "function": {
+                            "name": tool_use["name"],
+                            # The standard format carries arguments as a JSON string.
+                            "arguments": json.dumps(tool_use["input"]),
+                        },
+                    }
+                )
+            elif role == "user" and "toolResult" in item:
+                tool_result = item["toolResult"]
+                result_content = ""
+                if isinstance(tool_result["content"], list):
+                    # If several text/json entries are present, the last one wins.
+                    for content_item in tool_result["content"]:
+                        if "text" in content_item:
+                            result_content = content_item["text"]
+                        elif "json" in content_item:
+                            result_content = json.dumps(content_item["json"])
+                else:
+                    result_content = tool_result["content"]
+                tool_results.append(
+                    {
+                        "role": "tool",
+                        "tool_call_id": tool_result["toolUseId"],
+                        "content": result_content,
+                    }
+                )
+
+        messages = []
+        if text_items:
+            messages.append({"role": role, "content": text_items})
+        if tool_calls:
+            messages.append({"role": role, "tool_calls": tool_calls})
+        messages.extend(tool_results)
+        return messages
+
+    def from_standard_message(self, message):
+        """Convert standard format message to Bedrock format.
+
+        Handles conversion of text content, tool calls, and tool results.
+        Empty text content is converted to "(empty)".
+
+        Args:
+            message: Message in standard format:
+                {
+                    "role": "user/assistant/tool",
+                    "content": str | [{"type": "text", ...}],
+                    "tool_calls": [{"id": str, "function": {"name": str, "arguments": str}}]
+                }
+
+        Returns:
+            Message in Bedrock format:
+            {
+                "role": "user/assistant",
+                "content": [
+                    {"text": str} |
+                    {"toolUse": {"toolUseId": str, "name": str, "input": dict}} |
+                    {"toolResult": {"toolUseId": str, "content": [...]}}
+                ]
+            }
+            A message whose content is neither a string nor a list (and that is
+            not a tool result or tool call) is returned unchanged.
+        """
+        if message["role"] == "tool":
+            raw = message["content"]
+            # Default to a text result; upgrade to a JSON result only when the
+            # content is a string that looks like, and parses as, a JSON object.
+            tool_result_content = [{"text": raw}]
+            if isinstance(raw, str):
+                stripped = raw.strip()
+                if stripped.startswith("{") and stripped.endswith("}"):
+                    try:
+                        tool_result_content = [{"json": json.loads(stripped)}]
+                    except ValueError:
+                        # json.JSONDecodeError is a ValueError: keep the text form.
+                        tool_result_content = [{"text": raw}]
+
+            return {
+                "role": "user",
+                "content": [
+                    {
+                        "toolResult": {
+                            "toolUseId": message["tool_call_id"],
+                            "content": tool_result_content,
+                        },
+                    },
+                ],
+            }
+
+        if message.get("tool_calls"):
+            # Only the tool calls are converted; any text on the same message is dropped.
+            ret = {"role": "assistant", "content": []}
+            for tool_call in message["tool_calls"]:
+                function = tool_call["function"]
+                ret["content"].append(
+                    {
+                        "toolUse": {
+                            "toolUseId": tool_call["id"],
+                            "name": function["name"],
+                            # Standard format carries arguments as a JSON string.
+                            "input": json.loads(function["arguments"]),
+                        }
+                    }
+                )
+            return ret
+
+        # Handle text content
+        content = message.get("content")
+        if isinstance(content, str):
+            return {"role": message["role"], "content": [{"text": content or "(empty)"}]}
+        if isinstance(content, list):
+            new_content = []
+            for item in content:
+                # Non-text items are skipped.
+                if item.get("type", "") == "text":
+                    text_content = item["text"] if item["text"] != "" else "(empty)"
+                    new_content.append({"text": text_content})
+            return {"role": message["role"], "content": new_content}
+
+        return message
+
+    def add_image_frame_message(
+        self, *, format: str, size: tuple[int, int], image: bytes, text: str = None
+    ):
+        """Append a user message holding a JPEG-encoded image and optional text.
+
+        Args:
+            format: Mode string passed to ``Image.frombytes`` for the raw data.
+            size: Size tuple passed to ``Image.frombytes``.
+            image: Raw image bytes.
+            text: Optional text block placed after the image.
+
+        NOTE(review): the Bedrock Converse API documents image blocks as
+        ``{"image": {"format": ..., "source": {"bytes": <raw bytes>}}}``; the
+        flat, base64-string shape built here should be confirmed against it.
+        """
+        jpeg_buffer = io.BytesIO()
+        pil_image = Image.frombytes(format, size, image)
+        pil_image.save(jpeg_buffer, format="JPEG")
+        jpeg_b64 = base64.b64encode(jpeg_buffer.getvalue()).decode("utf-8")
+
+        # The image block always comes first; text, when present, follows it.
+        image_block = {"type": "image", "format": "jpeg", "source": {"bytes": jpeg_b64}}
+        content = [image_block, {"text": text}] if text else [image_block]
+        self.add_message({"role": "user", "content": content})
+
+    def add_message(self, message):
+        """Append ``message``, merging it into the previous one when roles match.
+
+        Bedrock requires that roles alternate, so a message with the same role as
+        the last stored message has its content appended to that message instead
+        of being stored separately. String content on either side is first
+        wrapped as a Bedrock text block (``{"text": ...}``).
+
+        Errors are logged rather than raised.
+        """
+        try:
+            last = self.messages[-1] if self.messages else None
+            if last is None or last["role"] != message["role"]:
+                self.messages.append(message)
+                return
+
+            # Both sides must be lists of content blocks before they can be joined.
+            if isinstance(last["content"], str):
+                last["content"] = [{"text": last["content"]}]
+            incoming = message["content"]
+            if isinstance(incoming, str):
+                # Use a local so the caller's message dict is left untouched.
+                incoming = [{"text": incoming}]
+            last["content"].extend(incoming)
+        except Exception as e:
+            logger.error(f"Error adding message: {e}")
+
+    def _restructure_from_bedrock_messages(self):
+        """Normalise messages that are already in Bedrock format.
+
+        Three passes are applied to ``self.messages`` in place:
+
+        1. A leading system message is moved to ``self.system`` (or, when it is
+           the only message, relabelled as a user message).
+        2. Content is coerced to a list of blocks, with empty text replaced by
+           "(empty)".
+        3. Consecutive messages with the same role are merged.
+        """
+        # Handle system message if present at the beginning
+        if self.messages and self.messages[0]["role"] == "system":
+            if len(self.messages) == 1:
+                self.messages[0]["role"] = "user"
+            else:
+                system_content = self.messages.pop(0)["content"]
+                first = (
+                    system_content[0]
+                    if isinstance(system_content, list) and system_content
+                    else None
+                )
+                if isinstance(first, dict) and "text" in first:
+                    # Only the first text block is used as the system prompt.
+                    self.system = first["text"]
+                else:
+                    self.system = str(system_content)
+
+        # Ensure content is properly formatted
+        for msg in self.messages:
+            content = msg["content"]
+            if isinstance(content, str):
+                # An empty string gets the same placeholder as any other empty text.
+                msg["content"] = [{"text": content or "(empty)"}]
+            elif not content:
+                msg["content"] = [{"text": "(empty)"}]
+            elif isinstance(content, list):
+                for idx, item in enumerate(content):
+                    if isinstance(item, dict) and "text" in item and item["text"] == "":
+                        item["text"] = "(empty)"
+                    elif isinstance(item, str) and item == "":
+                        content[idx] = {"text": "(empty)"}
+
+        # Merge consecutive messages with the same role
+        merged_messages = []
+        for msg in self.messages:
+            if merged_messages and merged_messages[-1]["role"] == msg["role"]:
+                merged_messages[-1]["content"].extend(msg["content"])
+            else:
+                merged_messages.append(msg)
+
+        # Rewrite the existing list object rather than rebinding it.
+        self.messages.clear()
+        self.messages.extend(merged_messages)
+
+    def _restructure_from_openai_messages(self):
+        """Convert OpenAI-style messages in ``self._messages`` to Bedrock format.
+
+        Each message is mapped through ``from_standard_message``; a leading system
+        message is then moved to ``self.system`` as plain text, consecutive
+        messages with the same role are merged, and empty content is replaced by
+        an "(empty)" text block. All content ends up as a list of Bedrock blocks.
+        """
+        # Map every message to Bedrock format, rewriting the list in place.
+        try:
+            self._messages[:] = [self.from_standard_message(m) for m in self._messages]
+        except Exception as e:
+            logger.error(f"Error mapping messages: {e}")
+
+        # See if we should pull the system message out of our context.messages list. (For
+        # compatibility with Open AI messages format.)
+        if self.messages and self.messages[0]["role"] == "system":
+            if len(self.messages) == 1:
+                # If we only have a system message in the list, all we can really do
+                # without introducing too much magic is change the role to "user".
+                self.messages[0]["role"] = "user"
+            else:
+                # By now the content is normally a list of blocks; store the text of
+                # the first block so ``self.system`` is a string, not a block list.
+                system_content = self.messages.pop(0)["content"]
+                first = (
+                    system_content[0]
+                    if isinstance(system_content, list) and system_content
+                    else None
+                )
+                if isinstance(first, dict) and "text" in first:
+                    self.system = first["text"]
+                else:
+                    self.system = str(system_content)
+
+        # Coerce content to block lists and merge consecutive messages with the same role.
+        merged = []
+        for message in self.messages:
+            content = message["content"]
+            if isinstance(content, str):
+                content = [{"text": content}] if content else []
+            elif not content:
+                content = []
+            message["content"] = content
+            if merged and merged[-1]["role"] == message["role"]:
+                merged[-1]["content"].extend(content)
+            else:
+                merged.append(message)
+
+        # Avoid empty content in messages
+        for message in merged:
+            if isinstance(message["content"], list) and len(message["content"]) == 0:
+                message["content"] = [{"text": "(empty)"}]
+
+        self.messages.clear()
+        self.messages.extend(merged)
+
+    def get_messages_for_persistent_storage(self):
+        """Return the base class's storage messages, with the system prompt first.
+
+        When ``self.system`` is truthy it is re-inserted at index 0 as a
+        ``{"role": "system", ...}`` message.
+        """
+        messages = super().get_messages_for_persistent_storage()
+        if self.system:
+            messages.insert(0, {"role": "system", "content": self.system})
+        return messages
+
+    def get_messages_for_logging(self) -> str:
+        """Return the messages as a JSON string with image payloads replaced by "...".
+
+        The stored messages are deep-copied first, so redaction never touches the
+        live context. Two image block shapes are recognised: the flat one built by
+        ``add_image_frame_message`` (``{"type": "image", "source": {...}}``) and
+        the nested one (``{"image": {"source": {...}}}``).
+        """
+        msgs = []
+        for message in self.messages:
+            msg = copy.deepcopy(message)
+            content = msg.get("content")
+            if isinstance(content, list):
+                for item in content:
+                    if not isinstance(item, dict):
+                        continue
+                    # Check both the block itself and a nested "image" dict.
+                    for holder in (item, item.get("image")):
+                        if not isinstance(holder, dict):
+                            continue
+                        source = holder.get("source")
+                        if isinstance(source, dict) and "bytes" in source:
+                            source["bytes"] = "..."
+            msgs.append(msg)
+        return json.dumps(msgs)
+
+
+class BedrockUserContextAggregator(LLMUserContextAggregator):
+    """User-side context aggregator for Bedrock; adds nothing to its base class."""
+
+    pass
+
+
+class BedrockAssistantContextAggregator(LLMAssistantContextAggregator):
+    """Assistant-side aggregator that records tool calls in Bedrock message format.
+
+    A tool call is written to the context as an assistant ``toolUse`` block
+    followed by a user ``toolResult`` placeholder; the placeholder text is later
+    overwritten with the real result, "COMPLETED" or "CANCELLED".
+    """
+
+    async def handle_function_call_in_progress(self, frame: FunctionCallInProgressFrame):
+        """Add the ``toolUse`` block and an "IN_PROGRESS" ``toolResult`` placeholder."""
+        tool_use = {
+            "toolUseId": frame.tool_call_id,
+            "name": frame.function_name,
+            # Fall back to an empty object when the call carries no arguments.
+            "input": frame.arguments if frame.arguments else {},
+        }
+        self._context.add_message({"role": "assistant", "content": [{"toolUse": tool_use}]})
+
+        placeholder = {
+            "toolUseId": frame.tool_call_id,
+            "content": [{"text": "IN_PROGRESS"}],
+        }
+        self._context.add_message({"role": "user", "content": [{"toolResult": placeholder}]})
+
+    async def handle_function_call_result(self, frame: FunctionCallResultFrame):
+        """Store the JSON-encoded result, or "COMPLETED" when the result is falsy."""
+        result = json.dumps(frame.result) if frame.result else "COMPLETED"
+        await self._update_function_call_result(frame.function_name, frame.tool_call_id, result)
+
+    async def handle_function_call_cancel(self, frame: FunctionCallCancelFrame):
+        """Mark the matching tool result as "CANCELLED"."""
+        await self._update_function_call_result(
+            frame.function_name, frame.tool_call_id, "CANCELLED"
+        )
+
+    async def _update_function_call_result(
+        self, function_name: str, tool_call_id: str, result: Any
+    ):
+        """Overwrite the content of every ``toolResult`` block matching ``tool_call_id``.
+
+        Only user messages are searched. ``function_name`` is accepted but not used
+        for matching.
+        """
+        for message in self._context.messages:
+            if message["role"] != "user":
+                continue
+            for content in message["content"]:
+                if not isinstance(content, dict):
+                    continue
+                tool_result = content.get("toolResult")
+                if tool_result and tool_result["toolUseId"] == tool_call_id:
+                    tool_result["content"] = [{"text": result}]
+
+    async def handle_user_image_frame(self, frame: UserImageRawFrame):
+        """Complete the originating tool call, then add the image to the context."""
+        request = frame.request
+        await self._update_function_call_result(
+            request.function_name, request.tool_call_id, "COMPLETED"
+        )
+        self._context.add_image_frame_message(
+            format=frame.format,
+            size=frame.size,
+            image=frame.image,
+            text=request.context,
+        )
+        
\ No newline at end of file

From 88c9e08bd819f9768e234e0e9341388d86997e87 Mon Sep 17 00:00:00 2001
From: Adithya Suresh <adxthya@amazon.com>
Date: Thu, 3 Apr 2025 11:27:17 +0000
Subject: [PATCH 17/97] Updated tools parsing logic

---
 src/pipecat/services/aws/llm.py | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/src/pipecat/services/aws/llm.py b/src/pipecat/services/aws/llm.py
index 3b476e03b..2f762e9bd 100644
--- a/src/pipecat/services/aws/llm.py
+++ b/src/pipecat/services/aws/llm.py
@@ -210,7 +210,6 @@ class BedrockLLMService(LLMService):
                 
             # Add tools if present
             if context.tools:
-                print(context.tools)
                 tool_config = {
                     "tools": context.tools
                 }
@@ -257,23 +256,22 @@ class BedrockLLMService(LLMService):
                         completion_tokens_estimate += self._estimate_tokens(delta["text"])
                     elif "toolUse" in delta and "input" in delta["toolUse"]:
                         # Handle partial JSON for tool use
-                        json_str = json.dumps(delta["toolUse"]["input"])
-                        json_accumulator += json_str
-                        completion_tokens_estimate += self._estimate_tokens(json_str)
+                        json_accumulator += delta["toolUse"]["input"]
+                        completion_tokens_estimate += self._estimate_tokens(delta["toolUse"]["input"])
                 
                 # Handle tool use start
                 elif "contentBlockStart" in event:
-                    content_block = event["contentBlockStart"]
-                    if content_block.get("type") == "toolUse":
+                    content_block_start = event["contentBlockStart"]['start']
+                    if "toolUse" in content_block_start:
                         tool_use_block = {
-                            "id": content_block["toolUse"].get("toolUseId", ""),
-                            "name": content_block["toolUse"].get("name", "")
+                            "id": content_block_start["toolUse"].get("toolUseId", ""),
+                            "name": content_block_start["toolUse"].get("name", "")
                         }
                         json_accumulator = ""
                 
                 # Handle message completion with tool use
-                elif "messageDelta" in event and "stopReason" in event["messageDelta"]:
-                    if event["messageDelta"]["stopReason"] == "toolUse" and tool_use_block:
+                elif "messageStop" in event and "stopReason" in event["messageStop"]:
+                    if event["messageStop"]["stopReason"] == "tool_use" and tool_use_block:
                         try:
                             arguments = json.loads(json_accumulator) if json_accumulator else {}
                             await self.call_function(
@@ -286,8 +284,8 @@ class BedrockLLMService(LLMService):
                             logger.error(f"Failed to parse tool arguments: {json_accumulator}")
                 
                 # Handle usage metrics if available
-                if "usage" in event:
-                    usage = event["usage"]
+                if "metadata" in event and "usage" in event["metadata"]:
+                    usage = event["metadata"]["usage"]
                     prompt_tokens += usage.get("inputTokens", 0)
                     completion_tokens += usage.get("outputTokens", 0)
 
@@ -516,7 +514,6 @@ class BedrockLLMContext(OpenAILLMContext):
                 ]
             }
         """
-        print(message)
         if message["role"] == "tool":
             # Try to parse the content as JSON if it looks like JSON
             try:
@@ -623,9 +620,6 @@ class BedrockLLMContext(OpenAILLMContext):
         """Restructure messages in Bedrock format by handling system messages, 
         merging consecutive messages with the same role, and ensuring proper content formatting.
         """
-
-        print(self.messages)
-
         # Handle system message if present at the beginning
         if self.messages and self.messages[0]["role"] == "system":
             if len(self.messages) == 1:
@@ -739,7 +733,7 @@ class BedrockAssistantContextAggregator(LLMAssistantContextAggregator):
                         "toolUse": {
                             "toolUseId": frame.tool_call_id,
                             "name": frame.function_name,
-                            "input": frame.arguments
+                            "input": frame.arguments if frame.arguments else {}
                         }
                     }
                 ],

From 05ae8d3ffa11e07f8868e1e81c387ce4419dddfa Mon Sep 17 00:00:00 2001
From: Adithya Suresh <adxthya@amazon.com>
Date: Fri, 4 Apr 2025 05:36:09 +0000
Subject: [PATCH 18/97] Removed OpenAI based context formatting

---
 src/pipecat/services/aws/llm.py | 580 ++++++++++++++++----------------
 1 file changed, 286 insertions(+), 294 deletions(-)

diff --git a/src/pipecat/services/aws/llm.py b/src/pipecat/services/aws/llm.py
index 2f762e9bd..cb21eccaa 100644
--- a/src/pipecat/services/aws/llm.py
+++ b/src/pipecat/services/aws/llm.py
@@ -46,16 +46,6 @@ from pipecat.processors.aggregators.openai_llm_context import (
 from pipecat.processors.frame_processor import FrameDirection
 from pipecat.services.ai_services import LLMService
 
-try:
-    from anthropic import NOT_GIVEN, NotGiven
-except ModuleNotFoundError as e:
-    logger.error(f"Exception: {e}")
-    logger.error(
-        "In order to use Anthropic, you need to `pip install pipecat-ai[anthropic]`. "
-        + "Also, set `ANTHROPIC_API_KEY` environment variable."
-    )
-    raise Exception(f"Missing module: {e}")
-
 
 @dataclass
 class BedrockContextAggregatorPair:
@@ -69,288 +59,6 @@ class BedrockContextAggregatorPair:
         return self._assistant
 
 
-class BedrockLLMService(LLMService):
-    """This class implements inference with AWS Bedrock models including Amazon Nova and Anthropic Claude.
-    
-    Requires AWS credentials to be configured in the environment or through boto3 configuration.
-    """
-    class InputParams(BaseModel):
-        max_tokens: Optional[int] = Field(default_factory=lambda: 4096, ge=1)
-        temperature: Optional[float] = Field(default_factory=lambda: 0.7, ge=0.0, le=1.0)
-        top_p: Optional[float] = Field(default_factory=lambda: 0.999, ge=0.0, le=1.0)
-        stop_sequences: Optional[List[str]] = Field(default_factory=lambda: [])
-        latency: Optional[str] = Field(default_factory=lambda: "standard")
-        additional_model_request_fields: Optional[Dict[str, Any]] = Field(default_factory=dict)
-
-    def __init__(
-        self,
-        *,
-        aws_access_key: str,
-        aws_secret_key: str,
-        aws_session_token: Optional[str] = None,
-        aws_region: str = "us-east-1",
-        model: str,
-        params: InputParams = InputParams(),
-        client_config: Optional[Config] = None,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-        
-        # Initialize the Bedrock client
-        if not client_config:
-            client_config = Config(
-                connect_timeout=300,  # 5 minutes
-                read_timeout=300,     # 5 minutes
-                retries={'max_attempts': 3}
-            )
-        session = boto3.Session(
-            aws_access_key_id=aws_access_key,
-            aws_secret_access_key=aws_secret_key,
-            aws_session_token=aws_session_token,
-            region_name=aws_region
-        )
-        self._client = session.client(
-            service_name='bedrock-runtime',
-            config=client_config
-        )
-        
-        self.set_model_name(model)
-        self._settings = {
-            "max_tokens": params.max_tokens,
-            "temperature": params.temperature,
-            "top_p": params.top_p,
-            "latency": params.latency,
-            "additional_model_request_fields": params.additional_model_request_fields if isinstance(params.additional_model_request_fields, dict) else {},
-        }
-        
-        # Determine model provider from model ID
-        self.model_provider = self._get_model_provider(model)
-        logger.info(f"Using AWS Bedrock model: {model} from provider: {self.model_provider}")
-
-    def _get_model_provider(self, model: str) -> str:
-        """Determine the model provider from the model ID"""
-        if "anthropic." in model:
-            return "anthropic"
-        elif "amazon." in model:
-            return "amazon"
-        else:
-            raise ValueError(f"Unsupported model: {model}. Only Anthropic Claude and Amazon Nova model families are supported.")
-
-    def can_generate_metrics(self) -> bool:
-        return True
-
-    def create_context_aggregator(
-        self,
-        context: OpenAILLMContext,
-        *,
-        user_kwargs: Mapping[str, Any] = {},
-        assistant_kwargs: Mapping[str, Any] = {},
-    ) -> BedrockContextAggregatorPair:
-        """Create an instance of BedrockContextAggregatorPair from an
-        OpenAILLMContext. Constructor keyword arguments for both the user and
-        assistant aggregators can be provided.
-
-        Args:
-            context (OpenAILLMContext): The LLM context.
-            user_kwargs (Mapping[str, Any], optional): Additional keyword
-                arguments for the user context aggregator constructor. Defaults
-                to an empty mapping.
-            assistant_kwargs (Mapping[str, Any], optional): Additional keyword
-                arguments for the assistant context aggregator
-                constructor. Defaults to an empty mapping.
-
-        Returns:
-            BedrockContextAggregatorPair: A pair of context aggregators, one
-            for the user and one for the assistant, encapsulated in an
-            BedrockContextAggregatorPair.
-        """
-        context.set_llm_adapter(self.get_llm_adapter())
-
-        if isinstance(context, OpenAILLMContext):
-            context = BedrockLLMContext.from_openai_context(context)
-                
-        user = BedrockUserContextAggregator(context, **user_kwargs)
-        assistant = BedrockAssistantContextAggregator(context, **assistant_kwargs)
-        return BedrockContextAggregatorPair(_user=user, _assistant=assistant)
-
-    async def _process_context(self, context: "BedrockLLMContext"):
-        # Usage tracking
-        prompt_tokens = 0
-        completion_tokens = 0
-        completion_tokens_estimate = 0
-        use_completion_tokens_estimate = False
-
-        try:
-            await self.push_frame(LLMFullResponseStartFrame())
-            await self.start_processing_metrics()
-
-            # logger.debug(
-            #     f"{self}: Generating chat with Bedrock model {self.model_name} | [{context.get_messages_for_logging()}]"
-            # )
-
-            await self.start_ttfb_metrics()
-            
-            # Set up inference config
-            inference_config = {
-                "maxTokens": self._settings["max_tokens"],
-                "temperature": self._settings["temperature"],
-                "topP": self._settings["top_p"],
-            }
-            
-            # Prepare request parameters
-            request_params = {
-                "modelId": self.model_name,
-                "messages": context.messages,
-                "inferenceConfig": inference_config,
-                "additionalModelRequestFields": self._settings["additional_model_request_fields"]
-            }
-            
-            # Add system message
-            request_params["system"] = [{"text": context.system}]
-                
-            # Add tools if present
-            if context.tools:
-                tool_config = {
-                    "tools": context.tools
-                }
-                
-                # Add tool_choice if specified
-                if context.tool_choice:
-                    if context.tool_choice == "auto":
-                        tool_config["toolChoice"] = {"auto": {}}
-                    elif context.tool_choice == "none":
-                        # Skip adding toolChoice for "none"
-                        pass
-                    elif isinstance(context.tool_choice, dict) and "function" in context.tool_choice:
-                        tool_config["toolChoice"] = {
-                            "tool": {
-                                "name": context.tool_choice["function"]["name"]
-                            }
-                        }
-                
-                request_params["toolConfig"] = tool_config
-            
-            # Add performance config if latency is specified
-            if self._settings["latency"] in ["standard", "optimized"]:
-                request_params["performanceConfig"] = {
-                    "latency": self._settings["latency"]
-                }
-            
-            logger.debug(f"Calling Bedrock model with: {request_params}")
-            
-            # Call Bedrock with streaming
-            response = self._client.converse_stream(**request_params)
-            
-            await self.stop_ttfb_metrics()
-            
-            # Process the streaming response
-            tool_use_block = None
-            json_accumulator = ""
-            
-            for event in response["stream"]:
-                # Handle text content
-                if "contentBlockDelta" in event:
-                    delta = event["contentBlockDelta"]["delta"]
-                    if "text" in delta:
-                        await self.push_frame(LLMTextFrame(delta["text"]))
-                        completion_tokens_estimate += self._estimate_tokens(delta["text"])
-                    elif "toolUse" in delta and "input" in delta["toolUse"]:
-                        # Handle partial JSON for tool use
-                        json_accumulator += delta["toolUse"]["input"]
-                        completion_tokens_estimate += self._estimate_tokens(delta["toolUse"]["input"])
-                
-                # Handle tool use start
-                elif "contentBlockStart" in event:
-                    content_block_start = event["contentBlockStart"]['start']
-                    if "toolUse" in content_block_start:
-                        tool_use_block = {
-                            "id": content_block_start["toolUse"].get("toolUseId", ""),
-                            "name": content_block_start["toolUse"].get("name", "")
-                        }
-                        json_accumulator = ""
-                
-                # Handle message completion with tool use
-                elif "messageStop" in event and "stopReason" in event["messageStop"]:
-                    if event["messageStop"]["stopReason"] == "tool_use" and tool_use_block:
-                        try:
-                            arguments = json.loads(json_accumulator) if json_accumulator else {}
-                            await self.call_function(
-                                context=context,
-                                tool_call_id=tool_use_block["id"],
-                                function_name=tool_use_block["name"],
-                                arguments=arguments,
-                            )
-                        except json.JSONDecodeError:
-                            logger.error(f"Failed to parse tool arguments: {json_accumulator}")
-                
-                # Handle usage metrics if available
-                if "metadata" in event and "usage" in event["metadata"]:
-                    usage = event["metadata"]["usage"]
-                    prompt_tokens += usage.get("inputTokens", 0)
-                    completion_tokens += usage.get("outputTokens", 0)
-
-        except asyncio.CancelledError:
-            # If we're interrupted, we won't get a complete usage report. So set our flag to use the
-            # token estimate. The reraise the exception so all the processors running in this task
-            # also get cancelled.
-            use_completion_tokens_estimate = True
-            raise
-        except httpx.TimeoutException:
-            await self._call_event_handler("on_completion_timeout")
-        except Exception as e:
-            logger.exception(f"{self} exception: {e}")
-        finally:
-            await self.stop_processing_metrics()
-            await self.push_frame(LLMFullResponseEndFrame())
-            comp_tokens = (
-                completion_tokens
-                if not use_completion_tokens_estimate
-                else completion_tokens_estimate
-            )
-            await self._report_usage_metrics(
-                prompt_tokens=prompt_tokens,
-                completion_tokens=comp_tokens,
-            )
-
-    async def process_frame(self, frame: Frame, direction: FrameDirection):
-        await super().process_frame(frame, direction)
-
-        context = None
-        if isinstance(frame, OpenAILLMContextFrame):
-            context = BedrockLLMContext.upgrade_to_bedrock(frame.context)
-        elif isinstance(frame, LLMMessagesFrame):
-            context = BedrockLLMContext.from_messages(frame.messages)
-        elif isinstance(frame, VisionImageRawFrame):
-            # This is only useful in very simple pipelines because it creates
-            # a new context. Generally we want a context manager to catch
-            # UserImageRawFrames coming through the pipeline and add them
-            # to the context.
-            context = BedrockLLMContext.from_image_frame(frame)
-        elif isinstance(frame, LLMUpdateSettingsFrame):
-            await self._update_settings(frame.settings)
-        else:
-            await self.push_frame(frame, direction)
-
-        if context:
-            await self._process_context(context)
-
-    def _estimate_tokens(self, text: str) -> int:
-        return int(len(re.split(r"[^\w]+", text)) * 1.3)
-
-    async def _report_usage_metrics(
-        self,
-        prompt_tokens: int,
-        completion_tokens: int,
-    ):
-        if prompt_tokens or completion_tokens:
-            tokens = LLMTokenUsage(
-                prompt_tokens=prompt_tokens,
-                completion_tokens=completion_tokens,
-                total_tokens=prompt_tokens + completion_tokens,
-            )
-            await self.start_llm_usage_metrics(tokens)
-
-
 class BedrockLLMContext(OpenAILLMContext):
     def __init__(
         self,
@@ -358,7 +66,7 @@ class BedrockLLMContext(OpenAILLMContext):
         tools: Optional[List[dict]] = None,
         tool_choice: Optional[dict] = None,
         *,
-        system: Union[str, NotGiven] = NOT_GIVEN,
+        system: Optional[str] = None,
     ):
         super().__init__(messages=messages, tools=tools, tool_choice=tool_choice)
         self.system = system
@@ -375,6 +83,7 @@ class BedrockLLMContext(OpenAILLMContext):
 
     @classmethod
     def from_openai_context(cls, openai_context: OpenAILLMContext):
+        logger.debug("from_openai_context called")
         self = cls(
             messages=openai_context.messages,
             tools=openai_context.tools,
@@ -621,6 +330,7 @@ class BedrockLLMContext(OpenAILLMContext):
         merging consecutive messages with the same role, and ensuring proper content formatting.
         """
         # Handle system message if present at the beginning
+        logger.debug(f"_restructure_from_bedrock_messages: {self.messages}")
         if self.messages and self.messages[0]["role"] == "system":
             if len(self.messages) == 1:
                 self.messages[0]["role"] = "user"
@@ -653,6 +363,7 @@ class BedrockLLMContext(OpenAILLMContext):
         self.messages.extend(merged_messages)
 
     def _restructure_from_openai_messages(self):
+        logger.debug(f"_restructure_from_openai_messages: {self.messages}")
         # first, map across self._messages calling self.from_standard_message(m) to modify messages in place
         try:
             self._messages[:] = [self.from_standard_message(m) for m in self._messages]
@@ -794,4 +505,285 @@ class BedrockAssistantContextAggregator(LLMAssistantContextAggregator):
             image=frame.image,
             text=frame.request.context,
         )
-        
\ No newline at end of file
+
+
+class BedrockLLMService(LLMService):
+    """This class implements inference with AWS Bedrock models including Amazon Nova and Anthropic Claude.
+    
+    Requires AWS credentials to be configured in the environment or through boto3 configuration.
+    """
+    class InputParams(BaseModel):
+        max_tokens: Optional[int] = Field(default_factory=lambda: 4096, ge=1)
+        temperature: Optional[float] = Field(default_factory=lambda: 0.7, ge=0.0, le=1.0)
+        top_p: Optional[float] = Field(default_factory=lambda: 0.999, ge=0.0, le=1.0)
+        stop_sequences: Optional[List[str]] = Field(default_factory=lambda: [])
+        latency: Optional[str] = Field(default_factory=lambda: "standard")
+        additional_model_request_fields: Optional[Dict[str, Any]] = Field(default_factory=dict)
+
+    def __init__(
+        self,
+        *,
+        aws_access_key: Optional[str] = None,
+        aws_secret_key: Optional[str] = None,
+        aws_session_token: Optional[str] = None,
+        aws_region: str = "us-east-1",
+        model: str,
+        params: InputParams = InputParams(),
+        client_config: Optional[Config] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        
+        # Initialize the Bedrock client
+        if not client_config:
+            client_config = Config(
+                connect_timeout=300,  # 5 minutes
+                read_timeout=300,     # 5 minutes
+                retries={'max_attempts': 3}
+            )
+        session = boto3.Session(
+            aws_access_key_id=aws_access_key,
+            aws_secret_access_key=aws_secret_key,
+            aws_session_token=aws_session_token,
+            region_name=aws_region
+        )
+        self._client = session.client(
+            service_name='bedrock-runtime',
+            config=client_config
+        )
+        
+        self.set_model_name(model)
+        self._settings = {
+            "max_tokens": params.max_tokens,
+            "temperature": params.temperature,
+            "top_p": params.top_p,
+            "latency": params.latency,
+            "additional_model_request_fields": params.additional_model_request_fields if isinstance(params.additional_model_request_fields, dict) else {},
+        }
+        
+        # Determine model provider from model ID
+        self.model_provider = self._get_model_provider(model)
+        logger.info(f"Using AWS Bedrock model: {model} from provider: {self.model_provider}")
+
+    def _get_model_provider(self, model: str) -> str:
+        """Determine the model provider from the model ID"""
+        if "anthropic." in model:
+            return "anthropic"
+        elif "amazon." in model:
+            return "amazon"
+        else:
+            raise ValueError(f"Unsupported model: {model}. Only Anthropic Claude and Amazon Nova model families are supported.")
+
+    def can_generate_metrics(self) -> bool:
+        return True
+
+    def create_context_aggregator(
+        self,
+        context: BedrockLLMContext,
+        *,
+        user_kwargs: Mapping[str, Any] = {},
+        assistant_kwargs: Mapping[str, Any] = {},
+    ) -> BedrockContextAggregatorPair:
+        """Create an instance of BedrockContextAggregatorPair from an
+        OpenAILLMContext. Constructor keyword arguments for both the user and
+        assistant aggregators can be provided.
+
+        Args:
+            context (OpenAILLMContext): The LLM context.
+            user_kwargs (Mapping[str, Any], optional): Additional keyword
+                arguments for the user context aggregator constructor. Defaults
+                to an empty mapping.
+            assistant_kwargs (Mapping[str, Any], optional): Additional keyword
+                arguments for the assistant context aggregator
+                constructor. Defaults to an empty mapping.
+
+        Returns:
+            BedrockContextAggregatorPair: A pair of context aggregators, one
+            for the user and one for the assistant, encapsulated in an
+            BedrockContextAggregatorPair.
+        """
+        context.set_llm_adapter(self.get_llm_adapter())
+
+        if isinstance(context, OpenAILLMContext) and not isinstance(context, BedrockLLMContext):
+            context = BedrockLLMContext.from_openai_context(context)
+                
+        user = BedrockUserContextAggregator(context, **user_kwargs)
+        assistant = BedrockAssistantContextAggregator(context, **assistant_kwargs)
+        return BedrockContextAggregatorPair(_user=user, _assistant=assistant)
+
+    async def _process_context(self, context: "BedrockLLMContext"):
+        """Stream one Bedrock Converse completion for ``context``.
+
+        Pushes text deltas as LLMTextFrames, dispatches a tool call when the
+        model stops with ``tool_use``, and reports token usage at the end.
+        """
+        # Usage tracking
+        prompt_tokens = 0
+        completion_tokens = 0
+        completion_tokens_estimate = 0
+        use_completion_tokens_estimate = False
+
+        try:
+            await self.push_frame(LLMFullResponseStartFrame())
+            await self.start_processing_metrics()
+            await self.start_ttfb_metrics()
+
+            # Set up inference config
+            inference_config = {
+                "maxTokens": self._settings["max_tokens"],
+                "temperature": self._settings["temperature"],
+                "topP": self._settings["top_p"],
+            }
+
+            # Prepare request parameters
+            request_params = {
+                "modelId": self.model_name,
+                "messages": context.messages,
+                "inferenceConfig": inference_config,
+                "additionalModelRequestFields": self._settings["additional_model_request_fields"]
+            }
+
+            # context.system defaults to None; only send a system block when a prompt is set
+            if context.system:
+                request_params["system"] = [{"text": context.system}]
+            # Add tools if present
+            if context.tools:
+                tool_config = {
+                    "tools": context.tools
+                }
+
+                # Add tool_choice if specified
+                if context.tool_choice:
+                    if context.tool_choice == "auto":
+                        tool_config["toolChoice"] = {"auto": {}}
+                    elif context.tool_choice == "none":
+                        # Skip adding toolChoice for "none"
+                        pass
+                    elif isinstance(context.tool_choice, dict) and "function" in context.tool_choice:
+                        tool_config["toolChoice"] = {
+                            "tool": {
+                                "name": context.tool_choice["function"]["name"]
+                            }
+                        }
+
+                request_params["toolConfig"] = tool_config
+
+            # Add performance config if latency is specified
+            if self._settings["latency"] in ["standard", "optimized"]:
+                request_params["performanceConfig"] = {
+                    "latency": self._settings["latency"]
+                }
+
+            logger.debug(f"Calling Bedrock model with: {request_params}")
+
+            # Call Bedrock with streaming
+            response = self._client.converse_stream(**request_params)
+
+            await self.stop_ttfb_metrics()
+
+            # Process the streaming response
+            tool_use_block = None
+            json_accumulator = ""
+
+            for event in response["stream"]:
+                # Handle text content
+                if "contentBlockDelta" in event:
+                    delta = event["contentBlockDelta"]["delta"]
+                    if "text" in delta:
+                        await self.push_frame(LLMTextFrame(delta["text"]))
+                        completion_tokens_estimate += self._estimate_tokens(delta["text"])
+                    elif "toolUse" in delta and "input" in delta["toolUse"]:
+                        # Handle partial JSON for tool use
+                        json_accumulator += delta["toolUse"]["input"]
+                        completion_tokens_estimate += self._estimate_tokens(delta["toolUse"]["input"])
+
+                # Handle tool use start
+                elif "contentBlockStart" in event:
+                    content_block_start = event["contentBlockStart"]['start']
+                    if "toolUse" in content_block_start:
+                        tool_use_block = {
+                            "id": content_block_start["toolUse"].get("toolUseId", ""),
+                            "name": content_block_start["toolUse"].get("name", "")
+                        }
+                        json_accumulator = ""
+
+                # Handle message completion with tool use
+                elif "messageStop" in event and "stopReason" in event["messageStop"]:
+                    if event["messageStop"]["stopReason"] == "tool_use" and tool_use_block:
+                        try:
+                            arguments = json.loads(json_accumulator) if json_accumulator else {}
+                            await self.call_function(
+                                context=context,
+                                tool_call_id=tool_use_block["id"],
+                                function_name=tool_use_block["name"],
+                                arguments=arguments,
+                            )
+                        except json.JSONDecodeError:
+                            logger.error(f"Failed to parse tool arguments: {json_accumulator}")
+
+                # Handle usage metrics if available
+                if "metadata" in event and "usage" in event["metadata"]:
+                    usage = event["metadata"]["usage"]
+                    prompt_tokens += usage.get("inputTokens", 0)
+                    completion_tokens += usage.get("outputTokens", 0)
+
+        except asyncio.CancelledError:
+            # If we're interrupted, we won't get a complete usage report. So set our flag to use the
+            # token estimate. Then re-raise the exception so all the processors running in this task
+            # also get cancelled.
+            use_completion_tokens_estimate = True
+            raise
+        except httpx.TimeoutException:
+            await self._call_event_handler("on_completion_timeout")
+        except Exception as e:
+            logger.exception(f"{self} exception: {e}")
+        finally:
+            await self.stop_processing_metrics()
+            await self.push_frame(LLMFullResponseEndFrame())
+            comp_tokens = (
+                completion_tokens
+                if not use_completion_tokens_estimate
+                else completion_tokens_estimate
+            )
+            await self._report_usage_metrics(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=comp_tokens,
+            )
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        await super().process_frame(frame, direction)
+
+        context = None
+        if isinstance(frame, OpenAILLMContextFrame):
+            context = BedrockLLMContext.upgrade_to_bedrock(frame.context)
+        elif isinstance(frame, LLMMessagesFrame):
+            context = BedrockLLMContext.from_messages(frame.messages)
+        elif isinstance(frame, VisionImageRawFrame):
+            # This is only useful in very simple pipelines because it creates
+            # a new context. Generally we want a context manager to catch
+            # UserImageRawFrames coming through the pipeline and add them
+            # to the context.
+            context = BedrockLLMContext.from_image_frame(frame)
+        elif isinstance(frame, LLMUpdateSettingsFrame):
+            await self._update_settings(frame.settings)
+        else:
+            await self.push_frame(frame, direction)
+
+        if context:
+            await self._process_context(context)
+
+    def _estimate_tokens(self, text: str) -> int:
+        return int(len(re.split(r"[^\w]+", text)) * 1.3)
+
+    async def _report_usage_metrics(
+        self,
+        prompt_tokens: int,
+        completion_tokens: int,
+    ):
+        if prompt_tokens or completion_tokens:
+            tokens = LLMTokenUsage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=prompt_tokens + completion_tokens,
+            )
+            await self.start_llm_usage_metrics(tokens)

From f014f718eb2fce7c5ebbd67f88c777e2d553f3aa Mon Sep 17 00:00:00 2001
From: Adithya Suresh <adxthya@amazon.com>
Date: Fri, 4 Apr 2025 05:39:08 +0000
Subject: [PATCH 19/97] Restructured STT and enabled prosody tags for
 generative Polly

---
 ...ible-polly.py => 07m-interruptible-aws.py} |  43 +-
 src/pipecat/services/aws/stt.py               | 600 +++++++++++++++++
 src/pipecat/services/aws/tts.py               | 612 +-----------------
 3 files changed, 638 insertions(+), 617 deletions(-)
 rename examples/foundational/{07m-interruptible-polly.py => 07m-interruptible-aws.py} (70%)
 create mode 100644 src/pipecat/services/aws/stt.py

diff --git a/examples/foundational/07m-interruptible-polly.py b/examples/foundational/07m-interruptible-aws.py
similarity index 70%
rename from examples/foundational/07m-interruptible-polly.py
rename to examples/foundational/07m-interruptible-aws.py
index b3bd08061..d1fae6b5e 100644
--- a/examples/foundational/07m-interruptible-polly.py
+++ b/examples/foundational/07m-interruptible-aws.py
@@ -5,7 +5,6 @@
 #
 
 import argparse
-import os
 
 from dotenv import load_dotenv
 from loguru import logger
@@ -14,13 +13,13 @@ from pipecat.audio.vad.silero import SileroVADAnalyzer
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
-from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
-from pipecat.services.aws import PollyTTSService, TranscribeSTTService
-from pipecat.services.openai import OpenAILLMService
 from pipecat.transcriptions.language import Language
 from pipecat.transports.base_transport import TransportParams
 from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
 from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
+from pipecat.services.aws.llm import BedrockLLMService, BedrockLLMContext
+from pipecat.services.aws.stt import TranscribeSTTService
+from pipecat.services.aws.tts import PollyTTSService
 
 load_dotenv(override=True)
 
@@ -43,20 +42,30 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac
         region="us-west-2",  # only specific regions support generative TTS
         voice_id="Joanna",
         params=PollyTTSService.InputParams(
-            engine="generative", language=Language.EN_GB, rate="1.05"
+            engine="generative",
+            language=Language.EN_US,
+            rate="1.1"
         ),
     )
 
-    llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
+    llm = BedrockLLMService(
+        aws_region="us-west-2",
+        model="us.anthropic.claude-3-5-haiku-20241022-v1:0",
+        params=BedrockLLMService.InputParams(
+            temperature=0.8,
+            latency="optimized"
+        )
+    )
 
     messages = [
-        {
-            "role": "system",
-            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
-        },
-    ]
+        # System prompt, written as a list of Bedrock-style {"text": ...} content blocks.
+        {
+            "role": "system",
+            "content": [{"text": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way."}],
+        },
+    ]
 
-    context = OpenAILLMContext(messages)
+    context = BedrockLLMContext(messages)
     context_aggregator = llm.create_context_aggregator(context)
 
     pipeline = Pipeline(
@@ -68,8 +77,8 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac
             tts,  # TTS
             transport.output(),  # Transport bot output
             context_aggregator.assistant(),  # Assistant spoken responses
-        ]
-    )
+         ]
+     )
 
     task = PipelineTask(
         pipeline,
@@ -85,16 +94,12 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac
     async def on_client_connected(transport, client):
         logger.info(f"Client connected")
         # Kick off the conversation.
-        messages.append({"role": "system", "content": "Please introduce yourself to the user."})
+        messages.append({"role": "user", "content": [{"text": "Please introduce yourself to the user."}]})
         await task.queue_frames([context_aggregator.user().get_context_frame()])
 
     @transport.event_handler("on_client_disconnected")
     async def on_client_disconnected(transport, client):
         logger.info(f"Client disconnected")
-
-    @transport.event_handler("on_client_closed")
-    async def on_client_closed(transport, client):
-        logger.info(f"Client closed connection")
         await task.cancel()
 
     runner = PipelineRunner(handle_sigint=False)
diff --git a/src/pipecat/services/aws/stt.py b/src/pipecat/services/aws/stt.py
new file mode 100644
index 000000000..08d74d484
--- /dev/null
+++ b/src/pipecat/services/aws/stt.py
@@ -0,0 +1,600 @@
+import asyncio
+from typing import AsyncGenerator, Optional, Dict
+import os
+import datetime
+from urllib.parse import urlencode
+import json
+import struct
+import urllib.parse
+import hashlib
+import hmac
+import random
+import string
+import binascii
+
+from loguru import logger
+
+from pipecat.frames.frames import (
+    ErrorFrame,
+    Frame,
+    TranscriptionFrame,
+    InterimTranscriptionFrame,
+    StartFrame
+)
+from pipecat.services.ai_services import STTService
+from pipecat.transcriptions.language import Language
+from pipecat.utils.time import time_now_iso8601
+
+try:
+    import boto3
+    from botocore.exceptions import BotoCoreError, ClientError
+    import websockets
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error(
+        "In order to use AWS services, you need to `pip install pipecat-ai[aws]`. Also, remember to set `AWS_SECRET_ACCESS_KEY`, `AWS_ACCESS_KEY_ID`, and `AWS_REGION` environment variable."
+    )
+    raise Exception(f"Missing module: {e}")
+
+
+def get_presigned_url(
+    *,
+    region: str,
+    credentials: Dict[str, Optional[str]],
+    language_code: str,
+    media_encoding: str = "pcm",
+    sample_rate: int = 16000,
+    number_of_channels: int = 1,
+    enable_partial_results_stabilization: bool = True,
+    partial_results_stability: str = "high",
+    vocabulary_name: Optional[str] = None,
+    vocabulary_filter_name: Optional[str] = None,
+    show_speaker_label: bool = False,
+    enable_channel_identification: bool = False,
+) -> str:
+    """Create a presigned URL for AWS Transcribe streaming."""
+    access_key = credentials.get("access_key")
+    secret_key = credentials.get("secret_key")
+    session_token = credentials.get("session_token")
+
+    if not access_key or not secret_key:
+        raise ValueError("AWS credentials are required")
+
+    # Initialize the URL generator
+    url_generator = AWSTranscribePresignedURL(
+        access_key=access_key, secret_key=secret_key, session_token=session_token, region=region
+    )
+
+    # Get the presigned URL
+    return url_generator.get_request_url(
+        sample_rate=sample_rate,
+        language_code=language_code,
+        media_encoding=media_encoding,
+        vocabulary_name=vocabulary_name,
+        vocabulary_filter_name=vocabulary_filter_name,
+        show_speaker_label=show_speaker_label,
+        enable_channel_identification=enable_channel_identification,
+        number_of_channels=number_of_channels,
+        enable_partial_results_stabilization=enable_partial_results_stabilization,
+        partial_results_stability=partial_results_stability,
+    )
+
+
+class AWSTranscribePresignedURL:
+    """Builds SigV4-presigned WebSocket URLs for Amazon Transcribe streaming."""
+
+    def __init__(
+        self, access_key: str, secret_key: str, session_token: Optional[str], region: str = "us-east-1"
+    ):
+        self.access_key = access_key
+        self.secret_key = secret_key
+        self.session_token = session_token
+        self.method = "GET"
+        self.service = "transcribe"
+        self.region = region
+        self.endpoint = ""
+        self.host = ""
+        self.amz_date = ""
+        self.datestamp = ""
+        self.canonical_uri = "/stream-transcription-websocket"
+        self.canonical_headers = ""
+        self.signed_headers = "host"
+        self.algorithm = "AWS4-HMAC-SHA256"
+        self.credential_scope = ""
+        self.canonical_querystring = ""
+        self.payload_hash = ""
+        self.canonical_request = ""
+        self.string_to_sign = ""
+        self.signature = ""
+        self.request_url = ""
+
+    def get_request_url(
+        self,
+        sample_rate: int,
+        language_code: str = "",
+        media_encoding: str = "pcm",
+        vocabulary_name: str = "",
+        vocabulary_filter_name: str = "",
+        show_speaker_label: bool = False,
+        enable_channel_identification: bool = False,
+        number_of_channels: int = 1,
+        enable_partial_results_stabilization: bool = False,
+        partial_results_stability: str = "",
+    ) -> str:
+        """Return a presigned wss:// URL that expires after 300 seconds (X-Amz-Expires)."""
+        self.endpoint = f"wss://transcribestreaming.{self.region}.amazonaws.com:8443"
+        self.host = f"transcribestreaming.{self.region}.amazonaws.com:8443"
+        # Timezone-aware UTC "now"; datetime.utcnow() is deprecated since Python 3.12.
+        now = datetime.datetime.now(datetime.timezone.utc)
+        self.amz_date = now.strftime("%Y%m%dT%H%M%SZ")
+        self.datestamp = now.strftime("%Y%m%d")
+        self.canonical_headers = f"host:{self.host}\n"
+        self.credential_scope = f"{self.datestamp}%2F{self.region}%2F{self.service}%2Faws4_request"
+
+        # Create canonical querystring
+        self.canonical_querystring = "X-Amz-Algorithm=" + self.algorithm
+        self.canonical_querystring += (
+            "&X-Amz-Credential=" + self.access_key + "%2F" + self.credential_scope
+        )
+        self.canonical_querystring += "&X-Amz-Date=" + self.amz_date
+        self.canonical_querystring += "&X-Amz-Expires=300"
+        if self.session_token:
+            self.canonical_querystring += "&X-Amz-Security-Token=" + urllib.parse.quote(
+                self.session_token, safe=""
+            )
+        self.canonical_querystring += "&X-Amz-SignedHeaders=" + self.signed_headers
+
+        if enable_channel_identification:
+            self.canonical_querystring += "&enable-channel-identification=true"
+        if enable_partial_results_stabilization:
+            self.canonical_querystring += "&enable-partial-results-stabilization=true"
+        if language_code:
+            self.canonical_querystring += "&language-code=" + language_code
+        if media_encoding:
+            self.canonical_querystring += "&media-encoding=" + media_encoding
+        if number_of_channels > 1:
+            self.canonical_querystring += "&number-of-channels=" + str(number_of_channels)
+        if partial_results_stability:
+            self.canonical_querystring += "&partial-results-stability=" + partial_results_stability
+        if sample_rate:
+            self.canonical_querystring += "&sample-rate=" + str(sample_rate)
+        if show_speaker_label:
+            self.canonical_querystring += "&show-speaker-label=true"
+        if vocabulary_filter_name:
+            self.canonical_querystring += "&vocabulary-filter-name=" + vocabulary_filter_name
+        if vocabulary_name:
+            self.canonical_querystring += "&vocabulary-name=" + vocabulary_name
+
+        # Create payload hash
+        self.payload_hash = hashlib.sha256("".encode("utf-8")).hexdigest()
+        # Create canonical request
+        self.canonical_request = f"{self.method}\n{self.canonical_uri}\n{self.canonical_querystring}\n{self.canonical_headers}\n{self.signed_headers}\n{self.payload_hash}"
+        # Create string to sign
+        credential_scope = f"{self.datestamp}/{self.region}/{self.service}/aws4_request"
+        string_to_sign = (
+            f"{self.algorithm}\n{self.amz_date}\n{credential_scope}\n"
+            + hashlib.sha256(self.canonical_request.encode("utf-8")).hexdigest()
+        )
+
+        # Calculate signature
+        k_date = hmac.new(
+            f"AWS4{self.secret_key}".encode("utf-8"), self.datestamp.encode("utf-8"), hashlib.sha256
+        ).digest()
+        k_region = hmac.new(k_date, self.region.encode("utf-8"), hashlib.sha256).digest()
+        k_service = hmac.new(k_region, self.service.encode("utf-8"), hashlib.sha256).digest()
+        k_signing = hmac.new(k_service, b"aws4_request", hashlib.sha256).digest()
+        self.signature = hmac.new(
+            k_signing, string_to_sign.encode("utf-8"), hashlib.sha256
+        ).hexdigest()
+
+        # Add signature to query string
+        self.canonical_querystring += "&X-Amz-Signature=" + self.signature
+        # Create request URL
+        self.request_url = self.endpoint + self.canonical_uri + "?" + self.canonical_querystring
+        return self.request_url
+
+
+def get_headers(header_name: str, header_value: str) -> bytearray:
+    """Build a header following AWS event stream format."""
+    name = header_name.encode("utf-8")
+    name_byte_length = bytes([len(name)])
+    value_type = bytes([7])  # 7 represents a string
+    value = header_value.encode("utf-8")
+    value_byte_length = struct.pack(">H", len(value))
+
+    # Construct the header
+    header_list = bytearray()
+    header_list.extend(name_byte_length)
+    header_list.extend(name)
+    header_list.extend(value_type)
+    header_list.extend(value_byte_length)
+    header_list.extend(value)
+    return header_list
+
+
+def build_event_message(payload: bytes) -> bytes:
+    """
+    Build an event message for AWS Transcribe streaming.
+    Matches AWS sample: https://github.com/aws-samples/amazon-transcribe-streaming-python-websockets/blob/main/eventstream.py
+    """
+    # Build headers
+    content_type_header = get_headers(":content-type", "application/octet-stream")
+    event_type_header = get_headers(":event-type", "AudioEvent")
+    message_type_header = get_headers(":message-type", "event")
+
+    headers = bytearray()
+    headers.extend(content_type_header)
+    headers.extend(event_type_header)
+    headers.extend(message_type_header)
+
+    # Calculate total byte length and headers byte length
+    # 16 accounts for 8 byte prelude, 2x 4 byte CRCs
+    total_byte_length = struct.pack(">I", len(headers) + len(payload) + 16)
+    headers_byte_length = struct.pack(">I", len(headers))
+
+    # Build the prelude
+    prelude = bytearray([0] * 8)
+    prelude[:4] = total_byte_length
+    prelude[4:] = headers_byte_length
+
+    # Calculate checksum for prelude
+    prelude_crc = struct.pack(">I", binascii.crc32(prelude) & 0xFFFFFFFF)
+
+    # Construct the message
+    message_as_list = bytearray()
+    message_as_list.extend(prelude)
+    message_as_list.extend(prelude_crc)
+    message_as_list.extend(headers)
+    message_as_list.extend(payload)
+
+    # Calculate checksum for message
+    message = bytes(message_as_list)
+    message_crc = struct.pack(">I", binascii.crc32(message) & 0xFFFFFFFF)
+
+    # Add message checksum
+    message_as_list.extend(message_crc)
+
+    return bytes(message_as_list)
+
+
+def decode_event(message):
+    """Decode one AWS event-stream message into (headers dict, JSON payload).
+
+    Raises AssertionError when either CRC does not match; the checks are
+    explicit raises so they still run under ``python -O``.
+    """
+    prelude = message[:8]
+    _total_length, headers_length = struct.unpack(">II", prelude)
+    headers = message[12 : 12 + headers_length]
+    payload = message[12 + headers_length : -4]
+    if struct.unpack(">I", message[8:12])[0] != binascii.crc32(prelude) & 0xFFFFFFFF:
+        raise AssertionError("Prelude CRC check failed")
+    if struct.unpack(">I", message[-4:])[0] != binascii.crc32(message[:-4]) & 0xFFFFFFFF:
+        raise AssertionError("Message CRC check failed")
+    # Each header: 1-byte name length, name, 1-byte value type, 2-byte value length, value.
+    headers_dict = {}
+    while headers:
+        name_len = headers[0]
+        name = headers[1 : 1 + name_len].decode("utf-8")
+        value_len = struct.unpack(">H", headers[2 + name_len : 4 + name_len])[0]
+        value = headers[4 + name_len : 4 + name_len + value_len].decode("utf-8")
+        headers_dict[name] = value
+        headers = headers[4 + name_len + value_len :]
+
+    return headers_dict, json.loads(payload)
+
+
+class TranscribeSTTService(STTService):
+    def __init__(
+        self,
+        *,
+        api_key: Optional[str] = None,
+        aws_access_key_id: Optional[str] = None,
+        aws_session_token: Optional[str] = None,
+        region: Optional[str] = "us-east-1",
+        sample_rate: int = 16000,
+        language: Language = Language.EN,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self._settings = {
+            "sample_rate": sample_rate,
+            "language": language,
+            "media_encoding": "linear16",  # AWS expects raw PCM
+            "number_of_channels": 1,
+            "show_speaker_label": False,
+            "enable_channel_identification": False,
+        }
+
+        # Validate sample rate - AWS Transcribe only supports 8000 Hz or 16000 Hz
+        if sample_rate not in [8000, 16000]:
+            logger.warning(
+                f"AWS Transcribe only supports 8000 Hz or 16000 Hz sample rates. Converting from {sample_rate} Hz to 16000 Hz."
+            )
+            self._settings["sample_rate"] = 16000
+
+        self._credentials = {
+            "aws_access_key_id": aws_access_key_id or os.getenv("AWS_ACCESS_KEY_ID"),
+            "aws_secret_access_key": api_key or os.getenv("AWS_SECRET_ACCESS_KEY"),
+            "aws_session_token": aws_session_token or os.getenv("AWS_SESSION_TOKEN"),
+            "region": region or os.getenv("AWS_REGION", "us-east-1"),
+        }
+
+        self._ws_client = None
+        self._connection_lock = asyncio.Lock()
+        self._connecting = False
+        self._receive_task = None
+
+    def get_service_encoding(self, encoding: str) -> str:
+        """Convert internal encoding format to AWS Transcribe format."""
+        encoding_map = {
+            "linear16": "pcm",  # AWS expects "pcm" for 16-bit linear PCM
+        }
+        return encoding_map.get(encoding, encoding)
+
+    async def start(self, frame: StartFrame):
+        """Initialize the connection when the service starts."""
+        await super().start(frame)
+        logger.info("Starting AWS Transcribe service...")
+        retry_count = 0
+        max_retries = 3
+
+        while retry_count < max_retries:
+            try:
+                await self._connect()
+                if self._ws_client and self._ws_client.open:
+                    logger.info("Successfully established WebSocket connection")
+                    return
+                logger.warning("WebSocket connection not established after connect")
+            except Exception as e:
+                logger.error(f"Failed to connect (attempt {retry_count + 1}/{max_retries}): {e}")
+                retry_count += 1
+                if retry_count < max_retries:
+                    await asyncio.sleep(1)  # Wait before retrying
+
+        raise RuntimeError("Failed to establish WebSocket connection after multiple attempts")
+
+    async def run_stt(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        """Process audio data and send to AWS Transcribe"""
+        try:
+            # Skip if no speech detected
+            if hasattr(frame, "is_speech") and not frame.is_speech:
+                logger.debug("Skipping non-speech frame")
+                return
+
+            # Ensure WebSocket is connected
+            if not self._ws_client or not self._ws_client.open:
+                logger.info("WebSocket not connected, attempting to reconnect...")
+                try:
+                    await self._connect()
+                except Exception as e:
+                    logger.error(f"Failed to reconnect: {e}")
+                    yield ErrorFrame("Failed to reconnect to AWS Transcribe", fatal=False)
+                    return
+
+            # Get the audio data - if frame is bytes, use directly, otherwise get audio attribute
+            audio_data = frame if isinstance(frame, bytes) else frame.audio
+
+            # Format the audio data according to AWS event stream format
+            event_message = build_event_message(audio_data)
+            # logger.debug(f"Sending audio chunk of size {len(audio_data)} bytes")
+
+            # Send the formatted event message
+            try:
+                await self._ws_client.send(event_message)
+                # Start metrics after first chunk sent
+                await self.start_processing_metrics()
+                await self.start_ttfb_metrics()
+            except websockets.exceptions.ConnectionClosed as e:
+                logger.warning(f"Connection closed while sending: {e}")
+                await self._disconnect()
+                # Don't yield error here - we'll retry on next frame
+            except Exception as e:
+                logger.error(f"Error sending audio: {e}")
+                yield ErrorFrame(f"AWS Transcribe error: {str(e)}", fatal=False)
+                await self._disconnect()
+
+        except Exception as e:
+            logger.error(f"Error in run_stt: {e}")
+            yield ErrorFrame(f"AWS Transcribe error: {str(e)}", fatal=False)
+            await self._disconnect()
+
+    async def _connect(self):
+        """Connect to AWS Transcribe with connection state management."""
+        if (
+            self._ws_client
+            and self._ws_client.open
+            and self._receive_task
+            and not self._receive_task.done()
+        ):
+            logger.debug("Already connected")
+            return
+
+        async with self._connection_lock:
+            if self._connecting:
+                logger.debug("Connection already in progress")
+                return
+
+            try:
+                self._connecting = True
+                logger.debug("Starting connection process...")
+
+                if self._ws_client:
+                    await self._disconnect()
+
+                language_code = self.language_to_service_language(
+                    Language(self._settings["language"])
+                )
+                if not language_code:
+                    raise ValueError(f"Unsupported language: {self._settings['language']}")
+
+                # Generate random websocket key
+                websocket_key = "".join(
+                    random.choices(
+                        string.ascii_uppercase + string.ascii_lowercase + string.digits, k=20
+                    )
+                )
+
+                # Add required headers
+                extra_headers = {
+                    "Origin": "https://localhost",
+                    "Sec-WebSocket-Key": websocket_key,
+                    "Sec-WebSocket-Version": "13",
+                    "Connection": "keep-alive",
+                }
+
+                # Get presigned URL
+                presigned_url = get_presigned_url(
+                    region=self._credentials["region"],
+                    credentials={
+                        "access_key": self._credentials["aws_access_key_id"],
+                        "secret_key": self._credentials["aws_secret_access_key"],
+                        "session_token": self._credentials["aws_session_token"],
+                    },
+                    language_code=language_code,
+                    media_encoding=self.get_service_encoding(
+                        self._settings["media_encoding"]
+                    ),  # Convert to AWS format
+                    sample_rate=self._settings["sample_rate"],
+                    number_of_channels=self._settings["number_of_channels"],
+                    enable_partial_results_stabilization=True,
+                    partial_results_stability="high",
+                    show_speaker_label=self._settings["show_speaker_label"],
+                    enable_channel_identification=self._settings["enable_channel_identification"],
+                )
+
+                logger.debug(f"Connecting to WebSocket with URL: {presigned_url[:100]}...")
+
+                # Connect with the required headers and settings
+                self._ws_client = await websockets.connect(
+                    presigned_url,
+                    extra_headers=extra_headers,
+                    subprotocols=["mqtt"],
+                    ping_interval=None,
+                    ping_timeout=None,
+                    compression=None,
+                )
+                logger.debug("WebSocket connected, starting receive task...")
+
+                # Start receive task
+                self._receive_task = asyncio.create_task(self._receive_loop())
+
+                logger.info("Successfully connected to AWS Transcribe")
+
+            except Exception as e:
+                logger.error(f"Failed to connect to AWS Transcribe: {e}")
+                await self._disconnect()
+                raise
+
+            finally:
+                self._connecting = False
+
+    async def _disconnect(self):
+        """Disconnect from AWS Transcribe."""
+        if self._receive_task:
+            self._receive_task.cancel()
+            try:
+                await self._receive_task
+            except asyncio.CancelledError:
+                pass
+            self._receive_task = None
+
+        if self._ws_client:
+            try:
+                if self._ws_client.open:
+                    # Send end-stream message
+                    end_stream = {"message-type": "event", "event": "end"}
+                    await self._ws_client.send(json.dumps(end_stream))
+                await self._ws_client.close()
+            except Exception as e:
+                logger.warning(f"Error closing WebSocket connection: {e}")
+            finally:
+                self._ws_client = None
+
+    def language_to_service_language(self, language: Language) -> str | None:
+        """Convert internal language enum to AWS Transcribe language code."""
+        language_map = {
+            Language.EN: "en-US",
+            Language.ES: "es-US",
+            Language.FR: "fr-FR",
+            Language.DE: "de-DE",
+            Language.IT: "it-IT",
+            Language.PT: "pt-BR",
+            Language.JA: "ja-JP",
+            Language.KO: "ko-KR",
+            Language.ZH: "zh-CN",
+        }
+        return language_map.get(language)
+
+    async def _receive_loop(self):
+        """Background task to receive and process messages from AWS Transcribe."""
+        try:
+            logger.debug("Receive loop started")
+            while True:
+                if not self._ws_client or not self._ws_client.open:
+                    logger.warning("WebSocket closed in receive loop")
+                    break
+
+                try:
+                    response = await self._ws_client.recv()
+                    headers, payload = decode_event(response)
+
+                    # logger.debug(f"Received message type: {headers.get(':message-type')}")
+
+                    if headers.get(":message-type") == "event":
+                        # Process transcription results
+                        results = payload.get("Transcript", {}).get("Results", [])
+                        if results:
+                            result = results[0]
+                            alternatives = result.get("Alternatives", [])
+                            if alternatives:
+                                transcript = alternatives[0].get("Transcript", "")
+                                is_final = not result.get("IsPartial", True)
+
+                                if transcript:
+                                    await self.stop_ttfb_metrics()
+                                    if is_final:
+                                        await self.push_frame(
+                                            TranscriptionFrame(
+                                                transcript,
+                                                "",
+                                                time_now_iso8601(),
+                                                self._settings["language"],
+                                            )
+                                        )
+                                        await self.stop_processing_metrics()
+                                    else:
+                                        await self.push_frame(
+                                            InterimTranscriptionFrame(
+                                                transcript,
+                                                "",
+                                                time_now_iso8601(),
+                                                self._settings["language"],
+                                            )
+                                        )
+                    elif headers.get(":message-type") == "exception":
+                        error_msg = payload.get("Message", "Unknown error")
+                        logger.error(f"Exception from AWS: {error_msg}")
+                        await self.push_frame(
+                            ErrorFrame(f"AWS Transcribe error: {error_msg}", fatal=False)
+                        )
+                    else:
+                        logger.debug(f"Other message type received: {headers}")
+                        logger.debug(f"Payload: {payload}")
+
+                except websockets.exceptions.ConnectionClosed as e:
+                    logger.error(
+                        f"WebSocket connection closed in receive loop with code {e.code}: {e.reason}"
+                    )
+                    break
+                except Exception as e:
+                    logger.error(f"Error in receive loop: {e}")
+                    break
+
+        except asyncio.CancelledError:
+            logger.debug("Receive loop cancelled")
+        except Exception as e:
+            logger.error(f"Unexpected error in receive loop: {e}")
+        finally:
+            logger.debug("Receive loop ended")
\ No newline at end of file
diff --git a/src/pipecat/services/aws/tts.py b/src/pipecat/services/aws/tts.py
index e90ea9220..ed1230dd7 100644
--- a/src/pipecat/services/aws/tts.py
+++ b/src/pipecat/services/aws/tts.py
@@ -5,21 +5,8 @@
 #
 
 import asyncio
-from typing import AsyncGenerator, Optional, Dict
+from typing import AsyncGenerator, Optional
 import os
-import datetime
-import time
-from urllib.parse import urlencode
-import json
-import struct
-from io import BytesIO
-import urllib.parse
-import hashlib
-import hmac
-import random
-import string
-import binascii
-import numpy as np
 
 from loguru import logger
 from pydantic import BaseModel
@@ -30,28 +17,18 @@ from pipecat.frames.frames import (
     Frame,
     TTSAudioRawFrame,
     TTSStartedFrame,
-    TTSStoppedFrame,
-    TranscriptionFrame,
-    InterimTranscriptionFrame,
-    StartFrame,
-    EndFrame,
-    CancelFrame,
+    TTSStoppedFrame
 )
-from pipecat.services.ai_services import TTSService, STTService
+from pipecat.services.ai_services import TTSService
 from pipecat.transcriptions.language import Language
-from pipecat.utils.time import time_now_iso8601
 
 try:
     import boto3
     from botocore.exceptions import BotoCoreError, ClientError
-    import websockets
-    from botocore.auth import SigV4Auth
-    from botocore.awsrequest import AWSRequest
-    from botocore.credentials import Credentials
 except ModuleNotFoundError as e:
     logger.error(f"Exception: {e}")
     logger.error(
-        "In order to use AWS services, you need to `pip install pipecat-ai[aws]`. Also, set `AWS_SECRET_ACCESS_KEY`, `AWS_ACCESS_KEY_ID`, and `AWS_REGION` environment variable."
+        "In order to use AWS services, you need to `pip install pipecat-ai[aws]`. Also, remember to set `AWS_SECRET_ACCESS_KEY`, `AWS_ACCESS_KEY_ID`, and `AWS_REGION` environment variable."
     )
     raise Exception(f"Missing module: {e}")
 
@@ -207,18 +184,18 @@ class PollyTTSService(TTSService):
 
         prosody_attrs = []
         # Prosody tags are only supported for standard and neural engines
-        if self._settings["engine"] != "generative":
-            if self._settings["rate"]:
-                prosody_attrs.append(f"rate='{self._settings['rate']}'")
+        if self._settings["engine"] == "standard":
             if self._settings["pitch"]:
                 prosody_attrs.append(f"pitch='{self._settings['pitch']}'")
-            if self._settings["volume"]:
-                prosody_attrs.append(f"volume='{self._settings['volume']}'")
+        
+        if self._settings["rate"]:
+            prosody_attrs.append(f"rate='{self._settings['rate']}'")
+        if self._settings["volume"]:
+            prosody_attrs.append(f"volume='{self._settings['volume']}'")
+        # logger.warning("Prosody tags are not supported for generative engine. Ignoring.")
 
-            if prosody_attrs:
+        if prosody_attrs:
                 ssml += f"<prosody {' '.join(prosody_attrs)}>"
-        else:
-            logger.warning("Prosody tags are not supported for generative engine. Ignoring.")
 
         ssml += text
 
@@ -229,6 +206,8 @@ class PollyTTSService(TTSService):
 
         ssml += "</speak>"
 
+        logger.debug(f"SSML: {ssml}")
+
         return ssml
 
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
@@ -303,566 +282,3 @@ class AWSTTSService(PollyTTSService):
             warnings.warn(
                 "'AWSTTSService' is deprecated, use 'PollyTTSService' instead.", DeprecationWarning
             )
-
-
-def get_presigned_url(
-    *,
-    region: str,
-    credentials: Dict[str, Optional[str]],
-    language_code: str,
-    media_encoding: str = "pcm",
-    sample_rate: int = 16000,
-    number_of_channels: int = 1,
-    enable_partial_results_stabilization: bool = True,
-    partial_results_stability: str = "high",
-    vocabulary_name: Optional[str] = None,
-    vocabulary_filter_name: Optional[str] = None,
-    show_speaker_label: bool = False,
-    enable_channel_identification: bool = False,
-) -> str:
-    """Create a presigned URL for AWS Transcribe streaming."""
-    access_key = credentials.get("access_key")
-    secret_key = credentials.get("secret_key")
-    session_token = credentials.get("session_token")
-
-    if not access_key or not secret_key:
-        raise ValueError("AWS credentials are required")
-
-    # Initialize the URL generator
-    url_generator = AWSTranscribePresignedURL(
-        access_key=access_key, secret_key=secret_key, session_token=session_token, region=region
-    )
-
-    # Get the presigned URL
-    return url_generator.get_request_url(
-        sample_rate=sample_rate,
-        language_code=language_code,
-        media_encoding=media_encoding,
-        vocabulary_name=vocabulary_name,
-        vocabulary_filter_name=vocabulary_filter_name,
-        show_speaker_label=show_speaker_label,
-        enable_channel_identification=enable_channel_identification,
-        number_of_channels=number_of_channels,
-        enable_partial_results_stabilization=enable_partial_results_stabilization,
-        partial_results_stability=partial_results_stability,
-    )
-
-
-class AWSTranscribePresignedURL:
-    def __init__(
-        self, access_key: str, secret_key: str, session_token: str, region: str = "us-east-1"
-    ):
-        self.access_key = access_key
-        self.secret_key = secret_key
-        self.session_token = session_token
-        self.method = "GET"
-        self.service = "transcribe"
-        self.region = region
-        self.endpoint = ""
-        self.host = ""
-        self.amz_date = ""
-        self.datestamp = ""
-        self.canonical_uri = "/stream-transcription-websocket"
-        self.canonical_headers = ""
-        self.signed_headers = "host"
-        self.algorithm = "AWS4-HMAC-SHA256"
-        self.credential_scope = ""
-        self.canonical_querystring = ""
-        self.payload_hash = ""
-        self.canonical_request = ""
-        self.string_to_sign = ""
-        self.signature = ""
-        self.request_url = ""
-
-    def get_request_url(
-        self,
-        sample_rate: int,
-        language_code: str = "",
-        media_encoding: str = "pcm",
-        vocabulary_name: str = "",
-        vocabulary_filter_name: str = "",
-        show_speaker_label: bool = False,
-        enable_channel_identification: bool = False,
-        number_of_channels: int = 1,
-        enable_partial_results_stabilization: bool = False,
-        partial_results_stability: str = "",
-    ) -> str:
-        self.endpoint = f"wss://transcribestreaming.{self.region}.amazonaws.com:8443"
-        self.host = f"transcribestreaming.{self.region}.amazonaws.com:8443"
-
-        now = datetime.datetime.utcnow()
-        self.amz_date = now.strftime("%Y%m%dT%H%M%SZ")
-        self.datestamp = now.strftime("%Y%m%d")
-        self.canonical_headers = f"host:{self.host}\n"
-        self.credential_scope = f"{self.datestamp}%2F{self.region}%2F{self.service}%2Faws4_request"
-
-        # Create canonical querystring
-        self.canonical_querystring = "X-Amz-Algorithm=" + self.algorithm
-        self.canonical_querystring += (
-            "&X-Amz-Credential=" + self.access_key + "%2F" + self.credential_scope
-        )
-        self.canonical_querystring += "&X-Amz-Date=" + self.amz_date
-        self.canonical_querystring += "&X-Amz-Expires=300"
-        if self.session_token:
-            self.canonical_querystring += "&X-Amz-Security-Token=" + urllib.parse.quote(
-                self.session_token, safe=""
-            )
-        self.canonical_querystring += "&X-Amz-SignedHeaders=" + self.signed_headers
-
-        if enable_channel_identification:
-            self.canonical_querystring += "&enable-channel-identification=true"
-        if enable_partial_results_stabilization:
-            self.canonical_querystring += "&enable-partial-results-stabilization=true"
-        if language_code:
-            self.canonical_querystring += "&language-code=" + language_code
-        if media_encoding:
-            self.canonical_querystring += "&media-encoding=" + media_encoding
-        if number_of_channels > 1:
-            self.canonical_querystring += "&number-of-channels=" + str(number_of_channels)
-        if partial_results_stability:
-            self.canonical_querystring += "&partial-results-stability=" + partial_results_stability
-        if sample_rate:
-            self.canonical_querystring += "&sample-rate=" + str(sample_rate)
-        if show_speaker_label:
-            self.canonical_querystring += "&show-speaker-label=true"
-        if vocabulary_filter_name:
-            self.canonical_querystring += "&vocabulary-filter-name=" + vocabulary_filter_name
-        if vocabulary_name:
-            self.canonical_querystring += "&vocabulary-name=" + vocabulary_name
-
-        # Create payload hash
-        self.payload_hash = hashlib.sha256("".encode("utf-8")).hexdigest()
-
-        # Create canonical request
-        self.canonical_request = f"{self.method}\n{self.canonical_uri}\n{self.canonical_querystring}\n{self.canonical_headers}\n{self.signed_headers}\n{self.payload_hash}"
-
-        # Create string to sign
-        credential_scope = f"{self.datestamp}/{self.region}/{self.service}/aws4_request"
-        string_to_sign = (
-            f"{self.algorithm}\n{self.amz_date}\n{credential_scope}\n"
-            + hashlib.sha256(self.canonical_request.encode("utf-8")).hexdigest()
-        )
-
-        # Calculate signature
-        k_date = hmac.new(
-            f"AWS4{self.secret_key}".encode("utf-8"), self.datestamp.encode("utf-8"), hashlib.sha256
-        ).digest()
-        k_region = hmac.new(k_date, self.region.encode("utf-8"), hashlib.sha256).digest()
-        k_service = hmac.new(k_region, self.service.encode("utf-8"), hashlib.sha256).digest()
-        k_signing = hmac.new(k_service, b"aws4_request", hashlib.sha256).digest()
-        self.signature = hmac.new(
-            k_signing, string_to_sign.encode("utf-8"), hashlib.sha256
-        ).hexdigest()
-
-        # Add signature to query string
-        self.canonical_querystring += "&X-Amz-Signature=" + self.signature
-
-        # Create request URL
-        self.request_url = self.endpoint + self.canonical_uri + "?" + self.canonical_querystring
-        return self.request_url
-
-
-def get_headers(header_name: str, header_value: str) -> bytearray:
-    """Build a header following AWS event stream format."""
-    name = header_name.encode("utf-8")
-    name_byte_length = bytes([len(name)])
-    value_type = bytes([7])  # 7 represents a string
-    value = header_value.encode("utf-8")
-    value_byte_length = struct.pack(">H", len(value))
-
-    # Construct the header
-    header_list = bytearray()
-    header_list.extend(name_byte_length)
-    header_list.extend(name)
-    header_list.extend(value_type)
-    header_list.extend(value_byte_length)
-    header_list.extend(value)
-    return header_list
-
-
-def build_event_message(payload: bytes) -> bytes:
-    """
-    Build an event message for AWS Transcribe streaming.
-    Matches AWS sample: https://github.com/aws-samples/amazon-transcribe-streaming-python-websockets/blob/main/eventstream.py
-    """
-    # Build headers
-    content_type_header = get_headers(":content-type", "application/octet-stream")
-    event_type_header = get_headers(":event-type", "AudioEvent")
-    message_type_header = get_headers(":message-type", "event")
-
-    headers = bytearray()
-    headers.extend(content_type_header)
-    headers.extend(event_type_header)
-    headers.extend(message_type_header)
-
-    # Calculate total byte length and headers byte length
-    # 16 accounts for 8 byte prelude, 2x 4 byte CRCs
-    total_byte_length = struct.pack(">I", len(headers) + len(payload) + 16)
-    headers_byte_length = struct.pack(">I", len(headers))
-
-    # Build the prelude
-    prelude = bytearray([0] * 8)
-    prelude[:4] = total_byte_length
-    prelude[4:] = headers_byte_length
-
-    # Calculate checksum for prelude
-    prelude_crc = struct.pack(">I", binascii.crc32(prelude) & 0xFFFFFFFF)
-
-    # Construct the message
-    message_as_list = bytearray()
-    message_as_list.extend(prelude)
-    message_as_list.extend(prelude_crc)
-    message_as_list.extend(headers)
-    message_as_list.extend(payload)
-
-    # Calculate checksum for message
-    message = bytes(message_as_list)
-    message_crc = struct.pack(">I", binascii.crc32(message) & 0xFFFFFFFF)
-
-    # Add message checksum
-    message_as_list.extend(message_crc)
-
-    return bytes(message_as_list)
-
-
-def decode_event(message):
-    # Extract the prelude, headers, payload and CRC
-    prelude = message[:8]
-    total_length, headers_length = struct.unpack(">II", prelude)
-    prelude_crc = struct.unpack(">I", message[8:12])[0]
-    headers = message[12 : 12 + headers_length]
-    payload = message[12 + headers_length : -4]
-    message_crc = struct.unpack(">I", message[-4:])[0]
-
-    # Check the CRCs
-    assert prelude_crc == binascii.crc32(prelude) & 0xFFFFFFFF, "Prelude CRC check failed"
-    assert message_crc == binascii.crc32(message[:-4]) & 0xFFFFFFFF, "Message CRC check failed"
-
-    # Parse the headers
-    headers_dict = {}
-    while headers:
-        name_len = headers[0]
-        name = headers[1 : 1 + name_len].decode("utf-8")
-        value_type = headers[1 + name_len]
-        value_len = struct.unpack(">H", headers[2 + name_len : 4 + name_len])[0]
-        value = headers[4 + name_len : 4 + name_len + value_len].decode("utf-8")
-        headers_dict[name] = value
-        headers = headers[4 + name_len + value_len :]
-
-    return headers_dict, json.loads(payload)
-
-
-class TranscribeSTTService(STTService):
-    def __init__(
-        self,
-        *,
-        api_key: Optional[str] = None,
-        aws_access_key_id: Optional[str] = None,
-        aws_session_token: Optional[str] = None,
-        region: Optional[str] = "us-east-1",
-        sample_rate: int = 16000,
-        language: Language = Language.EN,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-
-        self._settings = {
-            "sample_rate": sample_rate,
-            "language": language,
-            "media_encoding": "linear16",  # AWS expects raw PCM
-            "number_of_channels": 1,
-            "show_speaker_label": False,
-            "enable_channel_identification": False,
-        }
-
-        # Validate sample rate - AWS Transcribe only supports 8000 Hz or 16000 Hz
-        if sample_rate not in [8000, 16000]:
-            logger.warning(
-                f"AWS Transcribe only supports 8000 Hz or 16000 Hz sample rates. Converting from {sample_rate} Hz to 16000 Hz."
-            )
-            self._settings["sample_rate"] = 16000
-
-        self._credentials = {
-            "aws_access_key_id": aws_access_key_id or os.getenv("AWS_ACCESS_KEY_ID"),
-            "aws_secret_access_key": api_key or os.getenv("AWS_SECRET_ACCESS_KEY"),
-            "aws_session_token": aws_session_token or os.getenv("AWS_SESSION_TOKEN"),
-            "region": region or os.getenv("AWS_REGION", "us-east-1"),
-        }
-
-        self._ws_client = None
-        self._connection_lock = asyncio.Lock()
-        self._connecting = False
-        self._receive_task = None
-
-    def get_service_encoding(self, encoding: str) -> str:
-        """Convert internal encoding format to AWS Transcribe format."""
-        encoding_map = {
-            "linear16": "pcm",  # AWS expects "pcm" for 16-bit linear PCM
-        }
-        return encoding_map.get(encoding, encoding)
-
-    async def start(self, frame: StartFrame):
-        """Initialize the connection when the service starts."""
-        await super().start(frame)
-        logger.info("Starting AWS Transcribe service...")
-        retry_count = 0
-        max_retries = 3
-
-        while retry_count < max_retries:
-            try:
-                await self._connect()
-                if self._ws_client and self._ws_client.open:
-                    logger.info("Successfully established WebSocket connection")
-                    return
-                logger.warning("WebSocket connection not established after connect")
-            except Exception as e:
-                logger.error(f"Failed to connect (attempt {retry_count + 1}/{max_retries}): {e}")
-                retry_count += 1
-                if retry_count < max_retries:
-                    await asyncio.sleep(1)  # Wait before retrying
-
-        raise RuntimeError("Failed to establish WebSocket connection after multiple attempts")
-
-    async def run_stt(self, frame: Frame) -> AsyncGenerator[Frame, None]:
-        """Process audio data and send to AWS Transcribe"""
-        try:
-            # Skip if no speech detected
-            if hasattr(frame, "is_speech") and not frame.is_speech:
-                logger.debug("Skipping non-speech frame")
-                return
-
-            # Ensure WebSocket is connected
-            if not self._ws_client or not self._ws_client.open:
-                logger.info("WebSocket not connected, attempting to reconnect...")
-                try:
-                    await self._connect()
-                except Exception as e:
-                    logger.error(f"Failed to reconnect: {e}")
-                    yield ErrorFrame("Failed to reconnect to AWS Transcribe", fatal=False)
-                    return
-
-            # Get the audio data - if frame is bytes, use directly, otherwise get audio attribute
-            audio_data = frame if isinstance(frame, bytes) else frame.audio
-
-            # Format the audio data according to AWS event stream format
-            event_message = build_event_message(audio_data)
-            # logger.debug(f"Sending audio chunk of size {len(audio_data)} bytes")
-
-            # Send the formatted event message
-            try:
-                await self._ws_client.send(event_message)
-                # Start metrics after first chunk sent
-                await self.start_processing_metrics()
-                await self.start_ttfb_metrics()
-            except websockets.exceptions.ConnectionClosed as e:
-                logger.warning(f"Connection closed while sending: {e}")
-                await self._disconnect()
-                # Don't yield error here - we'll retry on next frame
-            except Exception as e:
-                logger.error(f"Error sending audio: {e}")
-                yield ErrorFrame(f"AWS Transcribe error: {str(e)}", fatal=False)
-                await self._disconnect()
-
-        except Exception as e:
-            logger.error(f"Error in run_stt: {e}")
-            yield ErrorFrame(f"AWS Transcribe error: {str(e)}", fatal=False)
-            await self._disconnect()
-
-    async def _connect(self):
-        """Connect to AWS Transcribe with connection state management."""
-        if (
-            self._ws_client
-            and self._ws_client.open
-            and self._receive_task
-            and not self._receive_task.done()
-        ):
-            logger.debug("Already connected")
-            return
-
-        async with self._connection_lock:
-            if self._connecting:
-                logger.debug("Connection already in progress")
-                return
-
-            try:
-                self._connecting = True
-                logger.debug("Starting connection process...")
-
-                if self._ws_client:
-                    await self._disconnect()
-
-                language_code = self.language_to_service_language(
-                    Language(self._settings["language"])
-                )
-                if not language_code:
-                    raise ValueError(f"Unsupported language: {self._settings['language']}")
-
-                # Generate random websocket key
-                websocket_key = "".join(
-                    random.choices(
-                        string.ascii_uppercase + string.ascii_lowercase + string.digits, k=20
-                    )
-                )
-
-                # Add required headers
-                extra_headers = {
-                    "Origin": "https://localhost",
-                    "Sec-WebSocket-Key": websocket_key,
-                    "Sec-WebSocket-Version": "13",
-                    "Connection": "keep-alive",
-                }
-
-                # Get presigned URL
-                presigned_url = get_presigned_url(
-                    region=self._credentials["region"],
-                    credentials={
-                        "access_key": self._credentials["aws_access_key_id"],
-                        "secret_key": self._credentials["aws_secret_access_key"],
-                        "session_token": self._credentials["aws_session_token"],
-                    },
-                    language_code=language_code,
-                    media_encoding=self.get_service_encoding(
-                        self._settings["media_encoding"]
-                    ),  # Convert to AWS format
-                    sample_rate=self._settings["sample_rate"],
-                    number_of_channels=self._settings["number_of_channels"],
-                    enable_partial_results_stabilization=True,
-                    partial_results_stability="high",
-                    show_speaker_label=self._settings["show_speaker_label"],
-                    enable_channel_identification=self._settings["enable_channel_identification"],
-                )
-
-                logger.debug(f"Connecting to WebSocket with URL: {presigned_url[:100]}...")
-
-                # Connect with the required headers and settings
-                self._ws_client = await websockets.connect(
-                    presigned_url,
-                    extra_headers=extra_headers,
-                    subprotocols=["mqtt"],
-                    ping_interval=None,
-                    ping_timeout=None,
-                    compression=None,
-                )
-                logger.debug("WebSocket connected, starting receive task...")
-
-                # Start receive task
-                self._receive_task = asyncio.create_task(self._receive_loop())
-
-                logger.info("Successfully connected to AWS Transcribe")
-
-            except Exception as e:
-                logger.error(f"Failed to connect to AWS Transcribe: {e}")
-                await self._disconnect()
-                raise
-
-            finally:
-                self._connecting = False
-
-    async def _disconnect(self):
-        """Disconnect from AWS Transcribe."""
-        if self._receive_task:
-            self._receive_task.cancel()
-            try:
-                await self._receive_task
-            except asyncio.CancelledError:
-                pass
-            self._receive_task = None
-
-        if self._ws_client:
-            try:
-                if self._ws_client.open:
-                    # Send end-stream message
-                    end_stream = {"message-type": "event", "event": "end"}
-                    await self._ws_client.send(json.dumps(end_stream))
-                await self._ws_client.close()
-            except Exception as e:
-                logger.warning(f"Error closing WebSocket connection: {e}")
-            finally:
-                self._ws_client = None
-
-    def language_to_service_language(self, language: Language) -> str | None:
-        """Convert internal language enum to AWS Transcribe language code."""
-        language_map = {
-            Language.EN: "en-US",
-            Language.ES: "es-US",
-            Language.FR: "fr-FR",
-            Language.DE: "de-DE",
-            Language.IT: "it-IT",
-            Language.PT: "pt-BR",
-            Language.JA: "ja-JP",
-            Language.KO: "ko-KR",
-            Language.ZH: "zh-CN",
-        }
-        return language_map.get(language)
-
-    async def _receive_loop(self):
-        """Background task to receive and process messages from AWS Transcribe."""
-        try:
-            logger.debug("Receive loop started")
-            while True:
-                if not self._ws_client or not self._ws_client.open:
-                    logger.warning("WebSocket closed in receive loop")
-                    break
-
-                try:
-                    response = await self._ws_client.recv()
-                    headers, payload = decode_event(response)
-
-                    # logger.debug(f"Received message type: {headers.get(':message-type')}")
-
-                    if headers.get(":message-type") == "event":
-                        # Process transcription results
-                        results = payload.get("Transcript", {}).get("Results", [])
-                        if results:
-                            result = results[0]
-                            alternatives = result.get("Alternatives", [])
-                            if alternatives:
-                                transcript = alternatives[0].get("Transcript", "")
-                                is_final = not result.get("IsPartial", True)
-
-                                if transcript:
-                                    await self.stop_ttfb_metrics()
-                                    if is_final:
-                                        await self.push_frame(
-                                            TranscriptionFrame(
-                                                transcript,
-                                                "",
-                                                time_now_iso8601(),
-                                                self._settings["language"],
-                                            )
-                                        )
-                                        await self.stop_processing_metrics()
-                                    else:
-                                        await self.push_frame(
-                                            InterimTranscriptionFrame(
-                                                transcript,
-                                                "",
-                                                time_now_iso8601(),
-                                                self._settings["language"],
-                                            )
-                                        )
-                    elif headers.get(":message-type") == "exception":
-                        error_msg = payload.get("Message", "Unknown error")
-                        logger.error(f"Exception from AWS: {error_msg}")
-                        await self.push_frame(
-                            ErrorFrame(f"AWS Transcribe error: {error_msg}", fatal=False)
-                        )
-                    else:
-                        logger.debug(f"Other message type received: {headers}")
-                        logger.debug(f"Payload: {payload}")
-
-                except websockets.exceptions.ConnectionClosed as e:
-                    logger.error(
-                        f"WebSocket connection closed in receive loop with code {e.code}: {e.reason}"
-                    )
-                    break
-                except Exception as e:
-                    logger.error(f"Error in receive loop: {e}")
-                    break
-
-        except asyncio.CancelledError:
-            logger.debug("Receive loop cancelled")
-        except Exception as e:
-            logger.error(f"Unexpected error in receive loop: {e}")
-        finally:
-            logger.debug("Receive loop ended")

From b2b01861b2b09232c83266ffca9d9833ed722d97 Mon Sep 17 00:00:00 2001
From: Adithya Suresh <adxthya@amazon.com>
Date: Fri, 4 Apr 2025 06:17:40 +0000
Subject: [PATCH 20/97] Remove model restriction

---
 src/pipecat/services/aws/llm.py | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/src/pipecat/services/aws/llm.py b/src/pipecat/services/aws/llm.py
index cb21eccaa..2cca54c52 100644
--- a/src/pipecat/services/aws/llm.py
+++ b/src/pipecat/services/aws/llm.py
@@ -561,18 +561,7 @@ class BedrockLLMService(LLMService):
             "additional_model_request_fields": params.additional_model_request_fields if isinstance(params.additional_model_request_fields, dict) else {},
         }
         
-        # Determine model provider from model ID
-        self.model_provider = self._get_model_provider(model)
-        logger.info(f"Using AWS Bedrock model: {model} from provider: {self.model_provider}")
-
-    def _get_model_provider(self, model: str) -> str:
-        """Determine the model provider from the model ID"""
-        if "anthropic." in model:
-            return "anthropic"
-        elif "amazon." in model:
-            return "amazon"
-        else:
-            raise ValueError(f"Unsupported model: {model}. Only Anthropic Claude and Amazon Nova model families are supported.")
+        logger.info(f"Using AWS Bedrock model: {model}")
 
     def can_generate_metrics(self) -> bool:
         return True

From fa5cac7e0a2e5f84f6f7ce45a7198309fc0386f9 Mon Sep 17 00:00:00 2001
From: Adithya Suresh <adxthya@amazon.com>
Date: Tue, 8 Apr 2025 02:52:37 +0000
Subject: [PATCH 21/97] Bug fix in content format

---
 src/pipecat/services/aws/llm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pipecat/services/aws/llm.py b/src/pipecat/services/aws/llm.py
index 2cca54c52..fa33e26b3 100644
--- a/src/pipecat/services/aws/llm.py
+++ b/src/pipecat/services/aws/llm.py
@@ -310,7 +310,7 @@ class BedrockLLMContext(OpenAILLMContext):
                     # in the proper format
                     if isinstance(self.messages[-1]["content"], str):
                         self.messages[-1]["content"] = [
-                            {"type": "text", "text": self.messages[-1]["content"]}
+                            {"text": self.messages[-1]["content"]}
                         ]
                     # if this message has just a content string, convert it to a list
                     # in the proper format

From aa964847f360a9083a7c863e2ba09f1fe41dfd32 Mon Sep 17 00:00:00 2001
From: Adithya Suresh <adxthya@amazon.com>
Date: Tue, 8 Apr 2025 04:49:40 +0000
Subject: [PATCH 22/97] System param to be a list

---
 src/pipecat/services/aws/llm.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/pipecat/services/aws/llm.py b/src/pipecat/services/aws/llm.py
index fa33e26b3..f94aa3fbb 100644
--- a/src/pipecat/services/aws/llm.py
+++ b/src/pipecat/services/aws/llm.py
@@ -336,7 +336,15 @@ class BedrockLLMContext(OpenAILLMContext):
                 self.messages[0]["role"] = "user"
             else:
                 system_content = self.messages.pop(0)["content"]
-                self.system = system_content[0]["text"] if isinstance(system_content, list) and system_content and isinstance(system_content[0], dict) and "text" in system_content[0] else str(system_content)
+                if isinstance(system_content, str):
+                    system_content = [{"text": system_content}]
+                
+                if self.system:
+                    if isinstance(self.system, str):
+                        self.system = [{"text": self.system}]
+                    self.system.extend(system_content)
+                else:
+                    self.system = system_content
 
         # Ensure content is properly formatted
         for msg in self.messages:
@@ -600,7 +608,7 @@ class BedrockLLMService(LLMService):
         assistant = BedrockAssistantContextAggregator(context, **assistant_kwargs)
         return BedrockContextAggregatorPair(_user=user, _assistant=assistant)
 
-    async def _process_context(self, context: "BedrockLLMContext"):
+    async def _process_context(self, context: BedrockLLMContext):
         # Usage tracking
         prompt_tokens = 0
         completion_tokens = 0
@@ -633,7 +641,7 @@ class BedrockLLMService(LLMService):
             }
             
             # Add system message
-            request_params["system"] = [{"text": context.system}]
+            request_params["system"] = context.system
                 
             # Add tools if present
             if context.tools:

From 664111a3c98e5df6f34ac2a5d1281682e810d46e Mon Sep 17 00:00:00 2001
From: Adithya Suresh <adxthya@amazon.com>
Date: Tue, 8 Apr 2025 04:52:18 +0000
Subject: [PATCH 23/97] Added cache related info to metrics

---
 src/pipecat/services/aws/llm.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/pipecat/services/aws/llm.py b/src/pipecat/services/aws/llm.py
index f94aa3fbb..7c3539f7a 100644
--- a/src/pipecat/services/aws/llm.py
+++ b/src/pipecat/services/aws/llm.py
@@ -613,6 +613,8 @@ class BedrockLLMService(LLMService):
         prompt_tokens = 0
         completion_tokens = 0
         completion_tokens_estimate = 0
+        cache_read_input_tokens = 0
+        cache_creation_input_tokens = 0
         use_completion_tokens_estimate = False
 
         try:
@@ -723,6 +725,8 @@ class BedrockLLMService(LLMService):
                     usage = event["metadata"]["usage"]
                     prompt_tokens += usage.get("inputTokens", 0)
                     completion_tokens += usage.get("outputTokens", 0)
+                    cache_read_input_tokens += usage.get("cacheReadInputTokens", 0)
+                    cache_creation_input_tokens += usage.get("cacheWriteInputTokens", 0)
 
         except asyncio.CancelledError:
             # If we're interrupted, we won't get a complete usage report. So set our flag to use the
@@ -745,6 +749,8 @@ class BedrockLLMService(LLMService):
             await self._report_usage_metrics(
                 prompt_tokens=prompt_tokens,
                 completion_tokens=comp_tokens,
+                cache_read_input_tokens=cache_read_input_tokens,
+                cache_creation_input_tokens=cache_creation_input_tokens
             )
 
     async def process_frame(self, frame: Frame, direction: FrameDirection):
@@ -776,11 +782,15 @@ class BedrockLLMService(LLMService):
         self,
         prompt_tokens: int,
         completion_tokens: int,
+        cache_read_input_tokens: int,
+        cache_creation_input_tokens: int
     ):
         if prompt_tokens or completion_tokens:
             tokens = LLMTokenUsage(
                 prompt_tokens=prompt_tokens,
                 completion_tokens=completion_tokens,
                 total_tokens=prompt_tokens + completion_tokens,
+                cache_read_input_tokens=cache_read_input_tokens,
+                cache_creation_input_tokens=cache_creation_input_tokens
             )
             await self.start_llm_usage_metrics(tokens)

From a4b9db9e073aa21229b6abc2f76aac93419de775 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= <aleix@daily.co>
Date: Tue, 6 May 2025 11:37:23 -0700
Subject: [PATCH 24/97] fix formatting

---
 .../foundational/07m-interruptible-aws.py     |  38 ++---
 .../adapters/services/bedrock_adapter.py      |   2 +-
 src/pipecat/services/aws/llm.py               | 134 ++++++++----------
 src/pipecat/services/aws/stt.py               |   4 +-
 src/pipecat/services/aws/tts.py               |   6 +-
 5 files changed, 85 insertions(+), 99 deletions(-)

diff --git a/examples/foundational/07m-interruptible-aws.py b/examples/foundational/07m-interruptible-aws.py
index d1fae6b5e..ddb8b222e 100644
--- a/examples/foundational/07m-interruptible-aws.py
+++ b/examples/foundational/07m-interruptible-aws.py
@@ -13,13 +13,13 @@ from pipecat.audio.vad.silero import SileroVADAnalyzer
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
+from pipecat.services.aws.llm import BedrockLLMContext, BedrockLLMService
+from pipecat.services.aws.stt import TranscribeSTTService
+from pipecat.services.aws.tts import PollyTTSService
 from pipecat.transcriptions.language import Language
 from pipecat.transports.base_transport import TransportParams
 from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
 from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
-from pipecat.services.aws.llm import BedrockLLMService, BedrockLLMContext
-from pipecat.services.aws.stt import TranscribeSTTService
-from pipecat.services.aws.tts import PollyTTSService
 
 load_dotenv(override=True)
 
@@ -42,28 +42,26 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac
         region="us-west-2",  # only specific regions support generative TTS
         voice_id="Joanna",
         params=PollyTTSService.InputParams(
-            engine="generative",
-            language=Language.EN_US,
-            rate="1.1"
+            engine="generative", language=Language.EN_US, rate="1.1"
         ),
     )
 
     llm = BedrockLLMService(
         aws_region="us-west-2",
         model="us.anthropic.claude-3-5-haiku-20241022-v1:0",
-        params=BedrockLLMService.InputParams(
-            temperature=0.8,
-            latency="optimized"
-        )
+        params=BedrockLLMService.InputParams(temperature=0.8, latency="optimized"),
     )
 
     messages = [
-            {
-                "role": "system",
-                "content": [{"text": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way."}],
-            },
-        ]
-    )
+        {
+            "role": "system",
+            "content": [
+                {
+                    "text": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way."
+                }
+            ],
+        },
+    ]
 
     context = BedrockLLMContext(messages)
     context_aggregator = llm.create_context_aggregator(context)
@@ -77,8 +75,8 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac
             tts,  # TTS
             transport.output(),  # Transport bot output
             context_aggregator.assistant(),  # Assistant spoken responses
-         ]
-     )
+        ]
+    )
 
     task = PipelineTask(
         pipeline,
@@ -94,7 +92,9 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac
     async def on_client_connected(transport, client):
         logger.info(f"Client connected")
         # Kick off the conversation.
-        messages.append({"role": "user", "content": [{"text": "Please introduce yourself to the user."}]})
+        messages.append(
+            {"role": "user", "content": [{"text": "Please introduce yourself to the user."}]}
+        )
         await task.queue_frames([context_aggregator.user().get_context_frame()])
 
     @transport.event_handler("on_client_disconnected")
diff --git a/src/pipecat/adapters/services/bedrock_adapter.py b/src/pipecat/adapters/services/bedrock_adapter.py
index 0aba6aba2..b877f01fc 100644
--- a/src/pipecat/adapters/services/bedrock_adapter.py
+++ b/src/pipecat/adapters/services/bedrock_adapter.py
@@ -24,7 +24,7 @@ class BedrockLLMAdapter(BaseLLMAdapter):
                         "properties": function.properties,
                         "required": function.required,
                     },
-                }
+                },
             }
         }
 
diff --git a/src/pipecat/services/aws/llm.py b/src/pipecat/services/aws/llm.py
index 7c3539f7a..3b9c1fedd 100644
--- a/src/pipecat/services/aws/llm.py
+++ b/src/pipecat/services/aws/llm.py
@@ -135,7 +135,7 @@ class BedrockLLMContext(OpenAILLMContext):
         """
         role = obj.get("role")
         content = obj.get("content")
-        
+
         if role == "assistant":
             if isinstance(content, str):
                 return [{"role": role, "content": [{"type": "text", "text": content}]}]
@@ -184,7 +184,7 @@ class BedrockLLMContext(OpenAILLMContext):
                                     result_content = json.dumps(content_item["json"])
                         else:
                             result_content = tool_result["content"]
-                            
+
                         tool_items.append(
                             {
                                 "role": "tool",
@@ -226,26 +226,28 @@ class BedrockLLMContext(OpenAILLMContext):
         if message["role"] == "tool":
             # Try to parse the content as JSON if it looks like JSON
             try:
-                if message["content"].strip().startswith('{') and message["content"].strip().endswith('}'):
+                if message["content"].strip().startswith("{") and message[
+                    "content"
+                ].strip().endswith("}"):
                     content_json = json.loads(message["content"])
                     tool_result_content = [{"json": content_json}]
                 else:
                     tool_result_content = [{"text": message["content"]}]
             except:
                 tool_result_content = [{"text": message["content"]}]
-                
+
             return {
                 "role": "user",
                 "content": [
                     {
                         "toolResult": {
                             "toolUseId": message["tool_call_id"],
-                            "content": tool_result_content
+                            "content": tool_result_content,
                         },
                     },
                 ],
             }
-            
+
         if message.get("tool_calls"):
             tc = message["tool_calls"]
             ret = {"role": "assistant", "content": []}
@@ -261,7 +263,7 @@ class BedrockLLMContext(OpenAILLMContext):
                 }
                 ret["content"].append(new_tool_use)
             return ret
-            
+
         # Handle text content
         content = message.get("content")
         if isinstance(content, str):
@@ -276,7 +278,7 @@ class BedrockLLMContext(OpenAILLMContext):
                     text_content = item["text"] if item["text"] != "" else "(empty)"
                     new_content.append({"text": text_content})
             return {"role": message["role"], "content": new_content}
-            
+
         return message
 
     def add_image_frame_message(
@@ -287,15 +289,7 @@ class BedrockLLMContext(OpenAILLMContext):
         encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
 
         # Image should be the first content block in the message
-        content = [
-            {
-                "type": "image",
-                "format": "jpeg",
-                "source": {
-                    "bytes": encoded_image
-                }
-            }
-        ]
+        content = [{"type": "image", "format": "jpeg", "source": {"bytes": encoded_image}}]
         if text:
             content.append({"text": text})
         self.add_message({"role": "user", "content": content})
@@ -309,9 +303,7 @@ class BedrockLLMContext(OpenAILLMContext):
                     # if the last message has just a content string, convert it to a list
                     # in the proper format
                     if isinstance(self.messages[-1]["content"], str):
-                        self.messages[-1]["content"] = [
-                            {"text": self.messages[-1]["content"]}
-                        ]
+                        self.messages[-1]["content"] = [{"text": self.messages[-1]["content"]}]
                     # if this message has just a content string, convert it to a list
                     # in the proper format
                     if isinstance(message["content"], str):
@@ -326,7 +318,7 @@ class BedrockLLMContext(OpenAILLMContext):
             logger.error(f"Error adding message: {e}")
 
     def _restructure_from_bedrock_messages(self):
-        """Restructure messages in Bedrock format by handling system messages, 
+        """Restructure messages in Bedrock format by handling system messages,
         merging consecutive messages with the same role, and ensuring proper content formatting.
         """
         # Handle system message if present at the beginning
@@ -338,7 +330,7 @@ class BedrockLLMContext(OpenAILLMContext):
                 system_content = self.messages.pop(0)["content"]
                 if isinstance(system_content, str):
                     system_content = [{"text": system_content}]
-                
+
                 if self.system:
                     if isinstance(self.system, str):
                         self.system = [{"text": self.system}]
@@ -366,7 +358,7 @@ class BedrockLLMContext(OpenAILLMContext):
                 merged_messages[-1]["content"].extend(msg["content"])
             else:
                 merged_messages.append(msg)
-        
+
         self.messages.clear()
         self.messages.extend(merged_messages)
 
@@ -452,7 +444,7 @@ class BedrockAssistantContextAggregator(LLMAssistantContextAggregator):
                         "toolUse": {
                             "toolUseId": frame.tool_call_id,
                             "name": frame.function_name,
-                            "input": frame.arguments if frame.arguments else {}
+                            "input": frame.arguments if frame.arguments else {},
                         }
                     }
                 ],
@@ -465,11 +457,7 @@ class BedrockAssistantContextAggregator(LLMAssistantContextAggregator):
                     {
                         "toolResult": {
                             "toolUseId": frame.tool_call_id,
-                            "content": [
-                                {
-                                    "text": "IN_PROGRESS"
-                                }
-                            ],
+                            "content": [{"text": "IN_PROGRESS"}],
                         }
                     }
                 ],
@@ -517,9 +505,10 @@ class BedrockAssistantContextAggregator(LLMAssistantContextAggregator):
 
 class BedrockLLMService(LLMService):
     """This class implements inference with AWS Bedrock models including Amazon Nova and Anthropic Claude.
-    
+
     Requires AWS credentials to be configured in the environment or through boto3 configuration.
     """
+
     class InputParams(BaseModel):
         max_tokens: Optional[int] = Field(default_factory=lambda: 4096, ge=1)
         temperature: Optional[float] = Field(default_factory=lambda: 0.7, ge=0.0, le=1.0)
@@ -541,34 +530,33 @@ class BedrockLLMService(LLMService):
         **kwargs,
     ):
         super().__init__(**kwargs)
-        
+
         # Initialize the Bedrock client
         if not client_config:
             client_config = Config(
                 connect_timeout=300,  # 5 minutes
-                read_timeout=300,     # 5 minutes
-                retries={'max_attempts': 3}
+                read_timeout=300,  # 5 minutes
+                retries={"max_attempts": 3},
             )
         session = boto3.Session(
             aws_access_key_id=aws_access_key,
             aws_secret_access_key=aws_secret_key,
             aws_session_token=aws_session_token,
-            region_name=aws_region
+            region_name=aws_region,
         )
-        self._client = session.client(
-            service_name='bedrock-runtime',
-            config=client_config
-        )
-        
+        self._client = session.client(service_name="bedrock-runtime", config=client_config)
+
         self.set_model_name(model)
         self._settings = {
             "max_tokens": params.max_tokens,
             "temperature": params.temperature,
             "top_p": params.top_p,
             "latency": params.latency,
-            "additional_model_request_fields": params.additional_model_request_fields if isinstance(params.additional_model_request_fields, dict) else {},
+            "additional_model_request_fields": params.additional_model_request_fields
+            if isinstance(params.additional_model_request_fields, dict)
+            else {},
         }
-        
+
         logger.info(f"Using AWS Bedrock model: {model}")
 
     def can_generate_metrics(self) -> bool:
@@ -603,7 +591,7 @@ class BedrockLLMService(LLMService):
 
         if isinstance(context, OpenAILLMContext) and not isinstance(context, BedrockLLMContext):
             context = BedrockLLMContext.from_openai_context(context)
-                
+
         user = BedrockUserContextAggregator(context, **user_kwargs)
         assistant = BedrockAssistantContextAggregator(context, **assistant_kwargs)
         return BedrockContextAggregatorPair(_user=user, _assistant=assistant)
@@ -626,31 +614,29 @@ class BedrockLLMService(LLMService):
             # )
 
             await self.start_ttfb_metrics()
-            
+
             # Set up inference config
             inference_config = {
                 "maxTokens": self._settings["max_tokens"],
                 "temperature": self._settings["temperature"],
                 "topP": self._settings["top_p"],
             }
-            
+
             # Prepare request parameters
             request_params = {
                 "modelId": self.model_name,
                 "messages": context.messages,
                 "inferenceConfig": inference_config,
-                "additionalModelRequestFields": self._settings["additional_model_request_fields"]
+                "additionalModelRequestFields": self._settings["additional_model_request_fields"],
             }
-            
+
             # Add system message
             request_params["system"] = context.system
-                
+
             # Add tools if present
             if context.tools:
-                tool_config = {
-                    "tools": context.tools
-                }
-                
+                tool_config = {"tools": context.tools}
+
                 # Add tool_choice if specified
                 if context.tool_choice:
                     if context.tool_choice == "auto":
@@ -658,32 +644,30 @@ class BedrockLLMService(LLMService):
                     elif context.tool_choice == "none":
                         # Skip adding toolChoice for "none"
                         pass
-                    elif isinstance(context.tool_choice, dict) and "function" in context.tool_choice:
+                    elif (
+                        isinstance(context.tool_choice, dict) and "function" in context.tool_choice
+                    ):
                         tool_config["toolChoice"] = {
-                            "tool": {
-                                "name": context.tool_choice["function"]["name"]
-                            }
+                            "tool": {"name": context.tool_choice["function"]["name"]}
                         }
-                
+
                 request_params["toolConfig"] = tool_config
-            
+
             # Add performance config if latency is specified
             if self._settings["latency"] in ["standard", "optimized"]:
-                request_params["performanceConfig"] = {
-                    "latency": self._settings["latency"]
-                }
-            
+                request_params["performanceConfig"] = {"latency": self._settings["latency"]}
+
             logger.debug(f"Calling Bedrock model with: {request_params}")
-            
+
             # Call Bedrock with streaming
             response = self._client.converse_stream(**request_params)
-            
+
             await self.stop_ttfb_metrics()
-            
+
             # Process the streaming response
             tool_use_block = None
             json_accumulator = ""
-            
+
             for event in response["stream"]:
                 # Handle text content
                 if "contentBlockDelta" in event:
@@ -694,18 +678,20 @@ class BedrockLLMService(LLMService):
                     elif "toolUse" in delta and "input" in delta["toolUse"]:
                         # Handle partial JSON for tool use
                         json_accumulator += delta["toolUse"]["input"]
-                        completion_tokens_estimate += self._estimate_tokens(delta["toolUse"]["input"])
-                
+                        completion_tokens_estimate += self._estimate_tokens(
+                            delta["toolUse"]["input"]
+                        )
+
                 # Handle tool use start
                 elif "contentBlockStart" in event:
-                    content_block_start = event["contentBlockStart"]['start']
+                    content_block_start = event["contentBlockStart"]["start"]
                     if "toolUse" in content_block_start:
                         tool_use_block = {
                             "id": content_block_start["toolUse"].get("toolUseId", ""),
-                            "name": content_block_start["toolUse"].get("name", "")
+                            "name": content_block_start["toolUse"].get("name", ""),
                         }
                         json_accumulator = ""
-                
+
                 # Handle message completion with tool use
                 elif "messageStop" in event and "stopReason" in event["messageStop"]:
                     if event["messageStop"]["stopReason"] == "tool_use" and tool_use_block:
@@ -719,7 +705,7 @@ class BedrockLLMService(LLMService):
                             )
                         except json.JSONDecodeError:
                             logger.error(f"Failed to parse tool arguments: {json_accumulator}")
-                
+
                 # Handle usage metrics if available
                 if "metadata" in event and "usage" in event["metadata"]:
                     usage = event["metadata"]["usage"]
@@ -750,7 +736,7 @@ class BedrockLLMService(LLMService):
                 prompt_tokens=prompt_tokens,
                 completion_tokens=comp_tokens,
                 cache_read_input_tokens=cache_read_input_tokens,
-                cache_creation_input_tokens=cache_creation_input_tokens
+                cache_creation_input_tokens=cache_creation_input_tokens,
             )
 
     async def process_frame(self, frame: Frame, direction: FrameDirection):
@@ -783,7 +769,7 @@ class BedrockLLMService(LLMService):
         prompt_tokens: int,
         completion_tokens: int,
         cache_read_input_tokens: int,
-        cache_creation_input_tokens: int
+        cache_creation_input_tokens: int,
     ):
         if prompt_tokens or completion_tokens:
             tokens = LLMTokenUsage(
@@ -791,6 +777,6 @@ class BedrockLLMService(LLMService):
                 completion_tokens=completion_tokens,
                 total_tokens=prompt_tokens + completion_tokens,
                 cache_read_input_tokens=cache_read_input_tokens,
-                cache_creation_input_tokens=cache_creation_input_tokens
+                cache_creation_input_tokens=cache_creation_input_tokens,
             )
             await self.start_llm_usage_metrics(tokens)
diff --git a/src/pipecat/services/aws/stt.py b/src/pipecat/services/aws/stt.py
index 08d74d484..d749eff0c 100644
--- a/src/pipecat/services/aws/stt.py
+++ b/src/pipecat/services/aws/stt.py
@@ -19,7 +19,7 @@ from pipecat.frames.frames import (
     Frame,
     TranscriptionFrame,
     InterimTranscriptionFrame,
-    StartFrame
+    StartFrame,
 )
 from pipecat.services.ai_services import STTService
 from pipecat.transcriptions.language import Language
@@ -597,4 +597,4 @@ class TranscribeSTTService(STTService):
         except Exception as e:
             logger.error(f"Unexpected error in receive loop: {e}")
         finally:
-            logger.debug("Receive loop ended")
\ No newline at end of file
+            logger.debug("Receive loop ended")
diff --git a/src/pipecat/services/aws/tts.py b/src/pipecat/services/aws/tts.py
index ed1230dd7..d61f74ab2 100644
--- a/src/pipecat/services/aws/tts.py
+++ b/src/pipecat/services/aws/tts.py
@@ -17,7 +17,7 @@ from pipecat.frames.frames import (
     Frame,
     TTSAudioRawFrame,
     TTSStartedFrame,
-    TTSStoppedFrame
+    TTSStoppedFrame,
 )
 from pipecat.services.ai_services import TTSService
 from pipecat.transcriptions.language import Language
@@ -187,7 +187,7 @@ class PollyTTSService(TTSService):
         if self._settings["engine"] == "standard":
             if self._settings["pitch"]:
                 prosody_attrs.append(f"pitch='{self._settings['pitch']}'")
-        
+
         if self._settings["rate"]:
             prosody_attrs.append(f"rate='{self._settings['rate']}'")
         if self._settings["volume"]:
@@ -195,7 +195,7 @@ class PollyTTSService(TTSService):
         # logger.warning("Prosody tags are not supported for generative engine. Ignoring.")
 
         if prosody_attrs:
-                ssml += f"<prosody {' '.join(prosody_attrs)}>"
+            ssml += f"<prosody {' '.join(prosody_attrs)}>"
 
         ssml += text
 

From b4de98cfb756beb4b7540b57f4d7cfd6efd16315 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= <aleix@daily.co>
Date: Tue, 6 May 2025 13:42:18 -0700
Subject: [PATCH 25/97] AWS: various cleanups (logs, imports...)

---
 .../foundational/07m-interruptible-aws.py     |  19 +-
 pyproject.toml                                |   2 +-
 .../adapters/services/bedrock_adapter.py      |   2 +-
 src/pipecat/services/aws/__init__.py          |   4 +-
 src/pipecat/services/aws/llm.py               |  43 +-
 src/pipecat/services/aws/stt.py               | 465 ++++--------------
 src/pipecat/services/aws/tts.py               |  10 +-
 7 files changed, 140 insertions(+), 405 deletions(-)

diff --git a/examples/foundational/07m-interruptible-aws.py b/examples/foundational/07m-interruptible-aws.py
index ddb8b222e..c88439c62 100644
--- a/examples/foundational/07m-interruptible-aws.py
+++ b/examples/foundational/07m-interruptible-aws.py
@@ -13,7 +13,8 @@ from pipecat.audio.vad.silero import SileroVADAnalyzer
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
-from pipecat.services.aws.llm import BedrockLLMContext, BedrockLLMService
+from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
+from pipecat.services.aws.llm import BedrockLLMService
 from pipecat.services.aws.stt import TranscribeSTTService
 from pipecat.services.aws.tts import PollyTTSService
 from pipecat.transcriptions.language import Language
@@ -55,15 +56,11 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac
     messages = [
         {
             "role": "system",
-            "content": [
-                {
-                    "text": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way."
-                }
-            ],
+            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
         },
     ]
 
-    context = BedrockLLMContext(messages)
+    context = OpenAILLMContext(messages)
     context_aggregator = llm.create_context_aggregator(context)
 
     pipeline = Pipeline(
@@ -92,14 +89,16 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac
     async def on_client_connected(transport, client):
         logger.info(f"Client connected")
         # Kick off the conversation.
-        messages.append(
-            {"role": "user", "content": [{"text": "Please introduce yourself to the user."}]}
-        )
+        messages.append({"role": "user", "content": "Please introduce yourself to the user."})
         await task.queue_frames([context_aggregator.user().get_context_frame()])
 
     @transport.event_handler("on_client_disconnected")
     async def on_client_disconnected(transport, client):
         logger.info(f"Client disconnected")
+
+    @transport.event_handler("on_client_closed")
+    async def on_client_closed(transport, client):
+        logger.info(f"Client closed connection")
         await task.cancel()
 
     runner = PipelineRunner(handle_sigint=False)
diff --git a/pyproject.toml b/pyproject.toml
index 910c8d066..13305933b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,7 +41,7 @@ Website = "https://pipecat.ai"
 [project.optional-dependencies]
 anthropic = [ "anthropic~=0.49.0" ]
 assemblyai = [ "assemblyai~=0.37.0" ]
-aws = [ "boto3~=1.37.16" ]
+aws = [ "boto3~=1.37.16", "websockets~=13.1" ]
 azure = [ "azure-cognitiveservices-speech~=1.42.0"]
 cartesia = [ "cartesia~=1.4.0", "websockets~=13.1" ]
 cerebras = []
diff --git a/src/pipecat/adapters/services/bedrock_adapter.py b/src/pipecat/adapters/services/bedrock_adapter.py
index b877f01fc..cfb2a5f27 100644
--- a/src/pipecat/adapters/services/bedrock_adapter.py
+++ b/src/pipecat/adapters/services/bedrock_adapter.py
@@ -4,7 +4,7 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
-from typing import Any, Dict, List, Union
+from typing import Any, Dict, List
 
 from pipecat.adapters.base_llm_adapter import BaseLLMAdapter
 from pipecat.adapters.schemas.function_schema import FunctionSchema
diff --git a/src/pipecat/services/aws/__init__.py b/src/pipecat/services/aws/__init__.py
index b36c88499..b1f157bd3 100644
--- a/src/pipecat/services/aws/__init__.py
+++ b/src/pipecat/services/aws/__init__.py
@@ -8,6 +8,8 @@ import sys
 
 from pipecat.services import DeprecatedModuleProxy
 
+from .llm import *
+from .stt import *
 from .tts import *
 
-sys.modules[__name__] = DeprecatedModuleProxy(globals(), "aws", "aws.tts")
+sys.modules[__name__] = DeprecatedModuleProxy(globals(), "aws", "aws.[llm,stt,tts]")
diff --git a/src/pipecat/services/aws/llm.py b/src/pipecat/services/aws/llm.py
index 3b9c1fedd..63b0964c2 100644
--- a/src/pipecat/services/aws/llm.py
+++ b/src/pipecat/services/aws/llm.py
@@ -11,16 +11,12 @@ import io
 import json
 import re
 from dataclasses import dataclass
-from typing import Any, Dict, List, Mapping, Optional, Union
+from typing import Any, Dict, List, Optional
 
-import boto3
-from botocore.config import Config
-import httpx
 from loguru import logger
 from PIL import Image
 from pydantic import BaseModel, Field
 
-from pipecat.adapters.services.anthropic_adapter import AnthropicLLMAdapter
 from pipecat.frames.frames import (
     Frame,
     FunctionCallCancelFrame,
@@ -36,7 +32,9 @@ from pipecat.frames.frames import (
 )
 from pipecat.metrics.metrics import LLMTokenUsage
 from pipecat.processors.aggregators.llm_response import (
+    LLMAssistantAggregatorParams,
     LLMAssistantContextAggregator,
+    LLMUserAggregatorParams,
     LLMUserContextAggregator,
 )
 from pipecat.processors.aggregators.openai_llm_context import (
@@ -44,7 +42,18 @@ from pipecat.processors.aggregators.openai_llm_context import (
     OpenAILLMContextFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection
-from pipecat.services.ai_services import LLMService
+from pipecat.services.llm_service import LLMService
+
+try:
+    import boto3
+    import httpx
+    from botocore.config import Config
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error(
+        "In order to use AWS services, you need to `pip install pipecat-ai[aws]`. Also, remember to set `AWS_SECRET_ACCESS_KEY`, `AWS_ACCESS_KEY_ID`, and `AWS_REGION` environment variable."
+    )
+    raise Exception(f"Missing module: {e}")
 
 
 @dataclass
@@ -564,10 +573,10 @@ class BedrockLLMService(LLMService):
 
     def create_context_aggregator(
         self,
-        context: BedrockLLMContext,
+        context: OpenAILLMContext,
         *,
-        user_kwargs: Mapping[str, Any] = {},
-        assistant_kwargs: Mapping[str, Any] = {},
+        user_params: LLMUserAggregatorParams = LLMUserAggregatorParams(),
+        assistant_params: LLMAssistantAggregatorParams = LLMAssistantAggregatorParams(),
     ) -> BedrockContextAggregatorPair:
         """Create an instance of BedrockContextAggregatorPair from an
         OpenAILLMContext. Constructor keyword arguments for both the user and
@@ -575,12 +584,10 @@ class BedrockLLMService(LLMService):
 
         Args:
             context (OpenAILLMContext): The LLM context.
-            user_kwargs (Mapping[str, Any], optional): Additional keyword
-                arguments for the user context aggregator constructor. Defaults
-                to an empty mapping.
-            assistant_kwargs (Mapping[str, Any], optional): Additional keyword
-                arguments for the assistant context aggregator
-                constructor. Defaults to an empty mapping.
+            user_params (LLMUserAggregatorParams, optional): User aggregator
+                parameters.
+            assistant_params (LLMAssistantAggregatorParams, optional): Assistant
+                aggregator parameters.
 
         Returns:
             BedrockContextAggregatorPair: A pair of context aggregators, one
@@ -589,11 +596,11 @@ class BedrockLLMService(LLMService):
         """
         context.set_llm_adapter(self.get_llm_adapter())
 
-        if isinstance(context, OpenAILLMContext) and not isinstance(context, BedrockLLMContext):
+        if isinstance(context, OpenAILLMContext):
             context = BedrockLLMContext.from_openai_context(context)
 
-        user = BedrockUserContextAggregator(context, **user_kwargs)
-        assistant = BedrockAssistantContextAggregator(context, **assistant_kwargs)
+        user = BedrockUserContextAggregator(context, params=user_params)
+        assistant = BedrockAssistantContextAggregator(context, params=assistant_params)
         return BedrockContextAggregatorPair(_user=user, _assistant=assistant)
 
     async def _process_context(self, context: BedrockLLMContext):
diff --git a/src/pipecat/services/aws/stt.py b/src/pipecat/services/aws/stt.py
index d749eff0c..0468ab31b 100644
--- a/src/pipecat/services/aws/stt.py
+++ b/src/pipecat/services/aws/stt.py
@@ -1,289 +1,40 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
 import asyncio
-from typing import AsyncGenerator, Optional, Dict
-import os
-import datetime
-from urllib.parse import urlencode
 import json
-import struct
-import urllib.parse
-import hashlib
-import hmac
+import os
 import random
 import string
-import binascii
+from typing import AsyncGenerator, Optional
 
 from loguru import logger
 
 from pipecat.frames.frames import (
+    CancelFrame,
+    EndFrame,
     ErrorFrame,
     Frame,
-    TranscriptionFrame,
     InterimTranscriptionFrame,
     StartFrame,
+    TranscriptionFrame,
 )
-from pipecat.services.ai_services import STTService
+from pipecat.services.aws.utils import build_event_message, decode_event, get_presigned_url
+from pipecat.services.stt_service import STTService
 from pipecat.transcriptions.language import Language
 from pipecat.utils.time import time_now_iso8601
 
 try:
-    import boto3
-    from botocore.exceptions import BotoCoreError, ClientError
     import websockets
 except ModuleNotFoundError as e:
     logger.error(f"Exception: {e}")
-    logger.error(
-        "In order to use AWS services, you need to `pip install pipecat-ai[aws]`. Also, remember to set `AWS_SECRET_ACCESS_KEY`, `AWS_ACCESS_KEY_ID`, and `AWS_REGION` environment variable."
-    )
+    logger.error("In order to use AWS services, you need to `pip install pipecat-ai[aws]`.")
     raise Exception(f"Missing module: {e}")
 
 
-def get_presigned_url(
-    *,
-    region: str,
-    credentials: Dict[str, Optional[str]],
-    language_code: str,
-    media_encoding: str = "pcm",
-    sample_rate: int = 16000,
-    number_of_channels: int = 1,
-    enable_partial_results_stabilization: bool = True,
-    partial_results_stability: str = "high",
-    vocabulary_name: Optional[str] = None,
-    vocabulary_filter_name: Optional[str] = None,
-    show_speaker_label: bool = False,
-    enable_channel_identification: bool = False,
-) -> str:
-    """Create a presigned URL for AWS Transcribe streaming."""
-    access_key = credentials.get("access_key")
-    secret_key = credentials.get("secret_key")
-    session_token = credentials.get("session_token")
-
-    if not access_key or not secret_key:
-        raise ValueError("AWS credentials are required")
-
-    # Initialize the URL generator
-    url_generator = AWSTranscribePresignedURL(
-        access_key=access_key, secret_key=secret_key, session_token=session_token, region=region
-    )
-
-    # Get the presigned URL
-    return url_generator.get_request_url(
-        sample_rate=sample_rate,
-        language_code=language_code,
-        media_encoding=media_encoding,
-        vocabulary_name=vocabulary_name,
-        vocabulary_filter_name=vocabulary_filter_name,
-        show_speaker_label=show_speaker_label,
-        enable_channel_identification=enable_channel_identification,
-        number_of_channels=number_of_channels,
-        enable_partial_results_stabilization=enable_partial_results_stabilization,
-        partial_results_stability=partial_results_stability,
-    )
-
-
-class AWSTranscribePresignedURL:
-    def __init__(
-        self, access_key: str, secret_key: str, session_token: str, region: str = "us-east-1"
-    ):
-        self.access_key = access_key
-        self.secret_key = secret_key
-        self.session_token = session_token
-        self.method = "GET"
-        self.service = "transcribe"
-        self.region = region
-        self.endpoint = ""
-        self.host = ""
-        self.amz_date = ""
-        self.datestamp = ""
-        self.canonical_uri = "/stream-transcription-websocket"
-        self.canonical_headers = ""
-        self.signed_headers = "host"
-        self.algorithm = "AWS4-HMAC-SHA256"
-        self.credential_scope = ""
-        self.canonical_querystring = ""
-        self.payload_hash = ""
-        self.canonical_request = ""
-        self.string_to_sign = ""
-        self.signature = ""
-        self.request_url = ""
-
-    def get_request_url(
-        self,
-        sample_rate: int,
-        language_code: str = "",
-        media_encoding: str = "pcm",
-        vocabulary_name: str = "",
-        vocabulary_filter_name: str = "",
-        show_speaker_label: bool = False,
-        enable_channel_identification: bool = False,
-        number_of_channels: int = 1,
-        enable_partial_results_stabilization: bool = False,
-        partial_results_stability: str = "",
-    ) -> str:
-        self.endpoint = f"wss://transcribestreaming.{self.region}.amazonaws.com:8443"
-        self.host = f"transcribestreaming.{self.region}.amazonaws.com:8443"
-
-        now = datetime.datetime.utcnow()
-        self.amz_date = now.strftime("%Y%m%dT%H%M%SZ")
-        self.datestamp = now.strftime("%Y%m%d")
-        self.canonical_headers = f"host:{self.host}\n"
-        self.credential_scope = f"{self.datestamp}%2F{self.region}%2F{self.service}%2Faws4_request"
-
-        # Create canonical querystring
-        self.canonical_querystring = "X-Amz-Algorithm=" + self.algorithm
-        self.canonical_querystring += (
-            "&X-Amz-Credential=" + self.access_key + "%2F" + self.credential_scope
-        )
-        self.canonical_querystring += "&X-Amz-Date=" + self.amz_date
-        self.canonical_querystring += "&X-Amz-Expires=300"
-        if self.session_token:
-            self.canonical_querystring += "&X-Amz-Security-Token=" + urllib.parse.quote(
-                self.session_token, safe=""
-            )
-        self.canonical_querystring += "&X-Amz-SignedHeaders=" + self.signed_headers
-
-        if enable_channel_identification:
-            self.canonical_querystring += "&enable-channel-identification=true"
-        if enable_partial_results_stabilization:
-            self.canonical_querystring += "&enable-partial-results-stabilization=true"
-        if language_code:
-            self.canonical_querystring += "&language-code=" + language_code
-        if media_encoding:
-            self.canonical_querystring += "&media-encoding=" + media_encoding
-        if number_of_channels > 1:
-            self.canonical_querystring += "&number-of-channels=" + str(number_of_channels)
-        if partial_results_stability:
-            self.canonical_querystring += "&partial-results-stability=" + partial_results_stability
-        if sample_rate:
-            self.canonical_querystring += "&sample-rate=" + str(sample_rate)
-        if show_speaker_label:
-            self.canonical_querystring += "&show-speaker-label=true"
-        if vocabulary_filter_name:
-            self.canonical_querystring += "&vocabulary-filter-name=" + vocabulary_filter_name
-        if vocabulary_name:
-            self.canonical_querystring += "&vocabulary-name=" + vocabulary_name
-
-        # Create payload hash
-        self.payload_hash = hashlib.sha256("".encode("utf-8")).hexdigest()
-
-        # Create canonical request
-        self.canonical_request = f"{self.method}\n{self.canonical_uri}\n{self.canonical_querystring}\n{self.canonical_headers}\n{self.signed_headers}\n{self.payload_hash}"
-
-        # Create string to sign
-        credential_scope = f"{self.datestamp}/{self.region}/{self.service}/aws4_request"
-        string_to_sign = (
-            f"{self.algorithm}\n{self.amz_date}\n{credential_scope}\n"
-            + hashlib.sha256(self.canonical_request.encode("utf-8")).hexdigest()
-        )
-
-        # Calculate signature
-        k_date = hmac.new(
-            f"AWS4{self.secret_key}".encode("utf-8"), self.datestamp.encode("utf-8"), hashlib.sha256
-        ).digest()
-        k_region = hmac.new(k_date, self.region.encode("utf-8"), hashlib.sha256).digest()
-        k_service = hmac.new(k_region, self.service.encode("utf-8"), hashlib.sha256).digest()
-        k_signing = hmac.new(k_service, b"aws4_request", hashlib.sha256).digest()
-        self.signature = hmac.new(
-            k_signing, string_to_sign.encode("utf-8"), hashlib.sha256
-        ).hexdigest()
-
-        # Add signature to query string
-        self.canonical_querystring += "&X-Amz-Signature=" + self.signature
-
-        # Create request URL
-        self.request_url = self.endpoint + self.canonical_uri + "?" + self.canonical_querystring
-        return self.request_url
-
-
-def get_headers(header_name: str, header_value: str) -> bytearray:
-    """Build a header following AWS event stream format."""
-    name = header_name.encode("utf-8")
-    name_byte_length = bytes([len(name)])
-    value_type = bytes([7])  # 7 represents a string
-    value = header_value.encode("utf-8")
-    value_byte_length = struct.pack(">H", len(value))
-
-    # Construct the header
-    header_list = bytearray()
-    header_list.extend(name_byte_length)
-    header_list.extend(name)
-    header_list.extend(value_type)
-    header_list.extend(value_byte_length)
-    header_list.extend(value)
-    return header_list
-
-
-def build_event_message(payload: bytes) -> bytes:
-    """
-    Build an event message for AWS Transcribe streaming.
-    Matches AWS sample: https://github.com/aws-samples/amazon-transcribe-streaming-python-websockets/blob/main/eventstream.py
-    """
-    # Build headers
-    content_type_header = get_headers(":content-type", "application/octet-stream")
-    event_type_header = get_headers(":event-type", "AudioEvent")
-    message_type_header = get_headers(":message-type", "event")
-
-    headers = bytearray()
-    headers.extend(content_type_header)
-    headers.extend(event_type_header)
-    headers.extend(message_type_header)
-
-    # Calculate total byte length and headers byte length
-    # 16 accounts for 8 byte prelude, 2x 4 byte CRCs
-    total_byte_length = struct.pack(">I", len(headers) + len(payload) + 16)
-    headers_byte_length = struct.pack(">I", len(headers))
-
-    # Build the prelude
-    prelude = bytearray([0] * 8)
-    prelude[:4] = total_byte_length
-    prelude[4:] = headers_byte_length
-
-    # Calculate checksum for prelude
-    prelude_crc = struct.pack(">I", binascii.crc32(prelude) & 0xFFFFFFFF)
-
-    # Construct the message
-    message_as_list = bytearray()
-    message_as_list.extend(prelude)
-    message_as_list.extend(prelude_crc)
-    message_as_list.extend(headers)
-    message_as_list.extend(payload)
-
-    # Calculate checksum for message
-    message = bytes(message_as_list)
-    message_crc = struct.pack(">I", binascii.crc32(message) & 0xFFFFFFFF)
-
-    # Add message checksum
-    message_as_list.extend(message_crc)
-
-    return bytes(message_as_list)
-
-
-def decode_event(message):
-    # Extract the prelude, headers, payload and CRC
-    prelude = message[:8]
-    total_length, headers_length = struct.unpack(">II", prelude)
-    prelude_crc = struct.unpack(">I", message[8:12])[0]
-    headers = message[12 : 12 + headers_length]
-    payload = message[12 + headers_length : -4]
-    message_crc = struct.unpack(">I", message[-4:])[0]
-
-    # Check the CRCs
-    assert prelude_crc == binascii.crc32(prelude) & 0xFFFFFFFF, "Prelude CRC check failed"
-    assert message_crc == binascii.crc32(message[:-4]) & 0xFFFFFFFF, "Message CRC check failed"
-
-    # Parse the headers
-    headers_dict = {}
-    while headers:
-        name_len = headers[0]
-        name = headers[1 : 1 + name_len].decode("utf-8")
-        value_type = headers[1 + name_len]
-        value_len = struct.unpack(">H", headers[2 + name_len : 4 + name_len])[0]
-        value = headers[4 + name_len : 4 + name_len + value_len].decode("utf-8")
-        headers_dict[name] = value
-        headers = headers[4 + name_len + value_len :]
-
-    return headers_dict, json.loads(payload)
-
-
 class TranscribeSTTService(STTService):
     def __init__(
         self,
@@ -355,17 +106,20 @@ class TranscribeSTTService(STTService):
 
         raise RuntimeError("Failed to establish WebSocket connection after multiple attempts")
 
-    async def run_stt(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+    async def stop(self, frame: EndFrame):
+        await super().stop(frame)
+        await self._disconnect()
+
+    async def cancel(self, frame: CancelFrame):
+        await super().cancel(frame)
+        await self._disconnect()
+
+    async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
         """Process audio data and send to AWS Transcribe"""
         try:
-            # Skip if no speech detected
-            if hasattr(frame, "is_speech") and not frame.is_speech:
-                logger.debug("Skipping non-speech frame")
-                return
-
             # Ensure WebSocket is connected
             if not self._ws_client or not self._ws_client.open:
-                logger.info("WebSocket not connected, attempting to reconnect...")
+                logger.debug("WebSocket not connected, attempting to reconnect...")
                 try:
                     await self._connect()
                 except Exception as e:
@@ -373,12 +127,8 @@ class TranscribeSTTService(STTService):
                     yield ErrorFrame("Failed to reconnect to AWS Transcribe", fatal=False)
                     return
 
-            # Get the audio data - if frame is bytes, use directly, otherwise get audio attribute
-            audio_data = frame if isinstance(frame, bytes) else frame.audio
-
             # Format the audio data according to AWS event stream format
-            event_message = build_event_message(audio_data)
-            # logger.debug(f"Sending audio chunk of size {len(audio_data)} bytes")
+            event_message = build_event_message(audio)
 
             # Send the formatted event message
             try:
@@ -402,23 +152,18 @@ class TranscribeSTTService(STTService):
 
     async def _connect(self):
         """Connect to AWS Transcribe with connection state management."""
-        if (
-            self._ws_client
-            and self._ws_client.open
-            and self._receive_task
-            and not self._receive_task.done()
-        ):
-            logger.debug("Already connected")
+        if self._ws_client and self._ws_client.open and self._receive_task:
+            logger.debug(f"{self} Already connected")
             return
 
         async with self._connection_lock:
             if self._connecting:
-                logger.debug("Connection already in progress")
+                logger.debug(f"{self} Connection already in progress")
                 return
 
             try:
                 self._connecting = True
-                logger.debug("Starting connection process...")
+                logger.debug(f"{self} Starting connection process...")
 
                 if self._ws_client:
                     await self._disconnect()
@@ -464,7 +209,7 @@ class TranscribeSTTService(STTService):
                     enable_channel_identification=self._settings["enable_channel_identification"],
                 )
 
-                logger.debug(f"Connecting to WebSocket with URL: {presigned_url[:100]}...")
+                logger.debug(f"{self} Connecting to WebSocket with URL: {presigned_url[:100]}...")
 
                 # Connect with the required headers and settings
                 self._ws_client = await websockets.connect(
@@ -475,15 +220,16 @@ class TranscribeSTTService(STTService):
                     ping_timeout=None,
                     compression=None,
                 )
-                logger.debug("WebSocket connected, starting receive task...")
+
+                logger.debug(f"{self} WebSocket connected, starting receive task...")
 
                 # Start receive task
-                self._receive_task = asyncio.create_task(self._receive_loop())
+                self._receive_task = self.create_task(self._receive_loop())
 
-                logger.info("Successfully connected to AWS Transcribe")
+                logger.info(f"{self} Successfully connected to AWS Transcribe")
 
             except Exception as e:
-                logger.error(f"Failed to connect to AWS Transcribe: {e}")
+                logger.error(f"{self} Failed to connect to AWS Transcribe: {e}")
                 await self._disconnect()
                 raise
 
@@ -493,24 +239,19 @@ class TranscribeSTTService(STTService):
     async def _disconnect(self):
         """Disconnect from AWS Transcribe."""
         if self._receive_task:
-            self._receive_task.cancel()
-            try:
-                await self._receive_task
-            except asyncio.CancelledError:
-                pass
+            await self.cancel_task(self._receive_task)
             self._receive_task = None
 
-        if self._ws_client:
-            try:
-                if self._ws_client.open:
-                    # Send end-stream message
-                    end_stream = {"message-type": "event", "event": "end"}
-                    await self._ws_client.send(json.dumps(end_stream))
-                await self._ws_client.close()
-            except Exception as e:
-                logger.warning(f"Error closing WebSocket connection: {e}")
-            finally:
-                self._ws_client = None
+        try:
+            if self._ws_client and self._ws_client.open:
+                # Send end-stream message
+                end_stream = {"message-type": "event", "event": "end"}
+                await self._ws_client.send(json.dumps(end_stream))
+            await self._ws_client.close()
+        except Exception as e:
+            logger.warning(f"{self} Error closing WebSocket connection: {e}")
+        finally:
+            self._ws_client = None
 
     def language_to_service_language(self, language: Language) -> str | None:
         """Convert internal language enum to AWS Transcribe language code."""
@@ -529,72 +270,60 @@ class TranscribeSTTService(STTService):
 
     async def _receive_loop(self):
         """Background task to receive and process messages from AWS Transcribe."""
-        try:
-            logger.debug("Receive loop started")
-            while True:
-                if not self._ws_client or not self._ws_client.open:
-                    logger.warning("WebSocket closed in receive loop")
-                    break
+        while True:
+            if not self._ws_client or not self._ws_client.open:
+                logger.warning(f"{self} WebSocket closed in receive loop")
+                break
 
-                try:
-                    response = await self._ws_client.recv()
-                    headers, payload = decode_event(response)
+            try:
+                response = await self._ws_client.recv()
+                headers, payload = decode_event(response)
 
-                    # logger.debug(f"Received message type: {headers.get(':message-type')}")
+                if headers.get(":message-type") == "event":
+                    # Process transcription results
+                    results = payload.get("Transcript", {}).get("Results", [])
+                    if results:
+                        result = results[0]
+                        alternatives = result.get("Alternatives", [])
+                        if alternatives:
+                            transcript = alternatives[0].get("Transcript", "")
+                            is_final = not result.get("IsPartial", True)
 
-                    if headers.get(":message-type") == "event":
-                        # Process transcription results
-                        results = payload.get("Transcript", {}).get("Results", [])
-                        if results:
-                            result = results[0]
-                            alternatives = result.get("Alternatives", [])
-                            if alternatives:
-                                transcript = alternatives[0].get("Transcript", "")
-                                is_final = not result.get("IsPartial", True)
-
-                                if transcript:
-                                    await self.stop_ttfb_metrics()
-                                    if is_final:
-                                        await self.push_frame(
-                                            TranscriptionFrame(
-                                                transcript,
-                                                "",
-                                                time_now_iso8601(),
-                                                self._settings["language"],
-                                            )
+                            if transcript:
+                                await self.stop_ttfb_metrics()
+                                if is_final:
+                                    await self.push_frame(
+                                        TranscriptionFrame(
+                                            transcript,
+                                            "",
+                                            time_now_iso8601(),
+                                            self._settings["language"],
                                         )
-                                        await self.stop_processing_metrics()
-                                    else:
-                                        await self.push_frame(
-                                            InterimTranscriptionFrame(
-                                                transcript,
-                                                "",
-                                                time_now_iso8601(),
-                                                self._settings["language"],
-                                            )
+                                    )
+                                    await self.stop_processing_metrics()
+                                else:
+                                    await self.push_frame(
+                                        InterimTranscriptionFrame(
+                                            transcript,
+                                            "",
+                                            time_now_iso8601(),
+                                            self._settings["language"],
                                         )
-                    elif headers.get(":message-type") == "exception":
-                        error_msg = payload.get("Message", "Unknown error")
-                        logger.error(f"Exception from AWS: {error_msg}")
-                        await self.push_frame(
-                            ErrorFrame(f"AWS Transcribe error: {error_msg}", fatal=False)
-                        )
-                    else:
-                        logger.debug(f"Other message type received: {headers}")
-                        logger.debug(f"Payload: {payload}")
-
-                except websockets.exceptions.ConnectionClosed as e:
-                    logger.error(
-                        f"WebSocket connection closed in receive loop with code {e.code}: {e.reason}"
+                                    )
+                elif headers.get(":message-type") == "exception":
+                    error_msg = payload.get("Message", "Unknown error")
+                    logger.error(f"{self} Exception from AWS: {error_msg}")
+                    await self.push_frame(
+                        ErrorFrame(f"AWS Transcribe error: {error_msg}", fatal=False)
                     )
-                    break
-                except Exception as e:
-                    logger.error(f"Error in receive loop: {e}")
-                    break
-
-        except asyncio.CancelledError:
-            logger.debug("Receive loop cancelled")
-        except Exception as e:
-            logger.error(f"Unexpected error in receive loop: {e}")
-        finally:
-            logger.debug("Receive loop ended")
+                else:
+                    logger.debug(f"{self} Other message type received: {headers}")
+                    logger.debug(f"{self} Payload: {payload}")
+            except websockets.exceptions.ConnectionClosed as e:
+                logger.error(
+                    f"{self} WebSocket connection closed in receive loop with code {e.code}: {e.reason}"
+                )
+                break
+            except Exception as e:
+                logger.error(f"{self} Unexpected error in receive loop: {e}")
+                break
diff --git a/src/pipecat/services/aws/tts.py b/src/pipecat/services/aws/tts.py
index d61f74ab2..0fdbb8273 100644
--- a/src/pipecat/services/aws/tts.py
+++ b/src/pipecat/services/aws/tts.py
@@ -5,8 +5,8 @@
 #
 
 import asyncio
-from typing import AsyncGenerator, Optional
 import os
+from typing import AsyncGenerator, Optional
 
 from loguru import logger
 from pydantic import BaseModel
@@ -19,7 +19,7 @@ from pipecat.frames.frames import (
     TTSStartedFrame,
     TTSStoppedFrame,
 )
-from pipecat.services.ai_services import TTSService
+from pipecat.services.tts_service import TTSService
 from pipecat.transcriptions.language import Language
 
 try:
@@ -27,9 +27,7 @@ try:
     from botocore.exceptions import BotoCoreError, ClientError
 except ModuleNotFoundError as e:
     logger.error(f"Exception: {e}")
-    logger.error(
-        "In order to use AWS services, you need to `pip install pipecat-ai[aws]`. Also, remember to set `AWS_SECRET_ACCESS_KEY`, `AWS_ACCESS_KEY_ID`, and `AWS_REGION` environment variable."
-    )
+    logger.error("In order to use AWS services, you need to `pip install pipecat-ai[aws]`.")
     raise Exception(f"Missing module: {e}")
 
 
@@ -206,7 +204,7 @@ class PollyTTSService(TTSService):
 
         ssml += "</speak>"
 
-        logger.debug(f"SSML: {ssml}")
+        logger.trace(f"{self} SSML: {ssml}")
 
         return ssml
 

From bed2e894a22aa48a56b94ac2473ca6a5f5fa079c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= <aleix@daily.co>
Date: Tue, 6 May 2025 13:42:41 -0700
Subject: [PATCH 26/97] BedrockLLMService: pull initial system frame from
 messages

---
 src/pipecat/services/aws/llm.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/src/pipecat/services/aws/llm.py b/src/pipecat/services/aws/llm.py
index 63b0964c2..cec0cc2e6 100644
--- a/src/pipecat/services/aws/llm.py
+++ b/src/pipecat/services/aws/llm.py
@@ -382,15 +382,8 @@ class BedrockLLMContext(OpenAILLMContext):
         # See if we should pull the system message out of our context.messages list. (For
         # compatibility with Open AI messages format.)
         if self.messages and self.messages[0]["role"] == "system":
-            if len(self.messages) == 1:
-                # If we have only have a system message in the list, all we can really do
-                # without introducing too much magic is change the role to "user".
-                self.messages[0]["role"] = "user"
-            else:
-                # If we have more than one message, we'll pull the system message out of the
-                # list.
-                self.system = self.messages[0]["content"]
-                self.messages.pop(0)
+            self.system = self.messages[0]["content"]
+            self.messages.pop(0)
 
         # Merge consecutive messages with the same role.
         i = 0

From 58de381746594faddbdf7c8b711b9e3b04e0cd36 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= <aleix@daily.co>
Date: Tue, 6 May 2025 13:44:30 -0700
Subject: [PATCH 27/97] AWS: add missing utils

---
 src/pipecat/services/aws/utils.py | 261 ++++++++++++++++++++++++++++++
 1 file changed, 261 insertions(+)
 create mode 100644 src/pipecat/services/aws/utils.py

diff --git a/src/pipecat/services/aws/utils.py b/src/pipecat/services/aws/utils.py
new file mode 100644
index 000000000..db69456e9
--- /dev/null
+++ b/src/pipecat/services/aws/utils.py
@@ -0,0 +1,261 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import binascii
+import datetime
+import hashlib
+import hmac
+import json
+import struct
+import urllib.parse
+from typing import Dict, Optional
+
+
+def get_presigned_url(
+    *,
+    region: str,
+    credentials: Dict[str, Optional[str]],
+    language_code: str,
+    media_encoding: str = "pcm",
+    sample_rate: int = 16000,
+    number_of_channels: int = 1,
+    enable_partial_results_stabilization: bool = True,
+    partial_results_stability: str = "high",
+    vocabulary_name: Optional[str] = None,
+    vocabulary_filter_name: Optional[str] = None,
+    show_speaker_label: bool = False,
+    enable_channel_identification: bool = False,
+) -> str:
+    """Create a presigned URL for AWS Transcribe streaming."""
+    access_key = credentials.get("access_key")
+    secret_key = credentials.get("secret_key")
+    session_token = credentials.get("session_token")
+
+    if not access_key or not secret_key:
+        raise ValueError("AWS credentials are required")
+
+    # Initialize the URL generator
+    url_generator = AWSTranscribePresignedURL(
+        access_key=access_key, secret_key=secret_key, session_token=session_token, region=region
+    )
+
+    # Get the presigned URL
+    return url_generator.get_request_url(
+        sample_rate=sample_rate,
+        language_code=language_code,
+        media_encoding=media_encoding,
+        vocabulary_name=vocabulary_name,
+        vocabulary_filter_name=vocabulary_filter_name,
+        show_speaker_label=show_speaker_label,
+        enable_channel_identification=enable_channel_identification,
+        number_of_channels=number_of_channels,
+        enable_partial_results_stabilization=enable_partial_results_stabilization,
+        partial_results_stability=partial_results_stability,
+    )
+
+
+class AWSTranscribePresignedURL:
+    def __init__(
+        self, access_key: str, secret_key: str, session_token: str, region: str = "us-east-1"
+    ):
+        self.access_key = access_key
+        self.secret_key = secret_key
+        self.session_token = session_token
+        self.method = "GET"
+        self.service = "transcribe"
+        self.region = region
+        self.endpoint = ""
+        self.host = ""
+        self.amz_date = ""
+        self.datestamp = ""
+        self.canonical_uri = "/stream-transcription-websocket"
+        self.canonical_headers = ""
+        self.signed_headers = "host"
+        self.algorithm = "AWS4-HMAC-SHA256"
+        self.credential_scope = ""
+        self.canonical_querystring = ""
+        self.payload_hash = ""
+        self.canonical_request = ""
+        self.string_to_sign = ""
+        self.signature = ""
+        self.request_url = ""
+
+    def get_request_url(
+        self,
+        sample_rate: int,
+        language_code: str = "",
+        media_encoding: str = "pcm",
+        vocabulary_name: str = "",
+        vocabulary_filter_name: str = "",
+        show_speaker_label: bool = False,
+        enable_channel_identification: bool = False,
+        number_of_channels: int = 1,
+        enable_partial_results_stabilization: bool = False,
+        partial_results_stability: str = "",
+    ) -> str:
+        self.endpoint = f"wss://transcribestreaming.{self.region}.amazonaws.com:8443"
+        self.host = f"transcribestreaming.{self.region}.amazonaws.com:8443"
+
+        now = datetime.datetime.utcnow()
+        self.amz_date = now.strftime("%Y%m%dT%H%M%SZ")
+        self.datestamp = now.strftime("%Y%m%d")
+        self.canonical_headers = f"host:{self.host}\n"
+        self.credential_scope = f"{self.datestamp}%2F{self.region}%2F{self.service}%2Faws4_request"
+
+        # Create canonical querystring
+        self.canonical_querystring = "X-Amz-Algorithm=" + self.algorithm
+        self.canonical_querystring += (
+            "&X-Amz-Credential=" + self.access_key + "%2F" + self.credential_scope
+        )
+        self.canonical_querystring += "&X-Amz-Date=" + self.amz_date
+        self.canonical_querystring += "&X-Amz-Expires=300"
+        if self.session_token:
+            self.canonical_querystring += "&X-Amz-Security-Token=" + urllib.parse.quote(
+                self.session_token, safe=""
+            )
+        self.canonical_querystring += "&X-Amz-SignedHeaders=" + self.signed_headers
+
+        if enable_channel_identification:
+            self.canonical_querystring += "&enable-channel-identification=true"
+        if enable_partial_results_stabilization:
+            self.canonical_querystring += "&enable-partial-results-stabilization=true"
+        if language_code:
+            self.canonical_querystring += "&language-code=" + language_code
+        if media_encoding:
+            self.canonical_querystring += "&media-encoding=" + media_encoding
+        if number_of_channels > 1:
+            self.canonical_querystring += "&number-of-channels=" + str(number_of_channels)
+        if partial_results_stability:
+            self.canonical_querystring += "&partial-results-stability=" + partial_results_stability
+        if sample_rate:
+            self.canonical_querystring += "&sample-rate=" + str(sample_rate)
+        if show_speaker_label:
+            self.canonical_querystring += "&show-speaker-label=true"
+        if vocabulary_filter_name:
+            self.canonical_querystring += "&vocabulary-filter-name=" + vocabulary_filter_name
+        if vocabulary_name:
+            self.canonical_querystring += "&vocabulary-name=" + vocabulary_name
+
+        # Create payload hash
+        self.payload_hash = hashlib.sha256("".encode("utf-8")).hexdigest()
+
+        # Create canonical request
+        self.canonical_request = f"{self.method}\n{self.canonical_uri}\n{self.canonical_querystring}\n{self.canonical_headers}\n{self.signed_headers}\n{self.payload_hash}"
+
+        # Create string to sign
+        credential_scope = f"{self.datestamp}/{self.region}/{self.service}/aws4_request"
+        string_to_sign = (
+            f"{self.algorithm}\n{self.amz_date}\n{credential_scope}\n"
+            + hashlib.sha256(self.canonical_request.encode("utf-8")).hexdigest()
+        )
+
+        # Calculate signature
+        k_date = hmac.new(
+            f"AWS4{self.secret_key}".encode("utf-8"), self.datestamp.encode("utf-8"), hashlib.sha256
+        ).digest()
+        k_region = hmac.new(k_date, self.region.encode("utf-8"), hashlib.sha256).digest()
+        k_service = hmac.new(k_region, self.service.encode("utf-8"), hashlib.sha256).digest()
+        k_signing = hmac.new(k_service, b"aws4_request", hashlib.sha256).digest()
+        self.signature = hmac.new(
+            k_signing, string_to_sign.encode("utf-8"), hashlib.sha256
+        ).hexdigest()
+
+        # Add signature to query string
+        self.canonical_querystring += "&X-Amz-Signature=" + self.signature
+
+        # Create request URL
+        self.request_url = self.endpoint + self.canonical_uri + "?" + self.canonical_querystring
+        return self.request_url
+
+
+def get_headers(header_name: str, header_value: str) -> bytearray:
+    """Build a header following AWS event stream format."""
+    name = header_name.encode("utf-8")
+    name_byte_length = bytes([len(name)])
+    value_type = bytes([7])  # 7 represents a string
+    value = header_value.encode("utf-8")
+    value_byte_length = struct.pack(">H", len(value))
+
+    # Construct the header
+    header_list = bytearray()
+    header_list.extend(name_byte_length)
+    header_list.extend(name)
+    header_list.extend(value_type)
+    header_list.extend(value_byte_length)
+    header_list.extend(value)
+    return header_list
+
+
+def build_event_message(payload: bytes) -> bytes:
+    """
+    Build an event message for AWS Transcribe streaming.
+    Matches AWS sample: https://github.com/aws-samples/amazon-transcribe-streaming-python-websockets/blob/main/eventstream.py
+    """
+    # Build headers
+    content_type_header = get_headers(":content-type", "application/octet-stream")
+    event_type_header = get_headers(":event-type", "AudioEvent")
+    message_type_header = get_headers(":message-type", "event")
+
+    headers = bytearray()
+    headers.extend(content_type_header)
+    headers.extend(event_type_header)
+    headers.extend(message_type_header)
+
+    # Calculate total byte length and headers byte length
+    # 16 accounts for 8 byte prelude, 2x 4 byte CRCs
+    total_byte_length = struct.pack(">I", len(headers) + len(payload) + 16)
+    headers_byte_length = struct.pack(">I", len(headers))
+
+    # Build the prelude
+    prelude = bytearray([0] * 8)
+    prelude[:4] = total_byte_length
+    prelude[4:] = headers_byte_length
+
+    # Calculate checksum for prelude
+    prelude_crc = struct.pack(">I", binascii.crc32(prelude) & 0xFFFFFFFF)
+
+    # Construct the message
+    message_as_list = bytearray()
+    message_as_list.extend(prelude)
+    message_as_list.extend(prelude_crc)
+    message_as_list.extend(headers)
+    message_as_list.extend(payload)
+
+    # Calculate checksum for message
+    message = bytes(message_as_list)
+    message_crc = struct.pack(">I", binascii.crc32(message) & 0xFFFFFFFF)
+
+    # Add message checksum
+    message_as_list.extend(message_crc)
+
+    return bytes(message_as_list)
+
+
+def decode_event(message):
+    # Extract the prelude, headers, payload and CRC
+    prelude = message[:8]
+    total_length, headers_length = struct.unpack(">II", prelude)
+    prelude_crc = struct.unpack(">I", message[8:12])[0]
+    headers = message[12 : 12 + headers_length]
+    payload = message[12 + headers_length : -4]
+    message_crc = struct.unpack(">I", message[-4:])[0]
+
+    # Check the CRCs
+    assert prelude_crc == binascii.crc32(prelude) & 0xFFFFFFFF, "Prelude CRC check failed"
+    assert message_crc == binascii.crc32(message[:-4]) & 0xFFFFFFFF, "Message CRC check failed"
+
+    # Parse the headers
+    headers_dict = {}
+    while headers:
+        name_len = headers[0]
+        name = headers[1 : 1 + name_len].decode("utf-8")
+        value_type = headers[1 + name_len]
+        value_len = struct.unpack(">H", headers[2 + name_len : 4 + name_len])[0]
+        value = headers[4 + name_len : 4 + name_len + value_len].decode("utf-8")
+        headers_dict[name] = value
+        headers = headers[4 + name_len + value_len :]
+
+    return headers_dict, json.loads(payload)

From ce1a72850bee417ceaea8f6796ed0ba99dabc601 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= <aleix@daily.co>
Date: Tue, 6 May 2025 14:51:21 -0700
Subject: [PATCH 28/97] tests: add bedrock context aggregator tests

---
 test-requirements.txt             |  2 +-
 tests/test_context_aggregators.py | 82 +++++++++++++++++++++++--------
 2 files changed, 63 insertions(+), 21 deletions(-)

diff --git a/test-requirements.txt b/test-requirements.txt
index b34a53ab9..fec8adf52 100644
--- a/test-requirements.txt
+++ b/test-requirements.txt
@@ -1 +1 @@
--e ".[anthropic,google,langchain]"
+-e ".[anthropic,aws,google,langchain]"
diff --git a/tests/test_context_aggregators.py b/tests/test_context_aggregators.py
index dfe210e07..cd84d476d 100644
--- a/tests/test_context_aggregators.py
+++ b/tests/test_context_aggregators.py
@@ -40,6 +40,11 @@ from pipecat.services.anthropic.llm import (
     AnthropicLLMContext,
     AnthropicUserContextAggregator,
 )
+from pipecat.services.aws.llm import (
+    BedrockAssistantContextAggregator,
+    BedrockLLMContext,
+    BedrockUserContextAggregator,
+)
 from pipecat.services.google.llm import (
     GoogleAssistantContextAggregator,
     GoogleLLMContext,
@@ -669,26 +674,6 @@ class TestLLMUserContextAggregator(BaseTestUserContextAggregator, unittest.Isola
     AGGREGATOR_CLASS = LLMUserContextAggregator
 
 
-#
-# OpenAI
-#
-
-
-class TestOpenAIUserContextAggregator(
-    BaseTestUserContextAggregator, unittest.IsolatedAsyncioTestCase
-):
-    CONTEXT_CLASS = OpenAILLMContext
-    AGGREGATOR_CLASS = OpenAIUserContextAggregator
-
-
-class TestOpenAIAssistantContextAggregator(
-    BaseTestAssistantContextAggreagator, unittest.IsolatedAsyncioTestCase
-):
-    CONTEXT_CLASS = OpenAILLMContext
-    AGGREGATOR_CLASS = OpenAIAssistantContextAggregator
-    EXPECTED_CONTEXT_FRAMES = [OpenAILLMContextFrame, OpenAILLMContextAssistantTimestampFrame]
-
-
 #
 # Anthropic
 #
@@ -724,6 +709,43 @@ class TestAnthropicAssistantContextAggregator(
         assert context.messages[index]["content"][0]["content"] == json.dumps(content)
 
 
+#
+# AWS (Bedrock)
+#
+
+
+class TestBedrockUserContextAggregator(
+    BaseTestUserContextAggregator, unittest.IsolatedAsyncioTestCase
+):
+    CONTEXT_CLASS = BedrockLLMContext
+    AGGREGATOR_CLASS = BedrockUserContextAggregator
+
+    def check_message_multi_content(
+        self, context: OpenAILLMContext, content_index: int, index: int, content: str
+    ):
+        messages = context.messages[content_index]
+        assert messages["content"][index]["text"] == content
+
+
+class TestBedrockAssistantContextAggregator(
+    BaseTestAssistantContextAggreagator, unittest.IsolatedAsyncioTestCase
+):
+    CONTEXT_CLASS = BedrockLLMContext
+    AGGREGATOR_CLASS = BedrockAssistantContextAggregator
+    EXPECTED_CONTEXT_FRAMES = [OpenAILLMContextFrame, OpenAILLMContextAssistantTimestampFrame]
+
+    def check_message_multi_content(
+        self, context: OpenAILLMContext, content_index: int, index: int, content: str
+    ):
+        messages = context.messages[content_index]
+        assert messages["content"][index]["text"] == content
+
+    def check_function_call_result(self, context: OpenAILLMContext, index: int, content: Any):
+        assert context.messages[index]["content"][0]["toolResult"]["content"][0][
+            "text"
+        ] == json.dumps(content)
+
+
 #
 # Google
 #
@@ -766,3 +788,23 @@ class TestGoogleAssistantContextAggregator(
     def check_function_call_result(self, context: OpenAILLMContext, index: int, content: Any):
         obj = glm.Content.to_dict(context.messages[index])
         assert obj["parts"][0]["function_response"]["response"]["value"] == json.dumps(content)
+
+
+#
+# OpenAI
+#
+
+
+class TestOpenAIUserContextAggregator(
+    BaseTestUserContextAggregator, unittest.IsolatedAsyncioTestCase
+):
+    CONTEXT_CLASS = OpenAILLMContext
+    AGGREGATOR_CLASS = OpenAIUserContextAggregator
+
+
+class TestOpenAIAssistantContextAggregator(
+    BaseTestAssistantContextAggreagator, unittest.IsolatedAsyncioTestCase
+):
+    CONTEXT_CLASS = OpenAILLMContext
+    AGGREGATOR_CLASS = OpenAIAssistantContextAggregator
+    EXPECTED_CONTEXT_FRAMES = [OpenAILLMContextFrame, OpenAILLMContextAssistantTimestampFrame]

From a8405649d0c617e97f11ea9daceee55132979136 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= <aleix@daily.co>
Date: Tue, 6 May 2025 14:52:29 -0700
Subject: [PATCH 29/97] aws: use AWS prefix for all services

---
 CHANGELOG.md                                  |  4 +
 .../foundational/07m-interruptible-aws.py     | 16 ++--
 src/pipecat/services/aws/llm.py               | 94 ++++++++++---------
 src/pipecat/services/aws/stt.py               |  2 +-
 src/pipecat/services/aws/tts.py               |  8 +-
 tests/test_context_aggregators.py             | 18 ++--
 6 files changed, 74 insertions(+), 68 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3e330b9c9..2eec61bca 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Added new AWS services `AWSBedrockLLMService` and `AWSTranscribeSTTService`.
+
 - Added `on_active_speaker_changed` event handler to the `DailyTransport` class.
 
 - Added `enable_ssml_parsing` and `enable_logging` to `InputParams` in
@@ -25,6 +27,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Deprecated
 
+- `PollyTTSService` is now deprecated, use `AWSPollyTTSService` instead.
+
 - Observer `on_push_frame(src, dst, frame, direction, timestamp)` is now
   deprecated, use `on_push_frame(data: FramePushed)` instead.
 
diff --git a/examples/foundational/07m-interruptible-aws.py b/examples/foundational/07m-interruptible-aws.py
index c88439c62..2ccc7b717 100644
--- a/examples/foundational/07m-interruptible-aws.py
+++ b/examples/foundational/07m-interruptible-aws.py
@@ -14,9 +14,9 @@ from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
-from pipecat.services.aws.llm import BedrockLLMService
-from pipecat.services.aws.stt import TranscribeSTTService
-from pipecat.services.aws.tts import PollyTTSService
+from pipecat.services.aws.llm import AWSBedrockLLMService
+from pipecat.services.aws.stt import AWSTranscribeSTTService
+from pipecat.services.aws.tts import AWSPollyTTSService
 from pipecat.transcriptions.language import Language
 from pipecat.transports.base_transport import TransportParams
 from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
@@ -37,20 +37,20 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac
         ),
     )
 
-    stt = TranscribeSTTService()
+    stt = AWSTranscribeSTTService()
 
-    tts = PollyTTSService(
+    tts = AWSPollyTTSService(
         region="us-west-2",  # only specific regions support generative TTS
         voice_id="Joanna",
-        params=PollyTTSService.InputParams(
+        params=AWSPollyTTSService.InputParams(
             engine="generative", language=Language.EN_US, rate="1.1"
         ),
     )
 
-    llm = BedrockLLMService(
+    llm = AWSBedrockLLMService(
         aws_region="us-west-2",
         model="us.anthropic.claude-3-5-haiku-20241022-v1:0",
-        params=BedrockLLMService.InputParams(temperature=0.8, latency="optimized"),
+        params=AWSBedrockLLMService.InputParams(temperature=0.8, latency="optimized"),
     )
 
     messages = [
diff --git a/src/pipecat/services/aws/llm.py b/src/pipecat/services/aws/llm.py
index cec0cc2e6..00a877c0f 100644
--- a/src/pipecat/services/aws/llm.py
+++ b/src/pipecat/services/aws/llm.py
@@ -57,18 +57,18 @@ except ModuleNotFoundError as e:
 
 
 @dataclass
-class BedrockContextAggregatorPair:
-    _user: "BedrockUserContextAggregator"
-    _assistant: "BedrockAssistantContextAggregator"
+class AWSBedrockContextAggregatorPair:
+    _user: "AWSBedrockUserContextAggregator"
+    _assistant: "AWSBedrockAssistantContextAggregator"
 
-    def user(self) -> "BedrockUserContextAggregator":
+    def user(self) -> "AWSBedrockUserContextAggregator":
         return self._user
 
-    def assistant(self) -> "BedrockAssistantContextAggregator":
+    def assistant(self) -> "AWSBedrockAssistantContextAggregator":
         return self._assistant
 
 
-class BedrockLLMContext(OpenAILLMContext):
+class AWSBedrockLLMContext(OpenAILLMContext):
     def __init__(
         self,
         messages: Optional[List[dict]] = None,
@@ -81,10 +81,10 @@ class BedrockLLMContext(OpenAILLMContext):
         self.system = system
 
     @staticmethod
-    def upgrade_to_bedrock(obj: OpenAILLMContext) -> "BedrockLLMContext":
-        logger.debug(f"Upgrading to Bedrock: {obj}")
-        if isinstance(obj, OpenAILLMContext) and not isinstance(obj, BedrockLLMContext):
-            obj.__class__ = BedrockLLMContext
+    def upgrade_to_bedrock(obj: OpenAILLMContext) -> "AWSBedrockLLMContext":
+        logger.debug(f"Upgrading to AWS Bedrock: {obj}")
+        if isinstance(obj, OpenAILLMContext) and not isinstance(obj, AWSBedrockLLMContext):
+            obj.__class__ = AWSBedrockLLMContext
             obj._restructure_from_openai_messages()
         else:
             obj._restructure_from_bedrock_messages()
@@ -103,13 +103,13 @@ class BedrockLLMContext(OpenAILLMContext):
         return self
 
     @classmethod
-    def from_messages(cls, messages: List[dict]) -> "BedrockLLMContext":
+    def from_messages(cls, messages: List[dict]) -> "AWSBedrockLLMContext":
         self = cls(messages=messages)
         # self._restructure_from_openai_messages()
         return self
 
     @classmethod
-    def from_image_frame(cls, frame: VisionImageRawFrame) -> "BedrockLLMContext":
+    def from_image_frame(cls, frame: VisionImageRawFrame) -> "AWSBedrockLLMContext":
         context = cls()
         context.add_image_frame_message(
             format=frame.format, size=frame.size, image=frame.image, text=frame.text
@@ -120,14 +120,14 @@ class BedrockLLMContext(OpenAILLMContext):
         self._messages[:] = messages
         # self._restructure_from_openai_messages()
 
-    # convert a message in Bedrock format into one or more messages in OpenAI format
+    # convert a message in AWS Bedrock format into one or more messages in OpenAI format
     def to_standard_messages(self, obj):
-        """Convert Bedrock message format to standard structured format.
+        """Convert AWS Bedrock message format to standard structured format.
 
         Handles text content and function calls for both user and assistant messages.
 
         Args:
-            obj: Message in Bedrock format:
+            obj: Message in AWS Bedrock format:
                 {
                     "role": "user/assistant",
                     "content": [{"text": str} | {"toolUse": {...}} | {"toolResult": {...}}]
@@ -208,7 +208,7 @@ class BedrockLLMContext(OpenAILLMContext):
                 return messages
 
     def from_standard_message(self, message):
-        """Convert standard format message to Bedrock format.
+        """Convert standard format message to AWS Bedrock format.
 
         Handles conversion of text content, tool calls, and tool results.
         Empty text content is converted to "(empty)".
@@ -222,7 +222,7 @@ class BedrockLLMContext(OpenAILLMContext):
                 }
 
         Returns:
-            Message in Bedrock format:
+            Message in AWS Bedrock format:
             {
                 "role": "user/assistant",
                 "content": [
@@ -306,8 +306,9 @@ class BedrockLLMContext(OpenAILLMContext):
     def add_message(self, message):
         try:
             if self.messages:
-                # Bedrock requires that roles alternate. If this message's role is the same as the
-                # last message, we should add this message's content to the last message.
+                # AWS Bedrock requires that roles alternate. If this message's
+                # role is the same as the last message, we should add this
+                # message's content to the last message.
                 if self.messages[-1]["role"] == message["role"]:
                     # if the last message has just a content string, convert it to a list
                     # in the proper format
@@ -327,8 +328,10 @@ class BedrockLLMContext(OpenAILLMContext):
             logger.error(f"Error adding message: {e}")
 
     def _restructure_from_bedrock_messages(self):
-        """Restructure messages in Bedrock format by handling system messages,
-        merging consecutive messages with the same role, and ensuring proper content formatting.
+        """Restructure messages in AWS Bedrock format by handling system
+        messages, merging consecutive messages with the same role, and ensuring
+        proper content formatting.
+
         """
         # Handle system message if present at the beginning
         logger.debug(f"_restructure_from_bedrock_messages: {self.messages}")
@@ -431,13 +434,13 @@ class BedrockLLMContext(OpenAILLMContext):
         return json.dumps(msgs)
 
 
-class BedrockUserContextAggregator(LLMUserContextAggregator):
+class AWSBedrockUserContextAggregator(LLMUserContextAggregator):
     pass
 
 
-class BedrockAssistantContextAggregator(LLMAssistantContextAggregator):
+class AWSBedrockAssistantContextAggregator(LLMAssistantContextAggregator):
     async def handle_function_call_in_progress(self, frame: FunctionCallInProgressFrame):
-        # Format tool use according to Bedrock API
+        # Format tool use according to AWS Bedrock API
         self._context.add_message(
             {
                 "role": "assistant",
@@ -505,10 +508,13 @@ class BedrockAssistantContextAggregator(LLMAssistantContextAggregator):
         )
 
 
-class BedrockLLMService(LLMService):
-    """This class implements inference with AWS Bedrock models including Amazon Nova and Anthropic Claude.
+class AWSBedrockLLMService(LLMService):
+    """This class implements inference with AWS Bedrock models including Amazon
+    Nova and Anthropic Claude.
+
+    Requires AWS credentials to be configured in the environment or through
+    boto3 configuration.
 
-    Requires AWS credentials to be configured in the environment or through boto3 configuration.
     """
 
     class InputParams(BaseModel):
@@ -533,7 +539,7 @@ class BedrockLLMService(LLMService):
     ):
         super().__init__(**kwargs)
 
-        # Initialize the Bedrock client
+        # Initialize the AWS Bedrock client
         if not client_config:
             client_config = Config(
                 connect_timeout=300,  # 5 minutes
@@ -570,8 +576,8 @@ class BedrockLLMService(LLMService):
         *,
         user_params: LLMUserAggregatorParams = LLMUserAggregatorParams(),
         assistant_params: LLMAssistantAggregatorParams = LLMAssistantAggregatorParams(),
-    ) -> BedrockContextAggregatorPair:
-        """Create an instance of BedrockContextAggregatorPair from an
+    ) -> AWSBedrockContextAggregatorPair:
+        """Create an instance of AWSBedrockContextAggregatorPair from an
         OpenAILLMContext. Constructor keyword arguments for both the user and
         assistant aggregators can be provided.
 
@@ -583,20 +589,20 @@ class BedrockLLMService(LLMService):
                 aggregator parameters.
 
         Returns:
-            BedrockContextAggregatorPair: A pair of context aggregators, one
+            AWSBedrockContextAggregatorPair: A pair of context aggregators, one
             for the user and one for the assistant, encapsulated in an
-            BedrockContextAggregatorPair.
+            AWSBedrockContextAggregatorPair.
         """
         context.set_llm_adapter(self.get_llm_adapter())
 
         if isinstance(context, OpenAILLMContext):
-            context = BedrockLLMContext.from_openai_context(context)
+            context = AWSBedrockLLMContext.from_openai_context(context)
 
-        user = BedrockUserContextAggregator(context, params=user_params)
-        assistant = BedrockAssistantContextAggregator(context, params=assistant_params)
-        return BedrockContextAggregatorPair(_user=user, _assistant=assistant)
+        user = AWSBedrockUserContextAggregator(context, params=user_params)
+        assistant = AWSBedrockAssistantContextAggregator(context, params=assistant_params)
+        return AWSBedrockContextAggregatorPair(_user=user, _assistant=assistant)
 
-    async def _process_context(self, context: BedrockLLMContext):
+    async def _process_context(self, context: AWSBedrockLLMContext):
         # Usage tracking
         prompt_tokens = 0
         completion_tokens = 0
@@ -609,10 +615,6 @@ class BedrockLLMService(LLMService):
             await self.push_frame(LLMFullResponseStartFrame())
             await self.start_processing_metrics()
 
-            # logger.debug(
-            #     f"{self}: Generating chat with Bedrock model {self.model_name} | [{context.get_messages_for_logging()}]"
-            # )
-
             await self.start_ttfb_metrics()
 
             # Set up inference config
@@ -657,9 +659,9 @@ class BedrockLLMService(LLMService):
             if self._settings["latency"] in ["standard", "optimized"]:
                 request_params["performanceConfig"] = {"latency": self._settings["latency"]}
 
-            logger.debug(f"Calling Bedrock model with: {request_params}")
+            logger.debug(f"Calling AWS Bedrock model with: {request_params}")
 
-            # Call Bedrock with streaming
+            # Call AWS Bedrock with streaming
             response = self._client.converse_stream(**request_params)
 
             await self.stop_ttfb_metrics()
@@ -744,15 +746,15 @@ class BedrockLLMService(LLMService):
 
         context = None
         if isinstance(frame, OpenAILLMContextFrame):
-            context = BedrockLLMContext.upgrade_to_bedrock(frame.context)
+            context = AWSBedrockLLMContext.upgrade_to_bedrock(frame.context)
         elif isinstance(frame, LLMMessagesFrame):
-            context = BedrockLLMContext.from_messages(frame.messages)
+            context = AWSBedrockLLMContext.from_messages(frame.messages)
         elif isinstance(frame, VisionImageRawFrame):
             # This is only useful in very simple pipelines because it creates
             # a new context. Generally we want a context manager to catch
             # UserImageRawFrames coming through the pipeline and add them
             # to the context.
-            context = BedrockLLMContext.from_image_frame(frame)
+            context = AWSBedrockLLMContext.from_image_frame(frame)
         elif isinstance(frame, LLMUpdateSettingsFrame):
             await self._update_settings(frame.settings)
         else:
diff --git a/src/pipecat/services/aws/stt.py b/src/pipecat/services/aws/stt.py
index 0468ab31b..a02625f81 100644
--- a/src/pipecat/services/aws/stt.py
+++ b/src/pipecat/services/aws/stt.py
@@ -35,7 +35,7 @@ except ModuleNotFoundError as e:
     raise Exception(f"Missing module: {e}")
 
 
-class TranscribeSTTService(STTService):
+class AWSTranscribeSTTService(STTService):
     def __init__(
         self,
         *,
diff --git a/src/pipecat/services/aws/tts.py b/src/pipecat/services/aws/tts.py
index 0fdbb8273..40d746514 100644
--- a/src/pipecat/services/aws/tts.py
+++ b/src/pipecat/services/aws/tts.py
@@ -107,7 +107,7 @@ def language_to_aws_language(language: Language) -> Optional[str]:
     return language_map.get(language)
 
 
-class PollyTTSService(TTSService):
+class AWSPollyTTSService(TTSService):
     class InputParams(BaseModel):
         engine: Optional[str] = None
         language: Optional[Language] = Language.EN
@@ -190,7 +190,6 @@ class PollyTTSService(TTSService):
             prosody_attrs.append(f"rate='{self._settings['rate']}'")
         if self._settings["volume"]:
             prosody_attrs.append(f"volume='{self._settings['volume']}'")
-        # logger.warning("Prosody tags are not supported for generative engine. Ignoring.")
 
         if prosody_attrs:
             ssml += f"<prosody {' '.join(prosody_attrs)}>"
@@ -269,7 +268,7 @@ class PollyTTSService(TTSService):
             yield TTSStoppedFrame()
 
 
-class AWSTTSService(PollyTTSService):
+class PollyTTSService(AWSPollyTTSService):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
@@ -278,5 +277,6 @@ class AWSTTSService(PollyTTSService):
         with warnings.catch_warnings():
             warnings.simplefilter("always")
             warnings.warn(
-                "'AWSTTSService' is deprecated, use 'PollyTTSService' instead.", DeprecationWarning
+                "'PollyTTSService' is deprecated, use 'AWSPollyTTSService' instead.",
+                DeprecationWarning,
             )
diff --git a/tests/test_context_aggregators.py b/tests/test_context_aggregators.py
index cd84d476d..0f68110ce 100644
--- a/tests/test_context_aggregators.py
+++ b/tests/test_context_aggregators.py
@@ -41,9 +41,9 @@ from pipecat.services.anthropic.llm import (
     AnthropicUserContextAggregator,
 )
 from pipecat.services.aws.llm import (
-    BedrockAssistantContextAggregator,
-    BedrockLLMContext,
-    BedrockUserContextAggregator,
+    AWSBedrockAssistantContextAggregator,
+    AWSBedrockLLMContext,
+    AWSBedrockUserContextAggregator,
 )
 from pipecat.services.google.llm import (
     GoogleAssistantContextAggregator,
@@ -714,11 +714,11 @@ class TestAnthropicAssistantContextAggregator(
 #
 
 
-class TestBedrockUserContextAggregator(
+class TestAWSBedrockUserContextAggregator(
     BaseTestUserContextAggregator, unittest.IsolatedAsyncioTestCase
 ):
-    CONTEXT_CLASS = BedrockLLMContext
-    AGGREGATOR_CLASS = BedrockUserContextAggregator
+    CONTEXT_CLASS = AWSBedrockLLMContext
+    AGGREGATOR_CLASS = AWSBedrockUserContextAggregator
 
     def check_message_multi_content(
         self, context: OpenAILLMContext, content_index: int, index: int, content: str
@@ -727,11 +727,11 @@ class TestBedrockUserContextAggregator(
         assert messages["content"][index]["text"] == content
 
 
-class TestBedrockAssistantContextAggregator(
+class TestAWSBedrockAssistantContextAggregator(
     BaseTestAssistantContextAggreagator, unittest.IsolatedAsyncioTestCase
 ):
-    CONTEXT_CLASS = BedrockLLMContext
-    AGGREGATOR_CLASS = BedrockAssistantContextAggregator
+    CONTEXT_CLASS = AWSBedrockLLMContext
+    AGGREGATOR_CLASS = AWSBedrockAssistantContextAggregator
     EXPECTED_CONTEXT_FRAMES = [OpenAILLMContextFrame, OpenAILLMContextAssistantTimestampFrame]
 
     def check_message_multi_content(

From 458549f7df9b8e3e1bf9474cdf4aeb5b65560cc1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= <aleix@daily.co>
Date: Tue, 6 May 2025 21:07:09 -0700
Subject: [PATCH 30/97] AWSBedrockLLMService: fix function calling

---
 .../foundational/07m-interruptible-aws.py     |   5 +-
 .../foundational/14r-function-calling-aws.py  | 139 ++++++++++++++++++
 .../adapters/services/anthropic_adapter.py    |   2 +-
 .../adapters/services/bedrock_adapter.py      |   2 +-
 src/pipecat/services/aws/llm.py               |  11 +-
 tests/test_function_calling_adapters.py       |  30 ++++
 6 files changed, 178 insertions(+), 11 deletions(-)
 create mode 100644 examples/foundational/14r-function-calling-aws.py

diff --git a/examples/foundational/07m-interruptible-aws.py b/examples/foundational/07m-interruptible-aws.py
index 2ccc7b717..bbcfe7313 100644
--- a/examples/foundational/07m-interruptible-aws.py
+++ b/examples/foundational/07m-interruptible-aws.py
@@ -17,7 +17,6 @@ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
 from pipecat.services.aws.llm import AWSBedrockLLMService
 from pipecat.services.aws.stt import AWSTranscribeSTTService
 from pipecat.services.aws.tts import AWSPollyTTSService
-from pipecat.transcriptions.language import Language
 from pipecat.transports.base_transport import TransportParams
 from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
 from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
@@ -42,9 +41,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac
     tts = AWSPollyTTSService(
         region="us-west-2",  # only specific regions support generative TTS
         voice_id="Joanna",
-        params=AWSPollyTTSService.InputParams(
-            engine="generative", language=Language.EN_US, rate="1.1"
-        ),
+        params=AWSPollyTTSService.InputParams(engine="generative", rate="1.1"),
     )
 
     llm = AWSBedrockLLMService(
diff --git a/examples/foundational/14r-function-calling-aws.py b/examples/foundational/14r-function-calling-aws.py
new file mode 100644
index 000000000..cf4859576
--- /dev/null
+++ b/examples/foundational/14r-function-calling-aws.py
@@ -0,0 +1,139 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import argparse
+import os
+
+from dotenv import load_dotenv
+from loguru import logger
+
+from pipecat.adapters.schemas.function_schema import FunctionSchema
+from pipecat.adapters.schemas.tools_schema import ToolsSchema
+from pipecat.audio.vad.silero import SileroVADAnalyzer
+from pipecat.pipeline.pipeline import Pipeline
+from pipecat.pipeline.runner import PipelineRunner
+from pipecat.pipeline.task import PipelineParams, PipelineTask
+from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
+from pipecat.services.aws.llm import AWSBedrockLLMService
+from pipecat.services.aws.stt import AWSTranscribeSTTService
+from pipecat.services.aws.tts import AWSPollyTTSService
+from pipecat.services.llm_service import FunctionCallParams
+from pipecat.transports.base_transport import TransportParams
+from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
+from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
+
+load_dotenv(override=True)
+
+
+async def fetch_weather_from_api(params: FunctionCallParams):
+    await params.result_callback({"conditions": "nice", "temperature": "75"})
+
+
+async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespace):
+    logger.info(f"Starting bot")
+
+    transport = SmallWebRTCTransport(
+        webrtc_connection=webrtc_connection,
+        params=TransportParams(
+            audio_in_enabled=True,
+            audio_out_enabled=True,
+            vad_analyzer=SileroVADAnalyzer(),
+        ),
+    )
+
+    stt = AWSTranscribeSTTService()
+
+    tts = AWSPollyTTSService(
+        region="us-west-2",  # only specific regions support generative TTS
+        voice_id="Joanna",
+        params=AWSPollyTTSService.InputParams(engine="generative", rate="1.1"),
+    )
+
+    llm = AWSBedrockLLMService(
+        aws_region="us-west-2",
+        model="us.anthropic.claude-3-5-haiku-20241022-v1:0",
+        params=AWSBedrockLLMService.InputParams(temperature=0.8, latency="optimized"),
+    )
+
+    # You can also register a function_name of None to get all functions
+    # sent to the same callback with an additional function_name parameter.
+    llm.register_function("get_current_weather", fetch_weather_from_api)
+
+    weather_function = FunctionSchema(
+        name="get_current_weather",
+        description="Get the current weather",
+        properties={
+            "location": {
+                "type": "string",
+                "description": "The city and state, e.g. San Francisco, CA",
+            },
+            "format": {
+                "type": "string",
+                "enum": ["celsius", "fahrenheit"],
+                "description": "The temperature unit to use. Infer this from the user's location.",
+            },
+        },
+        required=["location", "format"],
+    )
+    tools = ToolsSchema(standard_tools=[weather_function])
+
+    messages = [
+        {
+            "role": "system",
+            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
+        },
+    ]
+
+    context = OpenAILLMContext(messages, tools)
+    context_aggregator = llm.create_context_aggregator(context)
+
+    pipeline = Pipeline(
+        [
+            transport.input(),
+            stt,
+            context_aggregator.user(),
+            llm,
+            tts,
+            transport.output(),
+            context_aggregator.assistant(),
+        ]
+    )
+
+    task = PipelineTask(
+        pipeline,
+        params=PipelineParams(
+            allow_interruptions=True,
+            enable_metrics=True,
+            enable_usage_metrics=True,
+            report_only_initial_ttfb=True,
+        ),
+    )
+
+    @transport.event_handler("on_client_connected")
+    async def on_client_connected(transport, client):
+        logger.info(f"Client connected")
+        # Kick off the conversation.
+        messages.append({"role": "user", "content": "Please introduce yourself to the user."})
+        await task.queue_frames([context_aggregator.user().get_context_frame()])
+
+    @transport.event_handler("on_client_disconnected")
+    async def on_client_disconnected(transport, client):
+        logger.info(f"Client disconnected")
+
+    @transport.event_handler("on_client_closed")
+    async def on_client_closed(transport, client):
+        logger.info(f"Client closed connection")
+        await task.cancel()
+
+    runner = PipelineRunner(handle_sigint=False)
+
+    await runner.run(task)
+
+
+if __name__ == "__main__":
+    from run import main
+
+    main()
diff --git a/src/pipecat/adapters/services/anthropic_adapter.py b/src/pipecat/adapters/services/anthropic_adapter.py
index a699469d3..23197d3a8 100644
--- a/src/pipecat/adapters/services/anthropic_adapter.py
+++ b/src/pipecat/adapters/services/anthropic_adapter.py
@@ -4,7 +4,7 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
-from typing import Any, Dict, List, Union
+from typing import Any, Dict, List
 
 from pipecat.adapters.base_llm_adapter import BaseLLMAdapter
 from pipecat.adapters.schemas.function_schema import FunctionSchema
diff --git a/src/pipecat/adapters/services/bedrock_adapter.py b/src/pipecat/adapters/services/bedrock_adapter.py
index cfb2a5f27..113a6938d 100644
--- a/src/pipecat/adapters/services/bedrock_adapter.py
+++ b/src/pipecat/adapters/services/bedrock_adapter.py
@@ -11,7 +11,7 @@ from pipecat.adapters.schemas.function_schema import FunctionSchema
 from pipecat.adapters.schemas.tools_schema import ToolsSchema
 
 
-class BedrockLLMAdapter(BaseLLMAdapter):
+class AWSBedrockLLMAdapter(BaseLLMAdapter):
     @staticmethod
     def _to_bedrock_function_format(function: FunctionSchema) -> Dict[str, Any]:
         return {
diff --git a/src/pipecat/services/aws/llm.py b/src/pipecat/services/aws/llm.py
index 00a877c0f..921d3c790 100644
--- a/src/pipecat/services/aws/llm.py
+++ b/src/pipecat/services/aws/llm.py
@@ -17,6 +17,7 @@ from loguru import logger
 from PIL import Image
 from pydantic import BaseModel, Field
 
+from pipecat.adapters.services.bedrock_adapter import AWSBedrockLLMAdapter
 from pipecat.frames.frames import (
     Frame,
     FunctionCallCancelFrame,
@@ -92,7 +93,6 @@ class AWSBedrockLLMContext(OpenAILLMContext):
 
     @classmethod
     def from_openai_context(cls, openai_context: OpenAILLMContext):
-        logger.debug("from_openai_context called")
         self = cls(
             messages=openai_context.messages,
             tools=openai_context.tools,
@@ -105,7 +105,7 @@ class AWSBedrockLLMContext(OpenAILLMContext):
     @classmethod
     def from_messages(cls, messages: List[dict]) -> "AWSBedrockLLMContext":
         self = cls(messages=messages)
-        # self._restructure_from_openai_messages()
+        self._restructure_from_openai_messages()
         return self
 
     @classmethod
@@ -118,7 +118,7 @@ class AWSBedrockLLMContext(OpenAILLMContext):
 
     def set_messages(self, messages: List):
         self._messages[:] = messages
-        # self._restructure_from_openai_messages()
+        self._restructure_from_openai_messages()
 
     # convert a message in AWS Bedrock format into one or more messages in OpenAI format
     def to_standard_messages(self, obj):
@@ -334,7 +334,6 @@ class AWSBedrockLLMContext(OpenAILLMContext):
 
         """
         # Handle system message if present at the beginning
-        logger.debug(f"_restructure_from_bedrock_messages: {self.messages}")
         if self.messages and self.messages[0]["role"] == "system":
             if len(self.messages) == 1:
                 self.messages[0]["role"] = "user"
@@ -375,7 +374,6 @@ class AWSBedrockLLMContext(OpenAILLMContext):
         self.messages.extend(merged_messages)
 
     def _restructure_from_openai_messages(self):
-        logger.debug(f"_restructure_from_openai_messages: {self.messages}")
         # first, map across self._messages calling self.from_standard_message(m) to modify messages in place
         try:
             self._messages[:] = [self.from_standard_message(m) for m in self._messages]
@@ -517,6 +515,9 @@ class AWSBedrockLLMService(LLMService):
 
     """
 
+    # Overriding the default adapter to use the AWS Bedrock one.
+    adapter_class = AWSBedrockLLMAdapter
+
     class InputParams(BaseModel):
         max_tokens: Optional[int] = Field(default_factory=lambda: 4096, ge=1)
         temperature: Optional[float] = Field(default_factory=lambda: 0.7, ge=0.0, le=1.0)
diff --git a/tests/test_function_calling_adapters.py b/tests/test_function_calling_adapters.py
index 5d6dafce3..83640bb80 100644
--- a/tests/test_function_calling_adapters.py
+++ b/tests/test_function_calling_adapters.py
@@ -11,6 +11,7 @@ from openai.types.chat import ChatCompletionToolParam
 from pipecat.adapters.schemas.function_schema import FunctionSchema
 from pipecat.adapters.schemas.tools_schema import AdapterType, ToolsSchema
 from pipecat.adapters.services.anthropic_adapter import AnthropicLLMAdapter
+from pipecat.adapters.services.bedrock_adapter import AWSBedrockLLMAdapter
 from pipecat.adapters.services.gemini_adapter import GeminiLLMAdapter
 from pipecat.adapters.services.open_ai_adapter import OpenAILLMAdapter
 from pipecat.adapters.services.open_ai_realtime_adapter import OpenAIRealtimeLLMAdapter
@@ -174,3 +175,32 @@ class TestFunctionAdapters(unittest.TestCase):
         tools_def = self.tools_def
         tools_def.custom_tools = {AdapterType.GEMINI: [search_tool]}
         assert GeminiLLMAdapter().to_provider_tools_format(tools_def) == expected
+
+    def test_bedrock_adapter(self):
+        """Test AWS Bedrock adapter format transformation."""
+        expected = [
+            {
+                "toolSpec": {
+                    "name": "get_weather",
+                    "description": "Get the weather in a given location",
+                    "inputSchema": {
+                        "json": {
+                            "type": "object",
+                            "properties": {
+                                "format": {
+                                    "type": "string",
+                                    "enum": ["celsius", "fahrenheit"],
+                                    "description": "The temperature unit to use.",
+                                },
+                                "location": {
+                                    "type": "string",
+                                    "description": "The city, e.g. San Francisco",
+                                },
+                            },
+                            "required": ["location", "format"],
+                        }
+                    },
+                }
+            }
+        ]
+        assert AWSBedrockLLMAdapter().to_provider_tools_format(self.tools_def) == expected

From 80ef6dc4dec7eb92cf430a8d885fb404e05b1b5b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= <aleix@daily.co>
Date: Tue, 6 May 2025 21:14:06 -0700
Subject: [PATCH 31/97] update README with AWS Bedrock and Transcribe

---
 README.md | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 47be9b6e1..ec3b0a791 100644
--- a/README.md
+++ b/README.md
@@ -49,18 +49,18 @@ You can connect to Pipecat from any platform using our official SDKs:
 
 ## 🧩 Available services
 
-| Category            | Services                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
-| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| Speech-to-Text      | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper)                                                                                                                                                                                                                                                      |
-| LLMs                | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
-| Text-to-Speech      | [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts)                                   |
-| Speech-to-Speech    | [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
-| Transport           | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
-| Video               | [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
-| Memory              | [mem0](https://docs.pipecat.ai/server/services/memory/mem0)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
-| Vision & Image      | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/fal), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
-| Audio Processing    | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [Noisereduce](https://docs.pipecat.ai/server/utilities/audio/noisereduce-filter)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
-| Analytics & Metrics | [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
+| Category            | Services                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
+|---------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| Speech-to-Text      | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [Parakeet (NVIDIA)](https://docs.pipecat.ai/server/services/stt/parakeet), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper)                                                                                                                                                                                                                                            |
+| LLMs                | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
+| Text-to-Speech      | [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [FastPitch (NVIDIA)](https://docs.pipecat.ai/server/services/tts/fastpitch), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts)                                                                               |
+| Speech-to-Speech    | [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
+| Transport           | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
+| Video               | [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
+| Memory              | [mem0](https://docs.pipecat.ai/server/services/memory/mem0)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
+| Vision & Image      | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/fal), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |
+| Audio Processing    | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [Noisereduce](https://docs.pipecat.ai/server/utilities/audio/noisereduce-filter)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            |
+| Analytics & Metrics | [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
 
 📚 [View full services documentation →](https://docs.pipecat.ai/server/services/supported-services)
 

From 5e5626f04fe59d57fdc4982bcdc7fa48467f7c4f Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Wed, 23 Apr 2025 11:40:36 -0400
Subject: [PATCH 32/97] [WIP] AWS Nova Sonic service

---
 examples/foundational/39-aws-nova-sonic.py    | 115 ++++++++++++++++++
 pyproject.toml                                |   2 +-
 .../services/aws_nova_sonic/__init__.py       |   1 +
 src/pipecat/services/aws_nova_sonic/aws.py    | 101 +++++++++++++++
 4 files changed, 218 insertions(+), 1 deletion(-)
 create mode 100644 examples/foundational/39-aws-nova-sonic.py
 create mode 100644 src/pipecat/services/aws_nova_sonic/__init__.py
 create mode 100644 src/pipecat/services/aws_nova_sonic/aws.py

diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py
new file mode 100644
index 000000000..33fbbe477
--- /dev/null
+++ b/examples/foundational/39-aws-nova-sonic.py
@@ -0,0 +1,115 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import os
+
+from dotenv import load_dotenv
+from loguru import logger
+
+from pipecat.audio.vad.silero import SileroVADAnalyzer
+from pipecat.audio.vad.vad_analyzer import VADParams
+from pipecat.frames.frames import LLMMessagesAppendFrame
+from pipecat.pipeline.pipeline import Pipeline
+from pipecat.pipeline.runner import PipelineRunner
+from pipecat.pipeline.task import PipelineParams, PipelineTask
+from pipecat.services.aws_nova_sonic import AWSNovaSonicService
+from pipecat.transports.base_transport import TransportParams
+from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
+from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
+
+# Load environment variables
+load_dotenv(override=True)
+
+
+async def run_bot(webrtc_connection: SmallWebRTCConnection):
+    logger.info(f"Starting bot")
+
+    # Initialize the SmallWebRTCTransport with the connection
+    transport = SmallWebRTCTransport(
+        webrtc_connection=webrtc_connection,
+        params=TransportParams(
+            audio_in_enabled=True,
+            audio_out_enabled=True,
+            camera_in_enabled=False,
+            vad_enabled=True,
+            vad_audio_passthrough=True,
+            # Set stop_secs to something roughly similar to the internal setting of the
+            # speech-to-speech API, just to align events. TODO: confirm for Nova Sonic.
+            vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
+        ),
+    )
+
+    # Create the AWS Nova Sonic LLM service
+    # TODO: system instruction
+    # system_instruction = f"""
+    # You are a helpful AI assistant.
+    # Your goal is to demonstrate your capabilities in a helpful and engaging way.
+    # Your output will be converted to audio so don't include special characters in your answers.
+    # Respond to what the user said in a creative and helpful way.
+    # """
+
+    llm = AWSNovaSonicService(
+        secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
+        access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
+        region=os.getenv("AWS_REGION"),
+    )
+
+    # Build the pipeline
+    pipeline = Pipeline(
+        [
+            transport.input(),
+            llm,
+            transport.output(),
+        ]
+    )
+
+    # Configure the pipeline task
+    task = PipelineTask(
+        pipeline,
+        params=PipelineParams(
+            allow_interruptions=True,
+            enable_metrics=True,
+            enable_usage_metrics=True,
+        ),
+    )
+
+    # Handle client connection event
+    @transport.event_handler("on_client_connected")
+    async def on_client_connected(transport, client):
+        logger.info(f"Client connected")
+        # Kick off the conversation.
+        await task.queue_frames(
+            [
+                LLMMessagesAppendFrame(
+                    messages=[
+                        {
+                            "role": "user",
+                            "content": f"Greet the user and introduce yourself.",
+                        }
+                    ]
+                )
+            ]
+        )
+
+    # Handle client disconnection events
+    @transport.event_handler("on_client_disconnected")
+    async def on_client_disconnected(transport, client):
+        logger.info(f"Client disconnected")
+
+    @transport.event_handler("on_client_closed")
+    async def on_client_closed(transport, client):
+        logger.info(f"Client closed connection")
+        await task.cancel()
+
+    # Run the pipeline
+    runner = PipelineRunner(handle_sigint=False)
+    await runner.run(task)
+
+
+if __name__ == "__main__":
+    from run import main
+
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index 13305933b..d6d05c00c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,7 +41,7 @@ Website = "https://pipecat.ai"
 [project.optional-dependencies]
 anthropic = [ "anthropic~=0.49.0" ]
 assemblyai = [ "assemblyai~=0.37.0" ]
-aws = [ "boto3~=1.37.16", "websockets~=13.1" ]
+aws = [ "boto3~=1.37.16", "websockets~=13.1", "aws_sdk_bedrock_runtime~=0.0.2" ]
 azure = [ "azure-cognitiveservices-speech~=1.42.0"]
 cartesia = [ "cartesia~=1.4.0", "websockets~=13.1" ]
 cerebras = []
diff --git a/src/pipecat/services/aws_nova_sonic/__init__.py b/src/pipecat/services/aws_nova_sonic/__init__.py
new file mode 100644
index 000000000..b5559715a
--- /dev/null
+++ b/src/pipecat/services/aws_nova_sonic/__init__.py
@@ -0,0 +1 @@
+from .aws import AWSNovaSonicService
diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
new file mode 100644
index 000000000..3caf16761
--- /dev/null
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -0,0 +1,101 @@
+from aws_sdk_bedrock_runtime.client import (
+    BedrockRuntimeClient,
+    InvokeModelWithBidirectionalStreamOperationInput,
+)
+from aws_sdk_bedrock_runtime.config import Config, HTTPAuthSchemeResolver, SigV4AuthScheme
+from aws_sdk_bedrock_runtime.models import (
+    BidirectionalInputPayloadPart,
+    InvokeModelWithBidirectionalStreamInput,
+    InvokeModelWithBidirectionalStreamInputChunk,
+    InvokeModelWithBidirectionalStreamOperationOutput,
+    InvokeModelWithBidirectionalStreamOutput,
+)
+from smithy_aws_core.credentials_resolvers.static import StaticCredentialsResolver
+from smithy_aws_core.identity import AWSCredentialsIdentity
+from smithy_core.aio.eventstream import DuplexEventStream
+
+from pipecat.frames.frames import CancelFrame, EndFrame, StartFrame
+from pipecat.services.llm_service import LLMService
+
+
+class AWSNovaSonicService(LLMService):
+    def __init__(
+        self,
+        *,
+        secret_access_key: str,
+        access_key_id: str,
+        region: str,
+        model: str = "amazon.nova-sonic-v1:0",
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self._secret_access_key = secret_access_key
+        self._access_key_id = access_key_id
+        self._region = region
+        self._model = model
+        self._client: BedrockRuntimeClient = None
+        self._stream: DuplexEventStream[
+            InvokeModelWithBidirectionalStreamInput,
+            InvokeModelWithBidirectionalStreamOutput,
+            InvokeModelWithBidirectionalStreamOperationOutput,
+        ] = None
+        self._receive_task = None
+
+    #
+    # standard AIService frame handling
+    #
+
+    async def start(self, frame: StartFrame):
+        await super().start(frame)
+        await self._connect()
+
+    async def stop(self, frame: EndFrame):
+        await super().stop(frame)
+        await self._disconnect()
+
+    async def cancel(self, frame: CancelFrame):
+        await super().cancel(frame)
+        await self._disconnect()
+
+    #
+    # communication
+    #
+
+    async def _connect(self):
+        if self._client:
+            # Here we assume that if we have a client we are connected.
+            return
+        self._initialize_client()
+        self._stream = await self._client.invoke_model_with_bidirectional_stream(
+            InvokeModelWithBidirectionalStreamOperationInput(model_id=self._model)
+        )
+        self._receive_task = self.create_task(self._receive_task_handler())
+        pass
+
+    async def _disconnect(self):
+        pass
+
+    def _initialize_client(self) -> BedrockRuntimeClient:
+        config = Config(
+            endpoint_uri=f"https://bedrock-runtime.{self._region}.amazonaws.com",
+            region=self._region,
+            aws_credentials_identity_resolver=StaticCredentialsResolver(
+                credentials=AWSCredentialsIdentity(
+                    access_key_id=self._access_key_id,
+                    secret_access_key=self._secret_access_key,
+                    # TODO: add additional stuff like aws_session_token
+                )
+            ),
+            http_auth_scheme_resolver=HTTPAuthSchemeResolver(),
+            http_auth_schemes={"aws.auth#sigv4": SigV4AuthScheme()},
+        )
+        self._client = BedrockRuntimeClient(config=config)
+
+    async def _send_client_event(self, event_json):
+        event = InvokeModelWithBidirectionalStreamInputChunk(
+            value=BidirectionalInputPayloadPart(bytes_=event_json.encode("utf-8"))
+        )
+        await self._stream.input_stream.send(event)
+
+    async def _receive_task_handler(self):
+        pass

From a9e395b3660f873732cc89886ab3846a368be406 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Wed, 23 Apr 2025 14:05:04 -0400
Subject: [PATCH 33/97] [WIP] AWS Nova Sonic service

---
 examples/foundational/39-aws-nova-sonic.py |  14 +-
 src/pipecat/services/aws_nova_sonic/aws.py | 201 +++++++++++++++++++--
 2 files changed, 194 insertions(+), 21 deletions(-)

diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py
index 33fbbe477..266680542 100644
--- a/examples/foundational/39-aws-nova-sonic.py
+++ b/examples/foundational/39-aws-nova-sonic.py
@@ -43,15 +43,15 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection):
     )
 
     # Create the AWS Nova Sonic LLM service
-    # TODO: system instruction
-    # system_instruction = f"""
-    # You are a helpful AI assistant.
-    # Your goal is to demonstrate your capabilities in a helpful and engaging way.
-    # Your output will be converted to audio so don't include special characters in your answers.
-    # Respond to what the user said in a creative and helpful way.
-    # """
+    system_instruction = f"""
+    You are a helpful AI assistant.
+    Your goal is to demonstrate your capabilities in a helpful and engaging way.
+    Your output will be converted to audio so don't include special characters in your answers.
+    Respond to what the user said in a creative and helpful way.
+    """
 
     llm = AWSNovaSonicService(
+        instruction=system_instruction,
         secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
         access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
         region=os.getenv("AWS_REGION"),
diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index 3caf16761..d94587879 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -1,3 +1,7 @@
+import base64
+import uuid
+from enum import Enum
+
 from aws_sdk_bedrock_runtime.client import (
     BedrockRuntimeClient,
     InvokeModelWithBidirectionalStreamOperationInput,
@@ -10,18 +14,26 @@ from aws_sdk_bedrock_runtime.models import (
     InvokeModelWithBidirectionalStreamOperationOutput,
     InvokeModelWithBidirectionalStreamOutput,
 )
+from loguru import logger
 from smithy_aws_core.credentials_resolvers.static import StaticCredentialsResolver
 from smithy_aws_core.identity import AWSCredentialsIdentity
 from smithy_core.aio.eventstream import DuplexEventStream
 
-from pipecat.frames.frames import CancelFrame, EndFrame, StartFrame
+from pipecat.frames.frames import CancelFrame, EndFrame, Frame, InputAudioRawFrame, StartFrame
+from pipecat.processors.frame_processor import FrameDirection
 from pipecat.services.llm_service import LLMService
 
 
+class Role(Enum):
+    SYSTEM = "SYSTEM"
+    USER = "USER"
+
+
 class AWSNovaSonicService(LLMService):
     def __init__(
         self,
         *,
+        instruction: str,
         secret_access_key: str,
         access_key_id: str,
         region: str,
@@ -29,6 +41,7 @@ class AWSNovaSonicService(LLMService):
         **kwargs,
     ):
         super().__init__(**kwargs)
+        self._instruction = instruction
         self._secret_access_key = secret_access_key
         self._access_key_id = access_key_id
         self._region = region
@@ -40,6 +53,8 @@ class AWSNovaSonicService(LLMService):
             InvokeModelWithBidirectionalStreamOperationOutput,
         ] = None
         self._receive_task = None
+        self._prompt_name = str(uuid.uuid4())
+        self._input_audio_content_name = str(uuid.uuid4())
 
     #
     # standard AIService frame handling
@@ -58,24 +73,54 @@ class AWSNovaSonicService(LLMService):
         await self._disconnect()
 
     #
-    # communication
+    # frame processing
+    #
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, InputAudioRawFrame):
+            # TODO: check if _audio_input_paused? what causes that?
+            await self._send_user_audio(frame)
+
+        await self.push_frame(frame, direction)
+
+    #
+    # communication with LLM
     #
 
     async def _connect(self):
-        if self._client:
-            # Here we assume that if we have a client we are connected.
-            return
-        self._initialize_client()
-        self._stream = await self._client.invoke_model_with_bidirectional_stream(
-            InvokeModelWithBidirectionalStreamOperationInput(model_id=self._model)
-        )
-        self._receive_task = self.create_task(self._receive_task_handler())
-        pass
+        try:
+            if self._client:
+                # Here we assume that if we have a client we are connected
+                return
+
+            # Create the client
+            self._client = self._create_client()
+
+            # Start the bidirectional stream
+            self._stream = await self._client.invoke_model_with_bidirectional_stream(
+                InvokeModelWithBidirectionalStreamOperationInput(model_id=self._model)
+            )
+
+            # Send session start events
+            await self._send_session_start()
+
+            # Send initial system instruction
+            await self._send_text(text=self._instruction, role=Role.SYSTEM)
+
+            # Start audio input
+            await self._send_audio_input_start()
+
+            self._receive_task = self.create_task(self._receive_task_handler())
+        except Exception as e:
+            logger.error(f"{self} initialization error: {e}")
+            self._client = None
 
     async def _disconnect(self):
         pass
 
-    def _initialize_client(self) -> BedrockRuntimeClient:
+    def _create_client(self) -> BedrockRuntimeClient:
         config = Config(
             endpoint_uri=f"https://bedrock-runtime.{self._region}.amazonaws.com",
             region=self._region,
@@ -89,9 +134,137 @@ class AWSNovaSonicService(LLMService):
             http_auth_scheme_resolver=HTTPAuthSchemeResolver(),
             http_auth_schemes={"aws.auth#sigv4": SigV4AuthScheme()},
         )
-        self._client = BedrockRuntimeClient(config=config)
+        return BedrockRuntimeClient(config=config)
 
-    async def _send_client_event(self, event_json):
+    # TODO: make params configurable?
+    async def _send_session_start(self):
+        session_start = """
+        {
+          "event": {
+            "sessionStart": {
+              "inferenceConfiguration": {
+                "maxTokens": 1024,
+                "topP": 0.9,
+                "temperature": 0.7
+              }
+            }
+          }
+        }
+        """
+        await self._send_client_event(session_start)
+
+        prompt_start = f'''
+        {{
+          "event": {{
+            "promptStart": {{
+              "promptName": "{self._prompt_name}",
+              "textOutputConfiguration": {{
+                "mediaType": "text/plain"
+              }},
+              "audioOutputConfiguration": {{
+                "mediaType": "audio/lpcm",
+                "sampleRateHertz": 24000,
+                "sampleSizeBits": 16,
+                "channelCount": 1,
+                "voiceId": "matthew",
+                "encoding": "base64",
+                "audioType": "SPEECH"
+              }}
+            }}
+          }}
+        }}
+        '''
+        await self._send_client_event(prompt_start)
+
+    async def _send_audio_input_start(self):
+        audio_content_start = f'''
+        {{
+            "event": {{
+                "contentStart": {{
+                    "promptName": "{self._prompt_name}",
+                    "contentName": "{self._input_audio_content_name}",
+                    "type": "AUDIO",
+                    "interactive": true,
+                    "role": "USER",
+                    "audioInputConfiguration": {{
+                        "mediaType": "audio/lpcm",
+                        "sampleRateHertz": 16000,
+                        "sampleSizeBits": 16,
+                        "channelCount": 1,
+                        "audioType": "SPEECH",
+                        "encoding": "base64"
+                    }}
+                }}
+            }}
+        }}
+        '''
+        await self._send_client_event(audio_content_start)
+
+    async def _send_text(self, text: str, role: Role):
+        content_name = str(uuid.uuid4())
+
+        text_content_start = f'''
+        {{
+            "event": {{
+                "contentStart": {{
+                    "promptName": "{self._prompt_name}",
+                    "contentName": "{content_name}",
+                    "type": "TEXT",
+                    "interactive": true,
+                    "role": "{role.value}",
+                    "textInputConfiguration": {{
+                        "mediaType": "text/plain"
+                    }}
+                }}
+            }}
+        }}
+        '''
+        await self._send_client_event(text_content_start)
+
+        text_input = f'''
+        {{
+            "event": {{
+                "textInput": {{
+                    "promptName": "{self._prompt_name}",
+                    "contentName": "{content_name}",
+                    "content": "{text}"
+                }}
+            }}
+        }}
+        '''
+        await self._send_client_event(text_input)
+
+        text_content_end = f'''
+        {{
+            "event": {{
+                "contentEnd": {{
+                    "promptName": "{self._prompt_name}",
+                    "contentName": "{content_name}"
+                }}
+            }}
+        }}
+        '''
+        await self._send_client_event(text_content_end)
+
+    async def _send_user_audio(self, frame: InputAudioRawFrame):
+        if not self._client:
+            return
+
+        blob = base64.b64encode(frame.audio)
+        audio_event = f'''
+        {{
+            "event": {{
+                "audioInput": {{
+                    "promptName": "{self._prompt_name}",
+                    "contentName": "{self._input_audio_content_name}",
+                    "content": "{blob.decode("utf-8")}"
+                }}
+            }}
+        }}
+        '''
+        await self._send_client_event(audio_event)
+
+    async def _send_client_event(self, event_json: str):
         event = InvokeModelWithBidirectionalStreamInputChunk(
             value=BidirectionalInputPayloadPart(bytes_=event_json.encode("utf-8"))
         )

From 6d30f441e83d2a676a3c1210d6cc5127f28bd70f Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Wed, 23 Apr 2025 15:00:04 -0400
Subject: [PATCH 34/97] [WIP] AWS Nova Sonic service

---
 src/pipecat/services/aws_nova_sonic/aws.py | 55 +++++++++++++++++++++-
 1 file changed, 53 insertions(+), 2 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index d94587879..8bd2437bd 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -1,4 +1,5 @@
 import base64
+import json
 import uuid
 from enum import Enum
 
@@ -19,7 +20,14 @@ from smithy_aws_core.credentials_resolvers.static import StaticCredentialsResolv
 from smithy_aws_core.identity import AWSCredentialsIdentity
 from smithy_core.aio.eventstream import DuplexEventStream
 
-from pipecat.frames.frames import CancelFrame, EndFrame, Frame, InputAudioRawFrame, StartFrame
+from pipecat.frames.frames import (
+    CancelFrame,
+    EndFrame,
+    Frame,
+    InputAudioRawFrame,
+    StartFrame,
+    TTSAudioRawFrame,
+)
 from pipecat.processors.frame_processor import FrameDirection
 from pipecat.services.llm_service import LLMService
 
@@ -91,6 +99,8 @@ class AWSNovaSonicService(LLMService):
 
     async def _connect(self):
         try:
+            # TODO: remove after debugging
+            logger.debug("[pk] started connecting!")
             if self._client:
                 # Here we assume that if we have a client we are connected
                 return
@@ -113,6 +123,8 @@ class AWSNovaSonicService(LLMService):
             await self._send_audio_input_start()
 
             self._receive_task = self.create_task(self._receive_task_handler())
+
+            logger.debug("[pk] finished connecting!")
         except Exception as e:
             logger.error(f"{self} initialization error: {e}")
             self._client = None
@@ -271,4 +283,43 @@ class AWSNovaSonicService(LLMService):
         await self._stream.input_stream.send(event)
 
     async def _receive_task_handler(self):
-        pass
+        try:
+            while self._client:
+                # TODO: remove after debugging
+                logger.debug(f"[pk] awaiting output from server...")
+
+                output = await self._stream.await_output()
+
+                # TODO: remove after debugging
+                logger.debug(f"[pk] got output from server: {result}")
+
+                result = await output[1].receive()
+
+                # TODO: remove after debugging
+                logger.debug(f"[pk] got result from server: {result}")
+
+                if result.value and result.value.bytes_:
+                    response_data = result.value.bytes_.decode("utf-8")
+                    json_data = json.loads(response_data)
+
+                # TODO: remove after debugging
+                logger.debug(f"[pk] got JSON from server: {json_data}")
+
+                if "audioOutput" in json_data["event"]:
+                    self._handle_audio_output_event(json_data["event"])
+        except Exception as e:
+            logger.error(f"{self} error processing responses: {e}")
+
+    async def _handle_audio_output_event(self, event):
+        # TODO: remove after debugging
+        logger.debug("[pk] got output audio!")
+        audio_content = event["audioOutput"]["content"]
+        audio = base64.b64decode(audio_content)
+        # TODO: how is _current_audio_response used?
+        # TODO: make sample rate + channels (used in multiple places) consts
+        frame = TTSAudioRawFrame(
+            audio=audio,
+            sample_rate=24000,
+            num_channels=1,
+        )
+        await self.push_frame(frame)

From 7668b27fc0ac7019355d9e206f00ff4128a38905 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Wed, 23 Apr 2025 17:44:07 -0400
Subject: [PATCH 35/97] [WIP] AWS Nova Sonic service

---
 examples/foundational/39-aws-nova-sonic.py | 17 +++++++++++------
 src/pipecat/services/aws_nova_sonic/aws.py |  2 +-
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py
index 266680542..fd7568d63 100644
--- a/examples/foundational/39-aws-nova-sonic.py
+++ b/examples/foundational/39-aws-nova-sonic.py
@@ -32,6 +32,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection):
         webrtc_connection=webrtc_connection,
         params=TransportParams(
             audio_in_enabled=True,
+            audio_in_sample_rate=16000,
             audio_out_enabled=True,
             camera_in_enabled=False,
             vad_enabled=True,
@@ -43,12 +44,16 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection):
     )
 
     # Create the AWS Nova Sonic LLM service
-    system_instruction = f"""
-    You are a helpful AI assistant.
-    Your goal is to demonstrate your capabilities in a helpful and engaging way.
-    Your output will be converted to audio so don't include special characters in your answers.
-    Respond to what the user said in a creative and helpful way.
-    """
+    # system_instruction = f"""
+    # You are a helpful AI assistant.
+    # Your goal is to demonstrate your capabilities in a helpful and engaging way.
+    # Your output will be converted to audio so don't include special characters in your answers.
+    # Respond to what the user said in a creative and helpful way.
+    # """
+    # TODO: looks like Nova Sonic can't handle new lines?
+    system_instruction = "You are a friendly assistant. The user and you will engage in a spoken dialog " \
+            "exchanging the transcripts of a natural real-time conversation. Keep your responses short, " \
+            "generally two or three sentences for chatty scenarios."
 
     llm = AWSNovaSonicService(
         instruction=system_instruction,
diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index 8bd2437bd..6cd953c3b 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -306,7 +306,7 @@ class AWSNovaSonicService(LLMService):
                 logger.debug(f"[pk] got JSON from server: {json_data}")
 
                 if "audioOutput" in json_data["event"]:
-                    self._handle_audio_output_event(json_data["event"])
+                    await self._handle_audio_output_event(json_data["event"])
         except Exception as e:
             logger.error(f"{self} error processing responses: {e}")
 

From d789334a60e4fc7b3aa5c1952bcdfdb5bc43bbea Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Thu, 24 Apr 2025 10:29:27 -0400
Subject: [PATCH 36/97] [WIP] AWS Nova Sonic service

---
 src/pipecat/services/aws_nova_sonic/aws.py | 23 ++--------------------
 1 file changed, 2 insertions(+), 21 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index 6cd953c3b..aff0be2d2 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -99,8 +99,6 @@ class AWSNovaSonicService(LLMService):
 
     async def _connect(self):
         try:
-            # TODO: remove after debugging
-            logger.debug("[pk] started connecting!")
             if self._client:
                 # Here we assume that if we have a client we are connected
                 return
@@ -123,8 +121,6 @@ class AWSNovaSonicService(LLMService):
             await self._send_audio_input_start()
 
             self._receive_task = self.create_task(self._receive_task_handler())
-
-            logger.debug("[pk] finished connecting!")
         except Exception as e:
             logger.error(f"{self} initialization error: {e}")
             self._client = None
@@ -285,35 +281,20 @@ class AWSNovaSonicService(LLMService):
     async def _receive_task_handler(self):
         try:
             while self._client:
-                # TODO: remove after debugging
-                logger.debug(f"[pk] awaiting output from server...")
-
                 output = await self._stream.await_output()
-
-                # TODO: remove after debugging
-                logger.debug(f"[pk] got output from server: {result}")
-
                 result = await output[1].receive()
 
-                # TODO: remove after debugging
-                logger.debug(f"[pk] got result from server: {result}")
-
                 if result.value and result.value.bytes_:
                     response_data = result.value.bytes_.decode("utf-8")
                     json_data = json.loads(response_data)
 
-                # TODO: remove after debugging
-                logger.debug(f"[pk] got JSON from server: {json_data}")
-
                 if "audioOutput" in json_data["event"]:
                     await self._handle_audio_output_event(json_data["event"])
         except Exception as e:
             logger.error(f"{self} error processing responses: {e}")
 
-    async def _handle_audio_output_event(self, event):
-        # TODO: remove after debugging
-        logger.debug("[pk] got output audio!")
-        audio_content = event["audioOutput"]["content"]
+    async def _handle_audio_output_event(self, event_json):
+        audio_content = event_json["audioOutput"]["content"]
         audio = base64.b64decode(audio_content)
         # TODO: how is _current_audio_response used?
         # TODO: make sample rate + channels (used in multiple places) consts

From 13569a5a5a634a7ff65041d08ce12c410dd721bf Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Thu, 24 Apr 2025 13:55:09 -0400
Subject: [PATCH 37/97] [WIP] AWS Nova Sonic service

---
 examples/foundational/39-aws-nova-sonic.py | 14 +++--
 src/pipecat/services/aws_nova_sonic/aws.py | 63 ++++++++++++++++++++--
 2 files changed, 71 insertions(+), 6 deletions(-)

diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py
index fd7568d63..fffaee686 100644
--- a/examples/foundational/39-aws-nova-sonic.py
+++ b/examples/foundational/39-aws-nova-sonic.py
@@ -9,6 +9,7 @@ import os
 from dotenv import load_dotenv
 from loguru import logger
 
+# import logging
 from pipecat.audio.vad.silero import SileroVADAnalyzer
 from pipecat.audio.vad.vad_analyzer import VADParams
 from pipecat.frames.frames import LLMMessagesAppendFrame
@@ -23,6 +24,11 @@ from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
 # Load environment variables
 load_dotenv(override=True)
 
+# logging.basicConfig(
+#     level=logging.DEBUG,
+#     format='%(asctime)s - %(levelname)s - %(message)s'
+# )
+
 
 async def run_bot(webrtc_connection: SmallWebRTCConnection):
     logger.info(f"Starting bot")
@@ -51,9 +57,11 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection):
     # Respond to what the user said in a creative and helpful way.
     # """
     # TODO: looks like Nova Sonic can't handle new lines?
-    system_instruction = "You are a friendly assistant. The user and you will engage in a spoken dialog " \
-            "exchanging the transcripts of a natural real-time conversation. Keep your responses short, " \
-            "generally two or three sentences for chatty scenarios."
+    system_instruction = (
+        "You are a friendly assistant. The user and you will engage in a spoken dialog "
+        "exchanging the transcripts of a natural real-time conversation. Keep your responses short, "
+        "generally two or three sentences for chatty scenarios."
+    )
 
     llm = AWSNovaSonicService(
         instruction=system_instruction,
diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index aff0be2d2..f8517023e 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -94,7 +94,7 @@ class AWSNovaSonicService(LLMService):
         await self.push_frame(frame, direction)
 
     #
-    # communication with LLM
+    # LLM communication: lifecycle
     #
 
     async def _connect(self):
@@ -144,6 +144,10 @@ class AWSNovaSonicService(LLMService):
         )
         return BedrockRuntimeClient(config=config)
 
+    #
+    # LLM communication: input events (pipecat -> LLM)
+    #
+
     # TODO: make params configurable?
     async def _send_session_start(self):
         session_start = """
@@ -278,6 +282,18 @@ class AWSNovaSonicService(LLMService):
         )
         await self._stream.input_stream.send(event)
 
+    #
+    # LLM communication: output events (LLM -> pipecat)
+    #
+
+    # Receive LLM responses ("completions").
+    # Each response contains up to four pieces of content, delivered sequentially:
+    # - User transcription
+    # - Tool use (optional)
+    # - Text response
+    # - Audio response
+    # Each piece of content is wrapped by "contentStart" and "contentEnd" events.
+    # Each overall response is wrapped by "completionStart" and "completionEnd" events.
     async def _receive_task_handler(self):
         try:
             while self._client:
@@ -288,13 +304,46 @@ class AWSNovaSonicService(LLMService):
                     response_data = result.value.bytes_.decode("utf-8")
                     json_data = json.loads(response_data)
 
-                if "audioOutput" in json_data["event"]:
-                    await self._handle_audio_output_event(json_data["event"])
+                if "event" in json_data:
+                    event_json = json_data["event"]
+                    if "completionStart" in event_json:
+                        # Handle the LLM response starting
+                        await self._handle_completion_start_event(event_json)
+                    elif "contentStart" in event_json:
+                        # Handle a piece of content starting
+                        await self._handle_content_start_event(event_json)
+                    elif "textOutput" in event_json:
+                        # Handle text output content
+                        await self._handle_text_output_event(event_json)
+                    elif "audioOutput" in event_json:
+                        # Handle audio output content
+                        await self._handle_audio_output_event(event_json)
+                    elif "contentEnd" in event_json:
+                        # Handle a piece of content ending
+                        await self._handle_content_end_event(event_json)
+                    elif "completionEnd" in event_json:
+                        # Handle the LLM response ending
+                        await self._handle_completion_end_event(event_json)
+
         except Exception as e:
             logger.error(f"{self} error processing responses: {e}")
 
+    async def _handle_completion_start_event(self, event_json):
+        print("[pk] completion start")
+
+    async def _handle_content_start_event(self, event_json):
+        content_start = event_json["contentStart"]
+        type = content_start["type"]
+        role = content_start["role"]
+        print(f"[pk] content start. type: {type}, role: {role}")
+
+    async def _handle_text_output_event(self, event_json):
+        text_content = event_json["textOutput"]["content"]
+        print(f"[pk] text output. content: {text_content}")
+
     async def _handle_audio_output_event(self, event_json):
         audio_content = event_json["audioOutput"]["content"]
+        print(f"[pk] audio output. content: {len(audio_content)}")
         audio = base64.b64decode(audio_content)
         # TODO: how is _current_audio_response used?
         # TODO: make sample rate + channels (used in multiple places) consts
@@ -304,3 +353,11 @@ class AWSNovaSonicService(LLMService):
             num_channels=1,
         )
         await self.push_frame(frame)
+
+    async def _handle_content_end_event(self, event_json):
+        content_end = event_json["contentEnd"]
+        type = content_end["type"]
+        print(f"[pk] content end. type: {type}")
+
+    async def _handle_completion_end_event(self, event_json):
+        print("[pk] completion end")

From 8cbad070ade59d21df5f0c109545af197376f1cd Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Thu, 24 Apr 2025 14:21:43 -0400
Subject: [PATCH 38/97] [WIP] AWS Nova Sonic service

---
 src/pipecat/services/aws_nova_sonic/aws.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index f8517023e..f28fe4360 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -89,7 +89,7 @@ class AWSNovaSonicService(LLMService):
 
         if isinstance(frame, InputAudioRawFrame):
             # TODO: check if _audio_input_paused? what causes that?
-            await self._send_user_audio(frame)
+            await self._send_user_audio_event(frame)
 
         await self.push_frame(frame, direction)
 
@@ -112,13 +112,13 @@ class AWSNovaSonicService(LLMService):
             )
 
             # Send session start events
-            await self._send_session_start()
+            await self._send_session_start_event()
 
             # Send initial system instruction
-            await self._send_text(text=self._instruction, role=Role.SYSTEM)
+            await self._send_text_event(text=self._instruction, role=Role.SYSTEM)
 
             # Start audio input
-            await self._send_audio_input_start()
+            await self._send_audio_input_start_event()
 
             self._receive_task = self.create_task(self._receive_task_handler())
         except Exception as e:
@@ -149,7 +149,7 @@ class AWSNovaSonicService(LLMService):
     #
 
     # TODO: make params configurable?
-    async def _send_session_start(self):
+    async def _send_session_start_event(self):
         session_start = """
         {
           "event": {
@@ -188,7 +188,7 @@ class AWSNovaSonicService(LLMService):
         '''
         await self._send_client_event(prompt_start)
 
-    async def _send_audio_input_start(self):
+    async def _send_audio_input_start_event(self):
         audio_content_start = f'''
         {{
             "event": {{
@@ -212,7 +212,7 @@ class AWSNovaSonicService(LLMService):
         '''
         await self._send_client_event(audio_content_start)
 
-    async def _send_text(self, text: str, role: Role):
+    async def _send_text_event(self, text: str, role: Role):
         content_name = str(uuid.uuid4())
 
         text_content_start = f'''
@@ -258,7 +258,7 @@ class AWSNovaSonicService(LLMService):
         '''
         await self._send_client_event(text_content_end)
 
-    async def _send_user_audio(self, frame: InputAudioRawFrame):
+    async def _send_user_audio_event(self, frame: InputAudioRawFrame):
         if not self._client:
             return
 
@@ -357,7 +357,8 @@ class AWSNovaSonicService(LLMService):
     async def _handle_content_end_event(self, event_json):
         content_end = event_json["contentEnd"]
         type = content_end["type"]
-        print(f"[pk] content end. type: {type}")
+        stop_reason = content_end["stopReason"]
+        print(f"[pk] content end. type: {type}, stop_reason: {stop_reason}")
 
     async def _handle_completion_end_event(self, event_json):
         print("[pk] completion end")

From b1d413b9be63779766aaa164114cdbc69f010cb1 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Thu, 24 Apr 2025 15:23:01 -0400
Subject: [PATCH 39/97] [WIP] AWS Nova Sonic service

---
 src/pipecat/services/aws_nova_sonic/aws.py | 28 +++++++++++++++-------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index f28fe4360..c271f49c5 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -286,14 +286,18 @@ class AWSNovaSonicService(LLMService):
     # LLM communication: output events (LLM -> pipecat)
     #
 
-    # Receive LLM responses ("completions").
-    # Each response contains up to four pieces of content, delivered sequentially:
-    # - User transcription
-    # - Tool use (optional)
-    # - Text response
-    # - Audio response
-    # Each piece of content is wrapped by "contentStart" and "contentEnd" events.
-    # Each overall response is wrapped by "completionStart" and "completionEnd" events.
+    # Receive the ongoing LLM "completion".
+    # There is generally a single completion per session.
+    # In a completion, a few different kinds of content can be delivered:
+    # - Transcription of user audio
+    # - Tool use
+    # - Text preview of planned response speech before audio delivered
+    # - User interruption notification
+    # - Text of response speech whose audio was actually delivered
+    # - Audio of response speech
+    # Each piece of content is wrapped by "contentStart" and "contentEnd" events. The content is
+    # delivered sequentially: one piece of content will end before another starts.
+    # The overall completion is wrapped by "completionStart" and "completionEnd" events.
     async def _receive_task_handler(self):
         try:
             while self._client:
@@ -335,7 +339,13 @@ class AWSNovaSonicService(LLMService):
         content_start = event_json["contentStart"]
         type = content_start["type"]
         role = content_start["role"]
-        print(f"[pk] content start. type: {type}, role: {role}")
+        generation_stage = None
+        if "additionalModelFields" in content_start:
+            additional_model_fields = json.loads(content_start["additionalModelFields"])
+            generation_stage = additional_model_fields.get("generationStage")
+        print(
+            f"[pk] content start. type: {type}, role: {role}, generation_stage: {generation_stage}"
+        )
 
     async def _handle_text_output_event(self, event_json):
         text_content = event_json["textOutput"]["content"]

From e40aa4f99a5b3e95fb8546c8fbe7b59749e64164 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Thu, 24 Apr 2025 15:56:28 -0400
Subject: [PATCH 40/97] [WIP] AWS Nova Sonic service - added TTSStartedFrame
 and TTSStoppedFrame

---
 src/pipecat/services/aws_nova_sonic/aws.py | 34 +++++++++++++++++-----
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index c271f49c5..2e875f96f 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -27,6 +27,8 @@ from pipecat.frames.frames import (
     InputAudioRawFrame,
     StartFrame,
     TTSAudioRawFrame,
+    TTSStartedFrame,
+    TTSStoppedFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection
 from pipecat.services.llm_service import LLMService
@@ -63,6 +65,7 @@ class AWSNovaSonicService(LLMService):
         self._receive_task = None
         self._prompt_name = str(uuid.uuid4())
         self._input_audio_content_name = str(uuid.uuid4())
+        self._audio_response_ongoing = False
 
     #
     # standard AIService frame handling
@@ -333,7 +336,8 @@ class AWSNovaSonicService(LLMService):
             logger.error(f"{self} error processing responses: {e}")
 
     async def _handle_completion_start_event(self, event_json):
-        print("[pk] completion start")
+        # print("[pk] completion start")
+        pass
 
     async def _handle_content_start_event(self, event_json):
         content_start = event_json["contentStart"]
@@ -343,19 +347,26 @@ class AWSNovaSonicService(LLMService):
         if "additionalModelFields" in content_start:
             additional_model_fields = json.loads(content_start["additionalModelFields"])
             generation_stage = additional_model_fields.get("generationStage")
-        print(
-            f"[pk] content start. type: {type}, role: {role}, generation_stage: {generation_stage}"
-        )
+        # print(
+        #     f"[pk] content start. type: {type}, role: {role}, generation_stage: {generation_stage}"
+        # )
 
     async def _handle_text_output_event(self, event_json):
         text_content = event_json["textOutput"]["content"]
-        print(f"[pk] text output. content: {text_content}")
+        # print(f"[pk] text output. content: {text_content}")
 
     async def _handle_audio_output_event(self, event_json):
         audio_content = event_json["audioOutput"]["content"]
         print(f"[pk] audio output. content: {len(audio_content)}")
+
+        # Report that *equivalent* of TTS (this is a speech-to-speech model) started
+        if not self._audio_response_ongoing:
+            self._audio_response_ongoing = True
+            # print("[pk] starting TTS")
+            await self.push_frame(TTSStartedFrame())
+
+        # Push audio frame
         audio = base64.b64decode(audio_content)
-        # TODO: how is _current_audio_response used?
         # TODO: make sample rate + channels (used in multiple places) consts
         frame = TTSAudioRawFrame(
             audio=audio,
@@ -368,7 +379,14 @@ class AWSNovaSonicService(LLMService):
         content_end = event_json["contentEnd"]
         type = content_end["type"]
         stop_reason = content_end["stopReason"]
-        print(f"[pk] content end. type: {type}, stop_reason: {stop_reason}")
+        # print(f"[pk] content end. type: {type}, stop_reason: {stop_reason}")
+
+        # Report that *equivalent* of TTS (this is a speech-to-speech model) stopped
+        if type == "AUDIO" and self._audio_response_ongoing:
+            print("[pk] stopping TTS")
+            self._audio_response_ongoing = False
+            await self.push_frame(TTSStoppedFrame())
 
     async def _handle_completion_end_event(self, event_json):
-        print("[pk] completion end")
+        # print("[pk] completion end")
+        pass

From de294caed9aaf2d3b37d6c2f0ec0ba7382b9d3f7 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Fri, 25 Apr 2025 15:12:37 -0400
Subject: [PATCH 41/97] [WIP] AWS Nova Sonic service - added
 LLMFullResponseStartFrame, LLMTextFrame, and LLMFullResponseEndFrame

---
 src/pipecat/services/aws_nova_sonic/aws.py | 124 +++++++++++++++++----
 1 file changed, 102 insertions(+), 22 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index 2e875f96f..a2437b9dd 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -1,6 +1,7 @@
 import base64
 import json
 import uuid
+from dataclasses import dataclass
 from enum import Enum
 
 from aws_sdk_bedrock_runtime.client import (
@@ -25,6 +26,9 @@ from pipecat.frames.frames import (
     EndFrame,
     Frame,
     InputAudioRawFrame,
+    LLMFullResponseEndFrame,
+    LLMFullResponseStartFrame,
+    LLMTextFrame,
     StartFrame,
     TTSAudioRawFrame,
     TTSStartedFrame,
@@ -37,6 +41,36 @@ from pipecat.services.llm_service import LLMService
 class Role(Enum):
     SYSTEM = "SYSTEM"
     USER = "USER"
+    ASSISTANT = "ASSISTANT"
+    TOOL = "TOOL"
+
+
+class ContentType(Enum):
+    AUDIO = "AUDIO"
+    TEXT = "TEXT"
+    TOOL = "TOOL"
+
+
+class TextStage(Enum):
+    FINAL = "FINAL"  # what has been said
+    SPECULATIVE = "SPECULATIVE"  # what's planned to be said
+
+
+@dataclass
+class CurrentContent:
+    type: ContentType
+    role: Role
+    text_stage: TextStage  # None if not text
+    text_content: str # starts as None, then fills in if text
+
+    def __str__(self):
+        return (
+            f"CurrentContent(\n"
+            f"  type={self.type.name},\n"
+            f"  role={self.role.name},\n"
+            f"  text_stage={self.text_stage.name if self.text_stage else 'None'}\n"
+            f")"
+        )
 
 
 class AWSNovaSonicService(LLMService):
@@ -65,7 +99,8 @@ class AWSNovaSonicService(LLMService):
         self._receive_task = None
         self._prompt_name = str(uuid.uuid4())
         self._input_audio_content_name = str(uuid.uuid4())
-        self._audio_response_ongoing = False
+        self._content_being_received = None  # TODO: clean this up on error or when finished
+        self._assistant_is_responding = False
 
     #
     # standard AIService frame handling
@@ -314,7 +349,7 @@ class AWSNovaSonicService(LLMService):
                 if "event" in json_data:
                     event_json = json_data["event"]
                     if "completionStart" in event_json:
-                        # Handle the LLM response starting
+                        # Handle the LLM completion starting
                         await self._handle_completion_start_event(event_json)
                     elif "contentStart" in event_json:
                         # Handle a piece of content starting
@@ -329,7 +364,7 @@ class AWSNovaSonicService(LLMService):
                         # Handle a piece of content ending
                         await self._handle_content_end_event(event_json)
                     elif "completionStart" in event_json:
-                        # Handle the LLM response ending
+                        # Handle the LLM completion ending
                         await self._handle_completion_end_event(event_json)
 
         except Exception as e:
@@ -347,24 +382,35 @@ class AWSNovaSonicService(LLMService):
         if "additionalModelFields" in content_start:
             additional_model_fields = json.loads(content_start["additionalModelFields"])
             generation_stage = additional_model_fields.get("generationStage")
-        # print(
-        #     f"[pk] content start. type: {type}, role: {role}, generation_stage: {generation_stage}"
-        # )
+
+        # Bookkeeping: track current content being received
+        content = CurrentContent(
+            type=ContentType(type),
+            role=Role(role),
+            text_stage=TextStage(generation_stage) if generation_stage else None,
+            text_content=None
+        )
+        self._content_being_received = content
+
+        if content.role == Role.ASSISTANT:
+            if content.type == ContentType.AUDIO:
+                # Report that *equivalent* of TTS (this is a speech-to-speech model) started
+                # print("[pk] TTS started")
+                await self.push_frame(TTSStartedFrame())            
+
+        print(f"[pk] content start: {self._content_being_received}")
 
     async def _handle_text_output_event(self, event_json):
         text_content = event_json["textOutput"]["content"]
-        # print(f"[pk] text output. content: {text_content}")
+        print(f"[pk] text output. content: {text_content}")
+
+        # Bookkeeping: augment the current content being received with text
+        content = self._content_being_received
+        content.text_content = text_content
 
     async def _handle_audio_output_event(self, event_json):
         audio_content = event_json["audioOutput"]["content"]
-        print(f"[pk] audio output. content: {len(audio_content)}")
-
-        # Report that *equivalent* of TTS (this is a speech-to-speech model) started
-        if not self._audio_response_ongoing:
-            self._audio_response_ongoing = True
-            # print("[pk] starting TTS")
-            await self.push_frame(TTSStartedFrame())
-
+        # print(f"[pk] audio output. content: {len(audio_content)}")
         # Push audio frame
         audio = base64.b64decode(audio_content)
         # TODO: make sample rate + channels (used in multiple places) consts
@@ -377,15 +423,49 @@ class AWSNovaSonicService(LLMService):
 
     async def _handle_content_end_event(self, event_json):
         content_end = event_json["contentEnd"]
-        type = content_end["type"]
         stop_reason = content_end["stopReason"]
-        # print(f"[pk] content end. type: {type}, stop_reason: {stop_reason}")
+        # print(
+        #     f"[pk] content end: {self._content_being_received}.\n"
+        #     f"  stop_reason: {stop_reason}"
+        # )
 
-        # Report that *equivalent* of TTS (this is a speech-to-speech model) stopped
-        if type == "AUDIO" and self._audio_response_ongoing:
-            print("[pk] stopping TTS")
-            self._audio_response_ongoing = False
-            await self.push_frame(TTSStoppedFrame())
+        # Bookkeeping: clear current content being received
+        content = self._content_being_received
+        self._content_being_received = None
+
+        if content and content.role == Role.ASSISTANT:
+            if content.type == ContentType.AUDIO:
+                # We got to the end of a chunk of the assistant's audio.
+                # Report that *equivalent* of TTS (this is a speech-to-speech model) stopped.
+                # print("[pk] TTS stopped")
+                await self.push_frame(TTSStoppedFrame())
+            elif content.type == ContentType.TEXT:
+                # Ignore non-final text, and the "interrupted" message (which isn't meaningful text)
+                if content.text_stage == TextStage.FINAL and stop_reason != "INTERRUPTED":
+                    # TODO: the way we're tracking the start and stop of the assistant response here
+                    # is rather busted, and results in way too many "responses" being put into the 
+                    # context (every final text content block is treated as its own response).
+                    # We *should* only record that an assistant response has ended when:
+                    # - the assistant truly finished its turn (stop_reason is END_TURN)
+                    # - when this is the next text content block after an INTERRUPTED has occurred
+                    # BUT it seems like there's a bug where, if there are multiple assistant text 
+                    # content blocks, the *first* one gets marked END_TURN rather than the last.
+                    print("[pk] LLM full response started")
+                    self._assistant_is_responding = True
+                    await self.push_frame(LLMFullResponseStartFrame())
+
+                    if self._assistant_is_responding:
+                        # Add text to the ongoing reported assistant response
+                        print(f"[pk] LLM text: {content.text_content}")
+                        await self.push_frame(LLMTextFrame(content.text_content))
+
+                        # Report that the assistant has finished their response.
+                        # TODO: kinda busted. see TODO comment above.
+                        print("[pk] LLM full response ended")
+                        await self.push_frame(LLMFullResponseEndFrame())
+                        self._assistant_is_responding = False
+
+        self._content_being_received = False
 
     async def _handle_completion_end_event(self, event_json):
         # print("[pk] completion end")

From 260f7c9b85f9c38b6d00dfb79c8f6ebbe021dcbf Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Fri, 25 Apr 2025 15:19:45 -0400
Subject: [PATCH 42/97] [WIP] AWS Nova Sonic service - format

---
 src/pipecat/services/aws_nova_sonic/aws.py | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index a2437b9dd..e9ce2013d 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -61,7 +61,7 @@ class CurrentContent:
     type: ContentType
     role: Role
     text_stage: TextStage  # None if not text
-    text_content: str # starts as None, then fills in if text
+    text_content: str  # starts as None, then fills in if text
 
     def __str__(self):
         return (
@@ -388,7 +388,7 @@ class AWSNovaSonicService(LLMService):
             type=ContentType(type),
             role=Role(role),
             text_stage=TextStage(generation_stage) if generation_stage else None,
-            text_content=None
+            text_content=None,
         )
         self._content_being_received = content
 
@@ -396,7 +396,7 @@ class AWSNovaSonicService(LLMService):
             if content.type == ContentType.AUDIO:
                 # Report that *equivalent* of TTS (this is a speech-to-speech model) started
                 # print("[pk] TTS started")
-                await self.push_frame(TTSStartedFrame())            
+                await self.push_frame(TTSStartedFrame())
 
         print(f"[pk] content start: {self._content_being_received}")
 
@@ -424,10 +424,7 @@ class AWSNovaSonicService(LLMService):
     async def _handle_content_end_event(self, event_json):
         content_end = event_json["contentEnd"]
         stop_reason = content_end["stopReason"]
-        # print(
-        #     f"[pk] content end: {self._content_being_received}.\n"
-        #     f"  stop_reason: {stop_reason}"
-        # )
+        print(f"[pk] content end: {self._content_being_received}.\n  stop_reason: {stop_reason}")
 
         # Bookkeeping: clear current content being received
         content = self._content_being_received
@@ -443,25 +440,25 @@ class AWSNovaSonicService(LLMService):
                 # Ignore non-final text, and the "interrupted" message (which isn't meaningful text)
                 if content.text_stage == TextStage.FINAL and stop_reason != "INTERRUPTED":
                     # TODO: the way we're tracking the start and stop of the assistant response here
-                    # is rather busted, and results in way too many "responses" being put into the 
+                    # is rather busted, and results in way too many "responses" being put into the
                     # context (every final text content block is treated as its own response).
                     # We *should* only record that an assistant response has ended when:
                     # - the assistant truly finished its turn (stop_reason is END_TURN)
                     # - when this is the next text content block after an INTERRUPTED has occurred
-                    # BUT it seems like there's a bug where, if there are multiple assistant text 
+                    # BUT it seems like there's a bug where, if there are multiple assistant text
                     # content blocks, the *first* one gets marked END_TURN rather than the last.
-                    print("[pk] LLM full response started")
+                    # print("[pk] LLM full response started")
                     self._assistant_is_responding = True
                     await self.push_frame(LLMFullResponseStartFrame())
 
                     if self._assistant_is_responding:
                         # Add text to the ongoing reported assistant response
-                        print(f"[pk] LLM text: {content.text_content}")
+                        # print(f"[pk] LLM text: {content.text_content}")
                         await self.push_frame(LLMTextFrame(content.text_content))
 
                         # Report that the assistant has finished their response.
                         # TODO: kinda busted. see TODO comment above.
-                        print("[pk] LLM full response ended")
+                        # print("[pk] LLM full response ended")
                         await self.push_frame(LLMFullResponseEndFrame())
                         self._assistant_is_responding = False
 

From a38206de9caf91d29364c72a879b303e76bad51d Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Fri, 25 Apr 2025 15:33:45 -0400
Subject: [PATCH 43/97] [WIP] AWS Nova Sonic service - added TranscriptionFrame

---
 src/pipecat/services/aws_nova_sonic/aws.py | 41 ++++++++++++++++++----
 1 file changed, 34 insertions(+), 7 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index e9ce2013d..5ded76b81 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -30,12 +30,14 @@ from pipecat.frames.frames import (
     LLMFullResponseStartFrame,
     LLMTextFrame,
     StartFrame,
+    TranscriptionFrame,
     TTSAudioRawFrame,
     TTSStartedFrame,
     TTSStoppedFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection
 from pipecat.services.llm_service import LLMService
+from pipecat.utils.time import time_now_iso8601
 
 
 class Role(Enum):
@@ -398,19 +400,30 @@ class AWSNovaSonicService(LLMService):
                 # print("[pk] TTS started")
                 await self.push_frame(TTSStartedFrame())
 
-        print(f"[pk] content start: {self._content_being_received}")
+        # print(f"[pk] content start: {self._content_being_received}")
 
     async def _handle_text_output_event(self, event_json):
+        # This should never happen
+        if not self._content_being_received:
+            return
+
         text_content = event_json["textOutput"]["content"]
-        print(f"[pk] text output. content: {text_content}")
+        # print(f"[pk] text output. content: {text_content}")
 
         # Bookkeeping: augment the current content being received with text
+        # Assumption: only one text content per content block
         content = self._content_being_received
         content.text_content = text_content
 
     async def _handle_audio_output_event(self, event_json):
+        # This should never happen
+        if not self._content_being_received:
+            return
+
+        # Get audio
         audio_content = event_json["audioOutput"]["content"]
         # print(f"[pk] audio output. content: {len(audio_content)}")
+
         # Push audio frame
         audio = base64.b64decode(audio_content)
         # TODO: make sample rate + channels (used in multiple places) consts
@@ -422,15 +435,19 @@ class AWSNovaSonicService(LLMService):
         await self.push_frame(frame)
 
     async def _handle_content_end_event(self, event_json):
+        # This should never happen
+        if not self._content_being_received:
+            return
+
         content_end = event_json["contentEnd"]
         stop_reason = content_end["stopReason"]
-        print(f"[pk] content end: {self._content_being_received}.\n  stop_reason: {stop_reason}")
+        # print(f"[pk] content end: {self._content_being_received}.\n  stop_reason: {stop_reason}")
 
         # Bookkeeping: clear current content being received
         content = self._content_being_received
         self._content_being_received = None
 
-        if content and content.role == Role.ASSISTANT:
+        if content.role == Role.ASSISTANT:
             if content.type == ContentType.AUDIO:
                 # We got to the end of a chunk of the assistant's audio.
                 # Report that *equivalent* of TTS (this is a speech-to-speech model) stopped.
@@ -447,20 +464,30 @@ class AWSNovaSonicService(LLMService):
                     # - when this is the next text content block after an INTERRUPTED has occurred
                     # BUT it seems like there's a bug where, if there are multiple assistant text
                     # content blocks, the *first* one gets marked END_TURN rather than the last.
-                    # print("[pk] LLM full response started")
+                    print("[pk] LLM full response started")
                     self._assistant_is_responding = True
                     await self.push_frame(LLMFullResponseStartFrame())
 
                     if self._assistant_is_responding:
                         # Add text to the ongoing reported assistant response
-                        # print(f"[pk] LLM text: {content.text_content}")
+                        print(f"[pk] LLM text: {content.text_content}")
                         await self.push_frame(LLMTextFrame(content.text_content))
 
                         # Report that the assistant has finished their response.
                         # TODO: kinda busted. see TODO comment above.
-                        # print("[pk] LLM full response ended")
+                        print("[pk] LLM full response ended")
                         await self.push_frame(LLMFullResponseEndFrame())
                         self._assistant_is_responding = False
+        elif content.role == Role.USER:
+            if content.type == ContentType.TEXT:
+                if content.text_stage == TextStage.FINAL:
+                    # Report a bit of user transcription
+                    print(f"[pk] transcription: {content.text_content}")
+                    await self.push_frame(
+                        TranscriptionFrame(
+                            text=content.text_content, user_id="", timestamp=time_now_iso8601()
+                        )
+                    )
 
         self._content_being_received = False
 

From 0c255d26183d988c82111e007807f19d89105111 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Fri, 25 Apr 2025 16:49:59 -0400
Subject: [PATCH 44/97] [WIP] AWS Nova Sonic service - added TTSTextFrame and
 reworked/cleaned up some bookkeeping logic

---
 src/pipecat/services/aws_nova_sonic/aws.py | 101 +++++++++++++--------
 1 file changed, 65 insertions(+), 36 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index 5ded76b81..4e5619f52 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -34,6 +34,7 @@ from pipecat.frames.frames import (
     TTSAudioRawFrame,
     TTSStartedFrame,
     TTSStoppedFrame,
+    TTSTextFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection
 from pipecat.services.llm_service import LLMService
@@ -394,13 +395,14 @@ class AWSNovaSonicService(LLMService):
         )
         self._content_being_received = content
 
+        # print(f"[pk] content start: {self._content_being_received}")
+
         if content.role == Role.ASSISTANT:
             if content.type == ContentType.AUDIO:
-                # Report that *equivalent* of TTS (this is a speech-to-speech model) started
-                # print("[pk] TTS started")
-                await self.push_frame(TTSStartedFrame())
-
-        # print(f"[pk] content start: {self._content_being_received}")
+                if not self._assistant_is_responding:
+                    # The assistant has started responding.
+                    self._assistant_is_responding = True
+                    await self._report_assistant_started_responding()
 
     async def _handle_text_output_event(self, event_json):
         # This should never happen
@@ -448,49 +450,76 @@ class AWSNovaSonicService(LLMService):
         self._content_being_received = None
 
         if content.role == Role.ASSISTANT:
-            if content.type == ContentType.AUDIO:
-                # We got to the end of a chunk of the assistant's audio.
-                # Report that *equivalent* of TTS (this is a speech-to-speech model) stopped.
-                # print("[pk] TTS stopped")
-                await self.push_frame(TTSStoppedFrame())
-            elif content.type == ContentType.TEXT:
+            if content.type == ContentType.TEXT:
                 # Ignore non-final text, and the "interrupted" message (which isn't meaningful text)
                 if content.text_stage == TextStage.FINAL and stop_reason != "INTERRUPTED":
-                    # TODO: the way we're tracking the start and stop of the assistant response here
-                    # is rather busted, and results in way too many "responses" being put into the
-                    # context (every final text content block is treated as its own response).
-                    # We *should* only record that an assistant response has ended when:
-                    # - the assistant truly finished its turn (stop_reason is END_TURN)
-                    # - when this is the next text content block after an INTERRUPTED has occurred
-                    # BUT it seems like there's a bug where, if there are multiple assistant text
-                    # content blocks, the *first* one gets marked END_TURN rather than the last.
-                    print("[pk] LLM full response started")
-                    self._assistant_is_responding = True
-                    await self.push_frame(LLMFullResponseStartFrame())
+                    # TODO: shoot, for now we may need to "restart" the assistant responding because
+                    # every FINAL text block has to be treated as its own response. See below TODO
+                    # for more information.
+                    if not self._assistant_is_responding:
+                        self._assistant_is_responding = True
+                        await self._report_assistant_started_responding()
 
                     if self._assistant_is_responding:
-                        # Add text to the ongoing reported assistant response
-                        print(f"[pk] LLM text: {content.text_content}")
-                        await self.push_frame(LLMTextFrame(content.text_content))
+                        # Text added to the ongoing assistant response
+                        await self._report_assistant_response_text_added(content.text_content)
 
-                        # Report that the assistant has finished their response.
-                        # TODO: kinda busted. see TODO comment above.
-                        print("[pk] LLM full response ended")
-                        await self.push_frame(LLMFullResponseEndFrame())
+                        # Consider the assistant finished with their response.
+                        # TODO: the way we're tracking the start/stop of the assistant response
+                        # is rather busted, and results in way too many "responses" being put into
+                        # the context (every FINAL text content block is treated as its own
+                        # response). We *should* only record that an assistant response has ended
+                        # when:
+                        # - the assistant truly finished its turn (stop_reason is END_TURN)
+                        # - when the assistant has been interrupted, and outputs what's actually
+                        #   been said
+                        # BUT it seems like there's a bug where, if there are multiple assistant
+                        # text content blocks, the *first* one gets marked END_TURN rather than the
+                        # last. It's similarly unclear how to determine what the last text content
+                        # block will be after an interruption.
                         self._assistant_is_responding = False
+                        await self._report_assistant_stopped_responding()
         elif content.role == Role.USER:
             if content.type == ContentType.TEXT:
                 if content.text_stage == TextStage.FINAL:
-                    # Report a bit of user transcription
-                    print(f"[pk] transcription: {content.text_content}")
-                    await self.push_frame(
-                        TranscriptionFrame(
-                            text=content.text_content, user_id="", timestamp=time_now_iso8601()
-                        )
-                    )
+                    # User transcription text added
+                    await self._report_user_transcription_text_added(content.text_content)
 
         self._content_being_received = False
 
     async def _handle_completion_end_event(self, event_json):
         # print("[pk] completion end")
         pass
+
+    async def _report_assistant_started_responding(self):
+        # Report that the assistant has started their response.
+        print("[pk] LLM full response started")
+        await self.push_frame(LLMFullResponseStartFrame())
+
+        # Report that equivalent of TTS (this is a speech-to-speech model) started
+        print("[pk] TTS started")
+        await self.push_frame(TTSStartedFrame())
+
+    async def _report_assistant_response_text_added(self, text):
+        # Report some text added to the ongoing assistant response
+        print(f"[pk] LLM text: {text}")
+        await self.push_frame(LLMTextFrame(text))
+
+        # Report some text added to the *equivalent* of TTS (this is a speech-to-speech model)
+        print(f"[pk] TTS text: {text}")
+        await self.push_frame(TTSTextFrame(text))
+
+    async def _report_assistant_stopped_responding(self):
+        # Report that the assistant has finished their response.
+        print("[pk] LLM full response ended")
+        await self.push_frame(LLMFullResponseEndFrame())
+
+        # Report that equivalent of TTS (this is a speech-to-speech model) stopped.
+        print("[pk] TTS stopped")
+        await self.push_frame(TTSStoppedFrame())
+
+    async def _report_user_transcription_text_added(self, text):
+        print(f"[pk] transcription: {text}")
+        await self.push_frame(
+            TranscriptionFrame(text=text, user_id="", timestamp=time_now_iso8601())
+        )

From 1f9baefba8bab6a13c46ca1af129544f520e2a5a Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Sun, 27 Apr 2025 06:50:28 -0400
Subject: [PATCH 45/97] [WIP] AWS Nova Sonic service - added stubs for handling
 interruption and user-started-speaking frames

---
 src/pipecat/services/aws_nova_sonic/aws.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index 4e5619f52..737bc82fb 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -30,11 +30,15 @@ from pipecat.frames.frames import (
     LLMFullResponseStartFrame,
     LLMTextFrame,
     StartFrame,
+    StartInterruptionFrame,
+    StopInterruptionFrame,
     TranscriptionFrame,
     TTSAudioRawFrame,
     TTSStartedFrame,
     TTSStoppedFrame,
     TTSTextFrame,
+    UserStartedSpeakingFrame,
+    UserStoppedSpeakingFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection
 from pipecat.services.llm_service import LLMService
@@ -131,6 +135,15 @@ class AWSNovaSonicService(LLMService):
         if isinstance(frame, InputAudioRawFrame):
             # TODO: check if _audio_input_paused? what causes that?
             await self._send_user_audio_event(frame)
+        # TODO: do we need to do anything for these?
+        elif isinstance(frame, StartInterruptionFrame):
+            print("[pk] StartInterruptionFrame")
+        elif isinstance(frame, UserStartedSpeakingFrame):
+            print("[pk] UserStartedSpeakingFrame")
+        elif isinstance(frame, StopInterruptionFrame):
+            print("[pk] StopInterruptionFrame")
+        elif isinstance(frame, UserStoppedSpeakingFrame):
+            print("[pk] UserStoppedSpeakingFrame")
 
         await self.push_frame(frame, direction)
 

From 5b64613f65a3c6d28923bb2396a58c4909a05f4d Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Mon, 28 Apr 2025 09:16:10 -0400
Subject: [PATCH 46/97] [WIP] AWS Nova Sonic service - renamed assistant
 response reporting helpers, silenced debug prints, raised VAD stop_secs

---
 examples/foundational/39-aws-nova-sonic.py |  4 +---
 src/pipecat/services/aws_nova_sonic/aws.py | 22 +++++++++++++---------
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py
index fffaee686..567655002 100644
--- a/examples/foundational/39-aws-nova-sonic.py
+++ b/examples/foundational/39-aws-nova-sonic.py
@@ -43,9 +43,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection):
             camera_in_enabled=False,
             vad_enabled=True,
             vad_audio_passthrough=True,
-            # set stop_secs to something roughly similar to the internal setting
-            # of the Multimodal Live api, just to align events.
-            vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
+            vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.8)),
         ),
     )
 
diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index 737bc82fb..facf84a49 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -137,13 +137,17 @@ class AWSNovaSonicService(LLMService):
             await self._send_user_audio_event(frame)
         # TODO: do we need to do anything for these?
         elif isinstance(frame, StartInterruptionFrame):
-            print("[pk] StartInterruptionFrame")
+            # print("[pk] StartInterruptionFrame")
+            pass
         elif isinstance(frame, UserStartedSpeakingFrame):
-            print("[pk] UserStartedSpeakingFrame")
+            # print("[pk] UserStartedSpeakingFrame")
+            pass
         elif isinstance(frame, StopInterruptionFrame):
-            print("[pk] StopInterruptionFrame")
+            # print("[pk] StopInterruptionFrame")
+            pass
         elif isinstance(frame, UserStoppedSpeakingFrame):
-            print("[pk] UserStoppedSpeakingFrame")
+            # print("[pk] UserStoppedSpeakingFrame")
+            pass
 
         await self.push_frame(frame, direction)
 
@@ -415,7 +419,7 @@ class AWSNovaSonicService(LLMService):
                 if not self._assistant_is_responding:
                     # The assistant has started responding.
                     self._assistant_is_responding = True
-                    await self._report_assistant_started_responding()
+                    await self._report_assistant_response_started()
 
     async def _handle_text_output_event(self, event_json):
         # This should never happen
@@ -471,7 +475,7 @@ class AWSNovaSonicService(LLMService):
                     # for more information.
                     if not self._assistant_is_responding:
                         self._assistant_is_responding = True
-                        await self._report_assistant_started_responding()
+                        await self._report_assistant_response_started()
 
                     if self._assistant_is_responding:
                         # Text added to the ongoing assistant response
@@ -491,7 +495,7 @@ class AWSNovaSonicService(LLMService):
                         # last. It's similarly unclear how to determine what the last text content
                         # block will be after an interruption.
                         self._assistant_is_responding = False
-                        await self._report_assistant_stopped_responding()
+                        await self._report_assistant_response_ended()
         elif content.role == Role.USER:
             if content.type == ContentType.TEXT:
                 if content.text_stage == TextStage.FINAL:
@@ -504,7 +508,7 @@ class AWSNovaSonicService(LLMService):
         # print("[pk] completion end")
         pass
 
-    async def _report_assistant_started_responding(self):
+    async def _report_assistant_response_started(self):
         # Report that the assistant has started their response.
         print("[pk] LLM full response started")
         await self.push_frame(LLMFullResponseStartFrame())
@@ -522,7 +526,7 @@ class AWSNovaSonicService(LLMService):
         print(f"[pk] TTS text: {text}")
         await self.push_frame(TTSTextFrame(text))
 
-    async def _report_assistant_stopped_responding(self):
+    async def _report_assistant_response_ended(self):
         # Report that the assistant has finished their response.
         print("[pk] LLM full response ended")
         await self.push_frame(LLMFullResponseEndFrame())

From 68c1069548c110dbad409bee788a2f571854bea0 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Mon, 28 Apr 2025 10:37:11 -0400
Subject: [PATCH 47/97] [WIP] AWS Nova Sonic service - added assistant-only
 debug prints for content start, text output, and content end events

---
 src/pipecat/services/aws_nova_sonic/aws.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index facf84a49..dd7fd702b 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -412,7 +412,9 @@ class AWSNovaSonicService(LLMService):
         )
         self._content_being_received = content
 
-        # print(f"[pk] content start: {self._content_being_received}")
+        # print(f"[pk] content start: {content}")
+        if content.role == Role.ASSISTANT:
+            print(f"[pk] assistant content start: {content}")
 
         if content.role == Role.ASSISTANT:
             if content.type == ContentType.AUDIO:
@@ -425,13 +427,15 @@ class AWSNovaSonicService(LLMService):
         # This should never happen
         if not self._content_being_received:
             return
+        content = self._content_being_received
 
         text_content = event_json["textOutput"]["content"]
         # print(f"[pk] text output. content: {text_content}")
+        if content.role == Role.ASSISTANT:
+            print(f"[pk] assistant text output. content: {text_content}")
 
         # Bookkeeping: augment the current content being received with text
         # Assumption: only one text content per content block
-        content = self._content_being_received
         content.text_content = text_content
 
     async def _handle_audio_output_event(self, event_json):
@@ -457,13 +461,15 @@ class AWSNovaSonicService(LLMService):
         # This should never happen
         if not self._content_being_received:
             return
+        content = self._content_being_received
 
         content_end = event_json["contentEnd"]
         stop_reason = content_end["stopReason"]
-        # print(f"[pk] content end: {self._content_being_received}.\n  stop_reason: {stop_reason}")
+        # print(f"[pk] content end: {content}.\n  stop_reason: {stop_reason}")
+        if content.role == Role.ASSISTANT:
+            print(f"[pk] assistant content end: {content}.\n  stop_reason: {stop_reason}")
 
         # Bookkeeping: clear current content being received
-        content = self._content_being_received
         self._content_being_received = None
 
         if content.role == Role.ASSISTANT:

From 96d05e12fcd7fa821657b10b74ef6d564c4e279d Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Mon, 28 Apr 2025 11:15:51 -0400
Subject: [PATCH 48/97] [WIP] AWS Nova Sonic service - ended the assistant
 response on BotStoppedSpeakingFrame instead of after each FINAL text block

---
 src/pipecat/services/aws_nova_sonic/aws.py | 57 +++++++++++-----------
 1 file changed, 28 insertions(+), 29 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index dd7fd702b..640658ca9 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -22,6 +22,7 @@ from smithy_aws_core.identity import AWSCredentialsIdentity
 from smithy_core.aio.eventstream import DuplexEventStream
 
 from pipecat.frames.frames import (
+    BotStoppedSpeakingFrame,
     CancelFrame,
     EndFrame,
     Frame,
@@ -148,9 +149,29 @@ class AWSNovaSonicService(LLMService):
         elif isinstance(frame, UserStoppedSpeakingFrame):
             # print("[pk] UserStoppedSpeakingFrame")
             pass
+        elif isinstance(frame, BotStoppedSpeakingFrame):
+            await self._handle_bot_stopped_speaking()
 
         await self.push_frame(frame, direction)
 
+    async def _handle_bot_stopped_speaking(self):
+        if self._assistant_is_responding:
+            # Consider the assistant finished with their response.
+            #
+            # TODO: ideally we could base this solely on the LLM output events, but I couldn't
+            # figure out a reliable way to determine when we've gotten our last FINAL text block
+            # after the LLM is done talking.
+            #
+            # First I looked at stopReason, but it doesn't seem like the last FINAL text block is
+            # reliably marked END_TURN (sometimes the *first* one is, but not the last...bug?)
+            #
+            # Then I considered schemes where we tally or match up SPECULATIVE text blocks with
+            # FINAL text blocks to know how many or which FINAL blocks to expect, but user
+            # interruptions throw a wrench in these schemes: depending on the exact timing of the
+            # interruption, we should or shouldn't expect some FINAL blocks.
+            self._assistant_is_responding = False
+            await self._report_assistant_response_ended()
+
     #
     # LLM communication: lifecycle
     #
@@ -413,11 +434,12 @@ class AWSNovaSonicService(LLMService):
         self._content_being_received = content
 
         # print(f"[pk] content start: {content}")
-        if content.role == Role.ASSISTANT:
-            print(f"[pk] assistant content start: {content}")
+        # if content.role == Role.ASSISTANT:
+        #     print(f"[pk] assistant content start: {content}")
 
         if content.role == Role.ASSISTANT:
             if content.type == ContentType.AUDIO:
+                # Note that an assistant response can comprise of multiple audio blocks
                 if not self._assistant_is_responding:
                     # The assistant has started responding.
                     self._assistant_is_responding = True
@@ -431,8 +453,8 @@ class AWSNovaSonicService(LLMService):
 
         text_content = event_json["textOutput"]["content"]
         # print(f"[pk] text output. content: {text_content}")
-        if content.role == Role.ASSISTANT:
-            print(f"[pk] assistant text output. content: {text_content}")
+        # if content.role == Role.ASSISTANT:
+        #     print(f"[pk] assistant text output. content: {text_content}")
 
         # Bookkeeping: augment the current content being received with text
         # Assumption: only one text content per content block
@@ -466,8 +488,8 @@ class AWSNovaSonicService(LLMService):
         content_end = event_json["contentEnd"]
         stop_reason = content_end["stopReason"]
         # print(f"[pk] content end: {content}.\n  stop_reason: {stop_reason}")
-        if content.role == Role.ASSISTANT:
-            print(f"[pk] assistant content end: {content}.\n  stop_reason: {stop_reason}")
+        # if content.role == Role.ASSISTANT:
+        #     print(f"[pk] assistant content end: {content}.\n  stop_reason: {stop_reason}")
 
         # Bookkeeping: clear current content being received
         self._content_being_received = None
@@ -476,32 +498,9 @@ class AWSNovaSonicService(LLMService):
             if content.type == ContentType.TEXT:
                 # Ignore non-final text, and the "interrupted" message (which isn't meaningful text)
                 if content.text_stage == TextStage.FINAL and stop_reason != "INTERRUPTED":
-                    # TODO: shoot, for now we may need to "restart" the assistant responding because
-                    # every FINAL text block has to be treated as its own response. See below TODO
-                    # for more information.
-                    if not self._assistant_is_responding:
-                        self._assistant_is_responding = True
-                        await self._report_assistant_response_started()
-
                     if self._assistant_is_responding:
                         # Text added to the ongoing assistant response
                         await self._report_assistant_response_text_added(content.text_content)
-
-                        # Consider the assistant finished with their response.
-                        # TODO: the way we're tracking the start/stop of the assistant response
-                        # is rather busted, and results in way too many "responses" being put into
-                        # the context (every FINAL text content block is treated as its own
-                        # response). We *should* only record that an assistant response has ended
-                        # when:
-                        # - the assistant truly finished its turn (stop_reason is END_TURN)
-                        # - when the assistant has been interrupted, and outputs what's actually
-                        #   been said
-                        # BUT it seems like there's a bug where, if there are multiple assistant
-                        # text content blocks, the *first* one gets marked END_TURN rather than the
-                        # last. It's similarly unclear how to determine what the last text content
-                        # block will be after an interruption.
-                        self._assistant_is_responding = False
-                        await self._report_assistant_response_ended()
         elif content.role == Role.USER:
             if content.type == ContentType.TEXT:
                 if content.text_stage == TextStage.FINAL:

From 9b8bce1914810971ef18bc8a01d4ab41287fc2e5 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Mon, 28 Apr 2025 13:18:09 -0400
Subject: [PATCH 49/97] [WIP] AWS Nova Sonic service - add voice_id

---
 examples/foundational/39-aws-nova-sonic.py | 1 +
 src/pipecat/services/aws_nova_sonic/aws.py | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py
index 567655002..445464957 100644
--- a/examples/foundational/39-aws-nova-sonic.py
+++ b/examples/foundational/39-aws-nova-sonic.py
@@ -66,6 +66,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection):
         secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
         access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
         region=os.getenv("AWS_REGION"),
+        voice_id="tiffany", # matthew, tiffany, amy
     )
 
     # Build the pipeline
diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index 640658ca9..eb57d9b80 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -90,6 +90,7 @@ class AWSNovaSonicService(LLMService):
         access_key_id: str,
         region: str,
         model: str = "amazon.nova-sonic-v1:0",
+        voice_id: str = "matthew",  # matthew, tiffany, amy
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -99,6 +100,7 @@ class AWSNovaSonicService(LLMService):
         self._region = region
         self._model = model
         self._client: BedrockRuntimeClient = None
+        self._voice_id = voice_id
         self._stream: DuplexEventStream[
             InvokeModelWithBidirectionalStreamInput,
             InvokeModelWithBidirectionalStreamOutput,
@@ -257,7 +259,7 @@ class AWSNovaSonicService(LLMService):
                 "sampleRateHertz": 24000,
                 "sampleSizeBits": 16,
                 "channelCount": 1,
-                "voiceId": "matthew",
+                "voiceId": "{self._voice_id}",
                 "encoding": "base64",
                 "audioType": "SPEECH"
               }}

From 9f7f42e885db0d68cfa619ad5720abfd536ce690 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Mon, 28 Apr 2025 13:41:55 -0400
Subject: [PATCH 50/97] [WIP] AWS Nova Sonic service - renamed
 _send_session_start_event to _send_session_start_events

---
 src/pipecat/services/aws_nova_sonic/aws.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index eb57d9b80..563d35422 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -193,7 +193,7 @@ class AWSNovaSonicService(LLMService):
             )
 
             # Send session start events
-            await self._send_session_start_event()
+            await self._send_session_start_events()
 
             # Send initial system instruction
             await self._send_text_event(text=self._instruction, role=Role.SYSTEM)
@@ -230,7 +230,7 @@ class AWSNovaSonicService(LLMService):
     #
 
     # TODO: make params configurable?
-    async def _send_session_start_event(self):
+    async def _send_session_start_events(self):
         session_start = """
         {
           "event": {

From f182eafb40dd320c874f963e7549ba64a4dfac8b Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Tue, 29 Apr 2025 11:39:32 -0400
Subject: [PATCH 51/97] [WIP] AWS Nova Sonic service - add ability to pass in
 OpenAILLMContext

---
 examples/foundational/39-aws-nova-sonic.py    |  47 ++---
 .../services/aws_nova_sonic/__init__.py       |   2 +-
 src/pipecat/services/aws_nova_sonic/aws.py    | 187 +++++++++++++++---
 .../services/aws_nova_sonic/context.py        | 121 ++++++++++++
 4 files changed, 303 insertions(+), 54 deletions(-)
 create mode 100644 src/pipecat/services/aws_nova_sonic/context.py

diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py
index 445464957..c44f85a48 100644
--- a/examples/foundational/39-aws-nova-sonic.py
+++ b/examples/foundational/39-aws-nova-sonic.py
@@ -16,7 +16,8 @@ from pipecat.frames.frames import LLMMessagesAppendFrame
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
-from pipecat.services.aws_nova_sonic import AWSNovaSonicService
+from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
+from pipecat.services.aws_nova_sonic import AWSNovaSonicLLMService
 from pipecat.transports.base_transport import TransportParams
 from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
 from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
@@ -47,13 +48,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection):
         ),
     )
 
-    # Create the AWS Nova Sonic LLM service
-    # system_instruction = f"""
-    # You are a helpful AI assistant.
-    # Your goal is to demonstrate your capabilities in a helpful and engaging way.
-    # Your output will be converted to audio so don't include special characters in your answers.
-    # Respond to what the user said in a creative and helpful way.
-    # """
+    # Specify initial system instruction
     # TODO: looks like Nova Sonic can't handle new lines?
     system_instruction = (
         "You are a friendly assistant. The user and you will engage in a spoken dialog "
@@ -61,20 +56,37 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection):
         "generally two or three sentences for chatty scenarios."
     )
 
-    llm = AWSNovaSonicService(
-        instruction=system_instruction,
+    # Create the AWS Nova Sonic LLM service
+    llm = AWSNovaSonicLLMService(
         secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
         access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
         region=os.getenv("AWS_REGION"),
-        voice_id="tiffany", # matthew, tiffany, amy
+        voice_id="tiffany",  # matthew, tiffany, amy
+        # instruction=system_instruction # could pass instruction here rather than context, below
     )
 
+    # Set up context and context management.
+    # AWSNovaSonicService will adapt OpenAI LLM context objects with standard message format to
+    # what's expected by Nova Sonic.
+    context = OpenAILLMContext(
+        messages=[
+            {"role": "system", "content": f"{system_instruction}"},
+            {
+                "role": "user",
+                "content": "Tell me hello! Don't wait for me to say anything else first!",
+            },
+        ]
+    )
+    context_aggregator = llm.create_context_aggregator(context)
+
     # Build the pipeline
     pipeline = Pipeline(
         [
             transport.input(),
+            context_aggregator.user(),
             llm,
             transport.output(),
+            context_aggregator.assistant(),
         ]
     )
 
@@ -93,18 +105,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection):
     async def on_client_connected(transport, client):
         logger.info(f"Client connected")
         # Kick off the conversation.
-        await task.queue_frames(
-            [
-                LLMMessagesAppendFrame(
-                    messages=[
-                        {
-                            "role": "user",
-                            "content": f"Greet the user and introduce yourself.",
-                        }
-                    ]
-                )
-            ]
-        )
+        await task.queue_frames([context_aggregator.user().get_context_frame()])
 
     # Handle client disconnection events
     @transport.event_handler("on_client_disconnected")
diff --git a/src/pipecat/services/aws_nova_sonic/__init__.py b/src/pipecat/services/aws_nova_sonic/__init__.py
index b5559715a..e14c44f8a 100644
--- a/src/pipecat/services/aws_nova_sonic/__init__.py
+++ b/src/pipecat/services/aws_nova_sonic/__init__.py
@@ -1 +1 @@
-from .aws import AWSNovaSonicService
+from .aws import AWSNovaSonicLLMService
diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index 563d35422..cc07e5463 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -3,6 +3,7 @@ import json
 import uuid
 from dataclasses import dataclass
 from enum import Enum
+from typing import Any
 
 from aws_sdk_bedrock_runtime.client import (
     BedrockRuntimeClient,
@@ -41,18 +42,26 @@ from pipecat.frames.frames import (
     UserStartedSpeakingFrame,
     UserStoppedSpeakingFrame,
 )
+from pipecat.processors.aggregators.llm_response import (
+    LLMAssistantAggregatorParams,
+    LLMUserAggregatorParams,
+)
+from pipecat.processors.aggregators.openai_llm_context import (
+    OpenAILLMContext,
+    OpenAILLMContextFrame,
+)
 from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.aws_nova_sonic.context import (
+    AWSNovaSonicAssistantContextAggregator,
+    AWSNovaSonicContextAggregatorPair,
+    AWSNovaSonicLLMContext,
+    AWSNovaSonicUserContextAggregator,
+    Role,
+)
 from pipecat.services.llm_service import LLMService
 from pipecat.utils.time import time_now_iso8601
 
 
-class Role(Enum):
-    SYSTEM = "SYSTEM"
-    USER = "USER"
-    ASSISTANT = "ASSISTANT"
-    TOOL = "TOOL"
-
-
 class ContentType(Enum):
     AUDIO = "AUDIO"
     TEXT = "TEXT"
@@ -81,36 +90,40 @@ class CurrentContent:
         )
 
 
-class AWSNovaSonicService(LLMService):
+class AWSNovaSonicLLMService(LLMService):
     def __init__(
         self,
         *,
-        instruction: str,
+        # TODO: if we have instruction here as an alternative to using context, we should do the same for tools...right?
         secret_access_key: str,
         access_key_id: str,
         region: str,
         model: str = "amazon.nova-sonic-v1:0",
         voice_id: str = "matthew",  # matthew, tiffany, amy
+        instruction: str = None,
         **kwargs,
     ):
         super().__init__(**kwargs)
-        self._instruction = instruction
         self._secret_access_key = secret_access_key
         self._access_key_id = access_key_id
         self._region = region
         self._model = model
         self._client: BedrockRuntimeClient = None
         self._voice_id = voice_id
+        self._instruction = instruction
+        self._context: AWSNovaSonicLLMContext = None
         self._stream: DuplexEventStream[
             InvokeModelWithBidirectionalStreamInput,
             InvokeModelWithBidirectionalStreamOutput,
             InvokeModelWithBidirectionalStreamOperationOutput,
         ] = None
         self._receive_task = None
-        self._prompt_name = str(uuid.uuid4())
-        self._input_audio_content_name = str(uuid.uuid4())
-        self._content_being_received = None  # TODO: clean this up on error or when finished
+        self._prompt_name = None
+        self._input_audio_content_name = None
+        self._content_being_received = None
         self._assistant_is_responding = False
+        self._context_available = False
+        self._ready_to_send_context = False
 
     #
     # standard AIService frame handling
@@ -118,7 +131,14 @@ class AWSNovaSonicService(LLMService):
 
     async def start(self, frame: StartFrame):
         await super().start(frame)
-        await self._connect()
+        # TODO: maybe connect but don't send history until we get all of our settings?
+        # how do we know how long to wait?
+        # ah, i think we'll *always* get at least one OpenAILLMContextFrame which kicks things off
+        # so we need to send the initial history when:
+        # - we're connected
+        # - we've gotten the first context
+        # i *think* this is what's controlled by _api_session_ready/_run_llm_when_api_session_ready
+        await self._start_connecting()
 
     async def stop(self, frame: EndFrame):
         await super().stop(frame)
@@ -135,10 +155,14 @@ class AWSNovaSonicService(LLMService):
     async def process_frame(self, frame: Frame, direction: FrameDirection):
         await super().process_frame(frame, direction)
 
-        if isinstance(frame, InputAudioRawFrame):
+        if isinstance(frame, OpenAILLMContextFrame):
+            await self._handle_context(frame.context)
+        elif isinstance(frame, InputAudioRawFrame):
             # TODO: check if _audio_input_paused? what causes that?
             await self._send_user_audio_event(frame)
-        # TODO: do we need to do anything for these?
+        elif isinstance(frame, BotStoppedSpeakingFrame):
+            await self._handle_bot_stopped_speaking()
+        # TODO: do we need to do anything for the below four frame types?
         elif isinstance(frame, StartInterruptionFrame):
             # print("[pk] StartInterruptionFrame")
             pass
@@ -151,11 +175,19 @@ class AWSNovaSonicService(LLMService):
         elif isinstance(frame, UserStoppedSpeakingFrame):
             # print("[pk] UserStoppedSpeakingFrame")
             pass
-        elif isinstance(frame, BotStoppedSpeakingFrame):
-            await self._handle_bot_stopped_speaking()
 
         await self.push_frame(frame, direction)
 
+    async def _handle_context(self, context: OpenAILLMContext):
+        # TODO: if context has changed, reconnect
+        # TODO: remove
+        print(f"[pk] _handle_context: {context.get_messages_for_initializing_history()}")
+        if not self._context:
+            # We got our initial context - try to finish connecting
+            self._context = AWSNovaSonicLLMContext.upgrade_to_nova_sonic(context)
+            self._context_available = True
+            await self._finish_connecting_if_context_available()
+
     async def _handle_bot_stopped_speaking(self):
         if self._assistant_is_responding:
             # Consider the assistant finished with their response.
@@ -178,12 +210,16 @@ class AWSNovaSonicService(LLMService):
     # LLM communication: lifecycle
     #
 
-    async def _connect(self):
+    async def _start_connecting(self):
         try:
             if self._client:
-                # Here we assume that if we have a client we are connected
+                # Here we assume that if we have a client we are connected or connecting
                 return
 
+            # Set IDs for the connection
+            self._prompt_name = str(uuid.uuid4())
+            self._input_audio_content_name = str(uuid.uuid4())
+
             # Create the client
             self._client = self._create_client()
 
@@ -195,19 +231,71 @@ class AWSNovaSonicService(LLMService):
             # Send session start events
             await self._send_session_start_events()
 
-            # Send initial system instruction
-            await self._send_text_event(text=self._instruction, role=Role.SYSTEM)
-
-            # Start audio input
-            await self._send_audio_input_start_event()
-
-            self._receive_task = self.create_task(self._receive_task_handler())
+            # Finish connecting
+            self._ready_to_send_context = True
+            await self._finish_connecting_if_context_available()
         except Exception as e:
             logger.error(f"{self} initialization error: {e}")
-            self._client = None
+            await self._disconnect()  # async cleanup; un-awaited it would never run
+
+    async def _finish_connecting_if_context_available(self):
+        # We can only finish connecting once we've gotten our initial context and we're ready to
+        # send it
+        if not (self._context_available and self._ready_to_send_context):
+            return
+
+        # Read context
+        history = self._context.get_messages_for_initializing_history()
+
+        # Send system instruction
+        # Instruction from context takes priority
+        instruction = history.instruction if history.instruction else self._instruction
+        if instruction:
+            await self._send_text_event(text=instruction, role=Role.SYSTEM)
+
+        # Send conversation history
+        for message in history.messages:
+            await self._send_text_event(text=message.text, role=message.role)
+
+        # Send initial context (system instruction and conversation history)
+        # TODO: finish implementing
+        # - pass additional message(s)
+        # - merge init-passed system instruction + context instruction (latter takes precedence)
+        # - merge init-passed tools + context tools (latter takes precedence)
+        await self._send_text_event(text=self._instruction, role=Role.SYSTEM)
+
+        # Start audio input
+        await self._send_audio_input_start_event()
+
+        # Start receiving events
+        self._receive_task = self.create_task(self._receive_task_handler())
 
     async def _disconnect(self):
-        pass
+        try:
+            # Clean up receive task
+            if self._receive_task:
+                await self.cancel_task(self._receive_task, timeout=1.0)
+                self._receive_task = None
+
+            # Clean up client
+            if self._client:
+                await self._send_session_end_events()
+                self._client = None
+
+            # Clean up stream
+            if self._stream:
+                await self._stream.input_stream.close()
+                self._stream = None
+
+            # Reset remaining connection-specific state
+            self._prompt_name = None
+            self._input_audio_content_name = None
+            self._content_being_received = None
+            self._assistant_is_responding = False
+            self._context_available = False
+            self._ready_to_send_context = False
+        except Exception as e:
+            logger.error(f"{self} error disconnecting: {e}")
 
     def _create_client(self) -> BedrockRuntimeClient:
         config = Config(
@@ -340,7 +428,7 @@ class AWSNovaSonicService(LLMService):
         await self._send_client_event(text_content_end)
 
     async def _send_user_audio_event(self, frame: InputAudioRawFrame):
-        if not self._client:
+        if not self._stream:
             return
 
         blob = base64.b64encode(frame.audio)
@@ -357,6 +445,30 @@ class AWSNovaSonicService(LLMService):
         '''
         await self._send_client_event(audio_event)
 
+    async def _send_session_end_events(self):
+        if not self._stream:
+            return
+
+        prompt_end = f'''
+        {{
+            "event": {{
+                "promptEnd": {{
+                    "promptName": "{self._prompt_name}"
+                }}
+            }}
+        }}
+        '''
+        await self._send_client_event(prompt_end)
+
+        session_end = """
+        {
+            "event": {
+                "sessionEnd": {}
+            }
+        }
+        """
+        await self._send_client_event(session_end)
+
     async def _send_client_event(self, event_json: str):
         event = InvokeModelWithBidirectionalStreamInputChunk(
             value=BidirectionalInputPayloadPart(bytes_=event_json.encode("utf-8"))
@@ -547,3 +659,18 @@ class AWSNovaSonicService(LLMService):
         await self.push_frame(
             TranscriptionFrame(text=text, user_id="", timestamp=time_now_iso8601())
         )
+
+    #
+    # Context
+    #
+
+    def create_context_aggregator(
+        self,
+        context: OpenAILLMContext,
+        *,
+        user_params: LLMUserAggregatorParams = LLMUserAggregatorParams(),
+        assistant_params: LLMAssistantAggregatorParams = LLMAssistantAggregatorParams(),
+    ) -> AWSNovaSonicContextAggregatorPair:
+        user = AWSNovaSonicUserContextAggregator(context=context, params=user_params)
+        assistant = AWSNovaSonicAssistantContextAggregator(context=context, params=assistant_params)
+        return AWSNovaSonicContextAggregatorPair(user, assistant)
diff --git a/src/pipecat/services/aws_nova_sonic/context.py b/src/pipecat/services/aws_nova_sonic/context.py
new file mode 100644
index 000000000..331ecc13e
--- /dev/null
+++ b/src/pipecat/services/aws_nova_sonic/context.py
@@ -0,0 +1,121 @@
+import copy
+from dataclasses import dataclass, field
+from enum import Enum
+
+from loguru import logger
+
+from pipecat.frames.frames import DataFrame, Frame, LLMMessagesUpdateFrame, LLMSetToolsFrame
+from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
+from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.openai.llm import (
+    OpenAIAssistantContextAggregator,
+    OpenAIUserContextAggregator,
+)
+
+
+class Role(Enum):
+    SYSTEM = "SYSTEM"
+    USER = "USER"
+    ASSISTANT = "ASSISTANT"
+    TOOL = "TOOL"
+
+
+@dataclass
+class AWSNovaSonicConversationHistoryMessage:
+    role: Role  # only USER and ASSISTANT
+    text: str
+
+
+@dataclass
+class AWSNovaSonicConversationHistory:
+    instruction: str = None
+    messages: list[AWSNovaSonicConversationHistoryMessage] = field(default_factory=list)
+
+
+@dataclass
+class AWSNovaSonicLLMContext(OpenAILLMContext):
+    @staticmethod
+    def upgrade_to_nova_sonic(obj: OpenAILLMContext) -> "AWSNovaSonicLLMContext":
+        if isinstance(obj, OpenAILLMContext) and not isinstance(obj, AWSNovaSonicLLMContext):
+            obj.__class__ = AWSNovaSonicLLMContext
+        return obj
+
+    def get_messages_for_initializing_history(self) -> AWSNovaSonicConversationHistory:
+        history = AWSNovaSonicConversationHistory()
+
+        # Bail if there are no messages
+        if not self.messages:
+            return history
+
+        messages = copy.deepcopy(self.messages)
+
+        # If we have a "system" message as our first message, let's pull that out into "instruction"
+        if messages[0].get("role") == "system":
+            system = messages.pop(0)
+            content = system.get("content")
+            if isinstance(content, str):
+                history.instruction = content
+            elif isinstance(content, list):
+                history.instruction = content[0].get("text")
+
+        # Process remaining messages to fill out conversation history.
+        # Nova Sonic supports "user" and "assistant" messages in history.
+        for message in messages:
+            history_message = self.from_standard_message(message)
+            if history_message:
+                history.messages.append(history_message)
+
+        return history
+
+    def from_standard_message(self, message) -> AWSNovaSonicConversationHistoryMessage:
+        role = message.get("role")
+        if message.get("role") == "user" or message.get("role") == "assistant":
+            content = message.get("content")
+            if isinstance(message.get("content"), list):
+                content = ""
+                for c in message.get("content"):
+                    if c.get("type") == "text":
+                        content += " " + c.get("text")
+                    else:
+                        logger.error(
+                            f"Unhandled content type in context message: {c.get('type')} - {message}"
+                        )
+            return AWSNovaSonicConversationHistoryMessage(role=Role[role.upper()], text=content)
+        logger.error(f"Unhandled message type in from_standard_message: {message}")
+
+
+@dataclass
+class AWSNovaSonicMessagesUpdateFrame(DataFrame):
+    context: AWSNovaSonicLLMContext
+
+
+class AWSNovaSonicUserContextAggregator(OpenAIUserContextAggregator):
+    async def process_frame(
+        self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM
+    ):
+        await super().process_frame(frame, direction)
+
+        # Parent does not push LLMMessagesUpdateFrame
+        if isinstance(frame, LLMMessagesUpdateFrame):
+            await self.push_frame(AWSNovaSonicMessagesUpdateFrame(context=self._context))
+
+        # Parent also doesn't push the LLMSetToolsFrame
+        # TODO: this
+        # if isinstance(frame, LLMSetToolsFrame):
+        #     await self.push_frame(frame, direction)
+
+
+class AWSNovaSonicAssistantContextAggregator(OpenAIAssistantContextAggregator):
+    pass
+
+
+@dataclass
+class AWSNovaSonicContextAggregatorPair:
+    _user: AWSNovaSonicUserContextAggregator
+    _assistant: AWSNovaSonicAssistantContextAggregator
+
+    def user(self) -> AWSNovaSonicUserContextAggregator:
+        return self._user
+
+    def assistant(self) -> AWSNovaSonicAssistantContextAggregator:
+        return self._assistant

From 2b7e1cb5b1fdae4a7b1ecf27ade3d8bafd0fefd8 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Tue, 29 Apr 2025 16:38:02 -0400
Subject: [PATCH 52/97] [WIP] AWS Nova Sonic service - add tool calling

---
 examples/foundational/39-aws-nova-sonic.py    |  50 +++++-
 .../services/aws_nova_sonic_adapter.py        |  40 +++++
 src/pipecat/services/aws_nova_sonic/aws.py    | 149 +++++++++++++++++-
 .../services/aws_nova_sonic/context.py        |  25 ++-
 src/pipecat/services/aws_nova_sonic/frames.py |  14 ++
 5 files changed, 267 insertions(+), 11 deletions(-)
 create mode 100644 src/pipecat/adapters/services/aws_nova_sonic_adapter.py
 create mode 100644 src/pipecat/services/aws_nova_sonic/frames.py

diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py
index c44f85a48..c9bef1fed 100644
--- a/examples/foundational/39-aws-nova-sonic.py
+++ b/examples/foundational/39-aws-nova-sonic.py
@@ -5,14 +5,16 @@
 #
 
 import os
+from datetime import datetime
 
 from dotenv import load_dotenv
 from loguru import logger
 
 # import logging
+from pipecat.adapters.schemas.function_schema import FunctionSchema
+from pipecat.adapters.schemas.tools_schema import ToolsSchema
 from pipecat.audio.vad.silero import SileroVADAnalyzer
 from pipecat.audio.vad.vad_analyzer import VADParams
-from pipecat.frames.frames import LLMMessagesAppendFrame
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -31,6 +33,39 @@ load_dotenv(override=True)
 # )
 
 
+async def fetch_weather_from_api(function_name, tool_call_id, args, llm, context, result_callback):
+    temperature = 75 if args["format"] == "fahrenheit" else 24
+    await result_callback(
+        {
+            "conditions": "nice",
+            "temperature": temperature,
+            "format": args["format"],
+            "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
+        }
+    )
+
+
+weather_function = FunctionSchema(
+    name="get_current_weather",
+    description="Get the current weather",
+    properties={
+        "location": {
+            "type": "string",
+            "description": "The city and state, e.g. San Francisco, CA",
+        },
+        "format": {
+            "type": "string",
+            "enum": ["celsius", "fahrenheit"],
+            "description": "The temperature unit to use. Infer this from the users location.",
+        },
+    },
+    required=["location", "format"],
+)
+
+# Create tools schema
+tools = ToolsSchema(standard_tools=[weather_function])
+
+
 async def run_bot(webrtc_connection: SmallWebRTCConnection):
     logger.info(f"Starting bot")
 
@@ -62,20 +97,27 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection):
         access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
         region=os.getenv("AWS_REGION"),
         voice_id="tiffany",  # matthew, tiffany, amy
-        # instruction=system_instruction # could pass instruction here rather than context, below
+        # instruction=system_instruction # you could pass instruction here rather than in context
     )
 
+    # Register function for function calls
+    # you can either register a single function for all function calls, or specific functions
+    # llm.register_function(None, fetch_weather_from_api)
+    llm.register_function("get_current_weather", fetch_weather_from_api)
+
     # Set up context and context management.
     # AWSNovaSonicService will adapt OpenAI LLM context objects with standard message format to
     # what's expected by Nova Sonic.
+    # TODO: since we can't trigger a response upon joining, this isn't particularly useful
     context = OpenAILLMContext(
         messages=[
             {"role": "system", "content": f"{system_instruction}"},
             {
                 "role": "user",
-                "content": "Tell me hello! Don't wait for me to say anything else first!",
+                "content": "Say hello!",
             },
-        ]
+        ],
+        tools=tools,
     )
     context_aggregator = llm.create_context_aggregator(context)
 
diff --git a/src/pipecat/adapters/services/aws_nova_sonic_adapter.py b/src/pipecat/adapters/services/aws_nova_sonic_adapter.py
new file mode 100644
index 000000000..b96980046
--- /dev/null
+++ b/src/pipecat/adapters/services/aws_nova_sonic_adapter.py
@@ -0,0 +1,40 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+import json
+from typing import Any, Dict, List, Union
+
+from pipecat.adapters.base_llm_adapter import BaseLLMAdapter
+from pipecat.adapters.schemas.function_schema import FunctionSchema
+from pipecat.adapters.schemas.tools_schema import ToolsSchema
+
+
+class AWSNovaSonicLLMAdapter(BaseLLMAdapter):
+    @staticmethod
+    def _to_aws_nova_sonic_function_format(function: FunctionSchema) -> Dict[str, Any]:
+        return {
+            "toolSpec": {
+                "name": function.name,
+                "description": function.description,
+                "inputSchema": {
+                    "json": json.dumps(
+                        {
+                            "type": "object",
+                            "properties": function.properties,
+                            "required": function.required,
+                        }
+                    )
+                },
+            }
+        }
+
+    def to_provider_tools_format(self, tools_schema: ToolsSchema) -> List[Dict[str, Any]]:
+        """Converts function schemas to the AWS Nova Sonic tool-specification format.
+
+        :return: AWS Nova Sonic formatted tool definitions, one "toolSpec" entry per function.
+        """
+
+        functions_schema = tools_schema.standard_tools
+        return [self._to_aws_nova_sonic_function_format(func) for func in functions_schema]
diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index cc07e5463..8b6dab3ed 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -1,9 +1,15 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
 import base64
 import json
 import uuid
 from dataclasses import dataclass
 from enum import Enum
-from typing import Any
+from typing import Any, List
 
 from aws_sdk_bedrock_runtime.client import (
     BedrockRuntimeClient,
@@ -22,6 +28,7 @@ from smithy_aws_core.credentials_resolvers.static import StaticCredentialsResolv
 from smithy_aws_core.identity import AWSCredentialsIdentity
 from smithy_core.aio.eventstream import DuplexEventStream
 
+from pipecat.adapters.services.aws_nova_sonic_adapter import AWSNovaSonicLLMAdapter
 from pipecat.frames.frames import (
     BotStoppedSpeakingFrame,
     CancelFrame,
@@ -58,10 +65,15 @@ from pipecat.services.aws_nova_sonic.context import (
     AWSNovaSonicUserContextAggregator,
     Role,
 )
+from pipecat.services.aws_nova_sonic.frames import AWSNovaSonicFunctionCallResultFrame
 from pipecat.services.llm_service import LLMService
 from pipecat.utils.time import time_now_iso8601
 
 
+class AWSNovaSonicUnhandledFunctionException(Exception):
+    pass
+
+
 class ContentType(Enum):
     AUDIO = "AUDIO"
     TEXT = "TEXT"
@@ -91,6 +103,9 @@ class CurrentContent:
 
 
 class AWSNovaSonicLLMService(LLMService):
+    # Override the default adapter to use the AWSNovaSonicLLMAdapter one
+    adapter_class = AWSNovaSonicLLMAdapter
+
     def __init__(
         self,
         *,
@@ -162,6 +177,8 @@ class AWSNovaSonicLLMService(LLMService):
             await self._send_user_audio_event(frame)
         elif isinstance(frame, BotStoppedSpeakingFrame):
             await self._handle_bot_stopped_speaking()
+        elif isinstance(frame, AWSNovaSonicFunctionCallResultFrame):
+            await self._handle_function_call_result(frame)
         # TODO: do we need to do anything for the below four frame types?
         elif isinstance(frame, StartInterruptionFrame):
             # print("[pk] StartInterruptionFrame")
@@ -206,6 +223,10 @@ class AWSNovaSonicLLMService(LLMService):
             self._assistant_is_responding = False
             await self._report_assistant_response_ended()
 
+    async def _handle_function_call_result(self, frame: AWSNovaSonicFunctionCallResultFrame):
+        result = frame.result_frame
+        await self._send_tool_result(tool_call_id=result.tool_call_id, result=result.result)
+
     #
     # LLM communication: lifecycle
     #
@@ -228,8 +249,8 @@ class AWSNovaSonicLLMService(LLMService):
                 InvokeModelWithBidirectionalStreamOperationInput(model_id=self._model)
             )
 
-            # Send session start events
-            await self._send_session_start_events()
+            # Send session start event
+            await self._send_session_start_event()
 
             # Finish connecting
             self._ready_to_send_context = True
@@ -247,6 +268,10 @@ class AWSNovaSonicLLMService(LLMService):
         # Read context
         history = self._context.get_messages_for_initializing_history()
 
+        # Send prompt start event, specifying tools
+        tools = self._context.tools
+        await self._send_prompt_start_event(tools)
+
         # Send system instruction
         # Instruction from context takes priority
         instruction = history.instruction if history.instruction else self._instruction
@@ -318,7 +343,7 @@ class AWSNovaSonicLLMService(LLMService):
     #
 
     # TODO: make params configurable?
-    async def _send_session_start_events(self):
+    async def _send_session_start_event(self):
         session_start = """
         {
           "event": {
@@ -334,6 +359,20 @@ class AWSNovaSonicLLMService(LLMService):
         """
         await self._send_client_event(session_start)
 
+    async def _send_prompt_start_event(self, tools: List[Any]):
+        tools_config = (
+            f""",
+        "toolUseOutputConfiguration": {{
+          "mediaType": "application/json"
+        }},
+        "toolConfiguration": {{
+          "tools": {json.dumps(tools)}
+        }}
+        """
+            if tools
+            else ""
+        )
+
         prompt_start = f'''
         {{
           "event": {{
@@ -350,7 +389,7 @@ class AWSNovaSonicLLMService(LLMService):
                 "voiceId": "{self._voice_id}",
                 "encoding": "base64",
                 "audioType": "SPEECH"
-              }}
+              }}{tools_config}
             }}
           }}
         }}
@@ -382,6 +421,9 @@ class AWSNovaSonicLLMService(LLMService):
         await self._send_client_event(audio_content_start)
 
     async def _send_text_event(self, text: str, role: Role):
+        if not self._stream:
+            return
+
         content_name = str(uuid.uuid4())
 
         text_content_start = f'''
@@ -469,6 +511,61 @@ class AWSNovaSonicLLMService(LLMService):
         """
         await self._send_client_event(session_end)
 
+    async def _send_tool_result(self, tool_call_id, result):
+        if not self._stream:
+            return
+
+        # print(f"[pk] sending tool result. tool call ID: {tool_call_id}, result: {result}")
+
+        content_name = str(uuid.uuid4())
+
+        result_content_start = f'''
+        {{
+            "event": {{
+                "contentStart": {{
+                    "promptName": "{self._prompt_name}",
+                    "contentName": "{content_name}",
+                    "interactive": false,
+                    "type": "TOOL",
+                    "role": "TOOL",
+                    "toolResultInputConfiguration": {{
+                        "toolUseId": "{tool_call_id}",
+                        "type": "TEXT",
+                        "textInputConfiguration": {{
+                            "mediaType": "text/plain"
+                        }}
+                    }}
+                }}
+            }}
+        }}
+        '''
+        await self._send_client_event(result_content_start)
+
+        result_content = json.dumps(
+            {
+                "event": {
+                    "toolResult": {
+                        "promptName": self._prompt_name,
+                        "contentName": content_name,
+                        "content": json.dumps(result) if isinstance(result, dict) else result,
+                    }
+                }
+            }
+        )
+        await self._send_client_event(result_content)
+
+        result_content_end = f"""
+        {{
+            "event": {{
+                "contentEnd": {{
+                    "promptName": "{self._prompt_name}",
+                    "contentName": "{content_name}"
+                }}
+            }}
+        }}
+        """
+        await self._send_client_event(result_content_end)
+
     async def _send_client_event(self, event_json: str):
         event = InvokeModelWithBidirectionalStreamInputChunk(
             value=BidirectionalInputPayloadPart(bytes_=event_json.encode("utf-8"))
@@ -515,6 +612,9 @@ class AWSNovaSonicLLMService(LLMService):
                     elif "audioOutput" in event_json:
                         # Handle audio output content
                         await self._handle_audio_output_event(event_json)
+                    elif "toolUse" in event_json:
+                        # Handle tool use
+                        await self._handle_tool_use_event(event_json)
                     elif "contentEnd" in event_json:
                         # Handle a piece of content ending
                         await self._handle_content_end_event(event_json)
@@ -593,6 +693,42 @@ class AWSNovaSonicLLMService(LLMService):
         )
         await self.push_frame(frame)
 
+    async def _handle_tool_use_event(self, event_json):
+        # This should never happen
+        if not self._content_being_received:
+            return
+
+        # Get tool use details
+        tool_use = event_json["toolUse"]
+        function_name = tool_use["toolName"]
+        tool_call_id = tool_use["toolUseId"]
+        arguments = json.loads(tool_use["content"])
+
+        # print(
+        #     f"[pk] tool use - function_name: {function_name}, tool_call_id: {tool_call_id}, arguments: {arguments}"
+        # )
+
+        # Call tool function
+        if self.has_function(function_name):
+            if function_name in self._functions.keys():
+                await self.call_function(
+                    context=self._context,
+                    tool_call_id=tool_call_id,
+                    function_name=function_name,
+                    arguments=arguments,
+                )
+            elif None in self._functions.keys():
+                await self.call_function(
+                    context=self._context,
+                    tool_call_id=tool_call_id,
+                    function_name=function_name,
+                    arguments=arguments,
+                )
+        else:
+            raise AWSNovaSonicUnhandledFunctionException(
+                f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function."
+            )
+
     async def _handle_content_end_event(self, event_json):
         # This should never happen
         if not self._content_being_received:
@@ -671,6 +807,9 @@ class AWSNovaSonicLLMService(LLMService):
         user_params: LLMUserAggregatorParams = LLMUserAggregatorParams(),
         assistant_params: LLMAssistantAggregatorParams = LLMAssistantAggregatorParams(),
     ) -> AWSNovaSonicContextAggregatorPair:
+        context.set_llm_adapter(self.get_llm_adapter())
+
         user = AWSNovaSonicUserContextAggregator(context=context, params=user_params)
         assistant = AWSNovaSonicAssistantContextAggregator(context=context, params=assistant_params)
+
         return AWSNovaSonicContextAggregatorPair(user, assistant)
diff --git a/src/pipecat/services/aws_nova_sonic/context.py b/src/pipecat/services/aws_nova_sonic/context.py
index 331ecc13e..820254cfb 100644
--- a/src/pipecat/services/aws_nova_sonic/context.py
+++ b/src/pipecat/services/aws_nova_sonic/context.py
@@ -1,12 +1,25 @@
+#
+# Copyright (c) 2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
 import copy
 from dataclasses import dataclass, field
 from enum import Enum
 
 from loguru import logger
 
-from pipecat.frames.frames import DataFrame, Frame, LLMMessagesUpdateFrame, LLMSetToolsFrame
+from pipecat.frames.frames import (
+    DataFrame,
+    Frame,
+    FunctionCallResultFrame,
+    LLMMessagesUpdateFrame,
+    LLMSetToolsFrame,
+)
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
 from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.aws_nova_sonic.frames import AWSNovaSonicFunctionCallResultFrame
 from pipecat.services.openai.llm import (
     OpenAIAssistantContextAggregator,
     OpenAIUserContextAggregator,
@@ -106,7 +119,15 @@ class AWSNovaSonicUserContextAggregator(OpenAIUserContextAggregator):
 
 
 class AWSNovaSonicAssistantContextAggregator(OpenAIAssistantContextAggregator):
-    pass
+    async def handle_function_call_result(self, frame: FunctionCallResultFrame):
+        await super().handle_function_call_result(frame)
+
+        # The standard function callback code path pushes the FunctionCallResultFrame from the llm itself,
+        # so we didn't have a chance to add the result to the openai realtime api context. Let's push a
+        # special frame to do that.
+        await self.push_frame(
+            AWSNovaSonicFunctionCallResultFrame(result_frame=frame), FrameDirection.UPSTREAM
+        )
 
 
 @dataclass
diff --git a/src/pipecat/services/aws_nova_sonic/frames.py b/src/pipecat/services/aws_nova_sonic/frames.py
new file mode 100644
index 000000000..94d410f22
--- /dev/null
+++ b/src/pipecat/services/aws_nova_sonic/frames.py
@@ -0,0 +1,14 @@
+#
+# Copyright (c) 2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+from dataclasses import dataclass
+
+from pipecat.frames.frames import DataFrame, FunctionCallResultFrame
+
+
+@dataclass
+class AWSNovaSonicFunctionCallResultFrame(DataFrame):
+    result_frame: FunctionCallResultFrame

From da5c4953d5c793d25d7ab1d523928d48550b27b2 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Wed, 30 Apr 2025 10:51:06 -0400
Subject: [PATCH 53/97] [WIP] AWS Nova Sonic service - allow passing in tools
 into initializer

---
 examples/foundational/39-aws-nova-sonic.py |  5 ++++-
 src/pipecat/services/aws_nova_sonic/aws.py | 16 ++++++++++++----
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py
index c9bef1fed..f08cfad04 100644
--- a/examples/foundational/39-aws-nova-sonic.py
+++ b/examples/foundational/39-aws-nova-sonic.py
@@ -97,7 +97,10 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection):
         access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
         region=os.getenv("AWS_REGION"),
         voice_id="tiffany",  # matthew, tiffany, amy
-        # instruction=system_instruction # you could pass instruction here rather than in context
+        # you could choose to pass instruction here rather than via context
+        # instruction=system_instruction
+        # you could choose to pass tools here rather than via context
+        # tools=tools
     )
 
     # Register function for function calls
diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index 8b6dab3ed..586f759c6 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -9,7 +9,7 @@ import json
 import uuid
 from dataclasses import dataclass
 from enum import Enum
-from typing import Any, List
+from typing import Any, List, Optional
 
 from aws_sdk_bedrock_runtime.client import (
     BedrockRuntimeClient,
@@ -28,6 +28,7 @@ from smithy_aws_core.credentials_resolvers.static import StaticCredentialsResolv
 from smithy_aws_core.identity import AWSCredentialsIdentity
 from smithy_core.aio.eventstream import DuplexEventStream
 
+from pipecat.adapters.schemas.tools_schema import ToolsSchema
 from pipecat.adapters.services.aws_nova_sonic_adapter import AWSNovaSonicLLMAdapter
 from pipecat.frames.frames import (
     BotStoppedSpeakingFrame,
@@ -115,7 +116,8 @@ class AWSNovaSonicLLMService(LLMService):
         region: str,
         model: str = "amazon.nova-sonic-v1:0",
         voice_id: str = "matthew",  # matthew, tiffany, amy
-        instruction: str = None,
+        instruction: Optional[str] = None,
+        tools: Optional[ToolsSchema] = None,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -126,6 +128,7 @@ class AWSNovaSonicLLMService(LLMService):
         self._client: BedrockRuntimeClient = None
         self._voice_id = voice_id
         self._instruction = instruction
+        self._tools = tools
         self._context: AWSNovaSonicLLMContext = None
         self._stream: DuplexEventStream[
             InvokeModelWithBidirectionalStreamInput,
@@ -269,11 +272,16 @@ class AWSNovaSonicLLMService(LLMService):
         history = self._context.get_messages_for_initializing_history()
 
         # Send prompt start event, specifying tools
-        tools = self._context.tools
+        # Tools from context take priority over tools from __init__()
+        tools = (
+            self._context.tools
+            if self._context.tools
+            else self.get_llm_adapter().from_standard_tools(self._tools)
+        )
         await self._send_prompt_start_event(tools)
 
         # Send system instruction
-        # Instruction from context takes priority
+        # Instruction from context takes priority over instruction from __init__()
         instruction = history.instruction if history.instruction else self._instruction
         if instruction:
             await self._send_text_event(text=instruction, role=Role.SYSTEM)

From 394648f1c9a7a6e1cb6d7865fae31498696185a2 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Wed, 30 Apr 2025 11:04:47 -0400
Subject: [PATCH 54/97] [WIP] AWS Nova Sonic service - fix user utterances not
 making it into the context

---
 src/pipecat/services/aws_nova_sonic/aws.py     | 9 ++++++---
 src/pipecat/services/aws_nova_sonic/context.py | 7 +++++++
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index 586f759c6..1b2937f83 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -199,9 +199,8 @@ class AWSNovaSonicLLMService(LLMService):
         await self.push_frame(frame, direction)
 
     async def _handle_context(self, context: OpenAILLMContext):
-        # TODO: if context has changed, reconnect
-        # TODO: remove
-        print(f"[pk] _handle_context: {context.get_messages_for_initializing_history()}")
+        # TODO: reset connection if needed (if entirely new context object provided, for instance)
+        print(f"[pk] receive updated context: {context.get_messages_for_initializing_history()}")
         if not self._context:
             # We got our initial context - try to finish connecting
             self._context = AWSNovaSonicLLMContext.upgrade_to_nova_sonic(context)
@@ -800,6 +799,10 @@ class AWSNovaSonicLLMService(LLMService):
 
     async def _report_user_transcription_text_added(self, text):
         print(f"[pk] transcription: {text}")
+        # Manually add new user transcription text to context.
+        # We can't rely on the user context aggregator to do this since it's upstream from the LLM.
+        self._context.add_user_transcription_text_as_message(text)
+        # Report that some new user transcription text is available.
         await self.push_frame(
             TranscriptionFrame(text=text, user_id="", timestamp=time_now_iso8601())
         )
diff --git a/src/pipecat/services/aws_nova_sonic/context.py b/src/pipecat/services/aws_nova_sonic/context.py
index 820254cfb..92cd313cb 100644
--- a/src/pipecat/services/aws_nova_sonic/context.py
+++ b/src/pipecat/services/aws_nova_sonic/context.py
@@ -96,6 +96,13 @@ class AWSNovaSonicLLMContext(OpenAILLMContext):
             return AWSNovaSonicConversationHistoryMessage(role=Role[role.upper()], text=content)
         logger.error(f"Unhandled message type in from_standard_message: {message}")
 
+    def add_user_transcription_text_as_message(self, text):
+        message = {
+            "role": "user",
+            "content": [{"type": "text", "text": text}],
+        }
+        self.add_message(message)
+
 
 @dataclass
 class AWSNovaSonicMessagesUpdateFrame(DataFrame):

From 3960c604a4ab53a45269855cd976605c262c6261 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Wed, 30 Apr 2025 11:20:23 -0400
Subject: [PATCH 55/97] [WIP] AWS Nova Sonic service - fix empty assistant
 conversation history item in the context after tool use

---
 src/pipecat/services/aws_nova_sonic/context.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/pipecat/services/aws_nova_sonic/context.py b/src/pipecat/services/aws_nova_sonic/context.py
index 92cd313cb..206c1fd2b 100644
--- a/src/pipecat/services/aws_nova_sonic/context.py
+++ b/src/pipecat/services/aws_nova_sonic/context.py
@@ -93,7 +93,13 @@ class AWSNovaSonicLLMContext(OpenAILLMContext):
                         logger.error(
                             f"Unhandled content type in context message: {c.get('type')} - {message}"
                         )
-            return AWSNovaSonicConversationHistoryMessage(role=Role[role.upper()], text=content)
+            # There won't be content if this is an assistant tool call entry.
+            # We're ignoring those since they can't be loaded into AWS Nova Sonic conversation 
+            # history
+            if content:
+                return AWSNovaSonicConversationHistoryMessage(role=Role[role.upper()], text=content)
+        # We're ignoring messages with role "tool" since they can't be loaded into AWS Nova Sonic 
+        # conversation history
         logger.error(f"Unhandled message type in from_standard_message: {message}")
 
     def add_user_transcription_text_as_message(self, text):

From 5e0803479ea04fbb40f7f80d398165f6e2b50380 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Wed, 30 Apr 2025 14:53:22 -0400
Subject: [PATCH 56/97] [WIP] AWS Nova Sonic service - add
 send_transcription_frames option

---
 src/pipecat/services/aws_nova_sonic/aws.py     | 10 +++++++---
 src/pipecat/services/aws_nova_sonic/context.py |  4 ++--
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index 1b2937f83..3c6de7ad7 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -118,6 +118,7 @@ class AWSNovaSonicLLMService(LLMService):
         voice_id: str = "matthew",  # matthew, tiffany, amy
         instruction: Optional[str] = None,
         tools: Optional[ToolsSchema] = None,
+        send_transcription_frames: bool = True,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -129,6 +130,7 @@ class AWSNovaSonicLLMService(LLMService):
         self._voice_id = voice_id
         self._instruction = instruction
         self._tools = tools
+        self._send_transcription_frames = send_transcription_frames
         self._context: AWSNovaSonicLLMContext = None
         self._stream: DuplexEventStream[
             InvokeModelWithBidirectionalStreamInput,
@@ -802,10 +804,12 @@ class AWSNovaSonicLLMService(LLMService):
         # Manually add new user transcription text to context.
         # We can't rely on the user context aggregator to do this since it's upstream from the LLM.
         self._context.add_user_transcription_text_as_message(text)
+
         # Report that some new user transcription text is available.
-        await self.push_frame(
-            TranscriptionFrame(text=text, user_id="", timestamp=time_now_iso8601())
-        )
+        if self._send_transcription_frames:
+            await self.push_frame(
+                TranscriptionFrame(text=text, user_id="", timestamp=time_now_iso8601())
+            )
 
     #
     # Context
diff --git a/src/pipecat/services/aws_nova_sonic/context.py b/src/pipecat/services/aws_nova_sonic/context.py
index 206c1fd2b..e4662ee57 100644
--- a/src/pipecat/services/aws_nova_sonic/context.py
+++ b/src/pipecat/services/aws_nova_sonic/context.py
@@ -94,11 +94,11 @@ class AWSNovaSonicLLMContext(OpenAILLMContext):
                             f"Unhandled content type in context message: {c.get('type')} - {message}"
                         )
             # There won't be content if this is an assistant tool call entry.
-            # We're ignoring those since they can't be loaded into AWS Nova Sonic conversation 
+            # We're ignoring those since they can't be loaded into AWS Nova Sonic conversation
             # history
             if content:
                 return AWSNovaSonicConversationHistoryMessage(role=Role[role.upper()], text=content)
-        # We're ignoring messages with role "tool" since they can't be loaded into AWS Nova Sonic 
+        # We're ignoring messages with role "tool" since they can't be loaded into AWS Nova Sonic
         # conversation history
         logger.error(f"Unhandled message type in from_standard_message: {message}")
 

From 2154db07f085e92aab4ef99b45a438dc316088cd Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Wed, 30 Apr 2025 15:10:10 -0400
Subject: [PATCH 57/97] [WIP] AWS Nova Sonic service - remove unnecessary error
 log

---
 src/pipecat/services/aws_nova_sonic/context.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/context.py b/src/pipecat/services/aws_nova_sonic/context.py
index e4662ee57..4b41b53b3 100644
--- a/src/pipecat/services/aws_nova_sonic/context.py
+++ b/src/pipecat/services/aws_nova_sonic/context.py
@@ -98,9 +98,8 @@ class AWSNovaSonicLLMContext(OpenAILLMContext):
             # history
             if content:
                 return AWSNovaSonicConversationHistoryMessage(role=Role[role.upper()], text=content)
-        # We're ignoring messages with role "tool" since they can't be loaded into AWS Nova Sonic
-        # conversation history
-        logger.error(f"Unhandled message type in from_standard_message: {message}")
+        # NOTE: we're ignoring messages with role "tool" since they can't be loaded into AWS Nova
+        # Sonic conversation history
 
     def add_user_transcription_text_as_message(self, text):
         message = {

From 6938152db67947e09da37b7d2e3d649fa0b5a939 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Wed, 30 Apr 2025 15:15:49 -0400
Subject: [PATCH 58/97] [WIP] AWS Nova Sonic service - fix comment

---
 src/pipecat/services/aws_nova_sonic/context.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/context.py b/src/pipecat/services/aws_nova_sonic/context.py
index 4b41b53b3..5d9bafec5 100644
--- a/src/pipecat/services/aws_nova_sonic/context.py
+++ b/src/pipecat/services/aws_nova_sonic/context.py
@@ -73,6 +73,7 @@ class AWSNovaSonicLLMContext(OpenAILLMContext):
 
         # Process remaining messages to fill out conversation history.
         # Nova Sonic supports "user" and "assistant" messages in history.
+        print(f"[pk] standard messages: {messages}")
         for message in messages:
             history_message = self.from_standard_message(message)
             if history_message:
@@ -134,9 +135,9 @@ class AWSNovaSonicAssistantContextAggregator(OpenAIAssistantContextAggregator):
     async def handle_function_call_result(self, frame: FunctionCallResultFrame):
         await super().handle_function_call_result(frame)
 
-        # The standard function callback code path pushes the FunctionCallResultFrame from the llm itself,
-        # so we didn't have a chance to add the result to the openai realtime api context. Let's push a
-        # special frame to do that.
+        # The standard function callback code path pushes the FunctionCallResultFrame from the LLM
+        # itself, so we didn't have a chance to add the result to the AWS Nova Sonic server-side
+        # context. Let's push a special frame to do that.
         await self.push_frame(
             AWSNovaSonicFunctionCallResultFrame(result_frame=frame), FrameDirection.UPSTREAM
         )

From d6ef3d64ace855238e0ee60c5e5d92247b8a7448 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Wed, 30 Apr 2025 21:40:40 -0400
Subject: [PATCH 59/97] [WIP] AWS Nova Sonic service - fix context problems of
 double-counting LLM text, and mis-categorizing user text as LLM text

---
 .../services/aws_nova_sonic/context.py        | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/pipecat/services/aws_nova_sonic/context.py b/src/pipecat/services/aws_nova_sonic/context.py
index 5d9bafec5..4e2a4fcc1 100644
--- a/src/pipecat/services/aws_nova_sonic/context.py
+++ b/src/pipecat/services/aws_nova_sonic/context.py
@@ -16,6 +16,8 @@ from pipecat.frames.frames import (
     FunctionCallResultFrame,
     LLMMessagesUpdateFrame,
     LLMSetToolsFrame,
+    LLMTextFrame,
+    TranscriptionFrame,
 )
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
 from pipecat.processors.frame_processor import FrameDirection
@@ -132,6 +134,23 @@ class AWSNovaSonicUserContextAggregator(OpenAIUserContextAggregator):
 
 
 class AWSNovaSonicAssistantContextAggregator(OpenAIAssistantContextAggregator):
+    # AWS Nova Sonic is a speech-to-speech model.
+    # It behaves like a combined STT + LLM + TTS service, emitting all of:
+    # - TranscriptionFrame (for user text)
+    # - LLMTextFrame (for assistant text)
+    # - TTSTextFrame (for assistant text)
+    # In a "standard" pipeline (with separate STT + LLM + TTS services):
+    # - The TranscriptionFrame is swallowed by the LLMUserContextAggregator
+    # - The LLMTextFrame is swallowed by the TTS service
+    # Meaning the LLMAssistantContextAggregator only receives the TTSTextFrames. It actually
+    # implicitly  assumes it will receive only *non-duplicate* *assistant-related* text frames, and
+    # will misbehave otherwise (double-counting assistant text, or mis-categorizing user text as
+    # assistant text).
+    # So, let's override process_frame here to ignore TranscriptionFrames and LLMTextFrames.
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        if not isinstance(frame, (LLMTextFrame, TranscriptionFrame)):
+            await super().process_frame(frame, direction)
+
     async def handle_function_call_result(self, frame: FunctionCallResultFrame):
         await super().handle_function_call_result(frame)
 

From c47703995406be98ff717dc9bb4f9943b1bc1faa Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Wed, 30 Apr 2025 22:29:36 -0400
Subject: [PATCH 60/97] [WIP] AWS Nova Sonic service - just for safety, add a
 short delay after BotStoppedSpeaking before sending LLMFullResponseEndFrame +
 TTSStoppedFrame, to give a bit of leeway for the LLM to deliver the "FINAL"
 text block describing what was said

---
 src/pipecat/services/aws_nova_sonic/aws.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index 3c6de7ad7..1ef171750 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -4,6 +4,7 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+import asyncio
 import base64
 import json
 import uuid
@@ -211,7 +212,8 @@ class AWSNovaSonicLLMService(LLMService):
 
     async def _handle_bot_stopped_speaking(self):
         if self._assistant_is_responding:
-            # Consider the assistant finished with their response.
+            # Consider the assistant finished with their response (after a short delay, to allow for
+            # any FINAL text block to come in).
             #
             # TODO: ideally we could base this solely on the LLM output events, but I couldn't
             # figure out a reliable way to determine when we've gotten our last FINAL text block
@@ -224,6 +226,7 @@ class AWSNovaSonicLLMService(LLMService):
             # FINAL text blocks to know how many or which FINAL blocks to expect, but user
             # interruptions throw a wrench in these schemes: depending on the exact timing of the
             # interruption, we should or shouldn't expect some FINAL blocks.
+            await asyncio.sleep(0.25)
             self._assistant_is_responding = False
             await self._report_assistant_response_ended()
 

From 38c9fa681a3a0678e58805bcca5ff833f8a1a9e3 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Thu, 1 May 2025 17:50:29 -0400
Subject: [PATCH 61/97] [WIP] AWS Nova Sonic service - Protect against
 back-to-back BotStoppedSpeaking calls, which I've observed

---
 src/pipecat/services/aws_nova_sonic/aws.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index 1ef171750..e8da485a8 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -145,6 +145,8 @@ class AWSNovaSonicLLMService(LLMService):
         self._assistant_is_responding = False
         self._context_available = False
         self._ready_to_send_context = False
+        self._handling_bot_stopped_speaking = False
+
 
     #
     # standard AIService frame handling
@@ -211,6 +213,11 @@ class AWSNovaSonicLLMService(LLMService):
             await self._finish_connecting_if_context_available()
 
     async def _handle_bot_stopped_speaking(self):
+        # Protect against back-to-back BotStoppedSpeaking calls, which I've observed
+        if self._handling_bot_stopped_speaking:
+            return
+        self._handling_bot_stopped_speaking = True
+
         if self._assistant_is_responding:
             # Consider the assistant finished with their response (after a short delay, to allow for
             # any FINAL text block to come in).
@@ -229,6 +236,9 @@ class AWSNovaSonicLLMService(LLMService):
             await asyncio.sleep(0.25)
             self._assistant_is_responding = False
             await self._report_assistant_response_ended()
+            self._handling_bot_stopped_speaking = False
+
+        self._handling_bot_stopped_speaking = False
 
     async def _handle_function_call_result(self, frame: AWSNovaSonicFunctionCallResultFrame):
         result = frame.result_frame

From 4ffdc3b77ceed4168f747384ac71ae1acd0ae941 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Thu, 1 May 2025 21:54:36 -0400
Subject: [PATCH 62/97] [WIP] AWS Nova Sonic service - do hacky direct
 manipulation of the context for now, since I can't seem to get assistant
 context aggregation working properly with frames, grr

---
 src/pipecat/services/aws_nova_sonic/aws.py    | 19 +++++--
 .../services/aws_nova_sonic/context.py        | 54 +++++++++++++------
 2 files changed, 54 insertions(+), 19 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index e8da485a8..35f312a0e 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -147,7 +147,6 @@ class AWSNovaSonicLLMService(LLMService):
         self._ready_to_send_context = False
         self._handling_bot_stopped_speaking = False
 
-
     #
     # standard AIService frame handling
     #
@@ -760,8 +759,10 @@ class AWSNovaSonicLLMService(LLMService):
         content_end = event_json["contentEnd"]
         stop_reason = content_end["stopReason"]
         # print(f"[pk] content end: {content}.\n  stop_reason: {stop_reason}")
-        # if content.role == Role.ASSISTANT:
-        #     print(f"[pk] assistant content end: {content}.\n  stop_reason: {stop_reason}")
+        if content.role == Role.ASSISTANT:
+            # print(f"[pk] assistant content end: {content}.\n  stop_reason: {stop_reason}")
+            if content.text_stage == TextStage.FINAL:
+                print(f"[pk] assistant FINAL text: {content.text_content}")
 
         # Bookkeeping: clear current content being received
         self._content_being_received = None
@@ -803,6 +804,18 @@ class AWSNovaSonicLLMService(LLMService):
         print(f"[pk] TTS text: {text}")
         await self.push_frame(TTSTextFrame(text))
 
+        # TODO: this is a (hopefully temporary) HACK. Here we directly manipulate the context rather
+        # than relying on the frames pushed to the assistant context aggregator. The pattern of
+        # receiving full-sentence text after the assistant has spoken does not easily fit with the
+        # Pipecat expectation of chunks of text streaming in while the assistant is speaking.
+        # Interruption handling was especially challenging. Rather than spend days trying to fit a
+        # square peg in a round hole, I decided on this hack for the time being. We can most cleanly
+        # abandon this hack if/when AWS Nova Sonic implements streaming smaller text chunks
+        # interspersed with audio. Note that when we move away from this hack, we need to make sure
+        # that on an interruption we avoid sending LLMFullResponseEndFrame, which gets the
+        # LLMAssistantContextAggregator into a bad state.
+        self._context.add_assistant_text_as_message(text)
+
     async def _report_assistant_response_ended(self):
         # Report that the assistant has finished their response.
         print("[pk] LLM full response ended")
diff --git a/src/pipecat/services/aws_nova_sonic/context.py b/src/pipecat/services/aws_nova_sonic/context.py
index 4e2a4fcc1..647e40ae6 100644
--- a/src/pipecat/services/aws_nova_sonic/context.py
+++ b/src/pipecat/services/aws_nova_sonic/context.py
@@ -11,13 +11,19 @@ from enum import Enum
 from loguru import logger
 
 from pipecat.frames.frames import (
+    BotStoppedSpeakingFrame,
     DataFrame,
     Frame,
     FunctionCallResultFrame,
+    LLMFullResponseEndFrame,
+    LLMFullResponseStartFrame,
+    LLMMessagesAppendFrame,
     LLMMessagesUpdateFrame,
+    LLMSetToolChoiceFrame,
     LLMSetToolsFrame,
-    LLMTextFrame,
-    TranscriptionFrame,
+    StartInterruptionFrame,
+    TextFrame,
+    UserImageRawFrame,
 )
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
 from pipecat.processors.frame_processor import FrameDirection
@@ -110,6 +116,15 @@ class AWSNovaSonicLLMContext(OpenAILLMContext):
             "content": [{"type": "text", "text": text}],
         }
         self.add_message(message)
+        # print(f"[pk] context updated (user): {self.get_messages_for_logging()}")
+
+    def add_assistant_text_as_message(self, text):
+        message = {
+            "role": "assistant",
+            "content": [{"type": "text", "text": text}],
+        }
+        self.add_message(message)
+        # print(f"[pk] context updated (assistant): {self.get_messages_for_logging()}")
 
 
 @dataclass
@@ -134,21 +149,28 @@ class AWSNovaSonicUserContextAggregator(OpenAIUserContextAggregator):
 
 
 class AWSNovaSonicAssistantContextAggregator(OpenAIAssistantContextAggregator):
-    # AWS Nova Sonic is a speech-to-speech model.
-    # It behaves like a combined STT + LLM + TTS service, emitting all of:
-    # - TranscriptionFrame (for user text)
-    # - LLMTextFrame (for assistant text)
-    # - TTSTextFrame (for assistant text)
-    # In a "standard" pipeline (with separate STT + LLM + TTS services):
-    # - The TranscriptionFrame is swallowed by the LLMUserContextAggregator
-    # - The LLMTextFrame is swallowed by the TTS service
-    # Meaning the LLMAssistantContextAggregator only receives the TTSTextFrames. It actually
-    # implicitly  assumes it will receive only *non-duplicate* *assistant-related* text frames, and
-    # will misbehave otherwise (double-counting assistant text, or mis-categorizing user text as
-    # assistant text).
-    # So, let's override process_frame here to ignore TranscriptionFrames and LLMTextFrames.
     async def process_frame(self, frame: Frame, direction: FrameDirection):
-        if not isinstance(frame, (LLMTextFrame, TranscriptionFrame)):
+        # HACK: For now, disable the context aggregator by making it just pass through all frames
+        # that the parent handles (except the function call stuff, which we still need).
+        # For an explanation of this hack, see
+        # AWSNovaSonicLLMService._report_assistant_response_text_added.
+        if isinstance(
+            frame,
+            (
+                StartInterruptionFrame,
+                LLMFullResponseStartFrame,
+                LLMFullResponseEndFrame,
+                TextFrame,
+                LLMMessagesAppendFrame,
+                LLMMessagesUpdateFrame,
+                LLMSetToolsFrame,
+                LLMSetToolChoiceFrame,
+                UserImageRawFrame,
+                BotStoppedSpeakingFrame,
+            ),
+        ):
+            await self.push_frame(frame, direction)
+        else:
             await super().process_frame(frame, direction)
 
     async def handle_function_call_result(self, frame: FunctionCallResultFrame):

From 3784bdbd27ef06e857537b9e08f74a74f865ffeb Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Fri, 2 May 2025 10:42:52 -0400
Subject: [PATCH 63/97] [WIP] AWS Nova Sonic service - in our hacky direct
 manipulation of the context, aggregate assistant text rather than recording
 every chunk as a separate message

---
 src/pipecat/services/aws_nova_sonic/aws.py    | 13 ++++++------
 .../services/aws_nova_sonic/context.py        | 20 +++++++++++++++----
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index 35f312a0e..e7b1fd8e6 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -759,10 +759,8 @@ class AWSNovaSonicLLMService(LLMService):
         content_end = event_json["contentEnd"]
         stop_reason = content_end["stopReason"]
         # print(f"[pk] content end: {content}.\n  stop_reason: {stop_reason}")
-        if content.role == Role.ASSISTANT:
-            # print(f"[pk] assistant content end: {content}.\n  stop_reason: {stop_reason}")
-            if content.text_stage == TextStage.FINAL:
-                print(f"[pk] assistant FINAL text: {content.text_content}")
+        # if content.role == Role.ASSISTANT:
+        # print(f"[pk] assistant content end: {content}.\n  stop_reason: {stop_reason}")
 
         # Bookkeeping: clear current content being received
         self._content_being_received = None
@@ -814,7 +812,7 @@ class AWSNovaSonicLLMService(LLMService):
         # interspersed with audio. Note that when we move away from this hack, we need to make sure
         # that on an interruption we avoid sending LLMFullResponseEndFrame, which gets the
         # LLMAssistantContextAggregator into a bad state.
-        self._context.add_assistant_text_as_message(text)
+        self._context.buffer_assistant_text(text)
 
     async def _report_assistant_response_ended(self):
         # Report that the assistant has finished their response.
@@ -825,11 +823,14 @@ class AWSNovaSonicLLMService(LLMService):
         print("[pk] TTS stopped")
         await self.push_frame(TTSStoppedFrame())
 
+        # For an explanation of this hack, see _report_assistant_response_text_added.
+        self._context.flush_aggregated_assistant_text()
+
     async def _report_user_transcription_text_added(self, text):
         print(f"[pk] transcription: {text}")
         # Manually add new user transcription text to context.
         # We can't rely on the user context aggregator to do this since it's upstream from the LLM.
-        self._context.add_user_transcription_text_as_message(text)
+        self._context.add_user_transcription_text(text)
 
         # Report that some new user transcription text is available.
         if self._send_transcription_frames:
diff --git a/src/pipecat/services/aws_nova_sonic/context.py b/src/pipecat/services/aws_nova_sonic/context.py
index 647e40ae6..3fac65a72 100644
--- a/src/pipecat/services/aws_nova_sonic/context.py
+++ b/src/pipecat/services/aws_nova_sonic/context.py
@@ -53,12 +53,19 @@ class AWSNovaSonicConversationHistory:
     messages: list[AWSNovaSonicConversationHistoryMessage] = field(default_factory=list)
 
 
-@dataclass
 class AWSNovaSonicLLMContext(OpenAILLMContext):
+    def __init__(self, messages=None, tools=None, **kwargs):
+        super().__init__(messages=messages, tools=tools, **kwargs)
+        self.__setup_local()
+
+    def __setup_local(self):
+        self._assistant_text = ""
+
     @staticmethod
     def upgrade_to_nova_sonic(obj: OpenAILLMContext) -> "AWSNovaSonicLLMContext":
         if isinstance(obj, OpenAILLMContext) and not isinstance(obj, AWSNovaSonicLLMContext):
             obj.__class__ = AWSNovaSonicLLMContext
+            obj.__setup_local()
         return obj
 
     def get_messages_for_initializing_history(self) -> AWSNovaSonicConversationHistory:
@@ -110,7 +117,7 @@ class AWSNovaSonicLLMContext(OpenAILLMContext):
         # NOTE: we're ignoring messages with role "tool" since they can't be loaded into AWS Nova
         # Sonic conversation history
 
-    def add_user_transcription_text_as_message(self, text):
+    def add_user_transcription_text(self, text):
         message = {
             "role": "user",
             "content": [{"type": "text", "text": text}],
@@ -118,11 +125,16 @@ class AWSNovaSonicLLMContext(OpenAILLMContext):
         self.add_message(message)
         # print(f"[pk] context updated (user): {self.get_messages_for_logging()}")
 
-    def add_assistant_text_as_message(self, text):
+    def buffer_assistant_text(self, text):
+        self._assistant_text += text  # TODO: determine if we need to add space or something
+        # print(f"[pk] assistant text buffered: {self._assistant_text}")
+
+    def flush_aggregated_assistant_text(self):
         message = {
             "role": "assistant",
-            "content": [{"type": "text", "text": text}],
+            "content": [{"type": "text", "text": self._assistant_text}],
         }
+        self._assistant_text = ""
         self.add_message(message)
         # print(f"[pk] context updated (assistant): {self.get_messages_for_logging()}")
 

From cc1f4ba81c24ff0928eb336507ee20b800d5946f Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Fri, 2 May 2025 11:31:56 -0400
Subject: [PATCH 64/97] [WIP] AWS Nova Sonic service - add a hacky way of
 programmatically triggering an assistant response

---
 examples/foundational/39-aws-nova-sonic.py    |  18 +++-
 pyproject.toml                                |   1 +
 src/pipecat/services/aws_nova_sonic/aws.py    |  92 ++++++++++++++++--
 src/pipecat/services/aws_nova_sonic/ready.wav | Bin 0 -> 23484 bytes
 4 files changed, 100 insertions(+), 11 deletions(-)
 create mode 100644 src/pipecat/services/aws_nova_sonic/ready.wav

diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py
index f08cfad04..07670f75a 100644
--- a/examples/foundational/39-aws-nova-sonic.py
+++ b/examples/foundational/39-aws-nova-sonic.py
@@ -83,12 +83,16 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection):
         ),
     )
 
-    # Specify initial system instruction
+    # Specify initial system instruction.
+    # HACK: note that, for now, we need to inject a special bit of text into this instruction to
+    # allow the first assistant response to be programmatically triggered (which happens in the
+    # on_client_connected handler, below)
     # TODO: looks like Nova Sonic can't handle new lines?
     system_instruction = (
-        "You are a friendly assistant. The user and you will engage in a spoken dialog "
-        "exchanging the transcripts of a natural real-time conversation. Keep your responses short, "
-        "generally two or three sentences for chatty scenarios."
+        "You are a friendly assistant. The user and you will engage in a spoken dialog exchanging "
+        "the transcripts of a natural real-time conversation. Keep your responses short, generally "
+        "two or three sentences for chatty scenarios. "
+        f"{AWSNovaSonicLLMService.AWAIT_TRIGGER_ASSISTANT_RESPONSE_INSTRUCTION}"
     )
 
     # Create the AWS Nova Sonic LLM service
@@ -117,7 +121,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection):
             {"role": "system", "content": f"{system_instruction}"},
             {
                 "role": "user",
-                "content": "Say hello!",
+                "content": "Tell me a fun fact!",
             },
         ],
         tools=tools,
@@ -151,6 +155,10 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection):
         logger.info(f"Client connected")
         # Kick off the conversation.
         await task.queue_frames([context_aggregator.user().get_context_frame()])
+        # HACK: for now, we need this special way of triggering the first assistant response in AWS
+        # Nova Sonic. Note that this trigger requires a special corresponding bit of text in the
+        # system instruction. In the future, simply queueing the context frame should be sufficient.
+        await llm.trigger_assistant_response()
 
     # Handle client disconnection events
     @transport.event_handler("on_client_disconnected")
diff --git a/pyproject.toml b/pyproject.toml
index d6d05c00c..7ce167d77 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -96,6 +96,7 @@ where = ["src"]
 
 [tool.setuptools.package-data]
 "pipecat" = ["py.typed"]
+"pipecat.services.aws_nova_sonic" = ["ready.wav"]  # glob is relative to the package directory
 
 [tool.pytest.ini_options]
 addopts = "--verbose"
diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index e7b1fd8e6..5b69810f3 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -8,8 +8,10 @@ import asyncio
 import base64
 import json
 import uuid
+import wave
 from dataclasses import dataclass
 from enum import Enum
+from importlib.resources import files
 from typing import Any, List, Optional
 
 from aws_sdk_bedrock_runtime.client import (
@@ -146,6 +148,8 @@ class AWSNovaSonicLLMService(LLMService):
         self._context_available = False
         self._ready_to_send_context = False
         self._handling_bot_stopped_speaking = False
+        self._triggering_assistant_response = False
+        self._assistant_response_trigger_audio: bytes = None  # Not cleared on _disconnect()
 
     #
     # standard AIService frame handling
@@ -180,8 +184,7 @@ class AWSNovaSonicLLMService(LLMService):
         if isinstance(frame, OpenAILLMContextFrame):
             await self._handle_context(frame.context)
         elif isinstance(frame, InputAudioRawFrame):
-            # TODO: check if _audio_input_paused? what causes that?
-            await self._send_user_audio_event(frame)
+            await self._handle_input_audio_frame(frame)
         elif isinstance(frame, BotStoppedSpeakingFrame):
             await self._handle_bot_stopped_speaking()
         elif isinstance(frame, AWSNovaSonicFunctionCallResultFrame):
@@ -211,6 +214,15 @@ class AWSNovaSonicLLMService(LLMService):
             self._context_available = True
             await self._finish_connecting_if_context_available()
 
+    async def _handle_input_audio_frame(self, frame: InputAudioRawFrame):
+        # Wait until we're done sending the assistant response trigger audio before sending audio
+        # from the user's mic
+        if self._triggering_assistant_response:
+            return
+
+        # TODO: check if _audio_input_paused? what causes that?
+        await self._send_user_audio_event(frame.audio)
+
     async def _handle_bot_stopped_speaking(self):
         # Protect against back-to-back BotStoppedSpeaking calls, which I've observed
         if self._handling_bot_stopped_speaking:
@@ -316,6 +328,14 @@ class AWSNovaSonicLLMService(LLMService):
         # Start receiving events
         self._receive_task = self.create_task(self._receive_task_handler())
 
+        # If we need to, send assistant response trigger
+        if self._triggering_assistant_response:
+            # If the trigger was the first audio chunk sent on this connection it'd be ignored (I'm
+            # guessing the LLM can't quite "hear" the first little bit of audio sent). So send a bit
+            # of leading blank audio first.
+            await self._send_assistant_response_trigger(lead_with_blank_audio=True)
+            self._triggering_assistant_response = False
+
     async def _disconnect(self):
         try:
             # Clean up receive task
@@ -340,6 +360,8 @@ class AWSNovaSonicLLMService(LLMService):
             self._assistant_is_responding = False
             self._context_available = False
             self._ready_to_send_context = False
+            self._handling_bot_stopped_speaking = False
+            self._triggering_assistant_response = False
         except Exception as e:
             logger.error(f"{self} error disconnecting: {e}")
 
@@ -490,11 +512,11 @@ class AWSNovaSonicLLMService(LLMService):
         '''
         await self._send_client_event(text_content_end)
 
-    async def _send_user_audio_event(self, frame: InputAudioRawFrame):
+    async def _send_user_audio_event(self, audio: bytes):
         if not self._stream:
             return
 
-        blob = base64.b64encode(frame.audio)
+        blob = base64.b64encode(audio)
         audio_event = f'''
         {{
             "event": {{
@@ -639,7 +661,7 @@ class AWSNovaSonicLLMService(LLMService):
                     elif "contentEnd" in event_json:
                         # Handle a piece of content ending
                         await self._handle_content_end_event(event_json)
-                    elif "completionStart" in event_json:
+                    elif "completionEnd" in event_json:
                         # Handle the LLM completion ending
                         await self._handle_completion_end_event(event_json)
 
@@ -839,7 +861,7 @@ class AWSNovaSonicLLMService(LLMService):
             )
 
     #
-    # Context
+    # context
     #
 
     def create_context_aggregator(
@@ -855,3 +877,61 @@ class AWSNovaSonicLLMService(LLMService):
         assistant = AWSNovaSonicAssistantContextAggregator(context=context, params=assistant_params)
 
         return AWSNovaSonicContextAggregatorPair(user, assistant)
+
+    #
+    # assistant response trigger (HACK)
+    #
+
+    # Class variable
+    AWAIT_TRIGGER_ASSISTANT_RESPONSE_INSTRUCTION = (
+        "Start speaking when you hear the user say 'ready', but don't consider that 'ready' to be "
+        "a meaningful part of the conversation other than as a trigger for you to start speaking."
+    )
+
+    async def trigger_assistant_response(self):
+        """Send or queue the "ready" trigger audio (HACK); False if one is already pending."""
+        if self._triggering_assistant_response:
+            return False
+        self._triggering_assistant_response = True
+
+        # Read and cache the audio bytes; wave.open() won't close a file object it's handed
+        if not self._assistant_response_trigger_audio:
+            file_path = files("pipecat.services.aws_nova_sonic").joinpath("ready.wav")
+            with file_path.open("rb") as f, wave.open(f, "rb") as wav_file:
+                self._assistant_response_trigger_audio = wav_file.readframes(wav_file.getnframes())
+
+        # Send the trigger audio, if we're fully connected and set up
+        # NOTE: maybe there's a better way to determine whether we're done setting up?
+        if self._receive_task:
+            await self._send_assistant_response_trigger()
+            self._triggering_assistant_response = False
+
+    async def _send_assistant_response_trigger(self, lead_with_blank_audio=False):
+        # TODO: if/when we make bitrate, etc configurable, avoid hard-coding this
+        chunk_size = 640  # equivalent to what we get from InputAudioRawFrame
+        chunk_duration = 640 / (
+            16000 * 2
+        )  # 640 bytes of 16-bit (2-byte) PCM mono audio at 16kHz corresponds to 0.02 seconds
+
+        # Lead with blank audio, if needed
+        if lead_with_blank_audio:
+            blank_audio_duration = 0.5  # much less than this and it doesn't reliably work
+            blank_audio_chunk = b"\x00" * chunk_size
+            num_chunks = int(blank_audio_duration / chunk_duration)
+            for _ in range(num_chunks):
+                await self._send_user_audio_event(blank_audio_chunk)
+                await asyncio.sleep(chunk_duration)
+
+        # Send trigger audio
+        # NOTE: this audio *will* be transcribed and eventually make it into the context. That's OK:
+        # if we ever need to seed this service again with context it would make sense to include it
+        # since the instruction (i.e. the "wait for the trigger" instruction) will be part of the 
+        # context as well.
+        # print(f"[pk] sending trigger audio! {len(self._assistant_response_trigger_audio)}")
+        audio_chunks = [
+            self._assistant_response_trigger_audio[i : i + chunk_size]
+            for i in range(0, len(self._assistant_response_trigger_audio), chunk_size)
+        ]
+        for chunk in audio_chunks:
+            await self._send_user_audio_event(chunk)
+            await asyncio.sleep(chunk_duration)
diff --git a/src/pipecat/services/aws_nova_sonic/ready.wav b/src/pipecat/services/aws_nova_sonic/ready.wav
new file mode 100644
index 0000000000000000000000000000000000000000..ca932afa66d69dcf3626a54f9b171c767eb4d0e9
GIT binary patch
literal 23484
zcmeHvXLuCHvgqX5q}^4PKoTMfM9!FGvdIQa&KQgV1IB<2CK?lr$=N2EWWWY&a?TkH
z#$a+3LP7||jkB|p`qgOe++7Lzob&GY-oLy3&2)98uCDH$&{MtL+O=tOs2_&)YTmQ;
z@G%n;?HGpPkiKaOz;p~FFgw<<-ACO^K(SrtHXpY8s7-e(z#WHtHN0MpYBlRst68H4
zHf+emA!9+y|Nj2h1OMxRf8_xZBVkUdKO7DOKNVKstd`Z8M?g|@9a76hld472q-xXQ
z0GEpWf0HyTbX{7Vdg(vO$^UA{f5!WtaQ)x#6!k-(9Dr2+={f_DjwzZK&ruXx*lWQ$
zG)htaV2=Me{O_~|TNOOQ0<bpnGz)aBDy5Ca07f0F+Gy3N0a*1Wj>$-ZBqK!~;P6xa
zD`}Vl;UXthckoZED=;gxa`9)q_M%kPZnZ6%jls~n_+U%`4#;S@;Bufgt$zKiwjwZ>
z16(?!c~iZ!2JF*jiOQj3H4a^ehSRmGx)D+j^~#}3X!5E~m0P<WkQfCJh0z5E3%LOf
z8dD(_5>t>21zZfsQE;dl)N7FpeJ3M75)hAkkh}$l8_pi&LR`V&Ex>X}1RkNFzt!uR
zYLQwm4cb}Ef|=m3LJGrdSU98zNRe0!oTISl0w@w7?T7|0l^zPu9ATIlB#e+$Js8-o
z+J+E;NnoEJYDG}5YM~Ec57;6hCty6t4JjApp&XQf^3iLQjozY6z^G6TBnR;1LsHMG
zw4}Nc8Nq4;*cwt$uNf?|VX<IuDJ&kVf>pq(7aV0FPsGY&30P&UKH#fkwXw3mVJq-0
ziK+hbKu!11pXd&{hAyM4=sY@&j-rF;Bszi)qf-DMMF-FcIG+OCDd4z@9-^D*Z}bS=
zLJ!b$uv7Id34Sob5e9bDg_hUHlCdPHuM*Y@>x9+9x?^py-dH270oDO~5330F)PlSW
z9E}UCwt-bLN&_!sLk)kROK|K%zoUy#`$4ef09bbz9Y=ow^&+~BUV=Z~f_HFeH3mHx
z3;j|8D~Z*>s=?6+YYp`@#6HAEV7;)x*d%NQ_6;@?n}DsvzQ8`krePDY(Ln74R+TR3
zJsfkQ`{*&$@f{k6TA>;!1UVJAa#Ojc99Di+eo_`I8<m}qpHVI;sY;&mR-q7s;!s`m
zA)1YT0k6=Ys~6~Bi(SJ6ED3LokHhETYw=(3pYh}P5&R_nCw>k;jjzY&;zRNG@S1oC
zj<7e_73?_n6ZSdys2Q}wh`GSO$I*H;4fRIvp$e!3G9V+eB0HiGuc&splxS26)r6Y+
zqw#1I8jeN)_m}8Xv=Qw84SaVQ`X&u}mxlgN!un%#ut!)f_96Zhzkw?_Lmb5y<GI)(
z7#pLZt%CBSvRxi84^-|b>yZU-L^`QUR16(KH>2Js58~JH@%U7%EA|U=D<|ZAk}P$R
zdP-J#f;>bXDE%P*A*}NE<sH6k-!9*L-vZx{zUIDiKDTc-|Be5R&_McIUZ#{qlQ28@
zVjeM@JWhQ|lZ?#lV*4BJ7}gmln>?m==2SCoxny2!u3>&=3O9KTpBjE*>oNw~LWPiT
z@jcK_Q_wEumYgVWlun9&3)O_N{w=)6x5wAlcZd6q>%%2-r@YO*-MvG+d%PLmhTJv|
zaqE3``K5eyzr%l7*eBkSDk_ywB-R>VKzvJ%rbf{%nYT<A_BGoA>f3L)We^RtG0k8w
z@&?Y3U`#S58_OBr7=AajHE`?}mS&eSx%4dhF2zub$O_~kVk%LE2qOq$E?yo_h0*>L
zB_Rn$%@*Ys<(|SR!%!tO2`xZXQ8HSin3NynvhuG|ed!Bnm1L2-$=UJ<r7`*$dTkiS
z;@2=5?~lL4s^S*>BkVSe_UdSkk}Y?Y*GO5SNwf+>{6F!Fe1CGH*WmrcbJLyeYU7G`
zSzM{kRA)CA>)!5e=_%n&;v#&Pe82Kn{fC7u;wWjZJVjZ8oES@VC+AYz=;cg(_7vOI
zFwd~taN3Y){J_}M*vMGQxXvK5No*gcEj^1`PfjOB;RCVn&}wC!+)*aw)l$49ix0)m
z#B)L^!QuD#%Lz#ML~5j9*kr;-b!Sf+W%I6(XxmoXQQIZkWZPToB5SPmZpe<1i6Lb}
zT87jKIb-P<^4bz^S!rHr9cIX38W|>%vvGn<r-vD{jIow>Hrl?{e2A(;y73%tgY%xV
zdR~d_so9O*I<jrf<KAcfM&tss#qJ2D?X^tJ85&jdY;i1b)^-eXdAT;i*UCsN1-++K
zk}$rHvujr5t3!{g-A%uB;Knb1ow%R$qI1@CPolJvXie=VUZOc-4}L3mn~UQ=lZFxZ
z*mEIy5uqi2E;Y8yzA}l4ZR3>4kF5LHbmB7VfG%O5k!kdAY-9F1{S)Dm>-w`i3mk`Y
zDrEUH+rOEW@#&j}S?RemouOQuSYFw$tW#zxZRI+0j$9wRNVPO(hl~tc6<IsxVr*QA
z=CK`OrbT@eKG0U*GRRnsZNY42-Zwll);Eu{Ot6$PTaEYFkC{vKIi{8&*<`SE4@tLN
zwp_7%Z9ZYS7BbFS!hYQL#QKw|9g{+5lQ)?YYzDhLWP<T$syqJ3|G6*8H8$^xGt{$%
zTj`y|xxEQ_j?7P94NY17VpvM;wB6~SXLQN3x|4(p(iEnmtyx$D+iCU=`Bo6~C+1wp
zz2k6u>nJe>%D93;h~qukqf*vCaXz*_9{Z$2s^^U%t9$nL+%KK({HnPo=S}wnkBj?T
znof>3^tD!qyjikg!u9xu@hP!#_!HYm>pCkIdOLDjiR8p7WkM4@C3i+9g^VB{_`7@e
zxx2c?dEXavu{i1_HJoaJ@AhXnKhC?BGdB0Bvzfmo(bM>|H6si~{uZ$&yj^(L(0$f3
zAzy-r?po%Cth7H09UeN-cGU7Ey9ED2E+y5J>dEslAJKy>OIq;r=u6onrYetdnK{a~
zH}p1^W<FrPWS#6&YnO;BQMDsW+s($KhQsJnDO5Qo9l-AMjhywJSJPXjEPD3$<AG`W
zUwxhS_3PTtC#U_M^*Fzqm_=_3|1dhj@*B37>ympR?PB^G*KK8)>3-Ok5hE-Wgqzv3
zQ<gs%^Wfz3V>z3>ZTLNMjDLOZ-t_({gHyYv9nOwaZW`BwQc+}yp7GnGV<Kin$YHT|
z!PY3eN5s&`-(nh<dYNdBPcGRg(qg%SmlOK>&j^E*@0g!0RYJxahZ5C=Gx=*W+q}un
zZszGE*_mUORd$b^vgMfnGHihnG|<?<(!xG0;!0%mh*LJF=|f_waM)YN)yuP#+u?tx
zu-I_)ll06t%bVi5;jG407OmI-svPq%`zbq;j$(6+>1NvYDC9f4!}iKBz&IT%t(+Dr
zde*q5oV>h<oS3YVdABpF=HzAfN}Kg+<?|P>hj3k$CT2FaM<R+X#cuE?=dFEf%DU}t
z%2tU8iLDe-&tP-4d*1wh-tEr!!_!B)@1gfGhOgmzp4sK)+838IdU*drpIPR{Y$>@Y
z!C30|_?HRm;-|(fiTl0;AC(+aHZ~<XF=kBUG+S#!7iEdp<NAqPDJL=gOp`)tTY8bN
zeSbRSyo#O;!j}YPIUZ5HL|lmnk!Jfd3&V~l`YD<6OtOjLq4_&ogsq$T5_ZU0C*$Rd
zK`B?#=jJx>?Nhd5xDw(^bj-=I<zns?!WC+!bx!!}@I!WFddN<t9%8$QPQ(P-%~qw&
z6fgPxiSCT7jhQ>&?9W=2*E_e7W3i)a4*q8SvpTPCxet&fqt_)IuCOGLu(apv=Xf&r
z=C=}mHIE8AW9z{#^nR9h{eJ%4At}$^c=PY^>*NM9%TIS-bRXeH3B6Gq^~Urh{MWd&
zvL!24s5H7<iMSeJTTQo(Q>@FPGUKY2`XlzG1(PYq?aXCa?OpZdxrVXk1ICl|$7r3f
zLOM;<GC#2IjSwPFMT`pFVVz?8-JTxO(=ZlaDz@b2xQDrU?<RjKey(eB&aZDSWjxBN
z<#PB>p&#IRW3R*rlf_@q9=fWjWk|C1vycadQOsp3l?<h=)HY&-vW9!(=A4b3OMEVO
zd%l;C$;-=Wnl<n3t^7Or-F?kvkw53|lbP?>OD5Un_)X=M(xsz1D<5V5{+i49)Ok?e
zKnwT<UuaIR6!*Q0_gybivpHwH`yO9QvPj32l~@Wnj^0J?qN6SG5mic$P4*@eWiLj{
zW+OEepGf^3vN>XTL@#Rw(VoX$O`RRNjS6MxWtnV#X`IV!V-K36LfuhaW6wojj`D^t
z3ab&mBz#$z(OQO{B-6fl_c_--Z!dm;@4Ba|<89W6x4&lp=I$$>B^FcTD3*FdCXvz9
z8d{-Cuts<qDPx>tY{+)TWq&j8V)t<OGp>PuzIZ_R(HHHx<*1QA$T@<W#BcLW5$wLt
zc!rzq|6RQ5f9n)H@8O+n17ac*UM2h%CE}~IH>F0WrRHqp6NtgsRjy1{*OZPAqV7+6
zQ6c-XTj8FFZ}8sK&qNpcB{hY7P6+g$*2yv6#I&Si<qwyrY#D)_mv`V5%-*mq;WMp$
z$tHXk*DTjye<k9Gp^_!mY%@$`mM|MkGwu5#+s2%T9v<;k=y+SGwMj@zqnSJ*?esnL
zI9>f+C)`y$V_Zudee#~?#yU7pC-HAAgZiF+2CZLBH)B^Bce16JnIulWC94sy<#y70
zKGL(>u_nKjC*M6m*d|=ycX9LGz1(fYc%d$lhRe!Sl*@1ToaHYGccpYsYR)HOW=QF1
zW1JFo(At-{o%bg7^R$uKn|<L#5^Br2bDzClnDX6=QK>1}Z@dr1zQ{!984sDK8jsLT
z$u)GAxqd{gxS6E~ByNfA7_t(b;O6+k(Lm$Z)=}0BV*~7lw}tb9=dkpKiLh-CiwbLM
z|1IR8mAAeObwsX-{vbLgvQ^k!>l9NHs<gb{m%ttKp5l)9D*H@alKZQ?+-x#Wc14Pn
z$xqpXhM(Dc<P@?ld7iq${>4Ta%NR45LG)y$t+<O@=DL&rEPsRdvj320KbPSd#7*(`
z;zD_^^qsgy`9LTw-uK?~O;ku!S$^pD=hdYuhnI=@rKA*^X#N@v%{r1Ek=Y=(ia18*
zy>s*Gzpj_crY5AE&Y0|6<ugdH&~|dX$!5tldYDPnJj1oH$|cv7`lj@U@g>7<Q?>lV
zy`98NQZb#fOkpc1H$4gYEghHn71ZL8D`5x2cZ3eGPql^GlfpYjej4kEeH+yv!eeh|
zsmWecX7J-Z=RNh|IXBhE`tG=!I$q`U%w6X>Bh4T;GxKN%l}e4LBB{AlfA*~TJ4++;
zV#Y!XvQ;3x6P$mz5Awr3rMU9mGfv7og=^s6E%X*z2%Dt}zN4;ip}cgAtWV5xAIk5I
zS)$67x>qthysJ4}Zjih6)!@u0jwRv>d86lS#_uTuUPvjvl-93r<n8sn@RRaF;=Ey9
zh{f`N_7V#FYxs<Wkg~%{F)^2{m&rJxJ^z=~&$!8+VcDmA>^_;#xiW>p_&u^VyASTw
z)lBy-eQkn$Md&Z~-eGUU?NMKp;G*|hXS1b<=4iCi4V57|yr&$_J;@KxYndPKx#=Gx
zb(Xs;wNX7Z7_Uq9X6(js#;Ndz(Vn?NO(H%)rKRrjK(PXU(fhHpqUS^JS^g1c_qOn4
zyPCT&k3YWzpDu?IF6BeVn!HGJbjc4YnM(Bw?LdV4-)4QA6Xn>zpAzng&+|8=m3!7A
zrDs}|j3uxCcr!59>HggJL|lb0rVcPe*<|*X`H#qH38xdYOH2*xWB%OGgKkUv3}-Ce
zEIZg6@?q~qxK>z24&LN4um{8sbS3sj!(CHy$ZPA((7O>K(RggN5@TYIhR24iViwa3
zU6;*d;~5IS$U7ZfazpZJIgfZG-ypuRza#JDWucDpA?iU?rdG4_4Q|r{<1O|IHW4k8
zNn*cp32Q0;DLRC<{&B83E-Y`5V~gj2Yq6)XceuZ_P}RR9=W13@u|y<SS}L0oeu{1(
zHgo=#KioT69Ep;Z5H2h8lb4Q{`_s#0QkhFLu(t#9A95YU19BCmmF$%7V)qO`gq4b0
zo!BpaVO0MR+O*x$FVr8gDKaXolR1`J1<%ME`Km&Hk&_0>UnoDLEYuyZhAl#;@pz_P
zNQdyfF<~Wl#Qqgl#q!uN+jz^;&3-+Mv&ORz<=fmM_ef7g?*VQ%_qBJv`!Cl_*Gzb~
z$@JFX)5K(SAKsQTl)FkC@?wL@z03!uP39iPSn?0~zON?l_v2C{;jNIydxY)Y)A@ta
zXJqvB_)T1-G5V~*D*E%<W;M<E*!Mu*h%;DYVV>*noX@lFX3xkun>Qzat!pyJ3FGB?
z;!AH;=T6s2p#>dh>l0o!{IE67c$%ulKC&#0+!;GJwq@jJ)|Up4p|`1q`D>$}E=P5s
zPSQ8v&3P)(2qjBTq%tUhUTvIgK41zpyrrIz<!FZix3md~vOG4HG6?i5>LFQ@e1<o{
zE-L-tt^PaoR?ZbR`4YXIyz6`xp_If4ef;fwX5U@@mUKd{p+qWw%DquzL@KkyxBmXZ
z6w#<uReH+Dg$hCm-$ZVUd%bU!a+RHDx@YKv-s7(4^N#oV%UBtrmQtFp;#<N`mnw_f
z`L}M9JKfzuSgw@Ewo4WGzdfhDNBAewee3{vlKhISNem<&5&6^<#%}0h*lXx(Y+yE6
z{xVN6T{e8eR%5?1EHWK3H!&v}k1(Ok7G|EIf~m53fa#23F1vwg#%y2~FpZdH%$Lk)
zrX54k!^rvgd{{9{!q#BvSS+4~Hpz|P&RSoLmHaSw-;;f!A|we;e;Hw>;1tG)ABc@b
zK?oBs!4+wo;1Z1DX3-%ollDjkDPLG2%oT4+D`m4nD>IZSN*9><ebFoBBW1IkpsY}u
zqHoYD^gu!KT=|%ss0>lcDtqOsa%*X&_%$R(6hXID93lQH-W1=6?WLabJS9t6sU#_*
zl@HJhOvbkm2Z^1;4&pgMkw?j)R2*$#4l$M4-7ICOWvF7H4ZpFtfiPN3YfKYNXN(&S
zEe)Lv&kb9QeT^Fo?bx|YI1|m>r%zI?Niz|GFNM{`+NgqZPx?*F5f%zvg!hEQ{wDtG
zd<}jfzl}f7ALgg?FMI>|r~F0#Sz)61NVwzw#J|ShTuhUeD&x@$bVli_d<ZKCEu}@U
z5>sBOk4B@3iXySnZjlq$NFTzxJ_f4>^`x`@%lvu&ui}2Gx|AuD7bl3XrD5_jd87QJ
zG6z;E&ZD{LCaeVgLViwpsDAWQritOM@sOzrtOsl}PB+ao@ur=o{iX<WtB_@uJo9;D
zf5RJgjIpt?x#1A=jJas2XjsF%Po)s4cp~n`YT|o|c#^~yDl4R$QnqYVE=f-DnmAWJ
zA#al(Ntxo$Vkhy0xKV5_(ZYNo!oNeXh!=!(e+j9hcWRD1mtY-X2cqqk80@s~q{HFp
z!lki(rap1aQ^I%F`<~B|`{d26%(y(l)!I|a*TuCz^X${qr=>Hm^W(Ak=(V^*s4a{W
z`=C4YU~`&nT12@LPvfVRbcFA-&Is!h+asY-VnVz<wr!YX{(&Wpw~fb)hnRYF70OKX
zR;v4Nz?#Wi|0(ffB?eoMCWx*59%&bDr#canNs5`po?$YWY3xh71ksV8nGejOaUyw9
z4wt6G2>VW1qU?cwtt8y=n!PnVzj{JFVLlrd=l|HV-qVin=ld~p<I5}5kjUMMeazF)
z9p^~bz&wGgXNe7+PEU7q&YhJ1$}=ke<JZwy?c9^S4o8pN!CCEJB|IPWba74}Y@xBG
zvenfj-|LlR3p>M_9Mv|aTzIMQCQ(CT$A(u6dk|HoOhmaxrGJgyXggwT1S?s`ZD%8W
zq4k*a!e@LFaezEeCeQ<XD;}+QgvP=`X{6Lx8B9G6`76Z2gkd{TFRGcTS%?&J*Swao
z6Fc$G$UFFUv`J|qH$?sAfkJ7|ZP#GeAD-*p%Y0S-4{oUYm}{c@Z_n4h2!Fg#R!H|9
z6*l0VJig4W(l4-vvMs6}QBIiPP4OfN-y54nKV{FmZoSFPYVA7g+?c&7v#tww_sCwK
zil@#@9hbU14dpD6s-aQ7<vH0olYEb;aYjqnu&Ch?b?q&zRim!OtuNKO#M#)_2^Gq%
zPG}mDZABs9*g5-Gwt1mRAurHtZmfS5R+at<?SiM52f`%&u5X>V0bR!16HAFjl-1Cm
z8AzNUt{Y!iaocOtDZ^FPK|LaxFkV9yrWt-f`APKod;5Fx*Z7NkB`(Ev)79GjgZHL4
z-_yuF#(l>7J-65UwRfp6nZKYg^3P&ZrM_Sj`Y2<h<9Xe&BPFUuc%x5~8Qui$I0{7x
z7IUbbslg4%*_Zu<YlT{QC+631K5(?lz5AxutJj%Ja;N56T{VQW!d!PwPWils;!mbk
z*7-Kte3xCz{%LYY>`&}kT8>*DbuWB#<o4)E(UH+5;;P3-M07I!#eQz8ZW_)sWcQi{
z(`ek`jq%1x-(hE!C6Y%OtAz7SJXQQl@g`&pRtpUxBIx_Xuc#LGCuKL(VsohF<VgIg
zTu(fqjH0GeujFam40nC*rmsK0n}5wO@%`fJ>?q^pJ#XMXx|turZ{d1#`}|9Uu0kvM
zeWkJTJMv3Iq(*X6X|S@394YL|eQaJH{#B?cOp?|*a)o5nnQ&WkLOziroo90<=O0wk
z$m_mx?ilBjobB2B-hP!;&%KWu>)htr>Dlet>qvGE^-Uvkj0+7_s4vKd<XPi_(512G
z;?tt#&}`d+aBu9>c)8@<l21w=h?;F{XBlOWi>Ms&z&6dY-0&?Kg$(ld*bTZZHCDRi
z`^~qNf8?v|A1RGcb_=Vy(cDa-rZQhSrX-<-7>-9{AESZzcKi_D0Uv;E#BZT<(p0g9
zd;#WhS^p_+o^Jy`-oFUieV0q|u%1hvcE0ES95K^BpLck=`D%z2<+jp1$u0I*epkMb
zPs2KHvT{M1YrN?^;JRtK7TzrMxUIFXop+G3hb+ssvQM+D<s0PR$aQe#4VR2oeui^=
zHkErn&+UBe*&;RMlJh#}?sL`fCi^6AjFgI3r+Z`X$$3}{_FHSWFf4LK*mog2Ob<h5
zMUPGVIL;8&JM2Yht?(&f8^Ui#FN}&0iwy}i-8c5M#2W?j3ub{Kl$<Pf;4}TxWUIVM
zFbPeC27Ej36Yn{3wtPgI>=*d?(ouOhJeyoos^KHCaOIeM0KI?}{<|m=JwcC@cJfeB
z61T(scf23t7x5i^jkq_SCEQqkoiCM}>2-Q*a+|#Uef@-Xq8+X!Ck0uypr^bfw3KlC
z6LJySC|4H`NlQ@=REvA$&bCouhwR76&F<6g8{{P8Nz0ekr*wNh(Rqt|iFdU9VZ6v!
z%N^ou3Zra=KhM|HciqS4FLRFf-1g1zZ}Kk{(v_?71;Of1!hbZJv`(}z#^r2jX0xeA
zXzjRf<K9N~4dG0W?IlZGF0rS?{>ap@#<qv1<<?>0-6N*i^Vln76S@p(AV%QLh)^O`
z+U8@q7+<J=KR?meho9%$=dSGx^K9^+kQNBFeZ#%Iyj6UCg~`ecEK6A^F8BMyN~kq{
z7~6ymz_L*ydLWNRmoX1|p~T4Bqyyqkz7B5{5~WBf*`Mp3!?}G){$9dDK2kU?T!rVe
zQNmcIHgO+4kn+VW`3)99AHr7>NNFb1@+cUG_qcE>Dx{2gp80~Ta1+o~%4U3Jx^JkA
z(z)Kg2J$HOvN;q#;$H3O<tF{<Vs*jKSLSBq&B=Y`8qYiYmH6%6j^2j+55j)FjxwG2
zh3r9?;9JuHW|L{UdA>a~yn{X7VzE34dl<7n=5UlL?5zD*`0DVI5o;o@Msy8bZkl6Q
zW-b#l-_na+L(V1{d^p+xPeRktDlyL|a*2Ej{~_1kGoPEonY@GDcK1l{9^YZ#SNupH
z!JXxP_RSM6NzbJj!a08rakcUpd}*tR&MBXvL0Db<0B$Ef!8c-mp=NkP;x)bw>xmMS
z+6n_-4R)Z?7?P9aZ=|IXFOHUM*ar9pby{Y`%JKs28>OF84Rx1m$sa1~#3W^e_*xk+
zua+CR-;=wVY8opVGNirySz!^p;axHONH50v^NsoU6dzl`cz~Gh%5r|l50&aGP33L;
zN^iEyocqxGwfwpGId{VKr?Wio=c9yC;#uVnY^>B)TC0qsW*c@JPnr6dT}FqIH%azg
zQSQk4p`)x$TjTI8ktZXUMLNQ(+0#u+%@eG6=uX=Y#yeCxIfht(m%=NO+c8X<>TBma
z#kb{4@b7bB+<o_T$3({{cT+B%i}f~jhr7RUAMg(5tBNbcNkW*vn$T1pidMsy$U4|p
z@Fnm8{*tsYhZ!RsO4;da%o_tT*bHB=o0#^@RAw|=$FP;1!&Ik!B-@b_$kJ3TA`Z{P
zrlB;sseDf=E$@QwM1PAL1X_U4&Egp8XJM7p&U=NQh-?ZaHJ6ur<Nf`lb>i=yb9fK)
zRD3G_8P|r3ktO3uJe$nIcMC_|jF?Z@*`<aZ{;m0OuKscbl&Tn&{XUZ`#F?F+%8$ca
zNrbCYemiFu{wuMew1&@xXHS>!pg0w!6YtR;{37uMJK0dvaDZuR4z<R?b-IhGjb)6r
zgRP%^m2I{y*OF}RV9K-XvtJJBXkNl#bON)9F)@Dn1o1Iy3fj&|Riurg!Vl&4c`A9v
zx}Un+xk=BTZojjhQ*sP|r;tqVB5x(GDL0-=<#2zhSV!J3zb}U<*U)bKJA$Fi^dp*P
zIx|V^8pC*FbJI)HR+H7d!xU}0Zn^+pS^hA;Z?YTH+3p4#toY3|-ehl4Ul60nP1Fiz
zEgeT)#M@xy(HRs9<8>HOUA`=TDTgUF<ZQG}sw-uPyOs0uX>?clLhL4-7Jio(`i$}!
ze3{`9<3byFud^}QOMXOEpu>nvUjw)z%s@Aod2E<8+tJ3E!Os*wQ=;Y0BIA1IsOV_n
z?uadyPjlZmmU+5)clrCvZ55BWKq)O8_tznQro+iH#0S&^<^ubTVFFW&O|r}}T{k4K
z<t@X^tt|tL4-HQY8`ulXb>aZojS<Lv)I%99HI}!E$AlmGNj$^%@W#5`?#}KNF0*^Q
zE8m&!<{h0K+w+e*DtS212Cv=Q$Zhrf;@#>SCm5vR;&52)dMM6PaQrdW1K)%{z;EFs
zxr_>-9q{$DJpC>0rDB<iOiN}3bBta}AE1r&3_2OYH2m=HyN42}FKH*!h#krHU}n>o
z=`Ktw_AI-IUC+#7&QkHrC3t&S!y1?$sj^HrdN1<>-JV`VPX+B0=*o0&CWWfTj9?d#
z<(O$Cf_C>~o>4c67!+k-sgg_!>OK5CX*TsH#?Xi13-@Rug1CuQpiaSa*#_}1=^7e<
zI!oPsKY2)gz5f_biqCw9Tq*7??xyZMUf}w{(|J4heOG5+Gv6KlsL#ih@jvi=Bvh9^
zm2OL$lq9(__BCpcy-$$D6QU}45$_M{qm7BZutu7VA16|=viLKiGO>-QLrlcY*h16@
zO~5{bFYd?T>o_i*6n2S>bXD9c*!kJ;O?;?-qCbs)?W@i8@uqoK`99@W@@IXAxF_BO
z{v)`%MGF`Fr~JPOMkz|(tt6mtmHSFGRsoMEGKdajEH#RHLxt0G>60|aaLiI>E4z(}
zWzR8n*-2~#!x;7xb^vo3Ji34lHS}i@`+{k}_F`pb7?aFIG85Pp&<}m#UED%dVNTEs
znI=ppW(S=|_N8x73jLhvz|<o?Ws@k1scRTcEhcNxjmS#$M(C;M)OF%Hxs6;(2(UU_
znYji%^FIC|@kD7s?7_ncH_;RSTrtWMr6grD_^qNeU1%jd<T3vZ{}TTZezPysTbFy}
z-R=FuJCbvG{GOGbot{YVQtqaAJhzGa2=4WwuN!~Ze_QA(HWRmrU8EzDPtKDsDVxw~
z#RE|w%P|vf!0(`gSX-<q#7vCD9|2}BX2xsaZ6TJUEcO^8Sq7k|%0A_+60amGW#y65
z-_i-GjuZmx1P6r6!fIFx+AQ7>?g$yeEEwC%#qY$k;#V+}_efWz(Xt{}Q)Vm6l+qCY
zvKjjxs}B(m-{AF$i^OqqB(<EnNsXq4(03tP;3adODa$TlKV~<vwG40BH*7Y0hW)^>
z#8BPvC!5EvgVnlJSX=JFrn4{DPnnkNm(1_XLFObwF%{W~^k0;XK0_Mm=Cqq$Lye(c
zlaEM_d`b)>X5jaU2lynS1~G(qACJQ!Dg*V#-VlEf3&}EgdF(OVC$B^M-opC(3HeX?
zh15|>5%YyKVY84TRFxhJoH$4<CtUN-;lJf)`R@rEgsHH4-;H<q_V5q+HvT&>7vlVn
z`J?`9{|<j!!R>D-v<1)nAsiOFiEAZBYAV%|K9JI-H0f8Vx7@p6r669J3{fvfmGg?E
zw1QY48vJt#+lWoZzQtMsycIi#{ej)a;vi-r4@-g79R^l;;xP|IoYcfGK+Mk_h-R6J
zj-iqe5v0Hx;68|lXa~_M<spWrDm+nD!Op|644%$Q;N9>QaIJZczr<@39-<D}iabOn
zQ6;I_)In+~{Utq*enQt~!eG2x*vk-+QHzabr!ni8w(L#zH&}b=#;#%hq$@E!n61o0
zW-e2eIZIcdk5Cn;D`Y)#3$cZmPJBQPC5_|`;vxPA)&+Be4UzaTyd%CDOGOwu1YhpY
zC|@fbl+sGNoFTj9OY(B~jz3UxiRIuqg@To;hf)n`me@i}6u%L7i0i~IqDMF^j1=Yx
zhlJ;_GSycIhxnBDgl0m5z(Q^ly#AI#KVh)YQiv1g3d@9b!bV}Yuvf?wBE&Z0hvF>p
zG^|36k{U}%QVVIJv`e}pCChE(fpSl|g4|YqEHg@5rMc1oo}BDTOT~;T!rGQssfmU_
zELKYhe^11A!Tl-%`wpJD2qKnviT{NEh_Av^@lc`%@eAQ1lF2Z#D_qHT5qpTbWM?uN
zo+37r$H?JiU-C1uHu)*Jh<r#UQl+Wi$#2Nj;CF_~AUjessd`j(SRq2xDkw|j7^*V0
zmfA%>rTx@y>Lj&>T0;Fq?WFwFb!sGBGjieE(kd9+17UQJA&-#Z)FbftL~;dbfNu$t
zi3>zG;x4`rENg@x#O}hZehRbrV~A6Fjbb29W)r-Je5`CzzEUK4zI;`_Dc6%j<VtcR
zM9$2X#z`^otQa9Zf|aDJq8pCe;uP_<uuC{El!5iaO5#+ptvF6NC^Qvoh*yPILP_YI
z-vkNbe>RKN#Ma_Mv5uG}d;;rGm&M;;P3k#Z3;q=U5*2Zh)JiHPjg>A*r=?EtW>+3O
zUqxOkQ}F(=MYhAb-~w2E+oTv#M~Kea1<`!3U<_mMUa%Znh&{!sz|;IZ@cBzfF+>|8
z2SOaz6ZeU~h%)41av-^pr~;ndPJRoA5B%Mg{F6+EFRkB@<;Y&-RdN@M_7LhE*@ApX
zM3XzowPY7^15goYSV7()+dyPPHL?_}7>^-mkb}v_WHWLh^j~-KJJLgtWJhuU*_<3l
zjwct8H;KbU7vTM#I6~ZnJ5^btD)_zuaRa{sv1oO18{PsRh)XbgXF@z&DTt)23^Vrv
zT8wJLEWnT#-kHzCYG^rFUwtXN<%{wS`M5kt9u1?iw_HlTEu~0V(ne{wv{zan&4;T|
z2Y8C^BsGU?$S0B_CP1kNTw&`=gQd@<_AsW8O9!N1q(@SHh=GcLC)O3<(~<IK`L28t
zG>0l?h;-U5|1OtRx+!y&UU04iPo9j@UzrDKnX*IKt6YJ5;{zoOR(k88_7JIe5u*0)
zplaA4xQ;A^w~5_wZCVQvbq8TIO~qbgBk>p<$M0ducpJPS%!p_3J~9xlGC$)aQ5SsD
zgJ?=ngo&6&q!71>h0tfi0iH$(L>!q*%pm3xM~Sn<M&c?^R}e#quZaW18e%aqi};f0
z3DT{JCPYJ6DUT-H(ANosg~-5d#8vzttT5cikK*g_>G)uL1k^YJZ;8j_Y1kbs1A7c<
z7d8dr4?n<qfbDHy_WB^g@*qSK9)a<{5w4C6VLiJ&s)blYpmgP1<pYQ)JE<&GHo%>B
zs6r?_ta4XVN-3dmJ;;aq!E@*%T*(0sPM3d@f00kgX|hRq3hRDf%2T0qQGO|t%3b-6
z`~tq+*x)|NLtYK8CGC_6%4&#k{92g<eZEs!5AS}ZVJ3#a_lQ`CIg9~ohM@1^x;+oA
z1}|TPyX4;xwW>x<k`PB(9pX5LLTkoBROJuY9&9Ie82cN$3M1<oq!8!<SdhkYuvE;g
zuE4-mz9jSshI_CuydK^Fm$6X5hT>J=8vF@f2J%|aTRreT@P6_!U?$_E@b(Zh`8`1M
zU<Gp_-UaUr@JyI(o$v_|X&Hi7!dt=fKr}!i#5!KU9)g!&VwWMZawfc4eUHt@7Qi)f
z48(<21D|)onnG-9c^LC1@Vf%{w6}0yx(Sh`cOeG$7qlPkK`Wun!_eojx;_q$(J<r7
zp^~U9MDmqFVc;9T;!{L82;_iw);kdGm#SncDR6$QTvyI1Pob2iyi&5^j_p?%m{GjK
zD^9qJ#=shasv{OvMrGhSUJfN76SASw5Gz;>%E<-C2N3<)84X3Bz*rxFra+9~Hi!~D
zhJJ-~0;NNgumpCiQPdbj+tz`5Ya@t1?1pv3hQU2RjdvXdGhjBfeI2$Pjs@6aYy<dX
zGu*pZ0=y88#Q?7dzl??m*~!>vaH!X@kuY}#VSOQu#KuBI>oBYzM9mI`V=OiV;7Q=k
zZ{Y4f7j%9Dv8B7Q1K5us`5Sfw;62ze7~9LR)!07l5Vj4VMZh-+dSNz<kOA075J}n=
zdmk*W0X<-cKCnPiAfop*%;cMJJcNF@24n0Dq(jgXyP-e!LNxFpICjA(+z*k+hXK12
z{BR6iK<6M{b}!lvQNm|H>UZGS2h`ueTfd^MFqhVW=YB>%qMy(Pv;r+eYoKqJp(SW7
zJeMqlF+CSBi_vo6{0X=gqb;C$FWLd}=L_PXuc2oUPx}z1K%NCxd^O(L0P8;-vV#2~
z5W8FsVvh;PE5p~VaG1+#)NLq072&=X3pDlWYK3Q_WZ(>ksBII(lb3=c0a7GTqJggr
zL@*~pFIEBRWUMB{NY}*bL7%<{zG?|$wk7m#b2!?==xqt(_XDgQ<SpUa(G+5?8(=LU
zsWMezK2`_cRtKmo)Lj~CQX|8o!IDU@)(%f+Cb-g@zz+4N#@7=NrS6CLawjA&JWc0A
z6#8q3#?F9omI>|t8<OgQhY%&L_R&p<8NUH>#<!rKt^srt`rrz>4w#!z{<9#W{1JSs
zdI-_wFG0&|csEJ~UFiV5fpR9?HFMy|1b^m0oU$63?*P0LqT4Z;k?Qh=id7H=(d2%J
z@24TkTaDLO_XAK6wPY^vqB?Rc;L!+(4i1A{jna<<KgK{jcodvf-<N=t03Ij_rBVRJ
zL5hdGB+%o)<0_=aj>i;~RVWfPsj_MwRiN7reXH&dP`zgb3sfuASbq~(qDJ{EFe22Q
zQtF-p3G7vOl&E_R<btSuH~3V5_NbBn9+<bzg1CPVTtU^y{~YjkZb4H0p~n5Id3u4L
z(%_s9BTg;71`nnclvIyq09K{G1t=ZP+RlL-(3k}&A9g*ceW})@?lSN|->N;V?vPM(
z5$scUPN<$%z-D#Vi@IY$ZHc-kL*2hX6|{(ic4~VhRKL^Ubt9Y&kQ<==hJsO~_JN8w
zftS?zp!SWrkHQXfzzSv6Yqo-1#aRnrb@zfgb8H1ZR7<7;U8)|{uZI8juUfyl-$AQS
z9S2kaP3;AB_lUM@MB6{2_JWF2cblkE+Wr&u8d<P$0#52KA9W9#y5mR9;q!2TH`JD^
zSUss&Evfq?0=r8%=tD1zOf7d8>@M*^N%buM-$~`ylUPuPwo64Ws9m*1wL`U}AcI_m
z0*87{5ByY$ic!aZa8lc)CbiAMdvWw#IXX_uRlfuvt)zNS^_-e?4{EpoR^?Fl<>*{$
zS(6H!HK?$py01o))A_aD(KM-+sQjw+YCE(xXclTXt##UNA>DG#e(kKaT$c~{MboC2
z)sp(t%W57d7k~vc&BGc+<JMt~CvaAKBlxHGpt=u9?@P6;^<;pnI2m0cm^Z+uUP&~L
zfMvm)dV91w^pcjV5&<u0{#7ZOM}v7ajDFT~t@XMlt#*ChXtn8QZA|L?x)xo(UT+}R
z`%2~1xOHBgqL+*3P^lV4m)3L55?xy3(`dSNI!(1*hqYYSq2@(tI;YN~my71QM*{LH
zq}~yV)2VW+8Uox}NiVB(^{077ty}k@#-~|RyiK}}!c@(Y!Z_WRx^|7HXv;J!_1${!
zQZ+3aUe&K3s+`sW-6EA1fCCab6rgH$>TM2KtzxtkT&pS{;L){ddC{}3vnXCIt3P%A
zX>f7YYEnh(3bd&(rEp2}U~%;Z+O1Jled_OB-ohH+rE7gv7#}PXfP=Mb*x++v4SFe9
zM#Ba4={x~U08(lHUc1Vp%e-5YuD@trm{XT8$`vT9{3_SK=M2c`&_DHHz;2cEpEL(^
z6~|K?POtet&x5rD;9xJQ*x<6NTPqhwDOz(;$)a>kS8-hbDXw^p!Pvr@f?KJ#K<5b1
zg601`TvSfiS6uG~&r}swdA0Mq^#t_jn%=Fc@F>%?shWai1DXq}0&3CpF88~bV3~hS
z4RGpwdS2MB09KO-w5}*cZ@<bHXhSepFdV2qpfP~^*HExTK&K7`Q&l}h%fY&JnwG2c
zS=aI|m(HtG)UwX4wpYh%dC_uVs#el8Xq>_P!EjNzf1(w|YvrO?Ra-EwXq$@G7F^ck
zf+d1sjq_cMt|9nZ8Ng}Qs8RvR;y87BfJ@g=xIM*T1Nw@S4oJKUX`>+^uW<zXPnY>;
z%{oPY+S1FqWU$VF=ECDi$7{KI)-l0QVU2nzz!|{kI9-!gn~p8c!T_hPMePw?B7iF%
zQaN=Ey2bCx7R3}TYr2cWYT9+pYOZ05<JGAuO*;y6X_CS1C`=F3qG?h&12Q_Vo(JnG
z48P0qu9jelcllLnaebvp73EhswH_-<3DgpxsF0>xJ*%a{w0G+W&~<(9VsuL3Jkai7
zd|_#|R6Jj>Wx?_qtjQI|2ijE_A82QwCfy1>4`?katCtIN>ezR4jqjhh0~TnK!F&ad
z0DlB1!PIyCp;5I~>r%QmgR#MIfG<!>QCxB5;_C|L)NR%CV44d5YkW~^(el66rg8sM
zJO7<7)iz!B-}4r=<eytslrvbjUSBXzFkRECW3*ADmx5(<Sl6d<s?sXAdgv6LQ`7RV
zbZIiWye_A=Q<p572l#>^olC=Meh%OQkV-FH(#nCJ(X|$BkB%>ztJ*YP%@^<L50+55
zgX`BM|FgP^a%p;kvH!U?^_e7CQnOLV=y|Z^0XWcx;Fc6_m(Cr~RG3%g)_yvtn(H>H
zQUS@ru+Fba>u}M$sC<C_Pcp%^Y25nLNid~2>B6{RZdFfVs;)U;&;K7tv!bXiT3xzD
zDz$iye{ZF(BVc{;ngV<QC?NAcfwX!G_q~Qw&;L}*zp}14+CS;}CyO=yfF1Ah6|P15
z7Nw4M?Td>x8@172nDd|8RDACRTOJG-&mE|%Fz>tE8oel2a65x(!LZ7s)>9Z)xD>1>
z0P8$ju9yBNngTlCg#vnYntm11%kSPB0u&tz&ULT8t2vnK{~rEN^cBaW+f#gv?^55z
z6qO9-&}&rZr2f-a%+;@bff24lf&U6r^#<z-u1C|WYcJaCx|}NgZhr;q3Bam#dMN<u
zm_Q9hX<8itJ`H-;ZXKiNx<uh};ga5tf6`f4M`78*C7ts>&kM^JZMCjP_u9Xs>->7I
zjRn1=d-tEU1UTP?3QK7g6~?_QQy8PSLd9y50eQ{R;?LT5Ts0m*iviHT{;MVR|0VUP
OiQ=XI7yGY0@P7cz=1kiF

literal 0
HcmV?d00001


From 9fe265ea6489fe3c9cba72dc94b22463d96829a0 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Mon, 5 May 2025 13:41:30 -0400
Subject: [PATCH 65/97] [WIP] AWS Nova Sonic service - implement ability to
 persist and load conversations

---
 .../20e-persistent-context-aws-nova-sonic.py  | 256 ++++++++++++++++++
 examples/foundational/39-aws-nova-sonic.py    |   2 +-
 src/pipecat/services/aws_nova_sonic/aws.py    | 102 +++++--
 .../services/aws_nova_sonic/context.py        |  29 +-
 4 files changed, 350 insertions(+), 39 deletions(-)
 create mode 100644 examples/foundational/20e-persistent-context-aws-nova-sonic.py

diff --git a/examples/foundational/20e-persistent-context-aws-nova-sonic.py b/examples/foundational/20e-persistent-context-aws-nova-sonic.py
new file mode 100644
index 000000000..8a95f54b9
--- /dev/null
+++ b/examples/foundational/20e-persistent-context-aws-nova-sonic.py
@@ -0,0 +1,256 @@
+#
+# Copyright (c) 2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import asyncio
+import glob
+import json
+import os
+from datetime import datetime
+
+from dotenv import load_dotenv
+from loguru import logger
+
+from pipecat.adapters.schemas.function_schema import FunctionSchema
+from pipecat.adapters.schemas.tools_schema import ToolsSchema
+from pipecat.audio.vad.silero import SileroVADAnalyzer
+from pipecat.audio.vad.vad_analyzer import VADParams
+from pipecat.pipeline.pipeline import Pipeline
+from pipecat.pipeline.runner import PipelineRunner
+from pipecat.pipeline.task import PipelineParams, PipelineTask
+from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
+from pipecat.services.aws_nova_sonic.aws import AWSNovaSonicLLMService
+from pipecat.transports.base_transport import TransportParams
+from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
+from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
+
+load_dotenv(override=True)
+
+BASE_FILENAME = "/tmp/pipecat_conversation_"
+
+
+async def fetch_weather_from_api(function_name, tool_call_id, args, llm, context, result_callback):
+    temperature = 75 if args["format"] == "fahrenheit" else 24
+    await result_callback(
+        {
+            "conditions": "nice",
+            "temperature": temperature,
+            "format": args["format"],
+            "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
+        }
+    )
+
+
+async def get_saved_conversation_filenames(
+    function_name, tool_call_id, args, llm, context, result_callback
+):
+    # Construct the full pattern including the BASE_FILENAME
+    full_pattern = f"{BASE_FILENAME}*.json"
+
+    # Use glob to find all matching files
+    matching_files = glob.glob(full_pattern)
+    logger.debug(f"matching files: {matching_files}")
+
+    await result_callback({"filenames": matching_files})
+
+
+# async def get_saved_conversation_filenames(
+#     function_name, tool_call_id, args, llm, context, result_callback
+# ):
+#     pattern = re.compile(re.escape(BASE_FILENAME) + "\\d{8}_\\d{6}\\.json$")
+#     matching_files = []
+
+#     for filename in os.listdir("."):
+#         if pattern.match(filename):
+#             matching_files.append(filename)
+
+#     await result_callback({"filenames": matching_files})
+
+
+async def save_conversation(function_name, tool_call_id, args, llm, context, result_callback):
+    timestamp = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
+    filename = f"{BASE_FILENAME}{timestamp}.json"
+    logger.debug(
+        f"writing conversation to {filename}\n{json.dumps(context.get_messages_for_persistent_storage(), indent=4)}"
+    )
+    try:
+        with open(filename, "w") as file:
+            messages = context.get_messages_for_persistent_storage()
+            # remove the last message, which is the instruction we just gave to save the conversation
+            messages.pop()
+            json.dump(messages, file, indent=2)
+        await result_callback({"success": True})
+    except Exception as e:
+        await result_callback({"success": False, "error": str(e)})
+
+
+async def load_conversation(function_name, tool_call_id, args, llm, context, result_callback):
+    async def _reset():
+        filename = args["filename"]
+        logger.debug(f"loading conversation from {filename}")
+        try:
+            with open(filename, "r") as file:
+                messages = json.load(file)
+                messages.append(
+                    {
+                        "role": "user",
+                        "content": f"{AWSNovaSonicLLMService.AWAIT_TRIGGER_ASSISTANT_RESPONSE_INSTRUCTION}",
+                    }
+                )
+                context.set_messages(messages)
+                await llm.reset_conversation()
+                await llm.trigger_assistant_response()
+        except Exception as e:
+            await result_callback({"success": False, "error": str(e)})
+
+    asyncio.create_task(_reset())
+
+
+get_current_weather_tool = FunctionSchema(
+    name="get_current_weather",
+    description="Get the current weather",
+    properties={
+        "location": {
+            "type": "string",
+            "description": "The city and state, e.g. San Francisco, CA",
+        },
+        "format": {
+            "type": "string",
+            "enum": ["celsius", "fahrenheit"],
+            "description": "The temperature unit to use. Infer this from the user's location.",
+        },
+    },
+    required=["location", "format"],
+)
+
+save_conversation_tool = FunctionSchema(
+    name="save_conversation",
+    description="Save the current conversation. Use this function to persist the current conversation to external storage.",
+    properties={},
+    required=[],
+)
+
+get_saved_conversation_filenames_tool = FunctionSchema(
+    name="get_saved_conversation_filenames",
+    description="Get a list of saved conversation histories. Returns a list of filenames. Each filename includes a date and timestamp. Each file is conversation history that can be loaded into this session.",
+    properties={},
+    required=[],
+)
+
+load_conversation_tool = FunctionSchema(
+    name="load_conversation",
+    description="Load a conversation history. Use this function to load a conversation history into the current session.",
+    properties={
+        "filename": {
+            "type": "string",
+            "description": "The filename of the conversation history to load.",
+        }
+    },
+    required=["filename"],
+)
+
+tools = ToolsSchema(
+    standard_tools=[
+        get_current_weather_tool,
+        save_conversation_tool,
+        get_saved_conversation_filenames_tool,
+        load_conversation_tool,
+    ]
+)
+
+
+async def run_bot(webrtc_connection: SmallWebRTCConnection):
+    logger.info(f"Starting bot")
+
+    transport = SmallWebRTCTransport(
+        webrtc_connection=webrtc_connection,
+        params=TransportParams(
+            audio_in_enabled=True,
+            audio_out_enabled=True,
+            vad_enabled=True,
+            vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.8)),
+            vad_audio_passthrough=True,
+        ),
+    )
+
+    system_instruction = (
+        "You are a friendly assistant. The user and you will engage in a spoken dialog exchanging "
+        "the transcripts of a natural real-time conversation. Keep your responses short, generally "
+        "two or three sentences for chatty scenarios. "
+        f"{AWSNovaSonicLLMService.AWAIT_TRIGGER_ASSISTANT_RESPONSE_INSTRUCTION}"
+    )
+
+    llm = AWSNovaSonicLLMService(
+        secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
+        access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
+        region=os.getenv("AWS_REGION"),
+        voice_id="tiffany",  # matthew, tiffany, amy
+        # you could choose to pass instruction here rather than via context
+        # system_instruction=system_instruction,
+        # you could choose to pass tools here rather than via context
+        # tools=tools
+    )
+
+    llm.register_function("get_current_weather", fetch_weather_from_api)
+    llm.register_function("save_conversation", save_conversation)
+    llm.register_function("get_saved_conversation_filenames", get_saved_conversation_filenames)
+    llm.register_function("load_conversation", load_conversation)
+
+    context = OpenAILLMContext(
+        messages=[
+            {"role": "system", "content": f"{system_instruction}"},
+        ],
+        tools=tools,
+    )
+    context_aggregator = llm.create_context_aggregator(context)
+
+    pipeline = Pipeline(
+        [
+            transport.input(),  # Transport user input
+            context_aggregator.user(),
+            llm,  # LLM
+            transport.output(),  # Transport bot output
+            context_aggregator.assistant(),
+        ]
+    )
+
+    task = PipelineTask(
+        pipeline,
+        params=PipelineParams(
+            allow_interruptions=True,
+            enable_metrics=True,
+            enable_usage_metrics=True,
+            report_only_initial_ttfb=True,
+        ),
+    )
+
+    @transport.event_handler("on_client_connected")
+    async def on_client_connected(transport, client):
+        logger.info(f"Client connected")
+        # Kick off the conversation.
+        await task.queue_frames([context_aggregator.user().get_context_frame()])
+        # HACK: for now, we need this special way of triggering the first assistant response in AWS
+        # Nova Sonic. Note that this trigger requires a special corresponding bit of text in the
+        # system instruction. In the future, simply queueing the context frame should be sufficient.
+        await llm.trigger_assistant_response()
+
+    @transport.event_handler("on_client_disconnected")
+    async def on_client_disconnected(transport, client):
+        logger.info(f"Client disconnected")
+
+    @transport.event_handler("on_client_closed")
+    async def on_client_closed(transport, client):
+        logger.info(f"Client closed connection")
+        await task.cancel()
+
+    runner = PipelineRunner(handle_sigint=False)
+
+    await runner.run(task)
+
+
+if __name__ == "__main__":
+    from run import main
+
+    main()
diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py
index 07670f75a..c80626962 100644
--- a/examples/foundational/39-aws-nova-sonic.py
+++ b/examples/foundational/39-aws-nova-sonic.py
@@ -102,7 +102,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection):
         region=os.getenv("AWS_REGION"),
         voice_id="tiffany",  # matthew, tiffany, amy
         # you could choose to pass instruction here rather than via context
-        # instruction=system_instruction
+        # system_instruction=system_instruction
         # you could choose to pass tools here rather than via context
         # tools=tools
     )
diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index 5b69810f3..50b83d3e0 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -7,6 +7,7 @@
 import asyncio
 import base64
 import json
+import time
 import uuid
 import wave
 from dataclasses import dataclass
@@ -119,7 +120,7 @@ class AWSNovaSonicLLMService(LLMService):
         region: str,
         model: str = "amazon.nova-sonic-v1:0",
         voice_id: str = "matthew",  # matthew, tiffany, amy
-        instruction: Optional[str] = None,
+        system_instruction: Optional[str] = None,
         tools: Optional[ToolsSchema] = None,
         send_transcription_frames: bool = True,
         **kwargs,
@@ -131,7 +132,7 @@ class AWSNovaSonicLLMService(LLMService):
         self._model = model
         self._client: BedrockRuntimeClient = None
         self._voice_id = voice_id
-        self._instruction = instruction
+        self._system_instruction = system_instruction
         self._tools = tools
         self._send_transcription_frames = send_transcription_frames
         self._context: AWSNovaSonicLLMContext = None
@@ -150,6 +151,8 @@ class AWSNovaSonicLLMService(LLMService):
         self._handling_bot_stopped_speaking = False
         self._triggering_assistant_response = False
         self._assistant_response_trigger_audio: bytes = None  # Not cleared on _disconnect()
+        self._disconnecting = False
+        self._connected_time: float = None
 
     #
     # standard AIService frame handling
@@ -174,6 +177,18 @@ class AWSNovaSonicLLMService(LLMService):
         await super().cancel(frame)
         await self._disconnect()
 
+    #
+    # conversation resetting
+    #
+
+    async def reset_conversation(self):
+        logger.debug("Resetting conversation")
+        await self._disconnect()
+        await self._start_connecting()
+        # Use existing context
+        self._context_available = True
+        await self._finish_connecting_if_context_available()
+
     #
     # frame processing
     #
@@ -207,10 +222,12 @@ class AWSNovaSonicLLMService(LLMService):
 
     async def _handle_context(self, context: OpenAILLMContext):
         # TODO: reset connection if needed (if entirely new context object provided, for instance)
-        print(f"[pk] receive updated context: {context.get_messages_for_initializing_history()}")
+        print(f"[pk] received updated context: {context.get_messages_for_initializing_history()}")
         if not self._context:
             # We got our initial context - try to finish connecting
-            self._context = AWSNovaSonicLLMContext.upgrade_to_nova_sonic(context)
+            self._context = AWSNovaSonicLLMContext.upgrade_to_nova_sonic(
+                context, self._system_instruction
+            )
             self._context_available = True
             await self._finish_connecting_if_context_available()
 
@@ -296,8 +313,8 @@ class AWSNovaSonicLLMService(LLMService):
         # Read context
         history = self._context.get_messages_for_initializing_history()
 
-        # Send prompt start event, specifying tools
-        # Tools from context take priority over tools from __init__()
+        # Send prompt start event, specifying tools.
+        # Tools from context take priority over self._tools.
         tools = (
             self._context.tools
             if self._context.tools
@@ -305,11 +322,14 @@ class AWSNovaSonicLLMService(LLMService):
         )
         await self._send_prompt_start_event(tools)
 
-        # Send system instruction
-        # Instruction from context takes priority over instruction from __init__()
-        instruction = history.instruction if history.instruction else self._instruction
-        if instruction:
-            await self._send_text_event(text=instruction, role=Role.SYSTEM)
+        # Send system instruction.
+        # Instruction from context takes priority over self._system_instruction.
+        # (NOTE: this prioritizing occurred automatically behind the scenes: the context was
+        # initialized with self._system_instruction and then updated itself from its messages when
+        # get_messages_for_initializing_history() was called).
+        # print(f"[pk] connecting, with system instruction: {history.system_instruction}")
+        if history.system_instruction:
+            await self._send_text_event(text=history.system_instruction, role=Role.SYSTEM)
 
         # Send conversation history
         for message in history.messages:
@@ -320,7 +340,7 @@ class AWSNovaSonicLLMService(LLMService):
         # - pass additional message(s)
         # - merge init-passed system instruction + context instruction (latter takes precedence)
         # - merge init-passed tools + context tools (latter takes precedence)
-        await self._send_text_event(text=self._instruction, role=Role.SYSTEM)
+        await self._send_text_event(text=self._system_instruction, role=Role.SYSTEM)
 
         # Start audio input
         await self._send_audio_input_start_event()
@@ -328,31 +348,43 @@ class AWSNovaSonicLLMService(LLMService):
         # Start receiving events
         self._receive_task = self.create_task(self._receive_task_handler())
 
-        # If we need to, send assistant response trigger
+        # Record finished connecting time
+        self._connected_time = time.time()
+
+        # If we need to, send assistant response trigger (depends on self._connected_time)
         if self._triggering_assistant_response:
-            # If the trigger was the first audio chunk sent on this connection it'd be ignored (I'm
-            # guessing the LLM can't quite "hear" the first little bit of audio sent). So send a bit
-            # of leading blank audio first.
-            await self._send_assistant_response_trigger(lead_with_blank_audio=True)
+            await self._send_assistant_response_trigger()
             self._triggering_assistant_response = False
 
     async def _disconnect(self):
         try:
-            # Clean up receive task
-            if self._receive_task:
-                await self.cancel_task(self._receive_task, timeout=1.0)
-                self._receive_task = None
+            # NOTE: see explanation of HACK, below
+            self._disconnecting = True
 
             # Clean up client
             if self._client:
+                print("[pk] Cleaning up client")
                 await self._send_session_end_events()
                 self._client = None
 
             # Clean up stream
             if self._stream:
+                print("[pk] Cleaning up stream")
                 await self._stream.input_stream.close()
                 self._stream = None
 
+            # NOTE: see explanation of HACK, below
+            await asyncio.sleep(1)
+
+            # Clean up receive task
+            # HACK: we should ideally be able to cancel the receive task before stopping the input
+            # stream, above (meaning we wouldn't need self._disconnecting). But for some reason if
+            # we don't close the input stream and wait a second first, we're getting an error a lot
+            # like this one: https://github.com/awslabs/amazon-transcribe-streaming-sdk/issues/61.
+            if self._receive_task:
+                await self.cancel_task(self._receive_task, timeout=1.0)
+                self._receive_task = None
+
             # Reset remaining connection-specific state
             self._prompt_name = None
             self._input_audio_content_name = None
@@ -362,6 +394,8 @@ class AWSNovaSonicLLMService(LLMService):
             self._ready_to_send_context = False
             self._handling_bot_stopped_speaking = False
             self._triggering_assistant_response = False
+            self._disconnecting = False
+            self._connected_time = None
         except Exception as e:
             logger.error(f"{self} error disconnecting: {e}")
 
@@ -619,9 +653,8 @@ class AWSNovaSonicLLMService(LLMService):
     # LLM communication: output events (LLM -> pipecat)
     #
 
-    # Receive the ongoing LLM "completion".
-    # There is generally a single completion per session.
-    # In a completion, a few different kinds of content can be delivered:
+    # Receive events for the session.
+    # A few different kinds of content can be delivered:
     # - Transcription of user audio
     # - Tool use
     # - Text preview of planned response speech before audio delivered
@@ -633,7 +666,7 @@ class AWSNovaSonicLLMService(LLMService):
     # The overall completion is wrapped by "completionStart" and "completionEnd" events.
     async def _receive_task_handler(self):
         try:
-            while self._client:
+            while self._client and not self._disconnecting:
                 output = await self._stream.await_output()
                 result = await output[1].receive()
 
@@ -906,16 +939,25 @@ class AWSNovaSonicLLMService(LLMService):
             await self._send_assistant_response_trigger()
             self._triggering_assistant_response = False
 
-    async def _send_assistant_response_trigger(self, lead_with_blank_audio=False):
+    async def _send_assistant_response_trigger(self):
         # TODO: if/when we make bitrate, etc configurable, avoid hard-coding this
         chunk_size = 640  # equivalent to what we get from InputAudioRawFrame
         chunk_duration = 640 / (
             16000 * 2
         )  # 640 bytes of 16-bit (2-byte) PCM mono audio at 16kHz corresponds to 0.02 seconds
 
-        # Lead with blank audio, if needed
-        if lead_with_blank_audio:
-            blank_audio_duration = 0.5  # much less than this and it doesn't reliably work
+        # Lead with a bit of blank audio, if needed.
+        # It seems like the LLM can't quite "hear" the first little bit of audio sent on a
+        # connection.
+        current_time = time.time()
+        max_blank_audio_duration = 0.5
+        blank_audio_duration = (
+            max_blank_audio_duration - (current_time - self._connected_time)
+            if self._connected_time is not None
+            and (current_time - self._connected_time) < max_blank_audio_duration
+            else None
+        )
+        if blank_audio_duration:
             blank_audio_chunk = b"\x00" * chunk_size
             num_chunks = int(blank_audio_duration / chunk_duration)
             for _ in range(num_chunks):
@@ -925,7 +967,7 @@ class AWSNovaSonicLLMService(LLMService):
         # Send trigger audio
         # NOTE: this audio *will* be transcribed and eventually make it into the context. That's OK:
         # if we ever need to seed this service again with context it would make sense to include it
-        # since the instruction (i.e. the "wait for the trigger" instruction) will be part of the 
+        # since the instruction (i.e. the "wait for the trigger" instruction) will be part of the
         # context as well.
         # print(f"[pk] sending trigger audio! {len(self._assistant_response_trigger_audio)}")
         audio_chunks = [
diff --git a/src/pipecat/services/aws_nova_sonic/context.py b/src/pipecat/services/aws_nova_sonic/context.py
index 3fac65a72..b12061e1e 100644
--- a/src/pipecat/services/aws_nova_sonic/context.py
+++ b/src/pipecat/services/aws_nova_sonic/context.py
@@ -49,7 +49,7 @@ class AWSNovaSonicConversationHistoryMessage:
 
 @dataclass
 class AWSNovaSonicConversationHistory:
-    instruction: str = None
+    system_instruction: str = None
     messages: list[AWSNovaSonicConversationHistoryMessage] = field(default_factory=list)
 
 
@@ -58,18 +58,22 @@ class AWSNovaSonicLLMContext(OpenAILLMContext):
         super().__init__(messages=messages, tools=tools, **kwargs)
         self.__setup_local()
 
-    def __setup_local(self):
+    def __setup_local(self, system_instruction: str = ""):
         self._assistant_text = ""
+        self._system_instruction = system_instruction
 
     @staticmethod
-    def upgrade_to_nova_sonic(obj: OpenAILLMContext) -> "AWSNovaSonicLLMContext":
+    def upgrade_to_nova_sonic(
+        obj: OpenAILLMContext, system_instruction: str
+    ) -> "AWSNovaSonicLLMContext":
         if isinstance(obj, OpenAILLMContext) and not isinstance(obj, AWSNovaSonicLLMContext):
             obj.__class__ = AWSNovaSonicLLMContext
-            obj.__setup_local()
+            obj.__setup_local(system_instruction)
         return obj
 
+    # NOTE: this method has the side-effect of updating _system_instruction from messages
     def get_messages_for_initializing_history(self) -> AWSNovaSonicConversationHistory:
-        history = AWSNovaSonicConversationHistory()
+        history = AWSNovaSonicConversationHistory(system_instruction=self._system_instruction)
 
         # Bail if there are no messages
         if not self.messages:
@@ -82,13 +86,15 @@ class AWSNovaSonicLLMContext(OpenAILLMContext):
             system = messages.pop(0)
             content = system.get("content")
             if isinstance(content, str):
-                history.instruction = content
+                history.system_instruction = content
             elif isinstance(content, list):
-                history.instruction = content[0].get("text")
+                history.system_instruction = content[0].get("text")
+            if history.system_instruction:
+                self._system_instruction = history.system_instruction
 
         # Process remaining messages to fill out conversation history.
         # Nova Sonic supports "user" and "assistant" messages in history.
-        print(f"[pk] standard messages: {messages}")
+        # print(f"[pk] standard messages: {messages}")
         for message in messages:
             history_message = self.from_standard_message(message)
             if history_message:
@@ -96,6 +102,13 @@ class AWSNovaSonicLLMContext(OpenAILLMContext):
 
         return history
 
+    def get_messages_for_persistent_storage(self):
+        messages = super().get_messages_for_persistent_storage()
+        # If we have a system instruction and messages doesn't already contain it, add it
+        if self._system_instruction and not (messages and messages[0].get("role") == "system"):
+            messages.insert(0, {"role": "system", "content": self._system_instruction})
+        return messages
+
     def from_standard_message(self, message) -> AWSNovaSonicConversationHistoryMessage:
         role = message.get("role")
         if message.get("role") == "user" or message.get("role") == "assistant":

From 2b02d08f4c7f03fba5f18706ec600476935a5c50 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Tue, 6 May 2025 09:26:22 -0400
Subject: [PATCH 66/97] [WIP] AWS Nova Sonic service - add comments to examples
 pointing out that us-east-1 is the only supported region so far

---
 examples/foundational/20e-persistent-context-aws-nova-sonic.py | 2 +-
 examples/foundational/39-aws-nova-sonic.py                     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/foundational/20e-persistent-context-aws-nova-sonic.py b/examples/foundational/20e-persistent-context-aws-nova-sonic.py
index 8a95f54b9..731c69c3a 100644
--- a/examples/foundational/20e-persistent-context-aws-nova-sonic.py
+++ b/examples/foundational/20e-persistent-context-aws-nova-sonic.py
@@ -185,7 +185,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection):
     llm = AWSNovaSonicLLMService(
         secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
         access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
-        region=os.getenv("AWS_REGION"),
+        region=os.getenv("AWS_REGION"),  # as of 2025-05-06, us-east-1 is the only supported region
         voice_id="tiffany",  # matthew, tiffany, amy
         # you could choose to pass instruction here rather than via context
         # system_instruction=system_instruction,
diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py
index c80626962..a89796ea6 100644
--- a/examples/foundational/39-aws-nova-sonic.py
+++ b/examples/foundational/39-aws-nova-sonic.py
@@ -99,7 +99,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection):
     llm = AWSNovaSonicLLMService(
         secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
         access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
-        region=os.getenv("AWS_REGION"),
+        region=os.getenv("AWS_REGION"),  # as of 2025-05-06, us-east-1 is the only supported region
         voice_id="tiffany",  # matthew, tiffany, amy
         # you could choose to pass instruction here rather than via context
         # system_instruction=system_instruction

From 467233be046527da8be0bf40ebf5aaea147d8c36 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Tue, 6 May 2025 09:48:50 -0400
Subject: [PATCH 67/97] [WIP] AWS Nova Sonic service - support multi-line
 system prompt

---
 .../foundational/20e-persistent-context-aws-nova-sonic.py    | 4 ++++
 examples/foundational/39-aws-nova-sonic.py                   | 1 -
 src/pipecat/services/aws_nova_sonic/aws.py                   | 5 +++--
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/examples/foundational/20e-persistent-context-aws-nova-sonic.py b/examples/foundational/20e-persistent-context-aws-nova-sonic.py
index 731c69c3a..8ac1508f8 100644
--- a/examples/foundational/20e-persistent-context-aws-nova-sonic.py
+++ b/examples/foundational/20e-persistent-context-aws-nova-sonic.py
@@ -175,6 +175,10 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection):
         ),
     )
 
+    # Specify initial system instruction.
+    # HACK: note that, for now, we need to inject a special bit of text into this instruction to
+    # allow the first assistant response to be programmatically triggered (which happens in the
+    # on_client_connected handler, below)
     system_instruction = (
         "You are a friendly assistant. The user and you will engage in a spoken dialog exchanging "
         "the transcripts of a natural real-time conversation. Keep your responses short, generally "
diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py
index a89796ea6..fe0d07dca 100644
--- a/examples/foundational/39-aws-nova-sonic.py
+++ b/examples/foundational/39-aws-nova-sonic.py
@@ -87,7 +87,6 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection):
     # HACK: note that, for now, we need to inject a special bit of text into this instruction to
     # allow the first assistant response to be programmatically triggered (which happens in the
     # on_client_connected handler, below)
-    # TODO: looks like Nova Sonic can't handle new lines?
     system_instruction = (
         "You are a friendly assistant. The user and you will engage in a spoken dialog exchanging "
         "the transcripts of a natural real-time conversation. Keep your responses short, generally "
diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index 50b83d3e0..e5df20e66 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -498,7 +498,7 @@ class AWSNovaSonicLLMService(LLMService):
         await self._send_client_event(audio_content_start)
 
     async def _send_text_event(self, text: str, role: Role):
-        if not self._stream:
+        if not self._stream or not text:
             return
 
         content_name = str(uuid.uuid4())
@@ -521,13 +521,14 @@ class AWSNovaSonicLLMService(LLMService):
         '''
         await self._send_client_event(text_content_start)
 
+        escaped_text = json.dumps(text) # includes quotes
         text_input = f'''
         {{
             "event": {{
                 "textInput": {{
                     "promptName": "{self._prompt_name}",
                     "contentName": "{content_name}",
-                    "content": "{text}"
+                    "content": {escaped_text}
                 }}
             }}
         }}

From c4d0f91a7fbed227b4c5ae80382a0cfeb9957389 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Tue, 6 May 2025 09:53:12 -0400
Subject: [PATCH 68/97] [WIP] AWS Nova Sonic service - remove some old code
 that was accidentally still there, possibly sending a duplicate system
 instruction

---
 src/pipecat/services/aws_nova_sonic/aws.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index e5df20e66..aa44beb60 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -335,13 +335,6 @@ class AWSNovaSonicLLMService(LLMService):
         for message in history.messages:
             await self._send_text_event(text=message.text, role=message.role)
 
-        # Send initial context (system instruction and conversation history)
-        # TODO: finish implementing
-        # - pass additional message(s)
-        # - merge init-passed system instruction + context instruction (latter takes precedence)
-        # - merge init-passed tools + context tools (latter takes precedence)
-        await self._send_text_event(text=self._system_instruction, role=Role.SYSTEM)
-
         # Start audio input
         await self._send_audio_input_start_event()
 

From d388c057c039531a7f96a081eb69793e6e7f4df0 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Tue, 6 May 2025 11:12:47 -0400
Subject: [PATCH 69/97] [WIP] AWS Nova Sonic service - recover from unwanted
 disconnection due to an error

---
 src/pipecat/services/aws_nova_sonic/aws.py     | 7 +++++++
 src/pipecat/services/aws_nova_sonic/context.py | 2 ++
 2 files changed, 9 insertions(+)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index aa44beb60..aa264155c 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -153,6 +153,7 @@ class AWSNovaSonicLLMService(LLMService):
         self._assistant_response_trigger_audio: bytes = None  # Not cleared on _disconnect()
         self._disconnecting = False
         self._connected_time: float = None
+        self._wants_connection = False
 
     #
     # standard AIService frame handling
@@ -160,6 +161,7 @@ class AWSNovaSonicLLMService(LLMService):
 
     async def start(self, frame: StartFrame):
         await super().start(frame)
+        self._wants_connection = True
         # TODO: maybe connect but don't send history until we get all of our settings?
         # how do we know how long to wait?
         # ah, i think we'll *always* get at least one OpenAILLMContextFrame which kicks things off
@@ -171,10 +173,12 @@ class AWSNovaSonicLLMService(LLMService):
 
     async def stop(self, frame: EndFrame):
         await super().stop(frame)
+        self._wants_connection = False
         await self._disconnect()
 
     async def cancel(self, frame: CancelFrame):
         await super().cancel(frame)
+        self._wants_connection = False
         await self._disconnect()
 
     #
@@ -183,6 +187,7 @@ class AWSNovaSonicLLMService(LLMService):
 
     async def reset_conversation(self):
         logger.debug("Resetting conversation")
+        await self._handle_bot_stopped_speaking()
         await self._disconnect()
         await self._start_connecting()
         # Use existing context
@@ -694,6 +699,8 @@ class AWSNovaSonicLLMService(LLMService):
 
         except Exception as e:
             logger.error(f"{self} error processing responses: {e}")
+            if self._wants_connection:
+                await self.reset_conversation()
 
     async def _handle_completion_start_event(self, event_json):
         # print("[pk] completion start")
diff --git a/src/pipecat/services/aws_nova_sonic/context.py b/src/pipecat/services/aws_nova_sonic/context.py
index b12061e1e..d96c2d1ed 100644
--- a/src/pipecat/services/aws_nova_sonic/context.py
+++ b/src/pipecat/services/aws_nova_sonic/context.py
@@ -143,6 +143,8 @@ class AWSNovaSonicLLMContext(OpenAILLMContext):
         # print(f"[pk] assistant text buffered: {self._assistant_text}")
 
     def flush_aggregated_assistant_text(self):
+        if not self._assistant_text:
+            return
         message = {
             "role": "assistant",
             "content": [{"type": "text", "text": self._assistant_text}],

From 73020be511c64e2026f1ed95730686198929b3df Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Tue, 6 May 2025 12:25:25 -0400
Subject: [PATCH 70/97] [WIP] AWS Nova Sonic service - minor fix: only try to
 read received JSON if we have it

---
 src/pipecat/services/aws_nova_sonic/aws.py | 48 +++++++++++-----------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index aa264155c..c2b56ef74 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -519,7 +519,7 @@ class AWSNovaSonicLLMService(LLMService):
         '''
         await self._send_client_event(text_content_start)
 
-        escaped_text = json.dumps(text) # includes quotes
+        escaped_text = json.dumps(text)  # includes quotes
         text_input = f'''
         {{
             "event": {{
@@ -673,29 +673,29 @@ class AWSNovaSonicLLMService(LLMService):
                     response_data = result.value.bytes_.decode("utf-8")
                     json_data = json.loads(response_data)
 
-                if "event" in json_data:
-                    event_json = json_data["event"]
-                    if "completionStart" in event_json:
-                        # Handle the LLM completion starting
-                        await self._handle_completion_start_event(event_json)
-                    elif "contentStart" in event_json:
-                        # Handle a piece of content starting
-                        await self._handle_content_start_event(event_json)
-                    elif "textOutput" in event_json:
-                        # Handle text output content
-                        await self._handle_text_output_event(event_json)
-                    elif "audioOutput" in event_json:
-                        # Handle audio output content
-                        await self._handle_audio_output_event(event_json)
-                    elif "toolUse" in event_json:
-                        # Handle tool use
-                        await self._handle_tool_use_event(event_json)
-                    elif "contentEnd" in event_json:
-                        # Handle a piece of content ending
-                        await self._handle_content_end_event(event_json)
-                    elif "completionEnd" in event_json:
-                        # Handle the LLM completion ending
-                        await self._handle_completion_end_event(event_json)
+                    if "event" in json_data:
+                        event_json = json_data["event"]
+                        if "completionStart" in event_json:
+                            # Handle the LLM completion starting
+                            await self._handle_completion_start_event(event_json)
+                        elif "contentStart" in event_json:
+                            # Handle a piece of content starting
+                            await self._handle_content_start_event(event_json)
+                        elif "textOutput" in event_json:
+                            # Handle text output content
+                            await self._handle_text_output_event(event_json)
+                        elif "audioOutput" in event_json:
+                            # Handle audio output content
+                            await self._handle_audio_output_event(event_json)
+                        elif "toolUse" in event_json:
+                            # Handle tool use
+                            await self._handle_tool_use_event(event_json)
+                        elif "contentEnd" in event_json:
+                            # Handle a piece of content ending
+                            await self._handle_content_end_event(event_json)
+                        elif "completionEnd" in event_json:
+                            # Handle the LLM completion ending
+                            await self._handle_completion_end_event(event_json)
 
         except Exception as e:
             logger.error(f"{self} error processing responses: {e}")

From 885b2d1d2f3dcfa6e04b1c347848ca331c4722b8 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Tue, 6 May 2025 14:29:36 -0400
Subject: [PATCH 71/97] [WIP] AWS Nova Sonic service - make parameters
 configurable

---
 src/pipecat/services/aws_nova_sonic/aws.py | 73 ++++++++++++++--------
 1 file changed, 47 insertions(+), 26 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index c2b56ef74..c0367fa6b 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -28,6 +28,7 @@ from aws_sdk_bedrock_runtime.models import (
     InvokeModelWithBidirectionalStreamOutput,
 )
 from loguru import logger
+from pydantic import BaseModel, Field
 from smithy_aws_core.credentials_resolvers.static import StaticCredentialsResolver
 from smithy_aws_core.identity import AWSCredentialsIdentity
 from smithy_core.aio.eventstream import DuplexEventStream
@@ -107,6 +108,23 @@ class CurrentContent:
         )
 
 
+class Params(BaseModel):
+    # Audio input
+    input_sample_rate: Optional[int] = Field(default=16000)
+    input_sample_size: Optional[int] = Field(default=16)
+    input_channel_count: Optional[int] = Field(default=1)
+
+    # Audio output
+    output_sample_rate: Optional[int] = Field(default=24000)
+    output_sample_size: Optional[int] = Field(default=16)
+    output_channel_count: Optional[int] = Field(default=1)
+
+    # Inference
+    max_tokens: Optional[int] = Field(default=1024)
+    top_p: Optional[float] = Field(default=0.9)
+    temperature: Optional[float] = Field(default=0.7)
+
+
 class AWSNovaSonicLLMService(LLMService):
     # Override the default adapter to use the AWSNovaSonicLLMAdapter one
     adapter_class = AWSNovaSonicLLMAdapter
@@ -120,6 +138,7 @@ class AWSNovaSonicLLMService(LLMService):
         region: str,
         model: str = "amazon.nova-sonic-v1:0",
         voice_id: str = "matthew",  # matthew, tiffany, amy
+        params: Params = Params(),
         system_instruction: Optional[str] = None,
         tools: Optional[ToolsSchema] = None,
         send_transcription_frames: bool = True,
@@ -132,6 +151,7 @@ class AWSNovaSonicLLMService(LLMService):
         self._model = model
         self._client: BedrockRuntimeClient = None
         self._voice_id = voice_id
+        self._params = params
         self._system_instruction = system_instruction
         self._tools = tools
         self._send_transcription_frames = send_transcription_frames
@@ -419,18 +439,18 @@ class AWSNovaSonicLLMService(LLMService):
 
     # TODO: make params configurable?
     async def _send_session_start_event(self):
-        session_start = """
-        {
-          "event": {
-            "sessionStart": {
-              "inferenceConfiguration": {
-                "maxTokens": 1024,
-                "topP": 0.9,
-                "temperature": 0.7
-              }
-            }
-          }
-        }
+        session_start = f"""
+        {{
+          "event": {{
+            "sessionStart": {{
+              "inferenceConfiguration": {{
+                "maxTokens": {self._params.max_tokens},
+                "topP": {self._params.top_p},
+                "temperature": {self._params.temperature}
+              }}
+            }}
+          }}
+        }}
         """
         await self._send_client_event(session_start)
 
@@ -458,9 +478,9 @@ class AWSNovaSonicLLMService(LLMService):
               }},
               "audioOutputConfiguration": {{
                 "mediaType": "audio/lpcm",
-                "sampleRateHertz": 24000,
-                "sampleSizeBits": 16,
-                "channelCount": 1,
+                "sampleRateHertz": {self._params.output_sample_rate},
+                "sampleSizeBits": {self._params.output_sample_size},
+                "channelCount": {self._params.output_channel_count},
                 "voiceId": "{self._voice_id}",
                 "encoding": "base64",
                 "audioType": "SPEECH"
@@ -483,9 +503,9 @@ class AWSNovaSonicLLMService(LLMService):
                     "role": "USER",
                     "audioInputConfiguration": {{
                         "mediaType": "audio/lpcm",
-                        "sampleRateHertz": 16000,
-                        "sampleSizeBits": 16,
-                        "channelCount": 1,
+                        "sampleRateHertz": {self._params.input_sample_rate},
+                        "sampleSizeBits": {self._params.input_sample_size},
+                        "channelCount": {self._params.input_channel_count},
                         "audioType": "SPEECH",
                         "encoding": "base64"
                     }}
@@ -762,11 +782,10 @@ class AWSNovaSonicLLMService(LLMService):
 
         # Push audio frame
         audio = base64.b64decode(audio_content)
-        # TODO: make sample rate + channels (used in multiple places) consts
         frame = TTSAudioRawFrame(
             audio=audio,
-            sample_rate=24000,
-            num_channels=1,
+            sample_rate=self._params.output_sample_rate,
+            num_channels=self._params.output_channel_count,
         )
         await self.push_frame(frame)
 
@@ -941,11 +960,13 @@ class AWSNovaSonicLLMService(LLMService):
             self._triggering_assistant_response = False
 
     async def _send_assistant_response_trigger(self):
-        # TODO: if/when we make bitrate, etc configurable, avoid hard-coding this
-        chunk_size = 640  # equivalent to what we get from InputAudioRawFrame
-        chunk_duration = 640 / (
-            16000 * 2
-        )  # 640 bytes of 16-bit (2-byte) PCM mono audio at 16kHz corresponds to 0.02 seconds
+        chunk_duration = 0.02  # what we might get from InputAudioRawFrame
+        chunk_size = int(
+            chunk_duration
+            * self._params.input_sample_rate
+            * self._params.input_channel_count
+            * (self._params.input_sample_size / 8)
+        )  # e.g. 0.02 seconds of 16-bit (2-byte) PCM mono audio at 16kHz is 640 bytes
 
         # Lead with a bit of blank audio, if needed.
         # It seems like the LLM can't quite "hear" the first little bit of audio sent on a

From c7e223e85ae9d33d8194966c6de88a39311a7ef6 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Tue, 6 May 2025 16:06:29 -0400
Subject: [PATCH 72/97] [WIP] AWS Nova Sonic service - remove print statements
 in favor of logger

---
 examples/foundational/39-aws-nova-sonic.py    |  6 --
 src/pipecat/services/aws_nova_sonic/aws.py    | 72 +++++++------------
 .../services/aws_nova_sonic/context.py        |  7 +-
 3 files changed, 29 insertions(+), 56 deletions(-)

diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py
index fe0d07dca..af44cf790 100644
--- a/examples/foundational/39-aws-nova-sonic.py
+++ b/examples/foundational/39-aws-nova-sonic.py
@@ -10,7 +10,6 @@ from datetime import datetime
 from dotenv import load_dotenv
 from loguru import logger
 
-# import logging
 from pipecat.adapters.schemas.function_schema import FunctionSchema
 from pipecat.adapters.schemas.tools_schema import ToolsSchema
 from pipecat.audio.vad.silero import SileroVADAnalyzer
@@ -27,11 +26,6 @@ from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
 # Load environment variables
 load_dotenv(override=True)
 
-# logging.basicConfig(
-#     level=logging.DEBUG,
-#     format='%(asctime)s - %(levelname)s - %(message)s'
-# )
-
 
 async def fetch_weather_from_api(function_name, tool_call_id, args, llm, context, result_callback):
     temperature = 75 if args["format"] == "fahrenheit" else 24
diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index c0367fa6b..1e318f164 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -229,25 +229,10 @@ class AWSNovaSonicLLMService(LLMService):
             await self._handle_bot_stopped_speaking()
         elif isinstance(frame, AWSNovaSonicFunctionCallResultFrame):
             await self._handle_function_call_result(frame)
-        # TODO: do we need to do anything for the below four frame types?
-        elif isinstance(frame, StartInterruptionFrame):
-            # print("[pk] StartInterruptionFrame")
-            pass
-        elif isinstance(frame, UserStartedSpeakingFrame):
-            # print("[pk] UserStartedSpeakingFrame")
-            pass
-        elif isinstance(frame, StopInterruptionFrame):
-            # print("[pk] StopInterruptionFrame")
-            pass
-        elif isinstance(frame, UserStoppedSpeakingFrame):
-            # print("[pk] UserStoppedSpeakingFrame")
-            pass
 
         await self.push_frame(frame, direction)
 
     async def _handle_context(self, context: OpenAILLMContext):
-        # TODO: reset connection if needed (if entirely new context object provided, for instance)
-        print(f"[pk] received updated context: {context.get_messages_for_initializing_history()}")
         if not self._context:
             # We got our initial context - try to finish connecting
             self._context = AWSNovaSonicLLMContext.upgrade_to_nova_sonic(
@@ -303,6 +288,8 @@ class AWSNovaSonicLLMService(LLMService):
 
     async def _start_connecting(self):
         try:
+            logger.info("Connecting...")
+
             if self._client:
                 # Here we assume that if we have a client we are connected or connecting
                 return
@@ -335,6 +322,8 @@ class AWSNovaSonicLLMService(LLMService):
         if not (self._context_available and self._ready_to_send_context):
             return
 
+        logger.info("Finishing connecting (setting up session)...")
+
         # Read context
         history = self._context.get_messages_for_initializing_history()
 
@@ -345,6 +334,7 @@ class AWSNovaSonicLLMService(LLMService):
             if self._context.tools
             else self.get_llm_adapter().from_standard_tools(self._tools)
         )
+        logger.debug(f"Using tools: {tools}")
         await self._send_prompt_start_event(tools)
 
         # Send system instruction.
@@ -352,7 +342,7 @@ class AWSNovaSonicLLMService(LLMService):
         # (NOTE: this prioritizing occurred automatically behind the scenes: the context was
         # initialized with self._system_instruction and then updated itself from its messages when
         # get_messages_for_initializing_history() was called).
-        # print(f"[pk] connecting, with system instruction: {history.system_instruction}")
+        logger.debug(f"Using system instruction: {history.system_instruction}")
         if history.system_instruction:
             await self._send_text_event(text=history.system_instruction, role=Role.SYSTEM)
 
@@ -366,9 +356,11 @@ class AWSNovaSonicLLMService(LLMService):
         # Start receiving events
         self._receive_task = self.create_task(self._receive_task_handler())
 
-        # Record finished connecting time
+        # Record finished connecting time (must be done before sending assistant response trigger)
         self._connected_time = time.time()
 
+        logger.info("Finished connecting")
+
         # If we need to, send assistant response trigger (depends on self._connected_time)
         if self._triggering_assistant_response:
             await self._send_assistant_response_trigger()
@@ -376,18 +368,18 @@ class AWSNovaSonicLLMService(LLMService):
 
     async def _disconnect(self):
         try:
+            logger.info("Disconnecting...")
+
             # NOTE: see explanation of HACK, below
             self._disconnecting = True
 
             # Clean up client
             if self._client:
-                print("[pk] Cleaning up client")
                 await self._send_session_end_events()
                 self._client = None
 
             # Clean up stream
             if self._stream:
-                print("[pk] Cleaning up stream")
                 await self._stream.input_stream.close()
                 self._stream = None
 
@@ -414,6 +406,8 @@ class AWSNovaSonicLLMService(LLMService):
             self._triggering_assistant_response = False
             self._disconnecting = False
             self._connected_time = None
+
+            logger.info("Finished disconnecting")
         except Exception as e:
             logger.error(f"{self} error disconnecting: {e}")
 
@@ -611,8 +605,6 @@ class AWSNovaSonicLLMService(LLMService):
         if not self._stream:
             return
 
-        # print(f"[pk] sending tool result. tool call ID: {tool_call_id}, result: {result}")
-
         content_name = str(uuid.uuid4())
 
         result_content_start = f'''
@@ -723,7 +715,6 @@ class AWSNovaSonicLLMService(LLMService):
                 await self.reset_conversation()
 
     async def _handle_completion_start_event(self, event_json):
-        # print("[pk] completion start")
         pass
 
     async def _handle_content_start_event(self, event_json):
@@ -744,10 +735,6 @@ class AWSNovaSonicLLMService(LLMService):
         )
         self._content_being_received = content
 
-        # print(f"[pk] content start: {content}")
-        # if content.role == Role.ASSISTANT:
-        #     print(f"[pk] assistant content start: {content}")
-
         if content.role == Role.ASSISTANT:
             if content.type == ContentType.AUDIO:
                 # Note that an assistant response can comprise of multiple audio blocks
@@ -763,9 +750,6 @@ class AWSNovaSonicLLMService(LLMService):
         content = self._content_being_received
 
         text_content = event_json["textOutput"]["content"]
-        # print(f"[pk] text output. content: {text_content}")
-        # if content.role == Role.ASSISTANT:
-        #     print(f"[pk] assistant text output. content: {text_content}")
 
         # Bookkeeping: augment the current content being received with text
         # Assumption: only one text content per content block
@@ -778,7 +762,6 @@ class AWSNovaSonicLLMService(LLMService):
 
         # Get audio
         audio_content = event_json["audioOutput"]["content"]
-        # print(f"[pk] audio output. content: {len(audio_content)}")
 
         # Push audio frame
         audio = base64.b64decode(audio_content)
@@ -800,10 +783,6 @@ class AWSNovaSonicLLMService(LLMService):
         tool_call_id = tool_use["toolUseId"]
         arguments = json.loads(tool_use["content"])
 
-        # print(
-        #     f"[pk] tool use - function_name: {function_name}, tool_call_id: {tool_call_id}, arguments: {arguments}"
-        # )
-
         # Call tool function
         if self.has_function(function_name):
             if function_name in self._functions.keys():
@@ -833,9 +812,6 @@ class AWSNovaSonicLLMService(LLMService):
 
         content_end = event_json["contentEnd"]
         stop_reason = content_end["stopReason"]
-        # print(f"[pk] content end: {content}.\n  stop_reason: {stop_reason}")
-        # if content.role == Role.ASSISTANT:
-        # print(f"[pk] assistant content end: {content}.\n  stop_reason: {stop_reason}")
 
         # Bookkeeping: clear current content being received
         self._content_being_received = None
@@ -856,25 +832,24 @@ class AWSNovaSonicLLMService(LLMService):
         self._content_being_received = False
 
     async def _handle_completion_end_event(self, event_json):
-        # print("[pk] completion end")
         pass
 
     async def _report_assistant_response_started(self):
+        logger.debug("Assistant response started")
+
         # Report that the assistant has started their response.
-        print("[pk] LLM full response started")
         await self.push_frame(LLMFullResponseStartFrame())
 
         # Report that equivalent of TTS (this is a speech-to-speech model) started
-        print("[pk] TTS started")
         await self.push_frame(TTSStartedFrame())
 
     async def _report_assistant_response_text_added(self, text):
+        logger.debug(f"Assistant response text added: {text}")
+
         # Report some text added to the ongoing assistant response
-        print(f"[pk] LLM text: {text}")
         await self.push_frame(LLMTextFrame(text))
 
         # Report some text added to the *equivalent* of TTS (this is a speech-to-speech model)
-        print(f"[pk] TTS text: {text}")
         await self.push_frame(TTSTextFrame(text))
 
         # TODO: this is a (hopefully temporary) HACK. Here we directly manipulate the context rather
@@ -890,19 +865,20 @@ class AWSNovaSonicLLMService(LLMService):
         self._context.buffer_assistant_text(text)
 
     async def _report_assistant_response_ended(self):
+        logger.debug("Assistant response ended")
+
         # Report that the assistant has finished their response.
-        print("[pk] LLM full response ended")
         await self.push_frame(LLMFullResponseEndFrame())
 
         # Report that equivalent of TTS (this is a speech-to-speech model) stopped.
-        print("[pk] TTS stopped")
         await self.push_frame(TTSStoppedFrame())
 
         # For an explanation of this hack, see _report_assistant_response_text_added.
         self._context.flush_aggregated_assistant_text()
 
     async def _report_user_transcription_text_added(self, text):
-        print(f"[pk] transcription: {text}")
+        logger.debug(f"User transcription text added: {text}")
+
         # Manually add new user transcription text to context.
         # We can't rely on the user context aggregator to do this since it's upstream from the LLM.
         self._context.add_user_transcription_text(text)
@@ -960,6 +936,8 @@ class AWSNovaSonicLLMService(LLMService):
             self._triggering_assistant_response = False
 
     async def _send_assistant_response_trigger(self):
+        logger.debug("Sending assistant response trigger...")
+
         chunk_duration = 0.02  # what we might get from InputAudioRawFrame
         chunk_size = int(
             chunk_duration
@@ -980,6 +958,9 @@ class AWSNovaSonicLLMService(LLMService):
             else None
         )
         if blank_audio_duration:
+            logger.debug(
+                f"Leading assistant response trigger with {blank_audio_duration}s of blank audio"
+            )
             blank_audio_chunk = b"\x00" * chunk_size
             num_chunks = int(blank_audio_duration / chunk_duration)
             for _ in range(num_chunks):
@@ -991,7 +972,6 @@ class AWSNovaSonicLLMService(LLMService):
         # if we ever need to seed this service again with context it would make sense to include it
         # since the instruction (i.e. the "wait for the trigger" instruction) will be part of the
         # context as well.
-        # print(f"[pk] sending trigger audio! {len(self._assistant_response_trigger_audio)}")
         audio_chunks = [
             self._assistant_response_trigger_audio[i : i + chunk_size]
             for i in range(0, len(self._assistant_response_trigger_audio), chunk_size)
diff --git a/src/pipecat/services/aws_nova_sonic/context.py b/src/pipecat/services/aws_nova_sonic/context.py
index d96c2d1ed..a8d9c4dba 100644
--- a/src/pipecat/services/aws_nova_sonic/context.py
+++ b/src/pipecat/services/aws_nova_sonic/context.py
@@ -94,7 +94,6 @@ class AWSNovaSonicLLMContext(OpenAILLMContext):
 
         # Process remaining messages to fill out conversation history.
         # Nova Sonic supports "user" and "assistant" messages in history.
-        # print(f"[pk] standard messages: {messages}")
         for message in messages:
             history_message = self.from_standard_message(message)
             if history_message:
@@ -136,11 +135,11 @@ class AWSNovaSonicLLMContext(OpenAILLMContext):
             "content": [{"type": "text", "text": text}],
         }
         self.add_message(message)
-        # print(f"[pk] context updated (user): {self.get_messages_for_logging()}")
+        # logger.debug(f"Context updated (user): {self.get_messages_for_logging()}")
 
     def buffer_assistant_text(self, text):
-        self._assistant_text += text  # TODO: determine if we need to add space or something
-        # print(f"[pk] assistant text buffered: {self._assistant_text}")
+        self._assistant_text += text
+        # logger.debug(f"Assistant text buffered: {self._assistant_text}")
 
     def flush_aggregated_assistant_text(self):
         if not self._assistant_text:

From 35848d10b36e71dd5761bf627a1b7794a6a11f86 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Tue, 6 May 2025 16:12:23 -0400
Subject: [PATCH 73/97] [WIP] AWS Nova Sonic service - remove various TODO
 comments

---
 src/pipecat/services/aws_nova_sonic/aws.py     | 14 +-------------
 src/pipecat/services/aws_nova_sonic/context.py |  7 +------
 2 files changed, 2 insertions(+), 19 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index 1e318f164..cac3cd53f 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -132,7 +132,6 @@ class AWSNovaSonicLLMService(LLMService):
     def __init__(
         self,
         *,
-        # TODO: if we have instruction here as an alternative to using context, we should do the same for tools...right?
         secret_access_key: str,
         access_key_id: str,
         region: str,
@@ -182,13 +181,6 @@ class AWSNovaSonicLLMService(LLMService):
     async def start(self, frame: StartFrame):
         await super().start(frame)
         self._wants_connection = True
-        # TODO: maybe connect but don't send history until we get all of our settings?
-        # how do we know how long to wait?
-        # ah, i think we'll *always* get at least one OpenAILLMContextFrame which kicks things off
-        # so we need to send the initial history when:
-        # - we're connected
-        # - we've gotten the first context
-        # i *think* this is what's controlled by _api_session_ready/_run_llm_when_api_session_ready
         await self._start_connecting()
 
     async def stop(self, frame: EndFrame):
@@ -247,7 +239,6 @@ class AWSNovaSonicLLMService(LLMService):
         if self._triggering_assistant_response:
             return
 
-        # TODO: check if _audio_input_paused? what causes that?
         await self._send_user_audio_event(frame.audio)
 
     async def _handle_bot_stopped_speaking(self):
@@ -417,9 +408,7 @@ class AWSNovaSonicLLMService(LLMService):
             region=self._region,
             aws_credentials_identity_resolver=StaticCredentialsResolver(
                 credentials=AWSCredentialsIdentity(
-                    access_key_id=self._access_key_id,
-                    secret_access_key=self._secret_access_key,
-                    # TODO: add additional stuff like aws_session_token
+                    access_key_id=self._access_key_id, secret_access_key=self._secret_access_key
                 )
             ),
             http_auth_scheme_resolver=HTTPAuthSchemeResolver(),
@@ -431,7 +420,6 @@ class AWSNovaSonicLLMService(LLMService):
     # LLM communication: input events (pipecat -> LLM)
     #
 
-    # TODO: make params configurable?
     async def _send_session_start_event(self):
         session_start = f"""
         {{
diff --git a/src/pipecat/services/aws_nova_sonic/context.py b/src/pipecat/services/aws_nova_sonic/context.py
index a8d9c4dba..561ae53db 100644
--- a/src/pipecat/services/aws_nova_sonic/context.py
+++ b/src/pipecat/services/aws_nova_sonic/context.py
@@ -150,7 +150,7 @@ class AWSNovaSonicLLMContext(OpenAILLMContext):
         }
         self._assistant_text = ""
         self.add_message(message)
-        # print(f"[pk] context updated (assistant): {self.get_messages_for_logging()}")
+        # logger.debug(f"Context updated (assistant): {self.get_messages_for_logging()}")
 
 
 @dataclass
@@ -168,11 +168,6 @@ class AWSNovaSonicUserContextAggregator(OpenAIUserContextAggregator):
         if isinstance(frame, LLMMessagesUpdateFrame):
             await self.push_frame(AWSNovaSonicMessagesUpdateFrame(context=self._context))
 
-        # Parent also doesn't push the LLMSetToolsFrame
-        # TODO: this
-        # if isinstance(frame, LLMSetToolsFrame):
-        #     await self.push_frame(frame, direction)
-
 
 class AWSNovaSonicAssistantContextAggregator(OpenAIAssistantContextAggregator):
     async def process_frame(self, frame: Frame, direction: FrameDirection):

From 5579145a0630a10fefbb1d9f1b71dfa599c84d07 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Tue, 6 May 2025 21:19:16 -0400
Subject: [PATCH 74/97] [WIP] AWS Nova Sonic service - post-rebase, update
 examples to play nicely with recent pipecat changes

---
 .../20e-persistent-context-aws-nova-sonic.py  | 42 +++++++++----------
 examples/foundational/39-aws-nova-sonic.py    | 14 +++----
 2 files changed, 27 insertions(+), 29 deletions(-)

diff --git a/examples/foundational/20e-persistent-context-aws-nova-sonic.py b/examples/foundational/20e-persistent-context-aws-nova-sonic.py
index 8ac1508f8..e092730fb 100644
--- a/examples/foundational/20e-persistent-context-aws-nova-sonic.py
+++ b/examples/foundational/20e-persistent-context-aws-nova-sonic.py
@@ -4,6 +4,7 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+import argparse
 import asyncio
 import glob
 import json
@@ -22,6 +23,7 @@ from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
 from pipecat.services.aws_nova_sonic.aws import AWSNovaSonicLLMService
+from pipecat.services.llm_service import FunctionCallParams
 from pipecat.transports.base_transport import TransportParams
 from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
 from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
@@ -31,21 +33,19 @@ load_dotenv(override=True)
 BASE_FILENAME = "/tmp/pipecat_conversation_"
 
 
-async def fetch_weather_from_api(function_name, tool_call_id, args, llm, context, result_callback):
-    temperature = 75 if args["format"] == "fahrenheit" else 24
-    await result_callback(
+async def fetch_weather_from_api(params: FunctionCallParams):
+    temperature = 75 if params.arguments["format"] == "fahrenheit" else 24
+    await params.result_callback(
         {
             "conditions": "nice",
             "temperature": temperature,
-            "format": args["format"],
+            "format": params.arguments["format"],
             "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
         }
     )
 
 
-async def get_saved_conversation_filenames(
-    function_name, tool_call_id, args, llm, context, result_callback
-):
+async def get_saved_conversation_filenames(params: FunctionCallParams):
     # Construct the full pattern including the BASE_FILENAME
     full_pattern = f"{BASE_FILENAME}*.json"
 
@@ -53,7 +53,7 @@ async def get_saved_conversation_filenames(
     matching_files = glob.glob(full_pattern)
     logger.debug(f"matching files: {matching_files}")
 
-    await result_callback({"filenames": matching_files})
+    await params.result_callback({"filenames": matching_files})
 
 
 # async def get_saved_conversation_filenames(
@@ -69,26 +69,26 @@ async def get_saved_conversation_filenames(
 #     await result_callback({"filenames": matching_files})
 
 
-async def save_conversation(function_name, tool_call_id, args, llm, context, result_callback):
+async def save_conversation(params: FunctionCallParams):
     timestamp = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
     filename = f"{BASE_FILENAME}{timestamp}.json"
     logger.debug(
-        f"writing conversation to {filename}\n{json.dumps(context.get_messages_for_persistent_storage(), indent=4)}"
+        f"writing conversation to {filename}\n{json.dumps(params.context.get_messages_for_persistent_storage(), indent=4)}"
     )
     try:
         with open(filename, "w") as file:
-            messages = context.get_messages_for_persistent_storage()
+            messages = params.context.get_messages_for_persistent_storage()
             # remove the last message, which is the instruction we just gave to save the conversation
             messages.pop()
             json.dump(messages, file, indent=2)
-        await result_callback({"success": True})
+        await params.result_callback({"success": True})
     except Exception as e:
-        await result_callback({"success": False, "error": str(e)})
+        await params.result_callback({"success": False, "error": str(e)})
 
 
-async def load_conversation(function_name, tool_call_id, args, llm, context, result_callback):
+async def load_conversation(params: FunctionCallParams):
     async def _reset():
-        filename = args["filename"]
+        filename = params.arguments["filename"]
         logger.debug(f"loading conversation from {filename}")
         try:
             with open(filename, "r") as file:
@@ -99,11 +99,11 @@ async def load_conversation(function_name, tool_call_id, args, llm, context, res
                         "content": f"{AWSNovaSonicLLMService.AWAIT_TRIGGER_ASSISTANT_RESPONSE_INSTRUCTION}",
                     }
                 )
-                context.set_messages(messages)
-                await llm.reset_conversation()
-                await llm.trigger_assistant_response()
+                params.context.set_messages(messages)
+                await params.llm.reset_conversation()
+                await params.llm.trigger_assistant_response()
         except Exception as e:
-            await result_callback({"success": False, "error": str(e)})
+            await params.result_callback({"success": False, "error": str(e)})
 
     asyncio.create_task(_reset())
 
@@ -161,7 +161,7 @@ tools = ToolsSchema(
 )
 
 
-async def run_bot(webrtc_connection: SmallWebRTCConnection):
+async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespace):
     logger.info(f"Starting bot")
 
     transport = SmallWebRTCTransport(
@@ -169,9 +169,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection):
         params=TransportParams(
             audio_in_enabled=True,
             audio_out_enabled=True,
-            vad_enabled=True,
             vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.8)),
-            vad_audio_passthrough=True,
         ),
     )
 
diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py
index af44cf790..9decc47ae 100644
--- a/examples/foundational/39-aws-nova-sonic.py
+++ b/examples/foundational/39-aws-nova-sonic.py
@@ -4,6 +4,7 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+import argparse
 import os
 from datetime import datetime
 
@@ -19,6 +20,7 @@ from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
 from pipecat.services.aws_nova_sonic import AWSNovaSonicLLMService
+from pipecat.services.llm_service import FunctionCallParams
 from pipecat.transports.base_transport import TransportParams
 from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
 from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
@@ -27,13 +29,13 @@ from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
 load_dotenv(override=True)
 
 
-async def fetch_weather_from_api(function_name, tool_call_id, args, llm, context, result_callback):
-    temperature = 75 if args["format"] == "fahrenheit" else 24
-    await result_callback(
+async def fetch_weather_from_api(params: FunctionCallParams):
+    temperature = 75 if params.arguments["format"] == "fahrenheit" else 24
+    await params.result_callback(
         {
             "conditions": "nice",
             "temperature": temperature,
-            "format": args["format"],
+            "format": params.arguments["format"],
             "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
         }
     )
@@ -60,7 +62,7 @@ weather_function = FunctionSchema(
 tools = ToolsSchema(standard_tools=[weather_function])
 
 
-async def run_bot(webrtc_connection: SmallWebRTCConnection):
+async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespace):
     logger.info(f"Starting bot")
 
     # Initialize the SmallWebRTCTransport with the connection
@@ -71,8 +73,6 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection):
             audio_in_sample_rate=16000,
             audio_out_enabled=True,
             camera_in_enabled=False,
-            vad_enabled=True,
-            vad_audio_passthrough=True,
             vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.8)),
         ),
     )

From 84736472694592220b393830fb4f61c06ffd84d0 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Tue, 6 May 2025 21:57:41 -0400
Subject: [PATCH 75/97] [WIP] AWS Nova Sonic service - update
 persistent-context example to better avoid saving "transitional", as opposed
 to meaningful, context messages

---
 .../20e-persistent-context-aws-nova-sonic.py  | 21 +++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/examples/foundational/20e-persistent-context-aws-nova-sonic.py b/examples/foundational/20e-persistent-context-aws-nova-sonic.py
index e092730fb..1519f1c53 100644
--- a/examples/foundational/20e-persistent-context-aws-nova-sonic.py
+++ b/examples/foundational/20e-persistent-context-aws-nova-sonic.py
@@ -72,15 +72,24 @@ async def get_saved_conversation_filenames(params: FunctionCallParams):
 async def save_conversation(params: FunctionCallParams):
     timestamp = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
     filename = f"{BASE_FILENAME}{timestamp}.json"
-    logger.debug(
-        f"writing conversation to {filename}\n{json.dumps(params.context.get_messages_for_persistent_storage(), indent=4)}"
-    )
     try:
         with open(filename, "w") as file:
             messages = params.context.get_messages_for_persistent_storage()
-            # remove the last message, which is the instruction we just gave to save the conversation
-            messages.pop()
-            json.dump(messages, file, indent=2)
+            # remove the last few messages. in reverse order, they are:
+            # - the in progress save tool call
+            # - the invocation of the save tool call
+            # - the user ask to save (which may encompass one or more messages)
+            # the simplest thing to do is to pop messages until the last one is an assistant
+            # response
+            while messages and not (
+                messages[-1].get("role") == "assistant" and "content" in messages[-1]
+            ):
+                messages.pop()
+            if messages:  # we never expect this to be empty
+                logger.debug(
+                    f"writing conversation to {filename}\n{json.dumps(messages, indent=4)}"
+                )
+                json.dump(messages, file, indent=2)
         await params.result_callback({"success": True})
     except Exception as e:
         await params.result_callback({"success": False, "error": str(e)})

From ed06cdd2c7ea2a27e95b207cc5b72cdc625cf2bd Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Tue, 6 May 2025 22:03:25 -0400
Subject: [PATCH 76/97] [WIP] AWS Nova Sonic service - add CHANGELOG entry

---
 CHANGELOG.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2eec61bca..bf6fec1b2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Added support for the AWS Nova Sonic speech-to-speech model with the new
+  `AWSNovaSonicLLMService`.
+  (see https://docs.aws.amazon.com/nova/latest/userguide/speech.html)
+
 - Added new AWS services `AWSBedrockLLMService` and `AWSTranscribeSTTService`.
 
 - Added `on_active_speaker_changed` event handler to the `DailyTransport` class.

From 896f8d85f70a43f6c20c74e77fe67abc48925967 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Tue, 6 May 2025 22:08:55 -0400
Subject: [PATCH 77/97] [WIP] AWS Nova Sonic service - remove out-of-date TODO
 comment

---
 examples/foundational/39-aws-nova-sonic.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py
index 9decc47ae..4ed533e18 100644
--- a/examples/foundational/39-aws-nova-sonic.py
+++ b/examples/foundational/39-aws-nova-sonic.py
@@ -108,7 +108,6 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac
     # Set up context and context management.
     # AWSNovaSonicService will adapt OpenAI LLM context objects with standard message format to
     # what's expected by Nova Sonic.
-    # TODO: since we can't trigger a response upon joining, this isn't particularly useful
     context = OpenAILLMContext(
         messages=[
             {"role": "system", "content": f"{system_instruction}"},

From 27bff7a75963298bd2325520afbc50c9f4dddc26 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Wed, 7 May 2025 09:11:27 -0400
Subject: [PATCH 78/97] [WIP] AWS Nova Sonic service - fix adapter docstring
 (it referred to OpenAI Realtime instead of AWS Nova Sonic) and drop the
 `Union` import

---
 src/pipecat/adapters/services/aws_nova_sonic_adapter.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/pipecat/adapters/services/aws_nova_sonic_adapter.py b/src/pipecat/adapters/services/aws_nova_sonic_adapter.py
index b96980046..dc7eef92d 100644
--- a/src/pipecat/adapters/services/aws_nova_sonic_adapter.py
+++ b/src/pipecat/adapters/services/aws_nova_sonic_adapter.py
@@ -4,7 +4,7 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 import json
-from typing import Any, Dict, List, Union
+from typing import Any, Dict, List
 
 from pipecat.adapters.base_llm_adapter import BaseLLMAdapter
 from pipecat.adapters.schemas.function_schema import FunctionSchema
@@ -31,9 +31,9 @@ class AWSNovaSonicLLMAdapter(BaseLLMAdapter):
         }
 
     def to_provider_tools_format(self, tools_schema: ToolsSchema) -> List[Dict[str, Any]]:
-        """Converts function schemas to Openai Realtime function-calling format.
+        """Converts function schemas to AWS Nova Sonic function-calling format.
 
-        :return: Openai Realtime formatted function call definition.
+        :return: AWS Nova Sonic formatted function call definition.
         """
 
         functions_schema = tools_schema.standard_tools

From 4ba9a428610e074124499849fb0a8e22ccadd5d5 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Wed, 7 May 2025 09:52:05 -0400
Subject: [PATCH 79/97] [WIP] AWS Nova Sonic service - add more accurate typing
 (`Optional[...]` annotations), with matching None guards

---
 src/pipecat/services/aws_nova_sonic/aws.py | 74 ++++++++++++++--------
 1 file changed, 47 insertions(+), 27 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index cac3cd53f..a2ef78f88 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -148,30 +148,34 @@ class AWSNovaSonicLLMService(LLMService):
         self._access_key_id = access_key_id
         self._region = region
         self._model = model
-        self._client: BedrockRuntimeClient = None
+        self._client: Optional[BedrockRuntimeClient] = None
         self._voice_id = voice_id
         self._params = params
         self._system_instruction = system_instruction
         self._tools = tools
         self._send_transcription_frames = send_transcription_frames
-        self._context: AWSNovaSonicLLMContext = None
-        self._stream: DuplexEventStream[
-            InvokeModelWithBidirectionalStreamInput,
-            InvokeModelWithBidirectionalStreamOutput,
-            InvokeModelWithBidirectionalStreamOperationOutput,
+        self._context: Optional[AWSNovaSonicLLMContext] = None
+        self._stream: Optional[
+            DuplexEventStream[
+                InvokeModelWithBidirectionalStreamInput,
+                InvokeModelWithBidirectionalStreamOutput,
+                InvokeModelWithBidirectionalStreamOperationOutput,
+            ]
         ] = None
-        self._receive_task = None
-        self._prompt_name = None
-        self._input_audio_content_name = None
-        self._content_being_received = None
+        self._receive_task: Optional[asyncio.Task] = None
+        self._prompt_name: Optional[str] = None
+        self._input_audio_content_name: Optional[str] = None
+        self._content_being_received: Optional[CurrentContent] = None
         self._assistant_is_responding = False
         self._context_available = False
         self._ready_to_send_context = False
         self._handling_bot_stopped_speaking = False
         self._triggering_assistant_response = False
-        self._assistant_response_trigger_audio: bytes = None  # Not cleared on _disconnect()
+        self._assistant_response_trigger_audio: Optional[bytes] = (
+            None  # Not cleared on _disconnect()
+        )
         self._disconnecting = False
-        self._connected_time: float = None
+        self._connected_time: Optional[float] = None
         self._wants_connection = False
 
     #
@@ -437,6 +441,9 @@ class AWSNovaSonicLLMService(LLMService):
         await self._send_client_event(session_start)
 
     async def _send_prompt_start_event(self, tools: List[Any]):
+        if not self._prompt_name:
+            return
+
         tools_config = (
             f""",
         "toolUseOutputConfiguration": {{
@@ -474,6 +481,9 @@ class AWSNovaSonicLLMService(LLMService):
         await self._send_client_event(prompt_start)
 
     async def _send_audio_input_start_event(self):
+        if not self._prompt_name:
+            return
+
         audio_content_start = f'''
         {{
             "event": {{
@@ -498,7 +508,7 @@ class AWSNovaSonicLLMService(LLMService):
         await self._send_client_event(audio_content_start)
 
     async def _send_text_event(self, text: str, role: Role):
-        if not self._stream or not text:
+        if not self._stream or not self._prompt_name or not text:
             return
 
         content_name = str(uuid.uuid4())
@@ -566,7 +576,7 @@ class AWSNovaSonicLLMService(LLMService):
         await self._send_client_event(audio_event)
 
     async def _send_session_end_events(self):
-        if not self._stream:
+        if not self._stream or not self._prompt_name:
             return
 
         prompt_end = f'''
@@ -590,7 +600,7 @@ class AWSNovaSonicLLMService(LLMService):
         await self._send_client_event(session_end)
 
     async def _send_tool_result(self, tool_call_id, result):
-        if not self._stream:
+        if not self._stream or not self._prompt_name:
             return
 
         content_name = str(uuid.uuid4())
@@ -643,6 +653,9 @@ class AWSNovaSonicLLMService(LLMService):
         await self._send_client_event(result_content_end)
 
     async def _send_client_event(self, event_json: str):
+        if not self._stream:  # should never happen
+            return
+
         event = InvokeModelWithBidirectionalStreamInputChunk(
             value=BidirectionalInputPayloadPart(bytes_=event_json.encode("utf-8"))
         )
@@ -732,8 +745,7 @@ class AWSNovaSonicLLMService(LLMService):
                     await self._report_assistant_response_started()
 
     async def _handle_text_output_event(self, event_json):
-        # This should never happen
-        if not self._content_being_received:
+        if not self._content_being_received:  # should never happen
             return
         content = self._content_being_received
 
@@ -744,8 +756,7 @@ class AWSNovaSonicLLMService(LLMService):
         content.text_content = text_content
 
     async def _handle_audio_output_event(self, event_json):
-        # This should never happen
-        if not self._content_being_received:
+        if not self._content_being_received:  # should never happen
             return
 
         # Get audio
@@ -761,8 +772,7 @@ class AWSNovaSonicLLMService(LLMService):
         await self.push_frame(frame)
 
     async def _handle_tool_use_event(self, event_json):
-        # This should never happen
-        if not self._content_being_received:
+        if not self._content_being_received or not self._context:  # should never happen
             return
 
         # Get tool use details
@@ -793,8 +803,7 @@ class AWSNovaSonicLLMService(LLMService):
             )
 
     async def _handle_content_end_event(self, event_json):
-        # This should never happen
-        if not self._content_being_received:
+        if not self._content_being_received:  # should never happen
             return
         content = self._content_being_received
 
@@ -817,8 +826,6 @@ class AWSNovaSonicLLMService(LLMService):
                     # User transcription text added
                     await self._report_user_transcription_text_added(content.text_content)
 
-        self._content_being_received = False
-
     async def _handle_completion_end_event(self, event_json):
         pass
 
@@ -832,6 +839,9 @@ class AWSNovaSonicLLMService(LLMService):
         await self.push_frame(TTSStartedFrame())
 
     async def _report_assistant_response_text_added(self, text):
+        if not self._context:  # should never happen
+            return
+
         logger.debug(f"Assistant response text added: {text}")
 
         # Report some text added to the ongoing assistant response
@@ -853,6 +863,9 @@ class AWSNovaSonicLLMService(LLMService):
         self._context.buffer_assistant_text(text)
 
     async def _report_assistant_response_ended(self):
+        if not self._context:  # should never happen
+            return
+
         logger.debug("Assistant response ended")
 
         # Report that the assistant has finished their response.
@@ -865,6 +878,9 @@ class AWSNovaSonicLLMService(LLMService):
         self._context.flush_aggregated_assistant_text()
 
     async def _report_user_transcription_text_added(self, text):
+        if not self._context:  # should never happen
+            return
+
         logger.debug(f"User transcription text added: {text}")
 
         # Manually add new user transcription text to context.
@@ -918,12 +934,16 @@ class AWSNovaSonicLLMService(LLMService):
                 self._assistant_response_trigger_audio = wav_file.readframes(wav_file.getnframes())
 
         # Send the trigger audio, if we're fully connected and set up
-        # NOTE: maybe there's a better way to determine whether we're done setting up?
-        if self._receive_task:
+        if self._connected_time is not None:
             await self._send_assistant_response_trigger()
             self._triggering_assistant_response = False
 
     async def _send_assistant_response_trigger(self):
+        if (
+            not self._assistant_response_trigger_audio or self._connected_time is None
+        ):  # should never happen
+            return
+
         logger.debug("Sending assistant response trigger...")
 
         chunk_duration = 0.02  # what we might get from InputAudioRawFrame

From 52036138c1ff2365f17c4b03df89e711b1aeb51a Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Wed, 7 May 2025 10:22:51 -0400
Subject: [PATCH 80/97] [WIP] AWS Nova Sonic service - remove unnecessary
 (no-op) code

---
 src/pipecat/services/aws_nova_sonic/aws.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index a2ef78f88..6838daad6 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -269,7 +269,6 @@ class AWSNovaSonicLLMService(LLMService):
             await asyncio.sleep(0.25)
             self._assistant_is_responding = False
             await self._report_assistant_response_ended()
-            self._handling_bot_stopped_speaking = False
 
         self._handling_bot_stopped_speaking = False
 

From b013e375fb90f0810e1c1b367b5834e5f70d8fdc Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Wed, 7 May 2025 10:38:23 -0400
Subject: [PATCH 81/97] [WIP] AWS Nova Sonic service - merge the duplicate
 `call_function` branches in tool-call handling (and do the same in the
 OpenAI Realtime service)

---
 src/pipecat/services/aws_nova_sonic/aws.py          |  9 +--------
 src/pipecat/services/openai_realtime_beta/openai.py | 10 +---------
 2 files changed, 2 insertions(+), 17 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index 6838daad6..3f41a0166 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -782,14 +782,7 @@ class AWSNovaSonicLLMService(LLMService):
 
         # Call tool function
         if self.has_function(function_name):
-            if function_name in self._functions.keys():
-                await self.call_function(
-                    context=self._context,
-                    tool_call_id=tool_call_id,
-                    function_name=function_name,
-                    arguments=arguments,
-                )
-            elif None in self._functions.keys():
+            if function_name in self._functions.keys() or None in self._functions.keys():
                 await self.call_function(
                     context=self._context,
                     tool_call_id=tool_call_id,
diff --git a/src/pipecat/services/openai_realtime_beta/openai.py b/src/pipecat/services/openai_realtime_beta/openai.py
index 334ce98c8..0c37f73ce 100644
--- a/src/pipecat/services/openai_realtime_beta/openai.py
+++ b/src/pipecat/services/openai_realtime_beta/openai.py
@@ -577,15 +577,7 @@ class OpenAIRealtimeBetaLLMService(LLMService):
             arguments = json.loads(item.arguments)
             if self.has_function(function_name):
                 run_llm = index == total_items - 1
-                if function_name in self._functions.keys():
-                    await self.call_function(
-                        context=self._context,
-                        tool_call_id=tool_id,
-                        function_name=function_name,
-                        arguments=arguments,
-                        run_llm=run_llm,
-                    )
-                elif None in self._functions.keys():
+                if function_name in self._functions.keys() or None in self._functions.keys():
                     await self.call_function(
                         context=self._context,
                         tool_call_id=tool_id,

From c78f7798004a505522765c00b3e043ffbaa6784d Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Wed, 7 May 2025 10:55:43 -0400
Subject: [PATCH 82/97] [WIP] AWS Nova Sonic service - log an error message if
 you try to use AWS Nova Sonic without the proper dependency (e.g. without
 having done `pip install pipecat-ai[aws]`)

---
 src/pipecat/services/aws_nova_sonic/aws.py | 40 ++++++++++++----------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index 3f41a0166..b4989185a 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -15,23 +15,8 @@ from enum import Enum
 from importlib.resources import files
 from typing import Any, List, Optional
 
-from aws_sdk_bedrock_runtime.client import (
-    BedrockRuntimeClient,
-    InvokeModelWithBidirectionalStreamOperationInput,
-)
-from aws_sdk_bedrock_runtime.config import Config, HTTPAuthSchemeResolver, SigV4AuthScheme
-from aws_sdk_bedrock_runtime.models import (
-    BidirectionalInputPayloadPart,
-    InvokeModelWithBidirectionalStreamInput,
-    InvokeModelWithBidirectionalStreamInputChunk,
-    InvokeModelWithBidirectionalStreamOperationOutput,
-    InvokeModelWithBidirectionalStreamOutput,
-)
 from loguru import logger
 from pydantic import BaseModel, Field
-from smithy_aws_core.credentials_resolvers.static import StaticCredentialsResolver
-from smithy_aws_core.identity import AWSCredentialsIdentity
-from smithy_core.aio.eventstream import DuplexEventStream
 
 from pipecat.adapters.schemas.tools_schema import ToolsSchema
 from pipecat.adapters.services.aws_nova_sonic_adapter import AWSNovaSonicLLMAdapter
@@ -45,15 +30,11 @@ from pipecat.frames.frames import (
     LLMFullResponseStartFrame,
     LLMTextFrame,
     StartFrame,
-    StartInterruptionFrame,
-    StopInterruptionFrame,
     TranscriptionFrame,
     TTSAudioRawFrame,
     TTSStartedFrame,
     TTSStoppedFrame,
     TTSTextFrame,
-    UserStartedSpeakingFrame,
-    UserStoppedSpeakingFrame,
 )
 from pipecat.processors.aggregators.llm_response import (
     LLMAssistantAggregatorParams,
@@ -75,6 +56,27 @@ from pipecat.services.aws_nova_sonic.frames import AWSNovaSonicFunctionCallResul
 from pipecat.services.llm_service import LLMService
 from pipecat.utils.time import time_now_iso8601
 
+try:
+    from aws_sdk_bedrock_runtime.client import (
+        BedrockRuntimeClient,
+        InvokeModelWithBidirectionalStreamOperationInput,
+    )
+    from aws_sdk_bedrock_runtime.config import Config, HTTPAuthSchemeResolver, SigV4AuthScheme
+    from aws_sdk_bedrock_runtime.models import (
+        BidirectionalInputPayloadPart,
+        InvokeModelWithBidirectionalStreamInput,
+        InvokeModelWithBidirectionalStreamInputChunk,
+        InvokeModelWithBidirectionalStreamOperationOutput,
+        InvokeModelWithBidirectionalStreamOutput,
+    )
+    from smithy_aws_core.credentials_resolvers.static import StaticCredentialsResolver
+    from smithy_aws_core.identity import AWSCredentialsIdentity
+    from smithy_core.aio.eventstream import DuplexEventStream
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error("In order to use AWS services, you need to `pip install pipecat-ai[aws]`.")
+    raise Exception(f"Missing module: {e}")
+
 
 class AWSNovaSonicUnhandledFunctionException(Exception):
     pass

From 1491462d157509eb336d5111a4f4a8f875e2cd90 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Wed, 7 May 2025 11:10:54 -0400
Subject: [PATCH 83/97] [WIP] AWS Nova Sonic service - remove
 `_handling_bot_stopped_speaking`, which no longer seems to be necessary; I'm
 no longer observing back-to-back `BotStoppedSpeaking` frames

---
 src/pipecat/services/aws_nova_sonic/aws.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index b4989185a..9f2f1f72e 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -171,7 +171,6 @@ class AWSNovaSonicLLMService(LLMService):
         self._assistant_is_responding = False
         self._context_available = False
         self._ready_to_send_context = False
-        self._handling_bot_stopped_speaking = False
         self._triggering_assistant_response = False
         self._assistant_response_trigger_audio: Optional[bytes] = (
             None  # Not cleared on _disconnect()
@@ -248,10 +247,6 @@ class AWSNovaSonicLLMService(LLMService):
         await self._send_user_audio_event(frame.audio)
 
     async def _handle_bot_stopped_speaking(self):
-        # Protect against back-to-back BotStoppedSpeaking calls, which I've observed
-        if self._handling_bot_stopped_speaking:
-            return
-        self._handling_bot_stopped_speaking = True
 
         if self._assistant_is_responding:
             # Consider the assistant finished with their response (after a short delay, to allow for
@@ -272,8 +267,6 @@ class AWSNovaSonicLLMService(LLMService):
             self._assistant_is_responding = False
             await self._report_assistant_response_ended()
 
-        self._handling_bot_stopped_speaking = False
-
     async def _handle_function_call_result(self, frame: AWSNovaSonicFunctionCallResultFrame):
         result = frame.result_frame
         await self._send_tool_result(tool_call_id=result.tool_call_id, result=result.result)
@@ -398,7 +391,6 @@ class AWSNovaSonicLLMService(LLMService):
             self._assistant_is_responding = False
             self._context_available = False
             self._ready_to_send_context = False
-            self._handling_bot_stopped_speaking = False
             self._triggering_assistant_response = False
             self._disconnecting = False
             self._connected_time = None

From b53f9235e4bd9f9fefe6d0e0b6761e972a3d8bf0 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Wed, 7 May 2025 11:42:37 -0400
Subject: [PATCH 84/97] [WIP] AWS Nova Sonic service - remove unnecessary
 `_context_available` state, instead just relying on the presence of
 `_context`

---
 src/pipecat/services/aws_nova_sonic/aws.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index 9f2f1f72e..4056d0ed0 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -169,7 +169,6 @@ class AWSNovaSonicLLMService(LLMService):
         self._input_audio_content_name: Optional[str] = None
         self._content_being_received: Optional[CurrentContent] = None
         self._assistant_is_responding = False
-        self._context_available = False
         self._ready_to_send_context = False
         self._triggering_assistant_response = False
         self._assistant_response_trigger_audio: Optional[bytes] = (
@@ -205,11 +204,13 @@ class AWSNovaSonicLLMService(LLMService):
     async def reset_conversation(self):
         logger.debug("Resetting conversation")
         await self._handle_bot_stopped_speaking()
+
+        # Carry over previous context through disconnect
+        context = self._context
         await self._disconnect()
+        self._context = context
+
         await self._start_connecting()
-        # Use existing context
-        self._context_available = True
-        await self._finish_connecting_if_context_available()
 
     #
     # frame processing
@@ -235,7 +236,6 @@ class AWSNovaSonicLLMService(LLMService):
             self._context = AWSNovaSonicLLMContext.upgrade_to_nova_sonic(
                 context, self._system_instruction
             )
-            self._context_available = True
             await self._finish_connecting_if_context_available()
 
     async def _handle_input_audio_frame(self, frame: InputAudioRawFrame):
@@ -247,7 +247,6 @@ class AWSNovaSonicLLMService(LLMService):
         await self._send_user_audio_event(frame.audio)
 
     async def _handle_bot_stopped_speaking(self):
-
         if self._assistant_is_responding:
             # Consider the assistant finished with their response (after a short delay, to allow for
             # any FINAL text block to come in).
@@ -308,7 +307,7 @@ class AWSNovaSonicLLMService(LLMService):
     async def _finish_connecting_if_context_available(self):
         # We can only finish connecting once we've gotten our initial context and we're ready to
         # send it
-        if not (self._context_available and self._ready_to_send_context):
+        if not (self._context and self._ready_to_send_context):
             return
 
         logger.info("Finishing connecting (setting up session)...")
@@ -389,7 +388,6 @@ class AWSNovaSonicLLMService(LLMService):
             self._input_audio_content_name = None
             self._content_being_received = None
             self._assistant_is_responding = False
-            self._context_available = False
             self._ready_to_send_context = False
             self._triggering_assistant_response = False
             self._disconnecting = False

From 93c9cc4a0e7dbd27f9d5adcbcb76215b35184bbf Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Wed, 7 May 2025 12:03:23 -0400
Subject: [PATCH 85/97] [WIP] AWS Nova Sonic service - minor fix

---
 src/pipecat/services/aws_nova_sonic/aws.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index 4056d0ed0..eab12272c 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -669,7 +669,7 @@ class AWSNovaSonicLLMService(LLMService):
     # The overall completion is wrapped by "completionStart" and "completionEnd" events.
     async def _receive_task_handler(self):
         try:
-            while self._client and not self._disconnecting:
+            while self._stream and not self._disconnecting:
                 output = await self._stream.await_output()
                 result = await output[1].receive()
 

From 2920aa5af477720227666d93e057d7b25424b36b Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Wed, 7 May 2025 14:32:32 -0400
Subject: [PATCH 86/97] [WIP] AWS Nova Sonic service - pull AWS Nova Sonic
 support out of the `aws` optional dependency in pyproject.toml and into its
 own `aws-nova-sonic` optional dependency. That's because it requires Python
 >= 3.12, a higher version than the base project's 3.10. This change allows
 anyone using any of the other AWS services (including our own unit tests) to
 continue using the lower Python version.

---
 CHANGELOG.md                               | 3 ++-
 pyproject.toml                             | 3 ++-
 src/pipecat/services/aws_nova_sonic/aws.py | 4 +++-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index bf6fec1b2..319dce632 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,7 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Added support for the AWS Nova Sonic speech-to-speech model with the new
   `AWSNovaSonicLLMService`.
-  (see https://docs.aws.amazon.com/nova/latest/userguide/speech.html)
+  See https://docs.aws.amazon.com/nova/latest/userguide/speech.html.
+  Note that it requires Python >= 3.12 and `pip install pipecat-ai[aws-nova-sonic]`.
 
 - Added new AWS services `AWSBedrockLLMService` and `AWSTranscribeSTTService`.
 
diff --git a/pyproject.toml b/pyproject.toml
index 7ce167d77..06d7fb0a4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,7 +41,8 @@ Website = "https://pipecat.ai"
 [project.optional-dependencies]
 anthropic = [ "anthropic~=0.49.0" ]
 assemblyai = [ "assemblyai~=0.37.0" ]
-aws = [ "boto3~=1.37.16", "websockets~=13.1", "aws_sdk_bedrock_runtime~=0.0.2" ]
+aws = [ "boto3~=1.37.16", "websockets~=13.1" ]
+aws-nova-sonic = [ "aws_sdk_bedrock_runtime~=0.0.2" ]
 azure = [ "azure-cognitiveservices-speech~=1.42.0"]
 cartesia = [ "cartesia~=1.4.0", "websockets~=13.1" ]
 cerebras = []
diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index eab12272c..b53578f5a 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -74,7 +74,9 @@ try:
     from smithy_core.aio.eventstream import DuplexEventStream
 except ModuleNotFoundError as e:
     logger.error(f"Exception: {e}")
-    logger.error("In order to use AWS services, you need to `pip install pipecat-ai[aws]`.")
+    logger.error(
+        "In order to use AWS services, you need to `pip install pipecat-ai[aws-nova-sonic]`."
+    )
     raise Exception(f"Missing module: {e}")
 
 
From a3038afa023b4a609fb62dffec21f4da3e780078 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= <aleix@daily.co>
Date: Wed, 7 May 2025 11:39:36 -0700
Subject: [PATCH 87/97] DailyTransport: fix multiple audio/video sources

---
 CHANGELOG.md                             |  3 +++
 src/pipecat/transports/services/daily.py | 17 +++++++++--------
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 319dce632..16da15420 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -39,6 +39,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+- Fixed a `DailyTransport` issue that was causing problems when multiple audio or
+  video sources were being captured.
+
 - Fixed a `UltravoxSTTService` issue that would cause the service to generate
   all tokens as one word.
 
diff --git a/src/pipecat/transports/services/daily.py b/src/pipecat/transports/services/daily.py
index 5d00e76bc..9118b2107 100644
--- a/src/pipecat/transports/services/daily.py
+++ b/src/pipecat/transports/services/daily.py
@@ -700,7 +700,7 @@ class DailyTransportClient(EventHandler):
 
         await self.update_subscriptions(participant_settings={participant_id: media})
 
-        self._audio_renderers[participant_id] = {audio_source: callback}
+        self._audio_renderers.setdefault(participant_id, {})[audio_source] = callback
 
         self._client.set_audio_renderer(
             participant_id,
@@ -724,7 +724,7 @@ class DailyTransportClient(EventHandler):
 
         await self.update_subscriptions(participant_settings={participant_id: media})
 
-        self._video_renderers[participant_id] = {video_source: callback}
+        self._video_renderers.setdefault(participant_id, {})[video_source] = callback
 
         self._client.set_video_renderer(
             participant_id,
@@ -1061,12 +1061,13 @@ class DailyInputTransport(BaseInputTransport):
         video_source: str = "camera",
         color_format: str = "RGB",
     ):
-        self._video_renderers[participant_id] = {
-            video_source: {
-                "framerate": framerate,
-                "timestamp": 0,
-                "render_next_frame": [],
-            }
+        if participant_id not in self._video_renderers:
+            self._video_renderers[participant_id] = {}
+
+        self._video_renderers[participant_id][video_source] = {
+            "framerate": framerate,
+            "timestamp": 0,
+            "render_next_frame": [],
         }
 
         await self._client.capture_participant_video(

From ed00f7d071973ac64bce555b0ff20aff323eede4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= <aleix@daily.co>
Date: Wed, 7 May 2025 11:42:16 -0700
Subject: [PATCH 88/97] add video_source field to UserImageRequestFrame

---
 CHANGELOG.md                             | 3 +++
 src/pipecat/frames/frames.py             | 3 ++-
 src/pipecat/services/llm_service.py      | 2 ++
 src/pipecat/transports/services/daily.py | 3 ++-
 4 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 16da15420..3d2132d8d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- `UserImageRequestFrame.video_source` field has been added to request an image
+  from the desired video source.
+
 - Added support for the AWS Nova Sonic speech-to-speech model with the new
   `AWSNovaSonicLLMService`.
   See https://docs.aws.amazon.com/nova/latest/userguide/speech.html.
diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py
index 05f5b666d..8d3f38459 100644
--- a/src/pipecat/frames/frames.py
+++ b/src/pipecat/frames/frames.py
@@ -715,9 +715,10 @@ class UserImageRequestFrame(SystemFrame):
     context: Optional[Any] = None
     function_name: Optional[str] = None
     tool_call_id: Optional[str] = None
+    video_source: Optional[str] = None
 
     def __str__(self):
-        return f"{self.name}(user: {self.user_id}, function: {self.function_name}, request: {self.tool_call_id})"
+        return f"{self.name}(user: {self.user_id}, video_source: {self.video_source}, function: {self.function_name}, request: {self.tool_call_id})"
 
 
 @dataclass
diff --git a/src/pipecat/services/llm_service.py b/src/pipecat/services/llm_service.py
index 15b2bd6e5..21b62325d 100644
--- a/src/pipecat/services/llm_service.py
+++ b/src/pipecat/services/llm_service.py
@@ -190,6 +190,7 @@ class LLMService(AIService):
         function_name: Optional[str] = None,
         tool_call_id: Optional[str] = None,
         text_content: Optional[str] = None,
+        video_source: Optional[str] = None,
     ):
         await self.push_frame(
             UserImageRequestFrame(
@@ -197,6 +198,7 @@ class LLMService(AIService):
                 function_name=function_name,
                 tool_call_id=tool_call_id,
                 context=text_content,
+                video_source=video_source,
             ),
             FrameDirection.UPSTREAM,
         )
diff --git a/src/pipecat/transports/services/daily.py b/src/pipecat/transports/services/daily.py
index 9118b2107..f1a514d0e 100644
--- a/src/pipecat/transports/services/daily.py
+++ b/src/pipecat/transports/services/daily.py
@@ -1076,7 +1076,8 @@ class DailyInputTransport(BaseInputTransport):
 
     async def request_participant_image(self, frame: UserImageRequestFrame):
         if frame.user_id in self._video_renderers:
-            self._video_renderers[frame.user_id]["render_next_frame"].append(frame)
+            video_source = frame.video_source if frame.video_source else "camera"
+            self._video_renderers[frame.user_id][video_source]["render_next_frame"].append(frame)
 
     async def _on_participant_video_frame(
         self, participant_id: str, video_frame: VideoFrame, video_source: str

From cdf0953722fa7892d5c458dacf7b058654adfba2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= <aleix@daily.co>
Date: Wed, 7 May 2025 11:56:36 -0700
Subject: [PATCH 89/97] pyproject: update daily-python to 0.18.2

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 13305933b..9c864c413 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,7 +46,7 @@ azure = [ "azure-cognitiveservices-speech~=1.42.0"]
 cartesia = [ "cartesia~=1.4.0", "websockets~=13.1" ]
 cerebras = []
 deepseek = []
-daily = [ "daily-python~=0.18.1" ]
+daily = [ "daily-python~=0.18.2" ]
 deepgram = [ "deepgram-sdk~=3.8.0" ]
 elevenlabs = [ "websockets~=13.1" ]
 fal = [ "fal-client~=0.5.9" ]

From 84d040c6d0aed545091c0884611a2b5e13611e16 Mon Sep 17 00:00:00 2001
From: Paul Kompfner <paul@daily.co>
Date: Wed, 7 May 2025 16:21:47 -0400
Subject: [PATCH 90/97] AWS Nova Sonic service - make interruption handling
 more reliable, in terms of: - not getting the conversation into a "stuck"
 state - not losing assistant text that should've made it into the context

---
 src/pipecat/services/aws_nova_sonic/aws.py | 64 +++++++++++++++-------
 1 file changed, 43 insertions(+), 21 deletions(-)

diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py
index b53578f5a..410481065 100644
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -172,6 +172,7 @@ class AWSNovaSonicLLMService(LLMService):
         self._content_being_received: Optional[CurrentContent] = None
         self._assistant_is_responding = False
         self._ready_to_send_context = False
+        self._handling_bot_stopped_speaking = False
         self._triggering_assistant_response = False
         self._assistant_response_trigger_audio: Optional[bytes] = (
             None  # Not cleared on _disconnect()
@@ -205,7 +206,7 @@ class AWSNovaSonicLLMService(LLMService):
 
     async def reset_conversation(self):
         logger.debug("Resetting conversation")
-        await self._handle_bot_stopped_speaking()
+        await self._handle_bot_stopped_speaking(delay_to_catch_trailing_assistant_text=False)
 
         # Carry over previous context through disconnect
         context = self._context
@@ -226,7 +227,7 @@ class AWSNovaSonicLLMService(LLMService):
         elif isinstance(frame, InputAudioRawFrame):
             await self._handle_input_audio_frame(frame)
         elif isinstance(frame, BotStoppedSpeakingFrame):
-            await self._handle_bot_stopped_speaking()
+            await self._handle_bot_stopped_speaking(delay_to_catch_trailing_assistant_text=True)
         elif isinstance(frame, AWSNovaSonicFunctionCallResultFrame):
             await self._handle_function_call_result(frame)
 
@@ -248,25 +249,45 @@ class AWSNovaSonicLLMService(LLMService):
 
         await self._send_user_audio_event(frame.audio)
 
-    async def _handle_bot_stopped_speaking(self):
-        if self._assistant_is_responding:
-            # Consider the assistant finished with their response (after a short delay, to allow for
-            # any FINAL text block to come in).
-            #
-            # TODO: ideally we could base this solely on the LLM output events, but I couldn't
-            # figure out a reliable way to determine when we've gotten our last FINAL text block
-            # after the LLM is done talking.
-            #
-            # First I looked at stopReason, but it doesn't seem like the last FINAL text block is
-            # reliably marked END_TURN (sometimes the *first* one is, but not the last...bug?)
-            #
-            # Then I considered schemes where we tally or match up SPECULATIVE text blocks with
-            # FINAL text blocks to know how many or which FINAL blocks to expect, but user
-            # interruptions throw a wrench in these schemes: depending on the exact timing of the
-            # interruption, we should or shouldn't expect some FINAL blocks.
-            await asyncio.sleep(0.25)
-            self._assistant_is_responding = False
-            await self._report_assistant_response_ended()
+    async def _handle_bot_stopped_speaking(self, delay_to_catch_trailing_assistant_text: bool):
+        # Protect against back-to-back BotStoppedSpeaking calls, which I've observed
+        if self._handling_bot_stopped_speaking:
+            return
+        self._handling_bot_stopped_speaking = True
+
+        async def finalize_assistant_response():
+            if self._assistant_is_responding:
+                # Consider the assistant finished with their response (possibly after a short delay,
+                # to allow for any trailing FINAL assistant text block to come in that need to make
+                # it into context).
+                #
+                # TODO: ideally we could base this solely on the LLM output events, but I couldn't
+                # figure out a reliable way to determine when we've gotten our last FINAL text block
+                # after the LLM is done talking.
+                #
+                # First I looked at stopReason, but it doesn't seem like the last FINAL text block
+                # is reliably marked END_TURN (sometimes the *first* one is, but not the last...
+                # bug?)
+                #
+                # Then I considered schemes where we tally or match up SPECULATIVE text blocks with
+                # FINAL text blocks to know how many or which FINAL blocks to expect, but user
+                # interruptions throw a wrench in these schemes: depending on the exact timing of
+                # the interruption, we should or shouldn't expect some FINAL blocks.
+                if delay_to_catch_trailing_assistant_text:
+                    # This delay length is a balancing act between "catching" trailing assistant
+                    # text that is quite delayed but not waiting so long that user text comes in
+                    # first and results in a bit of context message order scrambling.
+                    await asyncio.sleep(1.25)
+                self._assistant_is_responding = False
+                await self._report_assistant_response_ended()
+
+            self._handling_bot_stopped_speaking = False
+
+        # Finalize the assistant response, either now or after a delay
+        if delay_to_catch_trailing_assistant_text:
+            self.create_task(finalize_assistant_response())
+        else:
+            await finalize_assistant_response()
 
     async def _handle_function_call_result(self, frame: AWSNovaSonicFunctionCallResultFrame):
         result = frame.result_frame
@@ -391,6 +412,7 @@ class AWSNovaSonicLLMService(LLMService):
             self._content_being_received = None
             self._assistant_is_responding = False
             self._ready_to_send_context = False
+            self._handling_bot_stopped_speaking = False
             self._triggering_assistant_response = False
             self._disconnecting = False
             self._connected_time = None

From 9e16e3d614ce9ee4eef72dd7d744dbb584c836f6 Mon Sep 17 00:00:00 2001
From: Mark Backman <mark@daily.co>
Date: Wed, 7 May 2025 11:00:55 -0400
Subject: [PATCH 91/97] Update ElevenLabsTTSService to use the new websocket
 API

---
 CHANGELOG.md                           |   4 +
 src/pipecat/services/elevenlabs/tts.py | 120 +++++++++++++++++--------
 2 files changed, 85 insertions(+), 39 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 101cc7f58..0bde64116 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changed
 
+- Updated `ElevenLabsTTSService` to use the beta websocket API
+  (multi-stream-input). This new API supports context_ids and cancelling those
+  contexts, which greatly improves interruption handling.
+
 - Observers `on_push_frame()` now take a single argument `FramePushed` instead
   of multiple arguments.
 
diff --git a/src/pipecat/services/elevenlabs/tts.py b/src/pipecat/services/elevenlabs/tts.py
index 0a3d5d0d1..324e8099e 100644
--- a/src/pipecat/services/elevenlabs/tts.py
+++ b/src/pipecat/services/elevenlabs/tts.py
@@ -7,11 +7,12 @@
 import asyncio
 import base64
 import json
+import uuid
 from typing import Any, AsyncGenerator, Dict, List, Literal, Mapping, Optional, Tuple, Union
 
 import aiohttp
 from loguru import logger
-from pydantic import BaseModel, model_validator
+from pydantic import BaseModel
 
 from pipecat.frames.frames import (
     CancelFrame,
@@ -26,7 +27,10 @@ from pipecat.frames.frames import (
     TTSStoppedFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection
-from pipecat.services.tts_service import InterruptibleWordTTSService, WordTTSService
+from pipecat.services.tts_service import (
+    AudioContextWordTTSService,
+    WordTTSService,
+)
 from pipecat.transcriptions.language import Language
 
 # See .env.example for ElevenLabs configuration needed
@@ -159,10 +163,9 @@ def calculate_word_times(
     return word_times
 
 
-class ElevenLabsTTSService(InterruptibleWordTTSService):
+class ElevenLabsTTSService(AudioContextWordTTSService):
     class InputParams(BaseModel):
         language: Optional[Language] = None
-        optimize_streaming_latency: Optional[str] = None
         stability: Optional[float] = None
         similarity_boost: Optional[float] = None
         style: Optional[float] = None
@@ -172,16 +175,6 @@ class ElevenLabsTTSService(InterruptibleWordTTSService):
         enable_ssml_parsing: Optional[bool] = None
         enable_logging: Optional[bool] = None
 
-        @model_validator(mode="after")
-        def validate_voice_settings(self):
-            stability = self.stability
-            similarity_boost = self.similarity_boost
-            if (stability is None) != (similarity_boost is None):
-                raise ValueError(
-                    "Both 'stability' and 'similarity_boost' must be provided when using voice settings"
-                )
-            return self
-
     def __init__(
         self,
         *,
@@ -222,7 +215,6 @@ class ElevenLabsTTSService(InterruptibleWordTTSService):
             "language": self.language_to_service_language(params.language)
             if params.language
             else None,
-            "optimize_streaming_latency": params.optimize_streaming_latency,
             "stability": params.stability,
             "similarity_boost": params.similarity_boost,
             "style": params.style,
@@ -242,6 +234,8 @@ class ElevenLabsTTSService(InterruptibleWordTTSService):
         self._started = False
         self._cumulative_time = 0
 
+        # Context management for v1 multi API
+        self._context_id = None
         self._receive_task = None
         self._keepalive_task = None
 
@@ -257,15 +251,13 @@ class ElevenLabsTTSService(InterruptibleWordTTSService):
     async def set_model(self, model: str):
         await super().set_model(model)
         logger.info(f"Switching TTS model to: [{model}]")
-        await self._disconnect()
-        await self._connect()
+        # No need to disconnect/reconnect for model changes with multi-context API
 
     async def _update_settings(self, settings: Mapping[str, Any]):
         prev_voice = self._voice_id
         await super()._update_settings(settings)
+        # If voice changes, we don't need to reconnect, just use a new context
         if not prev_voice == self._voice_id:
-            await self._disconnect()
-            await self._connect()
             logger.info(f"Switching TTS voice to: [{self._voice_id}]")
 
     async def start(self, frame: StartFrame):
@@ -282,8 +274,8 @@ class ElevenLabsTTSService(InterruptibleWordTTSService):
         await self._disconnect()
 
     async def flush_audio(self):
-        if self._websocket:
-            msg = {"text": " ", "flush": True}
+        if self._websocket and self._context_id:
+            msg = {"context_id": self._context_id, "flush": True}
             await self._websocket.send(json.dumps(msg))
 
     async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
@@ -323,10 +315,7 @@ class ElevenLabsTTSService(InterruptibleWordTTSService):
             voice_id = self._voice_id
             model = self.model_name
             output_format = self._output_format
-            url = f"{self._url}/v1/text-to-speech/{voice_id}/stream-input?model_id={model}&output_format={output_format}&auto_mode={self._settings['auto_mode']}"
-
-            if self._settings["optimize_streaming_latency"]:
-                url += f"&optimize_streaming_latency={self._settings['optimize_streaming_latency']}"
+            url = f"{self._url}/v1/text-to-speech/{voice_id}/multi-stream-input?model_id={model}&output_format={output_format}&auto_mode={self._settings['auto_mode']}"
 
             if self._settings["enable_ssml_parsing"]:
                 url += f"&enable_ssml_parsing={self._settings['enable_ssml_parsing']}"
@@ -347,14 +336,6 @@ class ElevenLabsTTSService(InterruptibleWordTTSService):
             # Set max websocket message size to 16MB for large audio responses
             self._websocket = await websockets.connect(url, max_size=16 * 1024 * 1024)
 
-            # According to ElevenLabs, we should always start with a single space.
-            msg: Dict[str, Any] = {
-                "text": " ",
-                "xi_api_key": self._api_key,
-            }
-            if self._voice_settings:
-                msg["voice_settings"] = self._voice_settings
-            await self._websocket.send(json.dumps(msg))
         except Exception as e:
             logger.error(f"{self} initialization error: {e}")
             self._websocket = None
@@ -366,12 +347,15 @@ class ElevenLabsTTSService(InterruptibleWordTTSService):
 
             if self._websocket:
                 logger.debug("Disconnecting from ElevenLabs")
-                await self._websocket.send(json.dumps({"text": ""}))
+                # Close all contexts and the socket
+                if self._context_id:
+                    await self._websocket.send(json.dumps({"close_socket": True}))
                 await self._websocket.close()
         except Exception as e:
             logger.error(f"{self} error closing websocket: {e}")
         finally:
             self._started = False
+            self._context_id = None
             self._websocket = None
 
     def _get_websocket(self):
@@ -379,9 +363,35 @@ class ElevenLabsTTSService(InterruptibleWordTTSService):
             return self._websocket
         raise Exception("Websocket not connected")
 
+    async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
+        await super()._handle_interruption(frame, direction)
+
+        # Close the current context when interrupted without closing the websocket
+        if self._context_id and self._websocket:
+            logger.trace(f"Closing context {self._context_id} due to interruption")
+            try:
+                await self._websocket.send(
+                    json.dumps({"context_id": self._context_id, "close_context": True})
+                )
+            except Exception as e:
+                logger.error(f"Error closing context on interruption: {e}")
+            self._context_id = None
+            self._started = False
+
     async def _receive_messages(self):
         async for message in self._get_websocket():
             msg = json.loads(message)
+            # Check if this message belongs to the current context
+            # The default context may return null/None for context_id
+            received_ctx_id = msg.get("context_id")
+            if (
+                self._context_id is not None
+                and received_ctx_id is not None
+                and received_ctx_id != self._context_id
+            ):
+                logger.trace(f"Ignoring message from different context: {received_ctx_id}")
+                continue
+
             if msg.get("audio"):
                 await self.stop_ttfb_metrics()
                 self.start_word_timestamps()
@@ -393,20 +403,45 @@ class ElevenLabsTTSService(InterruptibleWordTTSService):
                 word_times = calculate_word_times(msg["alignment"], self._cumulative_time)
                 await self.add_word_timestamps(word_times)
                 self._cumulative_time = word_times[-1][1]
+            if msg.get("is_final"):
+                logger.trace(f"Received final message for context {received_ctx_id}")
+                # Context has finished
+                if self._context_id == received_ctx_id:
+                    self._context_id = None
+                    self._started = False
 
     async def _keepalive_task_handler(self):
         while True:
             await asyncio.sleep(10)
             try:
-                await self._send_text("")
+                # Send an empty message to keep the connection alive
+                if self._websocket and self._websocket.open:
+                    await self._websocket.send(json.dumps({}))
             except websockets.ConnectionClosed as e:
                 logger.warning(f"{self} keepalive error: {e}")
                 break
 
     async def _send_text(self, text: str):
         if self._websocket:
-            msg = {"text": text + " "}
-            await self._websocket.send(json.dumps(msg))
+            if not self._context_id:
+                # First message for a new context - need a space to initialize
+                msg = {"text": " ", "context_id": str(uuid.uuid4()), "xi_api_key": self._api_key}
+
+                # Add voice settings only in first message for a context
+                if self._voice_settings:
+                    msg["voice_settings"] = self._voice_settings
+
+                await self._websocket.send(json.dumps(msg))
+                self._context_id = msg["context_id"]
+                logger.trace(f"Created new context {self._context_id}")
+
+                # Now send the actual text content
+                msg = {"text": text, "context_id": self._context_id}
+                await self._websocket.send(json.dumps(msg))
+            else:
+                # Continuing with an existing context
+                msg = {"text": text, "context_id": self._context_id}
+                await self._websocket.send(json.dumps(msg))
 
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
         logger.debug(f"{self}: Generating TTS [{text}]")
@@ -416,6 +451,13 @@ class ElevenLabsTTSService(InterruptibleWordTTSService):
                 await self._connect()
 
             try:
+                # Close previous context if there was one
+                if self._context_id and not self._started:
+                    await self._websocket.send(
+                        json.dumps({"context_id": self._context_id, "close_context": True})
+                    )
+                    self._context_id = None
+
                 if not self._started:
                     await self.start_ttfb_metrics()
                     yield TTSStartedFrame()
@@ -427,8 +469,8 @@ class ElevenLabsTTSService(InterruptibleWordTTSService):
             except Exception as e:
                 logger.error(f"{self} error sending message: {e}")
                 yield TTSStoppedFrame()
-                await self._disconnect()
-                await self._connect()
+                self._started = False
+                self._context_id = None
                 return
             yield None
         except Exception as e:

From efeb96c4e8f88270c642ec83f2f174921dacb0f6 Mon Sep 17 00:00:00 2001
From: Mark Backman <mark@daily.co>
Date: Wed, 7 May 2025 13:12:18 -0400
Subject: [PATCH 92/97] Remove unused imports

---
 src/pipecat/observers/loggers/llm_log_observer.py           | 3 +--
 src/pipecat/observers/loggers/transcription_log_observer.py | 2 --
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/pipecat/observers/loggers/llm_log_observer.py b/src/pipecat/observers/loggers/llm_log_observer.py
index 9e4d53b28..a6675b5c0 100644
--- a/src/pipecat/observers/loggers/llm_log_observer.py
+++ b/src/pipecat/observers/loggers/llm_log_observer.py
@@ -7,7 +7,6 @@
 from loguru import logger
 
 from pipecat.frames.frames import (
-    Frame,
     FunctionCallInProgressFrame,
     FunctionCallResultFrame,
     LLMFullResponseEndFrame,
@@ -17,7 +16,7 @@ from pipecat.frames.frames import (
 )
 from pipecat.observers.base_observer import BaseObserver, FramePushed
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContextFrame
-from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
+from pipecat.processors.frame_processor import FrameDirection
 from pipecat.services.llm_service import LLMService
 
 
diff --git a/src/pipecat/observers/loggers/transcription_log_observer.py b/src/pipecat/observers/loggers/transcription_log_observer.py
index 57e38c952..8ca1d9c9b 100644
--- a/src/pipecat/observers/loggers/transcription_log_observer.py
+++ b/src/pipecat/observers/loggers/transcription_log_observer.py
@@ -7,12 +7,10 @@
 from loguru import logger
 
 from pipecat.frames.frames import (
-    Frame,
     InterimTranscriptionFrame,
     TranscriptionFrame,
 )
 from pipecat.observers.base_observer import BaseObserver, FramePushed
-from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 from pipecat.services.stt_service import STTService
 
 
From 75ce632f8456b0dbaf6457a7c29b59793ec264a7 Mon Sep 17 00:00:00 2001
From: Mark Backman <mark@daily.co>
Date: Wed, 7 May 2025 15:05:15 -0400
Subject: [PATCH 93/97] Add DebugLogObserver

---
 CHANGELOG.md                                  |   4 +
 examples/foundational/30-observer.py          |  26 ++-
 .../observers/loggers/debug_log_observer.py   | 218 ++++++++++++++++++
 3 files changed, 244 insertions(+), 4 deletions(-)
 create mode 100644 src/pipecat/observers/loggers/debug_log_observer.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5edb1b5a3..dacfb5fd4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Added `DebugLogObserver` for detailed frame logging with configurable
+  filtering by frame type and endpoint. This observer automatically extracts
+  and formats all frame data fields for debug logging.
+
 - `UserImageRequestFrame.video_source` field has been added to request an image
   from the desired video source.
 
diff --git a/examples/foundational/30-observer.py b/examples/foundational/30-observer.py
index 46bd96e53..c9cd08aee 100644
--- a/examples/foundational/30-observer.py
+++ b/examples/foundational/30-observer.py
@@ -14,18 +14,26 @@ from pipecat.audio.vad.silero import SileroVADAnalyzer
 from pipecat.frames.frames import (
     BotStartedSpeakingFrame,
     BotStoppedSpeakingFrame,
+    EndFrame,
     StartInterruptionFrame,
+    TTSTextFrame,
+    UserStartedSpeakingFrame,
 )
 from pipecat.observers.base_observer import BaseObserver, FramePushed
+from pipecat.observers.loggers.debug_log_observer import DebugLogObserver, FrameEndpoint
 from pipecat.observers.loggers.llm_log_observer import LLMLogObserver
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
-from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
+from pipecat.processors.aggregators.openai_llm_context import (
+    OpenAILLMContext,
+)
 from pipecat.processors.frame_processor import FrameDirection
 from pipecat.services.cartesia.tts import CartesiaTTSService
 from pipecat.services.deepgram.stt import DeepgramSTTService
 from pipecat.services.openai.llm import OpenAILLMService
+from pipecat.transports.base_input import BaseInputTransport
+from pipecat.transports.base_output import BaseOutputTransport
 from pipecat.transports.base_transport import TransportParams
 from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
 from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
@@ -33,7 +41,7 @@ from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
 load_dotenv(override=True)
 
 
-class DebugObserver(BaseObserver):
+class CustomObserver(BaseObserver):
     """Observer to log interruptions and bot speaking events to the console.
 
     Logs all frame instances of:
@@ -58,7 +66,7 @@ class DebugObserver(BaseObserver):
         # Create direction arrow
         arrow = "→" if direction == FrameDirection.DOWNSTREAM else "←"
 
-        if isinstance(frame, StartInterruptionFrame):
+        if isinstance(frame, StartInterruptionFrame) and isinstance(src, BaseOutputTransport):
             logger.info(f"⚡ INTERRUPTION START: {src} {arrow} {dst} at {time_sec:.2f}s")
         elif isinstance(frame, BotStartedSpeakingFrame):
             logger.info(f"🤖 BOT START SPEAKING: {src} {arrow} {dst} at {time_sec:.2f}s")
@@ -117,7 +125,17 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac
             enable_usage_metrics=True,
             report_only_initial_ttfb=True,
         ),
-        observers=[DebugObserver(), LLMLogObserver()],
+        observers=[
+            CustomObserver(),
+            LLMLogObserver(),
+            DebugLogObserver(
+                frame_types={
+                    TTSTextFrame: (BaseOutputTransport, FrameEndpoint.DESTINATION),
+                    UserStartedSpeakingFrame: (BaseInputTransport, FrameEndpoint.SOURCE),
+                    EndFrame: None,
+                }
+            ),
+        ],
     )
 
     @transport.event_handler("on_client_connected")
diff --git a/src/pipecat/observers/loggers/debug_log_observer.py b/src/pipecat/observers/loggers/debug_log_observer.py
new file mode 100644
index 000000000..bd09bd790
--- /dev/null
+++ b/src/pipecat/observers/loggers/debug_log_observer.py
@@ -0,0 +1,218 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+from dataclasses import fields, is_dataclass
+from enum import Enum, auto
+from typing import Dict, List, Optional, Set, Tuple, Type, Union
+
+from loguru import logger
+
+from pipecat.frames.frames import Frame
+from pipecat.observers.base_observer import BaseObserver, FramePushed
+from pipecat.processors.frame_processor import FrameDirection
+
+
+class FrameEndpoint(Enum):
+    """Specifies which endpoint (source or destination) to filter on."""
+
+    SOURCE = auto()
+    DESTINATION = auto()
+
+
+class DebugLogObserver(BaseObserver):
+    """Observer that logs frame activity with detailed content to the console.
+
+    Automatically extracts and formats data from any frame type, making it useful
+    for debugging pipeline behavior without needing frame-specific observers.
+
+    Args:
+        frame_types: Optional list of frame types to log, or a dict with frame type
+            filters. If None, logs all frame types.
+        exclude_fields: Optional set of field names to exclude from logging.
+
+    Examples:
+        Log all frames from all services:
+        ```python
+        observer = DebugLogObserver()
+        ```
+
+        Log specific frame types from any source/destination:
+        ```python
+        from pipecat.frames.frames import TranscriptionFrame, InterimTranscriptionFrame
+        observer = DebugLogObserver(frame_types=[TranscriptionFrame, InterimTranscriptionFrame])
+        ```
+
+        Log frames with specific source/destination filters:
+        ```python
+        from pipecat.frames.frames import StartInterruptionFrame, UserStartedSpeakingFrame, LLMTextFrame
+        from pipecat.transports.base_output_transport import BaseOutputTransport
+        from pipecat.services.stt_service import STTService
+
+        observer = DebugLogObserver(frame_types={
+            # Only log StartInterruptionFrame when source is BaseOutputTransport
+            StartInterruptionFrame: (BaseOutputTransport, FrameEndpoint.SOURCE),
+
+            # Only log UserStartedSpeakingFrame when destination is STTService
+            UserStartedSpeakingFrame: (STTService, FrameEndpoint.DESTINATION),
+
+            # Log LLMTextFrame regardless of source or destination type
+            LLMTextFrame: None
+        })
+        ```
+    """
+
+    def __init__(
+        self,
+        frame_types: Optional[
+            Union[List[Type[Frame]], Dict[Type[Frame], Optional[Tuple[Type, FrameEndpoint]]]]
+        ] = None,
+        exclude_fields: Optional[Set[str]] = None,
+    ):
+        """Initialize the debug log observer.
+
+        Args:
+            frame_types: List of frame types to log, or a dict mapping frame types to
+                filter configurations. Filter configs can be:
+                - None to log all instances of the frame type
+                - A tuple of (service_type, endpoint) to filter on a specific service
+                  and endpoint (SOURCE or DESTINATION)
+                If None is provided instead of a dict/list, log all frames.
+            exclude_fields: Set of field names to exclude from logging. If None, only binary
+                data fields are excluded.
+        """
+        # Process frame filters
+        self.frame_filters = {}
+
+        if frame_types is not None:
+            if isinstance(frame_types, list):
+                # List of frame types - log all instances
+                self.frame_filters = {frame_type: None for frame_type in frame_types}
+            else:
+                # Dict of frame types with filters
+                self.frame_filters = frame_types
+
+        # By default, exclude binary data fields that would clutter logs
+        self.exclude_fields = (
+            exclude_fields
+            if exclude_fields is not None
+            else {
+                "audio",  # Skip binary audio data
+                "image",  # Skip binary image data
+                "images",  # Skip lists of images
+            }
+        )
+
+    def _format_value(self, value):
+        """Format a value for logging.
+
+        Args:
+            value: The value to format.
+
+        Returns:
+            str: A string representation of the value suitable for logging.
+        """
+        if value is None:
+            return "None"
+        elif isinstance(value, str):
+            return f"{value!r}"
+        elif isinstance(value, (list, tuple)):
+            if len(value) == 0:
+                return "[]"
+            if isinstance(value[0], dict) and len(value) > 3:
+                # For message lists, just show count
+                return f"{len(value)} items"
+            return str(value)
+        elif isinstance(value, (bytes, bytearray)):
+            return f"{len(value)} bytes"
+        elif hasattr(value, "get_messages_for_logging") and callable(
+            getattr(value, "get_messages_for_logging")
+        ):
+            # Special case for OpenAI context
+            return f"{value.__class__.__name__} with messages: {value.get_messages_for_logging()}"
+        else:
+            return str(value)
+
+    def _should_log_frame(self, frame, src, dst):
+        """Determine if a frame should be logged based on filters.
+
+        Args:
+            frame: The frame being processed
+            src: The source component
+            dst: The destination component
+
+        Returns:
+            bool: True if the frame should be logged, False otherwise
+        """
+        # If no filters, log all frames
+        if not self.frame_filters:
+            return True
+
+        # Check if this frame type is in our filters
+        for frame_type, filter_config in self.frame_filters.items():
+            if isinstance(frame, frame_type):
+                # If filter is None, log all instances of this frame type
+                if filter_config is None:
+                    return True
+
+                # Otherwise, apply this entry's filter; the first matching frame type decides
+                service_type, endpoint = filter_config
+
+                if endpoint == FrameEndpoint.SOURCE:
+                    return isinstance(src, service_type)
+                elif endpoint == FrameEndpoint.DESTINATION:
+                    return isinstance(dst, service_type)
+
+        return False
+
+    async def on_push_frame(self, data: FramePushed):
+        """Process a frame being pushed into the pipeline.
+
+        Logs frame details to the console with all relevant fields and values.
+
+        Args:
+            data: Event data containing the frame, source, destination, direction, and timestamp.
+        """
+        src = data.source
+        dst = data.destination
+        frame = data.frame
+        direction = data.direction
+        timestamp = data.timestamp
+
+        # Check if we should log this frame
+        if not self._should_log_frame(frame, src, dst):
+            return
+
+        # Format direction arrow
+        arrow = "→" if direction == FrameDirection.DOWNSTREAM else "←"
+
+        time_sec = timestamp / 1_000_000_000
+        class_name = frame.__class__.__name__
+
+        # Build frame representation
+        frame_details = []
+
+        # If dataclass, extract fields
+        if is_dataclass(frame):
+            for field in fields(frame):
+                if field.name in self.exclude_fields:
+                    continue
+
+                value = getattr(frame, field.name)
+                if value is None:
+                    continue
+
+                formatted_value = self._format_value(value)
+                frame_details.append(f"{field.name}: {formatted_value}")
+
+        # Format the message
+        if frame_details:
+            details = ", ".join(frame_details)
+            message = f"{class_name} {details} at {time_sec:.2f}s"
+        else:
+            message = f"{class_name} at {time_sec:.2f}s"
+
+        # Log the message
+        logger.debug(f"{src} {arrow} {dst}: {message}")

From 9e0b4fe5d158e10ad5ba2cb22a0ea9ce882b3bc9 Mon Sep 17 00:00:00 2001
From: Mark Backman <mark@daily.co>
Date: Wed, 7 May 2025 17:19:52 -0400
Subject: [PATCH 94/97] Replace list with tuple

---
 .../observers/loggers/debug_log_observer.py   | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/pipecat/observers/loggers/debug_log_observer.py b/src/pipecat/observers/loggers/debug_log_observer.py
index bd09bd790..575a31683 100644
--- a/src/pipecat/observers/loggers/debug_log_observer.py
+++ b/src/pipecat/observers/loggers/debug_log_observer.py
@@ -6,7 +6,7 @@
 
 from dataclasses import fields, is_dataclass
 from enum import Enum, auto
-from typing import Dict, List, Optional, Set, Tuple, Type, Union
+from typing import Dict, Optional, Set, Tuple, Type, Union
 
 from loguru import logger
 
@@ -29,20 +29,20 @@ class DebugLogObserver(BaseObserver):
     for debugging pipeline behavior without needing frame-specific observers.
 
     Args:
-        frame_types: Optional list of frame types to log, or a dict with frame type
+        frame_types: Optional tuple of frame types to log, or a dict with frame type
             filters. If None, logs all frame types.
         exclude_fields: Optional set of field names to exclude from logging.
 
     Examples:
         Log all frames from all services:
         ```python
-        observer = DebugLogObserver()
+        observers = DebugLogObserver()
         ```
 
         Log specific frame types from any source/destination:
         ```python
         from pipecat.frames.frames import TranscriptionFrame, InterimTranscriptionFrame
-        observer = DebugLogObserver(frame_types=[TranscriptionFrame, InterimTranscriptionFrame])
+        observers = DebugLogObserver(frame_types=(TranscriptionFrame, InterimTranscriptionFrame))
         ```
 
         Log frames with specific source/destination filters:
@@ -51,7 +51,7 @@ class DebugLogObserver(BaseObserver):
         from pipecat.transports.base_output_transport import BaseOutputTransport
         from pipecat.services.stt_service import STTService
 
-        observer = DebugLogObserver(frame_types={
+        observers = DebugLogObserver(frame_types={
             # Only log StartInterruptionFrame when source is BaseOutputTransport
             StartInterruptionFrame: (BaseOutputTransport, FrameEndpoint.SOURCE),
 
@@ -67,19 +67,19 @@ class DebugLogObserver(BaseObserver):
     def __init__(
         self,
         frame_types: Optional[
-            Union[List[Type[Frame]], Dict[Type[Frame], Optional[Tuple[Type, FrameEndpoint]]]]
+            Union[Tuple[Type[Frame], ...], Dict[Type[Frame], Optional[Tuple[Type, FrameEndpoint]]]]
         ] = None,
         exclude_fields: Optional[Set[str]] = None,
     ):
         """Initialize the debug log observer.
 
         Args:
-            frame_types: List of frame types to log, or a dict mapping frame types to
+            frame_types: Tuple of frame types to log, or a dict mapping frame types to
                 filter configurations. Filter configs can be:
                 - None to log all instances of the frame type
                 - A tuple of (service_type, endpoint) to filter on a specific service
                   and endpoint (SOURCE or DESTINATION)
-                If None is provided instead of a dict/list, log all frames.
+                If None is provided instead of a tuple/dict, log all frames.
             exclude_fields: Set of field names to exclude from logging. If None, only binary
                 data fields are excluded.
         """
@@ -87,8 +87,8 @@ class DebugLogObserver(BaseObserver):
         self.frame_filters = {}
 
         if frame_types is not None:
-            if isinstance(frame_types, list):
-                # List of frame types - log all instances
+            if isinstance(frame_types, tuple):
+                # Tuple of frame types - log all instances
                 self.frame_filters = {frame_type: None for frame_type in frame_types}
             else:
                 # Dict of frame types with filters

From 7cfb9a4d15c70baa1ed0ae8be20bb198c9a2dc88 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= <aleix@daily.co>
Date: Wed, 7 May 2025 14:59:16 -0700
Subject: [PATCH 95/97] update CHANGELOG for 0.0.67

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index dacfb5fd4..9b85c7f87 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,7 @@ All notable changes to **Pipecat** will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [Unreleased]
+## [0.0.67] - 2025-05-07
 
 ### Added
 

From 91364028460eecb82db916c26bbc68d5647104b2 Mon Sep 17 00:00:00 2001
From: Mark Backman <mark@daily.co>
Date: Wed, 7 May 2025 18:29:27 -0400
Subject: [PATCH 96/97] Add load_dotenv to moondream example server

---
 examples/moondream-chatbot/server.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/examples/moondream-chatbot/server.py b/examples/moondream-chatbot/server.py
index bb322ff2e..9597bdc9a 100644
--- a/examples/moondream-chatbot/server.py
+++ b/examples/moondream-chatbot/server.py
@@ -10,12 +10,16 @@ import subprocess
 from contextlib import asynccontextmanager
 
 import aiohttp
+from dotenv import load_dotenv
 from fastapi import FastAPI, HTTPException, Request
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, RedirectResponse
 
 from pipecat.transports.services.helpers.daily_rest import DailyRESTHelper, DailyRoomParams
 
+# Load environment variables from .env file
+load_dotenv(override=True)
+
 MAX_BOTS_PER_ROOM = 1
 
 # Bot sub-process dict for status reporting and concurrency control

From cb7e7a8aa30acda38dd4e07c4096ee3e62f8b2b4 Mon Sep 17 00:00:00 2001
From: Mark Backman <mark@daily.co>
Date: Wed, 7 May 2025 18:40:04 -0400
Subject: [PATCH 97/97] Add load_dotenv to patient-intake server file

---
 examples/patient-intake/server.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/examples/patient-intake/server.py b/examples/patient-intake/server.py
index 347b17dbd..10ccfb3b7 100644
--- a/examples/patient-intake/server.py
+++ b/examples/patient-intake/server.py
@@ -10,12 +10,16 @@ import subprocess
 from contextlib import asynccontextmanager
 
 import aiohttp
+from dotenv import load_dotenv
 from fastapi import FastAPI, HTTPException, Request
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, RedirectResponse
 
 from pipecat.transports.services.helpers.daily_rest import DailyRESTHelper, DailyRoomParams
 
+# Load environment variables from .env file
+load_dotenv(override=True)
+
 MAX_BOTS_PER_ROOM = 1
 
 # Bot sub-process dict for status reporting and concurrency control