From e8783f6a33fe4049aaedff2e2940cc9b0ba5b011 Mon Sep 17 00:00:00 2001 From: Adithya Suresh Date: Mon, 24 Mar 2025 15:40:26 -0700 Subject: [PATCH 01/97] Handle cache token counts being none --- src/pipecat/services/anthropic/llm.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/src/pipecat/services/anthropic/llm.py b/src/pipecat/services/anthropic/llm.py index 3e369075a..fd646b21c 100644 --- a/src/pipecat/services/anthropic/llm.py +++ b/src/pipecat/services/anthropic/llm.py @@ -253,14 +253,24 @@ class AnthropicLLMService(LLMService): if hasattr(event.message.usage, "output_tokens") else 0 ) - if hasattr(event.message.usage, "cache_creation_input_tokens"): - cache_creation_input_tokens += ( - event.message.usage.cache_creation_input_tokens + cache_creation_input_tokens += ( + event.message.usage.cache_creation_input_tokens + if ( + hasattr(event.message.usage, "cache_creation_input_tokens") + and event.message.usage.cache_creation_input_tokens is not None ) - logger.debug(f"Cache creation input tokens: {cache_creation_input_tokens}") - if hasattr(event.message.usage, "cache_read_input_tokens"): - cache_read_input_tokens += event.message.usage.cache_read_input_tokens - logger.debug(f"Cache read input tokens: {cache_read_input_tokens}") + else 0 + ) + logger.debug(f"Cache creation input tokens: {cache_creation_input_tokens}") + cache_read_input_tokens += ( + event.message.usage.cache_read_input_tokens + if ( + hasattr(event.message.usage, "cache_read_input_tokens") + and event.message.usage.cache_read_input_tokens is not None + ) + else 0 + ) + logger.debug(f"Cache read input tokens: {cache_read_input_tokens}") total_input_tokens = ( prompt_tokens + cache_creation_input_tokens + cache_read_input_tokens ) From 855d567b1ef87c7e7df1bd8036f6ab446b81ff86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Mon, 5 May 2025 14:06:58 -0700 Subject: [PATCH 02/97] only send data to transports after they 
are really ready --- CHANGELOG.md | 3 ++ src/pipecat/transports/base_input.py | 12 ++++--- src/pipecat/transports/base_output.py | 18 ++++++----- src/pipecat/transports/local/audio.py | 4 +++ src/pipecat/transports/local/tk.py | 4 +++ .../transports/network/fastapi_websocket.py | 2 ++ .../transports/network/small_webrtc.py | 2 ++ .../transports/network/websocket_client.py | 2 ++ .../transports/network/websocket_server.py | 2 ++ src/pipecat/transports/services/daily.py | 31 ++++++++++++------- src/pipecat/transports/services/livekit.py | 2 ++ 11 files changed, 58 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 54cbd3cd6..95350157f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed +- Fixed an issue that could cause data to be sent to the transports when they + were still not ready. + - Remove custom audio tracks from `DailyTransport` before leaving. ## [0.0.66] - 2025-05-02 diff --git a/src/pipecat/transports/base_input.py b/src/pipecat/transports/base_input.py index 51ebdb677..f9a27a6d3 100644 --- a/src/pipecat/transports/base_input.py +++ b/src/pipecat/transports/base_input.py @@ -122,6 +122,7 @@ class BaseInputTransport(FrameProcessor): # Configure VAD analyzer. if self._params.vad_analyzer: self._params.vad_analyzer.set_sample_rate(self._sample_rate) + # Configure End of turn analyzer. if self._params.turn_analyzer: self._params.turn_analyzer.set_sample_rate(self._sample_rate) @@ -129,10 +130,6 @@ class BaseInputTransport(FrameProcessor): # Start audio filter. if self._params.audio_in_filter: await self._params.audio_in_filter.start(self._sample_rate) - # Create audio input queue and task if needed. 
- if not self._audio_task and self._params.audio_in_enabled: - self._audio_in_queue = asyncio.Queue() - self._audio_task = self.create_task(self._audio_task_handler()) async def stop(self, frame: EndFrame): # Cancel and wait for the audio input task to finish. @@ -149,6 +146,13 @@ class BaseInputTransport(FrameProcessor): await self.cancel_task(self._audio_task) self._audio_task = None + async def set_transport_ready(self, frame: StartFrame): + """To be called when the transport is ready to stream.""" + # Create audio input queue and task if needed. + if not self._audio_task and self._params.audio_in_enabled: + self._audio_in_queue = asyncio.Queue() + self._audio_task = self.create_task(self._audio_task_handler()) + async def push_audio_frame(self, frame: InputAudioRawFrame): if self._params.audio_in_enabled: await self._audio_in_queue.put(frame) diff --git a/src/pipecat/transports/base_output.py b/src/pipecat/transports/base_output.py index fa5d5e1c4..81492b84d 100644 --- a/src/pipecat/transports/base_output.py +++ b/src/pipecat/transports/base_output.py @@ -78,6 +78,16 @@ class BaseOutputTransport(FrameProcessor): audio_bytes_10ms = int(self._sample_rate / 100) * self._params.audio_out_channels * 2 self._audio_chunk_size = audio_bytes_10ms * self._params.audio_out_10ms_chunks + async def stop(self, frame: EndFrame): + for _, sender in self._media_senders.items(): + await sender.stop(frame) + + async def cancel(self, frame: CancelFrame): + for _, sender in self._media_senders.items(): + await sender.cancel(frame) + + async def set_transport_ready(self, frame: StartFrame): + """To be called when the transport is ready to stream.""" # Register destinations. 
for destination in self._params.audio_out_destinations: await self.register_audio_destination(destination) @@ -112,14 +122,6 @@ class BaseOutputTransport(FrameProcessor): ) await self._media_senders[destination].start(frame) - async def stop(self, frame: EndFrame): - for _, sender in self._media_senders.items(): - await sender.stop(frame) - - async def cancel(self, frame: CancelFrame): - for _, sender in self._media_senders.items(): - await sender.cancel(frame) - async def send_message(self, frame: TransportMessageFrame | TransportMessageUrgentFrame): pass diff --git a/src/pipecat/transports/local/audio.py b/src/pipecat/transports/local/audio.py index 8bfd7ee34..ba554c9e3 100644 --- a/src/pipecat/transports/local/audio.py +++ b/src/pipecat/transports/local/audio.py @@ -61,6 +61,8 @@ class LocalAudioInputTransport(BaseInputTransport): ) self._in_stream.start_stream() + await self.set_transport_ready(frame) + async def cleanup(self): await super().cleanup() if self._in_stream: @@ -111,6 +113,8 @@ class LocalAudioOutputTransport(BaseOutputTransport): ) self._out_stream.start_stream() + await self.set_transport_ready(frame) + async def cleanup(self): await super().cleanup() if self._out_stream: diff --git a/src/pipecat/transports/local/tk.py b/src/pipecat/transports/local/tk.py index bed6371c2..4086497cb 100644 --- a/src/pipecat/transports/local/tk.py +++ b/src/pipecat/transports/local/tk.py @@ -68,6 +68,8 @@ class TkInputTransport(BaseInputTransport): ) self._in_stream.start_stream() + await self.set_transport_ready(frame) + async def cleanup(self): await super().cleanup() if self._in_stream: @@ -124,6 +126,8 @@ class TkOutputTransport(BaseOutputTransport): ) self._out_stream.start_stream() + await self.set_transport_ready(frame) + async def cleanup(self): await super().cleanup() if self._out_stream: diff --git a/src/pipecat/transports/network/fastapi_websocket.py b/src/pipecat/transports/network/fastapi_websocket.py index 4a20bc49b..f04d56b0d 100644 --- 
a/src/pipecat/transports/network/fastapi_websocket.py +++ b/src/pipecat/transports/network/fastapi_websocket.py @@ -131,6 +131,7 @@ class FastAPIWebsocketInputTransport(BaseInputTransport): await self._client.trigger_client_connected() if not self._receive_task: self._receive_task = self.create_task(self._receive_messages()) + await self.set_transport_ready(frame) async def _stop_tasks(self): if self._monitor_websocket_task: @@ -204,6 +205,7 @@ class FastAPIWebsocketOutputTransport(BaseOutputTransport): await self._client.setup(frame) await self._params.serializer.setup(frame) self._send_interval = (self.audio_chunk_size / self.sample_rate) / 2 + await self.set_transport_ready(frame) async def stop(self, frame: EndFrame): await super().stop(frame) diff --git a/src/pipecat/transports/network/small_webrtc.py b/src/pipecat/transports/network/small_webrtc.py index fdd501299..ffa3f441a 100644 --- a/src/pipecat/transports/network/small_webrtc.py +++ b/src/pipecat/transports/network/small_webrtc.py @@ -395,6 +395,7 @@ class SmallWebRTCInputTransport(BaseInputTransport): self._receive_audio_task = self.create_task(self._receive_audio()) if not self._receive_video_task and self._params.video_in_enabled: self._receive_video_task = self.create_task(self._receive_video()) + await self.set_transport_ready(frame) async def _stop_tasks(self): if self._receive_audio_task: @@ -487,6 +488,7 @@ class SmallWebRTCOutputTransport(BaseOutputTransport): await super().start(frame) await self._client.setup(self._params, frame) await self._client.connect() + await self.set_transport_ready(frame) async def stop(self, frame: EndFrame): await super().stop(frame) diff --git a/src/pipecat/transports/network/websocket_client.py b/src/pipecat/transports/network/websocket_client.py index 7e9725a76..535a0ab21 100644 --- a/src/pipecat/transports/network/websocket_client.py +++ b/src/pipecat/transports/network/websocket_client.py @@ -136,6 +136,7 @@ class 
WebsocketClientInputTransport(BaseInputTransport): await self._params.serializer.setup(frame) await self._session.setup(frame) await self._session.connect() + await self.set_transport_ready(frame) async def stop(self, frame: EndFrame): await super().stop(frame) @@ -186,6 +187,7 @@ class WebsocketClientOutputTransport(BaseOutputTransport): await self._params.serializer.setup(frame) await self._session.setup(frame) await self._session.connect() + await self.set_transport_ready(frame) async def stop(self, frame: EndFrame): await super().stop(frame) diff --git a/src/pipecat/transports/network/websocket_server.py b/src/pipecat/transports/network/websocket_server.py index b930f9fd6..7c8738871 100644 --- a/src/pipecat/transports/network/websocket_server.py +++ b/src/pipecat/transports/network/websocket_server.py @@ -83,6 +83,7 @@ class WebsocketServerInputTransport(BaseInputTransport): await self._params.serializer.setup(frame) if not self._server_task: self._server_task = self.create_task(self._server_task_handler()) + await self.set_transport_ready(frame) async def stop(self, frame: EndFrame): await super().stop(frame) @@ -195,6 +196,7 @@ class WebsocketServerOutputTransport(BaseOutputTransport): await super().start(frame) await self._params.serializer.setup(frame) self._send_interval = (self.audio_chunk_size / self.sample_rate) / 2 + await self.set_transport_ready(frame) async def stop(self, frame: EndFrame): await super().stop(frame) diff --git a/src/pipecat/transports/services/daily.py b/src/pipecat/transports/services/daily.py index 3e43ddee1..9909d9336 100644 --- a/src/pipecat/transports/services/daily.py +++ b/src/pipecat/transports/services/daily.py @@ -944,19 +944,23 @@ class DailyInputTransport(BaseInputTransport): self._audio_in_task = self.create_task(self._audio_in_task_handler()) async def start(self, frame: StartFrame): - # Setup client. - await self._client.setup(frame) - - # Parent start. 
- await super().start(frame) - if self._initialized: return self._initialized = True + # Parent start. + await super().start(frame) + + # Setup client. + await self._client.setup(frame) + # Join the room. await self._client.join() + + # Indicate the transport that we are connected. + await self.set_transport_ready(frame) + if self._params.audio_in_stream_on_start: self.start_audio_in_streaming() @@ -1125,20 +1129,23 @@ class DailyOutputTransport(BaseOutputTransport): self._initialized = False async def start(self, frame: StartFrame): - # Setup client. - await self._client.setup(frame) - - # Parent start. - await super().start(frame) - if self._initialized: return self._initialized = True + # Parent start. + await super().start(frame) + + # Setup client. + await self._client.setup(frame) + # Join the room. await self._client.join() + # Indicate the transport that we are connected. + await self.set_transport_ready(frame) + async def stop(self, frame: EndFrame): # Parent stop. await super().stop(frame) diff --git a/src/pipecat/transports/services/livekit.py b/src/pipecat/transports/services/livekit.py index 456a70ea6..36cc5d604 100644 --- a/src/pipecat/transports/services/livekit.py +++ b/src/pipecat/transports/services/livekit.py @@ -370,6 +370,7 @@ class LiveKitInputTransport(BaseInputTransport): await self._client.connect() if not self._audio_in_task and self._params.audio_in_enabled: self._audio_in_task = self.create_task(self._audio_in_task_handler()) + await self.set_transport_ready(frame) logger.info("LiveKitInputTransport started") async def stop(self, frame: EndFrame): @@ -441,6 +442,7 @@ class LiveKitOutputTransport(BaseOutputTransport): await super().start(frame) await self._client.setup(frame) await self._client.connect() + await self.set_transport_ready(frame) logger.info("LiveKitOutputTransport started") async def stop(self, frame: EndFrame): From 9cc498b1fa17e2571092084d5dc6d3762fbe0bfd Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Mon, 5 May 2025 21:27:49 -0700 Subject: [PATCH 03/97] TaskManager: use a dictionary instead of a set to store tasks --- CHANGELOG.md | 2 ++ src/pipecat/utils/asyncio.py | 15 ++++++++------- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 95350157f..ac4e2db11 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed +- Fixed a `TaskManager` that was causing dangling tasks to be reported. + - Fixed an issue that could cause data to be sent to the transports when they were still not ready. diff --git a/src/pipecat/utils/asyncio.py b/src/pipecat/utils/asyncio.py index acc4acec8..cea447329 100644 --- a/src/pipecat/utils/asyncio.py +++ b/src/pipecat/utils/asyncio.py @@ -6,7 +6,7 @@ import asyncio from abc import ABC, abstractmethod -from typing import Coroutine, Optional, Set +from typing import Coroutine, Dict, Optional, Sequence, Set from loguru import logger @@ -69,14 +69,14 @@ class BaseTaskManager(ABC): pass @abstractmethod - def current_tasks(self) -> Set[asyncio.Task]: + def current_tasks(self) -> Sequence[asyncio.Task]: """Returns the list of currently created/registered tasks.""" pass class TaskManager(BaseTaskManager): def __init__(self) -> None: - self._tasks: Set[asyncio.Task] = set() + self._tasks: Dict[str, asyncio.Task] = {} self._loop: Optional[asyncio.AbstractEventLoop] = None def set_event_loop(self, loop: asyncio.AbstractEventLoop): @@ -179,16 +179,17 @@ class TaskManager(BaseTaskManager): finally: self._remove_task(task) - def current_tasks(self) -> Set[asyncio.Task]: + def current_tasks(self) -> Sequence[asyncio.Task]: """Returns the list of currently created/registered tasks.""" - return self._tasks + return list(self._tasks.values()) def _add_task(self, task: asyncio.Task): - self._tasks.add(task) + name = task.get_name() + self._tasks[name] = task def 
_remove_task(self, task: asyncio.Task): name = task.get_name() try: - self._tasks.remove(task) + del self._tasks[name] except KeyError as e: logger.trace(f"{name}: unable to remove task (already removed?): {e}") From 45839053135f8f6dac67ea448fdd4f1909d17e95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Mon, 5 May 2025 21:33:21 -0700 Subject: [PATCH 04/97] PipelineTask: cleanup if task is cancelled from outside Pipecat --- CHANGELOG.md | 3 +++ src/pipecat/pipeline/task.py | 31 ++++++++++++++++++++----------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ac4e2db11..d8be39cdb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed +- Fixed a `PipelineTask` issue that would cause tasks to not be cancelled if + task was cancelled from outside of Pipecat. + - Fixed a `TaskManager` that was causing dangling tasks to be reported. - Fixed an issue that could cause data to be sent to the transports when they diff --git a/src/pipecat/pipeline/task.py b/src/pipecat/pipeline/task.py index 8279373cb..c40173899 100644 --- a/src/pipecat/pipeline/task.py +++ b/src/pipecat/pipeline/task.py @@ -286,12 +286,7 @@ class PipelineTask(BaseTask): async def cancel(self): """Stops the running pipeline immediately.""" logger.debug(f"Canceling pipeline task {self}") - # Make sure everything is cleaned up downstream. This is sent - # out-of-band from the main streaming task which is what we want since - # we want to cancel right away. - await self._source.push_frame(CancelFrame()) - # Only cancel the push task. Everything else will be cancelled in run(). 
- await self._task_manager.cancel_task(self._process_push_task) + await self._cancel() async def run(self): """Starts and manages the pipeline execution until completion or cancellation.""" @@ -309,11 +304,17 @@ class PipelineTask(BaseTask): # well, because you get a CancelledError in every place you are # awaiting a task. pass - await self._cancel_tasks() - await self._cleanup(cleanup_pipeline) - if self._check_dangling_tasks: - self._print_dangling_tasks() - self._finished = True + finally: + # It's possible that we get an asyncio.CancelledError from the + # outside, if so we need to make sure everything gets cancelled + # properly. + if cleanup_pipeline: + await self._cancel() + await self._cancel_tasks() + await self._cleanup(cleanup_pipeline) + if self._check_dangling_tasks: + self._print_dangling_tasks() + self._finished = True async def queue_frame(self, frame: Frame): """Queue a single frame to be pushed down the pipeline. @@ -336,6 +337,14 @@ class PipelineTask(BaseTask): for frame in frames: await self.queue_frame(frame) + async def _cancel(self): + # Make sure everything is cleaned up downstream. This is sent + # out-of-band from the main streaming task which is what we want since + # we want to cancel right away. + await self._source.push_frame(CancelFrame()) + # Only cancel the push task. Everything else will be cancelled in run(). 
+ await self._task_manager.cancel_task(self._process_push_task) + async def _create_tasks(self): self._process_up_task = self._task_manager.create_task( self._process_up_queue(), f"{self}::_process_up_queue" From e06146c23770c3611a436e12c3dfb5528fb7afee Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Tue, 6 May 2025 11:06:57 -0400 Subject: [PATCH 05/97] Add enable_ssml_parsing to ElevenLabsTTSService --- CHANGELOG.md | 2 ++ src/pipecat/services/elevenlabs/tts.py | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 95350157f..1bbc5210d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added `enable_ssml_parsing` to `InputParams` in `ElevenLabsTTSService`. + - Added support to `RimeHttpTTSService` for the `arcana` model. ### Fixed diff --git a/src/pipecat/services/elevenlabs/tts.py b/src/pipecat/services/elevenlabs/tts.py index 4362fcdc9..ea89d1378 100644 --- a/src/pipecat/services/elevenlabs/tts.py +++ b/src/pipecat/services/elevenlabs/tts.py @@ -169,6 +169,7 @@ class ElevenLabsTTSService(InterruptibleWordTTSService): use_speaker_boost: Optional[bool] = None speed: Optional[float] = None auto_mode: Optional[bool] = True + enable_ssml_parsing: Optional[bool] = None @model_validator(mode="after") def validate_voice_settings(self): @@ -227,6 +228,7 @@ class ElevenLabsTTSService(InterruptibleWordTTSService): "use_speaker_boost": params.use_speaker_boost, "speed": params.speed, "auto_mode": str(params.auto_mode).lower(), + "enable_ssml_parsing": params.enable_ssml_parsing, } self.set_model_name(model) self.set_voice(voice_id) @@ -324,6 +326,9 @@ class ElevenLabsTTSService(InterruptibleWordTTSService): if self._settings["optimize_streaming_latency"]: url += f"&optimize_streaming_latency={self._settings['optimize_streaming_latency']}" + if self._settings["enable_ssml_parsing"]: + url += 
f"&enable_ssml_parsing={self._settings['enable_ssml_parsing']}" + # Language can only be used with the ELEVENLABS_MULTILINGUAL_MODELS language = self._settings["language"] if model in ELEVENLABS_MULTILINGUAL_MODELS and language is not None: From 8691870bcb138ec971c01501486e09d4d14fa118 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Tue, 6 May 2025 11:29:32 -0400 Subject: [PATCH 06/97] Update Deepgram TTS default voice to Aura 2 voice --- CHANGELOG.md | 4 ++++ examples/foundational/07c-interruptible-deepgram-vad.py | 2 +- examples/foundational/07c-interruptible-deepgram.py | 2 +- src/pipecat/services/deepgram/tts.py | 2 +- 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d8be39cdb..f663976fa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added support to `RimeHttpTTSService` for the `arcana` model. +### Changed + +- Updated the default voice for `DeepgramTTSService` to `aura-2-helena-en`. 
+ ### Fixed - Fixed a `PipelineTask` issue that would cause tasks to not be cancelled if diff --git a/examples/foundational/07c-interruptible-deepgram-vad.py b/examples/foundational/07c-interruptible-deepgram-vad.py index a6d6ab4bb..945cdc447 100644 --- a/examples/foundational/07c-interruptible-deepgram-vad.py +++ b/examples/foundational/07c-interruptible-deepgram-vad.py @@ -47,7 +47,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac live_options=LiveOptions(vad_events=True, utterance_end_ms="1000"), ) - tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-helios-en") + tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-2-andromeda-en") llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY")) diff --git a/examples/foundational/07c-interruptible-deepgram.py b/examples/foundational/07c-interruptible-deepgram.py index 3e02d8d77..2a707da4a 100644 --- a/examples/foundational/07c-interruptible-deepgram.py +++ b/examples/foundational/07c-interruptible-deepgram.py @@ -39,7 +39,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) - tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-helios-en") + tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-2-andromeda-en") llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY")) diff --git a/src/pipecat/services/deepgram/tts.py b/src/pipecat/services/deepgram/tts.py index ec8a755a0..93c710f9e 100644 --- a/src/pipecat/services/deepgram/tts.py +++ b/src/pipecat/services/deepgram/tts.py @@ -30,7 +30,7 @@ class DeepgramTTSService(TTSService): self, *, api_key: str, - voice: str = "aura-helios-en", + voice: str = "aura-2-helena-en", base_url: str = "", sample_rate: Optional[int] = None, encoding: str = "linear16", From 288f8865c8471adf5b4ec1706445099259b61af2 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: 
Tue, 6 May 2025 12:13:26 -0400 Subject: [PATCH 07/97] Add enable_logging to ElevenLabsTTSService --- CHANGELOG.md | 3 ++- src/pipecat/services/elevenlabs/tts.py | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1bbc5210d..013a84421 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- Added `enable_ssml_parsing` to `InputParams` in `ElevenLabsTTSService`. +- Added `enable_ssml_parsing` and `enable_logging` to `InputParams` in + `ElevenLabsTTSService`. - Added support to `RimeHttpTTSService` for the `arcana` model. diff --git a/src/pipecat/services/elevenlabs/tts.py b/src/pipecat/services/elevenlabs/tts.py index ea89d1378..0a3d5d0d1 100644 --- a/src/pipecat/services/elevenlabs/tts.py +++ b/src/pipecat/services/elevenlabs/tts.py @@ -170,6 +170,7 @@ class ElevenLabsTTSService(InterruptibleWordTTSService): speed: Optional[float] = None auto_mode: Optional[bool] = True enable_ssml_parsing: Optional[bool] = None + enable_logging: Optional[bool] = None @model_validator(mode="after") def validate_voice_settings(self): @@ -229,6 +230,7 @@ class ElevenLabsTTSService(InterruptibleWordTTSService): "speed": params.speed, "auto_mode": str(params.auto_mode).lower(), "enable_ssml_parsing": params.enable_ssml_parsing, + "enable_logging": params.enable_logging, } self.set_model_name(model) self.set_voice(voice_id) @@ -329,6 +331,9 @@ class ElevenLabsTTSService(InterruptibleWordTTSService): if self._settings["enable_ssml_parsing"]: url += f"&enable_ssml_parsing={self._settings['enable_ssml_parsing']}" + if self._settings["enable_logging"]: + url += f"&enable_logging={self._settings['enable_logging']}" + # Language can only be used with the ELEVENLABS_MULTILINGUAL_MODELS language = self._settings["language"] if model in ELEVENLABS_MULTILINGUAL_MODELS and language is not None: From 0d30b000af2e20102b4286f7d540a9336e838e9f Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Mon, 5 May 2025 11:48:55 -0700 Subject: [PATCH 08/97] BaseObserver: add FramePushed class and deprecated multiple arguments --- CHANGELOG.md | 8 +++ examples/foundational/30-observer.py | 20 +++---- src/pipecat/observers/base_observer.py | 56 ++++++++++++------ .../observers/loggers/llm_log_observer.py | 17 +++--- .../loggers/transcription_log_observer.py | 15 ++--- src/pipecat/pipeline/task_observer.py | 58 ++++++++----------- src/pipecat/processors/frame_processor.py | 20 +++++-- src/pipecat/processors/frameworks/rtvi.py | 15 ++--- src/pipecat/services/google/rtvi.py | 18 +++--- src/pipecat/tests/utils.py | 14 ++--- 10 files changed, 127 insertions(+), 114 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e5130d7a..3ae2db0ae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,8 +16,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed +- Observers `on_push_frame()` now take a single argument `FramePushed` instead + of multiple arguments. + - Updated the default voice for `DeepgramTTSService` to `aura-2-helena-en`. +### Deprecated + +- Observer `on_push_frame(src, dst, frame, direction, timestamp)` is now + deprecated, use `on_push_frame(data: FramePushed)` instead. 
+ ### Fixed - Fixed a `PipelineTask` issue that would cause tasks to not be cancelled if diff --git a/examples/foundational/30-observer.py b/examples/foundational/30-observer.py index d8c2ec100..46bd96e53 100644 --- a/examples/foundational/30-observer.py +++ b/examples/foundational/30-observer.py @@ -14,16 +14,15 @@ from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import ( BotStartedSpeakingFrame, BotStoppedSpeakingFrame, - Frame, StartInterruptionFrame, ) -from pipecat.observers.base_observer import BaseObserver +from pipecat.observers.base_observer import BaseObserver, FramePushed from pipecat.observers.loggers.llm_log_observer import LLMLogObserver from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext -from pipecat.processors.frame_processor import FrameDirection, FrameProcessor +from pipecat.processors.frame_processor import FrameDirection from pipecat.services.cartesia.tts import CartesiaTTSService from pipecat.services.deepgram.stt import DeepgramSTTService from pipecat.services.openai.llm import OpenAILLMService @@ -46,14 +45,13 @@ class DebugObserver(BaseObserver): Log format: [EVENT TYPE]: [source processor] → [destination processor] at [timestamp]s """ - async def on_push_frame( - self, - src: FrameProcessor, - dst: FrameProcessor, - frame: Frame, - direction: FrameDirection, - timestamp: int, - ): + async def on_push_frame(self, data: FramePushed): + src = data.source + dst = data.destination + frame = data.frame + direction = data.direction + timestamp = data.timestamp + # Convert timestamp to seconds for readability time_sec = timestamp / 1_000_000_000 diff --git a/src/pipecat/observers/base_observer.py b/src/pipecat/observers/base_observer.py index 46f746946..f1a0c2a1b 100644 --- a/src/pipecat/observers/base_observer.py +++ 
b/src/pipecat/observers/base_observer.py @@ -5,9 +5,38 @@ # from abc import ABC, abstractmethod +from dataclasses import dataclass + +from typing_extensions import TYPE_CHECKING from pipecat.frames.frames import Frame -from pipecat.processors.frame_processor import FrameDirection, FrameProcessor + +if TYPE_CHECKING: + from pipecat.processors.frame_processor import FrameDirection, FrameProcessor + + +@dataclass +class FramePushed: + """Represents an event where a frame is pushed from one processor to another + within the pipeline. + + This data structure is typically used by observers to track the flow of + frames through the pipeline for logging, debugging, or analytics purposes. + + Attributes: + source (FrameProcessor): The processor sending the frame. + destination (FrameProcessor): The processor receiving the frame. + frame (Frame): The frame being transferred. + direction (FrameDirection): The direction of the transfer (e.g., downstream or upstream). + timestamp (int): The time when the frame was pushed, based on the pipeline clock. + + """ + + source: "FrameProcessor" + destination: "FrameProcessor" + frame: Frame + direction: "FrameDirection" + timestamp: int class BaseObserver(ABC): @@ -19,26 +48,15 @@ class BaseObserver(ABC): """ @abstractmethod - async def on_push_frame( - self, - src: FrameProcessor, - dst: FrameProcessor, - frame: Frame, - direction: FrameDirection, - timestamp: int, - ): - """Abstract method to handle the event when a frame is pushed from one - processor to another. + async def on_push_frame(self, data: FramePushed): + """Handle the event when a frame is pushed from one processor to another. + + This method should be implemented by subclasses to define specific + behavior (e.g., logging, monitoring, debugging) when a frame is + transferred through the pipeline. Args: - src (FrameProcessor): The source frame processor that is sending the frame. - dst (FrameProcessor): The destination frame processor that will receive the frame. 
- frame (Frame): The frame being transferred between processors. - direction (FrameDirection): The direction of the frame transfer. - timestamp (int): The timestamp when the frame was pushed (based on the pipeline clock). - - This method should be implemented by subclasses to define specific behavior - when a frame is pushed. + data (FramePushed): The event data containing details about the frame transfer. """ pass diff --git a/src/pipecat/observers/loggers/llm_log_observer.py b/src/pipecat/observers/loggers/llm_log_observer.py index dd270abf5..9e4d53b28 100644 --- a/src/pipecat/observers/loggers/llm_log_observer.py +++ b/src/pipecat/observers/loggers/llm_log_observer.py @@ -15,7 +15,7 @@ from pipecat.frames.frames import ( LLMMessagesFrame, LLMTextFrame, ) -from pipecat.observers.base_observer import BaseObserver +from pipecat.observers.base_observer import BaseObserver, FramePushed from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContextFrame from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.services.llm_service import LLMService @@ -38,14 +38,13 @@ class LLMLogObserver(BaseObserver): """ - async def on_push_frame( - self, - src: FrameProcessor, - dst: FrameProcessor, - frame: Frame, - direction: FrameDirection, - timestamp: int, - ): + async def on_push_frame(self, data: FramePushed): + src = data.source + dst = data.destination + frame = data.frame + direction = data.direction + timestamp = data.timestamp + if not isinstance(src, LLMService) and not isinstance(dst, LLMService): return diff --git a/src/pipecat/observers/loggers/transcription_log_observer.py b/src/pipecat/observers/loggers/transcription_log_observer.py index 4547ee54f..57e38c952 100644 --- a/src/pipecat/observers/loggers/transcription_log_observer.py +++ b/src/pipecat/observers/loggers/transcription_log_observer.py @@ -11,7 +11,7 @@ from pipecat.frames.frames import ( InterimTranscriptionFrame, TranscriptionFrame, ) -from 
pipecat.observers.base_observer import BaseObserver +from pipecat.observers.base_observer import BaseObserver, FramePushed from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.services.stt_service import STTService @@ -29,14 +29,11 @@ class TranscriptionLogObserver(BaseObserver): """ - async def on_push_frame( - self, - src: FrameProcessor, - dst: FrameProcessor, - frame: Frame, - direction: FrameDirection, - timestamp: int, - ): + async def on_push_frame(self, data: FramePushed): + src = data.source + frame = data.frame + timestamp = data.timestamp + if not isinstance(src, STTService): return diff --git a/src/pipecat/pipeline/task_observer.py b/src/pipecat/pipeline/task_observer.py index dd805032c..252708f8c 100644 --- a/src/pipecat/pipeline/task_observer.py +++ b/src/pipecat/pipeline/task_observer.py @@ -5,13 +5,12 @@ # import asyncio +import inspect from typing import List from attr import dataclass -from pipecat.frames.frames import Frame -from pipecat.observers.base_observer import BaseObserver -from pipecat.processors.frame_processor import FrameDirection, FrameProcessor +from pipecat.observers.base_observer import BaseObserver, FramePushed from pipecat.utils.asyncio import BaseTaskManager @@ -27,20 +26,6 @@ class Proxy: observer: BaseObserver -@dataclass -class ObserverData: - """This is the data we receive from the main observer and that we put into a - proxy queue for later processing. - - """ - - src: FrameProcessor - dst: FrameProcessor - frame: Frame - direction: FrameDirection - timestamp: int - - class TaskObserver(BaseObserver): """This is a pipeline frame observer that is meant to be used as a proxy to the user provided observers. 
That is, this is the observer that should be @@ -68,20 +53,9 @@ class TaskObserver(BaseObserver): for proxy in self._proxies: await self._task_manager.cancel_task(proxy.task) - async def on_push_frame( - self, - src: FrameProcessor, - dst: FrameProcessor, - frame: Frame, - direction: FrameDirection, - timestamp: int, - ): + async def on_push_frame(self, data: FramePushed): for proxy in self._proxies: - await proxy.queue.put( - ObserverData( - src=src, dst=dst, frame=frame, direction=direction, timestamp=timestamp - ) - ) + await proxy.queue.put(data) def _create_proxies(self, observers) -> List[Proxy]: proxies = [] @@ -96,8 +70,26 @@ class TaskObserver(BaseObserver): return proxies async def _proxy_task_handler(self, queue: asyncio.Queue, observer: BaseObserver): + warning_reported = False while True: data = await queue.get() - await observer.on_push_frame( - data.src, data.dst, data.frame, data.direction, data.timestamp - ) + + signature = inspect.signature(observer.on_push_frame) + if len(signature.parameters) > 1: + if not warning_reported: + import warnings + + with warnings.catch_warnings(): + warnings.simplefilter("always") + warnings.warn( + "Observer `on_push_frame(source, destination, frame, direction, timestamp)` is deprecated, use `on_push_frame(data: FramePushed)` instead.", + DeprecationWarning, + ) + warning_reported = True + await observer.on_push_frame( + data.source, data.destination, data.frame, data.direction, data.timestamp + ) + else: + await observer.on_push_frame(data) + + queue.task_done() diff --git a/src/pipecat/processors/frame_processor.py b/src/pipecat/processors/frame_processor.py index 590698e7f..97cc24378 100644 --- a/src/pipecat/processors/frame_processor.py +++ b/src/pipecat/processors/frame_processor.py @@ -21,6 +21,7 @@ from pipecat.frames.frames import ( SystemFrame, ) from pipecat.metrics.metrics import LLMTokenUsage, MetricsData +from pipecat.observers.base_observer import FramePushed from
pipecat.processors.metrics.frame_processor_metrics import FrameProcessorMetrics from pipecat.utils.asyncio import BaseTaskManager from pipecat.utils.base_object import BaseObject @@ -294,17 +295,28 @@ class FrameProcessor(BaseObject): timestamp = self._clock.get_time() if self._clock else 0 if direction == FrameDirection.DOWNSTREAM and self._next: logger.trace(f"Pushing {frame} from {self} to {self._next}") + if self._observer: - await self._observer.on_push_frame( - self, self._next, frame, direction, timestamp + data = FramePushed( + source=self, + destination=self._next, + frame=frame, + direction=direction, + timestamp=timestamp, ) + await self._observer.on_push_frame(data) await self._next.queue_frame(frame, direction) elif direction == FrameDirection.UPSTREAM and self._prev: logger.trace(f"Pushing {frame} upstream from {self} to {self._prev}") if self._observer: - await self._observer.on_push_frame( - self, self._prev, frame, direction, timestamp + data = FramePushed( + source=self, + destination=self._prev, + frame=frame, + direction=direction, + timestamp=timestamp, ) + await self._observer.on_push_frame(data) await self._prev.queue_frame(frame, direction) except Exception as e: logger.exception(f"Uncaught exception in {self}: {e}") diff --git a/src/pipecat/processors/frameworks/rtvi.py b/src/pipecat/processors/frameworks/rtvi.py index 55e91d7ff..ee0cced87 100644 --- a/src/pipecat/processors/frameworks/rtvi.py +++ b/src/pipecat/processors/frameworks/rtvi.py @@ -55,7 +55,7 @@ from pipecat.metrics.metrics import ( TTFBMetricsData, TTSUsageMetricsData, ) -from pipecat.observers.base_observer import BaseObserver +from pipecat.observers.base_observer import BaseObserver, FramePushed from pipecat.processors.aggregators.openai_llm_context import ( OpenAILLMContext, OpenAILLMContextFrame, @@ -445,14 +445,7 @@ class RTVIObserver(BaseObserver): self._frames_seen = set() rtvi.set_errors_enabled(self._params.errors_enabled) - async def on_push_frame( - self, - src: 
FrameProcessor, - dst: FrameProcessor, - frame: Frame, - direction: FrameDirection, - timestamp: int, - ): + async def on_push_frame(self, data: FramePushed): """Process a frame being pushed through the pipeline. Args: @@ -462,6 +455,10 @@ class RTVIObserver(BaseObserver): direction: Direction of frame flow in pipeline timestamp: Time when frame was pushed """ + src = data.source + frame = data.frame + direction = data.direction + # If we have already seen this frame, let's skip it. if frame.id in self._frames_seen: return diff --git a/src/pipecat/services/google/rtvi.py b/src/pipecat/services/google/rtvi.py index 88e67e6c6..cd60f6f1f 100644 --- a/src/pipecat/services/google/rtvi.py +++ b/src/pipecat/services/google/rtvi.py @@ -9,8 +9,9 @@ from typing import List, Literal, Optional from pydantic import BaseModel from pipecat.frames.frames import Frame +from pipecat.observers.base_observer import FramePushed from pipecat.processors.frame_processor import FrameDirection, FrameProcessor -from pipecat.processors.frameworks.rtvi import RTVIObserver +from pipecat.processors.frameworks.rtvi import RTVIObserver, RTVIProcessor from pipecat.services.google.frames import LLMSearchOrigin, LLMSearchResponseFrame @@ -27,18 +28,13 @@ class RTVIBotLLMSearchResponseMessage(BaseModel): class GoogleRTVIObserver(RTVIObserver): - def __init__(self, rtvi: FrameProcessor): + def __init__(self, rtvi: RTVIProcessor): super().__init__(rtvi) - async def on_push_frame( - self, - src: FrameProcessor, - dst: FrameProcessor, - frame: Frame, - direction: FrameDirection, - timestamp: int, - ): - await super().on_push_frame(src, dst, frame, direction, timestamp) + async def on_push_frame(self, data: FramePushed): + await super().on_push_frame(data) + + frame = data.frame if isinstance(frame, LLMSearchResponseFrame): await self._handle_llm_search_response_frame(frame) diff --git a/src/pipecat/tests/utils.py b/src/pipecat/tests/utils.py index e2368ba09..b5dfc5de1 100644 --- 
a/src/pipecat/tests/utils.py +++ b/src/pipecat/tests/utils.py @@ -15,7 +15,7 @@ from pipecat.frames.frames import ( StartFrame, SystemFrame, ) -from pipecat.observers.base_observer import BaseObserver +from pipecat.observers.base_observer import BaseObserver, FramePushed from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask @@ -42,14 +42,10 @@ class HeartbeatsObserver(BaseObserver): self._target = target self._callback = heartbeat_callback - async def on_push_frame( - self, - src: FrameProcessor, - dst: FrameProcessor, - frame: Frame, - direction: FrameDirection, - timestamp: int, - ): + async def on_push_frame(self, data: FramePushed): + src = data.source + frame = data.frame + if src == self._target and isinstance(frame, HeartbeatFrame): await self._callback(self._target, frame) From d69fa5dba507ade3146828d2082d8e329ec30b42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Mon, 5 May 2025 11:51:08 -0700 Subject: [PATCH 09/97] update CHANGELOG with UltravoxSTTService fix --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ae2db0ae..1a6274667 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed +- Fixed a `UltravoxSTTService` issue that would cause the service to generate + all tokens as one word. + - Fixed a `PipelineTask` issue that would cause tasks to not be cancelled if task was cancelled from outside of Pipecat. 
From a1d46cb26bbb489d9ddcc414c59dced7fe6983b8 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Tue, 6 May 2025 21:23:23 -0400 Subject: [PATCH 10/97] Removing CanonicalMetricsService --- CHANGELOG.md | 4 + README.md | 2 +- docs/api/requirements.txt | 1 - examples/canonical-metrics/.gitignore | 161 -------------- examples/canonical-metrics/Dockerfile | 10 - examples/canonical-metrics/README.md | 66 ------ examples/canonical-metrics/bot.py | 146 ------------- examples/canonical-metrics/env.example | 6 - examples/canonical-metrics/requirements.txt | 5 - examples/canonical-metrics/runner.py | 55 ----- examples/canonical-metrics/server.py | 139 ------------ pyproject.toml | 1 - src/pipecat/services/canonical/__init__.py | 13 -- src/pipecat/services/canonical/metrics.py | 230 -------------------- 14 files changed, 5 insertions(+), 834 deletions(-) delete mode 100644 examples/canonical-metrics/.gitignore delete mode 100644 examples/canonical-metrics/Dockerfile delete mode 100644 examples/canonical-metrics/README.md delete mode 100644 examples/canonical-metrics/bot.py delete mode 100644 examples/canonical-metrics/env.example delete mode 100644 examples/canonical-metrics/requirements.txt delete mode 100644 examples/canonical-metrics/runner.py delete mode 100644 examples/canonical-metrics/server.py delete mode 100644 src/pipecat/services/canonical/__init__.py delete mode 100644 src/pipecat/services/canonical/metrics.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 1a6274667..101cc7f58 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,6 +41,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Remove custom audio tracks from `DailyTransport` before leaving. +### Removed + +- Removed `CanonicalMetricsService` as it's no longer maintained. 
+ ## [0.0.66] - 2025-05-02 ### Added diff --git a/README.md b/README.md index ac2444f87..7f95bb664 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ You can connect to Pipecat from any platform using our official SDKs: | Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) | | Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/fal), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) | | Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [Noisereduce](https://docs.pipecat.ai/server/utilities/audio/noisereduce-filter) | -| Analytics & Metrics | [Canonical AI](https://docs.pipecat.ai/server/services/analytics/canonical), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) | +| Analytics & Metrics | [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) | πŸ“š [View full services documentation β†’](https://docs.pipecat.ai/server/services/supported-services) diff --git a/docs/api/requirements.txt b/docs/api/requirements.txt index 9badccd8f..a77ff1084 100644 --- a/docs/api/requirements.txt +++ b/docs/api/requirements.txt @@ -10,7 +10,6 @@ pipecat-ai[anthropic] pipecat-ai[assemblyai] pipecat-ai[aws] pipecat-ai[azure] -pipecat-ai[canonical] pipecat-ai[cartesia] pipecat-ai[cerebras] pipecat-ai[deepseek] diff --git a/examples/canonical-metrics/.gitignore b/examples/canonical-metrics/.gitignore deleted file mode 100644 index 50d9d205e..000000000 --- a/examples/canonical-metrics/.gitignore +++ /dev/null @@ -1,161 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class -recordings/ -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ 
-eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
-#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/#use-with-ide -.pdm.toml - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ -runpod.toml diff --git a/examples/canonical-metrics/Dockerfile b/examples/canonical-metrics/Dockerfile deleted file mode 100644 index a5b4668c6..000000000 --- a/examples/canonical-metrics/Dockerfile +++ /dev/null @@ -1,10 +0,0 @@ -FROM python:3.10-bullseye -RUN mkdir /app -COPY *.py /app/ -COPY requirements.txt /app/ -WORKDIR /app -RUN pip3 install -r requirements.txt - -EXPOSE 7860 - -CMD ["python3", "server.py"] diff --git a/examples/canonical-metrics/README.md b/examples/canonical-metrics/README.md deleted file mode 100644 index 068655d2b..000000000 --- a/examples/canonical-metrics/README.md +++ /dev/null @@ -1,66 +0,0 @@ -# Chatbot with canonical-metrics - -This project implements a chatbot using a pipeline architecture that integrates audio processing, transcription, and a language model for conversational interactions. 
The chatbot operates within a daily communication environment, utilizing various services for text-to-speech and language model responses. - -## Features - -- **Audio Input and Output**: Captures microphone input and plays back audio responses. -- **Voice Activity Detection**: Utilizes Silero VAD to manage audio input intelligently. -- **Text-to-Speech**: Integrates ElevenLabs TTS service to convert text responses into audio. -- **Language Model Interaction**: Uses OpenAI's GPT-4 model to generate responses based on user input. -- **Transcription Services**: Captures and transcribes participant speech for analytics. -- **Metrics Collection**: Sends audio data for analysis via Canonical Metrics Service. - -## Requirements - -- Python 3.10+ -- `python-dotenv` -- Additional libraries from the `pipecat` package. - -## Setup - -1. Clone the repository. -2. Install the required packages. -3. Set up environment variables for API keys: - - `OPENAI_API_KEY` - - `ELEVENLABS_API_KEY` - - `CANONICAL_API_KEY` - - `CANONICAL_API_URL` -4. Run the script. - -## Usage - -The chatbot introduces itself and engages in conversations, providing brief and creative responses. Designed for flexibility, it can support multiple languages with appropriate configuration. - -## Events - -- Participants joining or leaving the call are handled dynamically, adjusting the chatbot's behavior accordingly. - - -ℹ️ The first time, things might take extra time to get started since VAD (Voice Activity Detection) model needs to be downloaded. - -## Get started - -```python -python3 -m venv venv -source venv/bin/activate -pip install -r requirements.txt - -cp env.example .env # and add your credentials - -``` - -## Run the server - -```bash -python server.py -``` - -Then, visit `http://localhost:7860/` in your browser to start a chatbot session. - -## Build and test the Docker image - -``` -docker build -t chatbot . 
-docker run --env-file .env -p 7860:7860 chatbot -``` diff --git a/examples/canonical-metrics/bot.py b/examples/canonical-metrics/bot.py deleted file mode 100644 index 871d0542d..000000000 --- a/examples/canonical-metrics/bot.py +++ /dev/null @@ -1,146 +0,0 @@ -# -# Copyright (c) 2024–2025, Daily -# -# SPDX-License-Identifier: BSD 2-Clause License -# - -import asyncio -import os -import sys -import uuid - -import aiohttp -from dotenv import load_dotenv -from loguru import logger -from runner import configure - -from pipecat.audio.vad.silero import SileroVADAnalyzer -from pipecat.frames.frames import EndFrame -from pipecat.pipeline.pipeline import Pipeline -from pipecat.pipeline.runner import PipelineRunner -from pipecat.pipeline.task import PipelineParams, PipelineTask -from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext -from pipecat.processors.audio.audio_buffer_processor import AudioBufferProcessor -from pipecat.services.canonical.metrics import CanonicalMetricsService -from pipecat.services.elevenlabs.tts import ElevenLabsTTSService -from pipecat.services.openai.llm import OpenAILLMService -from pipecat.transports.services.daily import DailyParams, DailyTransport - -load_dotenv(override=True) - -logger.remove(0) -logger.add(sys.stderr, level="DEBUG") - - -async def main(): - async with aiohttp.ClientSession() as session: - (room_url, token) = await configure(session) - - transport = DailyTransport( - room_url, - token, - "Chatbot", - DailyParams( - audio_out_enabled=True, - audio_in_enabled=True, - video_out_enabled=False, - vad_analyzer=SileroVADAnalyzer(), - transcription_enabled=True, - # - # Spanish - # - # transcription_settings=DailyTranscriptionSettings( - # language="es", - # tier="nova", - # model="2-general" - # ) - ), - ) - - tts = ElevenLabsTTSService( - api_key=os.getenv("ELEVENLABS_API_KEY"), - # - # English - # - voice_id="cgSgspJ2msm6clMCkdW9", - # - # Spanish - # - # model="eleven_multilingual_v2", - # 
voice_id="gD1IexrzCvsXPHUuT0s3", - ) - - llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY")) - - messages = [ - { - "role": "system", - # - # English - # - "content": "You are Chatbot, a friendly, helpful robot. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way, but keep your responses brief. Start by introducing yourself. Keep all your responses to 12 words or fewer.", - # - # Spanish - # - # "content": "Eres Chatbot, un amigable y ΓΊtil robot. Tu objetivo es demostrar tus capacidades de una manera breve. Tus respuestas se convertiran a audio asΓ­ que nunca no debes incluir caracteres especiales. Contesta a lo que el usuario pregunte de una manera creativa, ΓΊtil y breve. Empieza por presentarte a ti mismo.", - }, - ] - - context = OpenAILLMContext(messages) - context_aggregator = llm.create_context_aggregator(context) - - """ - CanonicalMetrics uses AudioBufferProcessor under the hood to buffer the audio. On - call completion, CanonicalMetrics will send the audio buffer to Canonical for - analysis. Visit https://voice.canonical.chat to learn more. 
- """ - audio_buffer_processor = AudioBufferProcessor(num_channels=2) - canonical = CanonicalMetricsService( - audio_buffer_processor=audio_buffer_processor, - aiohttp_session=session, - api_key=os.getenv("CANONICAL_API_KEY"), - call_id=str(uuid.uuid4()), - assistant="pipecat-chatbot", - assistant_speaks_first=True, - context=context, - ) - pipeline = Pipeline( - [ - transport.input(), # microphone - context_aggregator.user(), - llm, - tts, - transport.output(), - canonical, # uploads audio buffer to Canonical AI for metrics - audio_buffer_processor, # captures audio into a buffer - context_aggregator.assistant(), - ] - ) - - task = PipelineTask(pipeline, params=PipelineParams(allow_interruptions=True)) - - @transport.event_handler("on_first_participant_joined") - async def on_first_participant_joined(transport, participant): - await audio_buffer_processor.start_recording() - await transport.capture_participant_transcription(participant["id"]) - await task.queue_frames([context_aggregator.user().get_context_frame()]) - - @transport.event_handler("on_participant_left") - async def on_participant_left(transport, participant, reason): - print(f"Participant left: {participant}") - await task.cancel() - - @transport.event_handler("on_call_state_updated") - async def on_call_state_updated(transport, state): - if state == "left": - # Here we don't want to cancel, we just want to finish sending - # whatever is queued, so we use an EndFrame(). - await task.queue_frame(EndFrame()) - - runner = PipelineRunner() - - await runner.run(task) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/examples/canonical-metrics/env.example b/examples/canonical-metrics/env.example deleted file mode 100644 index 6b865401a..000000000 --- a/examples/canonical-metrics/env.example +++ /dev/null @@ -1,6 +0,0 @@ -DAILY_SAMPLE_ROOM_URL=https://yourdomain.daily.co/yourroom # (for joining the bot to the same room repeatedly for local dev) -DAILY_API_KEY=7df... -OPENAI_API_KEY=sk-PL... 
-ELEVENLABS_API_KEY=aeb... -CANONICAL_API_KEY=can... -CANONICAL_API_URL= diff --git a/examples/canonical-metrics/requirements.txt b/examples/canonical-metrics/requirements.txt deleted file mode 100644 index 7e53edc6b..000000000 --- a/examples/canonical-metrics/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -python-dotenv -fastapi[all] -uvicorn -pipecat-ai[daily,openai,silero,elevenlabs,canonical] - diff --git a/examples/canonical-metrics/runner.py b/examples/canonical-metrics/runner.py deleted file mode 100644 index ad39a3ac4..000000000 --- a/examples/canonical-metrics/runner.py +++ /dev/null @@ -1,55 +0,0 @@ -# -# Copyright (c) 2024–2025, Daily -# -# SPDX-License-Identifier: BSD 2-Clause License -# - -import argparse -import os - -import aiohttp - -from pipecat.transports.services.helpers.daily_rest import DailyRESTHelper - - -async def configure(aiohttp_session: aiohttp.ClientSession): - parser = argparse.ArgumentParser(description="Daily AI SDK Bot Sample") - parser.add_argument( - "-u", "--url", type=str, required=False, help="URL of the Daily room to join" - ) - parser.add_argument( - "-k", - "--apikey", - type=str, - required=False, - help="Daily API Key (needed to create an owner token for the room)", - ) - - args, unknown = parser.parse_known_args() - - url = args.url or os.getenv("DAILY_SAMPLE_ROOM_URL") - key = args.apikey or os.getenv("DAILY_API_KEY") - - if not url: - raise Exception( - "No Daily room specified. use the -u/--url option from the command line, or set DAILY_SAMPLE_ROOM_URL in your environment to specify a Daily room URL." - ) - - if not key: - raise Exception( - "No Daily API key specified. use the -k/--apikey option from the command line, or set DAILY_API_KEY in your environment to specify a Daily API key, available from https://dashboard.daily.co/developers." 
- ) - - daily_rest_helper = DailyRESTHelper( - daily_api_key=key, - daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"), - aiohttp_session=aiohttp_session, - ) - - # Create a meeting token for the given room with an expiration 1 hour in - # the future. - expiry_time: float = 60 * 60 - - token = await daily_rest_helper.get_token(url, expiry_time) - - return (url, token) diff --git a/examples/canonical-metrics/server.py b/examples/canonical-metrics/server.py deleted file mode 100644 index a0f38854c..000000000 --- a/examples/canonical-metrics/server.py +++ /dev/null @@ -1,139 +0,0 @@ -# -# Copyright (c) 2024–2025, Daily -# -# SPDX-License-Identifier: BSD 2-Clause License -# - -import argparse -import os -import subprocess -from contextlib import asynccontextmanager - -import aiohttp -from dotenv import load_dotenv -from fastapi import FastAPI, HTTPException, Request -from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse, RedirectResponse - -from pipecat.transports.services.helpers.daily_rest import DailyRESTHelper, DailyRoomParams - -MAX_BOTS_PER_ROOM = 1 - -# Bot sub-process dict for status reporting and concurrency control -bot_procs = {} - -daily_helpers = {} - -load_dotenv(override=True) - - -def cleanup(): - # Clean up function, just to be extra safe - for entry in bot_procs.values(): - proc = entry[0] - proc.terminate() - proc.wait() - - -@asynccontextmanager -async def lifespan(app: FastAPI): - aiohttp_session = aiohttp.ClientSession() - daily_helpers["rest"] = DailyRESTHelper( - daily_api_key=os.getenv("DAILY_API_KEY", ""), - daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"), - aiohttp_session=aiohttp_session, - ) - yield - await aiohttp_session.close() - cleanup() - - -app = FastAPI(lifespan=lifespan) - -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - - -@app.get("/") -async def 
start_agent(request: Request): - print(f"!!! Creating room") - room = await daily_helpers["rest"].create_room(DailyRoomParams()) - print(f"!!! Room URL: {room.url}") - # Ensure the room property is present - if not room.url: - raise HTTPException( - status_code=500, - detail="Missing 'room' property in request data. Cannot start agent without a target room!", - ) - - # Check if there is already an existing process running in this room - num_bots_in_room = sum( - 1 for proc in bot_procs.values() if proc[1] == room.url and proc[0].poll() is None - ) - if num_bots_in_room >= MAX_BOTS_PER_ROOM: - raise HTTPException(status_code=500, detail=f"Max bot limited reach for room: {room.url}") - - # Get the token for the room - token = await daily_helpers["rest"].get_token(room.url) - - if not token: - raise HTTPException(status_code=500, detail=f"Failed to get token for room: {room.url}") - - # Spawn a new agent, and join the user session - # Note: this is mostly for demonstration purposes (refer to 'deployment' in README) - try: - proc = subprocess.Popen( - [f"python3 -m bot -u {room.url} -t {token}"], - shell=True, - bufsize=1, - cwd=os.path.dirname(os.path.abspath(__file__)), - ) - bot_procs[proc.pid] = (proc, room.url) - except Exception as e: - raise HTTPException(status_code=500, detail=f"Failed to start subprocess: {e}") - - return RedirectResponse(room.url) - - -@app.get("/status/{pid}") -def get_status(pid: int): - # Look up the subprocess - proc = bot_procs.get(pid) - - # If the subprocess doesn't exist, return an error - if not proc: - raise HTTPException(status_code=404, detail=f"Bot with process id: {pid} not found") - - # Check the status of the subprocess - if proc[0].poll() is None: - status = "running" - else: - status = "finished" - - return JSONResponse({"bot_id": pid, "status": status}) - - -if __name__ == "__main__": - import uvicorn - - default_host = os.getenv("HOST", "0.0.0.0") - default_port = int(os.getenv("FAST_API_PORT", "7860")) - - parser = 
argparse.ArgumentParser(description="Daily Storyteller FastAPI server") - parser.add_argument("--host", type=str, default=default_host, help="Host address") - parser.add_argument("--port", type=int, default=default_port, help="Port number") - parser.add_argument("--reload", action="store_true", help="Reload code on change") - - config = parser.parse_args() - - uvicorn.run( - "server:app", - host=config.host, - port=config.port, - reload=config.reload, - ) diff --git a/pyproject.toml b/pyproject.toml index ecddb0902..910c8d066 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,6 @@ anthropic = [ "anthropic~=0.49.0" ] assemblyai = [ "assemblyai~=0.37.0" ] aws = [ "boto3~=1.37.16" ] azure = [ "azure-cognitiveservices-speech~=1.42.0"] -canonical = [ "aiofiles~=24.1.0" ] cartesia = [ "cartesia~=1.4.0", "websockets~=13.1" ] cerebras = [] deepseek = [] diff --git a/src/pipecat/services/canonical/__init__.py b/src/pipecat/services/canonical/__init__.py deleted file mode 100644 index f47b99c4e..000000000 --- a/src/pipecat/services/canonical/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# -# Copyright (c) 2024–2025, Daily -# -# SPDX-License-Identifier: BSD 2-Clause License -# - -import sys - -from pipecat.services import DeprecatedModuleProxy - -from .metrics import * - -sys.modules[__name__] = DeprecatedModuleProxy(globals(), "canonical", "canonical.metrics") diff --git a/src/pipecat/services/canonical/metrics.py b/src/pipecat/services/canonical/metrics.py deleted file mode 100644 index 012cd4ab7..000000000 --- a/src/pipecat/services/canonical/metrics.py +++ /dev/null @@ -1,230 +0,0 @@ -# -# Copyright (c) 2024–2025, Daily -# -# SPDX-License-Identifier: BSD 2-Clause License -# - -import io -import os -import uuid -import wave -from datetime import datetime -from typing import Dict, List, Optional, Tuple - -import aiohttp -from loguru import logger - -from pipecat.frames.frames import CancelFrame, EndFrame, Frame -from 
pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext -from pipecat.processors.audio.audio_buffer_processor import AudioBufferProcessor -from pipecat.processors.frame_processor import FrameDirection -from pipecat.services.ai_service import AIService - -try: - import aiofiles - import aiofiles.os -except ModuleNotFoundError as e: - logger.error(f"Exception: {e}") - logger.error( - "In order to use Canonical Metrics, you need to `pip install pipecat-ai[canonical]`. " - + "Also, set the `CANONICAL_API_KEY` environment variable." - ) - raise Exception(f"Missing module: {e}") - - -# Multipart upload part size in bytes, cannot be smaller than 5MB -PART_SIZE = 1024 * 1024 * 5 - - -class CanonicalMetricsService(AIService): - """Initialize a CanonicalAudioProcessor instance. - - This class uses an AudioBufferProcessor to get the conversation audio and - uploads it to Canonical Voice API for audio processing. - - Args: - call_id (str): Your unique identifier for the call. This is used to match the call in the Canonical Voice system to the call in your system. - assistant (str): Identifier for the AI assistant. This can be whatever you want, it's intended for you convenience so you can distinguish - between different assistants and a grouping mechanism for calls. - assistant_speaks_first (bool, optional): Indicates if the assistant speaks first in the conversation. Defaults to True. - output_dir (str, optional): Directory to save temporary audio files. Defaults to "recordings". - - Attributes: - call_id (str): Stores the unique call identifier. - assistant (str): Stores the assistant identifier. - assistant_speaks_first (bool): Indicates whether the assistant speaks first. - output_dir (str): Directory path for saving temporary audio files. - - The constructor also ensures that the output directory exists. 
- """ - - def __init__( - self, - *, - aiohttp_session: aiohttp.ClientSession, - call_id: str, - assistant: str, - api_key: str, - api_url: str = "https://voiceapp.canonical.chat/api/v1", - assistant_speaks_first: bool = True, - output_dir: str = "recordings", - audio_buffer_processor: Optional[AudioBufferProcessor] = None, - context: Optional[OpenAILLMContext] = None, - **kwargs, - ): - super().__init__(**kwargs) - # Validate that at least one of audio_buffer_processor or context is provided - if audio_buffer_processor is None and context is None: - raise ValueError("At least one of audio_buffer_processor or context must be specified") - - self._aiohttp_session = aiohttp_session - self._audio_buffer_processor = audio_buffer_processor - self._api_key = api_key - self._api_url = api_url - self._call_id = call_id - self._assistant = assistant - self._assistant_speaks_first = assistant_speaks_first - self._output_dir = output_dir - self._context = context - - async def stop(self, frame: EndFrame): - await super().stop(frame) - await self._process_completion() - - async def cancel(self, frame: CancelFrame): - await super().cancel(frame) - await self._process_completion() - - async def process_frame(self, frame: Frame, direction: FrameDirection): - await super().process_frame(frame, direction) - await self.push_frame(frame, direction) - - async def _process_completion(self): - if self._audio_buffer_processor is not None: - await self._process_audio() - elif self._context is not None: - await self._process_transcript() - - async def _process_transcript(self): - params = { - "callId": self._call_id, - "assistant": {"id": self._assistant, "speaksFirst": self._assistant_speaks_first}, - "transcript": self._context.messages, - } - response = await self._aiohttp_session.post( - f"{self._api_url}/call", - headers=self._request_headers(), - json=params, - ) - if not response.ok: - logger.error(f"Failed to process transcript: {await response.text()}") - - async def 
_process_audio(self): - audio_buffer_processor = self._audio_buffer_processor - - if not audio_buffer_processor.has_audio(): - return - - os.makedirs(self._output_dir, exist_ok=True) - filename = self._get_output_filename() - audio = audio_buffer_processor.merge_audio_buffers() - - with io.BytesIO() as buffer: - with wave.open(buffer, "wb") as wf: - wf.setsampwidth(2) - wf.setnchannels(audio_buffer_processor.num_channels) - wf.setframerate(audio_buffer_processor.sample_rate) - wf.writeframes(audio) - async with aiofiles.open(filename, "wb") as file: - await file.write(buffer.getvalue()) - - try: - await self._multipart_upload(filename) - await aiofiles.os.remove(filename) - except FileNotFoundError: - pass - except Exception as e: - logger.error(f"Failed to upload recording: {e}") - - def _get_output_filename(self): - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - return f"{self._output_dir}/{timestamp}-{uuid.uuid4().hex}.wav" - - def _request_headers(self): - return {"Content-Type": "application/json", "X-Canonical-Api-Key": self._api_key} - - async def _multipart_upload(self, file_path: str): - upload_request, upload_response = await self._request_upload(file_path) - if upload_request is None or upload_response is None: - return - parts = await self._upload_parts(file_path, upload_response) - if parts is None: - return - await self._upload_complete(parts, upload_request, upload_response) - - async def _request_upload(self, file_path: str) -> Tuple[Dict, Dict]: - filename = os.path.basename(file_path) - filesize = os.path.getsize(file_path) - numparts = int((filesize + PART_SIZE - 1) / PART_SIZE) - - params = { - "filename": filename, - "parts": numparts, - "callId": self._call_id, - "assistant": {"id": self._assistant, "speaksFirst": self._assistant_speaks_first}, - } - logger.debug(f"Requesting presigned URLs for {numparts} parts") - response = await self._aiohttp_session.post( - f"{self._api_url}/recording/uploadRequest", 
headers=self._request_headers(), json=params - ) - if not response.ok: - logger.error(f"Failed to get presigned URLs: {await response.text()}") - return None, None - response_json = await response.json() - return params, response_json - - async def _upload_parts(self, file_path: str, upload_response: Dict) -> List[Dict]: - urls = upload_response["urls"] - parts = [] - try: - async with aiofiles.open(file_path, "rb") as file: - for partnum, upload_url in enumerate(urls, start=1): - data = await file.read(PART_SIZE) - if not data: - break - - response = await self._aiohttp_session.put(upload_url, data=data) - if not response.ok: - logger.error(f"Failed to upload part {partnum}: {await response.text()}") - return None - - etag = response.headers["ETag"] - parts.append({"partnum": str(partnum), "etag": etag}) - - except Exception as e: - logger.error(f"Multipart upload aborted, an error occurred: {str(e)}") - return parts - - async def _upload_complete( - self, parts: List[Dict], upload_request: Dict, upload_response: Dict - ): - params = { - "filename": upload_request["filename"], - "parts": parts, - "slug": upload_response["slug"], - "callId": self._call_id, - "assistant": {"id": self._assistant, "speaksFirst": self._assistant_speaks_first}, - } - if self._context is not None: - params["transcript"] = self._context.messages - - logger.debug(f"Completing upload for {params['filename']}") - logger.debug(f"Slug: {params['slug']}") - response = await self._aiohttp_session.post( - f"{self._api_url}/recording/uploadComplete", - headers=self._request_headers(), - json=params, - ) - if not response.ok: - logger.error(f"Failed to complete upload: {await response.text()}") - return From a4447019293e5e2fe11eade1ac492605aecf9782 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Wed, 7 May 2025 09:02:08 -0400 Subject: [PATCH 11/97] Update README with Riva services --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 
7f95bb664..47be9b6e1 100644 --- a/README.md +++ b/README.md @@ -51,9 +51,9 @@ You can connect to Pipecat from any platform using our official SDKs: | Category | Services | | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [Parakeet (NVIDIA)](https://docs.pipecat.ai/server/services/stt/parakeet), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) | +| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), 
[Azure](https://docs.pipecat.ai/server/services/stt/azure), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) | | LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [Together AI](https://docs.pipecat.ai/server/services/llm/together) | -| Text-to-Speech | [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [FastPitch 
(NVIDIA)](https://docs.pipecat.ai/server/services/tts/fastpitch), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) | +| Text-to-Speech | [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) | | Speech-to-Speech | [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) | | Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), 
[SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local | | Video | [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) | From 5e39c0cfeb8f8b162c0c27443a1a3417ccc5a338 Mon Sep 17 00:00:00 2001 From: Dan Berg Date: Wed, 7 May 2025 14:30:39 +0200 Subject: [PATCH 12/97] DailyTransport: added on_active_speaker_changed event handler --- CHANGELOG.md | 2 ++ src/pipecat/transports/services/daily.py | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 101cc7f58..3e330b9c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added `on_active_speaker_changed` event handler to the `DailyTransport` class. + - Added `enable_ssml_parsing` and `enable_logging` to `InputParams` in `ElevenLabsTTSService`. diff --git a/src/pipecat/transports/services/daily.py b/src/pipecat/transports/services/daily.py index 9909d9336..5d00e76bc 100644 --- a/src/pipecat/transports/services/daily.py +++ b/src/pipecat/transports/services/daily.py @@ -175,6 +175,7 @@ class DailyCallbacks(BaseModel): """Callback handlers for Daily events. Attributes: + on_active_speaker_changed: Called when the active speaker of the call has changed. on_joined: Called when bot successfully joined a room. on_left: Called when bot left a room. on_error: Called when an error occurs. @@ -201,6 +202,7 @@ class DailyCallbacks(BaseModel): on_recording_error: Called when recording encounters an error. 
""" + on_active_speaker_changed: Callable[[Mapping[str, Any]], Awaitable[None]] on_joined: Callable[[Mapping[str, Any]], Awaitable[None]] on_left: Callable[[], Awaitable[None]] on_error: Callable[[str], Awaitable[None]] @@ -789,6 +791,9 @@ class DailyTransportClient(EventHandler): # Daily (EventHandler) # + def on_active_speaker_changed(self, participant): + self._call_async_callback(self._callbacks.on_active_speaker_changed, participant) + def on_app_message(self, message: Any, sender: str): self._call_async_callback(self._callbacks.on_app_message, message, sender) @@ -1208,6 +1213,7 @@ class DailyTransport(BaseTransport): super().__init__(input_name=input_name, output_name=output_name) callbacks = DailyCallbacks( + on_active_speaker_changed=self._on_active_speaker_changed, on_joined=self._on_joined, on_left=self._on_left, on_error=self._on_error, @@ -1243,6 +1249,7 @@ class DailyTransport(BaseTransport): # Register supported handlers. The user will only be able to register # these handlers. 
+ self._register_event_handler("on_active_speaker_changed") self._register_event_handler("on_joined") self._register_event_handler("on_left") self._register_event_handler("on_error") @@ -1377,6 +1384,9 @@ class DailyTransport(BaseTransport): async def update_remote_participants(self, remote_participants: Mapping[str, Any]): await self._client.update_remote_participants(remote_participants=remote_participants) + async def _on_active_speaker_changed(self, participant: Any): + await self._call_event_handler("on_active_speaker_changed", participant) + async def _on_joined(self, data): await self._call_event_handler("on_joined", data) From 5b66133a6cf27d9087e14897d7e2eced3e852cbe Mon Sep 17 00:00:00 2001 From: mattie ruth backman Date: Wed, 7 May 2025 12:08:28 -0400 Subject: [PATCH 13/97] Revert breaking change in RTVI protocol for function calling --- src/pipecat/processors/frameworks/rtvi.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pipecat/processors/frameworks/rtvi.py b/src/pipecat/processors/frameworks/rtvi.py index ee0cced87..909dd15b7 100644 --- a/src/pipecat/processors/frameworks/rtvi.py +++ b/src/pipecat/processors/frameworks/rtvi.py @@ -254,7 +254,7 @@ class RTVIBotReady(BaseModel): class RTVILLMFunctionCallMessageData(BaseModel): function_name: str tool_call_id: str - arguments: Mapping[str, Any] + args: Mapping[str, Any] class RTVILLMFunctionCallMessage(BaseModel): @@ -700,7 +700,7 @@ class RTVIProcessor(FrameProcessor): fn = RTVILLMFunctionCallMessageData( function_name=params.function_name, tool_call_id=params.tool_call_id, - arguments=params.arguments, + args=params.arguments, ) message = RTVILLMFunctionCallMessage(data=fn) await self._push_transport_message(message, exclude_none=False) From 2b18f60261adfb6252f8385ade3614f26f926714 Mon Sep 17 00:00:00 2001 From: Tico Ballagas Date: Sat, 8 Feb 2025 18:46:59 -0800 Subject: [PATCH 14/97] Initial implementation of AWS Transcribe TTS --- 
.../foundational/07m-interruptible-polly.py | 13 +- src/pipecat/services/aws/tts.py | 624 +++++++++++++++++- 2 files changed, 626 insertions(+), 11 deletions(-) diff --git a/examples/foundational/07m-interruptible-polly.py b/examples/foundational/07m-interruptible-polly.py index 286fe5128..63349360e 100644 --- a/examples/foundational/07m-interruptible-polly.py +++ b/examples/foundational/07m-interruptible-polly.py @@ -15,9 +15,9 @@ from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext -from pipecat.services.aws.tts import PollyTTSService -from pipecat.services.deepgram.stt import DeepgramSTTService -from pipecat.services.openai.llm import OpenAILLMService +from pipecat.services.aws import PollyTTSService, TranscribeSTTService +from pipecat.services.openai import OpenAILLMService +from pipecat.transcriptions.language import Language from pipecat.transports.base_transport import TransportParams from pipecat.transports.network.small_webrtc import SmallWebRTCTransport from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection @@ -37,14 +37,11 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac ), ) - stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) + stt = TranscribeSTTService() tts = PollyTTSService( - api_key=os.getenv("AWS_SECRET_ACCESS_KEY"), - aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), - region=os.getenv("AWS_REGION"), voice_id="Amy", - params=PollyTTSService.InputParams(engine="neural", language="en-GB", rate="1.05"), + params=PollyTTSService.InputParams(engine="standard", language=Language.EN_GB, rate="1.05"), ) llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY")) diff --git a/src/pipecat/services/aws/tts.py b/src/pipecat/services/aws/tts.py index db6e168ab..e90ea9220 100644 --- 
a/src/pipecat/services/aws/tts.py +++ b/src/pipecat/services/aws/tts.py @@ -5,7 +5,21 @@ # import asyncio -from typing import AsyncGenerator, Optional +from typing import AsyncGenerator, Optional, Dict +import os +import datetime +import time +from urllib.parse import urlencode +import json +import struct +from io import BytesIO +import urllib.parse +import hashlib +import hmac +import random +import string +import binascii +import numpy as np from loguru import logger from pydantic import BaseModel @@ -17,17 +31,27 @@ from pipecat.frames.frames import ( TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame, + TranscriptionFrame, + InterimTranscriptionFrame, + StartFrame, + EndFrame, + CancelFrame, ) -from pipecat.services.tts_service import TTSService +from pipecat.services.ai_services import TTSService, STTService from pipecat.transcriptions.language import Language +from pipecat.utils.time import time_now_iso8601 try: import boto3 from botocore.exceptions import BotoCoreError, ClientError + import websockets + from botocore.auth import SigV4Auth + from botocore.awsrequest import AWSRequest + from botocore.credentials import Credentials except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( - "In order to use Deepgram, you need to `pip install pipecat-ai[aws]`. Also, set `AWS_SECRET_ACCESS_KEY`, `AWS_ACCESS_KEY_ID`, and `AWS_REGION` environment variable." + "In order to use AWS services, you need to `pip install pipecat-ai[aws]`. Also, set `AWS_SECRET_ACCESS_KEY`, `AWS_ACCESS_KEY_ID`, and `AWS_REGION` environment variable." 
) raise Exception(f"Missing module: {e}") @@ -151,6 +175,24 @@ class PollyTTSService(TTSService): self.set_voice(voice_id) + # Get credentials from environment variables if not provided + self._credentials = { + "aws_access_key_id": aws_access_key_id or os.getenv("AWS_ACCESS_KEY_ID"), + "aws_secret_access_key": api_key or os.getenv("AWS_SECRET_ACCESS_KEY"), + "aws_session_token": aws_session_token or os.getenv("AWS_SESSION_TOKEN"), + "region": region or os.getenv("AWS_REGION", "us-east-1"), + } + + # Validate that we have the required credentials + if ( + not self._credentials["aws_access_key_id"] + or not self._credentials["aws_secret_access_key"] + ): + raise ValueError( + "AWS credentials not found. Please provide them either through constructor parameters " + "or set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables." + ) + def can_generate_metrics(self) -> bool: return True @@ -248,3 +290,579 @@ class PollyTTSService(TTSService): finally: yield TTSStoppedFrame() + + +class AWSTTSService(PollyTTSService): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + import warnings + + with warnings.catch_warnings(): + warnings.simplefilter("always") + warnings.warn( + "'AWSTTSService' is deprecated, use 'PollyTTSService' instead.", DeprecationWarning + ) + + +def get_presigned_url( + *, + region: str, + credentials: Dict[str, Optional[str]], + language_code: str, + media_encoding: str = "pcm", + sample_rate: int = 16000, + number_of_channels: int = 1, + enable_partial_results_stabilization: bool = True, + partial_results_stability: str = "high", + vocabulary_name: Optional[str] = None, + vocabulary_filter_name: Optional[str] = None, + show_speaker_label: bool = False, + enable_channel_identification: bool = False, +) -> str: + """Create a presigned URL for AWS Transcribe streaming.""" + access_key = credentials.get("access_key") + secret_key = credentials.get("secret_key") + session_token = credentials.get("session_token") + + if not 
access_key or not secret_key: + raise ValueError("AWS credentials are required") + + # Initialize the URL generator + url_generator = AWSTranscribePresignedURL( + access_key=access_key, secret_key=secret_key, session_token=session_token, region=region + ) + + # Get the presigned URL + return url_generator.get_request_url( + sample_rate=sample_rate, + language_code=language_code, + media_encoding=media_encoding, + vocabulary_name=vocabulary_name, + vocabulary_filter_name=vocabulary_filter_name, + show_speaker_label=show_speaker_label, + enable_channel_identification=enable_channel_identification, + number_of_channels=number_of_channels, + enable_partial_results_stabilization=enable_partial_results_stabilization, + partial_results_stability=partial_results_stability, + ) + + +class AWSTranscribePresignedURL: + def __init__( + self, access_key: str, secret_key: str, session_token: str, region: str = "us-east-1" + ): + self.access_key = access_key + self.secret_key = secret_key + self.session_token = session_token + self.method = "GET" + self.service = "transcribe" + self.region = region + self.endpoint = "" + self.host = "" + self.amz_date = "" + self.datestamp = "" + self.canonical_uri = "/stream-transcription-websocket" + self.canonical_headers = "" + self.signed_headers = "host" + self.algorithm = "AWS4-HMAC-SHA256" + self.credential_scope = "" + self.canonical_querystring = "" + self.payload_hash = "" + self.canonical_request = "" + self.string_to_sign = "" + self.signature = "" + self.request_url = "" + + def get_request_url( + self, + sample_rate: int, + language_code: str = "", + media_encoding: str = "pcm", + vocabulary_name: str = "", + vocabulary_filter_name: str = "", + show_speaker_label: bool = False, + enable_channel_identification: bool = False, + number_of_channels: int = 1, + enable_partial_results_stabilization: bool = False, + partial_results_stability: str = "", + ) -> str: + self.endpoint = 
f"wss://transcribestreaming.{self.region}.amazonaws.com:8443" + self.host = f"transcribestreaming.{self.region}.amazonaws.com:8443" + + now = datetime.datetime.utcnow() + self.amz_date = now.strftime("%Y%m%dT%H%M%SZ") + self.datestamp = now.strftime("%Y%m%d") + self.canonical_headers = f"host:{self.host}\n" + self.credential_scope = f"{self.datestamp}%2F{self.region}%2F{self.service}%2Faws4_request" + + # Create canonical querystring + self.canonical_querystring = "X-Amz-Algorithm=" + self.algorithm + self.canonical_querystring += ( + "&X-Amz-Credential=" + self.access_key + "%2F" + self.credential_scope + ) + self.canonical_querystring += "&X-Amz-Date=" + self.amz_date + self.canonical_querystring += "&X-Amz-Expires=300" + if self.session_token: + self.canonical_querystring += "&X-Amz-Security-Token=" + urllib.parse.quote( + self.session_token, safe="" + ) + self.canonical_querystring += "&X-Amz-SignedHeaders=" + self.signed_headers + + if enable_channel_identification: + self.canonical_querystring += "&enable-channel-identification=true" + if enable_partial_results_stabilization: + self.canonical_querystring += "&enable-partial-results-stabilization=true" + if language_code: + self.canonical_querystring += "&language-code=" + language_code + if media_encoding: + self.canonical_querystring += "&media-encoding=" + media_encoding + if number_of_channels > 1: + self.canonical_querystring += "&number-of-channels=" + str(number_of_channels) + if partial_results_stability: + self.canonical_querystring += "&partial-results-stability=" + partial_results_stability + if sample_rate: + self.canonical_querystring += "&sample-rate=" + str(sample_rate) + if show_speaker_label: + self.canonical_querystring += "&show-speaker-label=true" + if vocabulary_filter_name: + self.canonical_querystring += "&vocabulary-filter-name=" + vocabulary_filter_name + if vocabulary_name: + self.canonical_querystring += "&vocabulary-name=" + vocabulary_name + + # Create payload hash + 
self.payload_hash = hashlib.sha256("".encode("utf-8")).hexdigest() + + # Create canonical request + self.canonical_request = f"{self.method}\n{self.canonical_uri}\n{self.canonical_querystring}\n{self.canonical_headers}\n{self.signed_headers}\n{self.payload_hash}" + + # Create string to sign + credential_scope = f"{self.datestamp}/{self.region}/{self.service}/aws4_request" + string_to_sign = ( + f"{self.algorithm}\n{self.amz_date}\n{credential_scope}\n" + + hashlib.sha256(self.canonical_request.encode("utf-8")).hexdigest() + ) + + # Calculate signature + k_date = hmac.new( + f"AWS4{self.secret_key}".encode("utf-8"), self.datestamp.encode("utf-8"), hashlib.sha256 + ).digest() + k_region = hmac.new(k_date, self.region.encode("utf-8"), hashlib.sha256).digest() + k_service = hmac.new(k_region, self.service.encode("utf-8"), hashlib.sha256).digest() + k_signing = hmac.new(k_service, b"aws4_request", hashlib.sha256).digest() + self.signature = hmac.new( + k_signing, string_to_sign.encode("utf-8"), hashlib.sha256 + ).hexdigest() + + # Add signature to query string + self.canonical_querystring += "&X-Amz-Signature=" + self.signature + + # Create request URL + self.request_url = self.endpoint + self.canonical_uri + "?" + self.canonical_querystring + return self.request_url + + +def get_headers(header_name: str, header_value: str) -> bytearray: + """Build a header following AWS event stream format.""" + name = header_name.encode("utf-8") + name_byte_length = bytes([len(name)]) + value_type = bytes([7]) # 7 represents a string + value = header_value.encode("utf-8") + value_byte_length = struct.pack(">H", len(value)) + + # Construct the header + header_list = bytearray() + header_list.extend(name_byte_length) + header_list.extend(name) + header_list.extend(value_type) + header_list.extend(value_byte_length) + header_list.extend(value) + return header_list + + +def build_event_message(payload: bytes) -> bytes: + """ + Build an event message for AWS Transcribe streaming. 
+ Matches AWS sample: https://github.com/aws-samples/amazon-transcribe-streaming-python-websockets/blob/main/eventstream.py + """ + # Build headers + content_type_header = get_headers(":content-type", "application/octet-stream") + event_type_header = get_headers(":event-type", "AudioEvent") + message_type_header = get_headers(":message-type", "event") + + headers = bytearray() + headers.extend(content_type_header) + headers.extend(event_type_header) + headers.extend(message_type_header) + + # Calculate total byte length and headers byte length + # 16 accounts for 8 byte prelude, 2x 4 byte CRCs + total_byte_length = struct.pack(">I", len(headers) + len(payload) + 16) + headers_byte_length = struct.pack(">I", len(headers)) + + # Build the prelude + prelude = bytearray([0] * 8) + prelude[:4] = total_byte_length + prelude[4:] = headers_byte_length + + # Calculate checksum for prelude + prelude_crc = struct.pack(">I", binascii.crc32(prelude) & 0xFFFFFFFF) + + # Construct the message + message_as_list = bytearray() + message_as_list.extend(prelude) + message_as_list.extend(prelude_crc) + message_as_list.extend(headers) + message_as_list.extend(payload) + + # Calculate checksum for message + message = bytes(message_as_list) + message_crc = struct.pack(">I", binascii.crc32(message) & 0xFFFFFFFF) + + # Add message checksum + message_as_list.extend(message_crc) + + return bytes(message_as_list) + + +def decode_event(message): + # Extract the prelude, headers, payload and CRC + prelude = message[:8] + total_length, headers_length = struct.unpack(">II", prelude) + prelude_crc = struct.unpack(">I", message[8:12])[0] + headers = message[12 : 12 + headers_length] + payload = message[12 + headers_length : -4] + message_crc = struct.unpack(">I", message[-4:])[0] + + # Check the CRCs + assert prelude_crc == binascii.crc32(prelude) & 0xFFFFFFFF, "Prelude CRC check failed" + assert message_crc == binascii.crc32(message[:-4]) & 0xFFFFFFFF, "Message CRC check failed" + + # Parse the 
class TranscribeSTTService(STTService):
    """Speech-to-text service that streams audio to AWS Transcribe over a WebSocket.

    Audio is wrapped with ``build_event_message`` and sent over a connection
    opened against a presigned URL. A background task decodes the responses and
    pushes ``InterimTranscriptionFrame`` / ``TranscriptionFrame`` downstream.

    Args:
        api_key: AWS secret access key; falls back to ``AWS_SECRET_ACCESS_KEY``.
        aws_access_key_id: AWS access key id; falls back to ``AWS_ACCESS_KEY_ID``.
        aws_session_token: Session token; falls back to ``AWS_SESSION_TOKEN``.
        region: AWS region. ``AWS_REGION`` is only consulted when this is falsy.
        sample_rate: Audio sample rate in Hz. Anything other than 8000 or 16000
            is replaced by 16000 (a warning is logged).
        language: Language of the audio to transcribe.
        **kwargs: Forwarded to ``STTService``.
    """

    def __init__(
        self,
        *,
        api_key: Optional[str] = None,
        aws_access_key_id: Optional[str] = None,
        aws_session_token: Optional[str] = None,
        region: Optional[str] = "us-east-1",
        sample_rate: int = 16000,
        language: Language = Language.EN,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self._settings = {
            "sample_rate": sample_rate,
            "language": language,
            "media_encoding": "linear16",  # AWS expects raw PCM
            "number_of_channels": 1,
            "show_speaker_label": False,
            "enable_channel_identification": False,
        }

        # Validate sample rate - AWS Transcribe only supports 8000 Hz or 16000 Hz
        if sample_rate not in [8000, 16000]:
            logger.warning(
                f"AWS Transcribe only supports 8000 Hz or 16000 Hz sample rates. Converting from {sample_rate} Hz to 16000 Hz."
            )
            self._settings["sample_rate"] = 16000

        self._credentials = {
            "aws_access_key_id": aws_access_key_id or os.getenv("AWS_ACCESS_KEY_ID"),
            "aws_secret_access_key": api_key or os.getenv("AWS_SECRET_ACCESS_KEY"),
            "aws_session_token": aws_session_token or os.getenv("AWS_SESSION_TOKEN"),
            "region": region or os.getenv("AWS_REGION", "us-east-1"),
        }

        self._ws_client = None
        self._connection_lock = asyncio.Lock()
        # Kept for observers; True only while the lock in ``_connect`` is held.
        self._connecting = False
        self._receive_task = None

    def get_service_encoding(self, encoding: str) -> str:
        """Convert internal encoding format to AWS Transcribe format.

        Unknown encodings are returned unchanged.
        """
        encoding_map = {
            "linear16": "pcm",  # AWS expects "pcm" for 16-bit linear PCM
        }
        return encoding_map.get(encoding, encoding)

    async def start(self, frame: StartFrame):
        """Initialize the connection when the service starts.

        Tries to connect up to three times, sleeping one second between
        attempts.

        Raises:
            RuntimeError: If no open connection exists after the last attempt.
        """
        await super().start(frame)
        logger.info("Starting AWS Transcribe service...")
        retry_count = 0
        max_retries = 3

        while retry_count < max_retries:
            try:
                await self._connect()
                if self._ws_client and self._ws_client.open:
                    logger.info("Successfully established WebSocket connection")
                    return
                logger.warning("WebSocket connection not established after connect")
            except Exception as e:
                logger.error(f"Failed to connect (attempt {retry_count + 1}/{max_retries}): {e}")
            retry_count += 1
            if retry_count < max_retries:
                await asyncio.sleep(1)  # Wait before retrying

        raise RuntimeError("Failed to establish WebSocket connection after multiple attempts")

    async def run_stt(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        """Send one chunk of audio to AWS Transcribe.

        Transcriptions are not yielded here; they are pushed by the receive
        loop. This generator only yields ``ErrorFrame`` objects when sending or
        reconnecting fails.

        Args:
            frame: Raw ``bytes`` or an object exposing the audio as ``.audio``.
        """
        try:
            # Skip if no speech detected
            if hasattr(frame, "is_speech") and not frame.is_speech:
                logger.debug("Skipping non-speech frame")
                return

            # Ensure WebSocket is connected
            if not self._ws_client or not self._ws_client.open:
                logger.info("WebSocket not connected, attempting to reconnect...")
                try:
                    await self._connect()
                except Exception as e:
                    logger.error(f"Failed to reconnect: {e}")
                    yield ErrorFrame("Failed to reconnect to AWS Transcribe", fatal=False)
                    return

            # Get the audio data - if frame is bytes, use directly, otherwise get audio attribute
            audio_data = frame if isinstance(frame, bytes) else frame.audio

            # Format the audio data according to AWS event stream format
            event_message = build_event_message(audio_data)

            try:
                await self._ws_client.send(event_message)
                # NOTE(review): metrics are (re)started for every chunk sent,
                # not only the first one - confirm this is what is wanted.
                await self.start_processing_metrics()
                await self.start_ttfb_metrics()
            except websockets.exceptions.ConnectionClosed as e:
                logger.warning(f"Connection closed while sending: {e}")
                await self._disconnect()
                # Don't yield error here - we'll retry on next frame
            except Exception as e:
                logger.error(f"Error sending audio: {e}")
                yield ErrorFrame(f"AWS Transcribe error: {str(e)}", fatal=False)
                await self._disconnect()

        except Exception as e:
            logger.error(f"Error in run_stt: {e}")
            yield ErrorFrame(f"AWS Transcribe error: {str(e)}", fatal=False)
            await self._disconnect()

    def _is_connected(self) -> bool:
        """Return True when the socket is open and the receive task is still running."""
        return bool(
            self._ws_client
            and self._ws_client.open
            and self._receive_task
            and not self._receive_task.done()
        )

    async def _connect(self):
        """Connect to AWS Transcribe with connection state management.

        Connection attempts are serialized by ``_connection_lock``. The
        connected state is checked again once the lock is held, so a caller
        that waited behind another attempt reuses the connection that attempt
        created instead of tearing it down and reconnecting.

        Raises:
            ValueError: If the configured language has no AWS language code.
            Exception: Any error raised while opening the WebSocket (after the
                partially created connection has been cleaned up).
        """
        if self._is_connected():
            logger.debug("Already connected")
            return

        async with self._connection_lock:
            # Another task may have finished connecting while we waited.
            if self._is_connected():
                logger.debug("Already connected")
                return

            try:
                self._connecting = True
                logger.debug("Starting connection process...")

                if self._ws_client:
                    await self._disconnect()

                language_code = self.language_to_service_language(
                    Language(self._settings["language"])
                )
                if not language_code:
                    raise ValueError(f"Unsupported language: {self._settings['language']}")

                # Generate random websocket key
                websocket_key = "".join(
                    random.choices(
                        string.ascii_uppercase + string.ascii_lowercase + string.digits, k=20
                    )
                )

                # Add required headers
                extra_headers = {
                    "Origin": "https://localhost",
                    "Sec-WebSocket-Key": websocket_key,
                    "Sec-WebSocket-Version": "13",
                    "Connection": "keep-alive",
                }

                # Get presigned URL
                presigned_url = get_presigned_url(
                    region=self._credentials["region"],
                    credentials={
                        "access_key": self._credentials["aws_access_key_id"],
                        "secret_key": self._credentials["aws_secret_access_key"],
                        "session_token": self._credentials["aws_session_token"],
                    },
                    language_code=language_code,
                    media_encoding=self.get_service_encoding(
                        self._settings["media_encoding"]
                    ),  # Convert to AWS format
                    sample_rate=self._settings["sample_rate"],
                    number_of_channels=self._settings["number_of_channels"],
                    enable_partial_results_stabilization=True,
                    partial_results_stability="high",
                    show_speaker_label=self._settings["show_speaker_label"],
                    enable_channel_identification=self._settings["enable_channel_identification"],
                )

                logger.debug(f"Connecting to WebSocket with URL: {presigned_url[:100]}...")

                # Connect with the required headers and settings
                self._ws_client = await websockets.connect(
                    presigned_url,
                    extra_headers=extra_headers,
                    subprotocols=["mqtt"],
                    ping_interval=None,
                    ping_timeout=None,
                    compression=None,
                )
                logger.debug("WebSocket connected, starting receive task...")

                # Start receive task
                self._receive_task = asyncio.create_task(self._receive_loop())

                logger.info("Successfully connected to AWS Transcribe")

            except Exception as e:
                logger.error(f"Failed to connect to AWS Transcribe: {e}")
                await self._disconnect()
                raise

            finally:
                self._connecting = False

    async def _disconnect(self):
        """Disconnect from AWS Transcribe.

        Cancels the receive task, then closes the socket. Errors while closing
        are logged and swallowed so that cleanup always completes.
        """
        if self._receive_task:
            self._receive_task.cancel()
            try:
                await self._receive_task
            except asyncio.CancelledError:
                pass
            self._receive_task = None

        if self._ws_client:
            try:
                if self._ws_client.open:
                    # Send end-stream message
                    # NOTE(review): AWS documents an empty event-stream-encoded
                    # audio chunk as the end-of-stream signal; confirm whether
                    # this JSON text frame is honoured by the service.
                    end_stream = {"message-type": "event", "event": "end"}
                    await self._ws_client.send(json.dumps(end_stream))
                await self._ws_client.close()
            except Exception as e:
                logger.warning(f"Error closing WebSocket connection: {e}")
            finally:
                self._ws_client = None

    def language_to_service_language(self, language: Language) -> str | None:
        """Convert internal language enum to AWS Transcribe language code.

        Returns:
            The AWS language code, or ``None`` when the language is not mapped.
        """
        language_map = {
            Language.EN: "en-US",
            Language.ES: "es-US",
            Language.FR: "fr-FR",
            Language.DE: "de-DE",
            Language.IT: "it-IT",
            Language.PT: "pt-BR",
            Language.JA: "ja-JP",
            Language.KO: "ko-KR",
            Language.ZH: "zh-CN",
        }
        return language_map.get(language)

    async def _handle_transcript_event(self, payload):
        """Push a transcription frame for the first result of an ``event`` message."""
        results = payload.get("Transcript", {}).get("Results", [])
        if not results:
            return

        result = results[0]
        alternatives = result.get("Alternatives", [])
        if not alternatives:
            return

        transcript = alternatives[0].get("Transcript", "")
        if not transcript:
            return

        # A result without an explicit ``IsPartial`` is treated as partial.
        is_final = not result.get("IsPartial", True)

        await self.stop_ttfb_metrics()
        if is_final:
            await self.push_frame(
                TranscriptionFrame(
                    transcript,
                    "",
                    time_now_iso8601(),
                    self._settings["language"],
                )
            )
            await self.stop_processing_metrics()
        else:
            await self.push_frame(
                InterimTranscriptionFrame(
                    transcript,
                    "",
                    time_now_iso8601(),
                    self._settings["language"],
                )
            )

    async def _receive_loop(self):
        """Background task to receive and process messages from AWS Transcribe.

        The loop ends when the socket is closed, when any error occurs while
        handling a message, or when the task is cancelled.
        """
        try:
            logger.debug("Receive loop started")
            while True:
                if not self._ws_client or not self._ws_client.open:
                    logger.warning("WebSocket closed in receive loop")
                    break

                try:
                    response = await self._ws_client.recv()
                    headers, payload = decode_event(response)
                    message_type = headers.get(":message-type")

                    if message_type == "event":
                        await self._handle_transcript_event(payload)
                    elif message_type == "exception":
                        error_msg = payload.get("Message", "Unknown error")
                        logger.error(f"Exception from AWS: {error_msg}")
                        await self.push_frame(
                            ErrorFrame(f"AWS Transcribe error: {error_msg}", fatal=False)
                        )
                    else:
                        logger.debug(f"Other message type received: {headers}")
                        logger.debug(f"Payload: {payload}")

                except websockets.exceptions.ConnectionClosed as e:
                    logger.error(
                        f"WebSocket connection closed in receive loop with code {e.code}: {e.reason}"
                    )
                    break
                except Exception as e:
                    logger.error(f"Error in receive loop: {e}")
                    break

        except asyncio.CancelledError:
            logger.debug("Receive loop cancelled")
        except Exception as e:
            logger.error(f"Unexpected error in receive loop: {e}")
        finally:
            logger.debug("Receive loop ended")
class BedrockLLMAdapter(BaseLLMAdapter):
    """Adapter that renders standard tool schemas as Bedrock ``toolSpec`` entries."""

    @staticmethod
    def _to_bedrock_function_format(function: FunctionSchema) -> Dict[str, Any]:
        """Wrap one function schema in Bedrock's ``toolSpec`` envelope."""
        json_schema = {
            "type": "object",
            "properties": function.properties,
            "required": function.required,
        }
        tool_spec = {
            "name": function.name,
            "description": function.description,
            "inputSchema": {"json": json_schema},
        }
        return {"toolSpec": tool_spec}

    def to_provider_tools_format(self, tools_schema: ToolsSchema) -> List[Dict[str, Any]]:
        """Converts function schemas to Bedrock's function-calling format.

        :return: Bedrock formatted function call definition.
        """
        converted: List[Dict[str, Any]] = []
        for schema in tools_schema.standard_tools:
            converted.append(self._to_bedrock_function_format(schema))
        return converted
@dataclass
class BedrockContextAggregatorPair:
    """Pair of user and assistant context aggregators created together."""

    _user: "BedrockUserContextAggregator"
    _assistant: "BedrockAssistantContextAggregator"

    def user(self) -> "BedrockUserContextAggregator":
        """Return the user-side aggregator."""
        return self._user

    def assistant(self) -> "BedrockAssistantContextAggregator":
        """Return the assistant-side aggregator."""
        return self._assistant


class BedrockLLMService(LLMService):
    """This class implements inference with AWS Bedrock models including Amazon Nova and Anthropic Claude.

    Requires AWS credentials to be configured in the environment or through boto3 configuration.
    """

    class InputParams(BaseModel):
        max_tokens: Optional[int] = Field(default_factory=lambda: 4096, ge=1)
        temperature: Optional[float] = Field(default_factory=lambda: 0.7, ge=0.0, le=1.0)
        top_p: Optional[float] = Field(default_factory=lambda: 0.999, ge=0.0, le=1.0)
        stop_sequences: Optional[List[str]] = Field(default_factory=lambda: [])
        latency: Optional[str] = Field(default_factory=lambda: "standard")
        additional_model_request_fields: Optional[Dict[str, Any]] = Field(default_factory=dict)

    def __init__(
        self,
        *,
        aws_access_key: str,
        aws_secret_key: str,
        aws_session_token: Optional[str] = None,
        aws_region: str = "us-east-1",
        model: str,
        params: InputParams = InputParams(),
        client_config: Optional[Config] = None,
        **kwargs,
    ):
        """Create the boto3 ``bedrock-runtime`` client and store the inference settings.

        Args:
            aws_access_key: AWS access key id.
            aws_secret_key: AWS secret access key.
            aws_session_token: Optional AWS session token.
            aws_region: AWS region of the Bedrock endpoint.
            model: Bedrock model id; must contain ``anthropic.`` or ``amazon.``.
            params: Inference parameters.
            client_config: botocore configuration. When omitted, 5-minute
                connect/read timeouts and 3 retry attempts are used.
            **kwargs: Forwarded to ``LLMService``.

        Raises:
            ValueError: If the model id belongs to an unsupported provider.
        """
        super().__init__(**kwargs)

        # Initialize the Bedrock client
        if not client_config:
            client_config = Config(
                connect_timeout=300,  # 5 minutes
                read_timeout=300,  # 5 minutes
                retries={"max_attempts": 3},
            )
        session = boto3.Session(
            aws_access_key_id=aws_access_key,
            aws_secret_access_key=aws_secret_key,
            aws_session_token=aws_session_token,
            region_name=aws_region,
        )
        self._client = session.client(service_name="bedrock-runtime", config=client_config)

        self.set_model_name(model)
        self._settings = {
            "max_tokens": params.max_tokens,
            "temperature": params.temperature,
            "top_p": params.top_p,
            # Previously accepted by InputParams but never stored or sent.
            "stop_sequences": list(params.stop_sequences or []),
            "latency": params.latency,
            "additional_model_request_fields": params.additional_model_request_fields
            if isinstance(params.additional_model_request_fields, dict)
            else {},
        }

        # Determine model provider from model ID
        self.model_provider = self._get_model_provider(model)
        logger.info(f"Using AWS Bedrock model: {model} from provider: {self.model_provider}")

    def _get_model_provider(self, model: str) -> str:
        """Determine the model provider from the model ID.

        Raises:
            ValueError: If the id contains neither ``anthropic.`` nor ``amazon.``.
        """
        if "anthropic." in model:
            return "anthropic"
        elif "amazon." in model:
            return "amazon"
        else:
            raise ValueError(
                f"Unsupported model: {model}. Only Anthropic Claude and Amazon Nova model families are supported."
            )

    def can_generate_metrics(self) -> bool:
        """Report that this service emits metrics."""
        return True

    def create_context_aggregator(
        self,
        context: OpenAILLMContext,
        *,
        user_kwargs: Mapping[str, Any] = {},
        assistant_kwargs: Mapping[str, Any] = {},
    ) -> BedrockContextAggregatorPair:
        """Create an instance of BedrockContextAggregatorPair from an
        OpenAILLMContext. Constructor keyword arguments for both the user and
        assistant aggregators can be provided.

        Args:
            context (OpenAILLMContext): The LLM context.
            user_kwargs (Mapping[str, Any], optional): Additional keyword
                arguments for the user context aggregator constructor. Defaults
                to an empty mapping.
            assistant_kwargs (Mapping[str, Any], optional): Additional keyword
                arguments for the assistant context aggregator
                constructor. Defaults to an empty mapping.

        Returns:
            BedrockContextAggregatorPair: A pair of context aggregators, one
            for the user and one for the assistant, encapsulated in an
            BedrockContextAggregatorPair.
        """
        context.set_llm_adapter(self.get_llm_adapter())

        # Only convert contexts that are not already in Bedrock form. Running
        # the OpenAI conversion over Bedrock-format messages would drop every
        # content block that lacks ``"type": "text"``.
        if not isinstance(context, BedrockLLMContext):
            context = BedrockLLMContext.from_openai_context(context)

        user = BedrockUserContextAggregator(context, **user_kwargs)
        assistant = BedrockAssistantContextAggregator(context, **assistant_kwargs)
        return BedrockContextAggregatorPair(_user=user, _assistant=assistant)

    def _build_request_params(self, context: "BedrockLLMContext") -> Dict[str, Any]:
        """Assemble the keyword arguments for ``converse_stream`` from context and settings."""
        inference_config = {
            "maxTokens": self._settings["max_tokens"],
            "temperature": self._settings["temperature"],
            "topP": self._settings["top_p"],
        }
        # The parameters are Optional; unset values must not be sent as None.
        inference_config = {k: v for k, v in inference_config.items() if v is not None}
        stop_sequences = self._settings.get("stop_sequences")
        if stop_sequences:
            inference_config["stopSequences"] = list(stop_sequences)

        request_params = {
            "modelId": self.model_name,
            "messages": context.messages,
            "inferenceConfig": inference_config,
            "additionalModelRequestFields": self._settings["additional_model_request_fields"],
        }

        # Only send a system prompt that actually exists: the context default
        # is the NOT_GIVEN sentinel, which is not a valid request value.
        system = getattr(context, "system", None)
        if isinstance(system, str) and system:
            request_params["system"] = [{"text": system}]
        elif isinstance(system, list) and system:
            # Already a list of system content blocks.
            request_params["system"] = system

        # Add tools if present
        if context.tools:
            tool_config = {"tools": context.tools}
            tool_choice = context.tool_choice

            # "none" (or anything unrecognised) leaves toolChoice unset.
            if tool_choice:
                if tool_choice == "auto":
                    tool_config["toolChoice"] = {"auto": {}}
                elif isinstance(tool_choice, dict) and "function" in tool_choice:
                    tool_config["toolChoice"] = {
                        "tool": {"name": tool_choice["function"]["name"]}
                    }

            request_params["toolConfig"] = tool_config

        # Add performance config if latency is specified
        if self._settings["latency"] in ["standard", "optimized"]:
            request_params["performanceConfig"] = {"latency": self._settings["latency"]}

        return request_params

    async def _process_context(self, context: "BedrockLLMContext"):
        """Run one streamed completion for ``context`` and push the resulting frames.

        Text deltas are pushed as ``LLMTextFrame``. When the model stops with a
        tool-use request, the accumulated JSON arguments are parsed and passed
        to ``call_function``. Token usage is reported when the stream ends; if
        the task is cancelled, an estimate of the completion tokens is reported
        instead.
        """
        # Local import: botocore is already a dependency of this module
        # (``Config`` comes from it) and these names are only needed here.
        from botocore.exceptions import ConnectTimeoutError, ReadTimeoutError

        # Usage tracking
        prompt_tokens = 0
        completion_tokens = 0
        completion_tokens_estimate = 0
        use_completion_tokens_estimate = False

        try:
            await self.push_frame(LLMFullResponseStartFrame())
            await self.start_processing_metrics()
            await self.start_ttfb_metrics()

            request_params = self._build_request_params(context)
            logger.debug(f"Calling Bedrock model with: {request_params}")

            # boto3 is synchronous; run the call (and every read of the event
            # stream below) in a worker thread so the event loop keeps running.
            response = await asyncio.to_thread(self._client.converse_stream, **request_params)

            await self.stop_ttfb_metrics()

            # Process the streaming response
            tool_use_block = None
            json_accumulator = ""
            stream = iter(response["stream"])
            stream_end = object()  # sentinel: StopIteration cannot cross the thread boundary

            while True:
                event = await asyncio.to_thread(next, stream, stream_end)
                if event is stream_end:
                    break

                if "contentBlockDelta" in event:
                    delta = event["contentBlockDelta"]["delta"]
                    if "text" in delta:
                        await self.push_frame(LLMTextFrame(delta["text"]))
                        completion_tokens_estimate += self._estimate_tokens(delta["text"])
                    elif "toolUse" in delta and "input" in delta["toolUse"]:
                        # The input arrives as fragments of a JSON string;
                        # concatenate them verbatim and parse once complete.
                        fragment = delta["toolUse"]["input"]
                        json_accumulator += fragment
                        completion_tokens_estimate += self._estimate_tokens(fragment)

                elif "contentBlockStart" in event:
                    # Tool metadata is nested under the "start" member.
                    block_start = event["contentBlockStart"].get("start", {})
                    if "toolUse" in block_start:
                        tool_use_block = {
                            "id": block_start["toolUse"].get("toolUseId", ""),
                            "name": block_start["toolUse"].get("name", ""),
                        }
                        json_accumulator = ""

                elif "messageStop" in event:
                    stop_reason = event["messageStop"].get("stopReason")
                    if stop_reason == "tool_use" and tool_use_block:
                        try:
                            arguments = json.loads(json_accumulator) if json_accumulator else {}
                            await self.call_function(
                                context=context,
                                tool_call_id=tool_use_block["id"],
                                function_name=tool_use_block["name"],
                                arguments=arguments,
                            )
                        except json.JSONDecodeError:
                            logger.error(f"Failed to parse tool arguments: {json_accumulator}")

                # Usage is delivered inside the trailing "metadata" event.
                usage = event.get("metadata", {}).get("usage") or event.get("usage")
                if usage:
                    prompt_tokens += usage.get("inputTokens", 0)
                    completion_tokens += usage.get("outputTokens", 0)

        except asyncio.CancelledError:
            # If we're interrupted, we won't get a complete usage report. So set our flag to use the
            # token estimate. Then reraise the exception so all the processors running in this task
            # also get cancelled.
            use_completion_tokens_estimate = True
            raise
        except (ReadTimeoutError, ConnectTimeoutError, httpx.TimeoutException):
            # botocore raises its own timeout errors; the httpx one is kept for
            # callers that relied on the previous handler.
            await self._call_event_handler("on_completion_timeout")
        except Exception as e:
            logger.exception(f"{self} exception: {e}")
        finally:
            await self.stop_processing_metrics()
            await self.push_frame(LLMFullResponseEndFrame())
            comp_tokens = (
                completion_tokens
                if not use_completion_tokens_estimate
                else completion_tokens_estimate
            )
            await self._report_usage_metrics(
                prompt_tokens=prompt_tokens,
                completion_tokens=comp_tokens,
            )

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Turn context-carrying frames into a completion; forward everything else."""
        await super().process_frame(frame, direction)

        context = None
        if isinstance(frame, OpenAILLMContextFrame):
            context = BedrockLLMContext.upgrade_to_bedrock(frame.context)
        elif isinstance(frame, LLMMessagesFrame):
            context = BedrockLLMContext.from_messages(frame.messages)
        elif isinstance(frame, VisionImageRawFrame):
            # This is only useful in very simple pipelines because it creates
            # a new context. Generally we want a context manager to catch
            # UserImageRawFrames coming through the pipeline and add them
            # to the context.
            context = BedrockLLMContext.from_image_frame(frame)
        elif isinstance(frame, LLMUpdateSettingsFrame):
            await self._update_settings(frame.settings)
        else:
            await self.push_frame(frame, direction)

        if context:
            await self._process_context(context)

    def _estimate_tokens(self, text: str) -> int:
        """Roughly estimate tokens as 1.3 x the number of non-word-delimited pieces."""
        return int(len(re.split(r"[^\w]+", text)) * 1.3)

    async def _report_usage_metrics(
        self,
        prompt_tokens: int,
        completion_tokens: int,
    ):
        """Emit LLM usage metrics unless both counts are zero."""
        if prompt_tokens or completion_tokens:
            tokens = LLMTokenUsage(
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=prompt_tokens + completion_tokens,
            )
            await self.start_llm_usage_metrics(tokens)
class BedrockLLMContext(OpenAILLMContext):
    """LLM context whose messages are kept in the Bedrock Converse message format.

    The system prompt is stored separately in ``system`` (as text) instead of
    being a message, and content is a list of blocks such as ``{"text": ...}``,
    ``{"toolUse": ...}``, ``{"toolResult": ...}`` or ``{"image": ...}``.
    """

    def __init__(
        self,
        messages: Optional[List[dict]] = None,
        tools: Optional[List[dict]] = None,
        tool_choice: Optional[dict] = None,
        *,
        system: Union[str, NotGiven] = NOT_GIVEN,
    ):
        super().__init__(messages=messages, tools=tools, tool_choice=tool_choice)
        self.system = system

    @staticmethod
    def upgrade_to_bedrock(obj: OpenAILLMContext) -> "BedrockLLMContext":
        """Convert ``obj`` in place to a ``BedrockLLMContext`` and return it.

        A plain ``OpenAILLMContext`` is re-classed and its messages converted
        from the OpenAI format; an existing ``BedrockLLMContext`` only has its
        messages normalised.
        """
        logger.debug(f"Upgrading to Bedrock: {obj}")
        if isinstance(obj, OpenAILLMContext) and not isinstance(obj, BedrockLLMContext):
            obj.__class__ = BedrockLLMContext
            # ``__init__`` is bypassed by the class swap, so give the instance
            # the attribute every reader of the context expects to exist.
            if not hasattr(obj, "system"):
                obj.system = NOT_GIVEN
            obj._restructure_from_openai_messages()
        else:
            obj._restructure_from_bedrock_messages()
        return obj

    @classmethod
    def from_openai_context(cls, openai_context: OpenAILLMContext):
        """Build a new Bedrock context from the messages, tools and adapter of an OpenAI one."""
        self = cls(
            messages=openai_context.messages,
            tools=openai_context.tools,
            tool_choice=openai_context.tool_choice,
        )
        self.set_llm_adapter(openai_context.get_llm_adapter())
        self._restructure_from_openai_messages()
        return self

    @classmethod
    def from_messages(cls, messages: List[dict]) -> "BedrockLLMContext":
        """Build a context around ``messages`` without converting them."""
        return cls(messages=messages)

    @classmethod
    def from_image_frame(cls, frame: VisionImageRawFrame) -> "BedrockLLMContext":
        """Build a context holding a single user message made from an image frame."""
        context = cls()
        context.add_image_frame_message(
            format=frame.format, size=frame.size, image=frame.image, text=frame.text
        )
        return context

    def set_messages(self, messages: List):
        """Replace the stored messages in place; no format conversion is applied."""
        self._messages[:] = messages

    # convert a message in Bedrock format into one or more messages in OpenAI format
    def to_standard_messages(self, obj):
        """Convert Bedrock message format to standard structured format.

        Handles text content and function calls for both user and assistant messages.

        Args:
            obj: Message in Bedrock format:
                {
                    "role": "user/assistant",
                    "content": [{"text": str} | {"toolUse": {...}} | {"toolResult": {...}}]
                }

        Returns:
            List of messages in standard format:
            [
                {
                    "role": "user/assistant/tool",
                    "content": [{"type": "text", "text": str}]
                }
            ]
            An empty list is returned for roles or content shapes that are not
            handled.
        """
        role = obj.get("role")
        content = obj.get("content")

        if role == "assistant":
            if isinstance(content, str):
                return [{"role": role, "content": [{"type": "text", "text": content}]}]
            elif isinstance(content, list):
                text_items = []
                tool_items = []
                for item in content:
                    if "text" in item:
                        text_items.append({"type": "text", "text": item["text"]})
                    elif "toolUse" in item:
                        tool_use = item["toolUse"]
                        tool_items.append(
                            {
                                "type": "function",
                                "id": tool_use["toolUseId"],
                                "function": {
                                    "name": tool_use["name"],
                                    "arguments": json.dumps(tool_use["input"]),
                                },
                            }
                        )
                messages = []
                if text_items:
                    messages.append({"role": role, "content": text_items})
                if tool_items:
                    messages.append({"role": role, "tool_calls": tool_items})
                return messages
        elif role == "user":
            if isinstance(content, str):
                return [{"role": role, "content": [{"type": "text", "text": content}]}]
            elif isinstance(content, list):
                text_items = []
                tool_items = []
                for item in content:
                    if "text" in item:
                        text_items.append({"type": "text", "text": item["text"]})
                    elif "toolResult" in item:
                        tool_result = item["toolResult"]
                        # Extract content from toolResult; when several blocks
                        # are present the last text/json one wins.
                        result_content = ""
                        if isinstance(tool_result["content"], list):
                            for content_item in tool_result["content"]:
                                if "text" in content_item:
                                    result_content = content_item["text"]
                                elif "json" in content_item:
                                    result_content = json.dumps(content_item["json"])
                        else:
                            result_content = tool_result["content"]

                        tool_items.append(
                            {
                                "role": "tool",
                                "tool_call_id": tool_result["toolUseId"],
                                "content": result_content,
                            }
                        )
                messages = []
                if text_items:
                    messages.append({"role": role, "content": text_items})
                messages.extend(tool_items)
                return messages

        # Unhandled role or content shape: return a list, never None.
        return []

    def from_standard_message(self, message):
        """Convert standard format message to Bedrock format.

        Handles conversion of text content, tool calls, and tool results.
        Empty text content is converted to "(empty)".

        Args:
            message: Message in standard format:
                {
                    "role": "user/assistant/tool",
                    "content": str | [{"type": "text", ...}],
                    "tool_calls": [{"id": str, "function": {"name": str, "arguments": str}}]
                }

        Returns:
            Message in Bedrock format:
            {
                "role": "user/assistant",
                "content": [
                    {"text": str} |
                    {"toolUse": {"toolUseId": str, "name": str, "input": dict}} |
                    {"toolResult": {"toolUseId": str, "content": [...], "status": str}}
                ]
            }
        """
        if message["role"] == "tool":
            raw_content = message["content"]
            # Default to a text block; upgrade to a json block only when the
            # content is a string holding a parseable JSON object.
            tool_result_content = [{"text": raw_content}]
            if isinstance(raw_content, str):
                stripped = raw_content.strip()
                if stripped.startswith("{") and stripped.endswith("}"):
                    try:
                        tool_result_content = [{"json": json.loads(raw_content)}]
                    except json.JSONDecodeError:
                        tool_result_content = [{"text": raw_content}]

            return {
                "role": "user",
                "content": [
                    {
                        "toolResult": {
                            "toolUseId": message["tool_call_id"],
                            "content": tool_result_content,
                        },
                    },
                ],
            }

        if message.get("tool_calls"):
            ret = {"role": "assistant", "content": []}
            for tool_call in message["tool_calls"]:
                function = tool_call["function"]
                arguments = json.loads(function["arguments"])
                ret["content"].append(
                    {
                        "toolUse": {
                            "toolUseId": tool_call["id"],
                            "name": function["name"],
                            "input": arguments,
                        }
                    }
                )
            return ret

        # Handle text content
        content = message.get("content")
        if isinstance(content, str):
            return {"role": message["role"], "content": [{"text": content or "(empty)"}]}
        elif isinstance(content, list):
            new_content = []
            for item in content:
                if item.get("type", "") == "text":
                    text_content = item["text"] if item["text"] != "" else "(empty)"
                    new_content.append({"text": text_content})
            return {"role": message["role"], "content": new_content}

        return message

    def add_image_frame_message(
        self, *, format: str, size: tuple[int, int], image: bytes, text: Optional[str] = None
    ):
        """Append a user message holding a JPEG image and optional text.

        Args:
            format: Pixel format passed to ``PIL.Image.frombytes``.
            size: Image size passed to ``PIL.Image.frombytes``.
            image: Raw pixel data.
            text: Optional text block added after the image.
        """
        buffer = io.BytesIO()
        Image.frombytes(format, size, image).save(buffer, format="JPEG")

        # Image should be the first content block in the message. The Converse
        # API takes an ``image`` block with raw bytes; boto3 does the encoding.
        content = [{"image": {"format": "jpeg", "source": {"bytes": buffer.getvalue()}}}]
        if text:
            content.append({"text": text})
        self.add_message({"role": "user", "content": content})

    def add_message(self, message):
        """Append ``message``, merging it into the last message when the roles match.

        Errors are logged rather than raised.
        """
        try:
            # Bedrock requires that roles alternate. If this message's role is the same as the
            # last message, we should add this message's content to the last message.
            if self.messages and self.messages[-1]["role"] == message["role"]:
                last_message = self.messages[-1]
                # Plain strings become Bedrock text blocks before merging.
                if isinstance(last_message["content"], str):
                    last_message["content"] = [{"text": last_message["content"]}]
                if isinstance(message["content"], str):
                    message["content"] = [{"text": message["content"]}]
                last_message["content"].extend(message["content"])
            else:
                self.messages.append(message)
        except Exception as e:
            logger.error(f"Error adding message: {e}")

    @staticmethod
    def _system_text(content) -> str:
        """Return the text of a system message given as a string or a list of text blocks."""
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            texts = [
                block["text"] for block in content if isinstance(block, dict) and "text" in block
            ]
            if texts:
                return "\n".join(texts)
        return str(content)

    @staticmethod
    def _as_content_blocks(content) -> list:
        """Return ``content`` as a list of Bedrock blocks (lists are returned as-is)."""
        if isinstance(content, list):
            return content
        if isinstance(content, str):
            return [{"text": content}]
        return []

    def _restructure_from_bedrock_messages(self):
        """Restructure messages in Bedrock format by handling system messages,
        merging consecutive messages with the same role, and ensuring proper content formatting.
        """
        # Handle system message if present at the beginning
        if self.messages and self.messages[0]["role"] == "system":
            if len(self.messages) == 1:
                self.messages[0]["role"] = "user"
            else:
                self.system = self._system_text(self.messages.pop(0)["content"])

        # Ensure content is properly formatted
        for msg in self.messages:
            if isinstance(msg["content"], str):
                msg["content"] = [{"text": msg["content"]}]
            elif not msg["content"]:
                msg["content"] = [{"text": "(empty)"}]
            elif isinstance(msg["content"], list):
                for idx, item in enumerate(msg["content"]):
                    if isinstance(item, dict) and "text" in item and item["text"] == "":
                        item["text"] = "(empty)"
                    elif isinstance(item, str) and item == "":
                        msg["content"][idx] = {"text": "(empty)"}

        # Merge consecutive messages with the same role
        merged_messages = []
        for msg in self.messages:
            if merged_messages and merged_messages[-1]["role"] == msg["role"]:
                merged_messages[-1]["content"].extend(msg["content"])
            else:
                merged_messages.append(msg)

        self.messages.clear()
        self.messages.extend(merged_messages)

    def _restructure_from_openai_messages(self):
        """Convert OpenAI-format messages in place to the Bedrock format.

        Extracts a leading system message into ``system``, merges consecutive
        messages with the same role, and replaces empty content with an
        "(empty)" text block.
        """
        # first, map across self._messages calling self.from_standard_message(m) to modify messages in place
        try:
            self._messages[:] = [self.from_standard_message(m) for m in self._messages]
        except Exception as e:
            logger.error(f"Error mapping messages: {e}")

        # See if we should pull the system message out of our context.messages list. (For
        # compatibility with Open AI messages format.)
        if self.messages and self.messages[0]["role"] == "system":
            if len(self.messages) == 1:
                # If we have only have a system message in the list, all we can really do
                # without introducing too much magic is change the role to "user".
                self.messages[0]["role"] = "user"
            else:
                # If we have more than one message, we'll pull the system message out of the
                # list. By now its content is a list of blocks; keep plain text.
                self.system = self._system_text(self.messages.pop(0).get("content"))

        # Merge consecutive messages with the same role.
        i = 0
        while i < len(self.messages) - 1:
            current_message = self.messages[i]
            next_message = self.messages[i + 1]
            if current_message["role"] == next_message["role"]:
                merged = self._as_content_blocks(current_message.get("content"))
                merged.extend(self._as_content_blocks(next_message.get("content")))
                current_message["content"] = merged
                # Remove the next message from the list
                self.messages.pop(i + 1)
            else:
                i += 1

        # Avoid empty content in messages; content is always a list of blocks.
        for message in self.messages:
            content = message.get("content")
            if isinstance(content, str):
                message["content"] = [{"text": content or "(empty)"}]
            elif not content:
                message["content"] = [{"text": "(empty)"}]

    def get_messages_for_persistent_storage(self):
        """Return the stored messages with the system prompt re-inserted first."""
        messages = super().get_messages_for_persistent_storage()
        if self.system:
            messages.insert(0, {"role": "system", "content": self.system})
        return messages

    def get_messages_for_logging(self) -> str:
        """Return the messages as JSON with image bytes replaced by "..."."""
        msgs = []
        for message in self.messages:
            msg = copy.deepcopy(message)
            content = msg.get("content")
            if isinstance(content, list):
                for item in content:
                    if isinstance(item, dict) and item.get("image"):
                        # Raw bytes are large and not JSON-serializable.
                        item["image"].setdefault("source", {})["bytes"] = "..."
            msgs.append(msg)
        return json.dumps(msgs)
+ msgs.append(msg) + return json.dumps(msgs) + + +class BedrockUserContextAggregator(LLMUserContextAggregator): + pass + + +class BedrockAssistantContextAggregator(LLMAssistantContextAggregator): + async def handle_function_call_in_progress(self, frame: FunctionCallInProgressFrame): + # Format tool use according to Bedrock API + self._context.add_message( + { + "role": "assistant", + "content": [ + { + "toolUse": { + "toolUseId": frame.tool_call_id, + "name": frame.function_name, + "input": frame.arguments + } + } + ], + } + ) + self._context.add_message( + { + "role": "user", + "content": [ + { + "toolResult": { + "toolUseId": frame.tool_call_id, + "content": [ + { + "text": "IN_PROGRESS" + } + ], + } + } + ], + } + ) + + async def handle_function_call_result(self, frame: FunctionCallResultFrame): + if frame.result: + result = json.dumps(frame.result) + await self._update_function_call_result(frame.function_name, frame.tool_call_id, result) + else: + await self._update_function_call_result( + frame.function_name, frame.tool_call_id, "COMPLETED" + ) + + async def handle_function_call_cancel(self, frame: FunctionCallCancelFrame): + await self._update_function_call_result( + frame.function_name, frame.tool_call_id, "CANCELLED" + ) + + async def _update_function_call_result( + self, function_name: str, tool_call_id: str, result: Any + ): + for message in self._context.messages: + if message["role"] == "user": + for content in message["content"]: + if ( + isinstance(content, dict) + and content.get("toolResult") + and content["toolResult"]["toolUseId"] == tool_call_id + ): + content["toolResult"]["content"] = [{"text": result}] + + async def handle_user_image_frame(self, frame: UserImageRawFrame): + await self._update_function_call_result( + frame.request.function_name, frame.request.tool_call_id, "COMPLETED" + ) + self._context.add_image_frame_message( + format=frame.format, + size=frame.size, + image=frame.image, + text=frame.request.context, + ) + \ No newline at 
end of file From 88c9e08bd819f9768e234e0e9341388d86997e87 Mon Sep 17 00:00:00 2001 From: Adithya Suresh Date: Thu, 3 Apr 2025 11:27:17 +0000 Subject: [PATCH 17/97] Updated tools parsing logic --- src/pipecat/services/aws/llm.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/src/pipecat/services/aws/llm.py b/src/pipecat/services/aws/llm.py index 3b476e03b..2f762e9bd 100644 --- a/src/pipecat/services/aws/llm.py +++ b/src/pipecat/services/aws/llm.py @@ -210,7 +210,6 @@ class BedrockLLMService(LLMService): # Add tools if present if context.tools: - print(context.tools) tool_config = { "tools": context.tools } @@ -257,23 +256,22 @@ class BedrockLLMService(LLMService): completion_tokens_estimate += self._estimate_tokens(delta["text"]) elif "toolUse" in delta and "input" in delta["toolUse"]: # Handle partial JSON for tool use - json_str = json.dumps(delta["toolUse"]["input"]) - json_accumulator += json_str - completion_tokens_estimate += self._estimate_tokens(json_str) + json_accumulator += delta["toolUse"]["input"] + completion_tokens_estimate += self._estimate_tokens(delta["toolUse"]["input"]) # Handle tool use start elif "contentBlockStart" in event: - content_block = event["contentBlockStart"] - if content_block.get("type") == "toolUse": + content_block_start = event["contentBlockStart"]['start'] + if "toolUse" in content_block_start: tool_use_block = { - "id": content_block["toolUse"].get("toolUseId", ""), - "name": content_block["toolUse"].get("name", "") + "id": content_block_start["toolUse"].get("toolUseId", ""), + "name": content_block_start["toolUse"].get("name", "") } json_accumulator = "" # Handle message completion with tool use - elif "messageDelta" in event and "stopReason" in event["messageDelta"]: - if event["messageDelta"]["stopReason"] == "toolUse" and tool_use_block: + elif "messageStop" in event and "stopReason" in event["messageStop"]: + if event["messageStop"]["stopReason"] == "tool_use" and 
tool_use_block: try: arguments = json.loads(json_accumulator) if json_accumulator else {} await self.call_function( @@ -286,8 +284,8 @@ class BedrockLLMService(LLMService): logger.error(f"Failed to parse tool arguments: {json_accumulator}") # Handle usage metrics if available - if "usage" in event: - usage = event["usage"] + if "metadata" in event and "usage" in event["metadata"]: + usage = event["metadata"]["usage"] prompt_tokens += usage.get("inputTokens", 0) completion_tokens += usage.get("outputTokens", 0) @@ -516,7 +514,6 @@ class BedrockLLMContext(OpenAILLMContext): ] } """ - print(message) if message["role"] == "tool": # Try to parse the content as JSON if it looks like JSON try: @@ -623,9 +620,6 @@ class BedrockLLMContext(OpenAILLMContext): """Restructure messages in Bedrock format by handling system messages, merging consecutive messages with the same role, and ensuring proper content formatting. """ - - print(self.messages) - # Handle system message if present at the beginning if self.messages and self.messages[0]["role"] == "system": if len(self.messages) == 1: @@ -739,7 +733,7 @@ class BedrockAssistantContextAggregator(LLMAssistantContextAggregator): "toolUse": { "toolUseId": frame.tool_call_id, "name": frame.function_name, - "input": frame.arguments + "input": frame.arguments if frame.arguments else {} } } ], From 05ae8d3ffa11e07f8868e1e81c387ce4419dddfa Mon Sep 17 00:00:00 2001 From: Adithya Suresh Date: Fri, 4 Apr 2025 05:36:09 +0000 Subject: [PATCH 18/97] Removed OpenAI based context formatting --- src/pipecat/services/aws/llm.py | 580 ++++++++++++++++---------------- 1 file changed, 286 insertions(+), 294 deletions(-) diff --git a/src/pipecat/services/aws/llm.py b/src/pipecat/services/aws/llm.py index 2f762e9bd..cb21eccaa 100644 --- a/src/pipecat/services/aws/llm.py +++ b/src/pipecat/services/aws/llm.py @@ -46,16 +46,6 @@ from pipecat.processors.aggregators.openai_llm_context import ( from pipecat.processors.frame_processor import FrameDirection 
from pipecat.services.ai_services import LLMService -try: - from anthropic import NOT_GIVEN, NotGiven -except ModuleNotFoundError as e: - logger.error(f"Exception: {e}") - logger.error( - "In order to use Anthropic, you need to `pip install pipecat-ai[anthropic]`. " - + "Also, set `ANTHROPIC_API_KEY` environment variable." - ) - raise Exception(f"Missing module: {e}") - @dataclass class BedrockContextAggregatorPair: @@ -69,288 +59,6 @@ class BedrockContextAggregatorPair: return self._assistant -class BedrockLLMService(LLMService): - """This class implements inference with AWS Bedrock models including Amazon Nova and Anthropic Claude. - - Requires AWS credentials to be configured in the environment or through boto3 configuration. - """ - class InputParams(BaseModel): - max_tokens: Optional[int] = Field(default_factory=lambda: 4096, ge=1) - temperature: Optional[float] = Field(default_factory=lambda: 0.7, ge=0.0, le=1.0) - top_p: Optional[float] = Field(default_factory=lambda: 0.999, ge=0.0, le=1.0) - stop_sequences: Optional[List[str]] = Field(default_factory=lambda: []) - latency: Optional[str] = Field(default_factory=lambda: "standard") - additional_model_request_fields: Optional[Dict[str, Any]] = Field(default_factory=dict) - - def __init__( - self, - *, - aws_access_key: str, - aws_secret_key: str, - aws_session_token: Optional[str] = None, - aws_region: str = "us-east-1", - model: str, - params: InputParams = InputParams(), - client_config: Optional[Config] = None, - **kwargs, - ): - super().__init__(**kwargs) - - # Initialize the Bedrock client - if not client_config: - client_config = Config( - connect_timeout=300, # 5 minutes - read_timeout=300, # 5 minutes - retries={'max_attempts': 3} - ) - session = boto3.Session( - aws_access_key_id=aws_access_key, - aws_secret_access_key=aws_secret_key, - aws_session_token=aws_session_token, - region_name=aws_region - ) - self._client = session.client( - service_name='bedrock-runtime', - config=client_config - ) - - 
self.set_model_name(model) - self._settings = { - "max_tokens": params.max_tokens, - "temperature": params.temperature, - "top_p": params.top_p, - "latency": params.latency, - "additional_model_request_fields": params.additional_model_request_fields if isinstance(params.additional_model_request_fields, dict) else {}, - } - - # Determine model provider from model ID - self.model_provider = self._get_model_provider(model) - logger.info(f"Using AWS Bedrock model: {model} from provider: {self.model_provider}") - - def _get_model_provider(self, model: str) -> str: - """Determine the model provider from the model ID""" - if "anthropic." in model: - return "anthropic" - elif "amazon." in model: - return "amazon" - else: - raise ValueError(f"Unsupported model: {model}. Only Anthropic Claude and Amazon Nova model families are supported.") - - def can_generate_metrics(self) -> bool: - return True - - def create_context_aggregator( - self, - context: OpenAILLMContext, - *, - user_kwargs: Mapping[str, Any] = {}, - assistant_kwargs: Mapping[str, Any] = {}, - ) -> BedrockContextAggregatorPair: - """Create an instance of BedrockContextAggregatorPair from an - OpenAILLMContext. Constructor keyword arguments for both the user and - assistant aggregators can be provided. - - Args: - context (OpenAILLMContext): The LLM context. - user_kwargs (Mapping[str, Any], optional): Additional keyword - arguments for the user context aggregator constructor. Defaults - to an empty mapping. - assistant_kwargs (Mapping[str, Any], optional): Additional keyword - arguments for the assistant context aggregator - constructor. Defaults to an empty mapping. - - Returns: - BedrockContextAggregatorPair: A pair of context aggregators, one - for the user and one for the assistant, encapsulated in an - BedrockContextAggregatorPair. 
- """ - context.set_llm_adapter(self.get_llm_adapter()) - - if isinstance(context, OpenAILLMContext): - context = BedrockLLMContext.from_openai_context(context) - - user = BedrockUserContextAggregator(context, **user_kwargs) - assistant = BedrockAssistantContextAggregator(context, **assistant_kwargs) - return BedrockContextAggregatorPair(_user=user, _assistant=assistant) - - async def _process_context(self, context: "BedrockLLMContext"): - # Usage tracking - prompt_tokens = 0 - completion_tokens = 0 - completion_tokens_estimate = 0 - use_completion_tokens_estimate = False - - try: - await self.push_frame(LLMFullResponseStartFrame()) - await self.start_processing_metrics() - - # logger.debug( - # f"{self}: Generating chat with Bedrock model {self.model_name} | [{context.get_messages_for_logging()}]" - # ) - - await self.start_ttfb_metrics() - - # Set up inference config - inference_config = { - "maxTokens": self._settings["max_tokens"], - "temperature": self._settings["temperature"], - "topP": self._settings["top_p"], - } - - # Prepare request parameters - request_params = { - "modelId": self.model_name, - "messages": context.messages, - "inferenceConfig": inference_config, - "additionalModelRequestFields": self._settings["additional_model_request_fields"] - } - - # Add system message - request_params["system"] = [{"text": context.system}] - - # Add tools if present - if context.tools: - tool_config = { - "tools": context.tools - } - - # Add tool_choice if specified - if context.tool_choice: - if context.tool_choice == "auto": - tool_config["toolChoice"] = {"auto": {}} - elif context.tool_choice == "none": - # Skip adding toolChoice for "none" - pass - elif isinstance(context.tool_choice, dict) and "function" in context.tool_choice: - tool_config["toolChoice"] = { - "tool": { - "name": context.tool_choice["function"]["name"] - } - } - - request_params["toolConfig"] = tool_config - - # Add performance config if latency is specified - if self._settings["latency"] in 
["standard", "optimized"]: - request_params["performanceConfig"] = { - "latency": self._settings["latency"] - } - - logger.debug(f"Calling Bedrock model with: {request_params}") - - # Call Bedrock with streaming - response = self._client.converse_stream(**request_params) - - await self.stop_ttfb_metrics() - - # Process the streaming response - tool_use_block = None - json_accumulator = "" - - for event in response["stream"]: - # Handle text content - if "contentBlockDelta" in event: - delta = event["contentBlockDelta"]["delta"] - if "text" in delta: - await self.push_frame(LLMTextFrame(delta["text"])) - completion_tokens_estimate += self._estimate_tokens(delta["text"]) - elif "toolUse" in delta and "input" in delta["toolUse"]: - # Handle partial JSON for tool use - json_accumulator += delta["toolUse"]["input"] - completion_tokens_estimate += self._estimate_tokens(delta["toolUse"]["input"]) - - # Handle tool use start - elif "contentBlockStart" in event: - content_block_start = event["contentBlockStart"]['start'] - if "toolUse" in content_block_start: - tool_use_block = { - "id": content_block_start["toolUse"].get("toolUseId", ""), - "name": content_block_start["toolUse"].get("name", "") - } - json_accumulator = "" - - # Handle message completion with tool use - elif "messageStop" in event and "stopReason" in event["messageStop"]: - if event["messageStop"]["stopReason"] == "tool_use" and tool_use_block: - try: - arguments = json.loads(json_accumulator) if json_accumulator else {} - await self.call_function( - context=context, - tool_call_id=tool_use_block["id"], - function_name=tool_use_block["name"], - arguments=arguments, - ) - except json.JSONDecodeError: - logger.error(f"Failed to parse tool arguments: {json_accumulator}") - - # Handle usage metrics if available - if "metadata" in event and "usage" in event["metadata"]: - usage = event["metadata"]["usage"] - prompt_tokens += usage.get("inputTokens", 0) - completion_tokens += usage.get("outputTokens", 0) - - 
except asyncio.CancelledError: - # If we're interrupted, we won't get a complete usage report. So set our flag to use the - # token estimate. The reraise the exception so all the processors running in this task - # also get cancelled. - use_completion_tokens_estimate = True - raise - except httpx.TimeoutException: - await self._call_event_handler("on_completion_timeout") - except Exception as e: - logger.exception(f"{self} exception: {e}") - finally: - await self.stop_processing_metrics() - await self.push_frame(LLMFullResponseEndFrame()) - comp_tokens = ( - completion_tokens - if not use_completion_tokens_estimate - else completion_tokens_estimate - ) - await self._report_usage_metrics( - prompt_tokens=prompt_tokens, - completion_tokens=comp_tokens, - ) - - async def process_frame(self, frame: Frame, direction: FrameDirection): - await super().process_frame(frame, direction) - - context = None - if isinstance(frame, OpenAILLMContextFrame): - context = BedrockLLMContext.upgrade_to_bedrock(frame.context) - elif isinstance(frame, LLMMessagesFrame): - context = BedrockLLMContext.from_messages(frame.messages) - elif isinstance(frame, VisionImageRawFrame): - # This is only useful in very simple pipelines because it creates - # a new context. Generally we want a context manager to catch - # UserImageRawFrames coming through the pipeline and add them - # to the context. 
- context = BedrockLLMContext.from_image_frame(frame) - elif isinstance(frame, LLMUpdateSettingsFrame): - await self._update_settings(frame.settings) - else: - await self.push_frame(frame, direction) - - if context: - await self._process_context(context) - - def _estimate_tokens(self, text: str) -> int: - return int(len(re.split(r"[^\w]+", text)) * 1.3) - - async def _report_usage_metrics( - self, - prompt_tokens: int, - completion_tokens: int, - ): - if prompt_tokens or completion_tokens: - tokens = LLMTokenUsage( - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=prompt_tokens + completion_tokens, - ) - await self.start_llm_usage_metrics(tokens) - - class BedrockLLMContext(OpenAILLMContext): def __init__( self, @@ -358,7 +66,7 @@ class BedrockLLMContext(OpenAILLMContext): tools: Optional[List[dict]] = None, tool_choice: Optional[dict] = None, *, - system: Union[str, NotGiven] = NOT_GIVEN, + system: Optional[str] = None, ): super().__init__(messages=messages, tools=tools, tool_choice=tool_choice) self.system = system @@ -375,6 +83,7 @@ class BedrockLLMContext(OpenAILLMContext): @classmethod def from_openai_context(cls, openai_context: OpenAILLMContext): + logger.debug("from_openai_context called") self = cls( messages=openai_context.messages, tools=openai_context.tools, @@ -621,6 +330,7 @@ class BedrockLLMContext(OpenAILLMContext): merging consecutive messages with the same role, and ensuring proper content formatting. 
""" # Handle system message if present at the beginning + logger.debug(f"_restructure_from_bedrock_messages: {self.messages}") if self.messages and self.messages[0]["role"] == "system": if len(self.messages) == 1: self.messages[0]["role"] = "user" @@ -653,6 +363,7 @@ class BedrockLLMContext(OpenAILLMContext): self.messages.extend(merged_messages) def _restructure_from_openai_messages(self): + logger.debug(f"_restructure_from_openai_messages: {self.messages}") # first, map across self._messages calling self.from_standard_message(m) to modify messages in place try: self._messages[:] = [self.from_standard_message(m) for m in self._messages] @@ -794,4 +505,285 @@ class BedrockAssistantContextAggregator(LLMAssistantContextAggregator): image=frame.image, text=frame.request.context, ) - \ No newline at end of file + + +class BedrockLLMService(LLMService): + """This class implements inference with AWS Bedrock models including Amazon Nova and Anthropic Claude. + + Requires AWS credentials to be configured in the environment or through boto3 configuration. 
+ """ + class InputParams(BaseModel): + max_tokens: Optional[int] = Field(default_factory=lambda: 4096, ge=1) + temperature: Optional[float] = Field(default_factory=lambda: 0.7, ge=0.0, le=1.0) + top_p: Optional[float] = Field(default_factory=lambda: 0.999, ge=0.0, le=1.0) + stop_sequences: Optional[List[str]] = Field(default_factory=lambda: []) + latency: Optional[str] = Field(default_factory=lambda: "standard") + additional_model_request_fields: Optional[Dict[str, Any]] = Field(default_factory=dict) + + def __init__( + self, + *, + aws_access_key: Optional[str] = None, + aws_secret_key: Optional[str] = None, + aws_session_token: Optional[str] = None, + aws_region: str = "us-east-1", + model: str, + params: InputParams = InputParams(), + client_config: Optional[Config] = None, + **kwargs, + ): + super().__init__(**kwargs) + + # Initialize the Bedrock client + if not client_config: + client_config = Config( + connect_timeout=300, # 5 minutes + read_timeout=300, # 5 minutes + retries={'max_attempts': 3} + ) + session = boto3.Session( + aws_access_key_id=aws_access_key, + aws_secret_access_key=aws_secret_key, + aws_session_token=aws_session_token, + region_name=aws_region + ) + self._client = session.client( + service_name='bedrock-runtime', + config=client_config + ) + + self.set_model_name(model) + self._settings = { + "max_tokens": params.max_tokens, + "temperature": params.temperature, + "top_p": params.top_p, + "latency": params.latency, + "additional_model_request_fields": params.additional_model_request_fields if isinstance(params.additional_model_request_fields, dict) else {}, + } + + # Determine model provider from model ID + self.model_provider = self._get_model_provider(model) + logger.info(f"Using AWS Bedrock model: {model} from provider: {self.model_provider}") + + def _get_model_provider(self, model: str) -> str: + """Determine the model provider from the model ID""" + if "anthropic." in model: + return "anthropic" + elif "amazon." 
in model: + return "amazon" + else: + raise ValueError(f"Unsupported model: {model}. Only Anthropic Claude and Amazon Nova model families are supported.") + + def can_generate_metrics(self) -> bool: + return True + + def create_context_aggregator( + self, + context: BedrockLLMContext, + *, + user_kwargs: Mapping[str, Any] = {}, + assistant_kwargs: Mapping[str, Any] = {}, + ) -> BedrockContextAggregatorPair: + """Create an instance of BedrockContextAggregatorPair from an + OpenAILLMContext. Constructor keyword arguments for both the user and + assistant aggregators can be provided. + + Args: + context (OpenAILLMContext): The LLM context. + user_kwargs (Mapping[str, Any], optional): Additional keyword + arguments for the user context aggregator constructor. Defaults + to an empty mapping. + assistant_kwargs (Mapping[str, Any], optional): Additional keyword + arguments for the assistant context aggregator + constructor. Defaults to an empty mapping. + + Returns: + BedrockContextAggregatorPair: A pair of context aggregators, one + for the user and one for the assistant, encapsulated in an + BedrockContextAggregatorPair. 
+ """ + context.set_llm_adapter(self.get_llm_adapter()) + + if isinstance(context, OpenAILLMContext) and not isinstance(context, BedrockLLMContext): + context = BedrockLLMContext.from_openai_context(context) + + user = BedrockUserContextAggregator(context, **user_kwargs) + assistant = BedrockAssistantContextAggregator(context, **assistant_kwargs) + return BedrockContextAggregatorPair(_user=user, _assistant=assistant) + + async def _process_context(self, context: "BedrockLLMContext"): + # Usage tracking + prompt_tokens = 0 + completion_tokens = 0 + completion_tokens_estimate = 0 + use_completion_tokens_estimate = False + + try: + await self.push_frame(LLMFullResponseStartFrame()) + await self.start_processing_metrics() + + # logger.debug( + # f"{self}: Generating chat with Bedrock model {self.model_name} | [{context.get_messages_for_logging()}]" + # ) + + await self.start_ttfb_metrics() + + # Set up inference config + inference_config = { + "maxTokens": self._settings["max_tokens"], + "temperature": self._settings["temperature"], + "topP": self._settings["top_p"], + } + + # Prepare request parameters + request_params = { + "modelId": self.model_name, + "messages": context.messages, + "inferenceConfig": inference_config, + "additionalModelRequestFields": self._settings["additional_model_request_fields"] + } + + # Add system message + request_params["system"] = [{"text": context.system}] + + # Add tools if present + if context.tools: + tool_config = { + "tools": context.tools + } + + # Add tool_choice if specified + if context.tool_choice: + if context.tool_choice == "auto": + tool_config["toolChoice"] = {"auto": {}} + elif context.tool_choice == "none": + # Skip adding toolChoice for "none" + pass + elif isinstance(context.tool_choice, dict) and "function" in context.tool_choice: + tool_config["toolChoice"] = { + "tool": { + "name": context.tool_choice["function"]["name"] + } + } + + request_params["toolConfig"] = tool_config + + # Add performance config if latency 
is specified + if self._settings["latency"] in ["standard", "optimized"]: + request_params["performanceConfig"] = { + "latency": self._settings["latency"] + } + + logger.debug(f"Calling Bedrock model with: {request_params}") + + # Call Bedrock with streaming + response = self._client.converse_stream(**request_params) + + await self.stop_ttfb_metrics() + + # Process the streaming response + tool_use_block = None + json_accumulator = "" + + for event in response["stream"]: + # Handle text content + if "contentBlockDelta" in event: + delta = event["contentBlockDelta"]["delta"] + if "text" in delta: + await self.push_frame(LLMTextFrame(delta["text"])) + completion_tokens_estimate += self._estimate_tokens(delta["text"]) + elif "toolUse" in delta and "input" in delta["toolUse"]: + # Handle partial JSON for tool use + json_accumulator += delta["toolUse"]["input"] + completion_tokens_estimate += self._estimate_tokens(delta["toolUse"]["input"]) + + # Handle tool use start + elif "contentBlockStart" in event: + content_block_start = event["contentBlockStart"]['start'] + if "toolUse" in content_block_start: + tool_use_block = { + "id": content_block_start["toolUse"].get("toolUseId", ""), + "name": content_block_start["toolUse"].get("name", "") + } + json_accumulator = "" + + # Handle message completion with tool use + elif "messageStop" in event and "stopReason" in event["messageStop"]: + if event["messageStop"]["stopReason"] == "tool_use" and tool_use_block: + try: + arguments = json.loads(json_accumulator) if json_accumulator else {} + await self.call_function( + context=context, + tool_call_id=tool_use_block["id"], + function_name=tool_use_block["name"], + arguments=arguments, + ) + except json.JSONDecodeError: + logger.error(f"Failed to parse tool arguments: {json_accumulator}") + + # Handle usage metrics if available + if "metadata" in event and "usage" in event["metadata"]: + usage = event["metadata"]["usage"] + prompt_tokens += usage.get("inputTokens", 0) + 
completion_tokens += usage.get("outputTokens", 0) + + except asyncio.CancelledError: + # If we're interrupted, we won't get a complete usage report. So set our flag to use the + # token estimate. The reraise the exception so all the processors running in this task + # also get cancelled. + use_completion_tokens_estimate = True + raise + except httpx.TimeoutException: + await self._call_event_handler("on_completion_timeout") + except Exception as e: + logger.exception(f"{self} exception: {e}") + finally: + await self.stop_processing_metrics() + await self.push_frame(LLMFullResponseEndFrame()) + comp_tokens = ( + completion_tokens + if not use_completion_tokens_estimate + else completion_tokens_estimate + ) + await self._report_usage_metrics( + prompt_tokens=prompt_tokens, + completion_tokens=comp_tokens, + ) + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + context = None + if isinstance(frame, OpenAILLMContextFrame): + context = BedrockLLMContext.upgrade_to_bedrock(frame.context) + elif isinstance(frame, LLMMessagesFrame): + context = BedrockLLMContext.from_messages(frame.messages) + elif isinstance(frame, VisionImageRawFrame): + # This is only useful in very simple pipelines because it creates + # a new context. Generally we want a context manager to catch + # UserImageRawFrames coming through the pipeline and add them + # to the context. 
+ context = BedrockLLMContext.from_image_frame(frame) + elif isinstance(frame, LLMUpdateSettingsFrame): + await self._update_settings(frame.settings) + else: + await self.push_frame(frame, direction) + + if context: + await self._process_context(context) + + def _estimate_tokens(self, text: str) -> int: + return int(len(re.split(r"[^\w]+", text)) * 1.3) + + async def _report_usage_metrics( + self, + prompt_tokens: int, + completion_tokens: int, + ): + if prompt_tokens or completion_tokens: + tokens = LLMTokenUsage( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, + ) + await self.start_llm_usage_metrics(tokens) From f014f718eb2fce7c5ebbd67f88c777e2d553f3aa Mon Sep 17 00:00:00 2001 From: Adithya Suresh Date: Fri, 4 Apr 2025 05:39:08 +0000 Subject: [PATCH 19/97] Restructured STT and enabled prosody tags for generative Polly --- ...ible-polly.py => 07m-interruptible-aws.py} | 43 +- src/pipecat/services/aws/stt.py | 600 +++++++++++++++++ src/pipecat/services/aws/tts.py | 612 +----------------- 3 files changed, 638 insertions(+), 617 deletions(-) rename examples/foundational/{07m-interruptible-polly.py => 07m-interruptible-aws.py} (70%) create mode 100644 src/pipecat/services/aws/stt.py diff --git a/examples/foundational/07m-interruptible-polly.py b/examples/foundational/07m-interruptible-aws.py similarity index 70% rename from examples/foundational/07m-interruptible-polly.py rename to examples/foundational/07m-interruptible-aws.py index b3bd08061..d1fae6b5e 100644 --- a/examples/foundational/07m-interruptible-polly.py +++ b/examples/foundational/07m-interruptible-aws.py @@ -5,7 +5,6 @@ # import argparse -import os from dotenv import load_dotenv from loguru import logger @@ -14,13 +13,13 @@ from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, 
PipelineTask -from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext -from pipecat.services.aws import PollyTTSService, TranscribeSTTService -from pipecat.services.openai import OpenAILLMService from pipecat.transcriptions.language import Language from pipecat.transports.base_transport import TransportParams from pipecat.transports.network.small_webrtc import SmallWebRTCTransport from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection +from pipecat.services.aws.llm import BedrockLLMService, BedrockLLMContext +from pipecat.services.aws.stt import TranscribeSTTService +from pipecat.services.aws.tts import PollyTTSService load_dotenv(override=True) @@ -43,20 +42,30 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac region="us-west-2", # only specific regions support generative TTS voice_id="Joanna", params=PollyTTSService.InputParams( - engine="generative", language=Language.EN_GB, rate="1.05" + engine="generative", + language=Language.EN_US, + rate="1.1" ), ) - llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY")) + llm = BedrockLLMService( + aws_region="us-west-2", + model="us.anthropic.claude-3-5-haiku-20241022-v1:0", + params=BedrockLLMService.InputParams( + temperature=0.8, + latency="optimized" + ) + ) messages = [ - { - "role": "system", - "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.", - }, - ] + { + "role": "system", + "content": [{"text": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. 
Respond to what the user said in a creative and helpful way."}], + }, + ] + ) - context = OpenAILLMContext(messages) + context = BedrockLLMContext(messages) context_aggregator = llm.create_context_aggregator(context) pipeline = Pipeline( @@ -68,8 +77,8 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac tts, # TTS transport.output(), # Transport bot output context_aggregator.assistant(), # Assistant spoken responses - ] - ) + ] + ) task = PipelineTask( pipeline, @@ -85,16 +94,12 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac async def on_client_connected(transport, client): logger.info(f"Client connected") # Kick off the conversation. - messages.append({"role": "system", "content": "Please introduce yourself to the user."}) + messages.append({"role": "user", "content": [{"text": "Please introduce yourself to the user."}]}) await task.queue_frames([context_aggregator.user().get_context_frame()]) @transport.event_handler("on_client_disconnected") async def on_client_disconnected(transport, client): logger.info(f"Client disconnected") - - @transport.event_handler("on_client_closed") - async def on_client_closed(transport, client): - logger.info(f"Client closed connection") await task.cancel() runner = PipelineRunner(handle_sigint=False) diff --git a/src/pipecat/services/aws/stt.py b/src/pipecat/services/aws/stt.py new file mode 100644 index 000000000..08d74d484 --- /dev/null +++ b/src/pipecat/services/aws/stt.py @@ -0,0 +1,600 @@ +import asyncio +from typing import AsyncGenerator, Optional, Dict +import os +import datetime +from urllib.parse import urlencode +import json +import struct +import urllib.parse +import hashlib +import hmac +import random +import string +import binascii + +from loguru import logger + +from pipecat.frames.frames import ( + ErrorFrame, + Frame, + TranscriptionFrame, + InterimTranscriptionFrame, + StartFrame +) +from pipecat.services.ai_services import STTService +from 
pipecat.transcriptions.language import Language +from pipecat.utils.time import time_now_iso8601 + +try: + import boto3 + from botocore.exceptions import BotoCoreError, ClientError + import websockets +except ModuleNotFoundError as e: + logger.error(f"Exception: {e}") + logger.error( + "In order to use AWS services, you need to `pip install pipecat-ai[aws]`. Also, remember to set `AWS_SECRET_ACCESS_KEY`, `AWS_ACCESS_KEY_ID`, and `AWS_REGION` environment variable." + ) + raise Exception(f"Missing module: {e}") + + +def get_presigned_url( + *, + region: str, + credentials: Dict[str, Optional[str]], + language_code: str, + media_encoding: str = "pcm", + sample_rate: int = 16000, + number_of_channels: int = 1, + enable_partial_results_stabilization: bool = True, + partial_results_stability: str = "high", + vocabulary_name: Optional[str] = None, + vocabulary_filter_name: Optional[str] = None, + show_speaker_label: bool = False, + enable_channel_identification: bool = False, +) -> str: + """Create a presigned URL for AWS Transcribe streaming.""" + access_key = credentials.get("access_key") + secret_key = credentials.get("secret_key") + session_token = credentials.get("session_token") + + if not access_key or not secret_key: + raise ValueError("AWS credentials are required") + + # Initialize the URL generator + url_generator = AWSTranscribePresignedURL( + access_key=access_key, secret_key=secret_key, session_token=session_token, region=region + ) + + # Get the presigned URL + return url_generator.get_request_url( + sample_rate=sample_rate, + language_code=language_code, + media_encoding=media_encoding, + vocabulary_name=vocabulary_name, + vocabulary_filter_name=vocabulary_filter_name, + show_speaker_label=show_speaker_label, + enable_channel_identification=enable_channel_identification, + number_of_channels=number_of_channels, + enable_partial_results_stabilization=enable_partial_results_stabilization, + partial_results_stability=partial_results_stability, + ) + + +class 
AWSTranscribePresignedURL: + def __init__( + self, access_key: str, secret_key: str, session_token: str, region: str = "us-east-1" + ): + self.access_key = access_key + self.secret_key = secret_key + self.session_token = session_token + self.method = "GET" + self.service = "transcribe" + self.region = region + self.endpoint = "" + self.host = "" + self.amz_date = "" + self.datestamp = "" + self.canonical_uri = "/stream-transcription-websocket" + self.canonical_headers = "" + self.signed_headers = "host" + self.algorithm = "AWS4-HMAC-SHA256" + self.credential_scope = "" + self.canonical_querystring = "" + self.payload_hash = "" + self.canonical_request = "" + self.string_to_sign = "" + self.signature = "" + self.request_url = "" + + def get_request_url( + self, + sample_rate: int, + language_code: str = "", + media_encoding: str = "pcm", + vocabulary_name: str = "", + vocabulary_filter_name: str = "", + show_speaker_label: bool = False, + enable_channel_identification: bool = False, + number_of_channels: int = 1, + enable_partial_results_stabilization: bool = False, + partial_results_stability: str = "", + ) -> str: + self.endpoint = f"wss://transcribestreaming.{self.region}.amazonaws.com:8443" + self.host = f"transcribestreaming.{self.region}.amazonaws.com:8443" + + now = datetime.datetime.utcnow() + self.amz_date = now.strftime("%Y%m%dT%H%M%SZ") + self.datestamp = now.strftime("%Y%m%d") + self.canonical_headers = f"host:{self.host}\n" + self.credential_scope = f"{self.datestamp}%2F{self.region}%2F{self.service}%2Faws4_request" + + # Create canonical querystring + self.canonical_querystring = "X-Amz-Algorithm=" + self.algorithm + self.canonical_querystring += ( + "&X-Amz-Credential=" + self.access_key + "%2F" + self.credential_scope + ) + self.canonical_querystring += "&X-Amz-Date=" + self.amz_date + self.canonical_querystring += "&X-Amz-Expires=300" + if self.session_token: + self.canonical_querystring += "&X-Amz-Security-Token=" + urllib.parse.quote( + 
self.session_token, safe="" + ) + self.canonical_querystring += "&X-Amz-SignedHeaders=" + self.signed_headers + + if enable_channel_identification: + self.canonical_querystring += "&enable-channel-identification=true" + if enable_partial_results_stabilization: + self.canonical_querystring += "&enable-partial-results-stabilization=true" + if language_code: + self.canonical_querystring += "&language-code=" + language_code + if media_encoding: + self.canonical_querystring += "&media-encoding=" + media_encoding + if number_of_channels > 1: + self.canonical_querystring += "&number-of-channels=" + str(number_of_channels) + if partial_results_stability: + self.canonical_querystring += "&partial-results-stability=" + partial_results_stability + if sample_rate: + self.canonical_querystring += "&sample-rate=" + str(sample_rate) + if show_speaker_label: + self.canonical_querystring += "&show-speaker-label=true" + if vocabulary_filter_name: + self.canonical_querystring += "&vocabulary-filter-name=" + vocabulary_filter_name + if vocabulary_name: + self.canonical_querystring += "&vocabulary-name=" + vocabulary_name + + # Create payload hash + self.payload_hash = hashlib.sha256("".encode("utf-8")).hexdigest() + + # Create canonical request + self.canonical_request = f"{self.method}\n{self.canonical_uri}\n{self.canonical_querystring}\n{self.canonical_headers}\n{self.signed_headers}\n{self.payload_hash}" + + # Create string to sign + credential_scope = f"{self.datestamp}/{self.region}/{self.service}/aws4_request" + string_to_sign = ( + f"{self.algorithm}\n{self.amz_date}\n{credential_scope}\n" + + hashlib.sha256(self.canonical_request.encode("utf-8")).hexdigest() + ) + + # Calculate signature + k_date = hmac.new( + f"AWS4{self.secret_key}".encode("utf-8"), self.datestamp.encode("utf-8"), hashlib.sha256 + ).digest() + k_region = hmac.new(k_date, self.region.encode("utf-8"), hashlib.sha256).digest() + k_service = hmac.new(k_region, self.service.encode("utf-8"), 
hashlib.sha256).digest() + k_signing = hmac.new(k_service, b"aws4_request", hashlib.sha256).digest() + self.signature = hmac.new( + k_signing, string_to_sign.encode("utf-8"), hashlib.sha256 + ).hexdigest() + + # Add signature to query string + self.canonical_querystring += "&X-Amz-Signature=" + self.signature + + # Create request URL + self.request_url = self.endpoint + self.canonical_uri + "?" + self.canonical_querystring + return self.request_url + + +def get_headers(header_name: str, header_value: str) -> bytearray: + """Build a header following AWS event stream format.""" + name = header_name.encode("utf-8") + name_byte_length = bytes([len(name)]) + value_type = bytes([7]) # 7 represents a string + value = header_value.encode("utf-8") + value_byte_length = struct.pack(">H", len(value)) + + # Construct the header + header_list = bytearray() + header_list.extend(name_byte_length) + header_list.extend(name) + header_list.extend(value_type) + header_list.extend(value_byte_length) + header_list.extend(value) + return header_list + + +def build_event_message(payload: bytes) -> bytes: + """ + Build an event message for AWS Transcribe streaming. 
+ Matches AWS sample: https://github.com/aws-samples/amazon-transcribe-streaming-python-websockets/blob/main/eventstream.py + """ + # Build headers + content_type_header = get_headers(":content-type", "application/octet-stream") + event_type_header = get_headers(":event-type", "AudioEvent") + message_type_header = get_headers(":message-type", "event") + + headers = bytearray() + headers.extend(content_type_header) + headers.extend(event_type_header) + headers.extend(message_type_header) + + # Calculate total byte length and headers byte length + # 16 accounts for 8 byte prelude, 2x 4 byte CRCs + total_byte_length = struct.pack(">I", len(headers) + len(payload) + 16) + headers_byte_length = struct.pack(">I", len(headers)) + + # Build the prelude + prelude = bytearray([0] * 8) + prelude[:4] = total_byte_length + prelude[4:] = headers_byte_length + + # Calculate checksum for prelude + prelude_crc = struct.pack(">I", binascii.crc32(prelude) & 0xFFFFFFFF) + + # Construct the message + message_as_list = bytearray() + message_as_list.extend(prelude) + message_as_list.extend(prelude_crc) + message_as_list.extend(headers) + message_as_list.extend(payload) + + # Calculate checksum for message + message = bytes(message_as_list) + message_crc = struct.pack(">I", binascii.crc32(message) & 0xFFFFFFFF) + + # Add message checksum + message_as_list.extend(message_crc) + + return bytes(message_as_list) + + +def decode_event(message): + # Extract the prelude, headers, payload and CRC + prelude = message[:8] + total_length, headers_length = struct.unpack(">II", prelude) + prelude_crc = struct.unpack(">I", message[8:12])[0] + headers = message[12 : 12 + headers_length] + payload = message[12 + headers_length : -4] + message_crc = struct.unpack(">I", message[-4:])[0] + + # Check the CRCs + assert prelude_crc == binascii.crc32(prelude) & 0xFFFFFFFF, "Prelude CRC check failed" + assert message_crc == binascii.crc32(message[:-4]) & 0xFFFFFFFF, "Message CRC check failed" + + # Parse the 
headers + headers_dict = {} + while headers: + name_len = headers[0] + name = headers[1 : 1 + name_len].decode("utf-8") + value_type = headers[1 + name_len] + value_len = struct.unpack(">H", headers[2 + name_len : 4 + name_len])[0] + value = headers[4 + name_len : 4 + name_len + value_len].decode("utf-8") + headers_dict[name] = value + headers = headers[4 + name_len + value_len :] + + return headers_dict, json.loads(payload) + + +class TranscribeSTTService(STTService): + def __init__( + self, + *, + api_key: Optional[str] = None, + aws_access_key_id: Optional[str] = None, + aws_session_token: Optional[str] = None, + region: Optional[str] = "us-east-1", + sample_rate: int = 16000, + language: Language = Language.EN, + **kwargs, + ): + super().__init__(**kwargs) + + self._settings = { + "sample_rate": sample_rate, + "language": language, + "media_encoding": "linear16", # AWS expects raw PCM + "number_of_channels": 1, + "show_speaker_label": False, + "enable_channel_identification": False, + } + + # Validate sample rate - AWS Transcribe only supports 8000 Hz or 16000 Hz + if sample_rate not in [8000, 16000]: + logger.warning( + f"AWS Transcribe only supports 8000 Hz or 16000 Hz sample rates. Converting from {sample_rate} Hz to 16000 Hz." 
+ ) + self._settings["sample_rate"] = 16000 + + self._credentials = { + "aws_access_key_id": aws_access_key_id or os.getenv("AWS_ACCESS_KEY_ID"), + "aws_secret_access_key": api_key or os.getenv("AWS_SECRET_ACCESS_KEY"), + "aws_session_token": aws_session_token or os.getenv("AWS_SESSION_TOKEN"), + "region": region or os.getenv("AWS_REGION", "us-east-1"), + } + + self._ws_client = None + self._connection_lock = asyncio.Lock() + self._connecting = False + self._receive_task = None + + def get_service_encoding(self, encoding: str) -> str: + """Convert internal encoding format to AWS Transcribe format.""" + encoding_map = { + "linear16": "pcm", # AWS expects "pcm" for 16-bit linear PCM + } + return encoding_map.get(encoding, encoding) + + async def start(self, frame: StartFrame): + """Initialize the connection when the service starts.""" + await super().start(frame) + logger.info("Starting AWS Transcribe service...") + retry_count = 0 + max_retries = 3 + + while retry_count < max_retries: + try: + await self._connect() + if self._ws_client and self._ws_client.open: + logger.info("Successfully established WebSocket connection") + return + logger.warning("WebSocket connection not established after connect") + except Exception as e: + logger.error(f"Failed to connect (attempt {retry_count + 1}/{max_retries}): {e}") + retry_count += 1 + if retry_count < max_retries: + await asyncio.sleep(1) # Wait before retrying + + raise RuntimeError("Failed to establish WebSocket connection after multiple attempts") + + async def run_stt(self, frame: Frame) -> AsyncGenerator[Frame, None]: + """Process audio data and send to AWS Transcribe""" + try: + # Skip if no speech detected + if hasattr(frame, "is_speech") and not frame.is_speech: + logger.debug("Skipping non-speech frame") + return + + # Ensure WebSocket is connected + if not self._ws_client or not self._ws_client.open: + logger.info("WebSocket not connected, attempting to reconnect...") + try: + await self._connect() + except 
Exception as e: + logger.error(f"Failed to reconnect: {e}") + yield ErrorFrame("Failed to reconnect to AWS Transcribe", fatal=False) + return + + # Get the audio data - if frame is bytes, use directly, otherwise get audio attribute + audio_data = frame if isinstance(frame, bytes) else frame.audio + + # Format the audio data according to AWS event stream format + event_message = build_event_message(audio_data) + # logger.debug(f"Sending audio chunk of size {len(audio_data)} bytes") + + # Send the formatted event message + try: + await self._ws_client.send(event_message) + # Start metrics after first chunk sent + await self.start_processing_metrics() + await self.start_ttfb_metrics() + except websockets.exceptions.ConnectionClosed as e: + logger.warning(f"Connection closed while sending: {e}") + await self._disconnect() + # Don't yield error here - we'll retry on next frame + except Exception as e: + logger.error(f"Error sending audio: {e}") + yield ErrorFrame(f"AWS Transcribe error: {str(e)}", fatal=False) + await self._disconnect() + + except Exception as e: + logger.error(f"Error in run_stt: {e}") + yield ErrorFrame(f"AWS Transcribe error: {str(e)}", fatal=False) + await self._disconnect() + + async def _connect(self): + """Connect to AWS Transcribe with connection state management.""" + if ( + self._ws_client + and self._ws_client.open + and self._receive_task + and not self._receive_task.done() + ): + logger.debug("Already connected") + return + + async with self._connection_lock: + if self._connecting: + logger.debug("Connection already in progress") + return + + try: + self._connecting = True + logger.debug("Starting connection process...") + + if self._ws_client: + await self._disconnect() + + language_code = self.language_to_service_language( + Language(self._settings["language"]) + ) + if not language_code: + raise ValueError(f"Unsupported language: {self._settings['language']}") + + # Generate random websocket key + websocket_key = "".join( + 
random.choices( + string.ascii_uppercase + string.ascii_lowercase + string.digits, k=20 + ) + ) + + # Add required headers + extra_headers = { + "Origin": "https://localhost", + "Sec-WebSocket-Key": websocket_key, + "Sec-WebSocket-Version": "13", + "Connection": "keep-alive", + } + + # Get presigned URL + presigned_url = get_presigned_url( + region=self._credentials["region"], + credentials={ + "access_key": self._credentials["aws_access_key_id"], + "secret_key": self._credentials["aws_secret_access_key"], + "session_token": self._credentials["aws_session_token"], + }, + language_code=language_code, + media_encoding=self.get_service_encoding( + self._settings["media_encoding"] + ), # Convert to AWS format + sample_rate=self._settings["sample_rate"], + number_of_channels=self._settings["number_of_channels"], + enable_partial_results_stabilization=True, + partial_results_stability="high", + show_speaker_label=self._settings["show_speaker_label"], + enable_channel_identification=self._settings["enable_channel_identification"], + ) + + logger.debug(f"Connecting to WebSocket with URL: {presigned_url[:100]}...") + + # Connect with the required headers and settings + self._ws_client = await websockets.connect( + presigned_url, + extra_headers=extra_headers, + subprotocols=["mqtt"], + ping_interval=None, + ping_timeout=None, + compression=None, + ) + logger.debug("WebSocket connected, starting receive task...") + + # Start receive task + self._receive_task = asyncio.create_task(self._receive_loop()) + + logger.info("Successfully connected to AWS Transcribe") + + except Exception as e: + logger.error(f"Failed to connect to AWS Transcribe: {e}") + await self._disconnect() + raise + + finally: + self._connecting = False + + async def _disconnect(self): + """Disconnect from AWS Transcribe.""" + if self._receive_task: + self._receive_task.cancel() + try: + await self._receive_task + except asyncio.CancelledError: + pass + self._receive_task = None + + if self._ws_client: + try: 
+ if self._ws_client.open: + # Send end-stream message + end_stream = {"message-type": "event", "event": "end"} + await self._ws_client.send(json.dumps(end_stream)) + await self._ws_client.close() + except Exception as e: + logger.warning(f"Error closing WebSocket connection: {e}") + finally: + self._ws_client = None + + def language_to_service_language(self, language: Language) -> str | None: + """Convert internal language enum to AWS Transcribe language code.""" + language_map = { + Language.EN: "en-US", + Language.ES: "es-US", + Language.FR: "fr-FR", + Language.DE: "de-DE", + Language.IT: "it-IT", + Language.PT: "pt-BR", + Language.JA: "ja-JP", + Language.KO: "ko-KR", + Language.ZH: "zh-CN", + } + return language_map.get(language) + + async def _receive_loop(self): + """Background task to receive and process messages from AWS Transcribe.""" + try: + logger.debug("Receive loop started") + while True: + if not self._ws_client or not self._ws_client.open: + logger.warning("WebSocket closed in receive loop") + break + + try: + response = await self._ws_client.recv() + headers, payload = decode_event(response) + + # logger.debug(f"Received message type: {headers.get(':message-type')}") + + if headers.get(":message-type") == "event": + # Process transcription results + results = payload.get("Transcript", {}).get("Results", []) + if results: + result = results[0] + alternatives = result.get("Alternatives", []) + if alternatives: + transcript = alternatives[0].get("Transcript", "") + is_final = not result.get("IsPartial", True) + + if transcript: + await self.stop_ttfb_metrics() + if is_final: + await self.push_frame( + TranscriptionFrame( + transcript, + "", + time_now_iso8601(), + self._settings["language"], + ) + ) + await self.stop_processing_metrics() + else: + await self.push_frame( + InterimTranscriptionFrame( + transcript, + "", + time_now_iso8601(), + self._settings["language"], + ) + ) + elif headers.get(":message-type") == "exception": + error_msg = 
payload.get("Message", "Unknown error") + logger.error(f"Exception from AWS: {error_msg}") + await self.push_frame( + ErrorFrame(f"AWS Transcribe error: {error_msg}", fatal=False) + ) + else: + logger.debug(f"Other message type received: {headers}") + logger.debug(f"Payload: {payload}") + + except websockets.exceptions.ConnectionClosed as e: + logger.error( + f"WebSocket connection closed in receive loop with code {e.code}: {e.reason}" + ) + break + except Exception as e: + logger.error(f"Error in receive loop: {e}") + break + + except asyncio.CancelledError: + logger.debug("Receive loop cancelled") + except Exception as e: + logger.error(f"Unexpected error in receive loop: {e}") + finally: + logger.debug("Receive loop ended") \ No newline at end of file diff --git a/src/pipecat/services/aws/tts.py b/src/pipecat/services/aws/tts.py index e90ea9220..ed1230dd7 100644 --- a/src/pipecat/services/aws/tts.py +++ b/src/pipecat/services/aws/tts.py @@ -5,21 +5,8 @@ # import asyncio -from typing import AsyncGenerator, Optional, Dict +from typing import AsyncGenerator, Optional import os -import datetime -import time -from urllib.parse import urlencode -import json -import struct -from io import BytesIO -import urllib.parse -import hashlib -import hmac -import random -import string -import binascii -import numpy as np from loguru import logger from pydantic import BaseModel @@ -30,28 +17,18 @@ from pipecat.frames.frames import ( Frame, TTSAudioRawFrame, TTSStartedFrame, - TTSStoppedFrame, - TranscriptionFrame, - InterimTranscriptionFrame, - StartFrame, - EndFrame, - CancelFrame, + TTSStoppedFrame ) -from pipecat.services.ai_services import TTSService, STTService +from pipecat.services.ai_services import TTSService from pipecat.transcriptions.language import Language -from pipecat.utils.time import time_now_iso8601 try: import boto3 from botocore.exceptions import BotoCoreError, ClientError - import websockets - from botocore.auth import SigV4Auth - from botocore.awsrequest 
import AWSRequest - from botocore.credentials import Credentials except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( - "In order to use AWS services, you need to `pip install pipecat-ai[aws]`. Also, set `AWS_SECRET_ACCESS_KEY`, `AWS_ACCESS_KEY_ID`, and `AWS_REGION` environment variable." + "In order to use AWS services, you need to `pip install pipecat-ai[aws]`. Also, remember to set `AWS_SECRET_ACCESS_KEY`, `AWS_ACCESS_KEY_ID`, and `AWS_REGION` environment variable." ) raise Exception(f"Missing module: {e}") @@ -207,18 +184,18 @@ class PollyTTSService(TTSService): prosody_attrs = [] # Prosody tags are only supported for standard and neural engines - if self._settings["engine"] != "generative": - if self._settings["rate"]: - prosody_attrs.append(f"rate='{self._settings['rate']}'") + if self._settings["engine"] == "standard": if self._settings["pitch"]: prosody_attrs.append(f"pitch='{self._settings['pitch']}'") - if self._settings["volume"]: - prosody_attrs.append(f"volume='{self._settings['volume']}'") + + if self._settings["rate"]: + prosody_attrs.append(f"rate='{self._settings['rate']}'") + if self._settings["volume"]: + prosody_attrs.append(f"volume='{self._settings['volume']}'") + # logger.warning("Prosody tags are not supported for generative engine. Ignoring.") - if prosody_attrs: + if prosody_attrs: ssml += f"" - else: - logger.warning("Prosody tags are not supported for generative engine. 
Ignoring.") ssml += text @@ -229,6 +206,8 @@ class PollyTTSService(TTSService): ssml += "" + logger.debug(f"SSML: {ssml}") + return ssml async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: @@ -303,566 +282,3 @@ class AWSTTSService(PollyTTSService): warnings.warn( "'AWSTTSService' is deprecated, use 'PollyTTSService' instead.", DeprecationWarning ) - - -def get_presigned_url( - *, - region: str, - credentials: Dict[str, Optional[str]], - language_code: str, - media_encoding: str = "pcm", - sample_rate: int = 16000, - number_of_channels: int = 1, - enable_partial_results_stabilization: bool = True, - partial_results_stability: str = "high", - vocabulary_name: Optional[str] = None, - vocabulary_filter_name: Optional[str] = None, - show_speaker_label: bool = False, - enable_channel_identification: bool = False, -) -> str: - """Create a presigned URL for AWS Transcribe streaming.""" - access_key = credentials.get("access_key") - secret_key = credentials.get("secret_key") - session_token = credentials.get("session_token") - - if not access_key or not secret_key: - raise ValueError("AWS credentials are required") - - # Initialize the URL generator - url_generator = AWSTranscribePresignedURL( - access_key=access_key, secret_key=secret_key, session_token=session_token, region=region - ) - - # Get the presigned URL - return url_generator.get_request_url( - sample_rate=sample_rate, - language_code=language_code, - media_encoding=media_encoding, - vocabulary_name=vocabulary_name, - vocabulary_filter_name=vocabulary_filter_name, - show_speaker_label=show_speaker_label, - enable_channel_identification=enable_channel_identification, - number_of_channels=number_of_channels, - enable_partial_results_stabilization=enable_partial_results_stabilization, - partial_results_stability=partial_results_stability, - ) - - -class AWSTranscribePresignedURL: - def __init__( - self, access_key: str, secret_key: str, session_token: str, region: str = "us-east-1" - ): - 
self.access_key = access_key - self.secret_key = secret_key - self.session_token = session_token - self.method = "GET" - self.service = "transcribe" - self.region = region - self.endpoint = "" - self.host = "" - self.amz_date = "" - self.datestamp = "" - self.canonical_uri = "/stream-transcription-websocket" - self.canonical_headers = "" - self.signed_headers = "host" - self.algorithm = "AWS4-HMAC-SHA256" - self.credential_scope = "" - self.canonical_querystring = "" - self.payload_hash = "" - self.canonical_request = "" - self.string_to_sign = "" - self.signature = "" - self.request_url = "" - - def get_request_url( - self, - sample_rate: int, - language_code: str = "", - media_encoding: str = "pcm", - vocabulary_name: str = "", - vocabulary_filter_name: str = "", - show_speaker_label: bool = False, - enable_channel_identification: bool = False, - number_of_channels: int = 1, - enable_partial_results_stabilization: bool = False, - partial_results_stability: str = "", - ) -> str: - self.endpoint = f"wss://transcribestreaming.{self.region}.amazonaws.com:8443" - self.host = f"transcribestreaming.{self.region}.amazonaws.com:8443" - - now = datetime.datetime.utcnow() - self.amz_date = now.strftime("%Y%m%dT%H%M%SZ") - self.datestamp = now.strftime("%Y%m%d") - self.canonical_headers = f"host:{self.host}\n" - self.credential_scope = f"{self.datestamp}%2F{self.region}%2F{self.service}%2Faws4_request" - - # Create canonical querystring - self.canonical_querystring = "X-Amz-Algorithm=" + self.algorithm - self.canonical_querystring += ( - "&X-Amz-Credential=" + self.access_key + "%2F" + self.credential_scope - ) - self.canonical_querystring += "&X-Amz-Date=" + self.amz_date - self.canonical_querystring += "&X-Amz-Expires=300" - if self.session_token: - self.canonical_querystring += "&X-Amz-Security-Token=" + urllib.parse.quote( - self.session_token, safe="" - ) - self.canonical_querystring += "&X-Amz-SignedHeaders=" + self.signed_headers - - if enable_channel_identification: 
- self.canonical_querystring += "&enable-channel-identification=true" - if enable_partial_results_stabilization: - self.canonical_querystring += "&enable-partial-results-stabilization=true" - if language_code: - self.canonical_querystring += "&language-code=" + language_code - if media_encoding: - self.canonical_querystring += "&media-encoding=" + media_encoding - if number_of_channels > 1: - self.canonical_querystring += "&number-of-channels=" + str(number_of_channels) - if partial_results_stability: - self.canonical_querystring += "&partial-results-stability=" + partial_results_stability - if sample_rate: - self.canonical_querystring += "&sample-rate=" + str(sample_rate) - if show_speaker_label: - self.canonical_querystring += "&show-speaker-label=true" - if vocabulary_filter_name: - self.canonical_querystring += "&vocabulary-filter-name=" + vocabulary_filter_name - if vocabulary_name: - self.canonical_querystring += "&vocabulary-name=" + vocabulary_name - - # Create payload hash - self.payload_hash = hashlib.sha256("".encode("utf-8")).hexdigest() - - # Create canonical request - self.canonical_request = f"{self.method}\n{self.canonical_uri}\n{self.canonical_querystring}\n{self.canonical_headers}\n{self.signed_headers}\n{self.payload_hash}" - - # Create string to sign - credential_scope = f"{self.datestamp}/{self.region}/{self.service}/aws4_request" - string_to_sign = ( - f"{self.algorithm}\n{self.amz_date}\n{credential_scope}\n" - + hashlib.sha256(self.canonical_request.encode("utf-8")).hexdigest() - ) - - # Calculate signature - k_date = hmac.new( - f"AWS4{self.secret_key}".encode("utf-8"), self.datestamp.encode("utf-8"), hashlib.sha256 - ).digest() - k_region = hmac.new(k_date, self.region.encode("utf-8"), hashlib.sha256).digest() - k_service = hmac.new(k_region, self.service.encode("utf-8"), hashlib.sha256).digest() - k_signing = hmac.new(k_service, b"aws4_request", hashlib.sha256).digest() - self.signature = hmac.new( - k_signing, 
string_to_sign.encode("utf-8"), hashlib.sha256 - ).hexdigest() - - # Add signature to query string - self.canonical_querystring += "&X-Amz-Signature=" + self.signature - - # Create request URL - self.request_url = self.endpoint + self.canonical_uri + "?" + self.canonical_querystring - return self.request_url - - -def get_headers(header_name: str, header_value: str) -> bytearray: - """Build a header following AWS event stream format.""" - name = header_name.encode("utf-8") - name_byte_length = bytes([len(name)]) - value_type = bytes([7]) # 7 represents a string - value = header_value.encode("utf-8") - value_byte_length = struct.pack(">H", len(value)) - - # Construct the header - header_list = bytearray() - header_list.extend(name_byte_length) - header_list.extend(name) - header_list.extend(value_type) - header_list.extend(value_byte_length) - header_list.extend(value) - return header_list - - -def build_event_message(payload: bytes) -> bytes: - """ - Build an event message for AWS Transcribe streaming. 
- Matches AWS sample: https://github.com/aws-samples/amazon-transcribe-streaming-python-websockets/blob/main/eventstream.py - """ - # Build headers - content_type_header = get_headers(":content-type", "application/octet-stream") - event_type_header = get_headers(":event-type", "AudioEvent") - message_type_header = get_headers(":message-type", "event") - - headers = bytearray() - headers.extend(content_type_header) - headers.extend(event_type_header) - headers.extend(message_type_header) - - # Calculate total byte length and headers byte length - # 16 accounts for 8 byte prelude, 2x 4 byte CRCs - total_byte_length = struct.pack(">I", len(headers) + len(payload) + 16) - headers_byte_length = struct.pack(">I", len(headers)) - - # Build the prelude - prelude = bytearray([0] * 8) - prelude[:4] = total_byte_length - prelude[4:] = headers_byte_length - - # Calculate checksum for prelude - prelude_crc = struct.pack(">I", binascii.crc32(prelude) & 0xFFFFFFFF) - - # Construct the message - message_as_list = bytearray() - message_as_list.extend(prelude) - message_as_list.extend(prelude_crc) - message_as_list.extend(headers) - message_as_list.extend(payload) - - # Calculate checksum for message - message = bytes(message_as_list) - message_crc = struct.pack(">I", binascii.crc32(message) & 0xFFFFFFFF) - - # Add message checksum - message_as_list.extend(message_crc) - - return bytes(message_as_list) - - -def decode_event(message): - # Extract the prelude, headers, payload and CRC - prelude = message[:8] - total_length, headers_length = struct.unpack(">II", prelude) - prelude_crc = struct.unpack(">I", message[8:12])[0] - headers = message[12 : 12 + headers_length] - payload = message[12 + headers_length : -4] - message_crc = struct.unpack(">I", message[-4:])[0] - - # Check the CRCs - assert prelude_crc == binascii.crc32(prelude) & 0xFFFFFFFF, "Prelude CRC check failed" - assert message_crc == binascii.crc32(message[:-4]) & 0xFFFFFFFF, "Message CRC check failed" - - # Parse the 
headers - headers_dict = {} - while headers: - name_len = headers[0] - name = headers[1 : 1 + name_len].decode("utf-8") - value_type = headers[1 + name_len] - value_len = struct.unpack(">H", headers[2 + name_len : 4 + name_len])[0] - value = headers[4 + name_len : 4 + name_len + value_len].decode("utf-8") - headers_dict[name] = value - headers = headers[4 + name_len + value_len :] - - return headers_dict, json.loads(payload) - - -class TranscribeSTTService(STTService): - def __init__( - self, - *, - api_key: Optional[str] = None, - aws_access_key_id: Optional[str] = None, - aws_session_token: Optional[str] = None, - region: Optional[str] = "us-east-1", - sample_rate: int = 16000, - language: Language = Language.EN, - **kwargs, - ): - super().__init__(**kwargs) - - self._settings = { - "sample_rate": sample_rate, - "language": language, - "media_encoding": "linear16", # AWS expects raw PCM - "number_of_channels": 1, - "show_speaker_label": False, - "enable_channel_identification": False, - } - - # Validate sample rate - AWS Transcribe only supports 8000 Hz or 16000 Hz - if sample_rate not in [8000, 16000]: - logger.warning( - f"AWS Transcribe only supports 8000 Hz or 16000 Hz sample rates. Converting from {sample_rate} Hz to 16000 Hz." 
- ) - self._settings["sample_rate"] = 16000 - - self._credentials = { - "aws_access_key_id": aws_access_key_id or os.getenv("AWS_ACCESS_KEY_ID"), - "aws_secret_access_key": api_key or os.getenv("AWS_SECRET_ACCESS_KEY"), - "aws_session_token": aws_session_token or os.getenv("AWS_SESSION_TOKEN"), - "region": region or os.getenv("AWS_REGION", "us-east-1"), - } - - self._ws_client = None - self._connection_lock = asyncio.Lock() - self._connecting = False - self._receive_task = None - - def get_service_encoding(self, encoding: str) -> str: - """Convert internal encoding format to AWS Transcribe format.""" - encoding_map = { - "linear16": "pcm", # AWS expects "pcm" for 16-bit linear PCM - } - return encoding_map.get(encoding, encoding) - - async def start(self, frame: StartFrame): - """Initialize the connection when the service starts.""" - await super().start(frame) - logger.info("Starting AWS Transcribe service...") - retry_count = 0 - max_retries = 3 - - while retry_count < max_retries: - try: - await self._connect() - if self._ws_client and self._ws_client.open: - logger.info("Successfully established WebSocket connection") - return - logger.warning("WebSocket connection not established after connect") - except Exception as e: - logger.error(f"Failed to connect (attempt {retry_count + 1}/{max_retries}): {e}") - retry_count += 1 - if retry_count < max_retries: - await asyncio.sleep(1) # Wait before retrying - - raise RuntimeError("Failed to establish WebSocket connection after multiple attempts") - - async def run_stt(self, frame: Frame) -> AsyncGenerator[Frame, None]: - """Process audio data and send to AWS Transcribe""" - try: - # Skip if no speech detected - if hasattr(frame, "is_speech") and not frame.is_speech: - logger.debug("Skipping non-speech frame") - return - - # Ensure WebSocket is connected - if not self._ws_client or not self._ws_client.open: - logger.info("WebSocket not connected, attempting to reconnect...") - try: - await self._connect() - except 
Exception as e: - logger.error(f"Failed to reconnect: {e}") - yield ErrorFrame("Failed to reconnect to AWS Transcribe", fatal=False) - return - - # Get the audio data - if frame is bytes, use directly, otherwise get audio attribute - audio_data = frame if isinstance(frame, bytes) else frame.audio - - # Format the audio data according to AWS event stream format - event_message = build_event_message(audio_data) - # logger.debug(f"Sending audio chunk of size {len(audio_data)} bytes") - - # Send the formatted event message - try: - await self._ws_client.send(event_message) - # Start metrics after first chunk sent - await self.start_processing_metrics() - await self.start_ttfb_metrics() - except websockets.exceptions.ConnectionClosed as e: - logger.warning(f"Connection closed while sending: {e}") - await self._disconnect() - # Don't yield error here - we'll retry on next frame - except Exception as e: - logger.error(f"Error sending audio: {e}") - yield ErrorFrame(f"AWS Transcribe error: {str(e)}", fatal=False) - await self._disconnect() - - except Exception as e: - logger.error(f"Error in run_stt: {e}") - yield ErrorFrame(f"AWS Transcribe error: {str(e)}", fatal=False) - await self._disconnect() - - async def _connect(self): - """Connect to AWS Transcribe with connection state management.""" - if ( - self._ws_client - and self._ws_client.open - and self._receive_task - and not self._receive_task.done() - ): - logger.debug("Already connected") - return - - async with self._connection_lock: - if self._connecting: - logger.debug("Connection already in progress") - return - - try: - self._connecting = True - logger.debug("Starting connection process...") - - if self._ws_client: - await self._disconnect() - - language_code = self.language_to_service_language( - Language(self._settings["language"]) - ) - if not language_code: - raise ValueError(f"Unsupported language: {self._settings['language']}") - - # Generate random websocket key - websocket_key = "".join( - 
random.choices( - string.ascii_uppercase + string.ascii_lowercase + string.digits, k=20 - ) - ) - - # Add required headers - extra_headers = { - "Origin": "https://localhost", - "Sec-WebSocket-Key": websocket_key, - "Sec-WebSocket-Version": "13", - "Connection": "keep-alive", - } - - # Get presigned URL - presigned_url = get_presigned_url( - region=self._credentials["region"], - credentials={ - "access_key": self._credentials["aws_access_key_id"], - "secret_key": self._credentials["aws_secret_access_key"], - "session_token": self._credentials["aws_session_token"], - }, - language_code=language_code, - media_encoding=self.get_service_encoding( - self._settings["media_encoding"] - ), # Convert to AWS format - sample_rate=self._settings["sample_rate"], - number_of_channels=self._settings["number_of_channels"], - enable_partial_results_stabilization=True, - partial_results_stability="high", - show_speaker_label=self._settings["show_speaker_label"], - enable_channel_identification=self._settings["enable_channel_identification"], - ) - - logger.debug(f"Connecting to WebSocket with URL: {presigned_url[:100]}...") - - # Connect with the required headers and settings - self._ws_client = await websockets.connect( - presigned_url, - extra_headers=extra_headers, - subprotocols=["mqtt"], - ping_interval=None, - ping_timeout=None, - compression=None, - ) - logger.debug("WebSocket connected, starting receive task...") - - # Start receive task - self._receive_task = asyncio.create_task(self._receive_loop()) - - logger.info("Successfully connected to AWS Transcribe") - - except Exception as e: - logger.error(f"Failed to connect to AWS Transcribe: {e}") - await self._disconnect() - raise - - finally: - self._connecting = False - - async def _disconnect(self): - """Disconnect from AWS Transcribe.""" - if self._receive_task: - self._receive_task.cancel() - try: - await self._receive_task - except asyncio.CancelledError: - pass - self._receive_task = None - - if self._ws_client: - try: 
- if self._ws_client.open: - # Send end-stream message - end_stream = {"message-type": "event", "event": "end"} - await self._ws_client.send(json.dumps(end_stream)) - await self._ws_client.close() - except Exception as e: - logger.warning(f"Error closing WebSocket connection: {e}") - finally: - self._ws_client = None - - def language_to_service_language(self, language: Language) -> str | None: - """Convert internal language enum to AWS Transcribe language code.""" - language_map = { - Language.EN: "en-US", - Language.ES: "es-US", - Language.FR: "fr-FR", - Language.DE: "de-DE", - Language.IT: "it-IT", - Language.PT: "pt-BR", - Language.JA: "ja-JP", - Language.KO: "ko-KR", - Language.ZH: "zh-CN", - } - return language_map.get(language) - - async def _receive_loop(self): - """Background task to receive and process messages from AWS Transcribe.""" - try: - logger.debug("Receive loop started") - while True: - if not self._ws_client or not self._ws_client.open: - logger.warning("WebSocket closed in receive loop") - break - - try: - response = await self._ws_client.recv() - headers, payload = decode_event(response) - - # logger.debug(f"Received message type: {headers.get(':message-type')}") - - if headers.get(":message-type") == "event": - # Process transcription results - results = payload.get("Transcript", {}).get("Results", []) - if results: - result = results[0] - alternatives = result.get("Alternatives", []) - if alternatives: - transcript = alternatives[0].get("Transcript", "") - is_final = not result.get("IsPartial", True) - - if transcript: - await self.stop_ttfb_metrics() - if is_final: - await self.push_frame( - TranscriptionFrame( - transcript, - "", - time_now_iso8601(), - self._settings["language"], - ) - ) - await self.stop_processing_metrics() - else: - await self.push_frame( - InterimTranscriptionFrame( - transcript, - "", - time_now_iso8601(), - self._settings["language"], - ) - ) - elif headers.get(":message-type") == "exception": - error_msg = 
payload.get("Message", "Unknown error") - logger.error(f"Exception from AWS: {error_msg}") - await self.push_frame( - ErrorFrame(f"AWS Transcribe error: {error_msg}", fatal=False) - ) - else: - logger.debug(f"Other message type received: {headers}") - logger.debug(f"Payload: {payload}") - - except websockets.exceptions.ConnectionClosed as e: - logger.error( - f"WebSocket connection closed in receive loop with code {e.code}: {e.reason}" - ) - break - except Exception as e: - logger.error(f"Error in receive loop: {e}") - break - - except asyncio.CancelledError: - logger.debug("Receive loop cancelled") - except Exception as e: - logger.error(f"Unexpected error in receive loop: {e}") - finally: - logger.debug("Receive loop ended") From b2b01861b2b09232c83266ffca9d9833ed722d97 Mon Sep 17 00:00:00 2001 From: Adithya Suresh Date: Fri, 4 Apr 2025 06:17:40 +0000 Subject: [PATCH 20/97] Remove model restriction --- src/pipecat/services/aws/llm.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/src/pipecat/services/aws/llm.py b/src/pipecat/services/aws/llm.py index cb21eccaa..2cca54c52 100644 --- a/src/pipecat/services/aws/llm.py +++ b/src/pipecat/services/aws/llm.py @@ -561,18 +561,7 @@ class BedrockLLMService(LLMService): "additional_model_request_fields": params.additional_model_request_fields if isinstance(params.additional_model_request_fields, dict) else {}, } - # Determine model provider from model ID - self.model_provider = self._get_model_provider(model) - logger.info(f"Using AWS Bedrock model: {model} from provider: {self.model_provider}") - - def _get_model_provider(self, model: str) -> str: - """Determine the model provider from the model ID""" - if "anthropic." in model: - return "anthropic" - elif "amazon." in model: - return "amazon" - else: - raise ValueError(f"Unsupported model: {model}. 
Only Anthropic Claude and Amazon Nova model families are supported.") + logger.info(f"Using AWS Bedrock model: {model}") def can_generate_metrics(self) -> bool: return True From fa5cac7e0a2e5f84f6f7ce45a7198309fc0386f9 Mon Sep 17 00:00:00 2001 From: Adithya Suresh Date: Tue, 8 Apr 2025 02:52:37 +0000 Subject: [PATCH 21/97] Bug fix in content format --- src/pipecat/services/aws/llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipecat/services/aws/llm.py b/src/pipecat/services/aws/llm.py index 2cca54c52..fa33e26b3 100644 --- a/src/pipecat/services/aws/llm.py +++ b/src/pipecat/services/aws/llm.py @@ -310,7 +310,7 @@ class BedrockLLMContext(OpenAILLMContext): # in the proper format if isinstance(self.messages[-1]["content"], str): self.messages[-1]["content"] = [ - {"type": "text", "text": self.messages[-1]["content"]} + {"text": self.messages[-1]["content"]} ] # if this message has just a content string, convert it to a list # in the proper format From aa964847f360a9083a7c863e2ba09f1fe41dfd32 Mon Sep 17 00:00:00 2001 From: Adithya Suresh Date: Tue, 8 Apr 2025 04:49:40 +0000 Subject: [PATCH 22/97] System param to be a list --- src/pipecat/services/aws/llm.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/pipecat/services/aws/llm.py b/src/pipecat/services/aws/llm.py index fa33e26b3..f94aa3fbb 100644 --- a/src/pipecat/services/aws/llm.py +++ b/src/pipecat/services/aws/llm.py @@ -336,7 +336,15 @@ class BedrockLLMContext(OpenAILLMContext): self.messages[0]["role"] = "user" else: system_content = self.messages.pop(0)["content"] - self.system = system_content[0]["text"] if isinstance(system_content, list) and system_content and isinstance(system_content[0], dict) and "text" in system_content[0] else str(system_content) + if isinstance(system_content, str): + system_content = [{"text": system_content}] + + if self.system: + if isinstance(self.system, str): + self.system = [{"text": self.system}] + 
self.system.extend(system_content) + else: + self.system = system_content # Ensure content is properly formatted for msg in self.messages: @@ -600,7 +608,7 @@ class BedrockLLMService(LLMService): assistant = BedrockAssistantContextAggregator(context, **assistant_kwargs) return BedrockContextAggregatorPair(_user=user, _assistant=assistant) - async def _process_context(self, context: "BedrockLLMContext"): + async def _process_context(self, context: BedrockLLMContext): # Usage tracking prompt_tokens = 0 completion_tokens = 0 @@ -633,7 +641,7 @@ class BedrockLLMService(LLMService): } # Add system message - request_params["system"] = [{"text": context.system}] + request_params["system"] = context.system # Add tools if present if context.tools: From 664111a3c98e5df6f34ac2a5d1281682e810d46e Mon Sep 17 00:00:00 2001 From: Adithya Suresh Date: Tue, 8 Apr 2025 04:52:18 +0000 Subject: [PATCH 23/97] Added cache related info to metrics --- src/pipecat/services/aws/llm.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/pipecat/services/aws/llm.py b/src/pipecat/services/aws/llm.py index f94aa3fbb..7c3539f7a 100644 --- a/src/pipecat/services/aws/llm.py +++ b/src/pipecat/services/aws/llm.py @@ -613,6 +613,8 @@ class BedrockLLMService(LLMService): prompt_tokens = 0 completion_tokens = 0 completion_tokens_estimate = 0 + cache_read_input_tokens = 0 + cache_creation_input_tokens = 0 use_completion_tokens_estimate = False try: @@ -723,6 +725,8 @@ class BedrockLLMService(LLMService): usage = event["metadata"]["usage"] prompt_tokens += usage.get("inputTokens", 0) completion_tokens += usage.get("outputTokens", 0) + cache_read_input_tokens += usage.get("cacheReadInputTokens", 0) + cache_creation_input_tokens += usage.get("cacheWriteInputTokens", 0) except asyncio.CancelledError: # If we're interrupted, we won't get a complete usage report. 
So set our flag to use the @@ -745,6 +749,8 @@ class BedrockLLMService(LLMService): await self._report_usage_metrics( prompt_tokens=prompt_tokens, completion_tokens=comp_tokens, + cache_read_input_tokens=cache_read_input_tokens, + cache_creation_input_tokens=cache_creation_input_tokens ) async def process_frame(self, frame: Frame, direction: FrameDirection): @@ -776,11 +782,15 @@ class BedrockLLMService(LLMService): self, prompt_tokens: int, completion_tokens: int, + cache_read_input_tokens: int, + cache_creation_input_tokens: int ): if prompt_tokens or completion_tokens: tokens = LLMTokenUsage( prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=prompt_tokens + completion_tokens, + cache_read_input_tokens=cache_read_input_tokens, + cache_creation_input_tokens=cache_creation_input_tokens ) await self.start_llm_usage_metrics(tokens) From a4b9db9e073aa21229b6abc2f76aac93419de775 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 6 May 2025 11:37:23 -0700 Subject: [PATCH 24/97] fix formatting --- .../foundational/07m-interruptible-aws.py | 38 ++--- .../adapters/services/bedrock_adapter.py | 2 +- src/pipecat/services/aws/llm.py | 134 ++++++++---------- src/pipecat/services/aws/stt.py | 4 +- src/pipecat/services/aws/tts.py | 6 +- 5 files changed, 85 insertions(+), 99 deletions(-) diff --git a/examples/foundational/07m-interruptible-aws.py b/examples/foundational/07m-interruptible-aws.py index d1fae6b5e..ddb8b222e 100644 --- a/examples/foundational/07m-interruptible-aws.py +++ b/examples/foundational/07m-interruptible-aws.py @@ -13,13 +13,13 @@ from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.services.aws.llm import BedrockLLMContext, BedrockLLMService +from pipecat.services.aws.stt import TranscribeSTTService +from 
pipecat.services.aws.tts import PollyTTSService from pipecat.transcriptions.language import Language from pipecat.transports.base_transport import TransportParams from pipecat.transports.network.small_webrtc import SmallWebRTCTransport from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection -from pipecat.services.aws.llm import BedrockLLMService, BedrockLLMContext -from pipecat.services.aws.stt import TranscribeSTTService -from pipecat.services.aws.tts import PollyTTSService load_dotenv(override=True) @@ -42,28 +42,26 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac region="us-west-2", # only specific regions support generative TTS voice_id="Joanna", params=PollyTTSService.InputParams( - engine="generative", - language=Language.EN_US, - rate="1.1" + engine="generative", language=Language.EN_US, rate="1.1" ), ) llm = BedrockLLMService( aws_region="us-west-2", model="us.anthropic.claude-3-5-haiku-20241022-v1:0", - params=BedrockLLMService.InputParams( - temperature=0.8, - latency="optimized" - ) + params=BedrockLLMService.InputParams(temperature=0.8, latency="optimized"), ) messages = [ - { - "role": "system", - "content": [{"text": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way."}], - }, - ] - ) + { + "role": "system", + "content": [ + { + "text": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way." 
+ } + ], + }, + ] context = BedrockLLMContext(messages) context_aggregator = llm.create_context_aggregator(context) @@ -77,8 +75,8 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac tts, # TTS transport.output(), # Transport bot output context_aggregator.assistant(), # Assistant spoken responses - ] - ) + ] + ) task = PipelineTask( pipeline, @@ -94,7 +92,9 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac async def on_client_connected(transport, client): logger.info(f"Client connected") # Kick off the conversation. - messages.append({"role": "user", "content": [{"text": "Please introduce yourself to the user."}]}) + messages.append( + {"role": "user", "content": [{"text": "Please introduce yourself to the user."}]} + ) await task.queue_frames([context_aggregator.user().get_context_frame()]) @transport.event_handler("on_client_disconnected") diff --git a/src/pipecat/adapters/services/bedrock_adapter.py b/src/pipecat/adapters/services/bedrock_adapter.py index 0aba6aba2..b877f01fc 100644 --- a/src/pipecat/adapters/services/bedrock_adapter.py +++ b/src/pipecat/adapters/services/bedrock_adapter.py @@ -24,7 +24,7 @@ class BedrockLLMAdapter(BaseLLMAdapter): "properties": function.properties, "required": function.required, }, - } + }, } } diff --git a/src/pipecat/services/aws/llm.py b/src/pipecat/services/aws/llm.py index 7c3539f7a..3b9c1fedd 100644 --- a/src/pipecat/services/aws/llm.py +++ b/src/pipecat/services/aws/llm.py @@ -135,7 +135,7 @@ class BedrockLLMContext(OpenAILLMContext): """ role = obj.get("role") content = obj.get("content") - + if role == "assistant": if isinstance(content, str): return [{"role": role, "content": [{"type": "text", "text": content}]}] @@ -184,7 +184,7 @@ class BedrockLLMContext(OpenAILLMContext): result_content = json.dumps(content_item["json"]) else: result_content = tool_result["content"] - + tool_items.append( { "role": "tool", @@ -226,26 +226,28 @@ class 
BedrockLLMContext(OpenAILLMContext): if message["role"] == "tool": # Try to parse the content as JSON if it looks like JSON try: - if message["content"].strip().startswith('{') and message["content"].strip().endswith('}'): + if message["content"].strip().startswith("{") and message[ + "content" + ].strip().endswith("}"): content_json = json.loads(message["content"]) tool_result_content = [{"json": content_json}] else: tool_result_content = [{"text": message["content"]}] except: tool_result_content = [{"text": message["content"]}] - + return { "role": "user", "content": [ { "toolResult": { "toolUseId": message["tool_call_id"], - "content": tool_result_content + "content": tool_result_content, }, }, ], } - + if message.get("tool_calls"): tc = message["tool_calls"] ret = {"role": "assistant", "content": []} @@ -261,7 +263,7 @@ class BedrockLLMContext(OpenAILLMContext): } ret["content"].append(new_tool_use) return ret - + # Handle text content content = message.get("content") if isinstance(content, str): @@ -276,7 +278,7 @@ class BedrockLLMContext(OpenAILLMContext): text_content = item["text"] if item["text"] != "" else "(empty)" new_content.append({"text": text_content}) return {"role": message["role"], "content": new_content} - + return message def add_image_frame_message( @@ -287,15 +289,7 @@ class BedrockLLMContext(OpenAILLMContext): encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8") # Image should be the first content block in the message - content = [ - { - "type": "image", - "format": "jpeg", - "source": { - "bytes": encoded_image - } - } - ] + content = [{"type": "image", "format": "jpeg", "source": {"bytes": encoded_image}}] if text: content.append({"text": text}) self.add_message({"role": "user", "content": content}) @@ -309,9 +303,7 @@ class BedrockLLMContext(OpenAILLMContext): # if the last message has just a content string, convert it to a list # in the proper format if isinstance(self.messages[-1]["content"], str): - 
self.messages[-1]["content"] = [ - {"text": self.messages[-1]["content"]} - ] + self.messages[-1]["content"] = [{"text": self.messages[-1]["content"]}] # if this message has just a content string, convert it to a list # in the proper format if isinstance(message["content"], str): @@ -326,7 +318,7 @@ class BedrockLLMContext(OpenAILLMContext): logger.error(f"Error adding message: {e}") def _restructure_from_bedrock_messages(self): - """Restructure messages in Bedrock format by handling system messages, + """Restructure messages in Bedrock format by handling system messages, merging consecutive messages with the same role, and ensuring proper content formatting. """ # Handle system message if present at the beginning @@ -338,7 +330,7 @@ class BedrockLLMContext(OpenAILLMContext): system_content = self.messages.pop(0)["content"] if isinstance(system_content, str): system_content = [{"text": system_content}] - + if self.system: if isinstance(self.system, str): self.system = [{"text": self.system}] @@ -366,7 +358,7 @@ class BedrockLLMContext(OpenAILLMContext): merged_messages[-1]["content"].extend(msg["content"]) else: merged_messages.append(msg) - + self.messages.clear() self.messages.extend(merged_messages) @@ -452,7 +444,7 @@ class BedrockAssistantContextAggregator(LLMAssistantContextAggregator): "toolUse": { "toolUseId": frame.tool_call_id, "name": frame.function_name, - "input": frame.arguments if frame.arguments else {} + "input": frame.arguments if frame.arguments else {}, } } ], @@ -465,11 +457,7 @@ class BedrockAssistantContextAggregator(LLMAssistantContextAggregator): { "toolResult": { "toolUseId": frame.tool_call_id, - "content": [ - { - "text": "IN_PROGRESS" - } - ], + "content": [{"text": "IN_PROGRESS"}], } } ], @@ -517,9 +505,10 @@ class BedrockAssistantContextAggregator(LLMAssistantContextAggregator): class BedrockLLMService(LLMService): """This class implements inference with AWS Bedrock models including Amazon Nova and Anthropic Claude. 
- + Requires AWS credentials to be configured in the environment or through boto3 configuration. """ + class InputParams(BaseModel): max_tokens: Optional[int] = Field(default_factory=lambda: 4096, ge=1) temperature: Optional[float] = Field(default_factory=lambda: 0.7, ge=0.0, le=1.0) @@ -541,34 +530,33 @@ class BedrockLLMService(LLMService): **kwargs, ): super().__init__(**kwargs) - + # Initialize the Bedrock client if not client_config: client_config = Config( connect_timeout=300, # 5 minutes - read_timeout=300, # 5 minutes - retries={'max_attempts': 3} + read_timeout=300, # 5 minutes + retries={"max_attempts": 3}, ) session = boto3.Session( aws_access_key_id=aws_access_key, aws_secret_access_key=aws_secret_key, aws_session_token=aws_session_token, - region_name=aws_region + region_name=aws_region, ) - self._client = session.client( - service_name='bedrock-runtime', - config=client_config - ) - + self._client = session.client(service_name="bedrock-runtime", config=client_config) + self.set_model_name(model) self._settings = { "max_tokens": params.max_tokens, "temperature": params.temperature, "top_p": params.top_p, "latency": params.latency, - "additional_model_request_fields": params.additional_model_request_fields if isinstance(params.additional_model_request_fields, dict) else {}, + "additional_model_request_fields": params.additional_model_request_fields + if isinstance(params.additional_model_request_fields, dict) + else {}, } - + logger.info(f"Using AWS Bedrock model: {model}") def can_generate_metrics(self) -> bool: @@ -603,7 +591,7 @@ class BedrockLLMService(LLMService): if isinstance(context, OpenAILLMContext) and not isinstance(context, BedrockLLMContext): context = BedrockLLMContext.from_openai_context(context) - + user = BedrockUserContextAggregator(context, **user_kwargs) assistant = BedrockAssistantContextAggregator(context, **assistant_kwargs) return BedrockContextAggregatorPair(_user=user, _assistant=assistant) @@ -626,31 +614,29 @@ class 
BedrockLLMService(LLMService): # ) await self.start_ttfb_metrics() - + # Set up inference config inference_config = { "maxTokens": self._settings["max_tokens"], "temperature": self._settings["temperature"], "topP": self._settings["top_p"], } - + # Prepare request parameters request_params = { "modelId": self.model_name, "messages": context.messages, "inferenceConfig": inference_config, - "additionalModelRequestFields": self._settings["additional_model_request_fields"] + "additionalModelRequestFields": self._settings["additional_model_request_fields"], } - + # Add system message request_params["system"] = context.system - + # Add tools if present if context.tools: - tool_config = { - "tools": context.tools - } - + tool_config = {"tools": context.tools} + # Add tool_choice if specified if context.tool_choice: if context.tool_choice == "auto": @@ -658,32 +644,30 @@ class BedrockLLMService(LLMService): elif context.tool_choice == "none": # Skip adding toolChoice for "none" pass - elif isinstance(context.tool_choice, dict) and "function" in context.tool_choice: + elif ( + isinstance(context.tool_choice, dict) and "function" in context.tool_choice + ): tool_config["toolChoice"] = { - "tool": { - "name": context.tool_choice["function"]["name"] - } + "tool": {"name": context.tool_choice["function"]["name"]} } - + request_params["toolConfig"] = tool_config - + # Add performance config if latency is specified if self._settings["latency"] in ["standard", "optimized"]: - request_params["performanceConfig"] = { - "latency": self._settings["latency"] - } - + request_params["performanceConfig"] = {"latency": self._settings["latency"]} + logger.debug(f"Calling Bedrock model with: {request_params}") - + # Call Bedrock with streaming response = self._client.converse_stream(**request_params) - + await self.stop_ttfb_metrics() - + # Process the streaming response tool_use_block = None json_accumulator = "" - + for event in response["stream"]: # Handle text content if 
"contentBlockDelta" in event: @@ -694,18 +678,20 @@ class BedrockLLMService(LLMService): elif "toolUse" in delta and "input" in delta["toolUse"]: # Handle partial JSON for tool use json_accumulator += delta["toolUse"]["input"] - completion_tokens_estimate += self._estimate_tokens(delta["toolUse"]["input"]) - + completion_tokens_estimate += self._estimate_tokens( + delta["toolUse"]["input"] + ) + # Handle tool use start elif "contentBlockStart" in event: - content_block_start = event["contentBlockStart"]['start'] + content_block_start = event["contentBlockStart"]["start"] if "toolUse" in content_block_start: tool_use_block = { "id": content_block_start["toolUse"].get("toolUseId", ""), - "name": content_block_start["toolUse"].get("name", "") + "name": content_block_start["toolUse"].get("name", ""), } json_accumulator = "" - + # Handle message completion with tool use elif "messageStop" in event and "stopReason" in event["messageStop"]: if event["messageStop"]["stopReason"] == "tool_use" and tool_use_block: @@ -719,7 +705,7 @@ class BedrockLLMService(LLMService): ) except json.JSONDecodeError: logger.error(f"Failed to parse tool arguments: {json_accumulator}") - + # Handle usage metrics if available if "metadata" in event and "usage" in event["metadata"]: usage = event["metadata"]["usage"] @@ -750,7 +736,7 @@ class BedrockLLMService(LLMService): prompt_tokens=prompt_tokens, completion_tokens=comp_tokens, cache_read_input_tokens=cache_read_input_tokens, - cache_creation_input_tokens=cache_creation_input_tokens + cache_creation_input_tokens=cache_creation_input_tokens, ) async def process_frame(self, frame: Frame, direction: FrameDirection): @@ -783,7 +769,7 @@ class BedrockLLMService(LLMService): prompt_tokens: int, completion_tokens: int, cache_read_input_tokens: int, - cache_creation_input_tokens: int + cache_creation_input_tokens: int, ): if prompt_tokens or completion_tokens: tokens = LLMTokenUsage( @@ -791,6 +777,6 @@ class BedrockLLMService(LLMService): 
completion_tokens=completion_tokens, total_tokens=prompt_tokens + completion_tokens, cache_read_input_tokens=cache_read_input_tokens, - cache_creation_input_tokens=cache_creation_input_tokens + cache_creation_input_tokens=cache_creation_input_tokens, ) await self.start_llm_usage_metrics(tokens) diff --git a/src/pipecat/services/aws/stt.py b/src/pipecat/services/aws/stt.py index 08d74d484..d749eff0c 100644 --- a/src/pipecat/services/aws/stt.py +++ b/src/pipecat/services/aws/stt.py @@ -19,7 +19,7 @@ from pipecat.frames.frames import ( Frame, TranscriptionFrame, InterimTranscriptionFrame, - StartFrame + StartFrame, ) from pipecat.services.ai_services import STTService from pipecat.transcriptions.language import Language @@ -597,4 +597,4 @@ class TranscribeSTTService(STTService): except Exception as e: logger.error(f"Unexpected error in receive loop: {e}") finally: - logger.debug("Receive loop ended") \ No newline at end of file + logger.debug("Receive loop ended") diff --git a/src/pipecat/services/aws/tts.py b/src/pipecat/services/aws/tts.py index ed1230dd7..d61f74ab2 100644 --- a/src/pipecat/services/aws/tts.py +++ b/src/pipecat/services/aws/tts.py @@ -17,7 +17,7 @@ from pipecat.frames.frames import ( Frame, TTSAudioRawFrame, TTSStartedFrame, - TTSStoppedFrame + TTSStoppedFrame, ) from pipecat.services.ai_services import TTSService from pipecat.transcriptions.language import Language @@ -187,7 +187,7 @@ class PollyTTSService(TTSService): if self._settings["engine"] == "standard": if self._settings["pitch"]: prosody_attrs.append(f"pitch='{self._settings['pitch']}'") - + if self._settings["rate"]: prosody_attrs.append(f"rate='{self._settings['rate']}'") if self._settings["volume"]: @@ -195,7 +195,7 @@ class PollyTTSService(TTSService): # logger.warning("Prosody tags are not supported for generative engine. 
Ignoring.") if prosody_attrs: - ssml += f"" + ssml += f"" ssml += text From b4de98cfb756beb4b7540b57f4d7cfd6efd16315 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 6 May 2025 13:42:18 -0700 Subject: [PATCH 25/97] AWS: various cleanups (logs, imports...) --- .../foundational/07m-interruptible-aws.py | 19 +- pyproject.toml | 2 +- .../adapters/services/bedrock_adapter.py | 2 +- src/pipecat/services/aws/__init__.py | 4 +- src/pipecat/services/aws/llm.py | 43 +- src/pipecat/services/aws/stt.py | 465 ++++-------------- src/pipecat/services/aws/tts.py | 10 +- 7 files changed, 140 insertions(+), 405 deletions(-) diff --git a/examples/foundational/07m-interruptible-aws.py b/examples/foundational/07m-interruptible-aws.py index ddb8b222e..c88439c62 100644 --- a/examples/foundational/07m-interruptible-aws.py +++ b/examples/foundational/07m-interruptible-aws.py @@ -13,7 +13,8 @@ from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask -from pipecat.services.aws.llm import BedrockLLMContext, BedrockLLMService +from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext +from pipecat.services.aws.llm import BedrockLLMService from pipecat.services.aws.stt import TranscribeSTTService from pipecat.services.aws.tts import PollyTTSService from pipecat.transcriptions.language import Language @@ -55,15 +56,11 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac messages = [ { "role": "system", - "content": [ - { - "text": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way." - } - ], + "content": "You are a helpful LLM in a WebRTC call. 
Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.", }, ] - context = BedrockLLMContext(messages) + context = OpenAILLMContext(messages) context_aggregator = llm.create_context_aggregator(context) pipeline = Pipeline( @@ -92,14 +89,16 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac async def on_client_connected(transport, client): logger.info(f"Client connected") # Kick off the conversation. - messages.append( - {"role": "user", "content": [{"text": "Please introduce yourself to the user."}]} - ) + messages.append({"role": "user", "content": "Please introduce yourself to the user."}) await task.queue_frames([context_aggregator.user().get_context_frame()]) @transport.event_handler("on_client_disconnected") async def on_client_disconnected(transport, client): logger.info(f"Client disconnected") + + @transport.event_handler("on_client_closed") + async def on_client_closed(transport, client): + logger.info(f"Client closed connection") await task.cancel() runner = PipelineRunner(handle_sigint=False) diff --git a/pyproject.toml b/pyproject.toml index 910c8d066..13305933b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,7 @@ Website = "https://pipecat.ai" [project.optional-dependencies] anthropic = [ "anthropic~=0.49.0" ] assemblyai = [ "assemblyai~=0.37.0" ] -aws = [ "boto3~=1.37.16" ] +aws = [ "boto3~=1.37.16", "websockets~=13.1" ] azure = [ "azure-cognitiveservices-speech~=1.42.0"] cartesia = [ "cartesia~=1.4.0", "websockets~=13.1" ] cerebras = [] diff --git a/src/pipecat/adapters/services/bedrock_adapter.py b/src/pipecat/adapters/services/bedrock_adapter.py index b877f01fc..cfb2a5f27 100644 --- a/src/pipecat/adapters/services/bedrock_adapter.py +++ b/src/pipecat/adapters/services/bedrock_adapter.py @@ -4,7 +4,7 @@ # SPDX-License-Identifier: BSD 2-Clause 
License # -from typing import Any, Dict, List, Union +from typing import Any, Dict, List from pipecat.adapters.base_llm_adapter import BaseLLMAdapter from pipecat.adapters.schemas.function_schema import FunctionSchema diff --git a/src/pipecat/services/aws/__init__.py b/src/pipecat/services/aws/__init__.py index b36c88499..b1f157bd3 100644 --- a/src/pipecat/services/aws/__init__.py +++ b/src/pipecat/services/aws/__init__.py @@ -8,6 +8,8 @@ import sys from pipecat.services import DeprecatedModuleProxy +from .llm import * +from .stt import * from .tts import * -sys.modules[__name__] = DeprecatedModuleProxy(globals(), "aws", "aws.tts") +sys.modules[__name__] = DeprecatedModuleProxy(globals(), "aws", "aws.[llm,stt,tts]") diff --git a/src/pipecat/services/aws/llm.py b/src/pipecat/services/aws/llm.py index 3b9c1fedd..63b0964c2 100644 --- a/src/pipecat/services/aws/llm.py +++ b/src/pipecat/services/aws/llm.py @@ -11,16 +11,12 @@ import io import json import re from dataclasses import dataclass -from typing import Any, Dict, List, Mapping, Optional, Union +from typing import Any, Dict, List, Optional -import boto3 -from botocore.config import Config -import httpx from loguru import logger from PIL import Image from pydantic import BaseModel, Field -from pipecat.adapters.services.anthropic_adapter import AnthropicLLMAdapter from pipecat.frames.frames import ( Frame, FunctionCallCancelFrame, @@ -36,7 +32,9 @@ from pipecat.frames.frames import ( ) from pipecat.metrics.metrics import LLMTokenUsage from pipecat.processors.aggregators.llm_response import ( + LLMAssistantAggregatorParams, LLMAssistantContextAggregator, + LLMUserAggregatorParams, LLMUserContextAggregator, ) from pipecat.processors.aggregators.openai_llm_context import ( @@ -44,7 +42,18 @@ from pipecat.processors.aggregators.openai_llm_context import ( OpenAILLMContextFrame, ) from pipecat.processors.frame_processor import FrameDirection -from pipecat.services.ai_services import LLMService +from 
pipecat.services.llm_service import LLMService + +try: + import boto3 + import httpx + from botocore.config import Config +except ModuleNotFoundError as e: + logger.error(f"Exception: {e}") + logger.error( + "In order to use AWS services, you need to `pip install pipecat-ai[aws]`. Also, remember to set `AWS_SECRET_ACCESS_KEY`, `AWS_ACCESS_KEY_ID`, and `AWS_REGION` environment variable." + ) + raise Exception(f"Missing module: {e}") @dataclass @@ -564,10 +573,10 @@ class BedrockLLMService(LLMService): def create_context_aggregator( self, - context: BedrockLLMContext, + context: OpenAILLMContext, *, - user_kwargs: Mapping[str, Any] = {}, - assistant_kwargs: Mapping[str, Any] = {}, + user_params: LLMUserAggregatorParams = LLMUserAggregatorParams(), + assistant_params: LLMAssistantAggregatorParams = LLMAssistantAggregatorParams(), ) -> BedrockContextAggregatorPair: """Create an instance of BedrockContextAggregatorPair from an OpenAILLMContext. Constructor keyword arguments for both the user and @@ -575,12 +584,10 @@ class BedrockLLMService(LLMService): Args: context (OpenAILLMContext): The LLM context. - user_kwargs (Mapping[str, Any], optional): Additional keyword - arguments for the user context aggregator constructor. Defaults - to an empty mapping. - assistant_kwargs (Mapping[str, Any], optional): Additional keyword - arguments for the assistant context aggregator - constructor. Defaults to an empty mapping. + user_params (LLMUserAggregatorParams, optional): User aggregator + parameters. + assistant_params (LLMAssistantAggregatorParams, optional): User + aggregator parameters. 
Returns: BedrockContextAggregatorPair: A pair of context aggregators, one @@ -589,11 +596,11 @@ class BedrockLLMService(LLMService): """ context.set_llm_adapter(self.get_llm_adapter()) - if isinstance(context, OpenAILLMContext) and not isinstance(context, BedrockLLMContext): + if isinstance(context, OpenAILLMContext): context = BedrockLLMContext.from_openai_context(context) - user = BedrockUserContextAggregator(context, **user_kwargs) - assistant = BedrockAssistantContextAggregator(context, **assistant_kwargs) + user = BedrockUserContextAggregator(context, params=user_params) + assistant = BedrockAssistantContextAggregator(context, params=assistant_params) return BedrockContextAggregatorPair(_user=user, _assistant=assistant) async def _process_context(self, context: BedrockLLMContext): diff --git a/src/pipecat/services/aws/stt.py b/src/pipecat/services/aws/stt.py index d749eff0c..0468ab31b 100644 --- a/src/pipecat/services/aws/stt.py +++ b/src/pipecat/services/aws/stt.py @@ -1,289 +1,40 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + import asyncio -from typing import AsyncGenerator, Optional, Dict -import os -import datetime -from urllib.parse import urlencode import json -import struct -import urllib.parse -import hashlib -import hmac +import os import random import string -import binascii +from typing import AsyncGenerator, Optional from loguru import logger from pipecat.frames.frames import ( + CancelFrame, + EndFrame, ErrorFrame, Frame, - TranscriptionFrame, InterimTranscriptionFrame, StartFrame, + TranscriptionFrame, ) -from pipecat.services.ai_services import STTService +from pipecat.services.aws.utils import build_event_message, decode_event, get_presigned_url +from pipecat.services.stt_service import STTService from pipecat.transcriptions.language import Language from pipecat.utils.time import time_now_iso8601 try: - import boto3 - from botocore.exceptions import BotoCoreError, ClientError import websockets 
except ModuleNotFoundError as e: logger.error(f"Exception: {e}") - logger.error( - "In order to use AWS services, you need to `pip install pipecat-ai[aws]`. Also, remember to set `AWS_SECRET_ACCESS_KEY`, `AWS_ACCESS_KEY_ID`, and `AWS_REGION` environment variable." - ) + logger.error("In order to use AWS services, you need to `pip install pipecat-ai[aws]`.") raise Exception(f"Missing module: {e}") -def get_presigned_url( - *, - region: str, - credentials: Dict[str, Optional[str]], - language_code: str, - media_encoding: str = "pcm", - sample_rate: int = 16000, - number_of_channels: int = 1, - enable_partial_results_stabilization: bool = True, - partial_results_stability: str = "high", - vocabulary_name: Optional[str] = None, - vocabulary_filter_name: Optional[str] = None, - show_speaker_label: bool = False, - enable_channel_identification: bool = False, -) -> str: - """Create a presigned URL for AWS Transcribe streaming.""" - access_key = credentials.get("access_key") - secret_key = credentials.get("secret_key") - session_token = credentials.get("session_token") - - if not access_key or not secret_key: - raise ValueError("AWS credentials are required") - - # Initialize the URL generator - url_generator = AWSTranscribePresignedURL( - access_key=access_key, secret_key=secret_key, session_token=session_token, region=region - ) - - # Get the presigned URL - return url_generator.get_request_url( - sample_rate=sample_rate, - language_code=language_code, - media_encoding=media_encoding, - vocabulary_name=vocabulary_name, - vocabulary_filter_name=vocabulary_filter_name, - show_speaker_label=show_speaker_label, - enable_channel_identification=enable_channel_identification, - number_of_channels=number_of_channels, - enable_partial_results_stabilization=enable_partial_results_stabilization, - partial_results_stability=partial_results_stability, - ) - - -class AWSTranscribePresignedURL: - def __init__( - self, access_key: str, secret_key: str, session_token: str, region: str = 
"us-east-1" - ): - self.access_key = access_key - self.secret_key = secret_key - self.session_token = session_token - self.method = "GET" - self.service = "transcribe" - self.region = region - self.endpoint = "" - self.host = "" - self.amz_date = "" - self.datestamp = "" - self.canonical_uri = "/stream-transcription-websocket" - self.canonical_headers = "" - self.signed_headers = "host" - self.algorithm = "AWS4-HMAC-SHA256" - self.credential_scope = "" - self.canonical_querystring = "" - self.payload_hash = "" - self.canonical_request = "" - self.string_to_sign = "" - self.signature = "" - self.request_url = "" - - def get_request_url( - self, - sample_rate: int, - language_code: str = "", - media_encoding: str = "pcm", - vocabulary_name: str = "", - vocabulary_filter_name: str = "", - show_speaker_label: bool = False, - enable_channel_identification: bool = False, - number_of_channels: int = 1, - enable_partial_results_stabilization: bool = False, - partial_results_stability: str = "", - ) -> str: - self.endpoint = f"wss://transcribestreaming.{self.region}.amazonaws.com:8443" - self.host = f"transcribestreaming.{self.region}.amazonaws.com:8443" - - now = datetime.datetime.utcnow() - self.amz_date = now.strftime("%Y%m%dT%H%M%SZ") - self.datestamp = now.strftime("%Y%m%d") - self.canonical_headers = f"host:{self.host}\n" - self.credential_scope = f"{self.datestamp}%2F{self.region}%2F{self.service}%2Faws4_request" - - # Create canonical querystring - self.canonical_querystring = "X-Amz-Algorithm=" + self.algorithm - self.canonical_querystring += ( - "&X-Amz-Credential=" + self.access_key + "%2F" + self.credential_scope - ) - self.canonical_querystring += "&X-Amz-Date=" + self.amz_date - self.canonical_querystring += "&X-Amz-Expires=300" - if self.session_token: - self.canonical_querystring += "&X-Amz-Security-Token=" + urllib.parse.quote( - self.session_token, safe="" - ) - self.canonical_querystring += "&X-Amz-SignedHeaders=" + self.signed_headers - - if 
enable_channel_identification: - self.canonical_querystring += "&enable-channel-identification=true" - if enable_partial_results_stabilization: - self.canonical_querystring += "&enable-partial-results-stabilization=true" - if language_code: - self.canonical_querystring += "&language-code=" + language_code - if media_encoding: - self.canonical_querystring += "&media-encoding=" + media_encoding - if number_of_channels > 1: - self.canonical_querystring += "&number-of-channels=" + str(number_of_channels) - if partial_results_stability: - self.canonical_querystring += "&partial-results-stability=" + partial_results_stability - if sample_rate: - self.canonical_querystring += "&sample-rate=" + str(sample_rate) - if show_speaker_label: - self.canonical_querystring += "&show-speaker-label=true" - if vocabulary_filter_name: - self.canonical_querystring += "&vocabulary-filter-name=" + vocabulary_filter_name - if vocabulary_name: - self.canonical_querystring += "&vocabulary-name=" + vocabulary_name - - # Create payload hash - self.payload_hash = hashlib.sha256("".encode("utf-8")).hexdigest() - - # Create canonical request - self.canonical_request = f"{self.method}\n{self.canonical_uri}\n{self.canonical_querystring}\n{self.canonical_headers}\n{self.signed_headers}\n{self.payload_hash}" - - # Create string to sign - credential_scope = f"{self.datestamp}/{self.region}/{self.service}/aws4_request" - string_to_sign = ( - f"{self.algorithm}\n{self.amz_date}\n{credential_scope}\n" - + hashlib.sha256(self.canonical_request.encode("utf-8")).hexdigest() - ) - - # Calculate signature - k_date = hmac.new( - f"AWS4{self.secret_key}".encode("utf-8"), self.datestamp.encode("utf-8"), hashlib.sha256 - ).digest() - k_region = hmac.new(k_date, self.region.encode("utf-8"), hashlib.sha256).digest() - k_service = hmac.new(k_region, self.service.encode("utf-8"), hashlib.sha256).digest() - k_signing = hmac.new(k_service, b"aws4_request", hashlib.sha256).digest() - self.signature = hmac.new( - 
k_signing, string_to_sign.encode("utf-8"), hashlib.sha256 - ).hexdigest() - - # Add signature to query string - self.canonical_querystring += "&X-Amz-Signature=" + self.signature - - # Create request URL - self.request_url = self.endpoint + self.canonical_uri + "?" + self.canonical_querystring - return self.request_url - - -def get_headers(header_name: str, header_value: str) -> bytearray: - """Build a header following AWS event stream format.""" - name = header_name.encode("utf-8") - name_byte_length = bytes([len(name)]) - value_type = bytes([7]) # 7 represents a string - value = header_value.encode("utf-8") - value_byte_length = struct.pack(">H", len(value)) - - # Construct the header - header_list = bytearray() - header_list.extend(name_byte_length) - header_list.extend(name) - header_list.extend(value_type) - header_list.extend(value_byte_length) - header_list.extend(value) - return header_list - - -def build_event_message(payload: bytes) -> bytes: - """ - Build an event message for AWS Transcribe streaming. 
- Matches AWS sample: https://github.com/aws-samples/amazon-transcribe-streaming-python-websockets/blob/main/eventstream.py - """ - # Build headers - content_type_header = get_headers(":content-type", "application/octet-stream") - event_type_header = get_headers(":event-type", "AudioEvent") - message_type_header = get_headers(":message-type", "event") - - headers = bytearray() - headers.extend(content_type_header) - headers.extend(event_type_header) - headers.extend(message_type_header) - - # Calculate total byte length and headers byte length - # 16 accounts for 8 byte prelude, 2x 4 byte CRCs - total_byte_length = struct.pack(">I", len(headers) + len(payload) + 16) - headers_byte_length = struct.pack(">I", len(headers)) - - # Build the prelude - prelude = bytearray([0] * 8) - prelude[:4] = total_byte_length - prelude[4:] = headers_byte_length - - # Calculate checksum for prelude - prelude_crc = struct.pack(">I", binascii.crc32(prelude) & 0xFFFFFFFF) - - # Construct the message - message_as_list = bytearray() - message_as_list.extend(prelude) - message_as_list.extend(prelude_crc) - message_as_list.extend(headers) - message_as_list.extend(payload) - - # Calculate checksum for message - message = bytes(message_as_list) - message_crc = struct.pack(">I", binascii.crc32(message) & 0xFFFFFFFF) - - # Add message checksum - message_as_list.extend(message_crc) - - return bytes(message_as_list) - - -def decode_event(message): - # Extract the prelude, headers, payload and CRC - prelude = message[:8] - total_length, headers_length = struct.unpack(">II", prelude) - prelude_crc = struct.unpack(">I", message[8:12])[0] - headers = message[12 : 12 + headers_length] - payload = message[12 + headers_length : -4] - message_crc = struct.unpack(">I", message[-4:])[0] - - # Check the CRCs - assert prelude_crc == binascii.crc32(prelude) & 0xFFFFFFFF, "Prelude CRC check failed" - assert message_crc == binascii.crc32(message[:-4]) & 0xFFFFFFFF, "Message CRC check failed" - - # Parse the 
headers - headers_dict = {} - while headers: - name_len = headers[0] - name = headers[1 : 1 + name_len].decode("utf-8") - value_type = headers[1 + name_len] - value_len = struct.unpack(">H", headers[2 + name_len : 4 + name_len])[0] - value = headers[4 + name_len : 4 + name_len + value_len].decode("utf-8") - headers_dict[name] = value - headers = headers[4 + name_len + value_len :] - - return headers_dict, json.loads(payload) - - class TranscribeSTTService(STTService): def __init__( self, @@ -355,17 +106,20 @@ class TranscribeSTTService(STTService): raise RuntimeError("Failed to establish WebSocket connection after multiple attempts") - async def run_stt(self, frame: Frame) -> AsyncGenerator[Frame, None]: + async def stop(self, frame: EndFrame): + await super().stop(frame) + await self._disconnect() + + async def cancel(self, frame: CancelFrame): + await super().cancel(frame) + await self._disconnect() + + async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]: """Process audio data and send to AWS Transcribe""" try: - # Skip if no speech detected - if hasattr(frame, "is_speech") and not frame.is_speech: - logger.debug("Skipping non-speech frame") - return - # Ensure WebSocket is connected if not self._ws_client or not self._ws_client.open: - logger.info("WebSocket not connected, attempting to reconnect...") + logger.debug("WebSocket not connected, attempting to reconnect...") try: await self._connect() except Exception as e: @@ -373,12 +127,8 @@ class TranscribeSTTService(STTService): yield ErrorFrame("Failed to reconnect to AWS Transcribe", fatal=False) return - # Get the audio data - if frame is bytes, use directly, otherwise get audio attribute - audio_data = frame if isinstance(frame, bytes) else frame.audio - # Format the audio data according to AWS event stream format - event_message = build_event_message(audio_data) - # logger.debug(f"Sending audio chunk of size {len(audio_data)} bytes") + event_message = build_event_message(audio) # Send the 
formatted event message try: @@ -402,23 +152,18 @@ class TranscribeSTTService(STTService): async def _connect(self): """Connect to AWS Transcribe with connection state management.""" - if ( - self._ws_client - and self._ws_client.open - and self._receive_task - and not self._receive_task.done() - ): - logger.debug("Already connected") + if self._ws_client and self._ws_client.open and self._receive_task: + logger.debug(f"{self} Already connected") return async with self._connection_lock: if self._connecting: - logger.debug("Connection already in progress") + logger.debug(f"{self} Connection already in progress") return try: self._connecting = True - logger.debug("Starting connection process...") + logger.debug(f"{self} Starting connection process...") if self._ws_client: await self._disconnect() @@ -464,7 +209,7 @@ class TranscribeSTTService(STTService): enable_channel_identification=self._settings["enable_channel_identification"], ) - logger.debug(f"Connecting to WebSocket with URL: {presigned_url[:100]}...") + logger.debug(f"{self} Connecting to WebSocket with URL: {presigned_url[:100]}...") # Connect with the required headers and settings self._ws_client = await websockets.connect( @@ -475,15 +220,16 @@ class TranscribeSTTService(STTService): ping_timeout=None, compression=None, ) - logger.debug("WebSocket connected, starting receive task...") + + logger.debug(f"{self} WebSocket connected, starting receive task...") # Start receive task - self._receive_task = asyncio.create_task(self._receive_loop()) + self._receive_task = self.create_task(self._receive_loop()) - logger.info("Successfully connected to AWS Transcribe") + logger.info(f"{self} Successfully connected to AWS Transcribe") except Exception as e: - logger.error(f"Failed to connect to AWS Transcribe: {e}") + logger.error(f"{self} Failed to connect to AWS Transcribe: {e}") await self._disconnect() raise @@ -493,24 +239,19 @@ class TranscribeSTTService(STTService): async def _disconnect(self): """Disconnect 
from AWS Transcribe.""" if self._receive_task: - self._receive_task.cancel() - try: - await self._receive_task - except asyncio.CancelledError: - pass + await self.cancel_task(self._receive_task) self._receive_task = None - if self._ws_client: - try: - if self._ws_client.open: - # Send end-stream message - end_stream = {"message-type": "event", "event": "end"} - await self._ws_client.send(json.dumps(end_stream)) - await self._ws_client.close() - except Exception as e: - logger.warning(f"Error closing WebSocket connection: {e}") - finally: - self._ws_client = None + try: + if self._ws_client and self._ws_client.open: + # Send end-stream message + end_stream = {"message-type": "event", "event": "end"} + await self._ws_client.send(json.dumps(end_stream)) + await self._ws_client.close() + except Exception as e: + logger.warning(f"{self} Error closing WebSocket connection: {e}") + finally: + self._ws_client = None def language_to_service_language(self, language: Language) -> str | None: """Convert internal language enum to AWS Transcribe language code.""" @@ -529,72 +270,60 @@ class TranscribeSTTService(STTService): async def _receive_loop(self): """Background task to receive and process messages from AWS Transcribe.""" - try: - logger.debug("Receive loop started") - while True: - if not self._ws_client or not self._ws_client.open: - logger.warning("WebSocket closed in receive loop") - break + while True: + if not self._ws_client or not self._ws_client.open: + logger.warning(f"{self} WebSocket closed in receive loop") + break - try: - response = await self._ws_client.recv() - headers, payload = decode_event(response) + try: + response = await self._ws_client.recv() + headers, payload = decode_event(response) - # logger.debug(f"Received message type: {headers.get(':message-type')}") + if headers.get(":message-type") == "event": + # Process transcription results + results = payload.get("Transcript", {}).get("Results", []) + if results: + result = results[0] + alternatives 
= result.get("Alternatives", []) + if alternatives: + transcript = alternatives[0].get("Transcript", "") + is_final = not result.get("IsPartial", True) - if headers.get(":message-type") == "event": - # Process transcription results - results = payload.get("Transcript", {}).get("Results", []) - if results: - result = results[0] - alternatives = result.get("Alternatives", []) - if alternatives: - transcript = alternatives[0].get("Transcript", "") - is_final = not result.get("IsPartial", True) - - if transcript: - await self.stop_ttfb_metrics() - if is_final: - await self.push_frame( - TranscriptionFrame( - transcript, - "", - time_now_iso8601(), - self._settings["language"], - ) + if transcript: + await self.stop_ttfb_metrics() + if is_final: + await self.push_frame( + TranscriptionFrame( + transcript, + "", + time_now_iso8601(), + self._settings["language"], ) - await self.stop_processing_metrics() - else: - await self.push_frame( - InterimTranscriptionFrame( - transcript, - "", - time_now_iso8601(), - self._settings["language"], - ) + ) + await self.stop_processing_metrics() + else: + await self.push_frame( + InterimTranscriptionFrame( + transcript, + "", + time_now_iso8601(), + self._settings["language"], ) - elif headers.get(":message-type") == "exception": - error_msg = payload.get("Message", "Unknown error") - logger.error(f"Exception from AWS: {error_msg}") - await self.push_frame( - ErrorFrame(f"AWS Transcribe error: {error_msg}", fatal=False) - ) - else: - logger.debug(f"Other message type received: {headers}") - logger.debug(f"Payload: {payload}") - - except websockets.exceptions.ConnectionClosed as e: - logger.error( - f"WebSocket connection closed in receive loop with code {e.code}: {e.reason}" + ) + elif headers.get(":message-type") == "exception": + error_msg = payload.get("Message", "Unknown error") + logger.error(f"{self} Exception from AWS: {error_msg}") + await self.push_frame( + ErrorFrame(f"AWS Transcribe error: {error_msg}", fatal=False) ) - 
break - except Exception as e: - logger.error(f"Error in receive loop: {e}") - break - - except asyncio.CancelledError: - logger.debug("Receive loop cancelled") - except Exception as e: - logger.error(f"Unexpected error in receive loop: {e}") - finally: - logger.debug("Receive loop ended") + else: + logger.debug(f"{self} Other message type received: {headers}") + logger.debug(f"{self} Payload: {payload}") + except websockets.exceptions.ConnectionClosed as e: + logger.error( + f"{self} WebSocket connection closed in receive loop with code {e.code}: {e.reason}" + ) + break + except Exception as e: + logger.error(f"{self} Unexpected error in receive loop: {e}") + break diff --git a/src/pipecat/services/aws/tts.py b/src/pipecat/services/aws/tts.py index d61f74ab2..0fdbb8273 100644 --- a/src/pipecat/services/aws/tts.py +++ b/src/pipecat/services/aws/tts.py @@ -5,8 +5,8 @@ # import asyncio -from typing import AsyncGenerator, Optional import os +from typing import AsyncGenerator, Optional from loguru import logger from pydantic import BaseModel @@ -19,7 +19,7 @@ from pipecat.frames.frames import ( TTSStartedFrame, TTSStoppedFrame, ) -from pipecat.services.ai_services import TTSService +from pipecat.services.tts_service import TTSService from pipecat.transcriptions.language import Language try: @@ -27,9 +27,7 @@ try: from botocore.exceptions import BotoCoreError, ClientError except ModuleNotFoundError as e: logger.error(f"Exception: {e}") - logger.error( - "In order to use AWS services, you need to `pip install pipecat-ai[aws]`. Also, remember to set `AWS_SECRET_ACCESS_KEY`, `AWS_ACCESS_KEY_ID`, and `AWS_REGION` environment variable." 
- ) + logger.error("In order to use AWS services, you need to `pip install pipecat-ai[aws]`.") raise Exception(f"Missing module: {e}") @@ -206,7 +204,7 @@ class PollyTTSService(TTSService): ssml += "" - logger.debug(f"SSML: {ssml}") + logger.trace(f"{self} SSML: {ssml}") return ssml From bed2e894a22aa48a56b94ac2473ca6a5f5fa079c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 6 May 2025 13:42:41 -0700 Subject: [PATCH 26/97] BedrockLLMService: pull initial system frame from messages --- src/pipecat/services/aws/llm.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/pipecat/services/aws/llm.py b/src/pipecat/services/aws/llm.py index 63b0964c2..cec0cc2e6 100644 --- a/src/pipecat/services/aws/llm.py +++ b/src/pipecat/services/aws/llm.py @@ -382,15 +382,8 @@ class BedrockLLMContext(OpenAILLMContext): # See if we should pull the system message out of our context.messages list. (For # compatibility with Open AI messages format.) if self.messages and self.messages[0]["role"] == "system": - if len(self.messages) == 1: - # If we have only have a system message in the list, all we can really do - # without introducing too much magic is change the role to "user". - self.messages[0]["role"] = "user" - else: - # If we have more than one message, we'll pull the system message out of the - # list. - self.system = self.messages[0]["content"] - self.messages.pop(0) + self.system = self.messages[0]["content"] + self.messages.pop(0) # Merge consecutive messages with the same role. 
i = 0 From 58de381746594faddbdf7c8b711b9e3b04e0cd36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 6 May 2025 13:44:30 -0700 Subject: [PATCH 27/97] AWS: add missing utils --- src/pipecat/services/aws/utils.py | 261 ++++++++++++++++++++++++++++++ 1 file changed, 261 insertions(+) create mode 100644 src/pipecat/services/aws/utils.py diff --git a/src/pipecat/services/aws/utils.py b/src/pipecat/services/aws/utils.py new file mode 100644 index 000000000..db69456e9 --- /dev/null +++ b/src/pipecat/services/aws/utils.py @@ -0,0 +1,261 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import binascii +import datetime +import hashlib +import hmac +import json +import struct +import urllib.parse +from typing import Dict, Optional + + +def get_presigned_url( + *, + region: str, + credentials: Dict[str, Optional[str]], + language_code: str, + media_encoding: str = "pcm", + sample_rate: int = 16000, + number_of_channels: int = 1, + enable_partial_results_stabilization: bool = True, + partial_results_stability: str = "high", + vocabulary_name: Optional[str] = None, + vocabulary_filter_name: Optional[str] = None, + show_speaker_label: bool = False, + enable_channel_identification: bool = False, +) -> str: + """Create a presigned URL for AWS Transcribe streaming.""" + access_key = credentials.get("access_key") + secret_key = credentials.get("secret_key") + session_token = credentials.get("session_token") + + if not access_key or not secret_key: + raise ValueError("AWS credentials are required") + + # Initialize the URL generator + url_generator = AWSTranscribePresignedURL( + access_key=access_key, secret_key=secret_key, session_token=session_token, region=region + ) + + # Get the presigned URL + return url_generator.get_request_url( + sample_rate=sample_rate, + language_code=language_code, + media_encoding=media_encoding, + vocabulary_name=vocabulary_name, + 
vocabulary_filter_name=vocabulary_filter_name, + show_speaker_label=show_speaker_label, + enable_channel_identification=enable_channel_identification, + number_of_channels=number_of_channels, + enable_partial_results_stabilization=enable_partial_results_stabilization, + partial_results_stability=partial_results_stability, + ) + + +class AWSTranscribePresignedURL: + def __init__( + self, access_key: str, secret_key: str, session_token: str, region: str = "us-east-1" + ): + self.access_key = access_key + self.secret_key = secret_key + self.session_token = session_token + self.method = "GET" + self.service = "transcribe" + self.region = region + self.endpoint = "" + self.host = "" + self.amz_date = "" + self.datestamp = "" + self.canonical_uri = "/stream-transcription-websocket" + self.canonical_headers = "" + self.signed_headers = "host" + self.algorithm = "AWS4-HMAC-SHA256" + self.credential_scope = "" + self.canonical_querystring = "" + self.payload_hash = "" + self.canonical_request = "" + self.string_to_sign = "" + self.signature = "" + self.request_url = "" + + def get_request_url( + self, + sample_rate: int, + language_code: str = "", + media_encoding: str = "pcm", + vocabulary_name: str = "", + vocabulary_filter_name: str = "", + show_speaker_label: bool = False, + enable_channel_identification: bool = False, + number_of_channels: int = 1, + enable_partial_results_stabilization: bool = False, + partial_results_stability: str = "", + ) -> str: + self.endpoint = f"wss://transcribestreaming.{self.region}.amazonaws.com:8443" + self.host = f"transcribestreaming.{self.region}.amazonaws.com:8443" + + now = datetime.datetime.utcnow() + self.amz_date = now.strftime("%Y%m%dT%H%M%SZ") + self.datestamp = now.strftime("%Y%m%d") + self.canonical_headers = f"host:{self.host}\n" + self.credential_scope = f"{self.datestamp}%2F{self.region}%2F{self.service}%2Faws4_request" + + # Create canonical querystring + self.canonical_querystring = "X-Amz-Algorithm=" + self.algorithm + 
self.canonical_querystring += ( + "&X-Amz-Credential=" + self.access_key + "%2F" + self.credential_scope + ) + self.canonical_querystring += "&X-Amz-Date=" + self.amz_date + self.canonical_querystring += "&X-Amz-Expires=300" + if self.session_token: + self.canonical_querystring += "&X-Amz-Security-Token=" + urllib.parse.quote( + self.session_token, safe="" + ) + self.canonical_querystring += "&X-Amz-SignedHeaders=" + self.signed_headers + + if enable_channel_identification: + self.canonical_querystring += "&enable-channel-identification=true" + if enable_partial_results_stabilization: + self.canonical_querystring += "&enable-partial-results-stabilization=true" + if language_code: + self.canonical_querystring += "&language-code=" + language_code + if media_encoding: + self.canonical_querystring += "&media-encoding=" + media_encoding + if number_of_channels > 1: + self.canonical_querystring += "&number-of-channels=" + str(number_of_channels) + if partial_results_stability: + self.canonical_querystring += "&partial-results-stability=" + partial_results_stability + if sample_rate: + self.canonical_querystring += "&sample-rate=" + str(sample_rate) + if show_speaker_label: + self.canonical_querystring += "&show-speaker-label=true" + if vocabulary_filter_name: + self.canonical_querystring += "&vocabulary-filter-name=" + vocabulary_filter_name + if vocabulary_name: + self.canonical_querystring += "&vocabulary-name=" + vocabulary_name + + # Create payload hash + self.payload_hash = hashlib.sha256("".encode("utf-8")).hexdigest() + + # Create canonical request + self.canonical_request = f"{self.method}\n{self.canonical_uri}\n{self.canonical_querystring}\n{self.canonical_headers}\n{self.signed_headers}\n{self.payload_hash}" + + # Create string to sign + credential_scope = f"{self.datestamp}/{self.region}/{self.service}/aws4_request" + string_to_sign = ( + f"{self.algorithm}\n{self.amz_date}\n{credential_scope}\n" + + 
hashlib.sha256(self.canonical_request.encode("utf-8")).hexdigest() + ) + + # Calculate signature + k_date = hmac.new( + f"AWS4{self.secret_key}".encode("utf-8"), self.datestamp.encode("utf-8"), hashlib.sha256 + ).digest() + k_region = hmac.new(k_date, self.region.encode("utf-8"), hashlib.sha256).digest() + k_service = hmac.new(k_region, self.service.encode("utf-8"), hashlib.sha256).digest() + k_signing = hmac.new(k_service, b"aws4_request", hashlib.sha256).digest() + self.signature = hmac.new( + k_signing, string_to_sign.encode("utf-8"), hashlib.sha256 + ).hexdigest() + + # Add signature to query string + self.canonical_querystring += "&X-Amz-Signature=" + self.signature + + # Create request URL + self.request_url = self.endpoint + self.canonical_uri + "?" + self.canonical_querystring + return self.request_url + + +def get_headers(header_name: str, header_value: str) -> bytearray: + """Build a header following AWS event stream format.""" + name = header_name.encode("utf-8") + name_byte_length = bytes([len(name)]) + value_type = bytes([7]) # 7 represents a string + value = header_value.encode("utf-8") + value_byte_length = struct.pack(">H", len(value)) + + # Construct the header + header_list = bytearray() + header_list.extend(name_byte_length) + header_list.extend(name) + header_list.extend(value_type) + header_list.extend(value_byte_length) + header_list.extend(value) + return header_list + + +def build_event_message(payload: bytes) -> bytes: + """ + Build an event message for AWS Transcribe streaming. 
+ Matches AWS sample: https://github.com/aws-samples/amazon-transcribe-streaming-python-websockets/blob/main/eventstream.py + """ + # Build headers + content_type_header = get_headers(":content-type", "application/octet-stream") + event_type_header = get_headers(":event-type", "AudioEvent") + message_type_header = get_headers(":message-type", "event") + + headers = bytearray() + headers.extend(content_type_header) + headers.extend(event_type_header) + headers.extend(message_type_header) + + # Calculate total byte length and headers byte length + # 16 accounts for 8 byte prelude, 2x 4 byte CRCs + total_byte_length = struct.pack(">I", len(headers) + len(payload) + 16) + headers_byte_length = struct.pack(">I", len(headers)) + + # Build the prelude + prelude = bytearray([0] * 8) + prelude[:4] = total_byte_length + prelude[4:] = headers_byte_length + + # Calculate checksum for prelude + prelude_crc = struct.pack(">I", binascii.crc32(prelude) & 0xFFFFFFFF) + + # Construct the message + message_as_list = bytearray() + message_as_list.extend(prelude) + message_as_list.extend(prelude_crc) + message_as_list.extend(headers) + message_as_list.extend(payload) + + # Calculate checksum for message + message = bytes(message_as_list) + message_crc = struct.pack(">I", binascii.crc32(message) & 0xFFFFFFFF) + + # Add message checksum + message_as_list.extend(message_crc) + + return bytes(message_as_list) + + +def decode_event(message): + # Extract the prelude, headers, payload and CRC + prelude = message[:8] + total_length, headers_length = struct.unpack(">II", prelude) + prelude_crc = struct.unpack(">I", message[8:12])[0] + headers = message[12 : 12 + headers_length] + payload = message[12 + headers_length : -4] + message_crc = struct.unpack(">I", message[-4:])[0] + + # Check the CRCs + assert prelude_crc == binascii.crc32(prelude) & 0xFFFFFFFF, "Prelude CRC check failed" + assert message_crc == binascii.crc32(message[:-4]) & 0xFFFFFFFF, "Message CRC check failed" + + # Parse the 
headers + headers_dict = {} + while headers: + name_len = headers[0] + name = headers[1 : 1 + name_len].decode("utf-8") + value_type = headers[1 + name_len] + value_len = struct.unpack(">H", headers[2 + name_len : 4 + name_len])[0] + value = headers[4 + name_len : 4 + name_len + value_len].decode("utf-8") + headers_dict[name] = value + headers = headers[4 + name_len + value_len :] + + return headers_dict, json.loads(payload) From ce1a72850bee417ceaea8f6796ed0ba99dabc601 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 6 May 2025 14:51:21 -0700 Subject: [PATCH 28/97] tests: add bedrock context aggregator tests --- test-requirements.txt | 2 +- tests/test_context_aggregators.py | 82 +++++++++++++++++++++++-------- 2 files changed, 63 insertions(+), 21 deletions(-) diff --git a/test-requirements.txt b/test-requirements.txt index b34a53ab9..fec8adf52 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -1 +1 @@ --e ".[anthropic,google,langchain]" +-e ".[anthropic,aws,google,langchain]" diff --git a/tests/test_context_aggregators.py b/tests/test_context_aggregators.py index dfe210e07..cd84d476d 100644 --- a/tests/test_context_aggregators.py +++ b/tests/test_context_aggregators.py @@ -40,6 +40,11 @@ from pipecat.services.anthropic.llm import ( AnthropicLLMContext, AnthropicUserContextAggregator, ) +from pipecat.services.aws.llm import ( + BedrockAssistantContextAggregator, + BedrockLLMContext, + BedrockUserContextAggregator, +) from pipecat.services.google.llm import ( GoogleAssistantContextAggregator, GoogleLLMContext, @@ -669,26 +674,6 @@ class TestLLMUserContextAggregator(BaseTestUserContextAggregator, unittest.Isola AGGREGATOR_CLASS = LLMUserContextAggregator -# -# OpenAI -# - - -class TestOpenAIUserContextAggregator( - BaseTestUserContextAggregator, unittest.IsolatedAsyncioTestCase -): - CONTEXT_CLASS = OpenAILLMContext - AGGREGATOR_CLASS = OpenAIUserContextAggregator - - -class TestOpenAIAssistantContextAggregator( - 
BaseTestAssistantContextAggreagator, unittest.IsolatedAsyncioTestCase -): - CONTEXT_CLASS = OpenAILLMContext - AGGREGATOR_CLASS = OpenAIAssistantContextAggregator - EXPECTED_CONTEXT_FRAMES = [OpenAILLMContextFrame, OpenAILLMContextAssistantTimestampFrame] - - # # Anthropic # @@ -724,6 +709,43 @@ class TestAnthropicAssistantContextAggregator( assert context.messages[index]["content"][0]["content"] == json.dumps(content) +# +# AWS (Bedrock) +# + + +class TestBedrockUserContextAggregator( + BaseTestUserContextAggregator, unittest.IsolatedAsyncioTestCase +): + CONTEXT_CLASS = BedrockLLMContext + AGGREGATOR_CLASS = BedrockUserContextAggregator + + def check_message_multi_content( + self, context: OpenAILLMContext, content_index: int, index: int, content: str + ): + messages = context.messages[content_index] + assert messages["content"][index]["text"] == content + + +class TestBedrockAssistantContextAggregator( + BaseTestAssistantContextAggreagator, unittest.IsolatedAsyncioTestCase +): + CONTEXT_CLASS = BedrockLLMContext + AGGREGATOR_CLASS = BedrockAssistantContextAggregator + EXPECTED_CONTEXT_FRAMES = [OpenAILLMContextFrame, OpenAILLMContextAssistantTimestampFrame] + + def check_message_multi_content( + self, context: OpenAILLMContext, content_index: int, index: int, content: str + ): + messages = context.messages[content_index] + assert messages["content"][index]["text"] == content + + def check_function_call_result(self, context: OpenAILLMContext, index: int, content: Any): + assert context.messages[index]["content"][0]["toolResult"]["content"][0][ + "text" + ] == json.dumps(content) + + # # Google # @@ -766,3 +788,23 @@ class TestGoogleAssistantContextAggregator( def check_function_call_result(self, context: OpenAILLMContext, index: int, content: Any): obj = glm.Content.to_dict(context.messages[index]) assert obj["parts"][0]["function_response"]["response"]["value"] == json.dumps(content) + + +# +# OpenAI +# + + +class TestOpenAIUserContextAggregator( + 
BaseTestUserContextAggregator, unittest.IsolatedAsyncioTestCase +): + CONTEXT_CLASS = OpenAILLMContext + AGGREGATOR_CLASS = OpenAIUserContextAggregator + + +class TestOpenAIAssistantContextAggregator( + BaseTestAssistantContextAggreagator, unittest.IsolatedAsyncioTestCase +): + CONTEXT_CLASS = OpenAILLMContext + AGGREGATOR_CLASS = OpenAIAssistantContextAggregator + EXPECTED_CONTEXT_FRAMES = [OpenAILLMContextFrame, OpenAILLMContextAssistantTimestampFrame] From a8405649d0c617e97f11ea9daceee55132979136 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 6 May 2025 14:52:29 -0700 Subject: [PATCH 29/97] aws: use AWS prefix for all services --- CHANGELOG.md | 4 + .../foundational/07m-interruptible-aws.py | 16 ++-- src/pipecat/services/aws/llm.py | 94 ++++++++++--------- src/pipecat/services/aws/stt.py | 2 +- src/pipecat/services/aws/tts.py | 8 +- tests/test_context_aggregators.py | 18 ++-- 6 files changed, 74 insertions(+), 68 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e330b9c9..2eec61bca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added new AWS services `AWSBedrockLLMService` and `AWSTranscribeSTTService`. + - Added `on_active_speaker_changed` event handler to the `DailyTransport` class. - Added `enable_ssml_parsing` and `enable_logging` to `InputParams` in @@ -25,6 +27,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Deprecated +- `PollyTTSService` is now deprecated, use `AWSPollyTTSService` instead. + - Observer `on_push_frame(src, dst, frame, direction, timestamp)` is now deprecated, use `on_push_frame(data: FramePushed)` instead. 
diff --git a/examples/foundational/07m-interruptible-aws.py b/examples/foundational/07m-interruptible-aws.py index c88439c62..2ccc7b717 100644 --- a/examples/foundational/07m-interruptible-aws.py +++ b/examples/foundational/07m-interruptible-aws.py @@ -14,9 +14,9 @@ from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext -from pipecat.services.aws.llm import BedrockLLMService -from pipecat.services.aws.stt import TranscribeSTTService -from pipecat.services.aws.tts import PollyTTSService +from pipecat.services.aws.llm import AWSBedrockLLMService +from pipecat.services.aws.stt import AWSTranscribeSTTService +from pipecat.services.aws.tts import AWSPollyTTSService from pipecat.transcriptions.language import Language from pipecat.transports.base_transport import TransportParams from pipecat.transports.network.small_webrtc import SmallWebRTCTransport @@ -37,20 +37,20 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac ), ) - stt = TranscribeSTTService() + stt = AWSTranscribeSTTService() - tts = PollyTTSService( + tts = AWSPollyTTSService( region="us-west-2", # only specific regions support generative TTS voice_id="Joanna", - params=PollyTTSService.InputParams( + params=AWSPollyTTSService.InputParams( engine="generative", language=Language.EN_US, rate="1.1" ), ) - llm = BedrockLLMService( + llm = AWSBedrockLLMService( aws_region="us-west-2", model="us.anthropic.claude-3-5-haiku-20241022-v1:0", - params=BedrockLLMService.InputParams(temperature=0.8, latency="optimized"), + params=AWSBedrockLLMService.InputParams(temperature=0.8, latency="optimized"), ) messages = [ diff --git a/src/pipecat/services/aws/llm.py b/src/pipecat/services/aws/llm.py index cec0cc2e6..00a877c0f 100644 --- a/src/pipecat/services/aws/llm.py +++ b/src/pipecat/services/aws/llm.py @@ -57,18 
+57,18 @@ except ModuleNotFoundError as e: @dataclass -class BedrockContextAggregatorPair: - _user: "BedrockUserContextAggregator" - _assistant: "BedrockAssistantContextAggregator" +class AWSBedrockContextAggregatorPair: + _user: "AWSBedrockUserContextAggregator" + _assistant: "AWSBedrockAssistantContextAggregator" - def user(self) -> "BedrockUserContextAggregator": + def user(self) -> "AWSBedrockUserContextAggregator": return self._user - def assistant(self) -> "BedrockAssistantContextAggregator": + def assistant(self) -> "AWSBedrockAssistantContextAggregator": return self._assistant -class BedrockLLMContext(OpenAILLMContext): +class AWSBedrockLLMContext(OpenAILLMContext): def __init__( self, messages: Optional[List[dict]] = None, @@ -81,10 +81,10 @@ class BedrockLLMContext(OpenAILLMContext): self.system = system @staticmethod - def upgrade_to_bedrock(obj: OpenAILLMContext) -> "BedrockLLMContext": - logger.debug(f"Upgrading to Bedrock: {obj}") - if isinstance(obj, OpenAILLMContext) and not isinstance(obj, BedrockLLMContext): - obj.__class__ = BedrockLLMContext + def upgrade_to_bedrock(obj: OpenAILLMContext) -> "AWSBedrockLLMContext": + logger.debug(f"Upgrading to AWS Bedrock: {obj}") + if isinstance(obj, OpenAILLMContext) and not isinstance(obj, AWSBedrockLLMContext): + obj.__class__ = AWSBedrockLLMContext obj._restructure_from_openai_messages() else: obj._restructure_from_bedrock_messages() @@ -103,13 +103,13 @@ class BedrockLLMContext(OpenAILLMContext): return self @classmethod - def from_messages(cls, messages: List[dict]) -> "BedrockLLMContext": + def from_messages(cls, messages: List[dict]) -> "AWSBedrockLLMContext": self = cls(messages=messages) # self._restructure_from_openai_messages() return self @classmethod - def from_image_frame(cls, frame: VisionImageRawFrame) -> "BedrockLLMContext": + def from_image_frame(cls, frame: VisionImageRawFrame) -> "AWSBedrockLLMContext": context = cls() context.add_image_frame_message( format=frame.format, size=frame.size, 
image=frame.image, text=frame.text @@ -120,14 +120,14 @@ class BedrockLLMContext(OpenAILLMContext): self._messages[:] = messages # self._restructure_from_openai_messages() - # convert a message in Bedrock format into one or more messages in OpenAI format + # convert a message in AWS Bedrock format into one or more messages in OpenAI format def to_standard_messages(self, obj): - """Convert Bedrock message format to standard structured format. + """Convert AWS Bedrock message format to standard structured format. Handles text content and function calls for both user and assistant messages. Args: - obj: Message in Bedrock format: + obj: Message in AWS Bedrock format: { "role": "user/assistant", "content": [{"text": str} | {"toolUse": {...}} | {"toolResult": {...}}] @@ -208,7 +208,7 @@ class BedrockLLMContext(OpenAILLMContext): return messages def from_standard_message(self, message): - """Convert standard format message to Bedrock format. + """Convert standard format message to AWS Bedrock format. Handles conversion of text content, tool calls, and tool results. Empty text content is converted to "(empty)". @@ -222,7 +222,7 @@ class BedrockLLMContext(OpenAILLMContext): } Returns: - Message in Bedrock format: + Message in AWS Bedrock format: { "role": "user/assistant", "content": [ @@ -306,8 +306,9 @@ class BedrockLLMContext(OpenAILLMContext): def add_message(self, message): try: if self.messages: - # Bedrock requires that roles alternate. If this message's role is the same as the - # last message, we should add this message's content to the last message. + # AWS Bedrock requires that roles alternate. If this message's + # role is the same as the last message, we should add this + # message's content to the last message. 
if self.messages[-1]["role"] == message["role"]: # if the last message has just a content string, convert it to a list # in the proper format @@ -327,8 +328,10 @@ class BedrockLLMContext(OpenAILLMContext): logger.error(f"Error adding message: {e}") def _restructure_from_bedrock_messages(self): - """Restructure messages in Bedrock format by handling system messages, - merging consecutive messages with the same role, and ensuring proper content formatting. + """Restructure messages in AWS Bedrock format by handling system + messages, merging consecutive messages with the same role, and ensuring + proper content formatting. + """ # Handle system message if present at the beginning logger.debug(f"_restructure_from_bedrock_messages: {self.messages}") @@ -431,13 +434,13 @@ class BedrockLLMContext(OpenAILLMContext): return json.dumps(msgs) -class BedrockUserContextAggregator(LLMUserContextAggregator): +class AWSBedrockUserContextAggregator(LLMUserContextAggregator): pass -class BedrockAssistantContextAggregator(LLMAssistantContextAggregator): +class AWSBedrockAssistantContextAggregator(LLMAssistantContextAggregator): async def handle_function_call_in_progress(self, frame: FunctionCallInProgressFrame): - # Format tool use according to Bedrock API + # Format tool use according to AWS Bedrock API self._context.add_message( { "role": "assistant", @@ -505,10 +508,13 @@ class BedrockAssistantContextAggregator(LLMAssistantContextAggregator): ) -class BedrockLLMService(LLMService): - """This class implements inference with AWS Bedrock models including Amazon Nova and Anthropic Claude. +class AWSBedrockLLMService(LLMService): + """This class implements inference with AWS Bedrock models including Amazon + Nova and Anthropic Claude. + + Requires AWS credentials to be configured in the environment or through + boto3 configuration. - Requires AWS credentials to be configured in the environment or through boto3 configuration. 
""" class InputParams(BaseModel): @@ -533,7 +539,7 @@ class BedrockLLMService(LLMService): ): super().__init__(**kwargs) - # Initialize the Bedrock client + # Initialize the AWS Bedrock client if not client_config: client_config = Config( connect_timeout=300, # 5 minutes @@ -570,8 +576,8 @@ class BedrockLLMService(LLMService): *, user_params: LLMUserAggregatorParams = LLMUserAggregatorParams(), assistant_params: LLMAssistantAggregatorParams = LLMAssistantAggregatorParams(), - ) -> BedrockContextAggregatorPair: - """Create an instance of BedrockContextAggregatorPair from an + ) -> AWSBedrockContextAggregatorPair: + """Create an instance of AWSBedrockContextAggregatorPair from an OpenAILLMContext. Constructor keyword arguments for both the user and assistant aggregators can be provided. @@ -583,20 +589,20 @@ class BedrockLLMService(LLMService): aggregator parameters. Returns: - BedrockContextAggregatorPair: A pair of context aggregators, one + AWSBedrockContextAggregatorPair: A pair of context aggregators, one for the user and one for the assistant, encapsulated in an - BedrockContextAggregatorPair. + AWSBedrockContextAggregatorPair. 
""" context.set_llm_adapter(self.get_llm_adapter()) if isinstance(context, OpenAILLMContext): - context = BedrockLLMContext.from_openai_context(context) + context = AWSBedrockLLMContext.from_openai_context(context) - user = BedrockUserContextAggregator(context, params=user_params) - assistant = BedrockAssistantContextAggregator(context, params=assistant_params) - return BedrockContextAggregatorPair(_user=user, _assistant=assistant) + user = AWSBedrockUserContextAggregator(context, params=user_params) + assistant = AWSBedrockAssistantContextAggregator(context, params=assistant_params) + return AWSBedrockContextAggregatorPair(_user=user, _assistant=assistant) - async def _process_context(self, context: BedrockLLMContext): + async def _process_context(self, context: AWSBedrockLLMContext): # Usage tracking prompt_tokens = 0 completion_tokens = 0 @@ -609,10 +615,6 @@ class BedrockLLMService(LLMService): await self.push_frame(LLMFullResponseStartFrame()) await self.start_processing_metrics() - # logger.debug( - # f"{self}: Generating chat with Bedrock model {self.model_name} | [{context.get_messages_for_logging()}]" - # ) - await self.start_ttfb_metrics() # Set up inference config @@ -657,9 +659,9 @@ class BedrockLLMService(LLMService): if self._settings["latency"] in ["standard", "optimized"]: request_params["performanceConfig"] = {"latency": self._settings["latency"]} - logger.debug(f"Calling Bedrock model with: {request_params}") + logger.debug(f"Calling AWS Bedrock model with: {request_params}") - # Call Bedrock with streaming + # Call AWS Bedrock with streaming response = self._client.converse_stream(**request_params) await self.stop_ttfb_metrics() @@ -744,15 +746,15 @@ class BedrockLLMService(LLMService): context = None if isinstance(frame, OpenAILLMContextFrame): - context = BedrockLLMContext.upgrade_to_bedrock(frame.context) + context = AWSBedrockLLMContext.upgrade_to_bedrock(frame.context) elif isinstance(frame, LLMMessagesFrame): - context = 
BedrockLLMContext.from_messages(frame.messages) + context = AWSBedrockLLMContext.from_messages(frame.messages) elif isinstance(frame, VisionImageRawFrame): # This is only useful in very simple pipelines because it creates # a new context. Generally we want a context manager to catch # UserImageRawFrames coming through the pipeline and add them # to the context. - context = BedrockLLMContext.from_image_frame(frame) + context = AWSBedrockLLMContext.from_image_frame(frame) elif isinstance(frame, LLMUpdateSettingsFrame): await self._update_settings(frame.settings) else: diff --git a/src/pipecat/services/aws/stt.py b/src/pipecat/services/aws/stt.py index 0468ab31b..a02625f81 100644 --- a/src/pipecat/services/aws/stt.py +++ b/src/pipecat/services/aws/stt.py @@ -35,7 +35,7 @@ except ModuleNotFoundError as e: raise Exception(f"Missing module: {e}") -class TranscribeSTTService(STTService): +class AWSTranscribeSTTService(STTService): def __init__( self, *, diff --git a/src/pipecat/services/aws/tts.py b/src/pipecat/services/aws/tts.py index 0fdbb8273..40d746514 100644 --- a/src/pipecat/services/aws/tts.py +++ b/src/pipecat/services/aws/tts.py @@ -107,7 +107,7 @@ def language_to_aws_language(language: Language) -> Optional[str]: return language_map.get(language) -class PollyTTSService(TTSService): +class AWSPollyTTSService(TTSService): class InputParams(BaseModel): engine: Optional[str] = None language: Optional[Language] = Language.EN @@ -190,7 +190,6 @@ class PollyTTSService(TTSService): prosody_attrs.append(f"rate='{self._settings['rate']}'") if self._settings["volume"]: prosody_attrs.append(f"volume='{self._settings['volume']}'") - # logger.warning("Prosody tags are not supported for generative engine. 
Ignoring.") if prosody_attrs: ssml += f"" @@ -269,7 +268,7 @@ class PollyTTSService(TTSService): yield TTSStoppedFrame() -class AWSTTSService(PollyTTSService): +class PollyTTSService(AWSPollyTTSService): def __init__(self, **kwargs): super().__init__(**kwargs) @@ -278,5 +277,6 @@ class AWSTTSService(PollyTTSService): with warnings.catch_warnings(): warnings.simplefilter("always") warnings.warn( - "'AWSTTSService' is deprecated, use 'PollyTTSService' instead.", DeprecationWarning + "'PollyTTSService' is deprecated, use 'AWSPollyTTSService' instead.", + DeprecationWarning, ) diff --git a/tests/test_context_aggregators.py b/tests/test_context_aggregators.py index cd84d476d..0f68110ce 100644 --- a/tests/test_context_aggregators.py +++ b/tests/test_context_aggregators.py @@ -41,9 +41,9 @@ from pipecat.services.anthropic.llm import ( AnthropicUserContextAggregator, ) from pipecat.services.aws.llm import ( - BedrockAssistantContextAggregator, - BedrockLLMContext, - BedrockUserContextAggregator, + AWSBedrockAssistantContextAggregator, + AWSBedrockLLMContext, + AWSBedrockUserContextAggregator, ) from pipecat.services.google.llm import ( GoogleAssistantContextAggregator, @@ -714,11 +714,11 @@ class TestAnthropicAssistantContextAggregator( # -class TestBedrockUserContextAggregator( +class TestAWSBedrockUserContextAggregator( BaseTestUserContextAggregator, unittest.IsolatedAsyncioTestCase ): - CONTEXT_CLASS = BedrockLLMContext - AGGREGATOR_CLASS = BedrockUserContextAggregator + CONTEXT_CLASS = AWSBedrockLLMContext + AGGREGATOR_CLASS = AWSBedrockUserContextAggregator def check_message_multi_content( self, context: OpenAILLMContext, content_index: int, index: int, content: str @@ -727,11 +727,11 @@ class TestBedrockUserContextAggregator( assert messages["content"][index]["text"] == content -class TestBedrockAssistantContextAggregator( +class TestAWSBedrockAssistantContextAggregator( BaseTestAssistantContextAggreagator, unittest.IsolatedAsyncioTestCase ): - CONTEXT_CLASS = 
BedrockLLMContext - AGGREGATOR_CLASS = BedrockAssistantContextAggregator + CONTEXT_CLASS = AWSBedrockLLMContext + AGGREGATOR_CLASS = AWSBedrockAssistantContextAggregator EXPECTED_CONTEXT_FRAMES = [OpenAILLMContextFrame, OpenAILLMContextAssistantTimestampFrame] def check_message_multi_content( From 458549f7df9b8e3e1bf9474cdf4aeb5b65560cc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 6 May 2025 21:07:09 -0700 Subject: [PATCH 30/97] AWSBedrockLLMService: fix function calling --- .../foundational/07m-interruptible-aws.py | 5 +- .../foundational/14r-function-calling-aws.py | 139 ++++++++++++++++++ .../adapters/services/anthropic_adapter.py | 2 +- .../adapters/services/bedrock_adapter.py | 2 +- src/pipecat/services/aws/llm.py | 11 +- tests/test_function_calling_adapters.py | 30 ++++ 6 files changed, 178 insertions(+), 11 deletions(-) create mode 100644 examples/foundational/14r-function-calling-aws.py diff --git a/examples/foundational/07m-interruptible-aws.py b/examples/foundational/07m-interruptible-aws.py index 2ccc7b717..bbcfe7313 100644 --- a/examples/foundational/07m-interruptible-aws.py +++ b/examples/foundational/07m-interruptible-aws.py @@ -17,7 +17,6 @@ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext from pipecat.services.aws.llm import AWSBedrockLLMService from pipecat.services.aws.stt import AWSTranscribeSTTService from pipecat.services.aws.tts import AWSPollyTTSService -from pipecat.transcriptions.language import Language from pipecat.transports.base_transport import TransportParams from pipecat.transports.network.small_webrtc import SmallWebRTCTransport from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection @@ -42,9 +41,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac tts = AWSPollyTTSService( region="us-west-2", # only specific regions support generative TTS voice_id="Joanna", - params=AWSPollyTTSService.InputParams( - 
engine="generative", language=Language.EN_US, rate="1.1" - ), + params=AWSPollyTTSService.InputParams(engine="generative", rate="1.1"), ) llm = AWSBedrockLLMService( diff --git a/examples/foundational/14r-function-calling-aws.py b/examples/foundational/14r-function-calling-aws.py new file mode 100644 index 000000000..cf4859576 --- /dev/null +++ b/examples/foundational/14r-function-calling-aws.py @@ -0,0 +1,139 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import argparse +import os + +from dotenv import load_dotenv +from loguru import logger + +from pipecat.adapters.schemas.function_schema import FunctionSchema +from pipecat.adapters.schemas.tools_schema import ToolsSchema +from pipecat.audio.vad.silero import SileroVADAnalyzer +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext +from pipecat.services.aws.llm import AWSBedrockLLMService +from pipecat.services.aws.stt import AWSTranscribeSTTService +from pipecat.services.aws.tts import AWSPollyTTSService +from pipecat.services.llm_service import FunctionCallParams +from pipecat.transports.base_transport import TransportParams +from pipecat.transports.network.small_webrtc import SmallWebRTCTransport +from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection + +load_dotenv(override=True) + + +async def fetch_weather_from_api(params: FunctionCallParams): + await params.result_callback({"conditions": "nice", "temperature": "75"}) + + +async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespace): + logger.info(f"Starting bot") + + transport = SmallWebRTCTransport( + webrtc_connection=webrtc_connection, + params=TransportParams( + audio_in_enabled=True, + audio_out_enabled=True, + vad_analyzer=SileroVADAnalyzer(), + ), + ) + + stt = 
AWSTranscribeSTTService() + + tts = AWSPollyTTSService( + region="us-west-2", # only specific regions support generative TTS + voice_id="Joanna", + params=AWSPollyTTSService.InputParams(engine="generative", rate="1.1"), + ) + + llm = AWSBedrockLLMService( + aws_region="us-west-2", + model="us.anthropic.claude-3-5-haiku-20241022-v1:0", + params=AWSBedrockLLMService.InputParams(temperature=0.8, latency="optimized"), + ) + + # You can also register a function_name of None to get all functions + # sent to the same callback with an additional function_name parameter. + llm.register_function("get_current_weather", fetch_weather_from_api) + + weather_function = FunctionSchema( + name="get_current_weather", + description="Get the current weather", + properties={ + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA", + }, + "format": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "The temperature unit to use. Infer this from the user's location.", + }, + }, + required=["location", "format"], + ) + tools = ToolsSchema(standard_tools=[weather_function]) + + messages = [ + { + "role": "system", + "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. 
Respond to what the user said in a creative and helpful way.", + }, + ] + + context = OpenAILLMContext(messages, tools) + context_aggregator = llm.create_context_aggregator(context) + + pipeline = Pipeline( + [ + transport.input(), + stt, + context_aggregator.user(), + llm, + tts, + transport.output(), + context_aggregator.assistant(), + ] + ) + + task = PipelineTask( + pipeline, + params=PipelineParams( + allow_interruptions=True, + enable_metrics=True, + enable_usage_metrics=True, + report_only_initial_ttfb=True, + ), + ) + + @transport.event_handler("on_client_connected") + async def on_client_connected(transport, client): + logger.info(f"Client connected") + # Kick off the conversation. + messages.append({"role": "user", "content": "Please introduce yourself to the user."}) + await task.queue_frames([context_aggregator.user().get_context_frame()]) + + @transport.event_handler("on_client_disconnected") + async def on_client_disconnected(transport, client): + logger.info(f"Client disconnected") + + @transport.event_handler("on_client_closed") + async def on_client_closed(transport, client): + logger.info(f"Client closed connection") + await task.cancel() + + runner = PipelineRunner(handle_sigint=False) + + await runner.run(task) + + +if __name__ == "__main__": + from run import main + + main() diff --git a/src/pipecat/adapters/services/anthropic_adapter.py b/src/pipecat/adapters/services/anthropic_adapter.py index a699469d3..23197d3a8 100644 --- a/src/pipecat/adapters/services/anthropic_adapter.py +++ b/src/pipecat/adapters/services/anthropic_adapter.py @@ -4,7 +4,7 @@ # SPDX-License-Identifier: BSD 2-Clause License # -from typing import Any, Dict, List, Union +from typing import Any, Dict, List from pipecat.adapters.base_llm_adapter import BaseLLMAdapter from pipecat.adapters.schemas.function_schema import FunctionSchema diff --git a/src/pipecat/adapters/services/bedrock_adapter.py b/src/pipecat/adapters/services/bedrock_adapter.py index cfb2a5f27..113a6938d 
100644 --- a/src/pipecat/adapters/services/bedrock_adapter.py +++ b/src/pipecat/adapters/services/bedrock_adapter.py @@ -11,7 +11,7 @@ from pipecat.adapters.schemas.function_schema import FunctionSchema from pipecat.adapters.schemas.tools_schema import ToolsSchema -class BedrockLLMAdapter(BaseLLMAdapter): +class AWSBedrockLLMAdapter(BaseLLMAdapter): @staticmethod def _to_bedrock_function_format(function: FunctionSchema) -> Dict[str, Any]: return { diff --git a/src/pipecat/services/aws/llm.py b/src/pipecat/services/aws/llm.py index 00a877c0f..921d3c790 100644 --- a/src/pipecat/services/aws/llm.py +++ b/src/pipecat/services/aws/llm.py @@ -17,6 +17,7 @@ from loguru import logger from PIL import Image from pydantic import BaseModel, Field +from pipecat.adapters.services.bedrock_adapter import AWSBedrockLLMAdapter from pipecat.frames.frames import ( Frame, FunctionCallCancelFrame, @@ -92,7 +93,6 @@ class AWSBedrockLLMContext(OpenAILLMContext): @classmethod def from_openai_context(cls, openai_context: OpenAILLMContext): - logger.debug("from_openai_context called") self = cls( messages=openai_context.messages, tools=openai_context.tools, @@ -105,7 +105,7 @@ class AWSBedrockLLMContext(OpenAILLMContext): @classmethod def from_messages(cls, messages: List[dict]) -> "AWSBedrockLLMContext": self = cls(messages=messages) - # self._restructure_from_openai_messages() + self._restructure_from_openai_messages() return self @classmethod @@ -118,7 +118,7 @@ class AWSBedrockLLMContext(OpenAILLMContext): def set_messages(self, messages: List): self._messages[:] = messages - # self._restructure_from_openai_messages() + self._restructure_from_openai_messages() # convert a message in AWS Bedrock format into one or more messages in OpenAI format def to_standard_messages(self, obj): @@ -334,7 +334,6 @@ class AWSBedrockLLMContext(OpenAILLMContext): """ # Handle system message if present at the beginning - logger.debug(f"_restructure_from_bedrock_messages: {self.messages}") if self.messages 
and self.messages[0]["role"] == "system": if len(self.messages) == 1: self.messages[0]["role"] = "user" @@ -375,7 +374,6 @@ class AWSBedrockLLMContext(OpenAILLMContext): self.messages.extend(merged_messages) def _restructure_from_openai_messages(self): - logger.debug(f"_restructure_from_openai_messages: {self.messages}") # first, map across self._messages calling self.from_standard_message(m) to modify messages in place try: self._messages[:] = [self.from_standard_message(m) for m in self._messages] @@ -517,6 +515,9 @@ class AWSBedrockLLMService(LLMService): """ + # Overriding the default adapter to use the AWS Bedrock one. + adapter_class = AWSBedrockLLMAdapter + class InputParams(BaseModel): max_tokens: Optional[int] = Field(default_factory=lambda: 4096, ge=1) temperature: Optional[float] = Field(default_factory=lambda: 0.7, ge=0.0, le=1.0) diff --git a/tests/test_function_calling_adapters.py b/tests/test_function_calling_adapters.py index 5d6dafce3..83640bb80 100644 --- a/tests/test_function_calling_adapters.py +++ b/tests/test_function_calling_adapters.py @@ -11,6 +11,7 @@ from openai.types.chat import ChatCompletionToolParam from pipecat.adapters.schemas.function_schema import FunctionSchema from pipecat.adapters.schemas.tools_schema import AdapterType, ToolsSchema from pipecat.adapters.services.anthropic_adapter import AnthropicLLMAdapter +from pipecat.adapters.services.bedrock_adapter import AWSBedrockLLMAdapter from pipecat.adapters.services.gemini_adapter import GeminiLLMAdapter from pipecat.adapters.services.open_ai_adapter import OpenAILLMAdapter from pipecat.adapters.services.open_ai_realtime_adapter import OpenAIRealtimeLLMAdapter @@ -174,3 +175,32 @@ class TestFunctionAdapters(unittest.TestCase): tools_def = self.tools_def tools_def.custom_tools = {AdapterType.GEMINI: [search_tool]} assert GeminiLLMAdapter().to_provider_tools_format(tools_def) == expected + + def test_bedrock_adapter(self): + """Test AWS Bedrock adapter format transformation.""" +
expected = [ + { + "toolSpec": { + "name": "get_weather", + "description": "Get the weather in a given location", + "inputSchema": { + "json": { + "type": "object", + "properties": { + "format": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "The temperature unit to use.", + }, + "location": { + "type": "string", + "description": "The city, e.g. San Francisco", + }, + }, + "required": ["location", "format"], + } + }, + } + } + ] + assert AWSBedrockLLMAdapter().to_provider_tools_format(self.tools_def) == expected From 80ef6dc4dec7eb92cf430a8d885fb404e05b1b5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 6 May 2025 21:14:06 -0700 Subject: [PATCH 31/97] update README with AWS Bedrock and Transcribe --- README.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 47be9b6e1..ec3b0a791 100644 --- a/README.md +++ b/README.md @@ -49,18 +49,18 @@ You can connect to Pipecat from any platform using our official SDKs: ## 🧩 Available services -| Category | Services | -| ------------------- | 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) | -| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks 
AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [Together AI](https://docs.pipecat.ai/server/services/llm/together) | -| Text-to-Speech | [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) | -| Speech-to-Speech | [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) | -| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), 
[SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local | -| Video | [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) | -| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) | -| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/fal), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) | -| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [Noisereduce](https://docs.pipecat.ai/server/utilities/audio/noisereduce-filter) | -| Analytics & Metrics | [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) | +| Category | Services | 
+|---------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [Parakeet (NVIDIA)](https://docs.pipecat.ai/server/services/stt/parakeet), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) | +| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), 
[Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [Together AI](https://docs.pipecat.ai/server/services/llm/together) | +| Text-to-Speech | [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [FastPitch (NVIDIA)](https://docs.pipecat.ai/server/services/tts/fastpitch), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) | +| Speech-to-Speech | [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI 
Realtime](https://docs.pipecat.ai/server/services/s2s/openai) | +| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local | +| Video | [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) | +| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) | +| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/fal), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) | +| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [Noisereduce](https://docs.pipecat.ai/server/utilities/audio/noisereduce-filter) | +| Analytics & Metrics | [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) | 📚 [View full services documentation →](https://docs.pipecat.ai/server/services/supported-services) From 5e5626f04fe59d57fdc4982bcdc7fa48467f7c4f Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Wed, 23 Apr 2025 11:40:36 -0400 Subject: [PATCH 32/97] [WIP] AWS Nova Sonic service --- examples/foundational/39-aws-nova-sonic.py | 115 ++++++++++++++++++ pyproject.toml | 2 +- .../services/aws_nova_sonic/__init__.py | 1 + src/pipecat/services/aws_nova_sonic/aws.py | 101 +++++++++++++++ 4 files changed, 218 insertions(+), 1 deletion(-) create mode 100644 examples/foundational/39-aws-nova-sonic.py create mode 100644 src/pipecat/services/aws_nova_sonic/__init__.py create mode 100644
src/pipecat/services/aws_nova_sonic/aws.py diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py new file mode 100644 index 000000000..33fbbe477 --- /dev/null +++ b/examples/foundational/39-aws-nova-sonic.py @@ -0,0 +1,115 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import os + +from dotenv import load_dotenv +from loguru import logger + +from pipecat.audio.vad.silero import SileroVADAnalyzer +from pipecat.audio.vad.vad_analyzer import VADParams +from pipecat.frames.frames import LLMMessagesAppendFrame +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.services.aws_nova_sonic import AWSNovaSonicService +from pipecat.transports.base_transport import TransportParams +from pipecat.transports.network.small_webrtc import SmallWebRTCTransport +from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection + +# Load environment variables +load_dotenv(override=True) + + +async def run_bot(webrtc_connection: SmallWebRTCConnection): + logger.info(f"Starting bot") + + # Initialize the SmallWebRTCTransport with the connection + transport = SmallWebRTCTransport( + webrtc_connection=webrtc_connection, + params=TransportParams( + audio_in_enabled=True, + audio_out_enabled=True, + camera_in_enabled=False, + vad_enabled=True, + vad_audio_passthrough=True, + # set stop_secs to something roughly similar to the internal setting + # of the Multimodal Live api, just to align events. + vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)), + ), + ) + + # Create the AWS Nova Sonic LLM service + # TODO: system instruction + # system_instruction = f""" + # You are a helpful AI assistant. + # Your goal is to demonstrate your capabilities in a helpful and engaging way. 
+ # Your output will be converted to audio so don't include special characters in your answers. + # Respond to what the user said in a creative and helpful way. + # """ + + llm = AWSNovaSonicService( + secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"), + access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), + region=os.getenv("AWS_REGION"), + ) + + # Build the pipeline + pipeline = Pipeline( + [ + transport.input(), + llm, + transport.output(), + ] + ) + + # Configure the pipeline task + task = PipelineTask( + pipeline, + params=PipelineParams( + allow_interruptions=True, + enable_metrics=True, + enable_usage_metrics=True, + ), + ) + + # Handle client connection event + @transport.event_handler("on_client_connected") + async def on_client_connected(transport, client): + logger.info(f"Client connected") + # Kick off the conversation. + await task.queue_frames( + [ + LLMMessagesAppendFrame( + messages=[ + { + "role": "user", + "content": f"Greet the user and introduce yourself.", + } + ] + ) + ] + ) + + # Handle client disconnection events + @transport.event_handler("on_client_disconnected") + async def on_client_disconnected(transport, client): + logger.info(f"Client disconnected") + + @transport.event_handler("on_client_closed") + async def on_client_closed(transport, client): + logger.info(f"Client closed connection") + await task.cancel() + + # Run the pipeline + runner = PipelineRunner(handle_sigint=False) + await runner.run(task) + + +if __name__ == "__main__": + from run import main + + main() diff --git a/pyproject.toml b/pyproject.toml index 13305933b..d6d05c00c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,7 @@ Website = "https://pipecat.ai" [project.optional-dependencies] anthropic = [ "anthropic~=0.49.0" ] assemblyai = [ "assemblyai~=0.37.0" ] -aws = [ "boto3~=1.37.16", "websockets~=13.1" ] +aws = [ "boto3~=1.37.16", "websockets~=13.1", "aws_sdk_bedrock_runtime~=0.0.2" ] azure = [ "azure-cognitiveservices-speech~=1.42.0"] cartesia = [ 
"cartesia~=1.4.0", "websockets~=13.1" ] cerebras = [] diff --git a/src/pipecat/services/aws_nova_sonic/__init__.py b/src/pipecat/services/aws_nova_sonic/__init__.py new file mode 100644 index 000000000..b5559715a --- /dev/null +++ b/src/pipecat/services/aws_nova_sonic/__init__.py @@ -0,0 +1 @@ +from .aws import AWSNovaSonicService diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py new file mode 100644 index 000000000..3caf16761 --- /dev/null +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -0,0 +1,101 @@ +from aws_sdk_bedrock_runtime.client import ( + BedrockRuntimeClient, + InvokeModelWithBidirectionalStreamOperationInput, +) +from aws_sdk_bedrock_runtime.config import Config, HTTPAuthSchemeResolver, SigV4AuthScheme +from aws_sdk_bedrock_runtime.models import ( + BidirectionalInputPayloadPart, + InvokeModelWithBidirectionalStreamInput, + InvokeModelWithBidirectionalStreamInputChunk, + InvokeModelWithBidirectionalStreamOperationOutput, + InvokeModelWithBidirectionalStreamOutput, +) +from smithy_aws_core.credentials_resolvers.static import StaticCredentialsResolver +from smithy_aws_core.identity import AWSCredentialsIdentity +from smithy_core.aio.eventstream import DuplexEventStream + +from pipecat.frames.frames import CancelFrame, EndFrame, StartFrame +from pipecat.services.llm_service import LLMService + + +class AWSNovaSonicService(LLMService): + def __init__( + self, + *, + secret_access_key: str, + access_key_id: str, + region: str, + model: str = "amazon.nova-sonic-v1:0", + **kwargs, + ): + super().__init__(**kwargs) + self._secret_access_key = secret_access_key + self._access_key_id = access_key_id + self._region = region + self._model = model + self._client: BedrockRuntimeClient = None + self._stream: DuplexEventStream[ + InvokeModelWithBidirectionalStreamInput, + InvokeModelWithBidirectionalStreamOutput, + InvokeModelWithBidirectionalStreamOperationOutput, + ] = None + self._receive_task = None + + # + # 
standard AIService frame handling + # + + async def start(self, frame: StartFrame): + await super().start(frame) + await self._connect() + + async def stop(self, frame: EndFrame): + await super().stop(frame) + await self._disconnect() + + async def cancel(self, frame: CancelFrame): + await super().cancel(frame) + await self._disconnect() + + # + # communication + # + + async def _connect(self): + if self._client: + # Here we assume that if we have a client we are connected. + return + self._initialize_client() + self._stream = await self._client.invoke_model_with_bidirectional_stream( + InvokeModelWithBidirectionalStreamOperationInput(model_id=self._model) + ) + self._receive_task = self.create_task(self._receive_task_handler()) + pass + + async def _disconnect(self): + pass + + def _initialize_client(self) -> BedrockRuntimeClient: + config = Config( + endpoint_uri=f"https://bedrock-runtime.{self._region}.amazonaws.com", + region=self._region, + aws_credentials_identity_resolver=StaticCredentialsResolver( + credentials=AWSCredentialsIdentity( + access_key_id=self._access_key_id, + secret_access_key=self._secret_access_key, + # TODO: add additional stuff like aws_session_token + ) + ), + http_auth_scheme_resolver=HTTPAuthSchemeResolver(), + http_auth_schemes={"aws.auth#sigv4": SigV4AuthScheme()}, + ) + self._client = BedrockRuntimeClient(config=config) + + async def _send_client_event(self, event_json): + event = InvokeModelWithBidirectionalStreamInputChunk( + value=BidirectionalInputPayloadPart(bytes_=event_json.encode("utf-8")) + ) + await self._stream.input_stream.send(event) + + async def _receive_task_handler(self): + pass From a9e395b3660f873732cc89886ab3846a368be406 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Wed, 23 Apr 2025 14:05:04 -0400 Subject: [PATCH 33/97] [WIP] AWS Nova Sonic service --- examples/foundational/39-aws-nova-sonic.py | 14 +- src/pipecat/services/aws_nova_sonic/aws.py | 201 +++++++++++++++++++-- 2 files changed, 194 insertions(+), 
21 deletions(-) diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py index 33fbbe477..266680542 100644 --- a/examples/foundational/39-aws-nova-sonic.py +++ b/examples/foundational/39-aws-nova-sonic.py @@ -43,15 +43,15 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection): ) # Create the AWS Nova Sonic LLM service - # TODO: system instruction - # system_instruction = f""" - # You are a helpful AI assistant. - # Your goal is to demonstrate your capabilities in a helpful and engaging way. - # Your output will be converted to audio so don't include special characters in your answers. - # Respond to what the user said in a creative and helpful way. - # """ + system_instruction = f""" + You are a helpful AI assistant. + Your goal is to demonstrate your capabilities in a helpful and engaging way. + Your output will be converted to audio so don't include special characters in your answers. + Respond to what the user said in a creative and helpful way. 
+ """ llm = AWSNovaSonicService( + instruction=system_instruction, secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"), access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), region=os.getenv("AWS_REGION"), diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index 3caf16761..d94587879 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -1,3 +1,7 @@ +import base64 +import uuid +from enum import Enum + from aws_sdk_bedrock_runtime.client import ( BedrockRuntimeClient, InvokeModelWithBidirectionalStreamOperationInput, @@ -10,18 +14,26 @@ from aws_sdk_bedrock_runtime.models import ( InvokeModelWithBidirectionalStreamOperationOutput, InvokeModelWithBidirectionalStreamOutput, ) +from loguru import logger from smithy_aws_core.credentials_resolvers.static import StaticCredentialsResolver from smithy_aws_core.identity import AWSCredentialsIdentity from smithy_core.aio.eventstream import DuplexEventStream -from pipecat.frames.frames import CancelFrame, EndFrame, StartFrame +from pipecat.frames.frames import CancelFrame, EndFrame, Frame, InputAudioRawFrame, StartFrame +from pipecat.processors.frame_processor import FrameDirection from pipecat.services.llm_service import LLMService +class Role(Enum): + SYSTEM = "SYSTEM" + USER = "USER" + + class AWSNovaSonicService(LLMService): def __init__( self, *, + instruction: str, secret_access_key: str, access_key_id: str, region: str, @@ -29,6 +41,7 @@ class AWSNovaSonicService(LLMService): **kwargs, ): super().__init__(**kwargs) + self._instruction = instruction self._secret_access_key = secret_access_key self._access_key_id = access_key_id self._region = region @@ -40,6 +53,8 @@ class AWSNovaSonicService(LLMService): InvokeModelWithBidirectionalStreamOperationOutput, ] = None self._receive_task = None + self._prompt_name = str(uuid.uuid4()) + self._input_audio_content_name = str(uuid.uuid4()) # # standard AIService frame handling @@ 
-58,24 +73,54 @@ class AWSNovaSonicService(LLMService): await self._disconnect() # - # communication + # frame processing + # + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + if isinstance(frame, InputAudioRawFrame): + # TODO: check if _audio_input_paused? what causes that? + await self._send_user_audio(frame) + + await self.push_frame(frame, direction) + + # + # communication with LLM # async def _connect(self): - if self._client: - # Here we assume that if we have a client we are connected. - return - self._initialize_client() - self._stream = await self._client.invoke_model_with_bidirectional_stream( - InvokeModelWithBidirectionalStreamOperationInput(model_id=self._model) - ) - self._receive_task = self.create_task(self._receive_task_handler()) - pass + try: + if self._client: + # Here we assume that if we have a client we are connected + return + + # Create the client + self._client = self._create_client() + + # Start the bidirectional stream + self._stream = await self._client.invoke_model_with_bidirectional_stream( + InvokeModelWithBidirectionalStreamOperationInput(model_id=self._model) + ) + + # Send session start events + await self._send_session_start() + + # Send initial system instruction + await self._send_text(text=self._instruction, role=Role.SYSTEM) + + # Start audio input + await self._send_audio_input_start() + + self._receive_task = self.create_task(self._receive_task_handler()) + except Exception as e: + logger.error(f"{self} initialization error: {e}") + self._client = None async def _disconnect(self): pass - def _initialize_client(self) -> BedrockRuntimeClient: + def _create_client(self) -> BedrockRuntimeClient: config = Config( endpoint_uri=f"https://bedrock-runtime.{self._region}.amazonaws.com", region=self._region, @@ -89,9 +134,137 @@ class AWSNovaSonicService(LLMService): http_auth_scheme_resolver=HTTPAuthSchemeResolver(), http_auth_schemes={"aws.auth#sigv4": 
SigV4AuthScheme()}, ) - self._client = BedrockRuntimeClient(config=config) + return BedrockRuntimeClient(config=config) - async def _send_client_event(self, event_json): + # TODO: make params configurable? + async def _send_session_start(self): + session_start = """ + { + "event": { + "sessionStart": { + "inferenceConfiguration": { + "maxTokens": 1024, + "topP": 0.9, + "temperature": 0.7 + } + } + } + } + """ + await self._send_client_event(session_start) + + prompt_start = f''' + {{ + "event": {{ + "promptStart": {{ + "promptName": "{self._prompt_name}", + "textOutputConfiguration": {{ + "mediaType": "text/plain" + }}, + "audioOutputConfiguration": {{ + "mediaType": "audio/lpcm", + "sampleRateHertz": 24000, + "sampleSizeBits": 16, + "channelCount": 1, + "voiceId": "matthew", + "encoding": "base64", + "audioType": "SPEECH" + }} + }} + }} + }} + ''' + await self._send_client_event(prompt_start) + + async def _send_audio_input_start(self): + audio_content_start = f''' + {{ + "event": {{ + "contentStart": {{ + "promptName": "{self._prompt_name}", + "contentName": "{self._input_audio_content_name}", + "type": "AUDIO", + "interactive": true, + "role": "USER", + "audioInputConfiguration": {{ + "mediaType": "audio/lpcm", + "sampleRateHertz": 16000, + "sampleSizeBits": 16, + "channelCount": 1, + "audioType": "SPEECH", + "encoding": "base64" + }} + }} + }} + }} + ''' + await self._send_client_event(audio_content_start) + + async def _send_text(self, text: str, role: Role): + content_name = str(uuid.uuid4()) + + text_content_start = f''' + {{ + "event": {{ + "contentStart": {{ + "promptName": "{self._prompt_name}", + "contentName": "{content_name}", + "type": "TEXT", + "interactive": true, + "role": "{role.value}", + "textInputConfiguration": {{ + "mediaType": "text/plain" + }} + }} + }} + }} + ''' + await self._send_client_event(text_content_start) + + text_input = f''' + {{ + "event": {{ + "textInput": {{ + "promptName": "{self._prompt_name}", + "contentName": 
"{content_name}", + "content": "{text}" + }} + }} + }} + ''' + await self._send_client_event(text_input) + + text_content_end = f''' + {{ + "event": {{ + "contentEnd": {{ + "promptName": "{self._prompt_name}", + "contentName": "{content_name}" + }} + }} + }} + ''' + await self._send_client_event(text_content_end) + + async def _send_user_audio(self, frame: InputAudioRawFrame): + if not self._client: + return + + blob = base64.b64encode(frame.audio) + audio_event = f''' + {{ + "event": {{ + "audioInput": {{ + "promptName": "{self._prompt_name}", + "contentName": "{self._input_audio_content_name}", + "content": "{blob.decode("utf-8")}" + }} + }} + }} + ''' + await self._send_client_event(audio_event) + + async def _send_client_event(self, event_json: str): event = InvokeModelWithBidirectionalStreamInputChunk( value=BidirectionalInputPayloadPart(bytes_=event_json.encode("utf-8")) ) From 6d30f441e83d2a676a3c1210d6cc5127f28bd70f Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Wed, 23 Apr 2025 15:00:04 -0400 Subject: [PATCH 34/97] [WIP] AWS Nova Sonic service --- src/pipecat/services/aws_nova_sonic/aws.py | 55 +++++++++++++++++++++- 1 file changed, 53 insertions(+), 2 deletions(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index d94587879..8bd2437bd 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -1,4 +1,5 @@ import base64 +import json import uuid from enum import Enum @@ -19,7 +20,14 @@ from smithy_aws_core.credentials_resolvers.static import StaticCredentialsResolv from smithy_aws_core.identity import AWSCredentialsIdentity from smithy_core.aio.eventstream import DuplexEventStream -from pipecat.frames.frames import CancelFrame, EndFrame, Frame, InputAudioRawFrame, StartFrame +from pipecat.frames.frames import ( + CancelFrame, + EndFrame, + Frame, + InputAudioRawFrame, + StartFrame, + TTSAudioRawFrame, +) from pipecat.processors.frame_processor import 
FrameDirection from pipecat.services.llm_service import LLMService @@ -91,6 +99,8 @@ class AWSNovaSonicService(LLMService): async def _connect(self): try: + # TODO: remove after debugging + logger.debug("[pk] started connecting!") if self._client: # Here we assume that if we have a client we are connected return @@ -113,6 +123,8 @@ class AWSNovaSonicService(LLMService): await self._send_audio_input_start() self._receive_task = self.create_task(self._receive_task_handler()) + + logger.debug("[pk] finished connecting!") except Exception as e: logger.error(f"{self} initialization error: {e}") self._client = None @@ -271,4 +283,43 @@ class AWSNovaSonicService(LLMService): await self._stream.input_stream.send(event) async def _receive_task_handler(self): - pass + try: + while self._client: + # TODO: remove after debugging + logger.debug(f"[pk] awaiting output from server...") + + output = await self._stream.await_output() + + # TODO: remove after debugging + logger.debug(f"[pk] got output from server: {result}") + + result = await output[1].receive() + + # TODO: remove after debugging + logger.debug(f"[pk] got result from server: {result}") + + if result.value and result.value.bytes_: + response_data = result.value.bytes_.decode("utf-8") + json_data = json.loads(response_data) + + # TODO: remove after debugging + logger.debug(f"[pk] got JSON from server: {json_data}") + + if "audioOutput" in json_data["event"]: + self._handle_audio_output_event(json_data["event"]) + except Exception as e: + logger.error(f"{self} error processing responses: {e}") + + async def _handle_audio_output_event(self, event): + # TODO: remove after debugging + logger.debug("[pk] got output audio!") + audio_content = event["audioOutput"]["content"] + audio = base64.b64decode(audio_content) + # TODO: how is _current_audio_response used? 
+ # TODO: make sample rate + channels (used in multiple places) consts + frame = TTSAudioRawFrame( + audio=audio, + sample_rate=24000, + num_channels=1, + ) + await self.push_frame(frame) From 7668b27fc0ac7019355d9e206f00ff4128a38905 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Wed, 23 Apr 2025 17:44:07 -0400 Subject: [PATCH 35/97] [WIP] AWS Nova Sonic service --- examples/foundational/39-aws-nova-sonic.py | 17 +++++++++++------ src/pipecat/services/aws_nova_sonic/aws.py | 2 +- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py index 266680542..fd7568d63 100644 --- a/examples/foundational/39-aws-nova-sonic.py +++ b/examples/foundational/39-aws-nova-sonic.py @@ -32,6 +32,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection): webrtc_connection=webrtc_connection, params=TransportParams( audio_in_enabled=True, + audio_in_sample_rate=16000, audio_out_enabled=True, camera_in_enabled=False, vad_enabled=True, @@ -43,12 +44,16 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection): ) # Create the AWS Nova Sonic LLM service - system_instruction = f""" - You are a helpful AI assistant. - Your goal is to demonstrate your capabilities in a helpful and engaging way. - Your output will be converted to audio so don't include special characters in your answers. - Respond to what the user said in a creative and helpful way. - """ + # system_instruction = f""" + # You are a helpful AI assistant. + # Your goal is to demonstrate your capabilities in a helpful and engaging way. + # Your output will be converted to audio so don't include special characters in your answers. + # Respond to what the user said in a creative and helpful way. + # """ + # TODO: looks like Nova Sonic can't handle new lines? + system_instruction = "You are a friendly assistant. 
The user and you will engage in a spoken dialog " \ + "exchanging the transcripts of a natural real-time conversation. Keep your responses short, " \ + "generally two or three sentences for chatty scenarios." llm = AWSNovaSonicService( instruction=system_instruction, diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index 8bd2437bd..6cd953c3b 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -306,7 +306,7 @@ class AWSNovaSonicService(LLMService): logger.debug(f"[pk] got JSON from server: {json_data}") if "audioOutput" in json_data["event"]: - self._handle_audio_output_event(json_data["event"]) + await self._handle_audio_output_event(json_data["event"]) except Exception as e: logger.error(f"{self} error processing responses: {e}") From d789334a60e4fc7b3aa5c1952bcdfdb5bc43bbea Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Thu, 24 Apr 2025 10:29:27 -0400 Subject: [PATCH 36/97] [WIP] AWS Nova Sonic service --- src/pipecat/services/aws_nova_sonic/aws.py | 23 ++-------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index 6cd953c3b..aff0be2d2 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -99,8 +99,6 @@ class AWSNovaSonicService(LLMService): async def _connect(self): try: - # TODO: remove after debugging - logger.debug("[pk] started connecting!") if self._client: # Here we assume that if we have a client we are connected return @@ -123,8 +121,6 @@ class AWSNovaSonicService(LLMService): await self._send_audio_input_start() self._receive_task = self.create_task(self._receive_task_handler()) - - logger.debug("[pk] finished connecting!") except Exception as e: logger.error(f"{self} initialization error: {e}") self._client = None @@ -285,35 +281,20 @@ class AWSNovaSonicService(LLMService): 
async def _receive_task_handler(self): try: while self._client: - # TODO: remove after debugging - logger.debug(f"[pk] awaiting output from server...") - output = await self._stream.await_output() - - # TODO: remove after debugging - logger.debug(f"[pk] got output from server: {result}") - result = await output[1].receive() - # TODO: remove after debugging - logger.debug(f"[pk] got result from server: {result}") - if result.value and result.value.bytes_: response_data = result.value.bytes_.decode("utf-8") json_data = json.loads(response_data) - # TODO: remove after debugging - logger.debug(f"[pk] got JSON from server: {json_data}") - if "audioOutput" in json_data["event"]: await self._handle_audio_output_event(json_data["event"]) except Exception as e: logger.error(f"{self} error processing responses: {e}") - async def _handle_audio_output_event(self, event): - # TODO: remove after debugging - logger.debug("[pk] got output audio!") - audio_content = event["audioOutput"]["content"] + async def _handle_audio_output_event(self, event_json): + audio_content = event_json["audioOutput"]["content"] audio = base64.b64decode(audio_content) # TODO: how is _current_audio_response used? 
# TODO: make sample rate + channels (used in multiple places) consts From 13569a5a5a634a7ff65041d08ce12c410dd721bf Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Thu, 24 Apr 2025 13:55:09 -0400 Subject: [PATCH 37/97] [WIP] AWS Nova Sonic service --- examples/foundational/39-aws-nova-sonic.py | 14 +++-- src/pipecat/services/aws_nova_sonic/aws.py | 63 ++++++++++++++++++++-- 2 files changed, 71 insertions(+), 6 deletions(-) diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py index fd7568d63..fffaee686 100644 --- a/examples/foundational/39-aws-nova-sonic.py +++ b/examples/foundational/39-aws-nova-sonic.py @@ -9,6 +9,7 @@ import os from dotenv import load_dotenv from loguru import logger +# import logging from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.audio.vad.vad_analyzer import VADParams from pipecat.frames.frames import LLMMessagesAppendFrame @@ -23,6 +24,11 @@ from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection # Load environment variables load_dotenv(override=True) +# logging.basicConfig( +# level=logging.DEBUG, +# format='%(asctime)s - %(levelname)s - %(message)s' +# ) + async def run_bot(webrtc_connection: SmallWebRTCConnection): logger.info(f"Starting bot") @@ -51,9 +57,11 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection): # Respond to what the user said in a creative and helpful way. # """ # TODO: looks like Nova Sonic can't handle new lines? - system_instruction = "You are a friendly assistant. The user and you will engage in a spoken dialog " \ - "exchanging the transcripts of a natural real-time conversation. Keep your responses short, " \ - "generally two or three sentences for chatty scenarios." + system_instruction = ( + "You are a friendly assistant. The user and you will engage in a spoken dialog " + "exchanging the transcripts of a natural real-time conversation. 
Keep your responses short, " + "generally two or three sentences for chatty scenarios." + ) llm = AWSNovaSonicService( instruction=system_instruction, diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index aff0be2d2..f8517023e 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -94,7 +94,7 @@ class AWSNovaSonicService(LLMService): await self.push_frame(frame, direction) # - # communication with LLM + # LLM communication: lifecycle # async def _connect(self): @@ -144,6 +144,10 @@ class AWSNovaSonicService(LLMService): ) return BedrockRuntimeClient(config=config) + # + # LLM communication: input events (pipecat -> LLM) + # + # TODO: make params configurable? async def _send_session_start(self): session_start = """ @@ -278,6 +282,18 @@ class AWSNovaSonicService(LLMService): ) await self._stream.input_stream.send(event) + # + # LLM communication: output events (LLM -> pipecat) + # + + # Receive LLM responses ("completions"). + # Each response contains up to four pieces of content, delivered sequentially: + # - User transcription + # - Tool use (optional) + # - Text response + # - Audio response + # Each piece of content is wrapped by "contentStart" and "contentEnd" events. + # Each overall response is wrapped by "completionStart" and "completionEnd" events. 
async def _receive_task_handler(self): try: while self._client: @@ -288,13 +304,46 @@ class AWSNovaSonicService(LLMService): response_data = result.value.bytes_.decode("utf-8") json_data = json.loads(response_data) - if "audioOutput" in json_data["event"]: - await self._handle_audio_output_event(json_data["event"]) + if "event" in json_data: + event_json = json_data["event"] + if "completionStart" in event_json: + # Handle the LLM response starting + await self._handle_completion_start_event(event_json) + elif "contentStart" in event_json: + # Handle a piece of content starting + await self._handle_content_start_event(event_json) + elif "textOutput" in event_json: + # Handle text output content + await self._handle_text_output_event(event_json) + elif "audioOutput" in event_json: + # Handle audio output content + await self._handle_audio_output_event(event_json) + elif "contentEnd" in event_json: + # Handle a piece of content ending + await self._handle_content_end_event(event_json) + elif "completionStart" in event_json: + # Handle the LLM response ending + await self._handle_completion_end_event(event_json) + except Exception as e: logger.error(f"{self} error processing responses: {e}") + async def _handle_completion_start_event(self, event_json): + print("[pk] completion start") + + async def _handle_content_start_event(self, event_json): + content_start = event_json["contentStart"] + type = content_start["type"] + role = content_start["role"] + print(f"[pk] content start. type: {type}, role: {role}") + + async def _handle_text_output_event(self, event_json): + text_content = event_json["textOutput"]["content"] + print(f"[pk] text output. content: {text_content}") + async def _handle_audio_output_event(self, event_json): audio_content = event_json["audioOutput"]["content"] + print(f"[pk] audio output. content: {len(audio_content)}") audio = base64.b64decode(audio_content) # TODO: how is _current_audio_response used? 
# TODO: make sample rate + channels (used in multiple places) consts @@ -304,3 +353,11 @@ class AWSNovaSonicService(LLMService): num_channels=1, ) await self.push_frame(frame) + + async def _handle_content_end_event(self, event_json): + content_end = event_json["contentEnd"] + type = content_end["type"] + print(f"[pk] content end. type: {type}") + + async def _handle_completion_end_event(self, event_json): + print("[pk] completion end") From 8cbad070ade59d21df5f0c109545af197376f1cd Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Thu, 24 Apr 2025 14:21:43 -0400 Subject: [PATCH 38/97] [WIP] AWS Nova Sonic service --- src/pipecat/services/aws_nova_sonic/aws.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index f8517023e..f28fe4360 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -89,7 +89,7 @@ class AWSNovaSonicService(LLMService): if isinstance(frame, InputAudioRawFrame): # TODO: check if _audio_input_paused? what causes that? - await self._send_user_audio(frame) + await self._send_user_audio_event(frame) await self.push_frame(frame, direction) @@ -112,13 +112,13 @@ class AWSNovaSonicService(LLMService): ) # Send session start events - await self._send_session_start() + await self._send_session_start_event() # Send initial system instruction - await self._send_text(text=self._instruction, role=Role.SYSTEM) + await self._send_text_event(text=self._instruction, role=Role.SYSTEM) # Start audio input - await self._send_audio_input_start() + await self._send_audio_input_start_event() self._receive_task = self.create_task(self._receive_task_handler()) except Exception as e: @@ -149,7 +149,7 @@ class AWSNovaSonicService(LLMService): # # TODO: make params configurable? 
- async def _send_session_start(self): + async def _send_session_start_event(self): session_start = """ { "event": { @@ -188,7 +188,7 @@ class AWSNovaSonicService(LLMService): ''' await self._send_client_event(prompt_start) - async def _send_audio_input_start(self): + async def _send_audio_input_start_event(self): audio_content_start = f''' {{ "event": {{ @@ -212,7 +212,7 @@ class AWSNovaSonicService(LLMService): ''' await self._send_client_event(audio_content_start) - async def _send_text(self, text: str, role: Role): + async def _send_text_event(self, text: str, role: Role): content_name = str(uuid.uuid4()) text_content_start = f''' @@ -258,7 +258,7 @@ class AWSNovaSonicService(LLMService): ''' await self._send_client_event(text_content_end) - async def _send_user_audio(self, frame: InputAudioRawFrame): + async def _send_user_audio_event(self, frame: InputAudioRawFrame): if not self._client: return @@ -357,7 +357,8 @@ class AWSNovaSonicService(LLMService): async def _handle_content_end_event(self, event_json): content_end = event_json["contentEnd"] type = content_end["type"] - print(f"[pk] content end. type: {type}") + stop_reason = content_end["stopReason"] + print(f"[pk] content end. 
type: {type}, stop_reason: {stop_reason}") async def _handle_completion_end_event(self, event_json): print("[pk] completion end") From b1d413b9be63779766aaa164114cdbc69f010cb1 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Thu, 24 Apr 2025 15:23:01 -0400 Subject: [PATCH 39/97] [WIP] AWS Nova Sonic service --- src/pipecat/services/aws_nova_sonic/aws.py | 28 +++++++++++++++------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index f28fe4360..c271f49c5 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -286,14 +286,18 @@ class AWSNovaSonicService(LLMService): # LLM communication: output events (LLM -> pipecat) # - # Receive LLM responses ("completions"). - # Each response contains up to four pieces of content, delivered sequentially: - # - User transcription - # - Tool use (optional) - # - Text response - # - Audio response - # Each piece of content is wrapped by "contentStart" and "contentEnd" events. - # Each overall response is wrapped by "completionStart" and "completionEnd" events. + # Receive the ongoing LLM "completion". + # There is generally a single completion per session. + # In a completion, a few different kinds of content can be delivered: + # - Transcription of user audio + # - Tool use + # - Text preview of planned response speech before audio delivered + # - User interruption notification + # - Text of response speech that whose audio was actually delivered + # - Audio of response speech + # Each piece of content is wrapped by "contentStart" and "contentEnd" events. The content is + # delivered sequentially: one piece of content will end before another starts. + # The overall completion is wrapped by "completionStart" and "completionEnd" events. 
async def _receive_task_handler(self): try: while self._client: @@ -335,7 +339,13 @@ class AWSNovaSonicService(LLMService): content_start = event_json["contentStart"] type = content_start["type"] role = content_start["role"] - print(f"[pk] content start. type: {type}, role: {role}") + generation_stage = None + if "additionalModelFields" in content_start: + additional_model_fields = json.loads(content_start["additionalModelFields"]) + generation_stage = additional_model_fields.get("generationStage") + print( + f"[pk] content start. type: {type}, role: {role}, generation_stage: {generation_stage}" + ) async def _handle_text_output_event(self, event_json): text_content = event_json["textOutput"]["content"] From e40aa4f99a5b3e95fb8546c8fbe7b59749e64164 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Thu, 24 Apr 2025 15:56:28 -0400 Subject: [PATCH 40/97] [WIP] AWS Nova Sonic service - added TTSStartedFrame and TTSStoppedFrame --- src/pipecat/services/aws_nova_sonic/aws.py | 34 +++++++++++++++++----- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index c271f49c5..2e875f96f 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -27,6 +27,8 @@ from pipecat.frames.frames import ( InputAudioRawFrame, StartFrame, TTSAudioRawFrame, + TTSStartedFrame, + TTSStoppedFrame, ) from pipecat.processors.frame_processor import FrameDirection from pipecat.services.llm_service import LLMService @@ -63,6 +65,7 @@ class AWSNovaSonicService(LLMService): self._receive_task = None self._prompt_name = str(uuid.uuid4()) self._input_audio_content_name = str(uuid.uuid4()) + self._audio_response_ongoing = False # # standard AIService frame handling @@ -333,7 +336,8 @@ class AWSNovaSonicService(LLMService): logger.error(f"{self} error processing responses: {e}") async def _handle_completion_start_event(self, event_json): - print("[pk] 
completion start") + # print("[pk] completion start") + pass async def _handle_content_start_event(self, event_json): content_start = event_json["contentStart"] @@ -343,19 +347,26 @@ class AWSNovaSonicService(LLMService): if "additionalModelFields" in content_start: additional_model_fields = json.loads(content_start["additionalModelFields"]) generation_stage = additional_model_fields.get("generationStage") - print( - f"[pk] content start. type: {type}, role: {role}, generation_stage: {generation_stage}" - ) + # print( + # f"[pk] content start. type: {type}, role: {role}, generation_stage: {generation_stage}" + # ) async def _handle_text_output_event(self, event_json): text_content = event_json["textOutput"]["content"] - print(f"[pk] text output. content: {text_content}") + # print(f"[pk] text output. content: {text_content}") async def _handle_audio_output_event(self, event_json): audio_content = event_json["audioOutput"]["content"] print(f"[pk] audio output. content: {len(audio_content)}") + + # Report that *equivalent* of TTS (this is a speech-to-speech model) started + if not self._audio_response_ongoing: + self._audio_response_ongoing = True + # print("[pk] starting TTS") + await self.push_frame(TTSStartedFrame()) + + # Push audio frame audio = base64.b64decode(audio_content) - # TODO: how is _current_audio_response used? # TODO: make sample rate + channels (used in multiple places) consts frame = TTSAudioRawFrame( audio=audio, @@ -368,7 +379,14 @@ class AWSNovaSonicService(LLMService): content_end = event_json["contentEnd"] type = content_end["type"] stop_reason = content_end["stopReason"] - print(f"[pk] content end. type: {type}, stop_reason: {stop_reason}") + # print(f"[pk] content end. 
type: {type}, stop_reason: {stop_reason}") + + # Report that *equivalent* of TTS (this is a speech-to-speech model) stopped + if type == "AUDIO" and self._audio_response_ongoing: + print("[pk] stopping TTS") + self._audio_response_ongoing = False + await self.push_frame(TTSStoppedFrame()) async def _handle_completion_end_event(self, event_json): - print("[pk] completion end") + # print("[pk] completion end") + pass From de294caed9aaf2d3b37d6c2f0ec0ba7382b9d3f7 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Fri, 25 Apr 2025 15:12:37 -0400 Subject: [PATCH 41/97] [WIP] AWS Nova Sonic service - added LLMFullResponseStartFrame, LLMTextFrame, and LLMFullResponseEndFrame --- src/pipecat/services/aws_nova_sonic/aws.py | 124 +++++++++++++++++---- 1 file changed, 102 insertions(+), 22 deletions(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index 2e875f96f..a2437b9dd 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -1,6 +1,7 @@ import base64 import json import uuid +from dataclasses import dataclass from enum import Enum from aws_sdk_bedrock_runtime.client import ( @@ -25,6 +26,9 @@ from pipecat.frames.frames import ( EndFrame, Frame, InputAudioRawFrame, + LLMFullResponseEndFrame, + LLMFullResponseStartFrame, + LLMTextFrame, StartFrame, TTSAudioRawFrame, TTSStartedFrame, @@ -37,6 +41,36 @@ from pipecat.services.llm_service import LLMService class Role(Enum): SYSTEM = "SYSTEM" USER = "USER" + ASSISTANT = "ASSISTANT" + TOOL = "TOOL" + + +class ContentType(Enum): + AUDIO = "AUDIO" + TEXT = "TEXT" + TOOL = "TOOL" + + +class TextStage(Enum): + FINAL = "FINAL" # what has been said + SPECULATIVE = "SPECULATIVE" # what's planned to be said + + +@dataclass +class CurrentContent: + type: ContentType + role: Role + text_stage: TextStage # None if not text + text_content: str # starts as None, then fills in if text + + def __str__(self): + return ( + 
f"CurrentContent(\n" + f" type={self.type.name},\n" + f" role={self.role.name},\n" + f" text_stage={self.text_stage.name if self.text_stage else 'None'}\n" + f")" + ) class AWSNovaSonicService(LLMService): @@ -65,7 +99,8 @@ class AWSNovaSonicService(LLMService): self._receive_task = None self._prompt_name = str(uuid.uuid4()) self._input_audio_content_name = str(uuid.uuid4()) - self._audio_response_ongoing = False + self._content_being_received = None # TODO: clean this up on error or when finished + self._assistant_is_responding = False # # standard AIService frame handling @@ -314,7 +349,7 @@ class AWSNovaSonicService(LLMService): if "event" in json_data: event_json = json_data["event"] if "completionStart" in event_json: - # Handle the LLM response starting + # Handle the LLM completion starting await self._handle_completion_start_event(event_json) elif "contentStart" in event_json: # Handle a piece of content starting @@ -329,7 +364,7 @@ class AWSNovaSonicService(LLMService): # Handle a piece of content ending await self._handle_content_end_event(event_json) elif "completionStart" in event_json: - # Handle the LLM response ending + # Handle the LLM completion ending await self._handle_completion_end_event(event_json) except Exception as e: @@ -347,24 +382,35 @@ class AWSNovaSonicService(LLMService): if "additionalModelFields" in content_start: additional_model_fields = json.loads(content_start["additionalModelFields"]) generation_stage = additional_model_fields.get("generationStage") - # print( - # f"[pk] content start. 
type: {type}, role: {role}, generation_stage: {generation_stage}" - # ) + + # Bookkeeping: track current content being received + content = CurrentContent( + type=ContentType(type), + role=Role(role), + text_stage=TextStage(generation_stage) if generation_stage else None, + text_content=None + ) + self._content_being_received = content + + if content.role == Role.ASSISTANT: + if content.type == ContentType.AUDIO: + # Report that *equivalent* of TTS (this is a speech-to-speech model) started + # print("[pk] TTS started") + await self.push_frame(TTSStartedFrame()) + + print(f"[pk] content start: {self._content_being_received}") async def _handle_text_output_event(self, event_json): text_content = event_json["textOutput"]["content"] - # print(f"[pk] text output. content: {text_content}") + print(f"[pk] text output. content: {text_content}") + + # Bookkeeping: augment the current content being received with text + content = self._content_being_received + content.text_content = text_content async def _handle_audio_output_event(self, event_json): audio_content = event_json["audioOutput"]["content"] - print(f"[pk] audio output. content: {len(audio_content)}") - - # Report that *equivalent* of TTS (this is a speech-to-speech model) started - if not self._audio_response_ongoing: - self._audio_response_ongoing = True - # print("[pk] starting TTS") - await self.push_frame(TTSStartedFrame()) - + # print(f"[pk] audio output. content: {len(audio_content)}") # Push audio frame audio = base64.b64decode(audio_content) # TODO: make sample rate + channels (used in multiple places) consts @@ -377,15 +423,49 @@ class AWSNovaSonicService(LLMService): async def _handle_content_end_event(self, event_json): content_end = event_json["contentEnd"] - type = content_end["type"] stop_reason = content_end["stopReason"] - # print(f"[pk] content end. 
type: {type}, stop_reason: {stop_reason}") + # print( + # f"[pk] content end: {self._content_being_received}.\n" + # f" stop_reason: {stop_reason}" + # ) - # Report that *equivalent* of TTS (this is a speech-to-speech model) stopped - if type == "AUDIO" and self._audio_response_ongoing: - print("[pk] stopping TTS") - self._audio_response_ongoing = False - await self.push_frame(TTSStoppedFrame()) + # Bookkeeping: clear current content being received + content = self._content_being_received + self._content_being_received = None + + if content and content.role == Role.ASSISTANT: + if content.type == ContentType.AUDIO: + # We got to the end of a chunk of the assistant's audio. + # Report that *equivalent* of TTS (this is a speech-to-speech model) stopped. + # print("[pk] TTS stopped") + await self.push_frame(TTSStoppedFrame()) + elif content.type == ContentType.TEXT: + # Ignore non-final text, and the "interrupted" message (which isn't meaningful text) + if content.text_stage == TextStage.FINAL and stop_reason != "INTERRUPTED": + # TODO: the way we're tracking the start and stop of the assistant response here + # is rather busted, and results in way too many "responses" being put into the + # context (every final text content block is treated as its own response). + # We *should* only record that an assistant response has ended when: + # - the assistant truly finished its turn (stop_reason is END_TURN) + # - when this is the next text content block after an INTERRUPTED has occurred + # BUT it seems like there's a bug where, if there are multiple assistant text + # content blocks, the *first* one gets marked END_TURN rather than the last. 
+ print("[pk] LLM full response started") + self._assistant_is_responding = True + await self.push_frame(LLMFullResponseStartFrame()) + + if self._assistant_is_responding: + # Add text to the ongoing reported assistant response + print(f"[pk] LLM text: {content.text_content}") + await self.push_frame(LLMTextFrame(content.text_content)) + + # Report that the assistant has finished their response. + # TODO: kinda busted. see TODO comment above. + print("[pk] LLM full response ended") + await self.push_frame(LLMFullResponseEndFrame()) + self._assistant_is_responding = False + + self._content_being_received = False async def _handle_completion_end_event(self, event_json): # print("[pk] completion end") From 260f7c9b85f9c38b6d00dfb79c8f6ebbe021dcbf Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Fri, 25 Apr 2025 15:19:45 -0400 Subject: [PATCH 42/97] [WIP] AWS Nova Sonic service - format --- src/pipecat/services/aws_nova_sonic/aws.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index a2437b9dd..e9ce2013d 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -61,7 +61,7 @@ class CurrentContent: type: ContentType role: Role text_stage: TextStage # None if not text - text_content: str # starts as None, then fills in if text + text_content: str # starts as None, then fills in if text def __str__(self): return ( @@ -388,7 +388,7 @@ class AWSNovaSonicService(LLMService): type=ContentType(type), role=Role(role), text_stage=TextStage(generation_stage) if generation_stage else None, - text_content=None + text_content=None, ) self._content_being_received = content @@ -396,7 +396,7 @@ class AWSNovaSonicService(LLMService): if content.type == ContentType.AUDIO: # Report that *equivalent* of TTS (this is a speech-to-speech model) started # print("[pk] TTS started") - await 
self.push_frame(TTSStartedFrame()) + await self.push_frame(TTSStartedFrame()) print(f"[pk] content start: {self._content_being_received}") @@ -424,10 +424,7 @@ class AWSNovaSonicService(LLMService): async def _handle_content_end_event(self, event_json): content_end = event_json["contentEnd"] stop_reason = content_end["stopReason"] - # print( - # f"[pk] content end: {self._content_being_received}.\n" - # f" stop_reason: {stop_reason}" - # ) + print(f"[pk] content end: {self._content_being_received}.\n stop_reason: {stop_reason}") # Bookkeeping: clear current content being received content = self._content_being_received @@ -443,25 +440,25 @@ class AWSNovaSonicService(LLMService): # Ignore non-final text, and the "interrupted" message (which isn't meaningful text) if content.text_stage == TextStage.FINAL and stop_reason != "INTERRUPTED": # TODO: the way we're tracking the start and stop of the assistant response here - # is rather busted, and results in way too many "responses" being put into the + # is rather busted, and results in way too many "responses" being put into the # context (every final text content block is treated as its own response). # We *should* only record that an assistant response has ended when: # - the assistant truly finished its turn (stop_reason is END_TURN) # - when this is the next text content block after an INTERRUPTED has occurred - # BUT it seems like there's a bug where, if there are multiple assistant text + # BUT it seems like there's a bug where, if there are multiple assistant text # content blocks, the *first* one gets marked END_TURN rather than the last. 
- print("[pk] LLM full response started") + # print("[pk] LLM full response started") self._assistant_is_responding = True await self.push_frame(LLMFullResponseStartFrame()) if self._assistant_is_responding: # Add text to the ongoing reported assistant response - print(f"[pk] LLM text: {content.text_content}") + # print(f"[pk] LLM text: {content.text_content}") await self.push_frame(LLMTextFrame(content.text_content)) # Report that the assistant has finished their response. # TODO: kinda busted. see TODO comment above. - print("[pk] LLM full response ended") + # print("[pk] LLM full response ended") await self.push_frame(LLMFullResponseEndFrame()) self._assistant_is_responding = False From a38206de9caf91d29364c72a879b303e76bad51d Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Fri, 25 Apr 2025 15:33:45 -0400 Subject: [PATCH 43/97] [WIP] AWS Nova Sonic service - added TranscriptionFrame --- src/pipecat/services/aws_nova_sonic/aws.py | 41 ++++++++++++++++++---- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index e9ce2013d..5ded76b81 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -30,12 +30,14 @@ from pipecat.frames.frames import ( LLMFullResponseStartFrame, LLMTextFrame, StartFrame, + TranscriptionFrame, TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame, ) from pipecat.processors.frame_processor import FrameDirection from pipecat.services.llm_service import LLMService +from pipecat.utils.time import time_now_iso8601 class Role(Enum): @@ -398,19 +400,30 @@ class AWSNovaSonicService(LLMService): # print("[pk] TTS started") await self.push_frame(TTSStartedFrame()) - print(f"[pk] content start: {self._content_being_received}") + # print(f"[pk] content start: {self._content_being_received}") async def _handle_text_output_event(self, event_json): + # This should never happen + if not 
self._content_being_received: + return + text_content = event_json["textOutput"]["content"] - print(f"[pk] text output. content: {text_content}") + # print(f"[pk] text output. content: {text_content}") # Bookkeeping: augment the current content being received with text + # Assumption: only one text content per content block content = self._content_being_received content.text_content = text_content async def _handle_audio_output_event(self, event_json): + # This should never happen + if not self._content_being_received: + return + + # Get audio audio_content = event_json["audioOutput"]["content"] # print(f"[pk] audio output. content: {len(audio_content)}") + # Push audio frame audio = base64.b64decode(audio_content) # TODO: make sample rate + channels (used in multiple places) consts @@ -422,15 +435,19 @@ class AWSNovaSonicService(LLMService): await self.push_frame(frame) async def _handle_content_end_event(self, event_json): + # This should never happen + if not self._content_being_received: + return + content_end = event_json["contentEnd"] stop_reason = content_end["stopReason"] - print(f"[pk] content end: {self._content_being_received}.\n stop_reason: {stop_reason}") + # print(f"[pk] content end: {self._content_being_received}.\n stop_reason: {stop_reason}") # Bookkeeping: clear current content being received content = self._content_being_received self._content_being_received = None - if content and content.role == Role.ASSISTANT: + if content.role == Role.ASSISTANT: if content.type == ContentType.AUDIO: # We got to the end of a chunk of the assistant's audio. # Report that *equivalent* of TTS (this is a speech-to-speech model) stopped. @@ -447,20 +464,30 @@ class AWSNovaSonicService(LLMService): # - when this is the next text content block after an INTERRUPTED has occurred # BUT it seems like there's a bug where, if there are multiple assistant text # content blocks, the *first* one gets marked END_TURN rather than the last. 
- # print("[pk] LLM full response started") + print("[pk] LLM full response started") self._assistant_is_responding = True await self.push_frame(LLMFullResponseStartFrame()) if self._assistant_is_responding: # Add text to the ongoing reported assistant response - # print(f"[pk] LLM text: {content.text_content}") + print(f"[pk] LLM text: {content.text_content}") await self.push_frame(LLMTextFrame(content.text_content)) # Report that the assistant has finished their response. # TODO: kinda busted. see TODO comment above. - # print("[pk] LLM full response ended") + print("[pk] LLM full response ended") await self.push_frame(LLMFullResponseEndFrame()) self._assistant_is_responding = False + elif content.role == Role.USER: + if content.type == ContentType.TEXT: + if content.text_stage == TextStage.FINAL: + # Report a bit of user transcription + print(f"[pk] transcription: {content.text_content}") + await self.push_frame( + TranscriptionFrame( + text=content.text_content, user_id="", timestamp=time_now_iso8601() + ) + ) self._content_being_received = False From 0c255d26183d988c82111e007807f19d89105111 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Fri, 25 Apr 2025 16:49:59 -0400 Subject: [PATCH 44/97] [WIP] AWS Nova Sonic service - added TTSTextFrame and reworked/cleaned up some bookkeeping logic --- src/pipecat/services/aws_nova_sonic/aws.py | 101 +++++++++++++-------- 1 file changed, 65 insertions(+), 36 deletions(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index 5ded76b81..4e5619f52 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -34,6 +34,7 @@ from pipecat.frames.frames import ( TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame, + TTSTextFrame, ) from pipecat.processors.frame_processor import FrameDirection from pipecat.services.llm_service import LLMService @@ -394,13 +395,14 @@ class AWSNovaSonicService(LLMService): ) 
self._content_being_received = content + # print(f"[pk] content start: {self._content_being_received}") + if content.role == Role.ASSISTANT: if content.type == ContentType.AUDIO: - # Report that *equivalent* of TTS (this is a speech-to-speech model) started - # print("[pk] TTS started") - await self.push_frame(TTSStartedFrame()) - - # print(f"[pk] content start: {self._content_being_received}") + if not self._assistant_is_responding: + # The assistant has started responding. + self._assistant_is_responding = True + await self._report_assistant_started_responding() async def _handle_text_output_event(self, event_json): # This should never happen @@ -448,49 +450,76 @@ class AWSNovaSonicService(LLMService): self._content_being_received = None if content.role == Role.ASSISTANT: - if content.type == ContentType.AUDIO: - # We got to the end of a chunk of the assistant's audio. - # Report that *equivalent* of TTS (this is a speech-to-speech model) stopped. - # print("[pk] TTS stopped") - await self.push_frame(TTSStoppedFrame()) - elif content.type == ContentType.TEXT: + if content.type == ContentType.TEXT: # Ignore non-final text, and the "interrupted" message (which isn't meaningful text) if content.text_stage == TextStage.FINAL and stop_reason != "INTERRUPTED": - # TODO: the way we're tracking the start and stop of the assistant response here - # is rather busted, and results in way too many "responses" being put into the - # context (every final text content block is treated as its own response). - # We *should* only record that an assistant response has ended when: - # - the assistant truly finished its turn (stop_reason is END_TURN) - # - when this is the next text content block after an INTERRUPTED has occurred - # BUT it seems like there's a bug where, if there are multiple assistant text - # content blocks, the *first* one gets marked END_TURN rather than the last. 
- print("[pk] LLM full response started") - self._assistant_is_responding = True - await self.push_frame(LLMFullResponseStartFrame()) + # TODO: shoot, for now we may need to "restart" the assistant responding because + # every FINAL text block has to be treated as its own response. See below TODO + # for more information. + if not self._assistant_is_responding: + self._assistant_is_responding = True + await self._report_assistant_started_responding() if self._assistant_is_responding: - # Add text to the ongoing reported assistant response - print(f"[pk] LLM text: {content.text_content}") - await self.push_frame(LLMTextFrame(content.text_content)) + # Text added to the ongoing assistant response + await self._report_assistant_response_text_added(content.text_content) - # Report that the assistant has finished their response. - # TODO: kinda busted. see TODO comment above. - print("[pk] LLM full response ended") - await self.push_frame(LLMFullResponseEndFrame()) + # Consider the assistant finished with their response. + # TODO: the way we're tracking the start/stop of the assistant response + # is rather busted, and results in way too many "responses" being put into + # the context (every FINAL text content block is treated as its own + # response). We *should* only record that an assistant response has ended + # when: + # - the assistant truly finished its turn (stop_reason is END_TURN) + # - when the assistant has been interrupted, and outputs what's actually + # been said + # BUT it seems like there's a bug where, if there are multiple assistant + # text content blocks, the *first* one gets marked END_TURN rather than the + # last. It's similarly unclear how to determine what the last text content + # block will be after an interruption. 
self._assistant_is_responding = False + await self._report_assistant_stopped_responding() elif content.role == Role.USER: if content.type == ContentType.TEXT: if content.text_stage == TextStage.FINAL: - # Report a bit of user transcription - print(f"[pk] transcription: {content.text_content}") - await self.push_frame( - TranscriptionFrame( - text=content.text_content, user_id="", timestamp=time_now_iso8601() - ) - ) + # User transcription text added + await self._report_user_transcription_text_added(content.text_content) self._content_being_received = False async def _handle_completion_end_event(self, event_json): # print("[pk] completion end") pass + + async def _report_assistant_started_responding(self): + # Report that the assistant has started their response. + print("[pk] LLM full response started") + await self.push_frame(LLMFullResponseStartFrame()) + + # Report that equivalent of TTS (this is a speech-to-speech model) started + print("[pk] TTS started") + await self.push_frame(TTSStartedFrame()) + + async def _report_assistant_response_text_added(self, text): + # Report some text added to the ongoing assistant response + print(f"[pk] LLM text: {text}") + await self.push_frame(LLMTextFrame(text)) + + # Report some text added to the *equivalent* of TTS (this is a speech-to-speech model) + print(f"[pk] TTS text: {text}") + await self.push_frame(TTSTextFrame(text)) + + async def _report_assistant_stopped_responding(self): + # Report that the assistant has finished their response. + print("[pk] LLM full response ended") + await self.push_frame(LLMFullResponseEndFrame()) + + # Report that equivalent of TTS (this is a speech-to-speech model) stopped. 
+ print("[pk] TTS stopped") + await self.push_frame(TTSStoppedFrame()) + + async def _report_user_transcription_text_added(self, text): + print(f"[pk] transcription: {text}") + await self.push_frame( + TranscriptionFrame(text=text, user_id="", timestamp=time_now_iso8601()) + ) From 1f9baefba8bab6a13c46ca1af129544f520e2a5a Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Sun, 27 Apr 2025 06:50:28 -0400 Subject: [PATCH 45/97] [WIP] AWS Nova Sonic service - added stubs for handling interruption and user-started-speaking frames --- src/pipecat/services/aws_nova_sonic/aws.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index 4e5619f52..737bc82fb 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -30,11 +30,15 @@ from pipecat.frames.frames import ( LLMFullResponseStartFrame, LLMTextFrame, StartFrame, + StartInterruptionFrame, + StopInterruptionFrame, TranscriptionFrame, TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame, TTSTextFrame, + UserStartedSpeakingFrame, + UserStoppedSpeakingFrame, ) from pipecat.processors.frame_processor import FrameDirection from pipecat.services.llm_service import LLMService @@ -131,6 +135,15 @@ class AWSNovaSonicService(LLMService): if isinstance(frame, InputAudioRawFrame): # TODO: check if _audio_input_paused? what causes that? await self._send_user_audio_event(frame) + # TODO: do we need to do anything for these? 
+ elif isinstance(frame, StartInterruptionFrame): + print("[pk] StartInterruptionFrame") + elif isinstance(frame, UserStartedSpeakingFrame): + print("[pk] UserStartedSpeakingFrame") + elif isinstance(frame, StopInterruptionFrame): + print("[pk] StopInterruptionFrame") + elif isinstance(frame, UserStoppedSpeakingFrame): + print("[pk] UserStoppedSpeakingFrame") await self.push_frame(frame, direction) From 5b64613f65a3c6d28923bb2396a58c4909a05f4d Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Mon, 28 Apr 2025 09:16:10 -0400 Subject: [PATCH 46/97] [WIP] AWS Nova Sonic service --- examples/foundational/39-aws-nova-sonic.py | 4 +--- src/pipecat/services/aws_nova_sonic/aws.py | 22 +++++++++++++--------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py index fffaee686..567655002 100644 --- a/examples/foundational/39-aws-nova-sonic.py +++ b/examples/foundational/39-aws-nova-sonic.py @@ -43,9 +43,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection): camera_in_enabled=False, vad_enabled=True, vad_audio_passthrough=True, - # set stop_secs to something roughly similar to the internal setting - # of the Multimodal Live api, just to align events. - vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)), + vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.8)), ), ) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index 737bc82fb..facf84a49 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -137,13 +137,17 @@ class AWSNovaSonicService(LLMService): await self._send_user_audio_event(frame) # TODO: do we need to do anything for these? 
elif isinstance(frame, StartInterruptionFrame): - print("[pk] StartInterruptionFrame") + # print("[pk] StartInterruptionFrame") + pass elif isinstance(frame, UserStartedSpeakingFrame): - print("[pk] UserStartedSpeakingFrame") + # print("[pk] UserStartedSpeakingFrame") + pass elif isinstance(frame, StopInterruptionFrame): - print("[pk] StopInterruptionFrame") + # print("[pk] StopInterruptionFrame") + pass elif isinstance(frame, UserStoppedSpeakingFrame): - print("[pk] UserStoppedSpeakingFrame") + # print("[pk] UserStoppedSpeakingFrame") + pass await self.push_frame(frame, direction) @@ -415,7 +419,7 @@ class AWSNovaSonicService(LLMService): if not self._assistant_is_responding: # The assistant has started responding. self._assistant_is_responding = True - await self._report_assistant_started_responding() + await self._report_assistant_response_started() async def _handle_text_output_event(self, event_json): # This should never happen @@ -471,7 +475,7 @@ class AWSNovaSonicService(LLMService): # for more information. if not self._assistant_is_responding: self._assistant_is_responding = True - await self._report_assistant_started_responding() + await self._report_assistant_response_started() if self._assistant_is_responding: # Text added to the ongoing assistant response @@ -491,7 +495,7 @@ class AWSNovaSonicService(LLMService): # last. It's similarly unclear how to determine what the last text content # block will be after an interruption. self._assistant_is_responding = False - await self._report_assistant_stopped_responding() + await self._report_assistant_response_ended() elif content.role == Role.USER: if content.type == ContentType.TEXT: if content.text_stage == TextStage.FINAL: @@ -504,7 +508,7 @@ class AWSNovaSonicService(LLMService): # print("[pk] completion end") pass - async def _report_assistant_started_responding(self): + async def _report_assistant_response_started(self): # Report that the assistant has started their response. 
print("[pk] LLM full response started") await self.push_frame(LLMFullResponseStartFrame()) @@ -522,7 +526,7 @@ class AWSNovaSonicService(LLMService): print(f"[pk] TTS text: {text}") await self.push_frame(TTSTextFrame(text)) - async def _report_assistant_stopped_responding(self): + async def _report_assistant_response_ended(self): # Report that the assistant has finished their response. print("[pk] LLM full response ended") await self.push_frame(LLMFullResponseEndFrame()) From 68c1069548c110dbad409bee788a2f571854bea0 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Mon, 28 Apr 2025 10:37:11 -0400 Subject: [PATCH 47/97] [WIP] AWS Nova Sonic service --- src/pipecat/services/aws_nova_sonic/aws.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index facf84a49..dd7fd702b 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -412,7 +412,9 @@ class AWSNovaSonicService(LLMService): ) self._content_being_received = content - # print(f"[pk] content start: {self._content_being_received}") + # print(f"[pk] content start: {content}") + if content.role == Role.ASSISTANT: + print(f"[pk] assistant content start: {content}") if content.role == Role.ASSISTANT: if content.type == ContentType.AUDIO: @@ -425,13 +427,15 @@ class AWSNovaSonicService(LLMService): # This should never happen if not self._content_being_received: return + content = self._content_being_received text_content = event_json["textOutput"]["content"] # print(f"[pk] text output. content: {text_content}") + if content.role == Role.ASSISTANT: + print(f"[pk] assistant text output. 
content: {text_content}") # Bookkeeping: augment the current content being received with text # Assumption: only one text content per content block - content = self._content_being_received content.text_content = text_content async def _handle_audio_output_event(self, event_json): @@ -457,13 +461,15 @@ class AWSNovaSonicService(LLMService): # This should never happen if not self._content_being_received: return + content = self._content_being_received content_end = event_json["contentEnd"] stop_reason = content_end["stopReason"] - # print(f"[pk] content end: {self._content_being_received}.\n stop_reason: {stop_reason}") + # print(f"[pk] content end: {content}.\n stop_reason: {stop_reason}") + if content.role == Role.ASSISTANT: + print(f"[pk] assistant content end: {content}.\n stop_reason: {stop_reason}") # Bookkeeping: clear current content being received - content = self._content_being_received self._content_being_received = None if content.role == Role.ASSISTANT: From 96d05e12fcd7fa821657b10b74ef6d564c4e279d Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Mon, 28 Apr 2025 11:15:51 -0400 Subject: [PATCH 48/97] [WIP] AWS Nova Sonic service --- src/pipecat/services/aws_nova_sonic/aws.py | 57 +++++++++++----------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index dd7fd702b..640658ca9 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -22,6 +22,7 @@ from smithy_aws_core.identity import AWSCredentialsIdentity from smithy_core.aio.eventstream import DuplexEventStream from pipecat.frames.frames import ( + BotStoppedSpeakingFrame, CancelFrame, EndFrame, Frame, @@ -148,9 +149,29 @@ class AWSNovaSonicService(LLMService): elif isinstance(frame, UserStoppedSpeakingFrame): # print("[pk] UserStoppedSpeakingFrame") pass + elif isinstance(frame, BotStoppedSpeakingFrame): + await self._handle_bot_stopped_speaking() 
await self.push_frame(frame, direction) + async def _handle_bot_stopped_speaking(self): + if self._assistant_is_responding: + # Consider the assistant finished with their response. + # + # TODO: ideally we could base this solely on the LLM output events, but I couldn't + # figure out a reliable way to determine when we've gotten our last FINAL text block + # after the LLM is done talking. + # + # First I looked at stopReason, but it doesn't seem like the last FINAL text block is + # reliably marked END_TURN (sometimes the *first* one is, but not the last...bug?) + # + # Then I considered schemes where we tally or match up SPECULATIVE text blocks with + # FINAL text blocks to know how many or which FINAL blocks to expect, but user + # interruptions throw a wrench in these schemes: depending on the exact timing of the + # interruption, we should or shouldn't expect some FINAL blocks. + self._assistant_is_responding = False + await self._report_assistant_response_ended() + # # LLM communication: lifecycle # @@ -413,11 +434,12 @@ class AWSNovaSonicService(LLMService): self._content_being_received = content # print(f"[pk] content start: {content}") - if content.role == Role.ASSISTANT: - print(f"[pk] assistant content start: {content}") + # if content.role == Role.ASSISTANT: + # print(f"[pk] assistant content start: {content}") if content.role == Role.ASSISTANT: if content.type == ContentType.AUDIO: + # Note that an assistant response can comprise of multiple audio blocks if not self._assistant_is_responding: # The assistant has started responding. self._assistant_is_responding = True @@ -431,8 +453,8 @@ class AWSNovaSonicService(LLMService): text_content = event_json["textOutput"]["content"] # print(f"[pk] text output. content: {text_content}") - if content.role == Role.ASSISTANT: - print(f"[pk] assistant text output. content: {text_content}") + # if content.role == Role.ASSISTANT: + # print(f"[pk] assistant text output. 
content: {text_content}") # Bookkeeping: augment the current content being received with text # Assumption: only one text content per content block @@ -466,8 +488,8 @@ class AWSNovaSonicService(LLMService): content_end = event_json["contentEnd"] stop_reason = content_end["stopReason"] # print(f"[pk] content end: {content}.\n stop_reason: {stop_reason}") - if content.role == Role.ASSISTANT: - print(f"[pk] assistant content end: {content}.\n stop_reason: {stop_reason}") + # if content.role == Role.ASSISTANT: + # print(f"[pk] assistant content end: {content}.\n stop_reason: {stop_reason}") # Bookkeeping: clear current content being received self._content_being_received = None @@ -476,32 +498,9 @@ class AWSNovaSonicService(LLMService): if content.type == ContentType.TEXT: # Ignore non-final text, and the "interrupted" message (which isn't meaningful text) if content.text_stage == TextStage.FINAL and stop_reason != "INTERRUPTED": - # TODO: shoot, for now we may need to "restart" the assistant responding because - # every FINAL text block has to be treated as its own response. See below TODO - # for more information. - if not self._assistant_is_responding: - self._assistant_is_responding = True - await self._report_assistant_response_started() - if self._assistant_is_responding: # Text added to the ongoing assistant response await self._report_assistant_response_text_added(content.text_content) - - # Consider the assistant finished with their response. - # TODO: the way we're tracking the start/stop of the assistant response - # is rather busted, and results in way too many "responses" being put into - # the context (every FINAL text content block is treated as its own - # response). 
We *should* only record that an assistant response has ended - # when: - # - the assistant truly finished its turn (stop_reason is END_TURN) - # - when the assistant has been interrupted, and outputs what's actually - # been said - # BUT it seems like there's a bug where, if there are multiple assistant - # text content blocks, the *first* one gets marked END_TURN rather than the - # last. It's similarly unclear how to determine what the last text content - # block will be after an interruption. - self._assistant_is_responding = False - await self._report_assistant_response_ended() elif content.role == Role.USER: if content.type == ContentType.TEXT: if content.text_stage == TextStage.FINAL: From 9b8bce1914810971ef18bc8a01d4ab41287fc2e5 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Mon, 28 Apr 2025 13:18:09 -0400 Subject: [PATCH 49/97] [WIP] AWS Nova Sonic service - add voice_id --- examples/foundational/39-aws-nova-sonic.py | 1 + src/pipecat/services/aws_nova_sonic/aws.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py index 567655002..445464957 100644 --- a/examples/foundational/39-aws-nova-sonic.py +++ b/examples/foundational/39-aws-nova-sonic.py @@ -66,6 +66,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection): secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"), access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), region=os.getenv("AWS_REGION"), + voice_id="tiffany", # matthew, tiffany, amy ) # Build the pipeline diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index 640658ca9..eb57d9b80 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -90,6 +90,7 @@ class AWSNovaSonicService(LLMService): access_key_id: str, region: str, model: str = "amazon.nova-sonic-v1:0", + voice_id: str = "matthew", # matthew, tiffany, amy **kwargs, ): 
super().__init__(**kwargs) @@ -99,6 +100,7 @@ class AWSNovaSonicService(LLMService): self._region = region self._model = model self._client: BedrockRuntimeClient = None + self._voice_id = voice_id self._stream: DuplexEventStream[ InvokeModelWithBidirectionalStreamInput, InvokeModelWithBidirectionalStreamOutput, @@ -257,7 +259,7 @@ class AWSNovaSonicService(LLMService): "sampleRateHertz": 24000, "sampleSizeBits": 16, "channelCount": 1, - "voiceId": "matthew", + "voiceId": "{self._voice_id}", "encoding": "base64", "audioType": "SPEECH" }} From 9f7f42e885db0d68cfa619ad5720abfd536ce690 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Mon, 28 Apr 2025 13:41:55 -0400 Subject: [PATCH 50/97] [WIP] AWS Nova Sonic service --- src/pipecat/services/aws_nova_sonic/aws.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index eb57d9b80..563d35422 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -193,7 +193,7 @@ class AWSNovaSonicService(LLMService): ) # Send session start events - await self._send_session_start_event() + await self._send_session_start_events() # Send initial system instruction await self._send_text_event(text=self._instruction, role=Role.SYSTEM) @@ -230,7 +230,7 @@ class AWSNovaSonicService(LLMService): # # TODO: make params configurable? 
- async def _send_session_start_event(self): + async def _send_session_start_events(self): session_start = """ { "event": { From f182eafb40dd320c874f963e7549ba64a4dfac8b Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Tue, 29 Apr 2025 11:39:32 -0400 Subject: [PATCH 51/97] [WIP] AWS Nova Sonic service - add ability to pass in OpenAILLMContext --- examples/foundational/39-aws-nova-sonic.py | 47 ++--- .../services/aws_nova_sonic/__init__.py | 2 +- src/pipecat/services/aws_nova_sonic/aws.py | 187 +++++++++++++++--- .../services/aws_nova_sonic/context.py | 121 ++++++++++++ 4 files changed, 303 insertions(+), 54 deletions(-) create mode 100644 src/pipecat/services/aws_nova_sonic/context.py diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py index 445464957..c44f85a48 100644 --- a/examples/foundational/39-aws-nova-sonic.py +++ b/examples/foundational/39-aws-nova-sonic.py @@ -16,7 +16,8 @@ from pipecat.frames.frames import LLMMessagesAppendFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask -from pipecat.services.aws_nova_sonic import AWSNovaSonicService +from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext +from pipecat.services.aws_nova_sonic import AWSNovaSonicLLMService from pipecat.transports.base_transport import TransportParams from pipecat.transports.network.small_webrtc import SmallWebRTCTransport from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection @@ -47,13 +48,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection): ), ) - # Create the AWS Nova Sonic LLM service - # system_instruction = f""" - # You are a helpful AI assistant. - # Your goal is to demonstrate your capabilities in a helpful and engaging way. - # Your output will be converted to audio so don't include special characters in your answers. 
- # Respond to what the user said in a creative and helpful way. - # """ + # Specify initial system instruction # TODO: looks like Nova Sonic can't handle new lines? system_instruction = ( "You are a friendly assistant. The user and you will engage in a spoken dialog " @@ -61,20 +56,37 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection): "generally two or three sentences for chatty scenarios." ) - llm = AWSNovaSonicService( - instruction=system_instruction, + # Create the AWS Nova Sonic LLM service + llm = AWSNovaSonicLLMService( secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"), access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), region=os.getenv("AWS_REGION"), - voice_id="tiffany", # matthew, tiffany, amy + voice_id="tiffany", # matthew, tiffany, amy + # instruction=system_instruction # could pass instruction here rather than context, below ) + # Set up context and context management. + # AWSNovaSonicService will adapt OpenAI LLM context objects with standard message format to + # what's expected by Nova Sonic. + context = OpenAILLMContext( + messages=[ + {"role": "system", "content": f"{system_instruction}"}, + { + "role": "user", + "content": "Tell me hello! Don't wait for me to say anything else first!", + }, + ] + ) + context_aggregator = llm.create_context_aggregator(context) + # Build the pipeline pipeline = Pipeline( [ transport.input(), + context_aggregator.user(), llm, transport.output(), + context_aggregator.assistant(), ] ) @@ -93,18 +105,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection): async def on_client_connected(transport, client): logger.info(f"Client connected") # Kick off the conversation. 
- await task.queue_frames( - [ - LLMMessagesAppendFrame( - messages=[ - { - "role": "user", - "content": f"Greet the user and introduce yourself.", - } - ] - ) - ] - ) + await task.queue_frames([context_aggregator.user().get_context_frame()]) # Handle client disconnection events @transport.event_handler("on_client_disconnected") diff --git a/src/pipecat/services/aws_nova_sonic/__init__.py b/src/pipecat/services/aws_nova_sonic/__init__.py index b5559715a..e14c44f8a 100644 --- a/src/pipecat/services/aws_nova_sonic/__init__.py +++ b/src/pipecat/services/aws_nova_sonic/__init__.py @@ -1 +1 @@ -from .aws import AWSNovaSonicService +from .aws import AWSNovaSonicLLMService diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index 563d35422..cc07e5463 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -3,6 +3,7 @@ import json import uuid from dataclasses import dataclass from enum import Enum +from typing import Any from aws_sdk_bedrock_runtime.client import ( BedrockRuntimeClient, @@ -41,18 +42,26 @@ from pipecat.frames.frames import ( UserStartedSpeakingFrame, UserStoppedSpeakingFrame, ) +from pipecat.processors.aggregators.llm_response import ( + LLMAssistantAggregatorParams, + LLMUserAggregatorParams, +) +from pipecat.processors.aggregators.openai_llm_context import ( + OpenAILLMContext, + OpenAILLMContextFrame, +) from pipecat.processors.frame_processor import FrameDirection +from pipecat.services.aws_nova_sonic.context import ( + AWSNovaSonicAssistantContextAggregator, + AWSNovaSonicContextAggregatorPair, + AWSNovaSonicLLMContext, + AWSNovaSonicUserContextAggregator, + Role, +) from pipecat.services.llm_service import LLMService from pipecat.utils.time import time_now_iso8601 -class Role(Enum): - SYSTEM = "SYSTEM" - USER = "USER" - ASSISTANT = "ASSISTANT" - TOOL = "TOOL" - - class ContentType(Enum): AUDIO = "AUDIO" TEXT = "TEXT" @@ -81,36 +90,40 @@ class 
CurrentContent: ) -class AWSNovaSonicService(LLMService): +class AWSNovaSonicLLMService(LLMService): def __init__( self, *, - instruction: str, + # TODO: if we have instruction here as an alternative to using context, we should do the same for tools...right? secret_access_key: str, access_key_id: str, region: str, model: str = "amazon.nova-sonic-v1:0", voice_id: str = "matthew", # matthew, tiffany, amy + instruction: str = None, **kwargs, ): super().__init__(**kwargs) - self._instruction = instruction self._secret_access_key = secret_access_key self._access_key_id = access_key_id self._region = region self._model = model self._client: BedrockRuntimeClient = None self._voice_id = voice_id + self._instruction = instruction + self._context: AWSNovaSonicLLMContext = None self._stream: DuplexEventStream[ InvokeModelWithBidirectionalStreamInput, InvokeModelWithBidirectionalStreamOutput, InvokeModelWithBidirectionalStreamOperationOutput, ] = None self._receive_task = None - self._prompt_name = str(uuid.uuid4()) - self._input_audio_content_name = str(uuid.uuid4()) - self._content_being_received = None # TODO: clean this up on error or when finished + self._prompt_name = None + self._input_audio_content_name = None + self._content_being_received = None self._assistant_is_responding = False + self._context_available = False + self._ready_to_send_context = False # # standard AIService frame handling @@ -118,7 +131,14 @@ class AWSNovaSonicService(LLMService): async def start(self, frame: StartFrame): await super().start(frame) - await self._connect() + # TODO: maybe connect but don't send history until we get all of our settings? + # how do we know how long to wait? 
+ # ah, i think we'll *always* get at least one OpenAILLMContextFrame which kicks things off + # so we need to send the initial history when: + # - we're connected + # - we've gotten the first context + # i *think* this is what's controlled by _api_session_ready/_run_llm_when_api_session_ready + await self._start_connecting() async def stop(self, frame: EndFrame): await super().stop(frame) @@ -135,10 +155,14 @@ class AWSNovaSonicService(LLMService): async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) - if isinstance(frame, InputAudioRawFrame): + if isinstance(frame, OpenAILLMContextFrame): + await self._handle_context(frame.context) + elif isinstance(frame, InputAudioRawFrame): # TODO: check if _audio_input_paused? what causes that? await self._send_user_audio_event(frame) - # TODO: do we need to do anything for these? + elif isinstance(frame, BotStoppedSpeakingFrame): + await self._handle_bot_stopped_speaking() + # TODO: do we need to do anything for the below four frame types? 
elif isinstance(frame, StartInterruptionFrame): # print("[pk] StartInterruptionFrame") pass @@ -151,11 +175,19 @@ class AWSNovaSonicService(LLMService): elif isinstance(frame, UserStoppedSpeakingFrame): # print("[pk] UserStoppedSpeakingFrame") pass - elif isinstance(frame, BotStoppedSpeakingFrame): - await self._handle_bot_stopped_speaking() await self.push_frame(frame, direction) + async def _handle_context(self, context: OpenAILLMContext): + # TODO: if context has changed, reconnect + # TODO: remove + print(f"[pk] _handle_context: {context.get_messages_for_initializing_history()}") + if not self._context: + # We got our initial context - try to finish connecting + self._context = AWSNovaSonicLLMContext.upgrade_to_nova_sonic(context) + self._context_available = True + await self._finish_connecting_if_context_available() + async def _handle_bot_stopped_speaking(self): if self._assistant_is_responding: # Consider the assistant finished with their response. @@ -178,12 +210,16 @@ class AWSNovaSonicService(LLMService): # LLM communication: lifecycle # - async def _connect(self): + async def _start_connecting(self): try: if self._client: - # Here we assume that if we have a client we are connected + # Here we assume that if we have a client we are connected or connecting return + # Set IDs for the connection + self._prompt_name = str(uuid.uuid4()) + self._input_audio_content_name = str(uuid.uuid4()) + # Create the client self._client = self._create_client() @@ -195,19 +231,71 @@ class AWSNovaSonicService(LLMService): # Send session start events await self._send_session_start_events() - # Send initial system instruction - await self._send_text_event(text=self._instruction, role=Role.SYSTEM) - - # Start audio input - await self._send_audio_input_start_event() - - self._receive_task = self.create_task(self._receive_task_handler()) + # Finish connecting + self._ready_to_send_context = True + await self._finish_connecting_if_context_available() except Exception as e: 
logger.error(f"{self} initialization error: {e}") - self._client = None + self._disconnect() + + async def _finish_connecting_if_context_available(self): + # We can only finish connecting once we've gotten our initial context and we're ready to + # send it + if not (self._context_available and self._ready_to_send_context): + return + + # Read context + history = self._context.get_messages_for_initializing_history() + + # Send system instruction + # Instruction from context takes priority + instruction = history.instruction if history.instruction else self._instruction + if instruction: + await self._send_text_event(text=instruction, role=Role.SYSTEM) + + # Send conversation history + for message in history.messages: + await self._send_text_event(text=message.text, role=message.role) + + # Send initial context (system instruction and conversation history) + # TODO: finish implementing + # - pass additional message(s) + # - merge init-passed system instruction + context instruction (latter takes precedence) + # - merge init-passed tools + context tools (latter takes precedence) + await self._send_text_event(text=self._instruction, role=Role.SYSTEM) + + # Start audio input + await self._send_audio_input_start_event() + + # Start receiving events + self._receive_task = self.create_task(self._receive_task_handler()) async def _disconnect(self): - pass + try: + # Clean up receive task + if self._receive_task: + await self.cancel_task(self._receive_task, timeout=1.0) + self._receive_task = None + + # Clean up client + if self._client: + await self._send_session_end_events() + self._client = None + + # Clean up stream + if self._stream: + await self._stream.input_stream.close() + self._stream = None + + # Reset remaining connection-specific state + self._prompt_name = None + self._input_audio_content_name = None + self._content_being_received = None + self._assistant_is_responding = False + self._context_available = False + self._ready_to_send_context = False + except 
Exception as e: + logger.error(f"{self} error disconnecting: {e}") def _create_client(self) -> BedrockRuntimeClient: config = Config( @@ -340,7 +428,7 @@ class AWSNovaSonicService(LLMService): await self._send_client_event(text_content_end) async def _send_user_audio_event(self, frame: InputAudioRawFrame): - if not self._client: + if not self._stream: return blob = base64.b64encode(frame.audio) @@ -357,6 +445,30 @@ class AWSNovaSonicService(LLMService): ''' await self._send_client_event(audio_event) + async def _send_session_end_events(self): + if not self._stream: + return + + prompt_end = f''' + {{ + "event": {{ + "promptEnd": {{ + "promptName": "{self._prompt_name}" + }} + }} + }} + ''' + await self._send_client_event(prompt_end) + + session_end = """ + { + "event": { + "sessionEnd": {} + } + } + """ + await self._send_client_event(session_end) + async def _send_client_event(self, event_json: str): event = InvokeModelWithBidirectionalStreamInputChunk( value=BidirectionalInputPayloadPart(bytes_=event_json.encode("utf-8")) @@ -547,3 +659,18 @@ class AWSNovaSonicService(LLMService): await self.push_frame( TranscriptionFrame(text=text, user_id="", timestamp=time_now_iso8601()) ) + + # + # Context + # + + def create_context_aggregator( + self, + context: OpenAILLMContext, + *, + user_params: LLMUserAggregatorParams = LLMUserAggregatorParams(), + assistant_params: LLMAssistantAggregatorParams = LLMAssistantAggregatorParams(), + ) -> AWSNovaSonicContextAggregatorPair: + user = AWSNovaSonicUserContextAggregator(context=context, params=user_params) + assistant = AWSNovaSonicAssistantContextAggregator(context=context, params=assistant_params) + return AWSNovaSonicContextAggregatorPair(user, assistant) diff --git a/src/pipecat/services/aws_nova_sonic/context.py b/src/pipecat/services/aws_nova_sonic/context.py new file mode 100644 index 000000000..331ecc13e --- /dev/null +++ b/src/pipecat/services/aws_nova_sonic/context.py @@ -0,0 +1,121 @@ +import copy +from dataclasses 
import dataclass, field +from enum import Enum + +from loguru import logger + +from pipecat.frames.frames import DataFrame, Frame, LLMMessagesUpdateFrame, LLMSetToolsFrame +from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext +from pipecat.processors.frame_processor import FrameDirection +from pipecat.services.openai.llm import ( + OpenAIAssistantContextAggregator, + OpenAIUserContextAggregator, +) + + +class Role(Enum): + SYSTEM = "SYSTEM" + USER = "USER" + ASSISTANT = "ASSISTANT" + TOOL = "TOOL" + + +@dataclass +class AWSNovaSonicConversationHistoryMessage: + role: Role # only USER and ASSISTANT + text: str + + +@dataclass +class AWSNovaSonicConversationHistory: + instruction: str = None + messages: list[AWSNovaSonicConversationHistoryMessage] = field(default_factory=list) + + +@dataclass +class AWSNovaSonicLLMContext(OpenAILLMContext): + @staticmethod + def upgrade_to_nova_sonic(obj: OpenAILLMContext) -> "AWSNovaSonicLLMContext": + if isinstance(obj, OpenAILLMContext) and not isinstance(obj, AWSNovaSonicLLMContext): + obj.__class__ = AWSNovaSonicLLMContext + return obj + + def get_messages_for_initializing_history(self) -> AWSNovaSonicConversationHistory: + history = AWSNovaSonicConversationHistory() + + # Bail if there are no messages + if not self.messages: + return history + + messages = copy.deepcopy(self.messages) + + # If we have a "system" message as our first message, let's pull that out into "instruction" + if messages[0].get("role") == "system": + system = messages.pop(0) + content = system.get("content") + if isinstance(content, str): + history.instruction = content + elif isinstance(content, list): + history.instruction = content[0].get("text") + + # Process remaining messages to fill out conversation history. + # Nova Sonic supports "user" and "assistant" messages in history. 
+ for message in messages: + history_message = self.from_standard_message(message) + if history_message: + history.messages.append(history_message) + + return history + + def from_standard_message(self, message) -> AWSNovaSonicConversationHistoryMessage: + role = message.get("role") + if message.get("role") == "user" or message.get("role") == "assistant": + content = message.get("content") + if isinstance(message.get("content"), list): + content = "" + for c in message.get("content"): + if c.get("type") == "text": + content += " " + c.get("text") + else: + logger.error( + f"Unhandled content type in context message: {c.get('type')} - {message}" + ) + return AWSNovaSonicConversationHistoryMessage(role=Role[role.upper()], text=content) + logger.error(f"Unhandled message type in from_standard_message: {message}") + + +@dataclass +class AWSNovaSonicMessagesUpdateFrame(DataFrame): + context: AWSNovaSonicLLMContext + + +class AWSNovaSonicUserContextAggregator(OpenAIUserContextAggregator): + async def process_frame( + self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM + ): + await super().process_frame(frame, direction) + + # Parent does not push LLMMessagesUpdateFrame + if isinstance(frame, LLMMessagesUpdateFrame): + await self.push_frame(AWSNovaSonicMessagesUpdateFrame(context=self._context)) + + # Parent also doesn't push the LLMSetToolsFrame + # TODO: this + # if isinstance(frame, LLMSetToolsFrame): + # await self.push_frame(frame, direction) + + +class AWSNovaSonicAssistantContextAggregator(OpenAIAssistantContextAggregator): + pass + + +@dataclass +class AWSNovaSonicContextAggregatorPair: + _user: AWSNovaSonicUserContextAggregator + _assistant: AWSNovaSonicAssistantContextAggregator + + def user(self) -> AWSNovaSonicUserContextAggregator: + return self._user + + def assistant(self) -> AWSNovaSonicAssistantContextAggregator: + return self._assistant From 2b7e1cb5b1fdae4a7b1ecf27ade3d8bafd0fefd8 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: 
Tue, 29 Apr 2025 16:38:02 -0400 Subject: [PATCH 52/97] [WIP] AWS Nova Sonic service - add tool calling --- examples/foundational/39-aws-nova-sonic.py | 50 +++++- .../services/aws_nova_sonic_adapter.py | 40 +++++ src/pipecat/services/aws_nova_sonic/aws.py | 149 +++++++++++++++++- .../services/aws_nova_sonic/context.py | 25 ++- src/pipecat/services/aws_nova_sonic/frames.py | 14 ++ 5 files changed, 267 insertions(+), 11 deletions(-) create mode 100644 src/pipecat/adapters/services/aws_nova_sonic_adapter.py create mode 100644 src/pipecat/services/aws_nova_sonic/frames.py diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py index c44f85a48..c9bef1fed 100644 --- a/examples/foundational/39-aws-nova-sonic.py +++ b/examples/foundational/39-aws-nova-sonic.py @@ -5,14 +5,16 @@ # import os +from datetime import datetime from dotenv import load_dotenv from loguru import logger # import logging +from pipecat.adapters.schemas.function_schema import FunctionSchema +from pipecat.adapters.schemas.tools_schema import ToolsSchema from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.audio.vad.vad_analyzer import VADParams -from pipecat.frames.frames import LLMMessagesAppendFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask @@ -31,6 +33,39 @@ load_dotenv(override=True) # ) +async def fetch_weather_from_api(function_name, tool_call_id, args, llm, context, result_callback): + temperature = 75 if args["format"] == "fahrenheit" else 24 + await result_callback( + { + "conditions": "nice", + "temperature": temperature, + "format": args["format"], + "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"), + } + ) + + +weather_function = FunctionSchema( + name="get_current_weather", + description="Get the current weather", + properties={ + "location": { + "type": "string", + "description": "The city and state, e.g. 
San Francisco, CA", + }, + "format": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "The temperature unit to use. Infer this from the users location.", + }, + }, + required=["location", "format"], +) + +# Create tools schema +tools = ToolsSchema(standard_tools=[weather_function]) + + async def run_bot(webrtc_connection: SmallWebRTCConnection): logger.info(f"Starting bot") @@ -62,20 +97,27 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection): access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), region=os.getenv("AWS_REGION"), voice_id="tiffany", # matthew, tiffany, amy - # instruction=system_instruction # could pass instruction here rather than context, below + # instruction=system_instruction # you could pass instruction here rather than in context ) + # Register function for function calls + # you can either register a single function for all function calls, or specific functions + # llm.register_function(None, fetch_weather_from_api) + llm.register_function("get_current_weather", fetch_weather_from_api) + # Set up context and context management. # AWSNovaSonicService will adapt OpenAI LLM context objects with standard message format to # what's expected by Nova Sonic. + # TODO: since we can't trigger a response upon joining, this isn't particularly useful context = OpenAILLMContext( messages=[ {"role": "system", "content": f"{system_instruction}"}, { "role": "user", - "content": "Tell me hello! 
Don't wait for me to say anything else first!", + "content": "Say hello!", }, - ] + ], + tools=tools, ) context_aggregator = llm.create_context_aggregator(context) diff --git a/src/pipecat/adapters/services/aws_nova_sonic_adapter.py b/src/pipecat/adapters/services/aws_nova_sonic_adapter.py new file mode 100644 index 000000000..b96980046 --- /dev/null +++ b/src/pipecat/adapters/services/aws_nova_sonic_adapter.py @@ -0,0 +1,40 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# +import json +from typing import Any, Dict, List, Union + +from pipecat.adapters.base_llm_adapter import BaseLLMAdapter +from pipecat.adapters.schemas.function_schema import FunctionSchema +from pipecat.adapters.schemas.tools_schema import ToolsSchema + + +class AWSNovaSonicLLMAdapter(BaseLLMAdapter): + @staticmethod + def _to_aws_nova_sonic_function_format(function: FunctionSchema) -> Dict[str, Any]: + return { + "toolSpec": { + "name": function.name, + "description": function.description, + "inputSchema": { + "json": json.dumps( + { + "type": "object", + "properties": function.properties, + "required": function.required, + } + ) + }, + } + } + + def to_provider_tools_format(self, tools_schema: ToolsSchema) -> List[Dict[str, Any]]: + """Converts function schemas to Openai Realtime function-calling format. + + :return: Openai Realtime formatted function call definition. 
+ """ + + functions_schema = tools_schema.standard_tools + return [self._to_aws_nova_sonic_function_format(func) for func in functions_schema] diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index cc07e5463..8b6dab3ed 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -1,9 +1,15 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + import base64 import json import uuid from dataclasses import dataclass from enum import Enum -from typing import Any +from typing import Any, List from aws_sdk_bedrock_runtime.client import ( BedrockRuntimeClient, @@ -22,6 +28,7 @@ from smithy_aws_core.credentials_resolvers.static import StaticCredentialsResolv from smithy_aws_core.identity import AWSCredentialsIdentity from smithy_core.aio.eventstream import DuplexEventStream +from pipecat.adapters.services.aws_nova_sonic_adapter import AWSNovaSonicLLMAdapter from pipecat.frames.frames import ( BotStoppedSpeakingFrame, CancelFrame, @@ -58,10 +65,15 @@ from pipecat.services.aws_nova_sonic.context import ( AWSNovaSonicUserContextAggregator, Role, ) +from pipecat.services.aws_nova_sonic.frames import AWSNovaSonicFunctionCallResultFrame from pipecat.services.llm_service import LLMService from pipecat.utils.time import time_now_iso8601 +class AWSNovaSonicUnhandledFunctionException(Exception): + pass + + class ContentType(Enum): AUDIO = "AUDIO" TEXT = "TEXT" @@ -91,6 +103,9 @@ class CurrentContent: class AWSNovaSonicLLMService(LLMService): + # Override the default adapter to use the AWSNovaSonicLLMAdapter one + adapter_class = AWSNovaSonicLLMAdapter + def __init__( self, *, @@ -162,6 +177,8 @@ class AWSNovaSonicLLMService(LLMService): await self._send_user_audio_event(frame) elif isinstance(frame, BotStoppedSpeakingFrame): await self._handle_bot_stopped_speaking() + elif isinstance(frame, AWSNovaSonicFunctionCallResultFrame): + await 
self._handle_function_call_result(frame) # TODO: do we need to do anything for the below four frame types? elif isinstance(frame, StartInterruptionFrame): # print("[pk] StartInterruptionFrame") @@ -206,6 +223,10 @@ class AWSNovaSonicLLMService(LLMService): self._assistant_is_responding = False await self._report_assistant_response_ended() + async def _handle_function_call_result(self, frame: AWSNovaSonicFunctionCallResultFrame): + result = frame.result_frame + await self._send_tool_result(tool_call_id=result.tool_call_id, result=result.result) + # # LLM communication: lifecycle # @@ -228,8 +249,8 @@ class AWSNovaSonicLLMService(LLMService): InvokeModelWithBidirectionalStreamOperationInput(model_id=self._model) ) - # Send session start events - await self._send_session_start_events() + # Send session start event + await self._send_session_start_event() # Finish connecting self._ready_to_send_context = True @@ -247,6 +268,10 @@ class AWSNovaSonicLLMService(LLMService): # Read context history = self._context.get_messages_for_initializing_history() + # Send prompt start event, specifying tools + tools = self._context.tools + await self._send_prompt_start_event(tools) + # Send system instruction # Instruction from context takes priority instruction = history.instruction if history.instruction else self._instruction @@ -318,7 +343,7 @@ class AWSNovaSonicLLMService(LLMService): # # TODO: make params configurable? 
- async def _send_session_start_events(self): + async def _send_session_start_event(self): session_start = """ { "event": { @@ -334,6 +359,20 @@ class AWSNovaSonicLLMService(LLMService): """ await self._send_client_event(session_start) + async def _send_prompt_start_event(self, tools: List[Any]): + tools_config = ( + f""", + "toolUseOutputConfiguration": {{ + "mediaType": "application/json" + }}, + "toolConfiguration": {{ + "tools": {json.dumps(tools)} + }} + """ + if tools + else "" + ) + prompt_start = f''' {{ "event": {{ @@ -350,7 +389,7 @@ class AWSNovaSonicLLMService(LLMService): "voiceId": "{self._voice_id}", "encoding": "base64", "audioType": "SPEECH" - }} + }}{tools_config} }} }} }} @@ -382,6 +421,9 @@ class AWSNovaSonicLLMService(LLMService): await self._send_client_event(audio_content_start) async def _send_text_event(self, text: str, role: Role): + if not self._stream: + return + content_name = str(uuid.uuid4()) text_content_start = f''' @@ -469,6 +511,61 @@ class AWSNovaSonicLLMService(LLMService): """ await self._send_client_event(session_end) + async def _send_tool_result(self, tool_call_id, result): + if not self._stream: + return + + # print(f"[pk] sending tool result. 
tool call ID: {tool_call_id}, result: {result}") + + content_name = str(uuid.uuid4()) + + result_content_start = f''' + {{ + "event": {{ + "contentStart": {{ + "promptName": "{self._prompt_name}", + "contentName": "{content_name}", + "interactive": false, + "type": "TOOL", + "role": "TOOL", + "toolResultInputConfiguration": {{ + "toolUseId": "{tool_call_id}", + "type": "TEXT", + "textInputConfiguration": {{ + "mediaType": "text/plain" + }} + }} + }} + }} + }} + ''' + await self._send_client_event(result_content_start) + + result_content = json.dumps( + { + "event": { + "toolResult": { + "promptName": self._prompt_name, + "contentName": content_name, + "content": json.dumps(result) if isinstance(result, dict) else result, + } + } + } + ) + await self._send_client_event(result_content) + + result_content_end = f""" + {{ + "event": {{ + "contentEnd": {{ + "promptName": "{self._prompt_name}", + "contentName": "{content_name}" + }} + }} + }} + """ + await self._send_client_event(result_content_end) + async def _send_client_event(self, event_json: str): event = InvokeModelWithBidirectionalStreamInputChunk( value=BidirectionalInputPayloadPart(bytes_=event_json.encode("utf-8")) @@ -515,6 +612,9 @@ class AWSNovaSonicLLMService(LLMService): elif "audioOutput" in event_json: # Handle audio output content await self._handle_audio_output_event(event_json) + elif "toolUse" in event_json: + # Handle tool use + await self._handle_tool_use_event(event_json) elif "contentEnd" in event_json: # Handle a piece of content ending await self._handle_content_end_event(event_json) @@ -593,6 +693,42 @@ class AWSNovaSonicLLMService(LLMService): ) await self.push_frame(frame) + async def _handle_tool_use_event(self, event_json): + # This should never happen + if not self._content_being_received: + return + + # Get tool use details + tool_use = event_json["toolUse"] + function_name = tool_use["toolName"] + tool_call_id = tool_use["toolUseId"] + arguments = json.loads(tool_use["content"]) + + # 
print( + # f"[pk] tool use - function_name: {function_name}, tool_call_id: {tool_call_id}, arguments: {arguments}" + # ) + + # Call tool function + if self.has_function(function_name): + if function_name in self._functions.keys(): + await self.call_function( + context=self._context, + tool_call_id=tool_call_id, + function_name=function_name, + arguments=arguments, + ) + elif None in self._functions.keys(): + await self.call_function( + context=self._context, + tool_call_id=tool_call_id, + function_name=function_name, + arguments=arguments, + ) + else: + raise AWSNovaSonicUnhandledFunctionException( + f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function." + ) + async def _handle_content_end_event(self, event_json): # This should never happen if not self._content_being_received: @@ -671,6 +807,9 @@ class AWSNovaSonicLLMService(LLMService): user_params: LLMUserAggregatorParams = LLMUserAggregatorParams(), assistant_params: LLMAssistantAggregatorParams = LLMAssistantAggregatorParams(), ) -> AWSNovaSonicContextAggregatorPair: + context.set_llm_adapter(self.get_llm_adapter()) + user = AWSNovaSonicUserContextAggregator(context=context, params=user_params) assistant = AWSNovaSonicAssistantContextAggregator(context=context, params=assistant_params) + return AWSNovaSonicContextAggregatorPair(user, assistant) diff --git a/src/pipecat/services/aws_nova_sonic/context.py b/src/pipecat/services/aws_nova_sonic/context.py index 331ecc13e..820254cfb 100644 --- a/src/pipecat/services/aws_nova_sonic/context.py +++ b/src/pipecat/services/aws_nova_sonic/context.py @@ -1,12 +1,25 @@ +# +# Copyright (c) 2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + import copy from dataclasses import dataclass, field from enum import Enum from loguru import logger -from pipecat.frames.frames import DataFrame, Frame, LLMMessagesUpdateFrame, LLMSetToolsFrame +from pipecat.frames.frames import ( + DataFrame, + Frame, + 
FunctionCallResultFrame, + LLMMessagesUpdateFrame, + LLMSetToolsFrame, +) from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext from pipecat.processors.frame_processor import FrameDirection +from pipecat.services.aws_nova_sonic.frames import AWSNovaSonicFunctionCallResultFrame from pipecat.services.openai.llm import ( OpenAIAssistantContextAggregator, OpenAIUserContextAggregator, @@ -106,7 +119,15 @@ class AWSNovaSonicUserContextAggregator(OpenAIUserContextAggregator): class AWSNovaSonicAssistantContextAggregator(OpenAIAssistantContextAggregator): - pass + async def handle_function_call_result(self, frame: FunctionCallResultFrame): + await super().handle_function_call_result(frame) + + # The standard function callback code path pushes the FunctionCallResultFrame from the llm itself, + # so we didn't have a chance to add the result to the openai realtime api context. Let's push a + # special frame to do that. + await self.push_frame( + AWSNovaSonicFunctionCallResultFrame(result_frame=frame), FrameDirection.UPSTREAM + ) @dataclass diff --git a/src/pipecat/services/aws_nova_sonic/frames.py b/src/pipecat/services/aws_nova_sonic/frames.py new file mode 100644 index 000000000..94d410f22 --- /dev/null +++ b/src/pipecat/services/aws_nova_sonic/frames.py @@ -0,0 +1,14 @@ +# +# Copyright (c) 2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +from dataclasses import dataclass + +from pipecat.frames.frames import DataFrame, FunctionCallResultFrame + + +@dataclass +class AWSNovaSonicFunctionCallResultFrame(DataFrame): + result_frame: FunctionCallResultFrame From da5c4953d5c793d25d7ab1d523928d48550b27b2 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Wed, 30 Apr 2025 10:51:06 -0400 Subject: [PATCH 53/97] [WIP] AWS Nova Sonic service - allow passing in tools into initializer --- examples/foundational/39-aws-nova-sonic.py | 5 ++++- src/pipecat/services/aws_nova_sonic/aws.py | 16 ++++++++++++---- 2 files changed, 16 insertions(+), 5 
deletions(-) diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py index c9bef1fed..f08cfad04 100644 --- a/examples/foundational/39-aws-nova-sonic.py +++ b/examples/foundational/39-aws-nova-sonic.py @@ -97,7 +97,10 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection): access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), region=os.getenv("AWS_REGION"), voice_id="tiffany", # matthew, tiffany, amy - # instruction=system_instruction # you could pass instruction here rather than in context + # you could choose to pass instruction here rather than via context + # instruction=system_instruction + # you could choose to pass tools here rather than via context + # tools=tools ) # Register function for function calls diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index 8b6dab3ed..586f759c6 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -9,7 +9,7 @@ import json import uuid from dataclasses import dataclass from enum import Enum -from typing import Any, List +from typing import Any, List, Optional from aws_sdk_bedrock_runtime.client import ( BedrockRuntimeClient, @@ -28,6 +28,7 @@ from smithy_aws_core.credentials_resolvers.static import StaticCredentialsResolv from smithy_aws_core.identity import AWSCredentialsIdentity from smithy_core.aio.eventstream import DuplexEventStream +from pipecat.adapters.schemas.tools_schema import ToolsSchema from pipecat.adapters.services.aws_nova_sonic_adapter import AWSNovaSonicLLMAdapter from pipecat.frames.frames import ( BotStoppedSpeakingFrame, @@ -115,7 +116,8 @@ class AWSNovaSonicLLMService(LLMService): region: str, model: str = "amazon.nova-sonic-v1:0", voice_id: str = "matthew", # matthew, tiffany, amy - instruction: str = None, + instruction: Optional[str] = None, + tools: Optional[ToolsSchema] = None, **kwargs, ): super().__init__(**kwargs) @@ -126,6 +128,7 @@ class 
AWSNovaSonicLLMService(LLMService): self._client: BedrockRuntimeClient = None self._voice_id = voice_id self._instruction = instruction + self._tools = tools self._context: AWSNovaSonicLLMContext = None self._stream: DuplexEventStream[ InvokeModelWithBidirectionalStreamInput, @@ -269,11 +272,16 @@ class AWSNovaSonicLLMService(LLMService): history = self._context.get_messages_for_initializing_history() # Send prompt start event, specifying tools - tools = self._context.tools + # Tools from context take priority over tools from __init__() + tools = ( + self._context.tools + if self._context.tools + else self.get_llm_adapter().from_standard_tools(self._tools) + ) await self._send_prompt_start_event(tools) # Send system instruction - # Instruction from context takes priority + # Instruction from context takes priority over instruction from __init__() instruction = history.instruction if history.instruction else self._instruction if instruction: await self._send_text_event(text=instruction, role=Role.SYSTEM) From 394648f1c9a7a6e1cb6d7865fae31498696185a2 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Wed, 30 Apr 2025 11:04:47 -0400 Subject: [PATCH 54/97] [WIP] AWS Nova Sonic service - fix user utterances not making it into the context --- src/pipecat/services/aws_nova_sonic/aws.py | 9 ++++++--- src/pipecat/services/aws_nova_sonic/context.py | 7 +++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index 586f759c6..1b2937f83 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -199,9 +199,8 @@ class AWSNovaSonicLLMService(LLMService): await self.push_frame(frame, direction) async def _handle_context(self, context: OpenAILLMContext): - # TODO: if context has changed, reconnect - # TODO: remove - print(f"[pk] _handle_context: {context.get_messages_for_initializing_history()}") + # TODO: reset connection if needed 
(if entirely new context object provided, for instance) + print(f"[pk] receive updated context: {context.get_messages_for_initializing_history()}") if not self._context: # We got our initial context - try to finish connecting self._context = AWSNovaSonicLLMContext.upgrade_to_nova_sonic(context) @@ -800,6 +799,10 @@ class AWSNovaSonicLLMService(LLMService): async def _report_user_transcription_text_added(self, text): print(f"[pk] transcription: {text}") + # Manually add new user transcription text to context. + # We can't rely on the user context aggregator to do this since it's upstream from the LLM. + self._context.add_user_transcription_text_as_message(text) + # Report that some new user transcription text is available. await self.push_frame( TranscriptionFrame(text=text, user_id="", timestamp=time_now_iso8601()) ) diff --git a/src/pipecat/services/aws_nova_sonic/context.py b/src/pipecat/services/aws_nova_sonic/context.py index 820254cfb..92cd313cb 100644 --- a/src/pipecat/services/aws_nova_sonic/context.py +++ b/src/pipecat/services/aws_nova_sonic/context.py @@ -96,6 +96,13 @@ class AWSNovaSonicLLMContext(OpenAILLMContext): return AWSNovaSonicConversationHistoryMessage(role=Role[role.upper()], text=content) logger.error(f"Unhandled message type in from_standard_message: {message}") + def add_user_transcription_text_as_message(self, text): + message = { + "role": "user", + "content": [{"type": "text", "text": text}], + } + self.add_message(message) + @dataclass class AWSNovaSonicMessagesUpdateFrame(DataFrame): From 3960c604a4ab53a45269855cd976605c262c6261 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Wed, 30 Apr 2025 11:20:23 -0400 Subject: [PATCH 55/97] [WIP] AWS Nova Sonic service - fix empty assistant conversation history item in the context after tool use --- src/pipecat/services/aws_nova_sonic/context.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/pipecat/services/aws_nova_sonic/context.py 
b/src/pipecat/services/aws_nova_sonic/context.py index 92cd313cb..206c1fd2b 100644 --- a/src/pipecat/services/aws_nova_sonic/context.py +++ b/src/pipecat/services/aws_nova_sonic/context.py @@ -93,7 +93,13 @@ class AWSNovaSonicLLMContext(OpenAILLMContext): logger.error( f"Unhandled content type in context message: {c.get('type')} - {message}" ) - return AWSNovaSonicConversationHistoryMessage(role=Role[role.upper()], text=content) + # There won't be content if this is an assistant tool call entry. + # We're ignoring those since they can't be loaded into AWS Nova Sonic conversation + # history + if content: + return AWSNovaSonicConversationHistoryMessage(role=Role[role.upper()], text=content) + # We're ignoring messages with role "tool" since they can't be loaded into AWS Nova Sonic + # conversation history logger.error(f"Unhandled message type in from_standard_message: {message}") def add_user_transcription_text_as_message(self, text): From 5e0803479ea04fbb40f7f80d398165f6e2b50380 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Wed, 30 Apr 2025 14:53:22 -0400 Subject: [PATCH 56/97] [WIP] AWS Nova Sonic service - add send_transcription_frames option --- src/pipecat/services/aws_nova_sonic/aws.py | 10 +++++++--- src/pipecat/services/aws_nova_sonic/context.py | 4 ++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index 1b2937f83..3c6de7ad7 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -118,6 +118,7 @@ class AWSNovaSonicLLMService(LLMService): voice_id: str = "matthew", # matthew, tiffany, amy instruction: Optional[str] = None, tools: Optional[ToolsSchema] = None, + send_transcription_frames: bool = True, **kwargs, ): super().__init__(**kwargs) @@ -129,6 +130,7 @@ class AWSNovaSonicLLMService(LLMService): self._voice_id = voice_id self._instruction = instruction self._tools = tools + 
self._send_transcription_frames = send_transcription_frames self._context: AWSNovaSonicLLMContext = None self._stream: DuplexEventStream[ InvokeModelWithBidirectionalStreamInput, @@ -802,10 +804,12 @@ class AWSNovaSonicLLMService(LLMService): # Manually add new user transcription text to context. # We can't rely on the user context aggregator to do this since it's upstream from the LLM. self._context.add_user_transcription_text_as_message(text) + # Report that some new user transcription text is available. - await self.push_frame( - TranscriptionFrame(text=text, user_id="", timestamp=time_now_iso8601()) - ) + if self._send_transcription_frames: + await self.push_frame( + TranscriptionFrame(text=text, user_id="", timestamp=time_now_iso8601()) + ) # # Context diff --git a/src/pipecat/services/aws_nova_sonic/context.py b/src/pipecat/services/aws_nova_sonic/context.py index 206c1fd2b..e4662ee57 100644 --- a/src/pipecat/services/aws_nova_sonic/context.py +++ b/src/pipecat/services/aws_nova_sonic/context.py @@ -94,11 +94,11 @@ class AWSNovaSonicLLMContext(OpenAILLMContext): f"Unhandled content type in context message: {c.get('type')} - {message}" ) # There won't be content if this is an assistant tool call entry. 
- # We're ignoring those since they can't be loaded into AWS Nova Sonic conversation + # We're ignoring those since they can't be loaded into AWS Nova Sonic conversation # history if content: return AWSNovaSonicConversationHistoryMessage(role=Role[role.upper()], text=content) - # We're ignoring messages with role "tool" since they can't be loaded into AWS Nova Sonic + # We're ignoring messages with role "tool" since they can't be loaded into AWS Nova Sonic # conversation history logger.error(f"Unhandled message type in from_standard_message: {message}") From 2154db07f085e92aab4ef99b45a438dc316088cd Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Wed, 30 Apr 2025 15:10:10 -0400 Subject: [PATCH 57/97] [WIP] AWS Nova Sonic service - remove unnecessary error log --- src/pipecat/services/aws_nova_sonic/context.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/pipecat/services/aws_nova_sonic/context.py b/src/pipecat/services/aws_nova_sonic/context.py index e4662ee57..4b41b53b3 100644 --- a/src/pipecat/services/aws_nova_sonic/context.py +++ b/src/pipecat/services/aws_nova_sonic/context.py @@ -98,9 +98,8 @@ class AWSNovaSonicLLMContext(OpenAILLMContext): # history if content: return AWSNovaSonicConversationHistoryMessage(role=Role[role.upper()], text=content) - # We're ignoring messages with role "tool" since they can't be loaded into AWS Nova Sonic - # conversation history - logger.error(f"Unhandled message type in from_standard_message: {message}") + # NOTE: we're ignoring messages with role "tool" since they can't be loaded into AWS Nova + # Sonic conversation history def add_user_transcription_text_as_message(self, text): message = { From 6938152db67947e09da37b7d2e3d649fa0b5a939 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Wed, 30 Apr 2025 15:15:49 -0400 Subject: [PATCH 58/97] [WIP] AWS Nova Sonic service - fix comment --- src/pipecat/services/aws_nova_sonic/context.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff 
--git a/src/pipecat/services/aws_nova_sonic/context.py b/src/pipecat/services/aws_nova_sonic/context.py index 4b41b53b3..5d9bafec5 100644 --- a/src/pipecat/services/aws_nova_sonic/context.py +++ b/src/pipecat/services/aws_nova_sonic/context.py @@ -73,6 +73,7 @@ class AWSNovaSonicLLMContext(OpenAILLMContext): # Process remaining messages to fill out conversation history. # Nova Sonic supports "user" and "assistant" messages in history. + print(f"[pk] standard messages: {messages}") for message in messages: history_message = self.from_standard_message(message) if history_message: @@ -134,9 +135,9 @@ class AWSNovaSonicAssistantContextAggregator(OpenAIAssistantContextAggregator): async def handle_function_call_result(self, frame: FunctionCallResultFrame): await super().handle_function_call_result(frame) - # The standard function callback code path pushes the FunctionCallResultFrame from the llm itself, - # so we didn't have a chance to add the result to the openai realtime api context. Let's push a - # special frame to do that. + # The standard function callback code path pushes the FunctionCallResultFrame from the LLM + # itself, so we didn't have a chance to add the result to the AWS Nova Sonic server-side + # context. Let's push a special frame to do that. 
await self.push_frame( AWSNovaSonicFunctionCallResultFrame(result_frame=frame), FrameDirection.UPSTREAM ) From d6ef3d64ace855238e0ee60c5e5d92247b8a7448 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Wed, 30 Apr 2025 21:40:40 -0400 Subject: [PATCH 59/97] [WIP] AWS Nova Sonic service - fix context problems of double-counting LLM text, and mis-categorizing user text as LLM text --- .../services/aws_nova_sonic/context.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/pipecat/services/aws_nova_sonic/context.py b/src/pipecat/services/aws_nova_sonic/context.py index 5d9bafec5..4e2a4fcc1 100644 --- a/src/pipecat/services/aws_nova_sonic/context.py +++ b/src/pipecat/services/aws_nova_sonic/context.py @@ -16,6 +16,8 @@ from pipecat.frames.frames import ( FunctionCallResultFrame, LLMMessagesUpdateFrame, LLMSetToolsFrame, + LLMTextFrame, + TranscriptionFrame, ) from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext from pipecat.processors.frame_processor import FrameDirection @@ -132,6 +134,23 @@ class AWSNovaSonicUserContextAggregator(OpenAIUserContextAggregator): class AWSNovaSonicAssistantContextAggregator(OpenAIAssistantContextAggregator): + # AWS Nova Sonic is a speech-to-speech model. + # It behaves like a combined STT + LLM + TTS service, emitting all of: + # - TranscriptionFrame (for user text) + # - LLMTextFrame (for assistant text) + # - TTSTextFrame (for assistant text) + # In a "standard" pipeline (with separate STT + LLM + TTS services): + # - The TranscriptionFrame is swallowed by the LLMUserContextAggregator + # - The LLMTextFrame is swallowed by the TTS service + # Meaning the LLMAssistantContextAggregator only receives the TTSTextFrames. It actually + # implicitly assumes it will receive only *non-duplicate* *assistant-related* text frames, and + # will misbehave otherwise (double-counting assistant text, or mis-categorizing user text as + # assistant text). 
+ # So, let's override process_frame here to ignore TranscriptionFrames and LLMTextFrames. + async def process_frame(self, frame: Frame, direction: FrameDirection): + if not isinstance(frame, (LLMTextFrame, TranscriptionFrame)): + await super().process_frame(frame, direction) + async def handle_function_call_result(self, frame: FunctionCallResultFrame): await super().handle_function_call_result(frame) From c47703995406be98ff717dc9bb4f9943b1bc1faa Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Wed, 30 Apr 2025 22:29:36 -0400 Subject: [PATCH 60/97] [WIP] AWS Nova Sonic service - just for safety, add a short delay after BotStoppedSpeaking before sending LLMFullResponseEndFrame + TTSStoppedFrame, to give a bit of leeway for the LLM to deliver the "FINAL" text block describing what was said --- src/pipecat/services/aws_nova_sonic/aws.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index 3c6de7ad7..1ef171750 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -4,6 +4,7 @@ # SPDX-License-Identifier: BSD 2-Clause License # +import asyncio import base64 import json import uuid @@ -211,7 +212,8 @@ class AWSNovaSonicLLMService(LLMService): async def _handle_bot_stopped_speaking(self): if self._assistant_is_responding: - # Consider the assistant finished with their response. + # Consider the assistant finished with their response (after a short delay, to allow for + # any FINAL text block to come in). 
# # TODO: ideally we could base this solely on the LLM output events, but I couldn't # figure out a reliable way to determine when we've gotten our last FINAL text block @@ -224,6 +226,7 @@ class AWSNovaSonicLLMService(LLMService): # FINAL text blocks to know how many or which FINAL blocks to expect, but user # interruptions throw a wrench in these schemes: depending on the exact timing of the # interruption, we should or shouldn't expect some FINAL blocks. + await asyncio.sleep(0.25) self._assistant_is_responding = False await self._report_assistant_response_ended() From 38c9fa681a3a0678e58805bcca5ff833f8a1a9e3 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Thu, 1 May 2025 17:50:29 -0400 Subject: [PATCH 61/97] [WIP] AWS Nova Sonic service - Protect against back-to-back BotStoppedSpeaking calls, which I've observed --- src/pipecat/services/aws_nova_sonic/aws.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index 1ef171750..e8da485a8 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -145,6 +145,8 @@ class AWSNovaSonicLLMService(LLMService): self._assistant_is_responding = False self._context_available = False self._ready_to_send_context = False + self._handling_bot_stopped_speaking = False + # # standard AIService frame handling @@ -211,6 +213,11 @@ class AWSNovaSonicLLMService(LLMService): await self._finish_connecting_if_context_available() async def _handle_bot_stopped_speaking(self): + # Protect against back-to-back BotStoppedSpeaking calls, which I've observed + if self._handling_bot_stopped_speaking: + return + self._handling_bot_stopped_speaking = True + if self._assistant_is_responding: # Consider the assistant finished with their response (after a short delay, to allow for # any FINAL text block to come in). 
@@ -229,6 +236,9 @@ class AWSNovaSonicLLMService(LLMService): await asyncio.sleep(0.25) self._assistant_is_responding = False await self._report_assistant_response_ended() + self._handling_bot_stopped_speaking = False + + self._handling_bot_stopped_speaking = False async def _handle_function_call_result(self, frame: AWSNovaSonicFunctionCallResultFrame): result = frame.result_frame From 4ffdc3b77ceed4168f747384ac71ae1acd0ae941 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Thu, 1 May 2025 21:54:36 -0400 Subject: [PATCH 62/97] [WIP] AWS Nova Sonic service - do hacky direct manipulation of the context for now, since I can't seem to get assistant context aggregation working properly with frames, grr --- src/pipecat/services/aws_nova_sonic/aws.py | 19 +++++-- .../services/aws_nova_sonic/context.py | 54 +++++++++++++------ 2 files changed, 54 insertions(+), 19 deletions(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index e8da485a8..35f312a0e 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -147,7 +147,6 @@ class AWSNovaSonicLLMService(LLMService): self._ready_to_send_context = False self._handling_bot_stopped_speaking = False - # # standard AIService frame handling # @@ -760,8 +759,10 @@ class AWSNovaSonicLLMService(LLMService): content_end = event_json["contentEnd"] stop_reason = content_end["stopReason"] # print(f"[pk] content end: {content}.\n stop_reason: {stop_reason}") - # if content.role == Role.ASSISTANT: - # print(f"[pk] assistant content end: {content}.\n stop_reason: {stop_reason}") + if content.role == Role.ASSISTANT: + # print(f"[pk] assistant content end: {content}.\n stop_reason: {stop_reason}") + if content.text_stage == TextStage.FINAL: + print(f"[pk] assistant FINAL text: {content.text_content}") # Bookkeeping: clear current content being received self._content_being_received = None @@ -803,6 +804,18 @@ class 
AWSNovaSonicLLMService(LLMService): print(f"[pk] TTS text: {text}") await self.push_frame(TTSTextFrame(text)) + # TODO: this is a (hopefully temporary) HACK. Here we directly manipulate the context rather + # than relying on the frames pushed to the assistant context aggregator. The pattern of + # receiving full-sentence text after the assistant has spoken does not easily fit with the + # Pipecat expectation of chunks of text streaming in while the assistant is speaking. + # Interruption handling was especially challenging. Rather than spend days trying to fit a + # square peg in a round hole, I decided on this hack for the time being. We can most cleanly + # abandon this hack if/when AWS Nova Sonic implements streaming smaller text chunks + # interspersed with audio. Note that when we move away from this hack, we need to make sure + # that on an interruption we avoid sending LLMFullResponseEndFrame, which gets the + # LLMAssistantContextAggregator into a bad state. + self._context.add_assistant_text_as_message(text) + async def _report_assistant_response_ended(self): # Report that the assistant has finished their response. 
print("[pk] LLM full response ended") diff --git a/src/pipecat/services/aws_nova_sonic/context.py b/src/pipecat/services/aws_nova_sonic/context.py index 4e2a4fcc1..647e40ae6 100644 --- a/src/pipecat/services/aws_nova_sonic/context.py +++ b/src/pipecat/services/aws_nova_sonic/context.py @@ -11,13 +11,19 @@ from enum import Enum from loguru import logger from pipecat.frames.frames import ( + BotStoppedSpeakingFrame, DataFrame, Frame, FunctionCallResultFrame, + LLMFullResponseEndFrame, + LLMFullResponseStartFrame, + LLMMessagesAppendFrame, LLMMessagesUpdateFrame, + LLMSetToolChoiceFrame, LLMSetToolsFrame, - LLMTextFrame, - TranscriptionFrame, + StartInterruptionFrame, + TextFrame, + UserImageRawFrame, ) from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext from pipecat.processors.frame_processor import FrameDirection @@ -110,6 +116,15 @@ class AWSNovaSonicLLMContext(OpenAILLMContext): "content": [{"type": "text", "text": text}], } self.add_message(message) + # print(f"[pk] context updated (user): {self.get_messages_for_logging()}") + + def add_assistant_text_as_message(self, text): + message = { + "role": "assistant", + "content": [{"type": "text", "text": text}], + } + self.add_message(message) + # print(f"[pk] context updated (assistant): {self.get_messages_for_logging()}") @dataclass @@ -134,21 +149,28 @@ class AWSNovaSonicUserContextAggregator(OpenAIUserContextAggregator): class AWSNovaSonicAssistantContextAggregator(OpenAIAssistantContextAggregator): - # AWS Nova Sonic is a speech-to-speech model. 
- # It behaves like a combined STT + LLM + TTS service, emitting all of: - # - TranscriptionFrame (for user text) - # - LLMTextFrame (for assistant text) - # - TTSTextFrame (for assistant text) - # In a "standard" pipeline (with separate STT + LLM + TTS services): - # - The TranscriptionFrame is swallowed by the LLMUserContextAggregator - # - The LLMTextFrame is swallowed by the TTS service - # Meaning the LLMAssistantContextAggregator only receives the TTSTextFrames. It actually - # implicitly assumes it will receive only *non-duplicate* *assistant-related* text frames, and - # will misbehave otherwise (double-counting assistant text, or mis-categorizing user text as - # assistant text). - # So, let's override process_frame here to ignore TranscriptionFrames and LLMTextFrames. async def process_frame(self, frame: Frame, direction: FrameDirection): - if not isinstance(frame, (LLMTextFrame, TranscriptionFrame)): + # HACK: For now, disable the context aggregator by making it just pass through all frames + # that the parent handles (except the function call stuff, which we still need). + # For an explanation of this hack, see + # AWSNovaSonicLLMService._report_assistant_response_text_added. 
+ if isinstance( + frame, + ( + StartInterruptionFrame, + LLMFullResponseStartFrame, + LLMFullResponseEndFrame, + TextFrame, + LLMMessagesAppendFrame, + LLMMessagesUpdateFrame, + LLMSetToolsFrame, + LLMSetToolChoiceFrame, + UserImageRawFrame, + BotStoppedSpeakingFrame, + ), + ): + await self.push_frame(frame, direction) + else: await super().process_frame(frame, direction) async def handle_function_call_result(self, frame: FunctionCallResultFrame): From 3784bdbd27ef06e857537b9e08f74a74f865ffeb Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Fri, 2 May 2025 10:42:52 -0400 Subject: [PATCH 63/97] [WIP] AWS Nova Sonic service - in our hacky direct manipulation of the context, aggregate assistant text rather than recording every chunk as a separate message --- src/pipecat/services/aws_nova_sonic/aws.py | 13 ++++++------ .../services/aws_nova_sonic/context.py | 20 +++++++++++++++---- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index 35f312a0e..e7b1fd8e6 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -759,10 +759,8 @@ class AWSNovaSonicLLMService(LLMService): content_end = event_json["contentEnd"] stop_reason = content_end["stopReason"] # print(f"[pk] content end: {content}.\n stop_reason: {stop_reason}") - if content.role == Role.ASSISTANT: - # print(f"[pk] assistant content end: {content}.\n stop_reason: {stop_reason}") - if content.text_stage == TextStage.FINAL: - print(f"[pk] assistant FINAL text: {content.text_content}") + # if content.role == Role.ASSISTANT: + # print(f"[pk] assistant content end: {content}.\n stop_reason: {stop_reason}") # Bookkeeping: clear current content being received self._content_being_received = None @@ -814,7 +812,7 @@ class AWSNovaSonicLLMService(LLMService): # interspersed with audio. 
Note that when we move away from this hack, we need to make sure # that on an interruption we avoid sending LLMFullResponseEndFrame, which gets the # LLMAssistantContextAggregator into a bad state. - self._context.add_assistant_text_as_message(text) + self._context.buffer_assistant_text(text) async def _report_assistant_response_ended(self): # Report that the assistant has finished their response. @@ -825,11 +823,14 @@ class AWSNovaSonicLLMService(LLMService): print("[pk] TTS stopped") await self.push_frame(TTSStoppedFrame()) + # For an explanation of this hack, see _report_assistant_response_text_added. + self._context.flush_aggregated_assistant_text() + async def _report_user_transcription_text_added(self, text): print(f"[pk] transcription: {text}") # Manually add new user transcription text to context. # We can't rely on the user context aggregator to do this since it's upstream from the LLM. - self._context.add_user_transcription_text_as_message(text) + self._context.add_user_transcription_text(text) # Report that some new user transcription text is available. 
if self._send_transcription_frames: diff --git a/src/pipecat/services/aws_nova_sonic/context.py b/src/pipecat/services/aws_nova_sonic/context.py index 647e40ae6..3fac65a72 100644 --- a/src/pipecat/services/aws_nova_sonic/context.py +++ b/src/pipecat/services/aws_nova_sonic/context.py @@ -53,12 +53,19 @@ class AWSNovaSonicConversationHistory: messages: list[AWSNovaSonicConversationHistoryMessage] = field(default_factory=list) -@dataclass class AWSNovaSonicLLMContext(OpenAILLMContext): + def __init__(self, messages=None, tools=None, **kwargs): + super().__init__(messages=messages, tools=tools, **kwargs) + self.__setup_local() + + def __setup_local(self): + self._assistant_text = "" + @staticmethod def upgrade_to_nova_sonic(obj: OpenAILLMContext) -> "AWSNovaSonicLLMContext": if isinstance(obj, OpenAILLMContext) and not isinstance(obj, AWSNovaSonicLLMContext): obj.__class__ = AWSNovaSonicLLMContext + obj.__setup_local() return obj def get_messages_for_initializing_history(self) -> AWSNovaSonicConversationHistory: @@ -110,7 +117,7 @@ class AWSNovaSonicLLMContext(OpenAILLMContext): # NOTE: we're ignoring messages with role "tool" since they can't be loaded into AWS Nova # Sonic conversation history - def add_user_transcription_text_as_message(self, text): + def add_user_transcription_text(self, text): message = { "role": "user", "content": [{"type": "text", "text": text}], @@ -118,11 +125,16 @@ class AWSNovaSonicLLMContext(OpenAILLMContext): self.add_message(message) # print(f"[pk] context updated (user): {self.get_messages_for_logging()}") - def add_assistant_text_as_message(self, text): + def buffer_assistant_text(self, text): + self._assistant_text += text # TODO: determine if we need to add space or something + # print(f"[pk] assistant text buffered: {self._assistant_text}") + + def flush_aggregated_assistant_text(self): message = { "role": "assistant", - "content": [{"type": "text", "text": text}], + "content": [{"type": "text", "text": self._assistant_text}], } + 
self._assistant_text = "" self.add_message(message) # print(f"[pk] context updated (assistant): {self.get_messages_for_logging()}") From cc1f4ba81c24ff0928eb336507ee20b800d5946f Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Fri, 2 May 2025 11:31:56 -0400 Subject: [PATCH 64/97] [WIP] AWS Nova Sonic service - add a hacky way of programmatically triggering an assistant response --- examples/foundational/39-aws-nova-sonic.py | 18 +++- pyproject.toml | 1 + src/pipecat/services/aws_nova_sonic/aws.py | 92 ++++++++++++++++-- src/pipecat/services/aws_nova_sonic/ready.wav | Bin 0 -> 23484 bytes 4 files changed, 100 insertions(+), 11 deletions(-) create mode 100644 src/pipecat/services/aws_nova_sonic/ready.wav diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py index f08cfad04..07670f75a 100644 --- a/examples/foundational/39-aws-nova-sonic.py +++ b/examples/foundational/39-aws-nova-sonic.py @@ -83,12 +83,16 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection): ), ) - # Specify initial system instruction + # Specify initial system instruction. + # HACK: note that, for now, we need to inject a special bit of text into this instruction to + # allow the first assistant response to be programmatically triggered (which happens in the + # on_client_connected handler, below) # TODO: looks like Nova Sonic can't handle new lines? system_instruction = ( - "You are a friendly assistant. The user and you will engage in a spoken dialog " - "exchanging the transcripts of a natural real-time conversation. Keep your responses short, " - "generally two or three sentences for chatty scenarios." + "You are a friendly assistant. The user and you will engage in a spoken dialog exchanging " + "the transcripts of a natural real-time conversation. Keep your responses short, generally " + "two or three sentences for chatty scenarios. 
" + f"{AWSNovaSonicLLMService.AWAIT_TRIGGER_ASSISTANT_RESPONSE_INSTRUCTION}" ) # Create the AWS Nova Sonic LLM service @@ -117,7 +121,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection): {"role": "system", "content": f"{system_instruction}"}, { "role": "user", - "content": "Say hello!", + "content": "Tell me a fun fact!", }, ], tools=tools, @@ -151,6 +155,10 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection): logger.info(f"Client connected") # Kick off the conversation. await task.queue_frames([context_aggregator.user().get_context_frame()]) + # HACK: for now, we need this special way of triggering the first assistant response in AWS + # Nova Sonic. Note that this trigger requires a special corresponding bit of text in the + # system instruction. In the future, simply queueing the context frame should be sufficient. + await llm.trigger_assistant_response() # Handle client disconnection events @transport.event_handler("on_client_disconnected") diff --git a/pyproject.toml b/pyproject.toml index d6d05c00c..7ce167d77 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -96,6 +96,7 @@ where = ["src"] [tool.setuptools.package-data] "pipecat" = ["py.typed"] +"pipecat.services.aws_nova_sonic" = ["src/pipecat/services/aws_nova_sonic/ready.wav"] [tool.pytest.ini_options] addopts = "--verbose" diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index e7b1fd8e6..5b69810f3 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -8,8 +8,10 @@ import asyncio import base64 import json import uuid +import wave from dataclasses import dataclass from enum import Enum +from importlib.resources import files from typing import Any, List, Optional from aws_sdk_bedrock_runtime.client import ( @@ -146,6 +148,8 @@ class AWSNovaSonicLLMService(LLMService): self._context_available = False self._ready_to_send_context = False self._handling_bot_stopped_speaking = False + 
self._triggering_assistant_response = False + self._assistant_response_trigger_audio: bytes = None # Not cleared on _disconnect() # # standard AIService frame handling @@ -180,8 +184,7 @@ class AWSNovaSonicLLMService(LLMService): if isinstance(frame, OpenAILLMContextFrame): await self._handle_context(frame.context) elif isinstance(frame, InputAudioRawFrame): - # TODO: check if _audio_input_paused? what causes that? - await self._send_user_audio_event(frame) + await self._handle_input_audio_frame(frame) elif isinstance(frame, BotStoppedSpeakingFrame): await self._handle_bot_stopped_speaking() elif isinstance(frame, AWSNovaSonicFunctionCallResultFrame): @@ -211,6 +214,15 @@ class AWSNovaSonicLLMService(LLMService): self._context_available = True await self._finish_connecting_if_context_available() + async def _handle_input_audio_frame(self, frame: InputAudioRawFrame): + # Wait until we're done sending the assistant response trigger audio before sending audio + # from the user's mic + if self._triggering_assistant_response: + return + + # TODO: check if _audio_input_paused? what causes that? + await self._send_user_audio_event(frame.audio) + async def _handle_bot_stopped_speaking(self): # Protect against back-to-back BotStoppedSpeaking calls, which I've observed if self._handling_bot_stopped_speaking: @@ -316,6 +328,14 @@ class AWSNovaSonicLLMService(LLMService): # Start receiving events self._receive_task = self.create_task(self._receive_task_handler()) + # If we need to, send assistant response trigger + if self._triggering_assistant_response: + # If the trigger was the first audio chunk sent on this connection it'd be ignored (I'm + # guessing the LLM can't quite "hear" the first little bit of audio sent). So send a bit + # of leading blank audio first. 
+ await self._send_assistant_response_trigger(lead_with_blank_audio=True) + self._triggering_assistant_response = False + async def _disconnect(self): try: # Clean up receive task @@ -340,6 +360,8 @@ class AWSNovaSonicLLMService(LLMService): self._assistant_is_responding = False self._context_available = False self._ready_to_send_context = False + self._handling_bot_stopped_speaking = False + self._triggering_assistant_response = False except Exception as e: logger.error(f"{self} error disconnecting: {e}") @@ -490,11 +512,11 @@ class AWSNovaSonicLLMService(LLMService): ''' await self._send_client_event(text_content_end) - async def _send_user_audio_event(self, frame: InputAudioRawFrame): + async def _send_user_audio_event(self, audio: bytes): if not self._stream: return - blob = base64.b64encode(frame.audio) + blob = base64.b64encode(audio) audio_event = f''' {{ "event": {{ @@ -639,7 +661,7 @@ class AWSNovaSonicLLMService(LLMService): elif "contentEnd" in event_json: # Handle a piece of content ending await self._handle_content_end_event(event_json) - elif "completionStart" in event_json: + elif "completionEnd" in event_json: # Handle the LLM completion ending await self._handle_completion_end_event(event_json) @@ -839,7 +861,7 @@ class AWSNovaSonicLLMService(LLMService): ) # - # Context + # context # def create_context_aggregator( @@ -855,3 +877,61 @@ class AWSNovaSonicLLMService(LLMService): assistant = AWSNovaSonicAssistantContextAggregator(context=context, params=assistant_params) return AWSNovaSonicContextAggregatorPair(user, assistant) + + # + # assistant response trigger (HACK) + # + + # Class variable + AWAIT_TRIGGER_ASSISTANT_RESPONSE_INSTRUCTION = ( + "Start speaking when you hear the user say 'ready', but don't consider that 'ready' to be " + "a meaningful part of the conversation other than as a trigger for you to start speaking." 
+ ) + + async def trigger_assistant_response(self): + if self._triggering_assistant_response: + return False + + self._triggering_assistant_response = True + + # Read audio bytes, if we don't already have them cached + if not self._assistant_response_trigger_audio: + file_path = files("pipecat.services.aws_nova_sonic").joinpath("ready.wav") + with wave.open(file_path.open("rb"), "rb") as wav_file: + self._assistant_response_trigger_audio = wav_file.readframes(wav_file.getnframes()) + + # Send the trigger audio, if we're fully connected and set up + # NOTE: maybe there's a better way to determine whether we're done setting up? + if self._receive_task: + await self._send_assistant_response_trigger() + self._triggering_assistant_response = False + + async def _send_assistant_response_trigger(self, lead_with_blank_audio=False): + # TODO: if/when we make bitrate, etc configurable, avoid hard-coding this + chunk_size = 640 # equivalent to what we get from InputAudioRawFrame + chunk_duration = 640 / ( + 16000 * 2 + ) # 640 bytes of 16-bit (2-byte) PCM mono audio at 16kHz corresponds to 0.02 seconds + + # Lead with blank audio, if needed + if lead_with_blank_audio: + blank_audio_duration = 0.5 # much less than this and it doesn't reliably work + blank_audio_chunk = b"\x00" * chunk_size + num_chunks = int(blank_audio_duration / chunk_duration) + for _ in range(num_chunks): + await self._send_user_audio_event(blank_audio_chunk) + await asyncio.sleep(chunk_duration) + + # Send trigger audio + # NOTE: this audio *will* be transcribed and eventually make it into the context. That's OK: + # if we ever need to seed this service again with context it would make sense to include it + # since the instruction (i.e. the "wait for the trigger" instruction) will be part of the + # context as well. + # print(f"[pk] sending trigger audio! 
{len(self._assistant_response_trigger_audio)}") + audio_chunks = [ + self._assistant_response_trigger_audio[i : i + chunk_size] + for i in range(0, len(self._assistant_response_trigger_audio), chunk_size) + ] + for chunk in audio_chunks: + await self._send_user_audio_event(chunk) + await asyncio.sleep(chunk_duration) diff --git a/src/pipecat/services/aws_nova_sonic/ready.wav b/src/pipecat/services/aws_nova_sonic/ready.wav new file mode 100644 index 0000000000000000000000000000000000000000..ca932afa66d69dcf3626a54f9b171c767eb4d0e9 GIT binary patch literal 23484 zcmeHvXLuCHvgqX5q}^4PKoTMfM9!FGvdIQa&KQgV1IB<2CK?lr$=N2EWWWY&a?TkH z#$a+3LP7||jkB|p`qgOe++7Lzob&GY-oLy3&2)98uCDH$&{MtL+O=tOs2_&)YTmQ; z@G%n;?HGpPkiKaOz;p~FFgw<<-ACO^K(SrtHXpY8s7-e(z#WHtHN0MpYBlRst68H4 zHf+emA!9+y|Nj2h1OMxRf8_xZBVkUdKO7DOKNVKstd`Z8M?g|@9a76hld472q-xXQ z0GEpWf0HyTbX{7Vdg(vO$^UA{f5!WtaQ)x#6!k-(9Dr2+={f_DjwzZK&ruXx*lWQ$ zG)htaV2=Me{O_~|TNOOQ0$-ZBqK!~;P6xa zD`}Vl;UXthckoZED=;gxa`9)q_M%kPZnZ6%jls~n_+U%`4#;S@;Bufgt$zKiwjwZ> z16(?!c~iZ!2JF*jiOQj3H4a^ehSRmGx)D+j^~#}3X!5E~m0Pt>21zZfsQE;dl)N7FpeJ3M75)hAkkh}$l8_pi&LR`V&Ex>X}1RkNFzt!uR zYLQwm4cb}Ef|=m3LJGrdSU98zNRe0!oTISl0w@w7?T7|0l^zPu9ATIlB#e+$Js8-o z+J+E;NnoEJYDG}5YM~Ec57;6hCty6t4JjApp&XQf^3iLQjozY6z^G6TBnR;1LsHMG zw4}Nc8Nq4;*cwt$uNf?|VXpq(7aV0FPsGY&30P&UKH#fkwXw3mVJq-0 ziK+hbKu!11pXd&{hAyM4=sY@&j-rF;Bszi)qf-DMMF-FcIG+OCDd4z@9-^D*Z}bS= zLJ!b$uv7Id34Sob5e9bDg_hUHlCdPHuM*Y@>x9+9x?^py-dH270oDO~5330F)PlSW z9E}UCwt-bLN&_!sLk)kROK|K%zoUy#`$4ef09bbz9Y=ow^&+~BUV=Z~f_HFeH3mHx z3;j|8D~Z*>s=?6+YYp`@#6HAEV7;)x*d%NQ_6;@?n}DsvzQ8`krePDY(Ln74R+TR3 zJsfkQ`{*&$@f{k6TA>;!1UVJAa#Ojc99Di+eo_`I8k;jjzY&;zRNG@S1oC zj<7e_73?_n6ZSdys2Q}wh`GSO$I*H;4fRIvp$e!3G9V+eB0HiGuc&splxS26)r6Y+ zqw#1I8jeN)_m}8Xv=Qw84SaVQ`X&u}mxlgN!un%#ut!)f_96Zhzkw?_Lmb5yyZU-L^`QUR16(KH>2Js58~JH@%U7%EA|U=D<|ZAk}P$R zdP-J#f;>bXDE%P*A*}NE?m==2SCoxny2!u3>&=3O9KTpBjE*>oNw~LWPiT z@jcK_Q_wEumYgVWlun9&3)O_N{w=)6x5wAlcZd6q>%%2-r@YO*-MvG+d%PLmhTJv| zaqE3``K5eyzr%l7*eBkSDk_ywB-R>VKzvJ%rbf{%nYTf3L)We^RtG0k8w 
z@&?Y3U`#S58_OBr7=AajHE`?}mS&eSx%4dhF2zub$O_~kVk%LE2qOq$E?yo_h0*>L zB_Rn$%@*Ys<(|SR!%!tO2`xZXQ8HSin3NynvhuG|ed!Bnm1L2-$=UJBkVSe_UdSkk}Y?Y*GO5SNwf+>{6F!Fe1CGH*WmrcbJLyeYU7G` zSzM{kRA)CA>)!5e=_%n&;v#&Pe82Kn{fC7u;wWjZJVjZ8oES@VC+AYz=;cg(_7vOI zFwd~taN3Y){J_}M*vMGQxXvK5No*gcEj^1`PfjOB;RCVn&}wC!+)*aw)l$49ix0)m z#B)L^!QuD#%Lz#ML~5j9*kr;-b!Sf+W%I6(XxmoXQQIZkWZPToB5SPmZpe<1i6Lb} zT87jKIb-P<^4bz^S!rHr9cIX38W|>%vvGnHrl?{e2A(;y73%tgY%xV zdR~d_so9O*I4kF5LHbmB7VfG%O5k!kdAY-9F1{S)Dm>-w`i3mk`Y zDrEUH+rOEW@#&j}S?RemouOQuSYFw$tW#zxZRI+0j$9wRNVPO(hl~tc6nJe>%D93;h~qukqf*vCaXz*_9{Z$2s^^U%t9$nL+%KK({HnPo=S}wnkBj?T znof>3^tD!qyjikg!u9xu@hP!#_!HYm>pCkIdOLDjiR8p7WkM4@C3i+9g^VB{_`7@e zxx2c?dEXavu{i1_HJoaJ@AhXnKhC?BGdB0Bvzfmo(bM>|H6si~{uZ$&yj^(L(0$f3 zAzy-r?po%Cth7H09UeN-cGU7Ey9ED2E+y5J>dEslAJKy>OIq;r=u6onrYetdnK{a~ zH}p1^WympR?PB^G*KK8)>3-Ok5hE-Wgqzv3 zQz3>ZTLNMjDLOZ-t_({gHyYv9nOwaZW`BwQc+}yp7GnGV&j^E*@0g!0RYJxahZ5C=Gx=*W+q}un zZszGE*_mUORd$b^vgMfnGHihnG|1NvYDC9f4!}iKBz&IT%t(+Dr zde*q5oV>hhj3k$CT2FaM@Y z!C30|_?HRm;-|(fiTl0;AC(+aHZ~?Nhd5xDw(^bj-=I&?9W=2*E_e7W3i)a4*q8SvpTPCxet&fqt_)IuCOGLu(apv=Xf&r z=C=}mHIE8AW9z{#^nR9h{eJ%4At}$^c=PY^>*NM9%TIS-bRXeH3B6Gq^~Urh{MWd& zvL!24s5H7@FPGUKY2`XlzG1(PYq?aXCa?OpZdxrVXk1ICl|$7r3f zLOM;p&#IRW3R*rlf_@q9=fWjWk|C1vycadQOsp3l?OD5Un_)X=M(xsz1D<5V5{+i49)Ok?e zKnwTXTL@#Rw(VoX$O`RRNjS6MxWtnV#X`IV!V-K36LfuhaW6wojj`D^t z3ab&mBz#$z(OQO{B-6fl_c_--Z!dm;@4Ba|<89W6x4&lp=I$$>B^FcTD3*FdCXvz9 z8d{-CutstY{+)TWq&j8V)tPuzIZ_R(HHHx<*1QA$T@F0WrRHqp6NtgsRjy1{*OZPAqV7+6 zQ6c-XTj8FFZ}8sK&qNpcB{hY7P6+g$*2yv6#I&Sif+C)`y$V_Zudee#~?#yU7pC-HAAgZiF+2CZLBH)B^Bce16JnIulWC94sy<#y70 zKGL(>u_nKjC*M6m*d|=ycX9LGz1(fYc%d$lhRe!Sl*@1ToaHYGccpYsYR)HOW=QF1 zW1JFo(At-{o%bg7^R$uKn|1}Z@dr1zQ{!984sDK8jsLT z$u)GAxqd{gxS6E~ByNfA7_t(b;O6+k(Lm$Z)=}0BV*~7lw}tb9=dkpKiLh-CiwbLM z|1IR8mAAeObwsX-{vbLgvQ^k!>l9NHs4Ta%NR45LG)y$t+s*Gzpj_crY5AE&Y0|67lV zy`98NQZb#fOkpc1H$4gYEghHn71ZL8D`5x2cZ3eGPql^GlfpYjej4kEeH+yv!eeh| zsmWecX7J-Z=RNh|IXBhE`tG=!I$q`U%w6X>Bh4T;GxKN%l}e4LBB{AlfA*~TJ4++; 
zV#Y!XvQ;3x6P$mz5Awr3rMU9mGfv7og=^s6E%X*z2%Dt}zN4;ip}cgAtWV5xAIk5I zS)$67x>qthysJ4}Zjih6)!@u0jwRv>d86lS#_uTuUPvjvl-93r^_;#xiW>p_&u^VyASTw z)lBy-eQkn$Md&Z~-eGUU?NMKp;G*|hXS1b<=4iCi4V57|yr&$_J;@KxYndPKx#=Gx zb(Xs;wNX7Z7_Uq9X6(js#;Ndz(Vn?NO(H%)rKRrjK(PXU(fhHpqUS^JS^g1c_qOn4 zyPCT&k3YWzpDu?IF6BeVn!HGJbjc4YnM(Bw?LdV4-)4QA6Xn>zpAzng&+|8=m3!7A zrDs}|j3uxCcr!59>HggJL|lb0rVcPe*<|*X`H#qH38xdYOH2*xWB%OGgKkUv3}-Ce zEIZg6@?q~qxK>z24&LN4um{8sbS3sj!(CHy$ZPA((7O>K(RggN5@TYIhR24iViwa3 zU6;*d;~5IS$U7ZfazpZJIgfZG-ypuRza#JDWucDpA?iU?rdG4_4Q|r{<1O|IHW4k8 zNn*cp32Q0;DLRC<{&B83E-Y`5V~gj2Yq6)XceuZ_P}RR9=W13@u|yUnoDLEYuyZhAl#;@pz_P zNQdyfF<~Wl#Qqgl#q!uN+jz^;&3-+Mv&ORz<=fmM_ef7g?*VQ%_qBJv`!Cl_*Gzb~ z$@JFX)5K(SAKsQTl)FkC@?wL@z03!uP39iPSn?0~zON?l_v2C{;jNIydxY)Y)A@ta zXJqvB_)T1-G5V~*D*E%*noX@lFX3xkun>Qzat!pyJ3FGB? z;!AH;=T6s2p#>dh>l0o!{IE67c$%ulKC�+!;GJwq@jJ)|Up4p|`1q`D>$}E=P5s zPSQ8v&3P)(2qjBTq%tUhUTvIgK41zpyrrIzLFQ@e1;1tG)ABc@b zK?oBs!4+wo;1Z1DX3-%ollDjkDPLG2%oT4+D`m4nD>IZSN*9>lcDtqOsa%*X&_%$R(6hXID93lQH-W1=6?WLabJS9t6sU#_* zl@HJhOvbkm2Z^1;4&pgMkw?j)R2*$#4l$M4-7ICOWvF7H4ZpFtfiPN3YfKYNXN(&S zEe)Lv&kb9QeT^Fo?bx|YI1|m>r%zI?Niz|GFNM{`+NgqZPx?*F5f%zvg!hEQ{wDtG zd<}jfzl}f7ALgg?FMI>|r~F0#Sz)61NVwzw#J|ShTuhUeD&x@$bVli_dsBOk4B@3iXySnZjlq$NFTzxJ_f4>^`x`@%lvu&ui}2Gx|AuD7bl3XrD5_jd87QJ zG6z;E&ZD{LCaeVgLViwpsDAWQritOM@sOzrtOsl}PB+ao@ur=o{iXHm^W(Ak=(V^*s4a{W z`=C4YU~`&nT12@LPvfVRbcFA-&Is!h+asY-VnVzVW1qU?cwtt8y=n!PnVzj{JFVLlrd=l|HV-qVin=ld~pM_9Mv|aTzIMQCQ(CT$A(u6dk|HoOhmaxrGJgyXggwT1S?s`ZD%8W zq4k*a!e@LFaezEeCeQV0bR!16HAFjl-1Cm z8AzNUt{Y!iaocOtDZ^FPK|LaxFkV9yrWt-f`APKod;5Fx*Z7NkB`(Ev)79GjgZHL4 z-_yuF#(l>7J-65UwRfp6nZKYg^3P&ZrM_Sj`Y2`&}kT8>*DbuWB#?q^pJ#XMXx|turZ{d1#`}|9Uu0kvM zeWkJTJMv3Iq(*X6X|S@394YL|eQaJH{#B?cOp?|*a)o5nnQ&WkLOziroo90<=O0wk z$m_mx?ilBjobB2B-hP!;&%KWu>)htr>Dlet>qvGE^-Uvkj0+7_s4vKdc)8@Ms&z&6dY-0&?Kg$(ld*bTZZHCDRi z`^~qNf8?v|A1RGcb_=Vy(cDa-rZQhSrX-<-7>-9{AESZzcKi_D0Uv;E#BZT<(p0g9 zd;#WhS^p_+o^Jy`-oFUieV0q|u%1hvcE0ES95K^BpLck=`D%z2<+jp1$u0I*epkMb 
zPs2KHvT{M1YrN?^;JRtK7TzrMxUIFXop+G3hb+ssvQM+D=*d?(ouOhJeyoos^KHCaOIeM0KI?}{<|m=JwcC@cJfeB z61T(scf23t7x5i^jkq_SCEQqkoiCM}>2-Q*a+|#Uef@-Xq8+X!Ck0uypr^bfw3KlC z6LJySC|4H`NlQ@=REvA$&bCouhwR76&F<6g8{{P8Nz0ekr*wNh(Rqt|iFdU9VZ6v! z%N^ou3Zra=KhM|HciqS4FLRFf-1g1zZ}Kk{(v_?71;Of1!hbZJv`(}z#^r2jX0xeA zXzjRfap@#0-6N*i^Vln76S@p(AV%QLh)^O` z+U8@q7+NNFb1@+cUG_qcE>Dx{2gp80~Ta1+o~%4U3Jx^JkA z(z)Kg2J$HOvN;q#;$H3Oa~yn{X7VzE34dl<7n=5UlL?5zD*`0DVI5o;o@Msy8bZkl6Q zW-b#l-_na+L(V1{d^p+xPeRktDlyL|a*2Ej{~_1kGoPEonY@GDcK1l{9^YZ#SNupH z!JXxP_RSM6NzbJj!a08rakcUpd}*tR&MBXvL0Db<0B$Ef!8c-mp=NkP;x)bw>xmMS z+6n_-4R)Z?7?P9aZ=|IXFOHUM*ar9pby{Y`%JKs28>OF84Rx1m$sa1~#3W^e_*xk+ zua+CR-;=wVY8opVGNirySz!^p;axHONH50v^NsoU6dzl`cz~Gh%5r|l50&aGP33L; zN^iEyocqxGwfwpGId{VKr?Wio=c9yC;#uVnY^>B)TC0qsW*c@JPnr6dT}FqIH%azg zQSQk4p`)x$TjTI8ktZXUMLNQ(+0#u+%@eG6=uX=Y#yeCxIfht(m%=NO+c8X<>TBma z#kb{4@b7bB+i=yb9fK) zRD3G_8P|r3ktO3uJe$nIcMC_|jF?Z@*`!pg0w!6YtR;{37uMJK0dvaDZuR4zHOUA`=TDTgUF?clLhL4-7Jio(`i$}! ze3{`9<3byFud^}QOMXOEpu>nvUjw)z%s@Aod2E<8+tJ3E!Os*wQ=;Y0BIA1IsOV_n z?uadyPjlZmmU+5)clrCvZ55BWKq)O8_tznQro+iH#0S&^<^ubTVFFW&O|r}}T{k4K zaZojSSCm5vR;&52)dMM6PaQrdW1K)%{z;EFs zxr_>-9q{$DJpC>0rDBo z=`Ktw_AI-IUC+#7&QkHrC3t&S!y1?$sj^HrdN1<>-JV`VPX+B0=*o0&CWWfTj9?d# z<(O$Cf_C>~o>4c67!+k-sgg_!>OK5CX*TsH#?Xi13-@Rug1CuQpiaSa*#_}1=^7e< zI!oPsKY2)gz5f_biqCw9Tq*7??xyZMUf}w{(|J4heOG5+Gv6KlsL#ih@jvi=Bvh9^ zm2OL$lq9(__BCpcy-$$D6QU}45$_M{qm7BZutu7VA16|=viLKiGO>-QLrlcY*h16@ zO~5{bFYd?T>o_i*6n2S>bXD9c*!kJ;O?;?-qCbs)?W@i8@uqoK`99@W@@IXAxF_BO z{v)`%MGF`Fr~JPOMkz|(tt6mtmHSFGRsoMEGKdajEH#RHLxt0G>60|aaLiI>E4z(} zWzR8n*-2~#!x;7xb^vo3Ji34lHS}i@`+{k}_F`pb7?aFIG85Pp&<}m#UED%dVNTEs znI=ppW(S=|_N8x73jLhvz|?g$yeEEwC%#qY$k;#V+}_efWz(Xt{}Q)Vm6l+qCY zvKjjxs}B(m-{AF$i^OqqB( z#8BPvC!5EvgVnlJSX=JFrn4{DPnnkNm(1_XLFObwF%{W~^k0;XK0_Mm=Cqq$Lye(c zlaEM_d`b)>X5jaU2lynS1~G(qACJQ!Dg*V#-VlEf3&}EgdF(OVC$B^M-opC(3HeX? 
zh15|>5%YyKVY84TRFxhJoH$47vlVn z`J?`9{|D-v<1)nAsiOFiEAZBYAV%|K9JI-H0f8Vx7@p6r669J3{fvfmGg?E zw1QY48vJt#+lWoZzQtMsycIi#{ej)a;vi-r4@-g79R^l;;xP|IoYcfGK+Mk_h-R6J zj-iqe5v0Hx;68|lXa~_MQaIJZczr<@39-p zG^|36k{U}%QVVIJv`e}pCChE(fpSl|g4|YqEHg@5rMc1oo}BDTOT~;T!rGQssfmU_ zELKYhe^11A!Tl-%`wpJD2qKnviT{NEh_Av^@lc`%@eAQ1lF2Z#D_qHT5qpTbWM?uN zo+37r$H?JiU-C1uHu)*JhrTx@y>Lj&>T0;Fq?WFwFb!sGBGjieE(kd9+17UQJA&-#Z)FbftL~;dbfNu$t zi3>zG;x4`rENg@x#O}hZehRbrV~A6Fjbb29W)r-Je5`CzzEUK4zI;`_Dc6%jRKN#Ma_Mv5uG}d;;rGm&M;;P3k#Z3;q=U5*2Zh)JiHPjg>A*r=?EtW>+3O zUqxOkQ}F(=MYhAb-~w2E+oTv#M~Kea1<`!3U<_mMUa%Znh&{!sz|;IZ@cBzfF+>|8 z2SOaz6ZeU~h%)41av-^pr~;ndPJRoA5B%Mg{F6+EFRkB@<;Y&-RdN@M_7LhE*@ApX zM3XzowPY7^15goYSV7()+dyPPHL?_}7>^-mkb}v_WHWLh^j~-KJJLgtWJhuU*_<3l zjwct8H;KbU7vTM#I6~ZnJ5^btD)_zuaRa{sv1oO18{PsRh)XbgXF@z&DTt)23^Vrv zT8wJLEWnT#-kHzCYG^rFUwtXN<%{wS`M5kt9u1?iw_HlTEu~0V(ne{wv{zan&4;T| z2Y8C^BsGU?$S0B_CP1kNTw&`=gQd@<_AsW8O9!N1q(@SHh=GcLC)O3<(~0l?h;-U5|1OtRx+!y&UU04iPo9j@UzrDKnX*IKt6YJ5;{zoOR(k88_7JIe5u*0) zplaA4xQ;A^w~5_wZCVQvbq8TIO~qbgBk>p<$M0ducpJPS%!p_3J~9xlGC$)aQ5SsD zgJ?=ngo&6&q!71>h0tfi0iH$(L>!q*%pm3xM~SnG)uL1k^YJZ;8j_Y1kbs1A7c< z7d8dr4?nB zs6r?_ta4XVN-3dmJ;;aq!E@*%T*(0sPM3d@f00kgX|hRq3hRDf%2T0qQGO|t%3b-6 z`~tq+*x)|NLtYK8CGC_6%4&#k{92gxJ=8vF@f2J%|aTRreT@P6_!U?$_E@b(Zh`8`1M zU%E<-C2N3<)84X3Bz*rxFra+9~Hi!~D zhJJ-~0;NNgumpCiQPdbj+tz`5Ya@t1?1pv3hQU2RjdvXdGhjBfeI2$Pjs@6aYyvaH!X@kuY}#VSOQu#KuBI>oBYzM9mI`V=OiV;7Q=k zZ{Y4f7j%9Dv8B7Q1K5us`5Sfw;62ze7~9LR)!07l5Vj4VMZh-+dSNzUZGS2h`ueTfd^MFqhVW=YB>%qMy(Pv;r+eYoKqJp(SW7 zJeMqlF+CSBi_vo6{0X=gqb;C$FWLd}=L_PXuc2oUPx}z1K%NCxd^O(L0P8;-vV#2~ z5W8FsVvh;PE5p~VaG1+#)NLq072&=X3pDlWYK3Q_WZ(>ksBII(lb3=c0a7GTqJggr zL@*~pFIEBRWUMB{NY}*bL7%<{zG?|$wk7m#b2!?==xqt(_XDgQ|t8#4w#!z{<9#W{1JSs zdI-_wFG0&|csEJ~UFiV5fpR9?HFMy|1b^m0oU$63?*P0LqT4Z;k?Qh=id7H=(d2%J z@24TkTaDLO_XAK6wPY^vqB?Rc;L!+(4i1A{jna<i;~RVWfPsj_MwRiN7reXH&dP`zgb3sfuASbq~(qDJ{EFe22Q zQtF-p3G7vOl&E_RN;V?vPM( z5$scUPN<$%z-D#Vi@IY$ZHc-kL*2hX6|{(ic4~VhRKL^Ubt9Y&kQ<==hJsO~_JN8w 
zftS?zp!SWrkHQXfzzSv6Yqo-1#aRnrb@zfgb8H1ZR7<7;U8)|{uZI8juUfyl-$AQS z9S2kaP3;AB_lUM@MB6{2_JWF2cblkE+Wr&u8d@M*^N%buM-$~`ylUPuPwo64Ws9m*1wL`U}AcI_m z0*87{5ByY$ic!aZa8lc)CbiAMdvWw#IXX_uRlfuvt)zNS^_-e?4{EpoR^?Fl<>*{$ zS(6H!HK?$py01o))A_aD(KM-+sQjw+YCE(xXclTXt##UNA>DG#e(kKaT$c~{MboC2 z)sp(t%W57d7k~vc&BGc+TM2KtzxtkT&pS{;L){ddC{}3vnXCIt3P%A zX>f7YYEnh(3bd&(rEp2}U~%;Z+O1Jled_OB-ohH+rE7gv7#}PXfP=Mb*x++v4SFe9 zM#Ba4={x~U08(lHUc1Vp%e-5YuD@trm{XT8$`vT9{3_SK=M2c`&_DHHz;2cEpEL(^ z6~|K?POtet&x5rD;9xJQ*x<6NTPqhwDOz(;$)a>kS8-hbDXw^p!Pvr@f?KJ#K<5b1 zg601`TvSfiS6uG~&r}swdA0Mq^#t_jn%=Fc@F>%?shWai1DXq}0&3CpF88~bV3~hS z4RGpwdS2MB09KO-w5}*cZ@_P!EjNzf1(w|YvrO?Ra-EwXq$@G7F^ck zf+d1sjq_cMt|9nZ8Ng}Qs8RvR;y87BfJ@g=xIM*T1Nw@S4oJKUX`>+^uW; z%{oPY+S1FqWU$VF=ECDi$7{KI)-l0QVU2nzz!|{kI9-!gn~p8c!T_hPMePw?B7iF% zQaN=Ey2bCx7R3}TYr2cWYT9+pYOZ05xJ*%a{w0G+W&~<(9VsuL3Jkai7 zd|_#|R6Jj>Wx?_qtjQI|2ijE_A82QwCfy1>4`?katCtIN>ezR4jqjhh0~TnK!F&ad z0DlB1!PIyCp;5I~>r%QmgR#MIfGQCxB5;_C|L)NR%CV44d5YkW~^(el66rg8sM zJO7<7)iz!B-}4r=^olC=Meh%OQkV-FH(#nCJ(X|$BkB%>ztJ*YP%@^H zQUS@ru+Fba>u}M$sCB6{RZdFfVs;)U;&;K7tv!bXiT3xzD zDz$iye{ZF(BVc{;ngVx8@172nDd|8RDACRTOJG-&mE|%Fz>tE8oel2a65x(!LZ7s)>9Z)xD>1> z0P8$ju9yBNngTlCg#vnYntm11%kSPB0u&tz&ULT8t2vnK{~rEN^cBaW+f#gv?^55z z6qO9-&}&rZr2f-a%+;@bff24lf&U6r^#&kM^JZMCjP_u9Xs>->7I zjRn1=d-tEU1UTP?3QK7g6~?_QQy8PSLd9y50eQ{R;?LT5Ts0m*iviHT{;MVR|0VUP OiQ=XI7yGY0@P7cz=1kiF literal 0 HcmV?d00001 From 9fe265ea6489fe3c9cba72dc94b22463d96829a0 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Mon, 5 May 2025 13:41:30 -0400 Subject: [PATCH 65/97] [WIP] AWS Nova Sonic service - implement ability to persist and load conversations --- .../20e-persistent-context-aws-nova-sonic.py | 256 ++++++++++++++++++ examples/foundational/39-aws-nova-sonic.py | 2 +- src/pipecat/services/aws_nova_sonic/aws.py | 102 +++++-- .../services/aws_nova_sonic/context.py | 29 +- 4 files changed, 350 insertions(+), 39 deletions(-) create mode 100644 examples/foundational/20e-persistent-context-aws-nova-sonic.py diff --git 
a/examples/foundational/20e-persistent-context-aws-nova-sonic.py b/examples/foundational/20e-persistent-context-aws-nova-sonic.py new file mode 100644 index 000000000..8a95f54b9 --- /dev/null +++ b/examples/foundational/20e-persistent-context-aws-nova-sonic.py @@ -0,0 +1,256 @@ +# +# Copyright (c) 2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio +import glob +import json +import os +from datetime import datetime + +from dotenv import load_dotenv +from loguru import logger + +from pipecat.adapters.schemas.function_schema import FunctionSchema +from pipecat.adapters.schemas.tools_schema import ToolsSchema +from pipecat.audio.vad.silero import SileroVADAnalyzer +from pipecat.audio.vad.vad_analyzer import VADParams +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext +from pipecat.services.aws_nova_sonic.aws import AWSNovaSonicLLMService +from pipecat.transports.base_transport import TransportParams +from pipecat.transports.network.small_webrtc import SmallWebRTCTransport +from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection + +load_dotenv(override=True) + +BASE_FILENAME = "/tmp/pipecat_conversation_" + + +async def fetch_weather_from_api(function_name, tool_call_id, args, llm, context, result_callback): + temperature = 75 if args["format"] == "fahrenheit" else 24 + await result_callback( + { + "conditions": "nice", + "temperature": temperature, + "format": args["format"], + "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"), + } + ) + + +async def get_saved_conversation_filenames( + function_name, tool_call_id, args, llm, context, result_callback +): + # Construct the full pattern including the BASE_FILENAME + full_pattern = f"{BASE_FILENAME}*.json" + + # Use glob to find all matching files + matching_files = 
glob.glob(full_pattern) + logger.debug(f"matching files: {matching_files}") + + await result_callback({"filenames": matching_files}) + + +# async def get_saved_conversation_filenames( +# function_name, tool_call_id, args, llm, context, result_callback +# ): +# pattern = re.compile(re.escape(BASE_FILENAME) + "\\d{8}_\\d{6}\\.json$") +# matching_files = [] + +# for filename in os.listdir("."): +# if pattern.match(filename): +# matching_files.append(filename) + +# await result_callback({"filenames": matching_files}) + + +async def save_conversation(function_name, tool_call_id, args, llm, context, result_callback): + timestamp = datetime.now().strftime("%Y-%m-%d_%H:%M:%S") + filename = f"{BASE_FILENAME}{timestamp}.json" + logger.debug( + f"writing conversation to {filename}\n{json.dumps(context.get_messages_for_persistent_storage(), indent=4)}" + ) + try: + with open(filename, "w") as file: + messages = context.get_messages_for_persistent_storage() + # remove the last message, which is the instruction we just gave to save the conversation + messages.pop() + json.dump(messages, file, indent=2) + await result_callback({"success": True}) + except Exception as e: + await result_callback({"success": False, "error": str(e)}) + + +async def load_conversation(function_name, tool_call_id, args, llm, context, result_callback): + async def _reset(): + filename = args["filename"] + logger.debug(f"loading conversation from {filename}") + try: + with open(filename, "r") as file: + messages = json.load(file) + messages.append( + { + "role": "user", + "content": f"{AWSNovaSonicLLMService.AWAIT_TRIGGER_ASSISTANT_RESPONSE_INSTRUCTION}", + } + ) + context.set_messages(messages) + await llm.reset_conversation() + await llm.trigger_assistant_response() + except Exception as e: + await result_callback({"success": False, "error": str(e)}) + + asyncio.create_task(_reset()) + + +get_current_weather_tool = FunctionSchema( + name="get_current_weather", + description="Get the current weather", + 
properties={ + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA", + }, + "format": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "The temperature unit to use. Infer this from the user's location.", + }, + }, + required=["location", "format"], +) + +save_conversation_tool = FunctionSchema( + name="save_conversation", + description="Save the current conversation. Use this function to persist the current conversation to external storage.", + properties={}, + required=[], +) + +get_saved_conversation_filenames_tool = FunctionSchema( + name="get_saved_conversation_filenames", + description="Get a list of saved conversation histories. Returns a list of filenames. Each filename includes a date and timestamp. Each file is conversation history that can be loaded into this session.", + properties={}, + required=[], +) + +load_conversation_tool = FunctionSchema( + name="load_conversation", + description="Load a conversation history. Use this function to load a conversation history into the current session.", + properties={ + "filename": { + "type": "string", + "description": "The filename of the conversation history to load.", + } + }, + required=["filename"], +) + +tools = ToolsSchema( + standard_tools=[ + get_current_weather_tool, + save_conversation_tool, + get_saved_conversation_filenames_tool, + load_conversation_tool, + ] +) + + +async def run_bot(webrtc_connection: SmallWebRTCConnection): + logger.info(f"Starting bot") + + transport = SmallWebRTCTransport( + webrtc_connection=webrtc_connection, + params=TransportParams( + audio_in_enabled=True, + audio_out_enabled=True, + vad_enabled=True, + vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.8)), + vad_audio_passthrough=True, + ), + ) + + system_instruction = ( + "You are a friendly assistant. The user and you will engage in a spoken dialog exchanging " + "the transcripts of a natural real-time conversation. 
Keep your responses short, generally " + "two or three sentences for chatty scenarios. " + f"{AWSNovaSonicLLMService.AWAIT_TRIGGER_ASSISTANT_RESPONSE_INSTRUCTION}" + ) + + llm = AWSNovaSonicLLMService( + secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"), + access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), + region=os.getenv("AWS_REGION"), + voice_id="tiffany", # matthew, tiffany, amy + # you could choose to pass instruction here rather than via context + # system_instruction=system_instruction, + # you could choose to pass tools here rather than via context + # tools=tools + ) + + llm.register_function("get_current_weather", fetch_weather_from_api) + llm.register_function("save_conversation", save_conversation) + llm.register_function("get_saved_conversation_filenames", get_saved_conversation_filenames) + llm.register_function("load_conversation", load_conversation) + + context = OpenAILLMContext( + messages=[ + {"role": "system", "content": f"{system_instruction}"}, + ], + tools=tools, + ) + context_aggregator = llm.create_context_aggregator(context) + + pipeline = Pipeline( + [ + transport.input(), # Transport user input + context_aggregator.user(), + llm, # LLM + transport.output(), # Transport bot output + context_aggregator.assistant(), + ] + ) + + task = PipelineTask( + pipeline, + params=PipelineParams( + allow_interruptions=True, + enable_metrics=True, + enable_usage_metrics=True, + report_only_initial_ttfb=True, + ), + ) + + @transport.event_handler("on_client_connected") + async def on_client_connected(transport, client): + logger.info(f"Client connected") + # Kick off the conversation. + await task.queue_frames([context_aggregator.user().get_context_frame()]) + # HACK: for now, we need this special way of triggering the first assistant response in AWS + # Nova Sonic. Note that this trigger requires a special corresponding bit of text in the + # system instruction. In the future, simply queueing the context frame should be sufficient. 
+ await llm.trigger_assistant_response() + + @transport.event_handler("on_client_disconnected") + async def on_client_disconnected(transport, client): + logger.info(f"Client disconnected") + + @transport.event_handler("on_client_closed") + async def on_client_closed(transport, client): + logger.info(f"Client closed connection") + await task.cancel() + + runner = PipelineRunner(handle_sigint=False) + + await runner.run(task) + + +if __name__ == "__main__": + from run import main + + main() diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py index 07670f75a..c80626962 100644 --- a/examples/foundational/39-aws-nova-sonic.py +++ b/examples/foundational/39-aws-nova-sonic.py @@ -102,7 +102,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection): region=os.getenv("AWS_REGION"), voice_id="tiffany", # matthew, tiffany, amy # you could choose to pass instruction here rather than via context - # instruction=system_instruction + # system_instruction=system_instruction # you could choose to pass tools here rather than via context # tools=tools ) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index 5b69810f3..50b83d3e0 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -7,6 +7,7 @@ import asyncio import base64 import json +import time import uuid import wave from dataclasses import dataclass @@ -119,7 +120,7 @@ class AWSNovaSonicLLMService(LLMService): region: str, model: str = "amazon.nova-sonic-v1:0", voice_id: str = "matthew", # matthew, tiffany, amy - instruction: Optional[str] = None, + system_instruction: Optional[str] = None, tools: Optional[ToolsSchema] = None, send_transcription_frames: bool = True, **kwargs, @@ -131,7 +132,7 @@ class AWSNovaSonicLLMService(LLMService): self._model = model self._client: BedrockRuntimeClient = None self._voice_id = voice_id - self._instruction = instruction + 
self._system_instruction = system_instruction self._tools = tools self._send_transcription_frames = send_transcription_frames self._context: AWSNovaSonicLLMContext = None @@ -150,6 +151,8 @@ class AWSNovaSonicLLMService(LLMService): self._handling_bot_stopped_speaking = False self._triggering_assistant_response = False self._assistant_response_trigger_audio: bytes = None # Not cleared on _disconnect() + self._disconnecting = False + self._connected_time: float = None # # standard AIService frame handling @@ -174,6 +177,18 @@ class AWSNovaSonicLLMService(LLMService): await super().cancel(frame) await self._disconnect() + # + # conversation resetting + # + + async def reset_conversation(self): + logger.debug("Resetting conversation") + await self._disconnect() + await self._start_connecting() + # Use existing context + self._context_available = True + await self._finish_connecting_if_context_available() + # # frame processing # @@ -207,10 +222,12 @@ class AWSNovaSonicLLMService(LLMService): async def _handle_context(self, context: OpenAILLMContext): # TODO: reset connection if needed (if entirely new context object provided, for instance) - print(f"[pk] receive updated context: {context.get_messages_for_initializing_history()}") + print(f"[pk] received updated context: {context.get_messages_for_initializing_history()}") if not self._context: # We got our initial context - try to finish connecting - self._context = AWSNovaSonicLLMContext.upgrade_to_nova_sonic(context) + self._context = AWSNovaSonicLLMContext.upgrade_to_nova_sonic( + context, self._system_instruction + ) self._context_available = True await self._finish_connecting_if_context_available() @@ -296,8 +313,8 @@ class AWSNovaSonicLLMService(LLMService): # Read context history = self._context.get_messages_for_initializing_history() - # Send prompt start event, specifying tools - # Tools from context take priority over tools from __init__() + # Send prompt start event, specifying tools. 
+ # Tools from context take priority over self._tools. tools = ( self._context.tools if self._context.tools @@ -305,11 +322,14 @@ class AWSNovaSonicLLMService(LLMService): ) await self._send_prompt_start_event(tools) - # Send system instruction - # Instruction from context takes priority over instruction from __init__() - instruction = history.instruction if history.instruction else self._instruction - if instruction: - await self._send_text_event(text=instruction, role=Role.SYSTEM) + # Send system instruction. + # Instruction from context takes priority over self._system_instruction. + # (NOTE: this prioritizing occurred automatically behind the scenes: the context was + # initialized with self._system_instruction and then updated itself from its messages when + # get_messages_for_initializing_history() was called). + # print(f"[pk] connecting, with system instruction: {history.system_instruction}") + if history.system_instruction: + await self._send_text_event(text=history.system_instruction, role=Role.SYSTEM) # Send conversation history for message in history.messages: @@ -320,7 +340,7 @@ class AWSNovaSonicLLMService(LLMService): # - pass additional message(s) # - merge init-passed system instruction + context instruction (latter takes precedence) # - merge init-passed tools + context tools (latter takes precedence) - await self._send_text_event(text=self._instruction, role=Role.SYSTEM) + await self._send_text_event(text=self._system_instruction, role=Role.SYSTEM) # Start audio input await self._send_audio_input_start_event() @@ -328,31 +348,43 @@ class AWSNovaSonicLLMService(LLMService): # Start receiving events self._receive_task = self.create_task(self._receive_task_handler()) - # If we need to, send assistant response trigger + # Record finished connecting time + self._connected_time = time.time() + + # If we need to, send assistant response trigger (depends on self._connected_time) if self._triggering_assistant_response: - # If the trigger was the first 
audio chunk sent on this connection it'd be ignored (I'm - # guessing the LLM can't quite "hear" the first little bit of audio sent). So send a bit - # of leading blank audio first. - await self._send_assistant_response_trigger(lead_with_blank_audio=True) + await self._send_assistant_response_trigger() self._triggering_assistant_response = False async def _disconnect(self): try: - # Clean up receive task - if self._receive_task: - await self.cancel_task(self._receive_task, timeout=1.0) - self._receive_task = None + # NOTE: see explanation of HACK, below + self._disconnecting = True # Clean up client if self._client: + print("[pk] Cleaning up client") await self._send_session_end_events() self._client = None # Clean up stream if self._stream: + print("[pk] Cleaning up stream") await self._stream.input_stream.close() self._stream = None + # NOTE: see explanation of HACK, below + await asyncio.sleep(1) + + # Clean up receive task + # HACK: we should ideally be able to cancel the receive task before stopping the input + # stream, above (meaning we wouldn't need self._disconnecting). But for some reason if + # we don't close the input stream and wait a second first, we're getting an error a lot + # like this one: https://github.com/awslabs/amazon-transcribe-streaming-sdk/issues/61. + if self._receive_task: + await self.cancel_task(self._receive_task, timeout=1.0) + self._receive_task = None + # Reset remaining connection-specific state self._prompt_name = None self._input_audio_content_name = None @@ -362,6 +394,8 @@ class AWSNovaSonicLLMService(LLMService): self._ready_to_send_context = False self._handling_bot_stopped_speaking = False self._triggering_assistant_response = False + self._disconnecting = False + self._connected_time = None except Exception as e: logger.error(f"{self} error disconnecting: {e}") @@ -619,9 +653,8 @@ class AWSNovaSonicLLMService(LLMService): # LLM communication: output events (LLM -> pipecat) # - # Receive the ongoing LLM "completion". 
- # There is generally a single completion per session. - # In a completion, a few different kinds of content can be delivered: + # Receive events for the session. + # A few different kinds of content can be delivered: # - Transcription of user audio # - Tool use # - Text preview of planned response speech before audio delivered @@ -633,7 +666,7 @@ class AWSNovaSonicLLMService(LLMService): # The overall completion is wrapped by "completionStart" and "completionEnd" events. async def _receive_task_handler(self): try: - while self._client: + while self._client and not self._disconnecting: output = await self._stream.await_output() result = await output[1].receive() @@ -906,16 +939,25 @@ class AWSNovaSonicLLMService(LLMService): await self._send_assistant_response_trigger() self._triggering_assistant_response = False - async def _send_assistant_response_trigger(self, lead_with_blank_audio=False): + async def _send_assistant_response_trigger(self): # TODO: if/when we make bitrate, etc configurable, avoid hard-coding this chunk_size = 640 # equivalent to what we get from InputAudioRawFrame chunk_duration = 640 / ( 16000 * 2 ) # 640 bytes of 16-bit (2-byte) PCM mono audio at 16kHz corresponds to 0.02 seconds - # Lead with blank audio, if needed - if lead_with_blank_audio: - blank_audio_duration = 0.5 # much less than this and it doesn't reliably work + # Lead with a bit of blank audio, if needed. + # It seems like the LLM can't quite "hear" the first little bit of audio sent on a + # connection. 
+ current_time = time.time() + max_blank_audio_duration = 0.5 + blank_audio_duration = ( + max_blank_audio_duration - (current_time - self._connected_time) + if self._connected_time is not None + and (current_time - self._connected_time) < max_blank_audio_duration + else None + ) + if blank_audio_duration: blank_audio_chunk = b"\x00" * chunk_size num_chunks = int(blank_audio_duration / chunk_duration) for _ in range(num_chunks): @@ -925,7 +967,7 @@ class AWSNovaSonicLLMService(LLMService): # Send trigger audio # NOTE: this audio *will* be transcribed and eventually make it into the context. That's OK: # if we ever need to seed this service again with context it would make sense to include it - # since the instruction (i.e. the "wait for the trigger" instruction) will be part of the + # since the instruction (i.e. the "wait for the trigger" instruction) will be part of the # context as well. # print(f"[pk] sending trigger audio! {len(self._assistant_response_trigger_audio)}") audio_chunks = [ diff --git a/src/pipecat/services/aws_nova_sonic/context.py b/src/pipecat/services/aws_nova_sonic/context.py index 3fac65a72..b12061e1e 100644 --- a/src/pipecat/services/aws_nova_sonic/context.py +++ b/src/pipecat/services/aws_nova_sonic/context.py @@ -49,7 +49,7 @@ class AWSNovaSonicConversationHistoryMessage: @dataclass class AWSNovaSonicConversationHistory: - instruction: str = None + system_instruction: str = None messages: list[AWSNovaSonicConversationHistoryMessage] = field(default_factory=list) @@ -58,18 +58,22 @@ class AWSNovaSonicLLMContext(OpenAILLMContext): super().__init__(messages=messages, tools=tools, **kwargs) self.__setup_local() - def __setup_local(self): + def __setup_local(self, system_instruction: str = ""): self._assistant_text = "" + self._system_instruction = system_instruction @staticmethod - def upgrade_to_nova_sonic(obj: OpenAILLMContext) -> "AWSNovaSonicLLMContext": + def upgrade_to_nova_sonic( + obj: OpenAILLMContext, system_instruction: str + ) -> 
"AWSNovaSonicLLMContext": if isinstance(obj, OpenAILLMContext) and not isinstance(obj, AWSNovaSonicLLMContext): obj.__class__ = AWSNovaSonicLLMContext - obj.__setup_local() + obj.__setup_local(system_instruction) return obj + # NOTE: this method has the side-effect of updating _system_instruction from messages def get_messages_for_initializing_history(self) -> AWSNovaSonicConversationHistory: - history = AWSNovaSonicConversationHistory() + history = AWSNovaSonicConversationHistory(system_instruction=self._system_instruction) # Bail if there are no messages if not self.messages: @@ -82,13 +86,15 @@ class AWSNovaSonicLLMContext(OpenAILLMContext): system = messages.pop(0) content = system.get("content") if isinstance(content, str): - history.instruction = content + history.system_instruction = content elif isinstance(content, list): - history.instruction = content[0].get("text") + history.system_instruction = content[0].get("text") + if history.system_instruction: + self._system_instruction = history.system_instruction # Process remaining messages to fill out conversation history. # Nova Sonic supports "user" and "assistant" messages in history. 
- print(f"[pk] standard messages: {messages}") + # print(f"[pk] standard messages: {messages}") for message in messages: history_message = self.from_standard_message(message) if history_message: @@ -96,6 +102,13 @@ class AWSNovaSonicLLMContext(OpenAILLMContext): return history + def get_messages_for_persistent_storage(self): + messages = super().get_messages_for_persistent_storage() + # If we have a system instruction and messages doesn't already contain it, add it + if self._system_instruction and not (messages and messages[0].get("role") == "system"): + messages.insert(0, {"role": "system", "content": self._system_instruction}) + return messages + def from_standard_message(self, message) -> AWSNovaSonicConversationHistoryMessage: role = message.get("role") if message.get("role") == "user" or message.get("role") == "assistant": From 2b02d08f4c7f03fba5f18706ec600476935a5c50 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Tue, 6 May 2025 09:26:22 -0400 Subject: [PATCH 66/97] [WIP] AWS Nova Sonic service - add comments to examples pointing out the us-east-1 is the only supported region so far --- examples/foundational/20e-persistent-context-aws-nova-sonic.py | 2 +- examples/foundational/39-aws-nova-sonic.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/foundational/20e-persistent-context-aws-nova-sonic.py b/examples/foundational/20e-persistent-context-aws-nova-sonic.py index 8a95f54b9..731c69c3a 100644 --- a/examples/foundational/20e-persistent-context-aws-nova-sonic.py +++ b/examples/foundational/20e-persistent-context-aws-nova-sonic.py @@ -185,7 +185,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection): llm = AWSNovaSonicLLMService( secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"), access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), - region=os.getenv("AWS_REGION"), + region=os.getenv("AWS_REGION"), # as of 2025-05-06, us-east-1 is the only supported region voice_id="tiffany", # matthew, tiffany, amy # you could choose 
to pass instruction here rather than via context # system_instruction=system_instruction, diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py index c80626962..a89796ea6 100644 --- a/examples/foundational/39-aws-nova-sonic.py +++ b/examples/foundational/39-aws-nova-sonic.py @@ -99,7 +99,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection): llm = AWSNovaSonicLLMService( secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"), access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), - region=os.getenv("AWS_REGION"), + region=os.getenv("AWS_REGION"), # as of 2025-05-06, us-east-1 is the only supported region voice_id="tiffany", # matthew, tiffany, amy # you could choose to pass instruction here rather than via context # system_instruction=system_instruction From 467233be046527da8be0bf40ebf5aaea147d8c36 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Tue, 6 May 2025 09:48:50 -0400 Subject: [PATCH 67/97] [WIP] AWS Nova Sonic service - support multi-line system prompt --- .../foundational/20e-persistent-context-aws-nova-sonic.py | 4 ++++ examples/foundational/39-aws-nova-sonic.py | 1 - src/pipecat/services/aws_nova_sonic/aws.py | 5 +++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/foundational/20e-persistent-context-aws-nova-sonic.py b/examples/foundational/20e-persistent-context-aws-nova-sonic.py index 731c69c3a..8ac1508f8 100644 --- a/examples/foundational/20e-persistent-context-aws-nova-sonic.py +++ b/examples/foundational/20e-persistent-context-aws-nova-sonic.py @@ -175,6 +175,10 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection): ), ) + # Specify initial system instruction. + # HACK: note that, for now, we need to inject a special bit of text into this instruction to + # allow the first assistant response to be programmatically triggered (which happens in the + # on_client_connected handler, below) system_instruction = ( "You are a friendly assistant. 
The user and you will engage in a spoken dialog exchanging " "the transcripts of a natural real-time conversation. Keep your responses short, generally " diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py index a89796ea6..fe0d07dca 100644 --- a/examples/foundational/39-aws-nova-sonic.py +++ b/examples/foundational/39-aws-nova-sonic.py @@ -87,7 +87,6 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection): # HACK: note that, for now, we need to inject a special bit of text into this instruction to # allow the first assistant response to be programmatically triggered (which happens in the # on_client_connected handler, below) - # TODO: looks like Nova Sonic can't handle new lines? system_instruction = ( "You are a friendly assistant. The user and you will engage in a spoken dialog exchanging " "the transcripts of a natural real-time conversation. Keep your responses short, generally " diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index 50b83d3e0..e5df20e66 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -498,7 +498,7 @@ class AWSNovaSonicLLMService(LLMService): await self._send_client_event(audio_content_start) async def _send_text_event(self, text: str, role: Role): - if not self._stream: + if not self._stream or not text: return content_name = str(uuid.uuid4()) @@ -521,13 +521,14 @@ class AWSNovaSonicLLMService(LLMService): ''' await self._send_client_event(text_content_start) + escaped_text = json.dumps(text) # includes quotes text_input = f''' {{ "event": {{ "textInput": {{ "promptName": "{self._prompt_name}", "contentName": "{content_name}", - "content": "{text}" + "content": {escaped_text} }} }} }} From c4d0f91a7fbed227b4c5ae80382a0cfeb9957389 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Tue, 6 May 2025 09:53:12 -0400 Subject: [PATCH 68/97] [WIP] AWS Nova Sonic service - remove some old 
code that was accidentally still there, possibly sending a duplicate system instruction --- src/pipecat/services/aws_nova_sonic/aws.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index e5df20e66..aa44beb60 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -335,13 +335,6 @@ class AWSNovaSonicLLMService(LLMService): for message in history.messages: await self._send_text_event(text=message.text, role=message.role) - # Send initial context (system instruction and conversation history) - # TODO: finish implementing - # - pass additional message(s) - # - merge init-passed system instruction + context instruction (latter takes precedence) - # - merge init-passed tools + context tools (latter takes precedence) - await self._send_text_event(text=self._system_instruction, role=Role.SYSTEM) - # Start audio input await self._send_audio_input_start_event() From d388c057c039531a7f96a081eb69793e6e7f4df0 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Tue, 6 May 2025 11:12:47 -0400 Subject: [PATCH 69/97] [WIP] AWS Nova Sonic service - recover from unwanted disconnection due to an error --- src/pipecat/services/aws_nova_sonic/aws.py | 7 +++++++ src/pipecat/services/aws_nova_sonic/context.py | 2 ++ 2 files changed, 9 insertions(+) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index aa44beb60..aa264155c 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -153,6 +153,7 @@ class AWSNovaSonicLLMService(LLMService): self._assistant_response_trigger_audio: bytes = None # Not cleared on _disconnect() self._disconnecting = False self._connected_time: float = None + self._wants_connection = False # # standard AIService frame handling @@ -160,6 +161,7 @@ class AWSNovaSonicLLMService(LLMService): async def start(self, frame: 
StartFrame): await super().start(frame) + self._wants_connection = True # TODO: maybe connect but don't send history until we get all of our settings? # how do we know how long to wait? # ah, i think we'll *always* get at least one OpenAILLMContextFrame which kicks things off @@ -171,10 +173,12 @@ class AWSNovaSonicLLMService(LLMService): async def stop(self, frame: EndFrame): await super().stop(frame) + self._wants_connection = False await self._disconnect() async def cancel(self, frame: CancelFrame): await super().cancel(frame) + self._wants_connection = False await self._disconnect() # @@ -183,6 +187,7 @@ class AWSNovaSonicLLMService(LLMService): async def reset_conversation(self): logger.debug("Resetting conversation") + await self._handle_bot_stopped_speaking() await self._disconnect() await self._start_connecting() # Use existing context @@ -694,6 +699,8 @@ class AWSNovaSonicLLMService(LLMService): except Exception as e: logger.error(f"{self} error processing responses: {e}") + if self._wants_connection: + await self.reset_conversation() async def _handle_completion_start_event(self, event_json): # print("[pk] completion start") diff --git a/src/pipecat/services/aws_nova_sonic/context.py b/src/pipecat/services/aws_nova_sonic/context.py index b12061e1e..d96c2d1ed 100644 --- a/src/pipecat/services/aws_nova_sonic/context.py +++ b/src/pipecat/services/aws_nova_sonic/context.py @@ -143,6 +143,8 @@ class AWSNovaSonicLLMContext(OpenAILLMContext): # print(f"[pk] assistant text buffered: {self._assistant_text}") def flush_aggregated_assistant_text(self): + if not self._assistant_text: + return message = { "role": "assistant", "content": [{"type": "text", "text": self._assistant_text}], From 73020be511c64e2026f1ed95730686198929b3df Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Tue, 6 May 2025 12:25:25 -0400 Subject: [PATCH 70/97] [WIP] AWS Nova Sonic service - minor fix: only try to read received JSON if we have it --- src/pipecat/services/aws_nova_sonic/aws.py | 
48 +++++++++++----------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index aa264155c..c2b56ef74 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -519,7 +519,7 @@ class AWSNovaSonicLLMService(LLMService): ''' await self._send_client_event(text_content_start) - escaped_text = json.dumps(text) # includes quotes + escaped_text = json.dumps(text) # includes quotes text_input = f''' {{ "event": {{ @@ -673,29 +673,29 @@ class AWSNovaSonicLLMService(LLMService): response_data = result.value.bytes_.decode("utf-8") json_data = json.loads(response_data) - if "event" in json_data: - event_json = json_data["event"] - if "completionStart" in event_json: - # Handle the LLM completion starting - await self._handle_completion_start_event(event_json) - elif "contentStart" in event_json: - # Handle a piece of content starting - await self._handle_content_start_event(event_json) - elif "textOutput" in event_json: - # Handle text output content - await self._handle_text_output_event(event_json) - elif "audioOutput" in event_json: - # Handle audio output content - await self._handle_audio_output_event(event_json) - elif "toolUse" in event_json: - # Handle tool use - await self._handle_tool_use_event(event_json) - elif "contentEnd" in event_json: - # Handle a piece of content ending - await self._handle_content_end_event(event_json) - elif "completionEnd" in event_json: - # Handle the LLM completion ending - await self._handle_completion_end_event(event_json) + if "event" in json_data: + event_json = json_data["event"] + if "completionStart" in event_json: + # Handle the LLM completion starting + await self._handle_completion_start_event(event_json) + elif "contentStart" in event_json: + # Handle a piece of content starting + await self._handle_content_start_event(event_json) + elif "textOutput" in event_json: + # Handle 
text output content + await self._handle_text_output_event(event_json) + elif "audioOutput" in event_json: + # Handle audio output content + await self._handle_audio_output_event(event_json) + elif "toolUse" in event_json: + # Handle tool use + await self._handle_tool_use_event(event_json) + elif "contentEnd" in event_json: + # Handle a piece of content ending + await self._handle_content_end_event(event_json) + elif "completionEnd" in event_json: + # Handle the LLM completion ending + await self._handle_completion_end_event(event_json) except Exception as e: logger.error(f"{self} error processing responses: {e}") From 885b2d1d2f3dcfa6e04b1c347848ca331c4722b8 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Tue, 6 May 2025 14:29:36 -0400 Subject: [PATCH 71/97] [WIP] AWS Nova Sonic service - make parameters configurable --- src/pipecat/services/aws_nova_sonic/aws.py | 73 ++++++++++++++-------- 1 file changed, 47 insertions(+), 26 deletions(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index c2b56ef74..c0367fa6b 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -28,6 +28,7 @@ from aws_sdk_bedrock_runtime.models import ( InvokeModelWithBidirectionalStreamOutput, ) from loguru import logger +from pydantic import BaseModel, Field from smithy_aws_core.credentials_resolvers.static import StaticCredentialsResolver from smithy_aws_core.identity import AWSCredentialsIdentity from smithy_core.aio.eventstream import DuplexEventStream @@ -107,6 +108,23 @@ class CurrentContent: ) +class Params(BaseModel): + # Audio input + input_sample_rate: Optional[int] = Field(default=16000) + input_sample_size: Optional[int] = Field(default=16) + input_channel_count: Optional[int] = Field(default=1) + + # Audio output + output_sample_rate: Optional[int] = Field(default=24000) + output_sample_size: Optional[int] = Field(default=16) + output_channel_count: Optional[int] = 
Field(default=1) + + # Inference + max_tokens: Optional[int] = Field(default=1024) + top_p: Optional[float] = Field(default=0.9) + temperature: Optional[float] = Field(default=0.7) + + class AWSNovaSonicLLMService(LLMService): # Override the default adapter to use the AWSNovaSonicLLMAdapter one adapter_class = AWSNovaSonicLLMAdapter @@ -120,6 +138,7 @@ class AWSNovaSonicLLMService(LLMService): region: str, model: str = "amazon.nova-sonic-v1:0", voice_id: str = "matthew", # matthew, tiffany, amy + params: Params = Params(), system_instruction: Optional[str] = None, tools: Optional[ToolsSchema] = None, send_transcription_frames: bool = True, @@ -132,6 +151,7 @@ class AWSNovaSonicLLMService(LLMService): self._model = model self._client: BedrockRuntimeClient = None self._voice_id = voice_id + self._params = params self._system_instruction = system_instruction self._tools = tools self._send_transcription_frames = send_transcription_frames @@ -419,18 +439,18 @@ class AWSNovaSonicLLMService(LLMService): # TODO: make params configurable? 
async def _send_session_start_event(self): - session_start = """ - { - "event": { - "sessionStart": { - "inferenceConfiguration": { - "maxTokens": 1024, - "topP": 0.9, - "temperature": 0.7 - } - } - } - } + session_start = f""" + {{ + "event": {{ + "sessionStart": {{ + "inferenceConfiguration": {{ + "maxTokens": {self._params.max_tokens}, + "topP": {self._params.top_p}, + "temperature": {self._params.temperature} + }} + }} + }} + }} """ await self._send_client_event(session_start) @@ -458,9 +478,9 @@ class AWSNovaSonicLLMService(LLMService): }}, "audioOutputConfiguration": {{ "mediaType": "audio/lpcm", - "sampleRateHertz": 24000, - "sampleSizeBits": 16, - "channelCount": 1, + "sampleRateHertz": {self._params.output_sample_rate}, + "sampleSizeBits": {self._params.output_sample_size}, + "channelCount": {self._params.output_channel_count}, "voiceId": "{self._voice_id}", "encoding": "base64", "audioType": "SPEECH" @@ -483,9 +503,9 @@ class AWSNovaSonicLLMService(LLMService): "role": "USER", "audioInputConfiguration": {{ "mediaType": "audio/lpcm", - "sampleRateHertz": 16000, - "sampleSizeBits": 16, - "channelCount": 1, + "sampleRateHertz": {self._params.input_sample_rate}, + "sampleSizeBits": {self._params.input_sample_size}, + "channelCount": {self._params.input_channel_count}, "audioType": "SPEECH", "encoding": "base64" }} @@ -762,11 +782,10 @@ class AWSNovaSonicLLMService(LLMService): # Push audio frame audio = base64.b64decode(audio_content) - # TODO: make sample rate + channels (used in multiple places) consts frame = TTSAudioRawFrame( audio=audio, - sample_rate=24000, - num_channels=1, + sample_rate=self._params.output_sample_rate, + num_channels=self._params.output_channel_count, ) await self.push_frame(frame) @@ -941,11 +960,13 @@ class AWSNovaSonicLLMService(LLMService): self._triggering_assistant_response = False async def _send_assistant_response_trigger(self): - # TODO: if/when we make bitrate, etc configurable, avoid hard-coding this - chunk_size = 640 # 
equivalent to what we get from InputAudioRawFrame - chunk_duration = 640 / ( - 16000 * 2 - ) # 640 bytes of 16-bit (2-byte) PCM mono audio at 16kHz corresponds to 0.02 seconds + chunk_duration = 0.02 # what we might get from InputAudioRawFrame + chunk_size = int( + chunk_duration + * self._params.input_sample_rate + * self._params.input_channel_count + * (self._params.input_sample_size / 8) + ) # e.g. 0.02 seconds of 16-bit (2-byte) PCM mono audio at 16kHz is 640 bytes # Lead with a bit of blank audio, if needed. # It seems like the LLM can't quite "hear" the first little bit of audio sent on a From c7e223e85ae9d33d8194966c6de88a39311a7ef6 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Tue, 6 May 2025 16:06:29 -0400 Subject: [PATCH 72/97] [WIP] AWS Nova Sonic service - remove print statements in favor of logger --- examples/foundational/39-aws-nova-sonic.py | 6 -- src/pipecat/services/aws_nova_sonic/aws.py | 72 +++++++------------ .../services/aws_nova_sonic/context.py | 7 +- 3 files changed, 29 insertions(+), 56 deletions(-) diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py index fe0d07dca..af44cf790 100644 --- a/examples/foundational/39-aws-nova-sonic.py +++ b/examples/foundational/39-aws-nova-sonic.py @@ -10,7 +10,6 @@ from datetime import datetime from dotenv import load_dotenv from loguru import logger -# import logging from pipecat.adapters.schemas.function_schema import FunctionSchema from pipecat.adapters.schemas.tools_schema import ToolsSchema from pipecat.audio.vad.silero import SileroVADAnalyzer @@ -27,11 +26,6 @@ from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection # Load environment variables load_dotenv(override=True) -# logging.basicConfig( -# level=logging.DEBUG, -# format='%(asctime)s - %(levelname)s - %(message)s' -# ) - async def fetch_weather_from_api(function_name, tool_call_id, args, llm, context, result_callback): temperature = 75 if args["format"] == 
"fahrenheit" else 24 diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index c0367fa6b..1e318f164 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -229,25 +229,10 @@ class AWSNovaSonicLLMService(LLMService): await self._handle_bot_stopped_speaking() elif isinstance(frame, AWSNovaSonicFunctionCallResultFrame): await self._handle_function_call_result(frame) - # TODO: do we need to do anything for the below four frame types? - elif isinstance(frame, StartInterruptionFrame): - # print("[pk] StartInterruptionFrame") - pass - elif isinstance(frame, UserStartedSpeakingFrame): - # print("[pk] UserStartedSpeakingFrame") - pass - elif isinstance(frame, StopInterruptionFrame): - # print("[pk] StopInterruptionFrame") - pass - elif isinstance(frame, UserStoppedSpeakingFrame): - # print("[pk] UserStoppedSpeakingFrame") - pass await self.push_frame(frame, direction) async def _handle_context(self, context: OpenAILLMContext): - # TODO: reset connection if needed (if entirely new context object provided, for instance) - print(f"[pk] received updated context: {context.get_messages_for_initializing_history()}") if not self._context: # We got our initial context - try to finish connecting self._context = AWSNovaSonicLLMContext.upgrade_to_nova_sonic( @@ -303,6 +288,8 @@ class AWSNovaSonicLLMService(LLMService): async def _start_connecting(self): try: + logger.info("Connecting...") + if self._client: # Here we assume that if we have a client we are connected or connecting return @@ -335,6 +322,8 @@ class AWSNovaSonicLLMService(LLMService): if not (self._context_available and self._ready_to_send_context): return + logger.info("Finishing connecting (setting up session)...") + # Read context history = self._context.get_messages_for_initializing_history() @@ -345,6 +334,7 @@ class AWSNovaSonicLLMService(LLMService): if self._context.tools else 
self.get_llm_adapter().from_standard_tools(self._tools) ) + logger.debug(f"Using tools: {tools}") await self._send_prompt_start_event(tools) # Send system instruction. @@ -352,7 +342,7 @@ class AWSNovaSonicLLMService(LLMService): # (NOTE: this prioritizing occurred automatically behind the scenes: the context was # initialized with self._system_instruction and then updated itself from its messages when # get_messages_for_initializing_history() was called). - # print(f"[pk] connecting, with system instruction: {history.system_instruction}") + logger.debug(f"Using system instruction: {history.system_instruction}") if history.system_instruction: await self._send_text_event(text=history.system_instruction, role=Role.SYSTEM) @@ -366,9 +356,11 @@ class AWSNovaSonicLLMService(LLMService): # Start receiving events self._receive_task = self.create_task(self._receive_task_handler()) - # Record finished connecting time + # Record finished connecting time (must be done before sending assistant response trigger) self._connected_time = time.time() + logger.info("Finished connecting") + # If we need to, send assistant response trigger (depends on self._connected_time) if self._triggering_assistant_response: await self._send_assistant_response_trigger() @@ -376,18 +368,18 @@ class AWSNovaSonicLLMService(LLMService): async def _disconnect(self): try: + logger.info("Disconnecting...") + # NOTE: see explanation of HACK, below self._disconnecting = True # Clean up client if self._client: - print("[pk] Cleaning up client") await self._send_session_end_events() self._client = None # Clean up stream if self._stream: - print("[pk] Cleaning up stream") await self._stream.input_stream.close() self._stream = None @@ -414,6 +406,8 @@ class AWSNovaSonicLLMService(LLMService): self._triggering_assistant_response = False self._disconnecting = False self._connected_time = None + + logger.info("Finished disconnecting") except Exception as e: logger.error(f"{self} error disconnecting: {e}") @@ 
-611,8 +605,6 @@ class AWSNovaSonicLLMService(LLMService): if not self._stream: return - # print(f"[pk] sending tool result. tool call ID: {tool_call_id}, result: {result}") - content_name = str(uuid.uuid4()) result_content_start = f''' @@ -723,7 +715,6 @@ class AWSNovaSonicLLMService(LLMService): await self.reset_conversation() async def _handle_completion_start_event(self, event_json): - # print("[pk] completion start") pass async def _handle_content_start_event(self, event_json): @@ -744,10 +735,6 @@ class AWSNovaSonicLLMService(LLMService): ) self._content_being_received = content - # print(f"[pk] content start: {content}") - # if content.role == Role.ASSISTANT: - # print(f"[pk] assistant content start: {content}") - if content.role == Role.ASSISTANT: if content.type == ContentType.AUDIO: # Note that an assistant response can comprise of multiple audio blocks @@ -763,9 +750,6 @@ class AWSNovaSonicLLMService(LLMService): content = self._content_being_received text_content = event_json["textOutput"]["content"] - # print(f"[pk] text output. content: {text_content}") - # if content.role == Role.ASSISTANT: - # print(f"[pk] assistant text output. content: {text_content}") # Bookkeeping: augment the current content being received with text # Assumption: only one text content per content block @@ -778,7 +762,6 @@ class AWSNovaSonicLLMService(LLMService): # Get audio audio_content = event_json["audioOutput"]["content"] - # print(f"[pk] audio output. 
content: {len(audio_content)}") # Push audio frame audio = base64.b64decode(audio_content) @@ -800,10 +783,6 @@ class AWSNovaSonicLLMService(LLMService): tool_call_id = tool_use["toolUseId"] arguments = json.loads(tool_use["content"]) - # print( - # f"[pk] tool use - function_name: {function_name}, tool_call_id: {tool_call_id}, arguments: {arguments}" - # ) - # Call tool function if self.has_function(function_name): if function_name in self._functions.keys(): @@ -833,9 +812,6 @@ class AWSNovaSonicLLMService(LLMService): content_end = event_json["contentEnd"] stop_reason = content_end["stopReason"] - # print(f"[pk] content end: {content}.\n stop_reason: {stop_reason}") - # if content.role == Role.ASSISTANT: - # print(f"[pk] assistant content end: {content}.\n stop_reason: {stop_reason}") # Bookkeeping: clear current content being received self._content_being_received = None @@ -856,25 +832,24 @@ class AWSNovaSonicLLMService(LLMService): self._content_being_received = False async def _handle_completion_end_event(self, event_json): - # print("[pk] completion end") pass async def _report_assistant_response_started(self): + logger.debug("Assistant response started") + # Report that the assistant has started their response. - print("[pk] LLM full response started") await self.push_frame(LLMFullResponseStartFrame()) # Report that equivalent of TTS (this is a speech-to-speech model) started - print("[pk] TTS started") await self.push_frame(TTSStartedFrame()) async def _report_assistant_response_text_added(self, text): + logger.debug(f"Assistant response text added: {text}") + # Report some text added to the ongoing assistant response - print(f"[pk] LLM text: {text}") await self.push_frame(LLMTextFrame(text)) # Report some text added to the *equivalent* of TTS (this is a speech-to-speech model) - print(f"[pk] TTS text: {text}") await self.push_frame(TTSTextFrame(text)) # TODO: this is a (hopefully temporary) HACK. 
Here we directly manipulate the context rather @@ -890,19 +865,20 @@ class AWSNovaSonicLLMService(LLMService): self._context.buffer_assistant_text(text) async def _report_assistant_response_ended(self): + logger.debug("Assistant response ended") + # Report that the assistant has finished their response. - print("[pk] LLM full response ended") await self.push_frame(LLMFullResponseEndFrame()) # Report that equivalent of TTS (this is a speech-to-speech model) stopped. - print("[pk] TTS stopped") await self.push_frame(TTSStoppedFrame()) # For an explanation of this hack, see _report_assistant_response_text_added. self._context.flush_aggregated_assistant_text() async def _report_user_transcription_text_added(self, text): - print(f"[pk] transcription: {text}") + logger.debug(f"User transcription text added: {text}") + # Manually add new user transcription text to context. # We can't rely on the user context aggregator to do this since it's upstream from the LLM. self._context.add_user_transcription_text(text) @@ -960,6 +936,8 @@ class AWSNovaSonicLLMService(LLMService): self._triggering_assistant_response = False async def _send_assistant_response_trigger(self): + logger.debug("Sending assistant response trigger...") + chunk_duration = 0.02 # what we might get from InputAudioRawFrame chunk_size = int( chunk_duration @@ -980,6 +958,9 @@ class AWSNovaSonicLLMService(LLMService): else None ) if blank_audio_duration: + logger.debug( + f"Leading assistant response trigger with {blank_audio_duration}s of blank audio" + ) blank_audio_chunk = b"\x00" * chunk_size num_chunks = int(blank_audio_duration / chunk_duration) for _ in range(num_chunks): @@ -991,7 +972,6 @@ class AWSNovaSonicLLMService(LLMService): # if we ever need to seed this service again with context it would make sense to include it # since the instruction (i.e. the "wait for the trigger" instruction) will be part of the # context as well. - # print(f"[pk] sending trigger audio! 
{len(self._assistant_response_trigger_audio)}") audio_chunks = [ self._assistant_response_trigger_audio[i : i + chunk_size] for i in range(0, len(self._assistant_response_trigger_audio), chunk_size) diff --git a/src/pipecat/services/aws_nova_sonic/context.py b/src/pipecat/services/aws_nova_sonic/context.py index d96c2d1ed..a8d9c4dba 100644 --- a/src/pipecat/services/aws_nova_sonic/context.py +++ b/src/pipecat/services/aws_nova_sonic/context.py @@ -94,7 +94,6 @@ class AWSNovaSonicLLMContext(OpenAILLMContext): # Process remaining messages to fill out conversation history. # Nova Sonic supports "user" and "assistant" messages in history. - # print(f"[pk] standard messages: {messages}") for message in messages: history_message = self.from_standard_message(message) if history_message: @@ -136,11 +135,11 @@ class AWSNovaSonicLLMContext(OpenAILLMContext): "content": [{"type": "text", "text": text}], } self.add_message(message) - # print(f"[pk] context updated (user): {self.get_messages_for_logging()}") + # logger.debug(f"Context updated (user): {self.get_messages_for_logging()}") def buffer_assistant_text(self, text): - self._assistant_text += text # TODO: determine if we need to add space or something - # print(f"[pk] assistant text buffered: {self._assistant_text}") + self._assistant_text += text + # logger.debug(f"Assistant text buffered: {self._assistant_text}") def flush_aggregated_assistant_text(self): if not self._assistant_text: From 35848d10b36e71dd5761bf627a1b7794a6a11f86 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Tue, 6 May 2025 16:12:23 -0400 Subject: [PATCH 73/97] [WIP] AWS Nova Sonic service - remove various TODO comments --- src/pipecat/services/aws_nova_sonic/aws.py | 14 +------------- src/pipecat/services/aws_nova_sonic/context.py | 7 +------ 2 files changed, 2 insertions(+), 19 deletions(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index 1e318f164..cac3cd53f 100644 --- 
a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -132,7 +132,6 @@ class AWSNovaSonicLLMService(LLMService): def __init__( self, *, - # TODO: if we have instruction here as an alternative to using context, we should do the same for tools...right? secret_access_key: str, access_key_id: str, region: str, @@ -182,13 +181,6 @@ class AWSNovaSonicLLMService(LLMService): async def start(self, frame: StartFrame): await super().start(frame) self._wants_connection = True - # TODO: maybe connect but don't send history until we get all of our settings? - # how do we know how long to wait? - # ah, i think we'll *always* get at least one OpenAILLMContextFrame which kicks things off - # so we need to send the initial history when: - # - we're connected - # - we've gotten the first context - # i *think* this is what's controlled by _api_session_ready/_run_llm_when_api_session_ready await self._start_connecting() async def stop(self, frame: EndFrame): @@ -247,7 +239,6 @@ class AWSNovaSonicLLMService(LLMService): if self._triggering_assistant_response: return - # TODO: check if _audio_input_paused? what causes that? await self._send_user_audio_event(frame.audio) async def _handle_bot_stopped_speaking(self): @@ -417,9 +408,7 @@ class AWSNovaSonicLLMService(LLMService): region=self._region, aws_credentials_identity_resolver=StaticCredentialsResolver( credentials=AWSCredentialsIdentity( - access_key_id=self._access_key_id, - secret_access_key=self._secret_access_key, - # TODO: add additional stuff like aws_session_token + access_key_id=self._access_key_id, secret_access_key=self._secret_access_key ) ), http_auth_scheme_resolver=HTTPAuthSchemeResolver(), @@ -431,7 +420,6 @@ class AWSNovaSonicLLMService(LLMService): # LLM communication: input events (pipecat -> LLM) # - # TODO: make params configurable? 
async def _send_session_start_event(self): session_start = f""" {{ diff --git a/src/pipecat/services/aws_nova_sonic/context.py b/src/pipecat/services/aws_nova_sonic/context.py index a8d9c4dba..561ae53db 100644 --- a/src/pipecat/services/aws_nova_sonic/context.py +++ b/src/pipecat/services/aws_nova_sonic/context.py @@ -150,7 +150,7 @@ class AWSNovaSonicLLMContext(OpenAILLMContext): } self._assistant_text = "" self.add_message(message) - # print(f"[pk] context updated (assistant): {self.get_messages_for_logging()}") + # logger.debug(f"Context updated (assistant): {self.get_messages_for_logging()}") @dataclass @@ -168,11 +168,6 @@ class AWSNovaSonicUserContextAggregator(OpenAIUserContextAggregator): if isinstance(frame, LLMMessagesUpdateFrame): await self.push_frame(AWSNovaSonicMessagesUpdateFrame(context=self._context)) - # Parent also doesn't push the LLMSetToolsFrame - # TODO: this - # if isinstance(frame, LLMSetToolsFrame): - # await self.push_frame(frame, direction) - class AWSNovaSonicAssistantContextAggregator(OpenAIAssistantContextAggregator): async def process_frame(self, frame: Frame, direction: FrameDirection): From 5579145a0630a10fefbb1d9f1b71dfa599c84d07 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Tue, 6 May 2025 21:19:16 -0400 Subject: [PATCH 74/97] [WIP] AWS Nova Sonic service - post-rebase, update examples to play nicely with recent pipecat changes --- .../20e-persistent-context-aws-nova-sonic.py | 42 +++++++++---------- examples/foundational/39-aws-nova-sonic.py | 14 +++---- 2 files changed, 27 insertions(+), 29 deletions(-) diff --git a/examples/foundational/20e-persistent-context-aws-nova-sonic.py b/examples/foundational/20e-persistent-context-aws-nova-sonic.py index 8ac1508f8..e092730fb 100644 --- a/examples/foundational/20e-persistent-context-aws-nova-sonic.py +++ b/examples/foundational/20e-persistent-context-aws-nova-sonic.py @@ -4,6 +4,7 @@ # SPDX-License-Identifier: BSD 2-Clause License # +import argparse import asyncio import glob 
import json @@ -22,6 +23,7 @@ from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext from pipecat.services.aws_nova_sonic.aws import AWSNovaSonicLLMService +from pipecat.services.llm_service import FunctionCallParams from pipecat.transports.base_transport import TransportParams from pipecat.transports.network.small_webrtc import SmallWebRTCTransport from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection @@ -31,21 +33,19 @@ load_dotenv(override=True) BASE_FILENAME = "/tmp/pipecat_conversation_" -async def fetch_weather_from_api(function_name, tool_call_id, args, llm, context, result_callback): - temperature = 75 if args["format"] == "fahrenheit" else 24 - await result_callback( +async def fetch_weather_from_api(params: FunctionCallParams): + temperature = 75 if params.arguments["format"] == "fahrenheit" else 24 + await params.result_callback( { "conditions": "nice", "temperature": temperature, - "format": args["format"], + "format": params.arguments["format"], "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"), } ) -async def get_saved_conversation_filenames( - function_name, tool_call_id, args, llm, context, result_callback -): +async def get_saved_conversation_filenames(params: FunctionCallParams): # Construct the full pattern including the BASE_FILENAME full_pattern = f"{BASE_FILENAME}*.json" @@ -53,7 +53,7 @@ async def get_saved_conversation_filenames( matching_files = glob.glob(full_pattern) logger.debug(f"matching files: {matching_files}") - await result_callback({"filenames": matching_files}) + await params.result_callback({"filenames": matching_files}) # async def get_saved_conversation_filenames( @@ -69,26 +69,26 @@ async def get_saved_conversation_filenames( # await result_callback({"filenames": matching_files}) -async def save_conversation(function_name, tool_call_id, args, llm, context, 
result_callback): +async def save_conversation(params: FunctionCallParams): timestamp = datetime.now().strftime("%Y-%m-%d_%H:%M:%S") filename = f"{BASE_FILENAME}{timestamp}.json" logger.debug( - f"writing conversation to {filename}\n{json.dumps(context.get_messages_for_persistent_storage(), indent=4)}" + f"writing conversation to {filename}\n{json.dumps(params.context.get_messages_for_persistent_storage(), indent=4)}" ) try: with open(filename, "w") as file: - messages = context.get_messages_for_persistent_storage() + messages = params.context.get_messages_for_persistent_storage() # remove the last message, which is the instruction we just gave to save the conversation messages.pop() json.dump(messages, file, indent=2) - await result_callback({"success": True}) + await params.result_callback({"success": True}) except Exception as e: - await result_callback({"success": False, "error": str(e)}) + await params.result_callback({"success": False, "error": str(e)}) -async def load_conversation(function_name, tool_call_id, args, llm, context, result_callback): +async def load_conversation(params: FunctionCallParams): async def _reset(): - filename = args["filename"] + filename = params.arguments["filename"] logger.debug(f"loading conversation from {filename}") try: with open(filename, "r") as file: @@ -99,11 +99,11 @@ async def load_conversation(function_name, tool_call_id, args, llm, context, res "content": f"{AWSNovaSonicLLMService.AWAIT_TRIGGER_ASSISTANT_RESPONSE_INSTRUCTION}", } ) - context.set_messages(messages) - await llm.reset_conversation() - await llm.trigger_assistant_response() + params.context.set_messages(messages) + await params.llm.reset_conversation() + await params.llm.trigger_assistant_response() except Exception as e: - await result_callback({"success": False, "error": str(e)}) + await params.result_callback({"success": False, "error": str(e)}) asyncio.create_task(_reset()) @@ -161,7 +161,7 @@ tools = ToolsSchema( ) -async def 
run_bot(webrtc_connection: SmallWebRTCConnection): +async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespace): logger.info(f"Starting bot") transport = SmallWebRTCTransport( @@ -169,9 +169,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection): params=TransportParams( audio_in_enabled=True, audio_out_enabled=True, - vad_enabled=True, vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.8)), - vad_audio_passthrough=True, ), ) diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py index af44cf790..9decc47ae 100644 --- a/examples/foundational/39-aws-nova-sonic.py +++ b/examples/foundational/39-aws-nova-sonic.py @@ -4,6 +4,7 @@ # SPDX-License-Identifier: BSD 2-Clause License # +import argparse import os from datetime import datetime @@ -19,6 +20,7 @@ from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext from pipecat.services.aws_nova_sonic import AWSNovaSonicLLMService +from pipecat.services.llm_service import FunctionCallParams from pipecat.transports.base_transport import TransportParams from pipecat.transports.network.small_webrtc import SmallWebRTCTransport from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection @@ -27,13 +29,13 @@ from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection load_dotenv(override=True) -async def fetch_weather_from_api(function_name, tool_call_id, args, llm, context, result_callback): - temperature = 75 if args["format"] == "fahrenheit" else 24 - await result_callback( +async def fetch_weather_from_api(params: FunctionCallParams): + temperature = 75 if params.arguments["format"] == "fahrenheit" else 24 + await params.result_callback( { "conditions": "nice", "temperature": temperature, - "format": args["format"], + "format": params.arguments["format"], "timestamp": 
datetime.now().strftime("%Y%m%d_%H%M%S"), } ) @@ -60,7 +62,7 @@ weather_function = FunctionSchema( tools = ToolsSchema(standard_tools=[weather_function]) -async def run_bot(webrtc_connection: SmallWebRTCConnection): +async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespace): logger.info(f"Starting bot") # Initialize the SmallWebRTCTransport with the connection @@ -71,8 +73,6 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection): audio_in_sample_rate=16000, audio_out_enabled=True, camera_in_enabled=False, - vad_enabled=True, - vad_audio_passthrough=True, vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.8)), ), ) From 84736472694592220b393830fb4f61c06ffd84d0 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Tue, 6 May 2025 21:57:41 -0400 Subject: [PATCH 75/97] [WIP] AWS Nova Sonic service - update persistent-context example to better avoid saving "transitional", as opposed to meaningful, context messages --- .../20e-persistent-context-aws-nova-sonic.py | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/examples/foundational/20e-persistent-context-aws-nova-sonic.py b/examples/foundational/20e-persistent-context-aws-nova-sonic.py index e092730fb..1519f1c53 100644 --- a/examples/foundational/20e-persistent-context-aws-nova-sonic.py +++ b/examples/foundational/20e-persistent-context-aws-nova-sonic.py @@ -72,15 +72,24 @@ async def get_saved_conversation_filenames(params: FunctionCallParams): async def save_conversation(params: FunctionCallParams): timestamp = datetime.now().strftime("%Y-%m-%d_%H:%M:%S") filename = f"{BASE_FILENAME}{timestamp}.json" - logger.debug( - f"writing conversation to {filename}\n{json.dumps(params.context.get_messages_for_persistent_storage(), indent=4)}" - ) try: with open(filename, "w") as file: messages = params.context.get_messages_for_persistent_storage() - # remove the last message, which is the instruction we just gave to save the conversation - messages.pop() 
- json.dump(messages, file, indent=2) + # remove the last few messages. in reverse order, they are: + # - the in progress save tool call + # - the invocation of the save tool call + # - the user ask to save (which may encompass one or more messages) + # the simplest thing to do is to pop messages until the last one is an assistant + # response + while messages and not ( + messages[-1].get("role") == "assistant" and "content" in messages[-1] + ): + messages.pop() + if messages: # we never expect this to be empty + logger.debug( + f"writing conversation to {filename}\n{json.dumps(messages, indent=4)}" + ) + json.dump(messages, file, indent=2) await params.result_callback({"success": True}) except Exception as e: await params.result_callback({"success": False, "error": str(e)}) From ed06cdd2c7ea2a27e95b207cc5b72cdc625cf2bd Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Tue, 6 May 2025 22:03:25 -0400 Subject: [PATCH 76/97] [WIP] AWS Nova Sonic service - add CHANGELOG entry --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2eec61bca..bf6fec1b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added support for the AWS Nova Sonic speech-to-speech model with the new + `AWSNovaSonicLLMService`. + (see https://docs.aws.amazon.com/nova/latest/userguide/speech.html) + - Added new AWS services `AWSBedrockLLMService` and `AWSTranscribeSTTService`. - Added `on_active_speaker_changed` event handler to the `DailyTransport` class. 
From 896f8d85f70a43f6c20c74e77fe67abc48925967 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Tue, 6 May 2025 22:08:55 -0400 Subject: [PATCH 77/97] [WIP] AWS Nova Sonic service - remove out-of-date TODO comment --- examples/foundational/39-aws-nova-sonic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/foundational/39-aws-nova-sonic.py b/examples/foundational/39-aws-nova-sonic.py index 9decc47ae..4ed533e18 100644 --- a/examples/foundational/39-aws-nova-sonic.py +++ b/examples/foundational/39-aws-nova-sonic.py @@ -108,7 +108,6 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac # Set up context and context management. # AWSNovaSonicService will adapt OpenAI LLM context objects with standard message format to # what's expected by Nova Sonic. - # TODO: since we can't trigger a response upon joining, this isn't particularly useful context = OpenAILLMContext( messages=[ {"role": "system", "content": f"{system_instruction}"}, From 27bff7a75963298bd2325520afbc50c9f4dddc26 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Wed, 7 May 2025 09:11:27 -0400 Subject: [PATCH 78/97] [WIP] AWS Nova Sonic service - fix comment --- src/pipecat/adapters/services/aws_nova_sonic_adapter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pipecat/adapters/services/aws_nova_sonic_adapter.py b/src/pipecat/adapters/services/aws_nova_sonic_adapter.py index b96980046..dc7eef92d 100644 --- a/src/pipecat/adapters/services/aws_nova_sonic_adapter.py +++ b/src/pipecat/adapters/services/aws_nova_sonic_adapter.py @@ -4,7 +4,7 @@ # SPDX-License-Identifier: BSD 2-Clause License # import json -from typing import Any, Dict, List, Union +from typing import Any, Dict, List from pipecat.adapters.base_llm_adapter import BaseLLMAdapter from pipecat.adapters.schemas.function_schema import FunctionSchema @@ -31,9 +31,9 @@ class AWSNovaSonicLLMAdapter(BaseLLMAdapter): } def to_provider_tools_format(self, tools_schema: ToolsSchema) -> 
List[Dict[str, Any]]: - """Converts function schemas to Openai Realtime function-calling format. + """Converts function schemas to AWS Nova Sonic function-calling format. - :return: Openai Realtime formatted function call definition. + :return: AWS Nova Sonic formatted function call definition. """ functions_schema = tools_schema.standard_tools From 4ba9a428610e074124499849fb0a8e22ccadd5d5 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Wed, 7 May 2025 09:52:05 -0400 Subject: [PATCH 79/97] [WIP] AWS Nova Sonic service - add more accurate typing --- src/pipecat/services/aws_nova_sonic/aws.py | 74 ++++++++++++++-------- 1 file changed, 47 insertions(+), 27 deletions(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index cac3cd53f..a2ef78f88 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -148,30 +148,34 @@ class AWSNovaSonicLLMService(LLMService): self._access_key_id = access_key_id self._region = region self._model = model - self._client: BedrockRuntimeClient = None + self._client: Optional[BedrockRuntimeClient] = None self._voice_id = voice_id self._params = params self._system_instruction = system_instruction self._tools = tools self._send_transcription_frames = send_transcription_frames - self._context: AWSNovaSonicLLMContext = None - self._stream: DuplexEventStream[ - InvokeModelWithBidirectionalStreamInput, - InvokeModelWithBidirectionalStreamOutput, - InvokeModelWithBidirectionalStreamOperationOutput, + self._context: Optional[AWSNovaSonicLLMContext] = None + self._stream: Optional[ + DuplexEventStream[ + InvokeModelWithBidirectionalStreamInput, + InvokeModelWithBidirectionalStreamOutput, + InvokeModelWithBidirectionalStreamOperationOutput, + ] ] = None - self._receive_task = None - self._prompt_name = None - self._input_audio_content_name = None - self._content_being_received = None + self._receive_task: Optional[asyncio.Task] = None + 
self._prompt_name: Optional[str] = None + self._input_audio_content_name: Optional[str] = None + self._content_being_received: Optional[CurrentContent] = None self._assistant_is_responding = False self._context_available = False self._ready_to_send_context = False self._handling_bot_stopped_speaking = False self._triggering_assistant_response = False - self._assistant_response_trigger_audio: bytes = None # Not cleared on _disconnect() + self._assistant_response_trigger_audio: Optional[bytes] = ( + None # Not cleared on _disconnect() + ) self._disconnecting = False - self._connected_time: float = None + self._connected_time: Optional[float] = None self._wants_connection = False # @@ -437,6 +441,9 @@ class AWSNovaSonicLLMService(LLMService): await self._send_client_event(session_start) async def _send_prompt_start_event(self, tools: List[Any]): + if not self._prompt_name: + return + tools_config = ( f""", "toolUseOutputConfiguration": {{ @@ -474,6 +481,9 @@ class AWSNovaSonicLLMService(LLMService): await self._send_client_event(prompt_start) async def _send_audio_input_start_event(self): + if not self._prompt_name: + return + audio_content_start = f''' {{ "event": {{ @@ -498,7 +508,7 @@ class AWSNovaSonicLLMService(LLMService): await self._send_client_event(audio_content_start) async def _send_text_event(self, text: str, role: Role): - if not self._stream or not text: + if not self._stream or not self._prompt_name or not text: return content_name = str(uuid.uuid4()) @@ -566,7 +576,7 @@ class AWSNovaSonicLLMService(LLMService): await self._send_client_event(audio_event) async def _send_session_end_events(self): - if not self._stream: + if not self._stream or not self._prompt_name: return prompt_end = f''' @@ -590,7 +600,7 @@ class AWSNovaSonicLLMService(LLMService): await self._send_client_event(session_end) async def _send_tool_result(self, tool_call_id, result): - if not self._stream: + if not self._stream or not self._prompt_name: return content_name = 
str(uuid.uuid4()) @@ -643,6 +653,9 @@ class AWSNovaSonicLLMService(LLMService): await self._send_client_event(result_content_end) async def _send_client_event(self, event_json: str): + if not self._stream: # should never happen + return + event = InvokeModelWithBidirectionalStreamInputChunk( value=BidirectionalInputPayloadPart(bytes_=event_json.encode("utf-8")) ) @@ -732,8 +745,7 @@ class AWSNovaSonicLLMService(LLMService): await self._report_assistant_response_started() async def _handle_text_output_event(self, event_json): - # This should never happen - if not self._content_being_received: + if not self._content_being_received: # should never happen return content = self._content_being_received @@ -744,8 +756,7 @@ class AWSNovaSonicLLMService(LLMService): content.text_content = text_content async def _handle_audio_output_event(self, event_json): - # This should never happen - if not self._content_being_received: + if not self._content_being_received: # should never happen return # Get audio @@ -761,8 +772,7 @@ class AWSNovaSonicLLMService(LLMService): await self.push_frame(frame) async def _handle_tool_use_event(self, event_json): - # This should never happen - if not self._content_being_received: + if not self._content_being_received or not self._context: # should never happen return # Get tool use details @@ -793,8 +803,7 @@ class AWSNovaSonicLLMService(LLMService): ) async def _handle_content_end_event(self, event_json): - # This should never happen - if not self._content_being_received: + if not self._content_being_received: # should never happen return content = self._content_being_received @@ -817,8 +826,6 @@ class AWSNovaSonicLLMService(LLMService): # User transcription text added await self._report_user_transcription_text_added(content.text_content) - self._content_being_received = False - async def _handle_completion_end_event(self, event_json): pass @@ -832,6 +839,9 @@ class AWSNovaSonicLLMService(LLMService): await self.push_frame(TTSStartedFrame()) 
async def _report_assistant_response_text_added(self, text): + if not self._context: # should never happen + return + logger.debug(f"Assistant response text added: {text}") # Report some text added to the ongoing assistant response @@ -853,6 +863,9 @@ class AWSNovaSonicLLMService(LLMService): self._context.buffer_assistant_text(text) async def _report_assistant_response_ended(self): + if not self._context: # should never happen + return + logger.debug("Assistant response ended") # Report that the assistant has finished their response. @@ -865,6 +878,9 @@ class AWSNovaSonicLLMService(LLMService): self._context.flush_aggregated_assistant_text() async def _report_user_transcription_text_added(self, text): + if not self._context: # should never happen + return + logger.debug(f"User transcription text added: {text}") # Manually add new user transcription text to context. @@ -918,12 +934,16 @@ class AWSNovaSonicLLMService(LLMService): self._assistant_response_trigger_audio = wav_file.readframes(wav_file.getnframes()) # Send the trigger audio, if we're fully connected and set up - # NOTE: maybe there's a better way to determine whether we're done setting up? 
- if self._receive_task: + if self._connected_time is not None: await self._send_assistant_response_trigger() self._triggering_assistant_response = False async def _send_assistant_response_trigger(self): + if ( + not self._assistant_response_trigger_audio or self._connected_time is None + ): # should never happen + return + logger.debug("Sending assistant response trigger...") chunk_duration = 0.02 # what we might get from InputAudioRawFrame From 52036138c1ff2365f17c4b03df89e711b1aeb51a Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Wed, 7 May 2025 10:22:51 -0400 Subject: [PATCH 80/97] [WIP] AWS Nova Sonic service - remove unnecessary (no-op) code --- src/pipecat/services/aws_nova_sonic/aws.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index a2ef78f88..6838daad6 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -269,7 +269,6 @@ class AWSNovaSonicLLMService(LLMService): await asyncio.sleep(0.25) self._assistant_is_responding = False await self._report_assistant_response_ended() - self._handling_bot_stopped_speaking = False self._handling_bot_stopped_speaking = False From b013e375fb90f0810e1c1b367b5834e5f70d8fdc Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Wed, 7 May 2025 10:38:23 -0400 Subject: [PATCH 81/97] [WIP] AWS Nova Sonic service - simplify a bit of logic (and do the same simplification in the OpenAI Realtime service) --- src/pipecat/services/aws_nova_sonic/aws.py | 9 +-------- src/pipecat/services/openai_realtime_beta/openai.py | 10 +--------- 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index 6838daad6..3f41a0166 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -782,14 +782,7 @@ class AWSNovaSonicLLMService(LLMService): # Call tool 
function if self.has_function(function_name): - if function_name in self._functions.keys(): - await self.call_function( - context=self._context, - tool_call_id=tool_call_id, - function_name=function_name, - arguments=arguments, - ) - elif None in self._functions.keys(): + if function_name in self._functions.keys() or None in self._functions.keys(): await self.call_function( context=self._context, tool_call_id=tool_call_id, diff --git a/src/pipecat/services/openai_realtime_beta/openai.py b/src/pipecat/services/openai_realtime_beta/openai.py index 334ce98c8..0c37f73ce 100644 --- a/src/pipecat/services/openai_realtime_beta/openai.py +++ b/src/pipecat/services/openai_realtime_beta/openai.py @@ -577,15 +577,7 @@ class OpenAIRealtimeBetaLLMService(LLMService): arguments = json.loads(item.arguments) if self.has_function(function_name): run_llm = index == total_items - 1 - if function_name in self._functions.keys(): - await self.call_function( - context=self._context, - tool_call_id=tool_id, - function_name=function_name, - arguments=arguments, - run_llm=run_llm, - ) - elif None in self._functions.keys(): + if function_name in self._functions.keys() or None in self._functions.keys(): await self.call_function( context=self._context, tool_call_id=tool_id, From c78f7798004a505522765c00b3e043ffbaa6784d Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Wed, 7 May 2025 10:55:43 -0400 Subject: [PATCH 82/97] [WIP] AWS Nova Sonic service - log an error message if you try to use AWS Nova Sonic without the proper dependency (e.g. 
without having done `pip install pipecat-ai[aws]`) --- src/pipecat/services/aws_nova_sonic/aws.py | 40 ++++++++++++---------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index 3f41a0166..b4989185a 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -15,23 +15,8 @@ from enum import Enum from importlib.resources import files from typing import Any, List, Optional -from aws_sdk_bedrock_runtime.client import ( - BedrockRuntimeClient, - InvokeModelWithBidirectionalStreamOperationInput, -) -from aws_sdk_bedrock_runtime.config import Config, HTTPAuthSchemeResolver, SigV4AuthScheme -from aws_sdk_bedrock_runtime.models import ( - BidirectionalInputPayloadPart, - InvokeModelWithBidirectionalStreamInput, - InvokeModelWithBidirectionalStreamInputChunk, - InvokeModelWithBidirectionalStreamOperationOutput, - InvokeModelWithBidirectionalStreamOutput, -) from loguru import logger from pydantic import BaseModel, Field -from smithy_aws_core.credentials_resolvers.static import StaticCredentialsResolver -from smithy_aws_core.identity import AWSCredentialsIdentity -from smithy_core.aio.eventstream import DuplexEventStream from pipecat.adapters.schemas.tools_schema import ToolsSchema from pipecat.adapters.services.aws_nova_sonic_adapter import AWSNovaSonicLLMAdapter @@ -45,15 +30,11 @@ from pipecat.frames.frames import ( LLMFullResponseStartFrame, LLMTextFrame, StartFrame, - StartInterruptionFrame, - StopInterruptionFrame, TranscriptionFrame, TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame, TTSTextFrame, - UserStartedSpeakingFrame, - UserStoppedSpeakingFrame, ) from pipecat.processors.aggregators.llm_response import ( LLMAssistantAggregatorParams, @@ -75,6 +56,27 @@ from pipecat.services.aws_nova_sonic.frames import AWSNovaSonicFunctionCallResul from pipecat.services.llm_service import LLMService from pipecat.utils.time 
import time_now_iso8601 +try: + from aws_sdk_bedrock_runtime.client import ( + BedrockRuntimeClient, + InvokeModelWithBidirectionalStreamOperationInput, + ) + from aws_sdk_bedrock_runtime.config import Config, HTTPAuthSchemeResolver, SigV4AuthScheme + from aws_sdk_bedrock_runtime.models import ( + BidirectionalInputPayloadPart, + InvokeModelWithBidirectionalStreamInput, + InvokeModelWithBidirectionalStreamInputChunk, + InvokeModelWithBidirectionalStreamOperationOutput, + InvokeModelWithBidirectionalStreamOutput, + ) + from smithy_aws_core.credentials_resolvers.static import StaticCredentialsResolver + from smithy_aws_core.identity import AWSCredentialsIdentity + from smithy_core.aio.eventstream import DuplexEventStream +except ModuleNotFoundError as e: + logger.error(f"Exception: {e}") + logger.error("In order to use AWS services, you need to `pip install pipecat-ai[aws]`.") + raise Exception(f"Missing module: {e}") + class AWSNovaSonicUnhandledFunctionException(Exception): pass From 1491462d157509eb336d5111a4f4a8f875e2cd90 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Wed, 7 May 2025 11:10:54 -0400 Subject: [PATCH 83/97] [WIP] AWS Nova Sonic service - remove `_handling_bot_stopped_speaking`, which no longer seems to be necessary; I'm no longer observing back-to-back `BotStoppedSpeaking` frames --- src/pipecat/services/aws_nova_sonic/aws.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index b4989185a..9f2f1f72e 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -171,7 +171,6 @@ class AWSNovaSonicLLMService(LLMService): self._assistant_is_responding = False self._context_available = False self._ready_to_send_context = False - self._handling_bot_stopped_speaking = False self._triggering_assistant_response = False self._assistant_response_trigger_audio: Optional[bytes] = ( None # Not cleared on _disconnect() 
@@ -248,10 +247,6 @@ class AWSNovaSonicLLMService(LLMService): await self._send_user_audio_event(frame.audio) async def _handle_bot_stopped_speaking(self): - # Protect against back-to-back BotStoppedSpeaking calls, which I've observed - if self._handling_bot_stopped_speaking: - return - self._handling_bot_stopped_speaking = True if self._assistant_is_responding: # Consider the assistant finished with their response (after a short delay, to allow for @@ -272,8 +267,6 @@ class AWSNovaSonicLLMService(LLMService): self._assistant_is_responding = False await self._report_assistant_response_ended() - self._handling_bot_stopped_speaking = False - async def _handle_function_call_result(self, frame: AWSNovaSonicFunctionCallResultFrame): result = frame.result_frame await self._send_tool_result(tool_call_id=result.tool_call_id, result=result.result) @@ -398,7 +391,6 @@ class AWSNovaSonicLLMService(LLMService): self._assistant_is_responding = False self._context_available = False self._ready_to_send_context = False - self._handling_bot_stopped_speaking = False self._triggering_assistant_response = False self._disconnecting = False self._connected_time = None From b53f9235e4bd9f9fefe6d0e0b6761e972a3d8bf0 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Wed, 7 May 2025 11:42:37 -0400 Subject: [PATCH 84/97] [WIP] AWS Nova Sonic service - remove unnecessary `_context_available` state, instead just relying on the presence of `_context` --- src/pipecat/services/aws_nova_sonic/aws.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index 9f2f1f72e..4056d0ed0 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -169,7 +169,6 @@ class AWSNovaSonicLLMService(LLMService): self._input_audio_content_name: Optional[str] = None self._content_being_received: Optional[CurrentContent] = None self._assistant_is_responding 
= False - self._context_available = False self._ready_to_send_context = False self._triggering_assistant_response = False self._assistant_response_trigger_audio: Optional[bytes] = ( @@ -205,11 +204,13 @@ class AWSNovaSonicLLMService(LLMService): async def reset_conversation(self): logger.debug("Resetting conversation") await self._handle_bot_stopped_speaking() + + # Carry over previous context through disconnect + context = self._context await self._disconnect() + self._context = context + await self._start_connecting() - # Use existing context - self._context_available = True - await self._finish_connecting_if_context_available() # # frame processing @@ -235,7 +236,6 @@ class AWSNovaSonicLLMService(LLMService): self._context = AWSNovaSonicLLMContext.upgrade_to_nova_sonic( context, self._system_instruction ) - self._context_available = True await self._finish_connecting_if_context_available() async def _handle_input_audio_frame(self, frame: InputAudioRawFrame): @@ -247,7 +247,6 @@ class AWSNovaSonicLLMService(LLMService): await self._send_user_audio_event(frame.audio) async def _handle_bot_stopped_speaking(self): - if self._assistant_is_responding: # Consider the assistant finished with their response (after a short delay, to allow for # any FINAL text block to come in). 
@@ -308,7 +307,7 @@ class AWSNovaSonicLLMService(LLMService): async def _finish_connecting_if_context_available(self): # We can only finish connecting once we've gotten our initial context and we're ready to # send it - if not (self._context_available and self._ready_to_send_context): + if not (self._context and self._ready_to_send_context): return logger.info("Finishing connecting (setting up session)...") @@ -389,7 +388,6 @@ class AWSNovaSonicLLMService(LLMService): self._input_audio_content_name = None self._content_being_received = None self._assistant_is_responding = False - self._context_available = False self._ready_to_send_context = False self._triggering_assistant_response = False self._disconnecting = False From 93c9cc4a0e7dbd27f9d5adcbcb76215b35184bbf Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Wed, 7 May 2025 12:03:23 -0400 Subject: [PATCH 85/97] [WIP] AWS Nova Sonic service - minor fix --- src/pipecat/services/aws_nova_sonic/aws.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index 4056d0ed0..eab12272c 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -669,7 +669,7 @@ class AWSNovaSonicLLMService(LLMService): # The overall completion is wrapped by "completionStart" and "completionEnd" events. async def _receive_task_handler(self): try: - while self._client and not self._disconnecting: + while self._stream and not self._disconnecting: output = await self._stream.await_output() result = await output[1].receive() From 2920aa5af477720227666d93e057d7b25424b36b Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Wed, 7 May 2025 14:32:32 -0400 Subject: [PATCH 86/97] [WIP] AWS Nova Sonic service - pull AWS Nova Sonic support out of the `aws` optional dependency in pyproject.toml and into its own `aws-nova-sonic` optional dependency. 
That's because it requires Python >= 3.12, a higher version than the base project's 3.10. This change allows anyone using any of the other AWS services (including our own unit tests) to continue using the lower Python version. --- CHANGELOG.md | 3 ++- pyproject.toml | 3 ++- src/pipecat/services/aws_nova_sonic/aws.py | 4 +++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bf6fec1b2..319dce632 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added support for the AWS Nova Sonic speech-to-speech model with the new `AWSNovaSonicLLMService`. - (see https://docs.aws.amazon.com/nova/latest/userguide/speech.html) + See https://docs.aws.amazon.com/nova/latest/userguide/speech.html. + Note that it requires Python >= 3.12 and `pip install pipecat-ai[aws-nova-sonic]`. - Added new AWS services `AWSBedrockLLMService` and `AWSTranscribeSTTService`. diff --git a/pyproject.toml b/pyproject.toml index 7ce167d77..06d7fb0a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,8 @@ Website = "https://pipecat.ai" [project.optional-dependencies] anthropic = [ "anthropic~=0.49.0" ] assemblyai = [ "assemblyai~=0.37.0" ] -aws = [ "boto3~=1.37.16", "websockets~=13.1", "aws_sdk_bedrock_runtime~=0.0.2" ] +aws = [ "boto3~=1.37.16", "websockets~=13.1" ] +aws-nova-sonic = [ "aws_sdk_bedrock_runtime~=0.0.2" ] azure = [ "azure-cognitiveservices-speech~=1.42.0"] cartesia = [ "cartesia~=1.4.0", "websockets~=13.1" ] cerebras = [] diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index eab12272c..b53578f5a 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -74,7 +74,9 @@ try: from smithy_core.aio.eventstream import DuplexEventStream except ModuleNotFoundError as e: logger.error(f"Exception: {e}") - logger.error("In order to use AWS services, 
you need to `pip install pipecat-ai[aws]`.") + logger.error( + "In order to use AWS services, you need to `pip install pipecat-ai[aws-nova-sonic]`." + ) raise Exception(f"Missing module: {e}") From a3038afa023b4a609fb62dffec21f4da3e780078 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Wed, 7 May 2025 11:39:36 -0700 Subject: [PATCH 87/97] DailyTransport: fix multiple audio/video sources --- CHANGELOG.md | 3 +++ src/pipecat/transports/services/daily.py | 17 +++++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 319dce632..16da15420 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed +- Fixed a `DailyTransport` issue that was causing issues when multiple audio or + video sources were being captured. + - Fixed a `UltravoxSTTService` issue that would cause the service to generate all tokens as one word.
diff --git a/src/pipecat/transports/services/daily.py b/src/pipecat/transports/services/daily.py index 5d00e76bc..9118b2107 100644 --- a/src/pipecat/transports/services/daily.py +++ b/src/pipecat/transports/services/daily.py @@ -700,7 +700,7 @@ class DailyTransportClient(EventHandler): await self.update_subscriptions(participant_settings={participant_id: media}) - self._audio_renderers[participant_id] = {audio_source: callback} + self._audio_renderers.setdefault(participant_id, {})[audio_source] = callback self._client.set_audio_renderer( participant_id, @@ -724,7 +724,7 @@ class DailyTransportClient(EventHandler): await self.update_subscriptions(participant_settings={participant_id: media}) - self._video_renderers[participant_id] = {video_source: callback} + self._video_renderers.setdefault(participant_id, {})[video_source] = callback self._client.set_video_renderer( participant_id, @@ -1061,12 +1061,13 @@ class DailyInputTransport(BaseInputTransport): video_source: str = "camera", color_format: str = "RGB", ): - self._video_renderers[participant_id] = { - video_source: { - "framerate": framerate, - "timestamp": 0, - "render_next_frame": [], - } + if participant_id not in self._video_renderers: + self._video_renderers[participant_id] = {} + + self._video_renderers[participant_id][video_source] = { + "framerate": framerate, + "timestamp": 0, + "render_next_frame": [], } await self._client.capture_participant_video( From ed00f7d071973ac64bce555b0ff20aff323eede4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Wed, 7 May 2025 11:42:16 -0700 Subject: [PATCH 88/97] add video_source field to UserImageRequestFrame --- CHANGELOG.md | 3 +++ src/pipecat/frames/frames.py | 3 ++- src/pipecat/services/llm_service.py | 2 ++ src/pipecat/transports/services/daily.py | 3 ++- 4 files changed, 9 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 16da15420..3d2132d8d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ 
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- `UserImageRequestFrame.video_source` field has been added to request an image + from the desired video source. + - Added support for the AWS Nova Sonic speech-to-speech model with the new `AWSNovaSonicLLMService`. See https://docs.aws.amazon.com/nova/latest/userguide/speech.html. diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py index 05f5b666d..8d3f38459 100644 --- a/src/pipecat/frames/frames.py +++ b/src/pipecat/frames/frames.py @@ -715,9 +715,10 @@ class UserImageRequestFrame(SystemFrame): context: Optional[Any] = None function_name: Optional[str] = None tool_call_id: Optional[str] = None + video_source: Optional[str] = None def __str__(self): - return f"{self.name}(user: {self.user_id}, function: {self.function_name}, request: {self.tool_call_id})" + return f"{self.name}(user: {self.user_id}, video_source: {self.video_source}, function: {self.function_name}, request: {self.tool_call_id})" @dataclass diff --git a/src/pipecat/services/llm_service.py b/src/pipecat/services/llm_service.py index 15b2bd6e5..21b62325d 100644 --- a/src/pipecat/services/llm_service.py +++ b/src/pipecat/services/llm_service.py @@ -190,6 +190,7 @@ class LLMService(AIService): function_name: Optional[str] = None, tool_call_id: Optional[str] = None, text_content: Optional[str] = None, + video_source: Optional[str] = None, ): await self.push_frame( UserImageRequestFrame( @@ -197,6 +198,7 @@ class LLMService(AIService): function_name=function_name, tool_call_id=tool_call_id, context=text_content, + video_source=video_source, ), FrameDirection.UPSTREAM, ) diff --git a/src/pipecat/transports/services/daily.py b/src/pipecat/transports/services/daily.py index 9118b2107..f1a514d0e 100644 --- a/src/pipecat/transports/services/daily.py +++ b/src/pipecat/transports/services/daily.py @@ -1076,7 +1076,8 @@ class DailyInputTransport(BaseInputTransport): async def 
request_participant_image(self, frame: UserImageRequestFrame): if frame.user_id in self._video_renderers: - self._video_renderers[frame.user_id]["render_next_frame"].append(frame) + video_source = frame.video_source if frame.video_source else "camera" + self._video_renderers[frame.user_id][video_source]["render_next_frame"].append(frame) async def _on_participant_video_frame( self, participant_id: str, video_frame: VideoFrame, video_source: str From cdf0953722fa7892d5c458dacf7b058654adfba2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Wed, 7 May 2025 11:56:36 -0700 Subject: [PATCH 89/97] pyproject: update daily-python to 0.18.2 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 13305933b..9c864c413 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ azure = [ "azure-cognitiveservices-speech~=1.42.0"] cartesia = [ "cartesia~=1.4.0", "websockets~=13.1" ] cerebras = [] deepseek = [] -daily = [ "daily-python~=0.18.1" ] +daily = [ "daily-python~=0.18.2" ] deepgram = [ "deepgram-sdk~=3.8.0" ] elevenlabs = [ "websockets~=13.1" ] fal = [ "fal-client~=0.5.9" ] From 84d040c6d0aed545091c0884611a2b5e13611e16 Mon Sep 17 00:00:00 2001 From: Paul Kompfner Date: Wed, 7 May 2025 16:21:47 -0400 Subject: [PATCH 90/97] AWS Nova Sonic service - make interruption handling more reliable, in terms of: - not getting the conversation into a "stuck" state - not losing assistant text that should've made it into the context --- src/pipecat/services/aws_nova_sonic/aws.py | 64 +++++++++++++++------- 1 file changed, 43 insertions(+), 21 deletions(-) diff --git a/src/pipecat/services/aws_nova_sonic/aws.py b/src/pipecat/services/aws_nova_sonic/aws.py index b53578f5a..410481065 100644 --- a/src/pipecat/services/aws_nova_sonic/aws.py +++ b/src/pipecat/services/aws_nova_sonic/aws.py @@ -172,6 +172,7 @@ class AWSNovaSonicLLMService(LLMService): self._content_being_received: 
Optional[CurrentContent] = None self._assistant_is_responding = False self._ready_to_send_context = False + self._handling_bot_stopped_speaking = False self._triggering_assistant_response = False self._assistant_response_trigger_audio: Optional[bytes] = ( None # Not cleared on _disconnect() @@ -205,7 +206,7 @@ class AWSNovaSonicLLMService(LLMService): async def reset_conversation(self): logger.debug("Resetting conversation") - await self._handle_bot_stopped_speaking() + await self._handle_bot_stopped_speaking(delay_to_catch_trailing_assistant_text=False) # Carry over previous context through disconnect context = self._context @@ -226,7 +227,7 @@ class AWSNovaSonicLLMService(LLMService): elif isinstance(frame, InputAudioRawFrame): await self._handle_input_audio_frame(frame) elif isinstance(frame, BotStoppedSpeakingFrame): - await self._handle_bot_stopped_speaking() + await self._handle_bot_stopped_speaking(delay_to_catch_trailing_assistant_text=True) elif isinstance(frame, AWSNovaSonicFunctionCallResultFrame): await self._handle_function_call_result(frame) @@ -248,25 +249,45 @@ class AWSNovaSonicLLMService(LLMService): await self._send_user_audio_event(frame.audio) - async def _handle_bot_stopped_speaking(self): - if self._assistant_is_responding: - # Consider the assistant finished with their response (after a short delay, to allow for - # any FINAL text block to come in). - # - # TODO: ideally we could base this solely on the LLM output events, but I couldn't - # figure out a reliable way to determine when we've gotten our last FINAL text block - # after the LLM is done talking. - # - # First I looked at stopReason, but it doesn't seem like the last FINAL text block is - # reliably marked END_TURN (sometimes the *first* one is, but not the last...bug?) 
- # - # Then I considered schemes where we tally or match up SPECULATIVE text blocks with - # FINAL text blocks to know how many or which FINAL blocks to expect, but user - # interruptions throw a wrench in these schemes: depending on the exact timing of the - # interruption, we should or shouldn't expect some FINAL blocks. - await asyncio.sleep(0.25) - self._assistant_is_responding = False - await self._report_assistant_response_ended() + async def _handle_bot_stopped_speaking(self, delay_to_catch_trailing_assistant_text: bool): + # Protect against back-to-back BotStoppedSpeaking calls, which I've observed + if self._handling_bot_stopped_speaking: + return + self._handling_bot_stopped_speaking = True + + async def finalize_assistant_response(): + if self._assistant_is_responding: + # Consider the assistant finished with their response (possibly after a short delay, + # to allow for any trailing FINAL assistant text block to come in that need to make + # it into context). + # + # TODO: ideally we could base this solely on the LLM output events, but I couldn't + # figure out a reliable way to determine when we've gotten our last FINAL text block + # after the LLM is done talking. + # + # First I looked at stopReason, but it doesn't seem like the last FINAL text block + # is reliably marked END_TURN (sometimes the *first* one is, but not the last... + # bug?) + # + # Then I considered schemes where we tally or match up SPECULATIVE text blocks with + # FINAL text blocks to know how many or which FINAL blocks to expect, but user + # interruptions throw a wrench in these schemes: depending on the exact timing of + # the interruption, we should or shouldn't expect some FINAL blocks. + if delay_to_catch_trailing_assistant_text: + # This delay length is a balancing act between "catching" trailing assistant + # text that is quite delayed but not waiting so long that user text comes in + # first and results in a bit of context message order scrambling. 
+ await asyncio.sleep(1.25) + self._assistant_is_responding = False + await self._report_assistant_response_ended() + + self._handling_bot_stopped_speaking = False + + # Finalize the assistant response, either now or after a delay + if delay_to_catch_trailing_assistant_text: + self.create_task(finalize_assistant_response()) + else: + await finalize_assistant_response() async def _handle_function_call_result(self, frame: AWSNovaSonicFunctionCallResultFrame): result = frame.result_frame @@ -391,6 +412,7 @@ class AWSNovaSonicLLMService(LLMService): self._content_being_received = None self._assistant_is_responding = False self._ready_to_send_context = False + self._handling_bot_stopped_speaking = False self._triggering_assistant_response = False self._disconnecting = False self._connected_time = None From 9e16e3d614ce9ee4eef72dd7d744dbb584c836f6 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Wed, 7 May 2025 11:00:55 -0400 Subject: [PATCH 91/97] Update ElevenLabsTTSService to use the new websocket API --- CHANGELOG.md | 4 + src/pipecat/services/elevenlabs/tts.py | 120 +++++++++++++++++-------- 2 files changed, 85 insertions(+), 39 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 101cc7f58..0bde64116 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed +- Updated `ElevenLabsTTSService` to use the beta websocket API + (multi-stream-input). This new API supports context_ids and cancelling those + contexts, which greatly improves interruption handling. + - Observers `on_push_frame()` now take a single argument `FramePushed` instead of multiple arguments. 
diff --git a/src/pipecat/services/elevenlabs/tts.py b/src/pipecat/services/elevenlabs/tts.py index 0a3d5d0d1..324e8099e 100644 --- a/src/pipecat/services/elevenlabs/tts.py +++ b/src/pipecat/services/elevenlabs/tts.py @@ -7,11 +7,12 @@ import asyncio import base64 import json +import uuid from typing import Any, AsyncGenerator, Dict, List, Literal, Mapping, Optional, Tuple, Union import aiohttp from loguru import logger -from pydantic import BaseModel, model_validator +from pydantic import BaseModel from pipecat.frames.frames import ( CancelFrame, @@ -26,7 +27,10 @@ from pipecat.frames.frames import ( TTSStoppedFrame, ) from pipecat.processors.frame_processor import FrameDirection -from pipecat.services.tts_service import InterruptibleWordTTSService, WordTTSService +from pipecat.services.tts_service import ( + AudioContextWordTTSService, + WordTTSService, +) from pipecat.transcriptions.language import Language # See .env.example for ElevenLabs configuration needed @@ -159,10 +163,9 @@ def calculate_word_times( return word_times -class ElevenLabsTTSService(InterruptibleWordTTSService): +class ElevenLabsTTSService(AudioContextWordTTSService): class InputParams(BaseModel): language: Optional[Language] = None - optimize_streaming_latency: Optional[str] = None stability: Optional[float] = None similarity_boost: Optional[float] = None style: Optional[float] = None @@ -172,16 +175,6 @@ class ElevenLabsTTSService(InterruptibleWordTTSService): enable_ssml_parsing: Optional[bool] = None enable_logging: Optional[bool] = None - @model_validator(mode="after") - def validate_voice_settings(self): - stability = self.stability - similarity_boost = self.similarity_boost - if (stability is None) != (similarity_boost is None): - raise ValueError( - "Both 'stability' and 'similarity_boost' must be provided when using voice settings" - ) - return self - def __init__( self, *, @@ -222,7 +215,6 @@ class ElevenLabsTTSService(InterruptibleWordTTSService): "language": 
self.language_to_service_language(params.language) if params.language else None, - "optimize_streaming_latency": params.optimize_streaming_latency, "stability": params.stability, "similarity_boost": params.similarity_boost, "style": params.style, @@ -242,6 +234,8 @@ class ElevenLabsTTSService(InterruptibleWordTTSService): self._started = False self._cumulative_time = 0 + # Context management for v1 multi API + self._context_id = None self._receive_task = None self._keepalive_task = None @@ -257,15 +251,13 @@ class ElevenLabsTTSService(InterruptibleWordTTSService): async def set_model(self, model: str): await super().set_model(model) logger.info(f"Switching TTS model to: [{model}]") - await self._disconnect() - await self._connect() + # No need to disconnect/reconnect for model changes with multi-context API async def _update_settings(self, settings: Mapping[str, Any]): prev_voice = self._voice_id await super()._update_settings(settings) + # If voice changes, we don't need to reconnect, just use a new context if not prev_voice == self._voice_id: - await self._disconnect() - await self._connect() logger.info(f"Switching TTS voice to: [{self._voice_id}]") async def start(self, frame: StartFrame): @@ -282,8 +274,8 @@ class ElevenLabsTTSService(InterruptibleWordTTSService): await self._disconnect() async def flush_audio(self): - if self._websocket: - msg = {"text": " ", "flush": True} + if self._websocket and self._context_id: + msg = {"context_id": self._context_id, "flush": True} await self._websocket.send(json.dumps(msg)) async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM): @@ -323,10 +315,7 @@ class ElevenLabsTTSService(InterruptibleWordTTSService): voice_id = self._voice_id model = self.model_name output_format = self._output_format - url = f"{self._url}/v1/text-to-speech/{voice_id}/stream-input?model_id={model}&output_format={output_format}&auto_mode={self._settings['auto_mode']}" - - if 
self._settings["optimize_streaming_latency"]: - url += f"&optimize_streaming_latency={self._settings['optimize_streaming_latency']}" + url = f"{self._url}/v1/text-to-speech/{voice_id}/multi-stream-input?model_id={model}&output_format={output_format}&auto_mode={self._settings['auto_mode']}" if self._settings["enable_ssml_parsing"]: url += f"&enable_ssml_parsing={self._settings['enable_ssml_parsing']}" @@ -347,14 +336,6 @@ class ElevenLabsTTSService(InterruptibleWordTTSService): # Set max websocket message size to 16MB for large audio responses self._websocket = await websockets.connect(url, max_size=16 * 1024 * 1024) - # According to ElevenLabs, we should always start with a single space. - msg: Dict[str, Any] = { - "text": " ", - "xi_api_key": self._api_key, - } - if self._voice_settings: - msg["voice_settings"] = self._voice_settings - await self._websocket.send(json.dumps(msg)) except Exception as e: logger.error(f"{self} initialization error: {e}") self._websocket = None @@ -366,12 +347,15 @@ class ElevenLabsTTSService(InterruptibleWordTTSService): if self._websocket: logger.debug("Disconnecting from ElevenLabs") - await self._websocket.send(json.dumps({"text": ""})) + # Close all contexts and the socket + if self._context_id: + await self._websocket.send(json.dumps({"close_socket": True})) await self._websocket.close() except Exception as e: logger.error(f"{self} error closing websocket: {e}") finally: self._started = False + self._context_id = None self._websocket = None def _get_websocket(self): @@ -379,9 +363,35 @@ class ElevenLabsTTSService(InterruptibleWordTTSService): return self._websocket raise Exception("Websocket not connected") + async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection): + await super()._handle_interruption(frame, direction) + + # Close the current context when interrupted without closing the websocket + if self._context_id and self._websocket: + logger.trace(f"Closing context {self._context_id} 
due to interruption") + try: + await self._websocket.send( + json.dumps({"context_id": self._context_id, "close_context": True}) + ) + except Exception as e: + logger.error(f"Error closing context on interruption: {e}") + self._context_id = None + self._started = False + async def _receive_messages(self): async for message in self._get_websocket(): msg = json.loads(message) + # Check if this message belongs to the current context + # The default context may return null/None for context_id + received_ctx_id = msg.get("context_id") + if ( + self._context_id is not None + and received_ctx_id is not None + and received_ctx_id != self._context_id + ): + logger.trace(f"Ignoring message from different context: {received_ctx_id}") + continue + if msg.get("audio"): await self.stop_ttfb_metrics() self.start_word_timestamps() @@ -393,20 +403,45 @@ class ElevenLabsTTSService(InterruptibleWordTTSService): word_times = calculate_word_times(msg["alignment"], self._cumulative_time) await self.add_word_timestamps(word_times) self._cumulative_time = word_times[-1][1] + if msg.get("is_final"): + logger.trace(f"Received final message for context {received_ctx_id}") + # Context has finished + if self._context_id == received_ctx_id: + self._context_id = None + self._started = False async def _keepalive_task_handler(self): while True: await asyncio.sleep(10) try: - await self._send_text("") + # Send an empty message to keep the connection alive + if self._websocket and self._websocket.open: + await self._websocket.send(json.dumps({})) except websockets.ConnectionClosed as e: logger.warning(f"{self} keepalive error: {e}") break async def _send_text(self, text: str): if self._websocket: - msg = {"text": text + " "} - await self._websocket.send(json.dumps(msg)) + if not self._context_id: + # First message for a new context - need a space to initialize + msg = {"text": " ", "context_id": str(uuid.uuid4()), "xi_api_key": self._api_key} + + # Add voice settings only in first message for a 
context + if self._voice_settings: + msg["voice_settings"] = self._voice_settings + + await self._websocket.send(json.dumps(msg)) + self._context_id = msg["context_id"] + logger.trace(f"Created new context {self._context_id}") + + # Now send the actual text content + msg = {"text": text, "context_id": self._context_id} + await self._websocket.send(json.dumps(msg)) + else: + # Continuing with an existing context + msg = {"text": text, "context_id": self._context_id} + await self._websocket.send(json.dumps(msg)) async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: logger.debug(f"{self}: Generating TTS [{text}]") @@ -416,6 +451,13 @@ class ElevenLabsTTSService(InterruptibleWordTTSService): await self._connect() try: + # Close previous context if there was one + if self._context_id and not self._started: + await self._websocket.send( + json.dumps({"context_id": self._context_id, "close_context": True}) + ) + self._context_id = None + if not self._started: await self.start_ttfb_metrics() yield TTSStartedFrame() @@ -427,8 +469,8 @@ class ElevenLabsTTSService(InterruptibleWordTTSService): except Exception as e: logger.error(f"{self} error sending message: {e}") yield TTSStoppedFrame() - await self._disconnect() - await self._connect() + self._started = False + self._context_id = None return yield None except Exception as e: From efeb96c4e8f88270c642ec83f2f174921dacb0f6 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Wed, 7 May 2025 13:12:18 -0400 Subject: [PATCH 92/97] Remove unused imports --- src/pipecat/observers/loggers/llm_log_observer.py | 3 +-- src/pipecat/observers/loggers/transcription_log_observer.py | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/pipecat/observers/loggers/llm_log_observer.py b/src/pipecat/observers/loggers/llm_log_observer.py index 9e4d53b28..a6675b5c0 100644 --- a/src/pipecat/observers/loggers/llm_log_observer.py +++ b/src/pipecat/observers/loggers/llm_log_observer.py @@ -7,7 +7,6 @@ from loguru import 
logger from pipecat.frames.frames import ( - Frame, FunctionCallInProgressFrame, FunctionCallResultFrame, LLMFullResponseEndFrame, @@ -17,7 +16,7 @@ from pipecat.frames.frames import ( ) from pipecat.observers.base_observer import BaseObserver, FramePushed from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContextFrame -from pipecat.processors.frame_processor import FrameDirection, FrameProcessor +from pipecat.processors.frame_processor import FrameDirection from pipecat.services.llm_service import LLMService diff --git a/src/pipecat/observers/loggers/transcription_log_observer.py b/src/pipecat/observers/loggers/transcription_log_observer.py index 57e38c952..8ca1d9c9b 100644 --- a/src/pipecat/observers/loggers/transcription_log_observer.py +++ b/src/pipecat/observers/loggers/transcription_log_observer.py @@ -7,12 +7,10 @@ from loguru import logger from pipecat.frames.frames import ( - Frame, InterimTranscriptionFrame, TranscriptionFrame, ) from pipecat.observers.base_observer import BaseObserver, FramePushed -from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.services.stt_service import STTService From 75ce632f8456b0dbaf6457a7c29b59793ec264a7 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Wed, 7 May 2025 15:05:15 -0400 Subject: [PATCH 93/97] Add DebugLogObserver --- CHANGELOG.md | 4 + examples/foundational/30-observer.py | 26 ++- .../observers/loggers/debug_log_observer.py | 218 ++++++++++++++++++ 3 files changed, 244 insertions(+), 4 deletions(-) create mode 100644 src/pipecat/observers/loggers/debug_log_observer.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 5edb1b5a3..dacfb5fd4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added `DebugLogObserver` for detailed frame logging with configurable + filtering by frame type and endpoint. 
This observer automatically extracts + and formats all frame data fields for debug logging. + - `UserImageRequestFrame.video_source` field has been added to request an image from the desired video source. diff --git a/examples/foundational/30-observer.py b/examples/foundational/30-observer.py index 46bd96e53..c9cd08aee 100644 --- a/examples/foundational/30-observer.py +++ b/examples/foundational/30-observer.py @@ -14,18 +14,26 @@ from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.frames.frames import ( BotStartedSpeakingFrame, BotStoppedSpeakingFrame, + EndFrame, StartInterruptionFrame, + TTSTextFrame, + UserStartedSpeakingFrame, ) from pipecat.observers.base_observer import BaseObserver, FramePushed +from pipecat.observers.loggers.debug_log_observer import DebugLogObserver, FrameEndpoint from pipecat.observers.loggers.llm_log_observer import LLMLogObserver from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask -from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext +from pipecat.processors.aggregators.openai_llm_context import ( + OpenAILLMContext, +) from pipecat.processors.frame_processor import FrameDirection from pipecat.services.cartesia.tts import CartesiaTTSService from pipecat.services.deepgram.stt import DeepgramSTTService from pipecat.services.openai.llm import OpenAILLMService +from pipecat.transports.base_input import BaseInputTransport +from pipecat.transports.base_output import BaseOutputTransport from pipecat.transports.base_transport import TransportParams from pipecat.transports.network.small_webrtc import SmallWebRTCTransport from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection @@ -33,7 +41,7 @@ from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection load_dotenv(override=True) -class DebugObserver(BaseObserver): +class CustomObserver(BaseObserver): 
"""Observer to log interruptions and bot speaking events to the console. Logs all frame instances of: @@ -58,7 +66,7 @@ class DebugObserver(BaseObserver): # Create direction arrow arrow = "→" if direction == FrameDirection.DOWNSTREAM else "←" - if isinstance(frame, StartInterruptionFrame): + if isinstance(frame, StartInterruptionFrame) and isinstance(src, BaseOutputTransport): logger.info(f"⚡ INTERRUPTION START: {src} {arrow} {dst} at {time_sec:.2f}s") elif isinstance(frame, BotStartedSpeakingFrame): logger.info(f"🤖 BOT START SPEAKING: {src} {arrow} {dst} at {time_sec:.2f}s") @@ -117,7 +125,17 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac enable_usage_metrics=True, report_only_initial_ttfb=True, ), - observers=[DebugObserver(), LLMLogObserver()], + observers=[ + CustomObserver(), + LLMLogObserver(), + DebugLogObserver( + frame_types={ + TTSTextFrame: (BaseOutputTransport, FrameEndpoint.DESTINATION), + UserStartedSpeakingFrame: (BaseInputTransport, FrameEndpoint.SOURCE), + EndFrame: None, + } + ), + ], ) @transport.event_handler("on_client_connected") diff --git a/src/pipecat/observers/loggers/debug_log_observer.py b/src/pipecat/observers/loggers/debug_log_observer.py new file mode 100644 index 000000000..bd09bd790 --- /dev/null +++ b/src/pipecat/observers/loggers/debug_log_observer.py @@ -0,0 +1,218 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +from dataclasses import fields, is_dataclass +from enum import Enum, auto +from typing import Dict, List, Optional, Set, Tuple, Type, Union + +from loguru import logger + +from pipecat.frames.frames import Frame +from pipecat.observers.base_observer import BaseObserver, FramePushed +from pipecat.processors.frame_processor import FrameDirection + + +class FrameEndpoint(Enum): + """Specifies which endpoint (source or destination) to filter on.""" + + SOURCE = auto() + DESTINATION = auto() + + +class DebugLogObserver(BaseObserver): +
"""Observer that logs frame activity with detailed content to the console. + + Automatically extracts and formats data from any frame type, making it useful + for debugging pipeline behavior without needing frame-specific observers. + + Args: + frame_types: Optional list of frame types to log, or a dict with frame type + filters. If None, logs all frame types. + exclude_fields: Optional set of field names to exclude from logging. + + Examples: + Log all frames from all services: + ```python + observer = DebugLogObserver() + ``` + + Log specific frame types from any source/destination: + ```python + from pipecat.frames.frames import TranscriptionFrame, InterimTranscriptionFrame + observer = DebugLogObserver(frame_types=[TranscriptionFrame, InterimTranscriptionFrame]) + ``` + + Log frames with specific source/destination filters: + ```python + from pipecat.frames.frames import StartInterruptionFrame, UserStartedSpeakingFrame, LLMTextFrame + from pipecat.transports.base_output_transport import BaseOutputTransport + from pipecat.services.stt_service import STTService + + observer = DebugLogObserver(frame_types={ + # Only log StartInterruptionFrame when source is BaseOutputTransport + StartInterruptionFrame: (BaseOutputTransport, FrameEndpoint.SOURCE), + + # Only log UserStartedSpeakingFrame when destination is STTService + UserStartedSpeakingFrame: (STTService, FrameEndpoint.DESTINATION), + + # Log LLMTextFrame regardless of source or destination type + LLMTextFrame: None + }) + ``` + """ + + def __init__( + self, + frame_types: Optional[ + Union[List[Type[Frame]], Dict[Type[Frame], Optional[Tuple[Type, FrameEndpoint]]]] + ] = None, + exclude_fields: Optional[Set[str]] = None, + ): + """Initialize the debug log observer. + + Args: + frame_types: List of frame types to log, or a dict mapping frame types to + filter configurations. 
Filter configs can be: + - None to log all instances of the frame type + - A tuple of (service_type, endpoint) to filter on a specific service + and endpoint (SOURCE or DESTINATION) + If None is provided instead of a dict/list, log all frames. + exclude_fields: Set of field names to exclude from logging. If None, only binary + data fields are excluded. + """ + # Process frame filters + self.frame_filters = {} + + if frame_types is not None: + if isinstance(frame_types, list): + # List of frame types - log all instances + self.frame_filters = {frame_type: None for frame_type in frame_types} + else: + # Dict of frame types with filters + self.frame_filters = frame_types + + # By default, exclude binary data fields that would clutter logs + self.exclude_fields = ( + exclude_fields + if exclude_fields is not None + else { + "audio", # Skip binary audio data + "image", # Skip binary image data + "images", # Skip lists of images + } + ) + + def _format_value(self, value): + """Format a value for logging. + + Args: + value: The value to format. + + Returns: + str: A string representation of the value suitable for logging. + """ + if value is None: + return "None" + elif isinstance(value, str): + return f"{value!r}" + elif isinstance(value, (list, tuple)): + if len(value) == 0: + return "[]" + if isinstance(value[0], dict) and len(value) > 3: + # For message lists, just show count + return f"{len(value)} items" + return str(value) + elif isinstance(value, (bytes, bytearray)): + return f"{len(value)} bytes" + elif hasattr(value, "get_messages_for_logging") and callable( + getattr(value, "get_messages_for_logging") + ): + # Special case for OpenAI context + return f"{value.__class__.__name__} with messages: {value.get_messages_for_logging()}" + else: + return str(value) + + def _should_log_frame(self, frame, src, dst): + """Determine if a frame should be logged based on filters. 
+ + Args: + frame: The frame being processed + src: The source component + dst: The destination component + + Returns: + bool: True if the frame should be logged, False otherwise + """ + # If no filters, log all frames + if not self.frame_filters: + return True + + # Check if this frame type is in our filters + for frame_type, filter_config in self.frame_filters.items(): + if isinstance(frame, frame_type): + # If filter is None, log all instances of this frame type + if filter_config is None: + return True + + # Otherwise, check the specific filter + service_type, endpoint = filter_config + + if endpoint == FrameEndpoint.SOURCE: + return isinstance(src, service_type) + elif endpoint == FrameEndpoint.DESTINATION: + return isinstance(dst, service_type) + + return False + + async def on_push_frame(self, data: FramePushed): + """Process a frame being pushed into the pipeline. + + Logs frame details to the console with all relevant fields and values. + + Args: + data: Event data containing the frame, source, destination, direction, and timestamp. 
+ """ + src = data.source + dst = data.destination + frame = data.frame + direction = data.direction + timestamp = data.timestamp + + # Check if we should log this frame + if not self._should_log_frame(frame, src, dst): + return + + # Format direction arrow + arrow = "→" if direction == FrameDirection.DOWNSTREAM else "←" + + time_sec = timestamp / 1_000_000_000 + class_name = frame.__class__.__name__ + + # Build frame representation + frame_details = [] + + # If dataclass, extract fields + if is_dataclass(frame): + for field in fields(frame): + if field.name in self.exclude_fields: + continue + + value = getattr(frame, field.name) + if value is None: + continue + + formatted_value = self._format_value(value) + frame_details.append(f"{field.name}: {formatted_value}") + + # Format the message + if frame_details: + details = ", ".join(frame_details) + message = f"{class_name} {details} at {time_sec:.2f}s" + else: + message = f"{class_name} at {time_sec:.2f}s" + + # Log the message + logger.debug(f"{src} {arrow} {dst}: {message}") From 9e0b4fe5d158e10ad5ba2cb22a0ea9ce882b3bc9 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Wed, 7 May 2025 17:19:52 -0400 Subject: [PATCH 94/97] Replace list with tuple --- .../observers/loggers/debug_log_observer.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/pipecat/observers/loggers/debug_log_observer.py b/src/pipecat/observers/loggers/debug_log_observer.py index bd09bd790..575a31683 100644 --- a/src/pipecat/observers/loggers/debug_log_observer.py +++ b/src/pipecat/observers/loggers/debug_log_observer.py @@ -6,7 +6,7 @@ from dataclasses import fields, is_dataclass from enum import Enum, auto -from typing import Dict, List, Optional, Set, Tuple, Type, Union +from typing import Dict, Optional, Set, Tuple, Type, Union from loguru import logger @@ -29,20 +29,20 @@ class DebugLogObserver(BaseObserver): for debugging pipeline behavior without needing frame-specific observers.
Args: - frame_types: Optional list of frame types to log, or a dict with frame type + frame_types: Optional tuple of frame types to log, or a dict with frame type filters. If None, logs all frame types. exclude_fields: Optional set of field names to exclude from logging. Examples: Log all frames from all services: ```python - observer = DebugLogObserver() + observers = DebugLogObserver() ``` Log specific frame types from any source/destination: ```python from pipecat.frames.frames import TranscriptionFrame, InterimTranscriptionFrame - observer = DebugLogObserver(frame_types=[TranscriptionFrame, InterimTranscriptionFrame]) + observers = DebugLogObserver(frame_types=(TranscriptionFrame, InterimTranscriptionFrame)) ``` Log frames with specific source/destination filters: @@ -51,7 +51,7 @@ class DebugLogObserver(BaseObserver): from pipecat.transports.base_output_transport import BaseOutputTransport from pipecat.services.stt_service import STTService - observer = DebugLogObserver(frame_types={ + observers = DebugLogObserver(frame_types={ # Only log StartInterruptionFrame when source is BaseOutputTransport StartInterruptionFrame: (BaseOutputTransport, FrameEndpoint.SOURCE), @@ -67,19 +67,19 @@ class DebugLogObserver(BaseObserver): def __init__( self, frame_types: Optional[ - Union[List[Type[Frame]], Dict[Type[Frame], Optional[Tuple[Type, FrameEndpoint]]]] + Union[Tuple[Type[Frame], ...], Dict[Type[Frame], Optional[Tuple[Type, FrameEndpoint]]]] ] = None, exclude_fields: Optional[Set[str]] = None, ): """Initialize the debug log observer. Args: - frame_types: List of frame types to log, or a dict mapping frame types to + frame_types: Tuple of frame types to log, or a dict mapping frame types to filter configurations. Filter configs can be: - None to log all instances of the frame type - A tuple of (service_type, endpoint) to filter on a specific service and endpoint (SOURCE or DESTINATION) - If None is provided instead of a dict/list, log all frames. 
+ If None is provided instead of a tuple/dict, log all frames. exclude_fields: Set of field names to exclude from logging. If None, only binary data fields are excluded. """ @@ -87,8 +87,8 @@ class DebugLogObserver(BaseObserver): self.frame_filters = {} if frame_types is not None: - if isinstance(frame_types, list): - # List of frame types - log all instances + if isinstance(frame_types, tuple): + # Tuple of frame types - log all instances self.frame_filters = {frame_type: None for frame_type in frame_types} else: # Dict of frame types with filters From 7cfb9a4d15c70baa1ed0ae8be20bb198c9a2dc88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Wed, 7 May 2025 14:59:16 -0700 Subject: [PATCH 95/97] update CHANGELOG for 0.0.67 --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dacfb5fd4..9b85c7f87 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to **Pipecat** will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
-## [Unreleased] +## [0.0.67] - 2025-05-07 ### Added From 91364028460eecb82db916c26bbc68d5647104b2 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Wed, 7 May 2025 18:29:27 -0400 Subject: [PATCH 96/97] Add load_dotenv to moondream example server --- examples/moondream-chatbot/server.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/moondream-chatbot/server.py b/examples/moondream-chatbot/server.py index bb322ff2e..9597bdc9a 100644 --- a/examples/moondream-chatbot/server.py +++ b/examples/moondream-chatbot/server.py @@ -10,12 +10,16 @@ import subprocess from contextlib import asynccontextmanager import aiohttp +from dotenv import load_dotenv from fastapi import FastAPI, HTTPException, Request from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, RedirectResponse from pipecat.transports.services.helpers.daily_rest import DailyRESTHelper, DailyRoomParams +# Load environment variables from .env file +load_dotenv(override=True) + MAX_BOTS_PER_ROOM = 1 # Bot sub-process dict for status reporting and concurrency control From cb7e7a8aa30acda38dd4e07c4096ee3e62f8b2b4 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Wed, 7 May 2025 18:40:04 -0400 Subject: [PATCH 97/97] Add load_dotenv to patient-intake server file --- examples/patient-intake/server.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/patient-intake/server.py b/examples/patient-intake/server.py index 347b17dbd..10ccfb3b7 100644 --- a/examples/patient-intake/server.py +++ b/examples/patient-intake/server.py @@ -10,12 +10,16 @@ import subprocess from contextlib import asynccontextmanager import aiohttp +from dotenv import load_dotenv from fastapi import FastAPI, HTTPException, Request from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, RedirectResponse from pipecat.transports.services.helpers.daily_rest import DailyRESTHelper, DailyRoomParams +# Load environment variables from .env file 
+load_dotenv(override=True) + MAX_BOTS_PER_ROOM = 1 # Bot sub-process dict for status reporting and concurrency control