From e7ccaed56cfd3250fc78ea5cd8bbdfbe9047e89b Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Thu, 10 Oct 2024 15:34:25 -0700 Subject: [PATCH] temp commit; debugging --- .../foundational/19-openai-realtime-beta.py | 40 ++++++++++++++++- .../openai_realtime_beta/llm_and_context.py | 44 +++++++++++++++---- 2 files changed, 74 insertions(+), 10 deletions(-) diff --git a/examples/foundational/19-openai-realtime-beta.py b/examples/foundational/19-openai-realtime-beta.py index 4739b793e..247949a4c 100644 --- a/examples/foundational/19-openai-realtime-beta.py +++ b/examples/foundational/19-openai-realtime-beta.py @@ -38,6 +38,39 @@ logger.remove(0) logger.add(sys.stderr, level="DEBUG") +messages = [ + {"role": "user", "content": "Say 'Hello there' and ask my name."}, + {"role": "assistant", "content": [{"type": "text", "text": "Hello there! What's your name?"}]}, + # {"role": "user", "content": [{"type": "input_audio"}]}, + {"role": "user", "content": [{"type": "text", "text": "Tell me a joke.\n"}]}, + # { + # "role": "assistant", + # "content": [ + # { + # "type": "text", + # "text": "Why don't scientists trust atoms? Because they make up everything!", + # } + # ], + # }, + # {"role": "user", "content": [{"type": "text", "text": "me know the joke.\n"}]}, + # { + # "role": "assistant", + # "content": [{"type": "text", "text": "What do you call fake spaghetti? An impasta!"}], + # }, + # {"role": "user", "content": [{"type": "text", "text": "me another joke.\n"}]}, + # { + # "role": "assistant", + # "content": [ + # { + # "type": "text", + # "text": "Why couldn't the bicycle stand up by itself? It was two-tired!", + # } + # ], + # }, + # {"role": "user", "content": [{"type": "input_audio"}]}, +] + + async def fetch_weather_from_api(function_name, tool_call_id, args, llm, context, result_callback): temperature = 75 if args["format"] == "fahrenheit" else 24 await result_callback( @@ -193,7 +226,9 @@ Remember, your responses should be short. Just one or two sentences, usually. ) llm = OpenAILLMServiceRealtimeBeta( - api_key=os.getenv("OPENAI_API_KEY"), session_properties=session_properties + api_key=os.getenv("OPENAI_API_KEY"), + session_properties=session_properties, + start_audio_paused=True, ) # you can either register a single function for all function calls, or specific functions @@ -204,7 +239,8 @@ Remember, your responses should be short. Just one or two sentences, usually. llm.register_function("load_conversation", load_conversation) context = OpenAILLMContext( - [{"role": "user", "content": "Say 'hello'."}], + messages, + # [{"role": "user", "content": "Say 'hello'."}], # [{"role": "user", "content": "What's the weather right now in San Francisco?"}], # conversation load from file is a WIP -- not functional yet # [{"role": "user", "content": "Load the most recent conversation."}], diff --git a/src/pipecat/services/openai_realtime_beta/llm_and_context.py b/src/pipecat/services/openai_realtime_beta/llm_and_context.py index de7f367b4..f07391162 100644 --- a/src/pipecat/services/openai_realtime_beta/llm_and_context.py +++ b/src/pipecat/services/openai_realtime_beta/llm_and_context.py @@ -1,6 +1,9 @@ import asyncio import base64 import json + +# temp: websocket logger +import logging import traceback from copy import deepcopy from dataclasses import dataclass @@ -48,12 +51,10 @@ from pipecat.utils.time import time_now_iso8601 from . import events -# temp: websocket logger -# import logging -# logging.basicConfig( -# format="%(message)s", -# level=logging.DEBUG, -# ) +logging.basicConfig( + format="%(message)s", + level=logging.DEBUG, +) @dataclass @@ -332,6 +333,8 @@ class OpenAILLMServiceRealtimeBeta(LLMService): raise Exception("Websocket not connected") async def _update_settings(self): + # !!! LEAVE ALL DEFAULT SETTINGS FOR NOW + return settings = self._session_properties # tools given in the context override the tools in the session properties if self._context and self._context.tools: @@ -347,9 +350,13 @@ class OpenAILLMServiceRealtimeBeta(LLMService): if evt.type == "session.created": # session.created is received right after connecting. send a message # to configure the session properties. + logger.debug(f"!!! GOT SESSION CREATED {evt}") await self._update_settings() elif evt.type == "session.updated": + logger.debug(f"!!! GOT SESSION UPDATED {evt}") self._session_properties = evt.session + elif evt.type == "conversation.created": + logger.debug(f"!!! GOT CONVERSATION CREATED: {evt}") elif evt.type == "input_audio_buffer.speech_started": # user started speaking if self._send_user_started_speaking_frames: @@ -374,6 +381,7 @@ class OpenAILLMServiceRealtimeBeta(LLMService): elif evt.type == "response.created": # todo: 1. figure out TTS started/stopped frame semantics better # 2. do not push these frames in text-only mode + logger.debug(f"!!! GOT RESPONSE CREATED {evt}") if not self._bot_speaking: self._bot_speaking = True await self.push_frame(TTSStartedFrame()) @@ -569,16 +577,36 @@ class OpenAILLMServiceRealtimeBeta(LLMService): for item in items: context.note_manually_added_message(item.id) - await self.send_client_event(events.ConversationItemCreateEvent(item=item)) + evt = events.ConversationItemCreateEvent(item=item) + logger.debug( + f"!!! > Sending message: {evt.model_dump_json(indent=2, exclude_none=True)}" + ) + await self.send_client_event(evt) + await asyncio.sleep(2) + # await self.send_client_event(events.ConversationItemCreateEvent(item=item)) async def _create_response(self): if self._context.get_tools_list_updated(): await self._update_settings() + + # !!! DEBUGGING - testing await on conversation.create + logger.debug("!!! A waiting on conversation.created") + await asyncio.sleep(3) + logger.debug("!!! A ok, done waiting") + await self._send_messages_context_update() logger.debug(f"Creating response: {self._context.get_messages_for_logging()}") await self.push_frame(LLMFullResponseStartFrame()) await self.start_processing_metrics() - await self.send_client_event(events.ResponseCreateEvent()) + await self.send_client_event( + events.ResponseCreateEvent( + response=events.ResponseProperties(modalities=["audio", "text"]) + ) + ) + # !!! DEBUGGING + await asyncio.sleep(2) + # logger.debug("Unpausing microphone") + # self.set_audio_input_paused(False) async def _send_user_audio(self, frame): payload = base64.b64encode(frame.audio).decode("utf-8")