From e7ccaed56cfd3250fc78ea5cd8bbdfbe9047e89b Mon Sep 17 00:00:00 2001
From: Kwindla Hultman Kramer <kwindla@gmail.com>
Date: Thu, 10 Oct 2024 15:34:25 -0700
Subject: [PATCH] temp commit; debugging

---
 .../foundational/19-openai-realtime-beta.py   | 40 ++++++++++++++++-
 .../openai_realtime_beta/llm_and_context.py   | 44 +++++++++++++++----
 2 files changed, 74 insertions(+), 10 deletions(-)

diff --git a/examples/foundational/19-openai-realtime-beta.py b/examples/foundational/19-openai-realtime-beta.py
index 4739b793e..247949a4c 100644
--- a/examples/foundational/19-openai-realtime-beta.py
+++ b/examples/foundational/19-openai-realtime-beta.py
@@ -38,6 +38,39 @@ logger.remove(0)
 logger.add(sys.stderr, level="DEBUG")
 
 
+messages = [
+    {"role": "user", "content": "Say 'Hello there' and ask my name."},
+    {"role": "assistant", "content": [{"type": "text", "text": "Hello there! What's your name?"}]},
+    # {"role": "user", "content": [{"type": "input_audio"}]},
+    {"role": "user", "content": [{"type": "text", "text": "Tell me a joke.\n"}]},
+    # {
+    #     "role": "assistant",
+    #     "content": [
+    #         {
+    #             "type": "text",
+    #             "text": "Why don't scientists trust atoms? Because they make up everything!",
+    #         }
+    #     ],
+    # },
+    # {"role": "user", "content": [{"type": "text", "text": "me know the joke.\n"}]},
+    # {
+    #     "role": "assistant",
+    #     "content": [{"type": "text", "text": "What do you call fake spaghetti? An impasta!"}],
+    # },
+    # {"role": "user", "content": [{"type": "text", "text": "me another joke.\n"}]},
+    # {
+    #     "role": "assistant",
+    #     "content": [
+    #         {
+    #             "type": "text",
+    #             "text": "Why couldn't the bicycle stand up by itself? It was two-tired!",
+    #         }
+    #     ],
+    # },
+    # {"role": "user", "content": [{"type": "input_audio"}]},
+]
+
+
 async def fetch_weather_from_api(function_name, tool_call_id, args, llm, context, result_callback):
     temperature = 75 if args["format"] == "fahrenheit" else 24
     await result_callback(
@@ -193,7 +226,9 @@ Remember, your responses should be short. Just one or two sentences, usually.
         )
 
         llm = OpenAILLMServiceRealtimeBeta(
-            api_key=os.getenv("OPENAI_API_KEY"), session_properties=session_properties
+            api_key=os.getenv("OPENAI_API_KEY"),
+            session_properties=session_properties,
+            start_audio_paused=True,
         )
 
         # you can either register a single function for all function calls, or specific functions
@@ -204,7 +239,8 @@ Remember, your responses should be short. Just one or two sentences, usually.
         llm.register_function("load_conversation", load_conversation)
 
         context = OpenAILLMContext(
-            [{"role": "user", "content": "Say 'hello'."}],
+            messages,
+            # [{"role": "user", "content": "Say 'hello'."}],
             # [{"role": "user", "content": "What's the weather right now in San Francisco?"}],
             # conversation load from file is a WIP -- not functional yet
             # [{"role": "user", "content": "Load the most recent conversation."}],
diff --git a/src/pipecat/services/openai_realtime_beta/llm_and_context.py b/src/pipecat/services/openai_realtime_beta/llm_and_context.py
index de7f367b4..f07391162 100644
--- a/src/pipecat/services/openai_realtime_beta/llm_and_context.py
+++ b/src/pipecat/services/openai_realtime_beta/llm_and_context.py
@@ -1,6 +1,9 @@
 import asyncio
 import base64
 import json
+
+# temp: websocket logger
+import logging
 import traceback
 from copy import deepcopy
 from dataclasses import dataclass
@@ -48,12 +51,10 @@ from pipecat.utils.time import time_now_iso8601
 
 from . import events
 
-# temp: websocket logger
-# import logging
-# logging.basicConfig(
-#     format="%(message)s",
-#     level=logging.DEBUG,
-# )
+logging.basicConfig(
+    format="%(message)s",
+    level=logging.DEBUG,
+)
 
 
 @dataclass
@@ -332,6 +333,8 @@ class OpenAILLMServiceRealtimeBeta(LLMService):
         raise Exception("Websocket not connected")
 
     async def _update_settings(self):
+        # !!! DEBUGGING: return early to leave all default session settings; code below is unreachable
+        return
         settings = self._session_properties
         # tools given in the context override the tools in the session properties
         if self._context and self._context.tools:
@@ -347,9 +350,13 @@ class OpenAILLMServiceRealtimeBeta(LLMService):
                 if evt.type == "session.created":
                     # session.created is received right after connecting. send a message
                     # to configure the session properties.
+                    logger.debug(f"!!! GOT SESSION CREATED {evt}")
                     await self._update_settings()
                 elif evt.type == "session.updated":
+                    logger.debug(f"!!! GOT SESSION UPDATED {evt}")
                     self._session_properties = evt.session
+                elif evt.type == "conversation.created":
+                    logger.debug(f"!!! GOT CONVERSATION CREATED: {evt}")
                 elif evt.type == "input_audio_buffer.speech_started":
                     # user started speaking
                     if self._send_user_started_speaking_frames:
@@ -374,6 +381,7 @@ class OpenAILLMServiceRealtimeBeta(LLMService):
                 elif evt.type == "response.created":
                     # todo: 1. figure out TTS started/stopped frame semantics better
                     #       2. do not push these frames in text-only mode
+                    logger.debug(f"!!! GOT RESPONSE CREATED {evt}")
                     if not self._bot_speaking:
                         self._bot_speaking = True
                         await self.push_frame(TTSStartedFrame())
@@ -569,16 +577,36 @@ class OpenAILLMServiceRealtimeBeta(LLMService):
 
         for item in items:
             context.note_manually_added_message(item.id)
-            await self.send_client_event(events.ConversationItemCreateEvent(item=item))
+            evt = events.ConversationItemCreateEvent(item=item)
+            logger.debug(
+                f"!!! > Sending message: {evt.model_dump_json(indent=2, exclude_none=True)}"
+            )
+            await self.send_client_event(evt)
+            await asyncio.sleep(2)
+            # await self.send_client_event(events.ConversationItemCreateEvent(item=item))
 
     async def _create_response(self):
         if self._context.get_tools_list_updated():
             await self._update_settings()
+
+        # !!! DEBUGGING - fixed 3s sleep standing in for awaiting the conversation.created event
+        logger.debug("!!! A waiting on conversation.created")
+        await asyncio.sleep(3)
+        logger.debug("!!! A ok, done waiting")
+
         await self._send_messages_context_update()
         logger.debug(f"Creating response: {self._context.get_messages_for_logging()}")
         await self.push_frame(LLMFullResponseStartFrame())
         await self.start_processing_metrics()
-        await self.send_client_event(events.ResponseCreateEvent())
+        await self.send_client_event(
+            events.ResponseCreateEvent(
+                response=events.ResponseProperties(modalities=["audio", "text"])
+            )
+        )
+        # !!! DEBUGGING - fixed 2s delay after requesting the response; mic unpause below is disabled
+        await asyncio.sleep(2)
+        # logger.debug("Unpausing microphone")
+        # self.set_audio_input_paused(False)
 
     async def _send_user_audio(self, frame):
         payload = base64.b64encode(frame.audio).decode("utf-8")