temp commit; debugging

2024-10-10 15:34:25 -07:00
parent 07124bfafc
commit e7ccaed56c
2 changed files with 74 additions and 10 deletions
--- a/examples/foundational/19-openai-realtime-beta.py
+++ b/examples/foundational/19-openai-realtime-beta.py
@@ -38,6 +38,39 @@ logger.remove(0)
 logger.add(sys.stderr, level="DEBUG")


+messages = [
+    {"role": "user", "content": "Say 'Hello there' and ask my name."},
+    {"role": "assistant", "content": [{"type": "text", "text": "Hello there! What's your name?"}]},
+    # {"role": "user", "content": [{"type": "input_audio"}]},
+    {"role": "user", "content": [{"type": "text", "text": "Tell me a joke.\n"}]},
+    # {
+    #     "role": "assistant",
+    #     "content": [
+    #         {
+    #             "type": "text",
+    #             "text": "Why don't scientists trust atoms? Because they make up everything!",
+    #         }
+    #     ],
+    # },
+    # {"role": "user", "content": [{"type": "text", "text": "me know the joke.\n"}]},
+    # {
+    #     "role": "assistant",
+    #     "content": [{"type": "text", "text": "What do you call fake spaghetti? An impasta!"}],
+    # },
+    # {"role": "user", "content": [{"type": "text", "text": "me another joke.\n"}]},
+    # {
+    #     "role": "assistant",
+    #     "content": [
+    #         {
+    #             "type": "text",
+    #             "text": "Why couldn't the bicycle stand up by itself? It was two-tired!",
+    #         }
+    #     ],
+    # },
+    # {"role": "user", "content": [{"type": "input_audio"}]},
+]
+
+
 async def fetch_weather_from_api(function_name, tool_call_id, args, llm, context, result_callback):
    temperature = 75 if args["format"] == "fahrenheit" else 24
    await result_callback(
@@ -193,7 +226,9 @@ Remember, your responses should be short. Just one or two sentences, usually.
        )

        llm = OpenAILLMServiceRealtimeBeta(
-            api_key=os.getenv("OPENAI_API_KEY"), session_properties=session_properties
+            api_key=os.getenv("OPENAI_API_KEY"),
+            session_properties=session_properties,
+            start_audio_paused=True,
        )

        # you can either register a single function for all function calls, or specific functions
@@ -204,7 +239,8 @@ Remember, your responses should be short. Just one or two sentences, usually.
        llm.register_function("load_conversation", load_conversation)

        context = OpenAILLMContext(
-            [{"role": "user", "content": "Say 'hello'."}],
+            messages,
+            # [{"role": "user", "content": "Say 'hello'."}],
            # [{"role": "user", "content": "What's the weather right now in San Francisco?"}],
            # conversation load from file is a WIP -- not functional yet
            # [{"role": "user", "content": "Load the most recent conversation."}],
--- a/src/pipecat/services/openai_realtime_beta/llm_and_context.py
+++ b/src/pipecat/services/openai_realtime_beta/llm_and_context.py
@@ -1,6 +1,9 @@
 import asyncio
 import base64
 import json
+
+# temp: websocket logger
+import logging
 import traceback
 from copy import deepcopy
 from dataclasses import dataclass
@@ -48,12 +51,10 @@ from pipecat.utils.time import time_now_iso8601

 from . import events

-# temp: websocket logger
-# import logging
-# logging.basicConfig(
-#     format="%(message)s",
-#     level=logging.DEBUG,
-# )
+logging.basicConfig(
+    format="%(message)s",
+    level=logging.DEBUG,
+)


@dataclass
@@ -332,6 +333,8 @@ class OpenAILLMServiceRealtimeBeta(LLMService):
        raise Exception("Websocket not connected")

    async def _update_settings(self):
+        # !!! LEAVE ALL DEFAULT SETTINGS FOR NOW
+        return
        settings = self._session_properties
        # tools given in the context override the tools in the session properties
        if self._context and self._context.tools:
@@ -347,9 +350,13 @@ class OpenAILLMServiceRealtimeBeta(LLMService):
                if evt.type == "session.created":
                    # session.created is received right after connecting. send a message
                    # to configure the session properties.
+                    logger.debug(f"!!! GOT SESSION CREATED {evt}")
                    await self._update_settings()
                elif evt.type == "session.updated":
+                    logger.debug(f"!!! GOT SESSION UPDATED {evt}")
                    self._session_properties = evt.session
+                elif evt.type == "conversation.created":
+                    logger.debug(f"!!! GOT CONVERSATION CREATED: {evt}")
                elif evt.type == "input_audio_buffer.speech_started":
                    # user started speaking
                    if self._send_user_started_speaking_frames:
@@ -374,6 +381,7 @@ class OpenAILLMServiceRealtimeBeta(LLMService):
                elif evt.type == "response.created":
                    # todo: 1. figure out TTS started/stopped frame semantics better
                    #       2. do not push these frames in text-only mode
+                    logger.debug(f"!!! GOT RESPONSE CREATED {evt}")
                    if not self._bot_speaking:
                        self._bot_speaking = True
                        await self.push_frame(TTSStartedFrame())
@@ -569,16 +577,36 @@ class OpenAILLMServiceRealtimeBeta(LLMService):

        for item in items:
            context.note_manually_added_message(item.id)
-            await self.send_client_event(events.ConversationItemCreateEvent(item=item))
+            evt = events.ConversationItemCreateEvent(item=item)
+            logger.debug(
+                f"!!! > Sending message: {evt.model_dump_json(indent=2, exclude_none=True)}"
+            )
+            await self.send_client_event(evt)
+            await asyncio.sleep(2)
+            # await self.send_client_event(events.ConversationItemCreateEvent(item=item))

    async def _create_response(self):
        if self._context.get_tools_list_updated():
            await self._update_settings()
+
+        # !!! DEBUGGING - testing await on conversation.create
+        logger.debug("!!! A waiting on conversation.created")
+        await asyncio.sleep(3)
+        logger.debug("!!! A ok, done waiting")
+
        await self._send_messages_context_update()
        logger.debug(f"Creating response: {self._context.get_messages_for_logging()}")
        await self.push_frame(LLMFullResponseStartFrame())
        await self.start_processing_metrics()
-        await self.send_client_event(events.ResponseCreateEvent())
+        await self.send_client_event(
+            events.ResponseCreateEvent(
+                response=events.ResponseProperties(modalities=["audio", "text"])
+            )
+        )
+        # !!! DEBUGGING
+        await asyncio.sleep(2)
+        # logger.debug("Unpausing microphone")
+        # self.set_audio_input_paused(False)

    async def _send_user_audio(self, frame):
        payload = base64.b64encode(frame.audio).decode("utf-8")