Handle ctrl-c (SIGINT) in simple-sample.py; remove khk-working prototype scripts

2023-12-31 19:35:52 -08:00
9 changed files with 30 additions and 395 deletions
--- a/src/khk-working/functional/just-say-one-thing.py
+++ b/src/khk-working/functional/just-say-one-thing.py
@@ -1,55 +0,0 @@
-import argparse
-import time
-
-from dailyai.orchestrator import OrchestratorConfig, Orchestrator
-from dailyai.message_handler.message_handler import MessageHandler
-from dailyai.services.ai_services import AIServiceConfig
-from dailyai.services.azure_ai_services import AzureTTSService, AzureLLMService
-
-
-# For now, use Azure service for the TTS. Todo: make tts service
-# and tts args (like which voice to use) configurable via command
-# line arguments.
-# Need the following environment variables:
-# - AZURE_SPEECH_SERVICE_KEY
-# - AZURE_SPEECH_SERVICE_REGION
-
-
-def add_bot_to_room(room_url, text) -> None:
-    message_handler = MessageHandler(
-        "Respond with only the following text: " + text)
-
-    services = AIServiceConfig(
-        tts=AzureTTSService(), image=None, llm=AzureLLMService()
-    )
-
-    orchestrator_config = OrchestratorConfig(
-        room_url=room_url,
-        # todo: token should be optional
-        token=None,
-        bot_name="Minimal Speaking Bot",
-        # todo: expiration should be optional
-        expiration=time.time() + 10
-    )
-
-    orchestrator = Orchestrator(
-        orchestrator_config,
-        services,
-        message_handler,
-    )
-
-    orchestrator.start()
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Say one phrase and exit")
-    parser.add_argument("-u", "--url", type=str,
-                        required=True, help="URL of the Daily room")
-
-    parser.add_argument(
-        "-t", "--text", type=str, required=True, help="text to send into the session as speech"
-    )
-
-    args: argparse.Namespace = parser.parse_args()
-
-    add_bot_to_room(args.url, args.text)
--- a/src/khk-working/theoretical/01-say-one-thing.py
+++ b/src/khk-working/theoretical/01-say-one-thing.py
@@ -1,48 +0,0 @@
-from dailyai.services.transport.DailyTransport import DailyTransportService
-from dailyai.services.tts.AzureTTSService import AzureTTSService
-
-
-transport = None
-mic = None
-tts = None
-
-
-def main():
-    global transport
-    global mic
-    global tts
-
-    # create a transport service object using environment variables for
-    # the transport service's API key, room url, and any other configuration.
-    # services can all define and document the environment variables they use.
-    # services all also take an optional config object that is used instead of
-    # environment variables.
-    #
-    # the abstract transport service APIs presumably can map pretty closely
-    # to the daily-python basic API
-    transport = DailyTransportService()
-
-    # similarly, create a tts service
-    tts = AzureTTSService()
-
-    # ask the transport to create a local audio "device"/queue for
-    # chunks of audio to play sequentially. the "mic" object is a handle
-    # we can use to inspect and control the queue if we need to. in this
-    # case we will pipe into this queue from the tts service
-    mic = transport.create_audio_queue()
-    tts.set_output(mic)
-
-    transport.on("error", lambda e: print(e))
-    transport.on("joined-meeting", say_one_thing)
-    transport.start()
-
-
-def say_one_thing():
-    # say one thing, then leave
-    tts.run_tts("hello world")
-    mic.on("audio-queue-empty", shutdown)
-
-
-def shutdown():
-    transport.stop()
-    tts.close()
--- a/src/khk-working/theoretical/02-llm-say-one-thing.py
+++ b/src/khk-working/theoretical/02-llm-say-one-thing.py
@@ -1,35 +0,0 @@
-from dailyai.services.transport.DailyTransport import DailyTransportService
-from dailyai.services.llm.AzureLLMService import AzureLLMService
-from dailyai.services.tts.AzureTTSService import AzureTTSService
-
-transport = None
-llm = None
-tts = None
-
-
-def main():
-    global transport
-    global llm
-    global tts
-
-    transport = DailyTransportService()
-    llm = AzureLLMService()
-    tts = AzureTTSService()
-    mic = transport.create_audio_queue()
-    tts.set_output(mic)
-    llm.set_output(tts)
-
-    transport.on("error", lambda e: print(e))
-    transport.on("joined-meeting", make_one_inference_call)
-    transport.start()
-
-
-def make_one_inference_call():
-    # ask our llm to say one thing, then leave
-    llm.run_llm("tell me a joke about llamas")
-    transport.on("audio-queue-empty", shutdown)
-
-
-def shutdown():
-    transport.stop()
-    tts.close()
--- a/src/khk-working/theoretical/03-generate-one-video-frame.py
+++ b/src/khk-working/theoretical/03-generate-one-video-frame.py
@@ -1,27 +0,0 @@
-from dailyai.services.transport.DailyTransport import DailyTransportService
-from dailyai.services.genimage.AzureDalleService import AzureDalleService
-
-dalle = None
-
-
-def main():
-    global dalle
-
-    transport = DailyTransportService()
-    dalle = AzureDalleService()
-
-    # create_video_queue() could presumably take configuration parameters that
-    # correspond to Daily video settings (resolution, framerate, target
-    # bitrate, etc.)
-    cam = transport.create_video_queue()
-    dalle.set_output(cam)
-
-    transport.on("error", lambda e: print(e))
-    transport.on("joined-meeting", say_one_thing)
-    transport.start()
-
-
-def say_one_thing():
-    # make one image, send it to the video queue, then just hang out.
-    # for simplicity we have not implemented graceful shutdown :-)
-    dalle.generate_image("an astronaut riding a skateboard")
--- a/src/khk-working/theoretical/04-say-two-things.py
+++ b/src/khk-working/theoretical/04-say-two-things.py
@@ -1,37 +0,0 @@
-from dailyai.services.transport.DailyTransport import DailyTransportService
-from dailyai.services.llm.AzureLLMService import AzureLLMService
-from dailyai.services.tts.AzureTTSService import AzureTTSService
-
-transport = None
-llm = None
-tts = None
-
-
-def main():
-    global transport
-    global llm
-    global tts
-
-    transport = DailyTransportService()
-    llm = AzureLLMService()
-    tts = AzureTTSService()
-    mic = transport.create_audio_queue()
-    tts.set_output(mic)
-    llm.set_output(tts)
-
-    transport.on("error", lambda e: print(e))
-    transport.on("joined-meeting", say_two_things)
-    transport.start()
-
-
-def say_two_things():
-    # queue two pieces of speech: one specified as a text literal,
-    # and one generated by an llm
-    tts.run_tts("My friend the LLM is now going to tell a joke about llamas.")
-    llm.run_llm("tell me a joke about llamas")
-    transport.on("audio-queue-empty", shutdown)
-
-
-def shutdown():
-    transport.stop()
-    tts.close()
--- a/src/khk-working/theoretical/05-llm-speech-and-images.py
+++ b/src/khk-working/theoretical/05-llm-speech-and-images.py
@@ -1,101 +0,0 @@
-from dailyai.services.transport.DailyTransport import DailyTransportService
-from dailyai.services.llm.AzureLLMService import AzureLLMService
-from dailyai.services.tts.AzureTTSService import AzureTTSService
-from dailyai.services.genimage.AzureDalleService import AzureDalleService
-from dailyai.services.utils.AudioImageSynchronizedPair import AudioImageSynchronizedPair
-
-transport = None
-llm = None
-tts = None
-dalle = None
-mic = None
-cam = None
-
-
-def main():
-    global transport
-    global llm
-    global tts
-    global dalle
-
-    transport = DailyTransportService()
-    llm = AzureLLMService()
-    tts = AzureTTSService()
-    dalle = AzureDalleService()
-
-    # set up mic and cam. but don't wire up automatic output to the mic
-    # and cam from our AI services because we need to manage synchronization
-    # of image/speech pairings
-    mic = transport.create_audio_queue()
-    cam = transport.create_video_queue()
-
-    transport.on("error", lambda e: print(e))
-    transport.on("joined-meeting", narrate_calendar_images)
-    transport.start()
-
-
-def narrate_calendar_images():
-    # let's loop over the months of the year. for each month name, we will have
-    # our llm generate a description of a nice photograph for that month's page
-    # in a calendar.
-    #
-    # then we'll take the text description and:
-    #  1. turn it into speech that we send into the session as audio
-    #  2. turn it into an image that we send into the session as video
-    # we want the audio and video to be synchronized, so we'll use a helper
-    # class to manage that.
-    #
-    # the first `run_llm()` call defines a lambda to process its output.
-    #
-    # the design idea here is that output can be piped into a function that
-    # takes inference completion text as its argument. *or* output can be
-    # piped into an object that has more options (maybe a callback for streaming
-    # results, or a callback for inference completion, or both).
-    #
-    # note that we might queue up the month outputs out of order, but that's
-    # okay for this demo
-    #
-    for month in ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]:
-        synchronizer = AudioImageSynchronizedPair(
-            audio_output=mic, video_output=cam)
-        llm.run_llm(
-            f""""
-            Describe a nature photograph suitable for use in a calendar,
-            for the month of {month}. Include only the image description
-            with no preamble.
-            """,
-            output=lambda inference_text: (
-                dalle.generate_image(inference_text, output=synchronizer),
-                tts.run_tts(inference_text, output=synchronizer)
-            ),
-        )
-
-
-# the AudioImageSynchronizedPair class seems useful enough that I've listed
-# it above as a standard utility we can import. but here's a theoretical
-# implementation
-
-class TheoreticalAudioImageSynchronizedPair:
-    def __init__(self, audio_output, video_output):
-        self.audio_output = audio_output
-        self.video_output = video_output
-        self.image = None
-        self.audio = None
-
-    def image_generation_complete(self, image):
-        self.image = image
-        self._maybe_send()
-
-    def tts_complete(self, audio):
-        self.audio = audio
-        self._maybe_send()
-
-    def _maybe_send(self):
-        if self.image is not None and self.audio is not None:
-            self.video_output.queue_frame(self.image)
-            self.audio_output.queue_audio(self.audio)
-
-
-def shutdown():
-    transport.stop()
-    tts.close()
--- a/src/khk-working/theoretical/06-llm-voice-chat.py
+++ b/src/khk-working/theoretical/06-llm-voice-chat.py
@@ -1,72 +0,0 @@
-from dailyai.services.transport.DailyTransport import DailyTransportService
-from dailyai.services.llm.AzureLLMService import AzureLLMService
-from dailyai.services.tts.AzureTTSService import AzureTTSService
-from dailyai.services.utils import Tee
-from dailyai.services.utils import ReadySoundWav
-
-initial_prompt = "You are a helpful assistant. Introduce yourself and ask how you can be helpful."
-
-llm_messages = [{
-    "role": "system",
-    "content": initial_prompt
-}]
-
-
-transport = None
-llm = None
-tts = None
-mic = None
-transcription = None
-
-
-def main():
-    global transport
-    global llm
-    global tts
-    global mic
-    global transcription
-
-    transport = DailyTransportService()
-    llm = AzureLLMService()
-    tts = AzureTTSService()
-
-    # using Moishe's combined output queue rather than an audio-only queue
-    mic = transport.create_output_queue(audio=True, video=False)
-
-    llm.set_output(Tee(tts, accumulate_assistant_messages))
-    tts.set_output(mic)
-
-    # DailyTransport implements transcription internally. we'll grab a handle to this
-    # Transcription service, configure it to use silence-based endpointing, and
-    # set the silence interval to 1.5 seconds
-    transcription = transport.transcription_service()
-    transcription.configure(endpointing_pause=1.5)
-
-    transport.on("error", lambda e: print(e))
-    transport.on("joined-meeting", llm_prompt)
-    transport.start()
-
-
-def llm_prompt():
-    llm.run_llm(
-        """You are a friendly assistant. Introduce yourself and ask how you can be helpful""")
-    mic.once("audio-queue-empty", listen)
-
-
-def listen():
-    mic.queue(ReadySoundWav)
-    # ignore any transcription results that come in before we're ready
-    _ = transcription.read()
-    user_text_input = transcription.read_until_silence()
-    llm_messages.push({
-        "role": "user",
-        "content": user_text_input
-    })
-    llm_prompt()
-
-
-def accumulate_assistant_messages(completed_inference_text):
-    llm_messages.push({
-        "role": "assistant",
-        "content": completed_inference_text
-    })
--- a/src/khk-working/theoretical/notes.txt
+++ b/src/khk-working/theoretical/notes.txt
@@ -1,15 +0,0 @@
-
-01 just say one thing and exit
-02 llm say one thing and exit
-03 send "still frame" of video 
-04 manual intro utterance and then llm say one thing and exit
-05 generate images for the months of the year, synchronized with their spoken descriptions
-06 chat: llm speak and respond (ignoring transcription input while speaking)
-07 chat: llm speak and respond (interruptible)
-08 two llms arguing about a topic (in the same process)
-09 two llms arguing about a topic (two separate bots)
-10 listen for wake word before sending commands to llm
-11 06 plus sound effects queued from sound file
-12 06 plus background music played through a second "mic" device 
-
-
--- a/src/samples/simple-sample/simple-sample.py
+++ b/src/samples/simple-sample/simple-sample.py
@@ -4,6 +4,8 @@ from re import A
 import requests
 import time
 import urllib.parse
+import signal
+

 from dailyai.async_processor.async_processor import (
    LLMResponse,
@@ -14,11 +16,19 @@ from dailyai.message_handler.message_handler import MessageHandler
 from dailyai.services.ai_services import AIServiceConfig
 from dailyai.services.azure_ai_services import AzureImageGenService, AzureTTSService, AzureLLMService

+orchestrator = None
+message_handler = None
+services = None
+
+
 def add_bot_to_room(room_url, token, expiration) -> None:
+    global orchestrator
+    global message_handler
+    global services

    # A simple prompt for a simple sample.
    message_handler = MessageHandler(
-    """
+        """
        You are a sample bot, meant to demonstrate how to use an LLM with transcription at TTS.
        Answer user's questions and be friendly, and if you can, give some ideas about how someone
        could use a bot like you in a more in-depth way. Because your responses will be spoken,
@@ -51,24 +61,37 @@ def add_bot_to_room(room_url, token, expiration) -> None:
        expiration=expiration,
    )

+    # khk note: my expectation was that we'd join the Daily session below,
+    # when we call orchestrator.start(), but we actually join it here, as
+    # soon as the Orchestrator object is constructed.
    orchestrator = Orchestrator(
        orchestrator_config,
        services,
        message_handler,
    )
-    orchestrator.start()

+    orchestrator.start()
+    print("simple-sample.py should be finished now")
+
+
+def keyboard_interrupt_handler(signal, frame):
+    print("keyboard interrupt handler: shutting down gracefully")
+    orchestrator.stop()
+    orchestrator.participant_left = True
+    print("we called orchestrator.stop() and set participant_left to True")
    # When the orchestrator's done, we need to shut it down,
    # and the various services and handlers we've created.
-    orchestrator.stop()
    message_handler.shutdown()

    services.tts.close()
    services.llm.close()
+    print("we got past services.llm.close()")
+

 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
-    parser.add_argument("-u", "--url", type=str, required=True, help="URL of the Daily room")
+    parser.add_argument("-u", "--url", type=str,
+                        required=True, help="URL of the Daily room")
    parser.add_argument(
        "-k", "--apikey", type=str, required=True, help="Daily API Key (needed to create token)"
    )
@@ -88,8 +111,10 @@ if __name__ == "__main__":
    )

    if res.status_code != 200:
-        raise Exception(f'Failed to create meeting token: {res.status_code} {res.text}')
+        raise Exception(
+            f'Failed to create meeting token: {res.status_code} {res.text}')

    token: str = res.json()['token']

+    signal.signal(signal.SIGINT, keyboard_interrupt_handler)
    add_bot_to_room(args.url, token, expiration)