theoretical sample: basic voice chat

three more theoretical samples
mic doesn't need to be global in 02
2024-01-03 20:54:51 -08:00 · 2024-01-03 11:55:48 -08:00 · 2024-01-01 21:54:14 -08:00 · 2024-01-01 21:46:10 -08:00 · 2023-12-31 21:17:11 -08:00
8 changed files with 390 additions and 0 deletions
--- a/src/khk-working/functional/just-say-one-thing.py
+++ b/src/khk-working/functional/just-say-one-thing.py
@@ -0,0 +1,55 @@
+import argparse
+import time
+
+from dailyai.orchestrator import OrchestratorConfig, Orchestrator
+from dailyai.message_handler.message_handler import MessageHandler
+from dailyai.services.ai_services import AIServiceConfig
+from dailyai.services.azure_ai_services import AzureTTSService, AzureLLMService
+
+
+# For now, use Azure service for the TTS. Todo: make tts service
+# and tts args (like which voice to use) configurable via command
+# line arguments.
+# Need the following environment variables:
+# - AZURE_SPEECH_SERVICE_KEY
+# - AZURE_SPEECH_SERVICE_REGION
+
+
+def add_bot_to_room(room_url, text) -> None:
+    """Start a bot in the room at `room_url` that is prompted to say `text`.
+
+    The message handler's prompt is a fixed instruction prefix followed by
+    `text`. Azure services are used for both TTS and the LLM; no image
+    service is configured.
+    """
+    prompt = "Respond with only the following text: " + text
+    handler = MessageHandler(prompt)
+
+    speech = AzureTTSService()
+    language_model = AzureLLMService()
+    ai_services = AIServiceConfig(tts=speech, image=None, llm=language_model)
+
+    # expiration is ten seconds past the moment the config is built
+    expires_at = time.time() + 10
+    config = OrchestratorConfig(
+        room_url=room_url,
+        token=None,  # todo: token should be optional
+        bot_name="Minimal Speaking Bot",
+        expiration=expires_at,  # todo: expiration should be optional
+    )
+
+    Orchestrator(config, ai_services, handler).start()
+
+
+if __name__ == "__main__":
+    # command-line entry point: both the room URL and the text are required
+    cli = argparse.ArgumentParser(description="Say one phrase and exit")
+    cli.add_argument(
+        "-u",
+        "--url",
+        type=str,
+        required=True,
+        help="URL of the Daily room",
+    )
+    cli.add_argument(
+        "-t",
+        "--text",
+        type=str,
+        required=True,
+        help="text to send into the session as speech",
+    )
+
+    parsed: argparse.Namespace = cli.parse_args()
+    add_bot_to_room(parsed.url, parsed.text)
--- a/src/khk-working/theoretical/01-say-one-thing.py
+++ b/src/khk-working/theoretical/01-say-one-thing.py
@@ -0,0 +1,48 @@
+from dailyai.services.transport.DailyTransport import DailyTransportService
+from dailyai.services.tts.AzureTTSService import AzureTTSService
+
+
+# module-level handles shared between main() and the event callbacks below
+# (say_one_thing reads tts and mic; shutdown reads transport and tts).
+# all start as None.
+transport = None
+mic = None
+tts = None
+
+
+def main():
+    """Create the transport and TTS service, pipe TTS into an audio queue, and start.
+
+    Stores the transport, audio queue and TTS service in the module-level
+    names so the event callbacks can reach them, registers handlers for the
+    "error" and "joined-meeting" events, then starts the transport.
+    """
+    global transport
+    global mic
+    global tts
+
+    # create a transport service object using environment variables for
+    # the transport service's API key, room url, and any other configuration.
+    # services can all define and document the environment variables they use.
+    # services all also take an optional config object that is used instead of
+    # environment variables.
+    #
+    # the abstract transport service APIs presumably can map pretty closely
+    # to the daily-python basic API
+    transport = DailyTransportService()
+
+    # similarly, create a tts service
+    tts = AzureTTSService()
+
+    # ask the transport to create a local audio "device"/queue for
+    # chunks of audio to play sequentially. the "mic" object is a handle
+    # we can use to inspect and control the queue if we need to. in this
+    # case we will pipe into this queue from the tts service
+    mic = transport.create_audio_queue()
+    tts.set_output(mic)
+
+    # errors are only printed; speech begins once the meeting is joined
+    transport.on("error", lambda e: print(e))
+    transport.on("joined-meeting", say_one_thing)
+    transport.start()
+
+
+def say_one_thing():
+    """Handler for the transport's "joined-meeting" event: speak one phrase.
+
+    Sends "hello world" to the TTS service, then registers shutdown() for the
+    audio queue's "audio-queue-empty" event.
+    """
+    # say one thing, then leave
+    tts.run_tts("hello world")
+    # NOTE(review): the handler is registered after run_tts() is called;
+    # presumably the event cannot fire before this line runs -- confirm
+    mic.on("audio-queue-empty", shutdown)
+
+
+def shutdown():
+    """Stop the transport, then close the TTS service.
+
+    Registered by say_one_thing() as the "audio-queue-empty" handler.
+    """
+    transport.stop()
+    tts.close()
--- a/src/khk-working/theoretical/02-llm-say-one-thing.py
+++ b/src/khk-working/theoretical/02-llm-say-one-thing.py
@@ -0,0 +1,35 @@
+from dailyai.services.transport.DailyTransport import DailyTransportService
+from dailyai.services.llm.AzureLLMService import AzureLLMService
+from dailyai.services.tts.AzureTTSService import AzureTTSService
+
+# module-level handles shared between main() and the event callbacks below
+# (make_one_inference_call reads llm and transport; shutdown reads transport
+# and tts). all start as None.
+transport = None
+llm = None
+tts = None
+
+
+def main():
+    """Create the services, chain llm -> tts -> audio queue, and start the transport.
+
+    The audio queue handle (`mic`) is kept local because no callback in this
+    sample needs it.
+    """
+    global transport
+    global llm
+    global tts
+
+    transport = DailyTransportService()
+    llm = AzureLLMService()
+    tts = AzureTTSService()
+    mic = transport.create_audio_queue()
+    # llm output is sent to tts, and tts output is sent to the audio queue
+    tts.set_output(mic)
+    llm.set_output(tts)
+
+    transport.on("error", lambda e: print(e))
+    transport.on("joined-meeting", make_one_inference_call)
+    transport.start()
+
+
+def make_one_inference_call():
+    """Handler for "joined-meeting": run one LLM prompt, then arrange shutdown.
+
+    The "audio-queue-empty" handler is registered on the transport here
+    (the audio queue handle is local to main() in this sample).
+    """
+    # ask our llm to say one thing, then leave
+    llm.run_llm("tell me a joke about llamas")
+    transport.on("audio-queue-empty", shutdown)
+
+
+def shutdown():
+    """Stop the transport, then close the TTS service.
+
+    Registered by make_one_inference_call() as the "audio-queue-empty" handler.
+    """
+    transport.stop()
+    tts.close()
--- a/src/khk-working/theoretical/03-generate-one-video-frame.py
+++ b/src/khk-working/theoretical/03-generate-one-video-frame.py
@@ -0,0 +1,27 @@
+from dailyai.services.transport.DailyTransport import DailyTransportService
+from dailyai.services.genimage.AzureDalleService import AzureDalleService
+
+# module-level handle for the image service; assigned in main() and read by
+# the say_one_thing() callback. starts as None.
+dalle = None
+
+
+def main():
+    """Create the transport and image service, pipe images into a video queue, and start.
+
+    Only `dalle` is stored globally; the transport and the video queue handle
+    stay local because no callback in this sample uses them.
+    """
+    global dalle
+
+    transport = DailyTransportService()
+    dalle = AzureDalleService()
+
+    # create_video_queue() could presumably take configuration parameters that
+    # correspond to Daily video settings (resolution, framerate, target
+    # bitrate, etc.)
+    cam = transport.create_video_queue()
+    dalle.set_output(cam)
+
+    transport.on("error", lambda e: print(e))
+    transport.on("joined-meeting", say_one_thing)
+    transport.start()
+
+
+def say_one_thing():
+    """Handler for "joined-meeting": generate a single image.
+
+    NOTE(review): despite its name, this handler produces an image rather
+    than speech.
+    """
+    # make one image, send it to the video queue, then just hang out.
+    # for simplicity we have not implemented graceful shutdown :-)
+    dalle.generate_image("an astronaut riding a skateboard")
--- a/src/khk-working/theoretical/04-say-two-things.py
+++ b/src/khk-working/theoretical/04-say-two-things.py
@@ -0,0 +1,37 @@
+from dailyai.services.transport.DailyTransport import DailyTransportService
+from dailyai.services.llm.AzureLLMService import AzureLLMService
+from dailyai.services.tts.AzureTTSService import AzureTTSService
+
+# module-level handles shared between main() and the event callbacks below
+# (say_two_things reads tts, llm and transport; shutdown reads transport and
+# tts). all start as None.
+transport = None
+llm = None
+tts = None
+
+
+def main():
+    """Create the services, chain llm -> tts -> audio queue, and start the transport.
+
+    The audio queue handle (`mic`) is kept local because no callback in this
+    sample needs it.
+    """
+    global transport
+    global llm
+    global tts
+
+    transport = DailyTransportService()
+    llm = AzureLLMService()
+    tts = AzureTTSService()
+    mic = transport.create_audio_queue()
+    # llm output is sent to tts, and tts output is sent to the audio queue
+    tts.set_output(mic)
+    llm.set_output(tts)
+
+    transport.on("error", lambda e: print(e))
+    transport.on("joined-meeting", say_two_things)
+    transport.start()
+
+
+def say_two_things():
+    """Handler for "joined-meeting": speak a fixed intro, then an LLM-generated joke.
+
+    Afterwards registers shutdown() for the transport's "audio-queue-empty"
+    event.
+    """
+    # queue two pieces of speech: one specified as a text literal,
+    # and one generated by an llm
+    tts.run_tts("My friend the LLM is now going to tell a joke about llamas.")
+    llm.run_llm("tell me a joke about llamas")
+    transport.on("audio-queue-empty", shutdown)
+
+
+def shutdown():
+    """Stop the transport, then close the TTS service.
+
+    Registered by say_two_things() as the "audio-queue-empty" handler.
+    """
+    transport.stop()
+    tts.close()
--- a/src/khk-working/theoretical/05-llm-speech-and-images.py
+++ b/src/khk-working/theoretical/05-llm-speech-and-images.py
@@ -0,0 +1,101 @@
+from dailyai.services.transport.DailyTransport import DailyTransportService
+from dailyai.services.llm.AzureLLMService import AzureLLMService
+from dailyai.services.tts.AzureTTSService import AzureTTSService
+from dailyai.services.genimage.AzureDalleService import AzureDalleService
+from dailyai.services.utils.AudioImageSynchronizedPair import AudioImageSynchronizedPair
+
+# module-level handles for the callbacks below (narrate_calendar_images reads
+# llm, dalle, tts, mic and cam; shutdown reads transport and tts).
+# all start as None.
+transport = None
+llm = None
+tts = None
+dalle = None
+mic = None
+cam = None
+
+
+def main():
+    """Create the transport and AI services, open the mic and cam queues, and start.
+
+    All six handles are stored in module-level names so that
+    narrate_calendar_images() and shutdown() can reach them.
+    """
+    global transport
+    global llm
+    global tts
+    global dalle
+    # mic and cam must be declared global as well: narrate_calendar_images()
+    # reads the module-level names, and without these declarations the
+    # assignments below would only create locals and leave them as None
+    global mic
+    global cam
+
+    transport = DailyTransportService()
+    llm = AzureLLMService()
+    tts = AzureTTSService()
+    dalle = AzureDalleService()
+
+    # set up mic and cam. but don't wire up automatic output to the mic
+    # and cam from our AI services because we need to manage synchronization
+    # of image/speech pairings
+    mic = transport.create_audio_queue()
+    cam = transport.create_video_queue()
+
+    transport.on("error", lambda e: print(e))
+    transport.on("joined-meeting", narrate_calendar_images)
+    transport.start()
+
+
+def narrate_calendar_images():
+    """Handler for "joined-meeting": describe, illustrate and narrate each month.
+
+    For every month name, asks the LLM for a calendar-photo description and
+    sends that text to both the image generator and the TTS service, sharing
+    one AudioImageSynchronizedPair per month as their output.
+    """
+    # let's loop over the months of the year. for each month name, we will have
+    # our llm generate a description of a nice photograph for that month's page
+    # in a calendar.
+    #
+    # then we'll take the text description and:
+    #  1. turn it into speech that we send into the session as audio
+    #  2. turn it into an image that we send into the session as video
+    # we want the audio and video to be synchronized, so we'll use a helper
+    # class to manage that.
+    #
+    # each `run_llm()` call is given a function to process its output.
+    #
+    # the design idea here is that output can be piped into a function that
+    # takes inference completion text as its argument. *or* output can be
+    # piped into an object that has more options (maybe a callback for streaming
+    # results, or a callback for inference completion, or both).
+    #
+    # note that we might queue up the month outputs out of order, but that's
+    # okay for this demo
+    #
+    def paired_output(synchronizer):
+        # build the callback inside a factory so it captures *this* month's
+        # synchronizer. a lambda written directly in the loop body would look
+        # the name up when it runs, and every deferred callback would then
+        # share the last month's synchronizer.
+        return lambda inference_text: (
+            dalle.generate_image(inference_text, output=synchronizer),
+            tts.run_tts(inference_text, output=synchronizer)
+        )
+
+    for month in ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]:
+        synchronizer = AudioImageSynchronizedPair(
+            audio_output=mic, video_output=cam)
+        # the prompt opens with exactly three quotes; a fourth quote would
+        # become a literal '"' at the start of the text sent to the llm
+        llm.run_llm(
+            f"""
+            Describe a nature photograph suitable for use in a calendar,
+            for the month of {month}. Include only the image description
+            with no preamble.
+            """,
+            output=paired_output(synchronizer),
+        )
+
+
+# the AudioImageSynchronizedPair class seems useful enough that I've listed
+# it above as a standard utility we can import. but here's a theoretical
+# implementation
+
+class TheoreticalAudioImageSynchronizedPair:
+    """Collects one image and one audio clip and queues both once each has arrived.
+
+    The image goes to `video_output.queue_frame()` and the audio to
+    `audio_output.queue_audio()`; nothing is queued while either is missing.
+    """
+
+    def __init__(self, audio_output, video_output):
+        # nothing has been received yet
+        self.image = None
+        self.audio = None
+        self.video_output = video_output
+        self.audio_output = audio_output
+
+    def image_generation_complete(self, image):
+        """Record the generated image and send the pair if the audio is in."""
+        self.image = image
+        self._maybe_send()
+
+    def tts_complete(self, audio):
+        """Record the synthesized audio and send the pair if the image is in."""
+        self.audio = audio
+        self._maybe_send()
+
+    def _maybe_send(self):
+        """Queue the frame, then the audio, but only when both are present."""
+        if self.image is None or self.audio is None:
+            return
+        self.video_output.queue_frame(self.image)
+        self.audio_output.queue_audio(self.audio)
+
+
+def shutdown():
+    """Stop the transport, then close the TTS service.
+
+    NOTE(review): nothing in this sample registers or calls this function.
+    """
+    transport.stop()
+    tts.close()
--- a/src/khk-working/theoretical/06-llm-voice-chat.py
+++ b/src/khk-working/theoretical/06-llm-voice-chat.py
@@ -0,0 +1,72 @@
+from dailyai.services.transport.DailyTransport import DailyTransportService
+from dailyai.services.llm.AzureLLMService import AzureLLMService
+from dailyai.services.tts.AzureTTSService import AzureTTSService
+from dailyai.services.utils import Tee
+from dailyai.services.utils import ReadySoundWav
+
+# system prompt that seeds the conversation history below
+initial_prompt = "You are a helpful assistant. Introduce yourself and ask how you can be helpful."
+
+# running conversation history; listen() and accumulate_assistant_messages()
+# add the user and assistant turns to it
+llm_messages = [{
+    "role": "system",
+    "content": initial_prompt
+}]
+
+
+# module-level handles shared between main() and the callbacks below.
+# all start as None.
+transport = None
+llm = None
+tts = None
+mic = None
+transcription = None
+
+
+def main():
+    """Create the services, wire llm -> tts -> output queue, configure transcription, and start.
+
+    LLM output is split with Tee so that it goes both to the TTS service and
+    to accumulate_assistant_messages().
+    """
+    global transport
+    global llm
+    global tts
+    global mic
+    global transcription
+
+    transport = DailyTransportService()
+    llm = AzureLLMService()
+    tts = AzureTTSService()
+
+    # using Moishe's combined output queue rather than an audio-only queue
+    mic = transport.create_output_queue(audio=True, video=False)
+
+    llm.set_output(Tee(tts, accumulate_assistant_messages))
+    tts.set_output(mic)
+
+    # DailyTransport implements transcription internally. we'll grab a handle to this
+    # Transcription service, configure it to use silence-based endpointing, and
+    # set the silence interval to 1.5 seconds
+    transcription = transport.transcription_service()
+    transcription.configure(endpointing_pause=1.5)
+
+    transport.on("error", lambda e: print(e))
+    transport.on("joined-meeting", llm_prompt)
+    transport.start()
+
+
+def llm_prompt():
+    """Run the LLM, then start listening once the output queue has drained.
+
+    Called on "joined-meeting" and again by listen() after each user turn.
+
+    NOTE(review): this sends a fixed literal prompt on every call and does
+    not pass llm_messages, so the accumulated history is never given to the
+    llm here -- confirm whether that is intended.
+    """
+    llm.run_llm(
+        """You are a friendly assistant. Introduce yourself and ask how you can be helpful""")
+    # once(): the listen handler is registered anew on each call
+    mic.once("audio-queue-empty", listen)
+
+
+def listen():
+    """Play the ready sound, capture one user utterance, and prompt the LLM again.
+
+    The utterance read from the transcription service is recorded in
+    llm_messages as a "user" turn before llm_prompt() is called.
+    """
+    mic.queue(ReadySoundWav)
+    # ignore any transcription results that come in before we're ready
+    _ = transcription.read()
+    user_text_input = transcription.read_until_silence()
+    # llm_messages is a plain Python list: append() is the method that adds
+    # an item (lists have no push())
+    llm_messages.append({
+        "role": "user",
+        "content": user_text_input
+    })
+    llm_prompt()
+
+
+def accumulate_assistant_messages(completed_inference_text):
+    """Record a completed LLM response in llm_messages as an "assistant" turn.
+
+    Passed to Tee in main() alongside the TTS service as an LLM output.
+    """
+    # llm_messages is a plain Python list: append() is the method that adds
+    # an item (lists have no push())
+    llm_messages.append({
+        "role": "assistant",
+        "content": completed_inference_text
+    })
--- a/src/khk-working/theoretical/notes.txt
+++ b/src/khk-working/theoretical/notes.txt
@@ -0,0 +1,15 @@
+
+-01 just say one thing and exit
+-02 llm say one thing and exit
+-03 send "still frame" of video 
+-04 manual intro utterance and then llm say one thing and exit
+-05 generate images for the months of the year, synchronized with their spoken descriptions
+-06 chat: llm speak and respond (ignoring transcription input while speaking)
+-07 chat: llm speak and respond (interruptible)
+-08 two llms arguing about a topic (in the same process)
+-09 two llms arguing about a topic (two separate bots)
+-10 listen for wake word before sending commands to llm
+-11 06 plus sound effects queued from sound file
+-12 06 plus background music played through a second "mic" device 
+
+
Author	SHA1	Message	Date
Kwindla Hultman Kramer	297b9402a8	theoretical sample: basic voice chat	2024-01-03 20:54:51 -08:00
Kwindla Hultman Kramer	36f4001877	three more theoretical samples	2024-01-03 11:55:48 -08:00
Kwindla Hultman Kramer	4ee34ce796	mic doesn't need to be global in 02	2024-01-01 21:54:14 -08:00
Kwindla Hultman Kramer	0db2cf5a80	working on theoretical API examples	2024-01-01 21:46:10 -08:00
Kwindla Hultman Kramer	72aa034c85	start of khk minimal samples	2023-12-31 21:17:11 -08:00