Compare commits

..

1 Commits

Author SHA1 Message Date
Kwindla Hultman Kramer
487f5b47bf ctrl-c for simple-sample.py 2023-12-31 19:35:52 -08:00
9 changed files with 30 additions and 395 deletions

View File

@@ -1,55 +0,0 @@
import argparse
import time
from dailyai.orchestrator import OrchestratorConfig, Orchestrator
from dailyai.message_handler.message_handler import MessageHandler
from dailyai.services.ai_services import AIServiceConfig
from dailyai.services.azure_ai_services import AzureTTSService, AzureLLMService
# For now, use Azure service for the TTS. Todo: make tts service
# and tts args (like which voice to use) configurable via command
# line arguments.
# Need the following environment variables:
# - AZURE_SPEECH_SERVICE_KEY
# - AZURE_SPEECH_SERVICE_REGION
def add_bot_to_room(room_url, text) -> None:
    """Start a bot in the Daily room at ``room_url`` that is prompted to say ``text``.

    Builds a message handler whose prompt asks for exactly ``text``, wires up
    the Azure TTS and LLM services (no image service), and starts an
    orchestrator for the room.

    Args:
        room_url: URL of the Daily room to join.
        text: The phrase the bot is prompted to respond with.
    """
    prompt = "Respond with only the following text: " + text
    message_handler = MessageHandler(prompt)

    services = AIServiceConfig(
        tts=AzureTTSService(), image=None, llm=AzureLLMService()
    )

    # The session expires ten seconds after this point.
    expires_at = time.time() + 10
    orchestrator_config = OrchestratorConfig(
        room_url=room_url,
        # todo: token should be optional
        token=None,
        bot_name="Minimal Speaking Bot",
        # todo: expiration should be optional
        expiration=expires_at,
    )

    orchestrator = Orchestrator(orchestrator_config, services, message_handler)
    orchestrator.start()
if __name__ == "__main__":
    # Command-line entry point: both options are required strings.
    parser = argparse.ArgumentParser(description="Say one phrase and exit")
    for flags, help_text in (
        (("-u", "--url"), "URL of the Daily room"),
        (("-t", "--text"), "text to send into the session as speech"),
    ):
        parser.add_argument(*flags, type=str, required=True, help=help_text)

    args: argparse.Namespace = parser.parse_args()
    add_bot_to_room(args.url, args.text)

View File

@@ -1,48 +0,0 @@
from dailyai.services.transport.DailyTransport import DailyTransportService
from dailyai.services.tts.AzureTTSService import AzureTTSService
# Module-level service handles. main() assigns them so that the event
# callbacks (say_one_thing, shutdown) can reach the same objects.
transport = None
mic = None
tts = None
def main():
    """Create the transport and TTS services, wire TTS into a mic queue, and start.

    Design notes carried over from the sketch:

    * Services are meant to be configurable from environment variables (API
      key, room url, ...) which each service defines and documents, or from
      an optional config object that is used instead of the environment.
    * The abstract transport API presumably maps pretty closely onto the
      daily-python basic API.
    """
    global transport, mic, tts

    def report_error(e):
        print(e)

    transport = DailyTransportService()
    tts = AzureTTSService()

    # Ask the transport for a local audio "device"/queue that plays chunks of
    # audio sequentially. `mic` is the handle used to inspect and control that
    # queue; here the TTS service pipes straight into it.
    mic = transport.create_audio_queue()
    tts.set_output(mic)

    transport.on("error", report_error)
    transport.on("joined-meeting", say_one_thing)
    transport.start()
def say_one_thing():
    """Speak a single greeting, then arrange to leave.

    After the greeting is handed to TTS, `shutdown` is registered on the mic
    queue's "audio-queue-empty" event.
    """
    greeting = "hello world"
    tts.run_tts(greeting)
    mic.on("audio-queue-empty", shutdown)
def shutdown():
    """Stop the transport and close the TTS service.

    ``tts.close()`` runs in a ``finally`` clause so the TTS service is still
    closed when ``transport.stop()`` raises; the exception then propagates to
    the caller unchanged.
    """
    try:
        transport.stop()
    finally:
        tts.close()

View File

@@ -1,35 +0,0 @@
from dailyai.services.transport.DailyTransport import DailyTransportService
from dailyai.services.llm.AzureLLMService import AzureLLMService
from dailyai.services.tts.AzureTTSService import AzureTTSService
# Module-level service handles. main() assigns them so that the event
# callbacks (make_one_inference_call, shutdown) can reach the same objects.
transport = None
llm = None
tts = None
def main():
    """Create transport, LLM and TTS services, chain LLM -> TTS -> mic, and start.

    The audio queue handle stays local to this function; the callbacks only
    need the module-level `transport`, `llm` and `tts`.
    """
    global transport, llm, tts

    def report_error(e):
        print(e)

    transport = DailyTransportService()
    llm = AzureLLMService()
    tts = AzureTTSService()

    # Output chain: LLM text goes to TTS, TTS audio goes to the mic queue.
    audio_queue = transport.create_audio_queue()
    tts.set_output(audio_queue)
    llm.set_output(tts)

    transport.on("error", report_error)
    transport.on("joined-meeting", make_one_inference_call)
    transport.start()
def make_one_inference_call():
    """Ask the LLM to say one thing, then arrange to leave.

    After the prompt is submitted, `shutdown` is registered on the
    transport's "audio-queue-empty" event.
    """
    prompt = "tell me a joke about llamas"
    llm.run_llm(prompt)
    transport.on("audio-queue-empty", shutdown)
def shutdown():
    """Stop the transport and close the TTS service.

    ``tts.close()`` runs in a ``finally`` clause so the TTS service is still
    closed when ``transport.stop()`` raises; the exception then propagates to
    the caller unchanged.
    """
    try:
        transport.stop()
    finally:
        tts.close()

View File

@@ -1,27 +0,0 @@
from dailyai.services.transport.DailyTransport import DailyTransportService
from dailyai.services.genimage.AzureDalleService import AzureDalleService
# Image-generation service handle; assigned in main() and used by say_one_thing().
dalle = None
def main():
    """Create the transport and image service, wire images into a video queue, and start.

    Only `dalle` is published at module level; the transport and the video
    queue handle stay local to this function.
    """
    global dalle

    def report_error(e):
        print(e)

    transport = DailyTransportService()
    dalle = AzureDalleService()

    # create_video_queue() could presumably take configuration parameters that
    # correspond to Daily video settings (resolution, framerate, target
    # bitrate, etc.)
    video_queue = transport.create_video_queue()
    dalle.set_output(video_queue)

    transport.on("error", report_error)
    transport.on("joined-meeting", say_one_thing)
    transport.start()
def say_one_thing():
    """Generate one image for the video queue, then just hang out.

    For simplicity this sample does not implement graceful shutdown :-)
    """
    image_prompt = "an astronaut riding a skateboard"
    dalle.generate_image(image_prompt)

View File

@@ -1,37 +0,0 @@
from dailyai.services.transport.DailyTransport import DailyTransportService
from dailyai.services.llm.AzureLLMService import AzureLLMService
from dailyai.services.tts.AzureTTSService import AzureTTSService
# Module-level service handles. main() assigns them so that the event
# callbacks (say_two_things, shutdown) can reach the same objects.
transport = None
llm = None
tts = None
def main():
    """Create transport, LLM and TTS services, chain LLM -> TTS -> mic, and start.

    The audio queue handle stays local to this function; the callbacks only
    need the module-level `transport`, `llm` and `tts`.
    """
    global transport, llm, tts

    def report_error(e):
        print(e)

    transport = DailyTransportService()
    llm = AzureLLMService()
    tts = AzureTTSService()

    # Output chain: LLM text goes to TTS, TTS audio goes to the mic queue.
    audio_queue = transport.create_audio_queue()
    tts.set_output(audio_queue)
    llm.set_output(tts)

    transport.on("error", report_error)
    transport.on("joined-meeting", say_two_things)
    transport.start()
def say_two_things():
    """Queue two pieces of speech, then arrange to leave.

    The first utterance is a text literal sent directly to TTS; the second is
    generated by the LLM. Afterwards `shutdown` is registered on the
    transport's "audio-queue-empty" event.
    """
    intro = "My friend the LLM is now going to tell a joke about llamas."
    joke_prompt = "tell me a joke about llamas"

    tts.run_tts(intro)
    llm.run_llm(joke_prompt)
    transport.on("audio-queue-empty", shutdown)
def shutdown():
    """Stop the transport and close the TTS service.

    ``tts.close()`` runs in a ``finally`` clause so the TTS service is still
    closed when ``transport.stop()`` raises; the exception then propagates to
    the caller unchanged.
    """
    try:
        transport.stop()
    finally:
        tts.close()

View File

@@ -1,101 +0,0 @@
from dailyai.services.transport.DailyTransport import DailyTransportService
from dailyai.services.llm.AzureLLMService import AzureLLMService
from dailyai.services.tts.AzureTTSService import AzureTTSService
from dailyai.services.genimage.AzureDalleService import AzureDalleService
from dailyai.services.utils.AudioImageSynchronizedPair import AudioImageSynchronizedPair
# Module-level service and queue handles. They start as None and are read by
# narrate_calendar_images() and shutdown() at module scope.
transport = None
llm = None
tts = None
dalle = None
mic = None
cam = None
def main():
    """Create the transport and AI services plus mic/cam queues, then start.

    The AI services are deliberately NOT wired to the mic and cam
    automatically: image/speech pairs have to be synchronized, which
    narrate_calendar_images() manages itself.
    """
    # mic and cam must be declared global too. narrate_calendar_images() reads
    # them at module scope; without the declaration the assignments below
    # would only create locals and the callback would see None for both.
    global transport, llm, tts, dalle, mic, cam

    transport = DailyTransportService()
    llm = AzureLLMService()
    tts = AzureTTSService()
    dalle = AzureDalleService()

    mic = transport.create_audio_queue()
    cam = transport.create_video_queue()

    transport.on("error", lambda e: print(e))
    transport.on("joined-meeting", narrate_calendar_images)
    transport.start()
def _pipe_to_synchronizer(synchronizer):
    """Return an LLM output callback bound to one specific ``synchronizer``.

    Building the callback in its own function gives each month its own
    binding. A lambda written directly in the loop would look up the loop
    variable when it is *called*, so callbacks that fire after the loop has
    moved on would all share the last synchronizer.
    """
    def on_inference_complete(inference_text):
        # Same text drives both the image and the speech for this month.
        return (
            dalle.generate_image(inference_text, output=synchronizer),
            tts.run_tts(inference_text, output=synchronizer),
        )

    return on_inference_complete


def narrate_calendar_images():
    """Queue a narrated calendar image for each month of the year.

    For each month name the LLM generates a description of a nice photograph
    for that month's calendar page. The description is then:

    1. turned into speech that is sent into the session as audio, and
    2. turned into an image that is sent into the session as video.

    Audio and video should be synchronized, so each month gets its own
    AudioImageSynchronizedPair helper.

    Design idea: LLM output can be piped into a function that takes the
    inference completion text as its argument, *or* into an object with more
    options (maybe a callback for streaming results, or a callback for
    inference completion, or both).

    Note that the month outputs might be queued out of order, but that's okay
    for this demo.
    """
    months = [
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    ]
    for month in months:
        synchronizer = AudioImageSynchronizedPair(
            audio_output=mic, video_output=cam)
        # The prompt opens with exactly three quotes; the original had a
        # fourth, which put a stray '"' at the start of every prompt.
        llm.run_llm(
            f"""
Describe a nature photograph suitable for use in a calendar,
for the month of {month}. Include only the image description
with no preamble.
""",
            output=_pipe_to_synchronizer(synchronizer),
        )
# the AudioImageSynchronizedPair class seems useful enough that I've listed
# it above as a standard utility we can import. but here's a theoretical
# implementation
class TheoreticalAudioImageSynchronizedPair:
    """Hold one image and one audio clip until both exist, then queue them together.

    Results are delivered through `image_generation_complete` and
    `tts_complete`. Each delivery stores its payload and checks whether the
    pair is complete; once it is, the image is queued on `video_output`
    followed by the audio on `audio_output`.
    """

    def __init__(self, audio_output, video_output):
        # Nothing has arrived yet.
        self.image = None
        self.audio = None
        self.audio_output = audio_output
        self.video_output = video_output

    def image_generation_complete(self, image):
        """Record the generated image and send the pair if the audio is ready."""
        self.image = image
        self._maybe_send()

    def tts_complete(self, audio):
        """Record the synthesized audio and send the pair if the image is ready."""
        self.audio = audio
        self._maybe_send()

    def _maybe_send(self):
        """Queue image then audio, but only once both halves are present."""
        if self.image is None or self.audio is None:
            return
        self.video_output.queue_frame(self.image)
        self.audio_output.queue_audio(self.audio)
def shutdown():
    """Stop the transport and close the TTS service.

    ``tts.close()`` runs in a ``finally`` clause so the TTS service is still
    closed when ``transport.stop()`` raises; the exception then propagates to
    the caller unchanged.
    """
    try:
        transport.stop()
    finally:
        tts.close()

View File

@@ -1,72 +0,0 @@
from dailyai.services.transport.DailyTransport import DailyTransportService
from dailyai.services.llm.AzureLLMService import AzureLLMService
from dailyai.services.tts.AzureTTSService import AzureTTSService
from dailyai.services.utils import Tee
from dailyai.services.utils import ReadySoundWav
# System prompt used as the first entry of the chat history below.
initial_prompt = "You are a helpful assistant. Introduce yourself and ask how you can be helpful."
# Running chat history as role/content dicts. listen() adds user turns and
# accumulate_assistant_messages() adds assistant turns.
llm_messages = [{
"role": "system",
"content": initial_prompt
}]
# Module-level service handles; main() assigns them so the callbacks
# (llm_prompt, listen) can reach the same objects.
transport = None
llm = None
tts = None
mic = None
transcription = None
def main():
    """Create the services, wire LLM -> (TTS + history) -> mic, set up transcription, start.

    LLM output goes through a `Tee` to both the TTS service and
    `accumulate_assistant_messages`; TTS output goes to the output queue.
    """
    global transport, llm, tts, mic, transcription

    def report_error(e):
        print(e)

    transport = DailyTransportService()
    llm = AzureLLMService()
    tts = AzureTTSService()

    # using Moishe's combined output queue rather than an audio-only queue
    mic = transport.create_output_queue(audio=True, video=False)

    llm_sink = Tee(tts, accumulate_assistant_messages)
    llm.set_output(llm_sink)
    tts.set_output(mic)

    # DailyTransport implements transcription internally. Grab a handle to
    # that transcription service, configure it to use silence-based
    # endpointing, and set the silence interval to 1.5 seconds.
    transcription = transport.transcription_service()
    transcription.configure(endpointing_pause=1.5)

    transport.on("error", report_error)
    transport.on("joined-meeting", llm_prompt)
    transport.start()
def llm_prompt():
    """Run the introduction prompt, then hand over to `listen`.

    `listen` is registered as a one-shot ("once") handler for the mic queue's
    "audio-queue-empty" event.
    """
    # NOTE(review): the prompt is a fixed literal; the accumulated
    # `llm_messages` history is not passed to the LLM here -- confirm whether
    # that is intended.
    prompt = """You are a friendly assistant. Introduce yourself and ask how you can be helpful"""
    llm.run_llm(prompt)
    mic.once("audio-queue-empty", listen)
def listen():
    """Play the ready sound, capture one user utterance, and prompt the LLM again.

    The captured text is added to `llm_messages` as a "user" entry before
    `llm_prompt()` is called.
    """
    mic.queue(ReadySoundWav)

    # ignore any transcription results that come in before we're ready
    transcription.read()

    user_text_input = transcription.read_until_silence()
    # Python lists have no push() method (that is the JavaScript spelling);
    # append() is the call that adds one item to the end.
    llm_messages.append({
        "role": "user",
        "content": user_text_input,
    })
    llm_prompt()
def accumulate_assistant_messages(completed_inference_text):
    """Record a completed LLM response in the chat history.

    Args:
        completed_inference_text: The text of the finished inference; stored
            as the "content" of a new "assistant" entry in `llm_messages`.
    """
    # Python lists have no push() method; append() adds the entry at the end.
    llm_messages.append({
        "role": "assistant",
        "content": completed_inference_text,
    })

View File

@@ -1,15 +0,0 @@
-01 just say one thing and exit
-02 llm say one thing and exit
-03 send "still frame" of video
-04 manual intro utterance and then llm say one thing and exit
-05 generate images for the months of the year, synchronized with their spoken descriptions
-06 chat: llm speak and respond (ignoring transcription input while speaking)
-07 chat: llm speak and respond (interruptible)
-08 two llms arguing about a topic (in the same process)
-09 two llms arguing about a topic (two separate bots)
-10 listen for wake word before sending commands to llm
-11 06 plus sound effects queued from sound file
-12 06 plus background music played through a second "mic" device

View File

@@ -4,6 +4,8 @@ from re import A
import requests
import time
import urllib.parse
import signal
from dailyai.async_processor.async_processor import (
LLMResponse,
@@ -14,11 +16,19 @@ from dailyai.message_handler.message_handler import MessageHandler
from dailyai.services.ai_services import AIServiceConfig
from dailyai.services.azure_ai_services import AzureImageGenService, AzureTTSService, AzureLLMService
orchestrator = None
message_handler = None
services = None
def add_bot_to_room(room_url, token, expiration) -> None:
global orchestrator
global message_handler
global services
# A simple prompt for a simple sample.
message_handler = MessageHandler(
"""
"""
You are a sample bot, meant to demonstrate how to use an LLM with transcription at TTS.
Answer user's questions and be friendly, and if you can, give some ideas about how someone
could use a bot like you in a more in-depth way. Because your responses will be spoken,
@@ -51,24 +61,37 @@ def add_bot_to_room(room_url, token, expiration) -> None:
expiration=expiration,
)
# khk note: my expectation was that we'd join the Daily session below
# when we call orchestrator.start(), but we actually join it here.
#
orchestrator = Orchestrator(
orchestrator_config,
services,
message_handler,
)
orchestrator.start()
orchestrator.start()
print("simple-sample.py should be finished now")
def keyboard_interrupt_handler(signal, frame):
print("keyboard interrupt handler: shutting down gracefully")
orchestrator.stop()
orchestrator.participant_left = True
print("we called orchestrator.stop() and set participant_left to True")
# When the orchestrator's done, we need to shut it down,
# and the various services and handlers we've created.
orchestrator.stop()
message_handler.shutdown()
services.tts.close()
services.llm.close()
print("we got past services.llm.close()")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
parser.add_argument("-u", "--url", type=str, required=True, help="URL of the Daily room")
parser.add_argument("-u", "--url", type=str,
required=True, help="URL of the Daily room")
parser.add_argument(
"-k", "--apikey", type=str, required=True, help="Daily API Key (needed to create token)"
)
@@ -88,8 +111,10 @@ if __name__ == "__main__":
)
if res.status_code != 200:
raise Exception(f'Failed to create meeting token: {res.status_code} {res.text}')
raise Exception(
f'Failed to create meeting token: {res.status_code} {res.text}')
token: str = res.json()['token']
signal.signal(signal.SIGINT, keyboard_interrupt_handler)
add_bot_to_room(args.url, token, expiration)