Compare commits

...

5 Commits

Author SHA1 Message Date
Kwindla Hultman Kramer
297b9402a8 theoretical sample: basic voice chat 2024-01-03 20:54:51 -08:00
Kwindla Hultman Kramer
36f4001877 three more theoretical samples 2024-01-03 11:55:48 -08:00
Kwindla Hultman Kramer
4ee34ce796 mic doesn't need to be global in 02 2024-01-01 21:54:14 -08:00
Kwindla Hultman Kramer
0db2cf5a80 working on theoretical API examples 2024-01-01 21:46:10 -08:00
Kwindla Hultman Kramer
72aa034c85 start of khk minimal samples 2023-12-31 21:17:11 -08:00
8 changed files with 390 additions and 0 deletions

View File

@@ -0,0 +1,55 @@
import argparse
import time
from dailyai.orchestrator import OrchestratorConfig, Orchestrator
from dailyai.message_handler.message_handler import MessageHandler
from dailyai.services.ai_services import AIServiceConfig
from dailyai.services.azure_ai_services import AzureTTSService, AzureLLMService
# For now, use Azure service for the TTS. Todo: make tts service
# and tts args (like which voice to use) configurable via command
# line arguments.
# Need the following environment variables:
# - AZURE_SPEECH_SERVICE_KEY
# - AZURE_SPEECH_SERVICE_REGION
def add_bot_to_room(room_url, text) -> None:
    """Start a bot in a room that is instructed to speak exactly `text`.

    Builds a message handler, a bundle of Azure-backed AI services and an
    orchestrator configuration, then starts the orchestrator.

    Args:
        room_url: URL of the Daily room; forwarded to OrchestratorConfig.
        text: the phrase appended to the "respond with only" instruction.
    """
    instruction = "Respond with only the following text: " + text
    message_handler = MessageHandler(instruction)

    # The TTS service is constructed before the LLM service, and no image
    # service is configured for this sample.
    tts_service = AzureTTSService()
    llm_service = AzureLLMService()
    services = AIServiceConfig(tts=tts_service, image=None, llm=llm_service)

    # todo: token and expiration should both be optional
    expires_at = time.time() + 10
    orchestrator_config = OrchestratorConfig(
        room_url=room_url,
        token=None,
        bot_name="Minimal Speaking Bot",
        expiration=expires_at,
    )

    Orchestrator(orchestrator_config, services, message_handler).start()
if __name__ == "__main__":
    # Command-line entry point: both the room URL and the phrase to speak
    # are required options.
    parser = argparse.ArgumentParser(description="Say one phrase and exit")
    parser.add_argument(
        "-u",
        "--url",
        type=str,
        required=True,
        help="URL of the Daily room",
    )
    parser.add_argument(
        "-t",
        "--text",
        type=str,
        required=True,
        help="text to send into the session as speech",
    )
    args: argparse.Namespace = parser.parse_args()
    add_bot_to_room(args.url, args.text)

View File

@@ -0,0 +1,48 @@
from dailyai.services.transport.DailyTransport import DailyTransportService
from dailyai.services.tts.AzureTTSService import AzureTTSService
# Module-level service handles. main() assigns them; the event callbacks
# below read them.
transport = mic = tts = None
def main():
    """Create the transport and TTS service, wire them up, and start."""
    global transport, mic, tts

    # create a transport service object using environment variables for
    # the transport service's API key, room url, and any other configuration.
    # services can all define and document the environment variables they use.
    # services all also take an optional config object that is used instead of
    # environment variables.
    #
    # the abstract transport service APIs presumably can map pretty closely
    # to the daily-python basic API
    transport = DailyTransportService()

    # similarly, create a tts service
    tts = AzureTTSService()

    # ask the transport to create a local audio "device"/queue for
    # chunks of audio to play sequentially. the "mic" object is a handle
    # we can use to inspect and control the queue if we need to. in this
    # case we will pipe into this queue from the tts service
    mic = transport.create_audio_queue()
    tts.set_output(mic)

    def report_error(e):
        # errors are simply printed
        print(e)

    transport.on("error", report_error)
    transport.on("joined-meeting", say_one_thing)
    transport.start()
def say_one_thing():
    """Say one thing, then arrange to leave.

    NOTE(review): the "audio-queue-empty" listener is registered after
    run_tts() is called; confirm the event cannot fire before registration.
    """
    phrase = "hello world"
    tts.run_tts(phrase)
    # shutdown is registered for the mic queue's "audio-queue-empty" event
    mic.on("audio-queue-empty", shutdown)
def shutdown():
    """Stop the transport, then close the TTS service."""
    transport.stop()
    tts.close()

View File

@@ -0,0 +1,35 @@
from dailyai.services.transport.DailyTransport import DailyTransportService
from dailyai.services.llm.AzureLLMService import AzureLLMService
from dailyai.services.tts.AzureTTSService import AzureTTSService
# Module-level service handles. main() assigns them; the event callbacks
# below read them.
transport = llm = tts = None
def main():
    """Build the llm -> tts -> audio queue chain and start the transport."""
    global transport, llm, tts

    transport = DailyTransportService()
    llm = AzureLLMService()
    tts = AzureTTSService()

    # llm output is piped into tts, and tts output into the audio queue
    audio_queue = transport.create_audio_queue()
    tts.set_output(audio_queue)
    llm.set_output(tts)

    def report_error(e):
        # errors are simply printed
        print(e)

    transport.on("error", report_error)
    transport.on("joined-meeting", make_one_inference_call)
    transport.start()
def make_one_inference_call():
    """Ask our llm to say one thing, then arrange to leave."""
    request = "tell me a joke about llamas"
    llm.run_llm(request)
    # shutdown is registered for the transport's "audio-queue-empty" event
    transport.on("audio-queue-empty", shutdown)
def shutdown():
    """Stop the transport, then close the TTS service."""
    transport.stop()
    tts.close()

View File

@@ -0,0 +1,27 @@
from dailyai.services.transport.DailyTransport import DailyTransportService
from dailyai.services.genimage.AzureDalleService import AzureDalleService
# Module-level handle to the image-generation service: main() assigns it and
# say_one_thing() reads it.
dalle = None
def main():
    """Pipe the image service into a video queue and start the transport."""
    global dalle

    session = DailyTransportService()
    dalle = AzureDalleService()

    # create_video_queue() could presumably take configuration parameters that
    # correspond to Daily video settings (resolution, framerate, target
    # bitrate, etc.)
    video_queue = session.create_video_queue()
    dalle.set_output(video_queue)

    def report_error(e):
        # errors are simply printed
        print(e)

    session.on("error", report_error)
    session.on("joined-meeting", say_one_thing)
    session.start()
def say_one_thing():
    """Make one image and send it to the video queue, then just hang out.

    For simplicity we have not implemented graceful shutdown :-)
    """
    image_prompt = "an astronaut riding a skateboard"
    dalle.generate_image(image_prompt)

View File

@@ -0,0 +1,37 @@
from dailyai.services.transport.DailyTransport import DailyTransportService
from dailyai.services.llm.AzureLLMService import AzureLLMService
from dailyai.services.tts.AzureTTSService import AzureTTSService
# Module-level service handles. main() assigns them; the event callbacks
# below read them.
transport = llm = tts = None
def main():
    """Build the llm -> tts -> audio queue chain and start the transport."""
    global transport, llm, tts

    transport = DailyTransportService()
    llm = AzureLLMService()
    tts = AzureTTSService()

    # llm output is piped into tts, and tts output into the audio queue
    audio_queue = transport.create_audio_queue()
    tts.set_output(audio_queue)
    llm.set_output(tts)

    def report_error(e):
        # errors are simply printed
        print(e)

    transport.on("error", report_error)
    transport.on("joined-meeting", say_two_things)
    transport.start()
def say_two_things():
    """Queue two pieces of speech, then arrange to leave.

    The first is specified as a text literal; the second is generated by
    an llm.
    """
    intro = "My friend the LLM is now going to tell a joke about llamas."
    joke_request = "tell me a joke about llamas"

    tts.run_tts(intro)
    llm.run_llm(joke_request)
    # shutdown is registered for the transport's "audio-queue-empty" event
    transport.on("audio-queue-empty", shutdown)
def shutdown():
    """Stop the transport, then close the TTS service."""
    transport.stop()
    tts.close()

View File

@@ -0,0 +1,101 @@
from dailyai.services.transport.DailyTransport import DailyTransportService
from dailyai.services.llm.AzureLLMService import AzureLLMService
from dailyai.services.tts.AzureTTSService import AzureTTSService
from dailyai.services.genimage.AzureDalleService import AzureDalleService
from dailyai.services.utils.AudioImageSynchronizedPair import AudioImageSynchronizedPair
# Module-level handles for the transport, the AI services and the output
# queues. They start out unset; the functions below refer to them by name.
transport = llm = tts = dalle = mic = cam = None
def main():
    """Create the transport, AI services and output queues, then start.

    Every handle assigned here is a module-level name, because
    narrate_calendar_images() reads llm, tts, dalle, mic and cam as globals.
    """
    # mic and cam are declared global too: without that, the assignments
    # below would only create locals and narrate_calendar_images() would
    # still see the module-level None values.
    global transport, llm, tts, dalle, mic, cam

    transport = DailyTransportService()
    llm = AzureLLMService()
    tts = AzureTTSService()
    dalle = AzureDalleService()

    # set up mic and cam. but don't wire up automatic output to the mic
    # and cam from our AI services because we need to manage synchronization
    # of image/speech pairings
    mic = transport.create_audio_queue()
    cam = transport.create_video_queue()

    transport.on("error", lambda e: print(e))
    transport.on("joined-meeting", narrate_calendar_images)
    transport.start()
def narrate_calendar_images():
    """Queue a narrated calendar image for every month of the year.

    For each month name, the llm generates a description of a nice
    photograph for that month's page in a calendar. The description is then:

      1. turned into speech that we send into the session as audio
      2. turned into an image that we send into the session as video

    We want the audio and video to be synchronized, so each month gets its
    own AudioImageSynchronizedPair helper to manage that.

    The design idea here is that llm output can be piped into a function
    that takes inference completion text as its argument. *or* output can be
    piped into an object that has more options (maybe a callback for
    streaming results, or a callback for inference completion, or both).

    Note that we might queue up the month outputs out of order, but that's
    okay for this demo.
    """
    months = [
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    ]
    for month in months:
        synchronizer = AudioImageSynchronizedPair(
            audio_output=mic, video_output=cam)
        llm.run_llm(
            f"""
            Describe a nature photograph suitable for use in a calendar,
            for the month of {month}. Include only the image description
            with no preamble.
            """,
            # Bind this iteration's synchronizer as a keyword-only default.
            # A plain closure would look the name up when the callback runs,
            # so callbacks that fire after the loop has moved on would all
            # share the last month's synchronizer.
            output=lambda inference_text, *, synchronizer=synchronizer: (
                dalle.generate_image(inference_text, output=synchronizer),
                tts.run_tts(inference_text, output=synchronizer),
            ),
        )
# the AudioImageSynchronizedPair class seems useful enough that I've listed
# it above as a standard utility we can import. but here's a theoretical
# implementation
class TheoreticalAudioImageSynchronizedPair:
    """Hold back an image and its audio until both halves are available.

    The image and the audio arrive through separate completion callbacks,
    in either order. Each callback stores its payload and then checks
    whether the pair is complete; once it is, the image is queued on the
    video output and the audio on the audio output.
    """

    def __init__(self, audio_output, video_output):
        """Remember the two output queues; neither payload is set yet."""
        self.audio_output = audio_output
        self.video_output = video_output
        self.image = None
        self.audio = None

    def image_generation_complete(self, image):
        """Store the generated image and send the pair if it is complete."""
        self.image = image
        self._maybe_send()

    def tts_complete(self, audio):
        """Store the synthesized audio and send the pair if it is complete."""
        self.audio = audio
        self._maybe_send()

    def _maybe_send(self):
        """Queue the frame, then the audio, once neither payload is None."""
        if self.image is None or self.audio is None:
            return
        self.video_output.queue_frame(self.image)
        self.audio_output.queue_audio(self.audio)
def shutdown():
    """Stop the transport, then close the TTS service."""
    transport.stop()
    tts.close()

View File

@@ -0,0 +1,72 @@
from dailyai.services.transport.DailyTransport import DailyTransportService
from dailyai.services.llm.AzureLLMService import AzureLLMService
from dailyai.services.tts.AzureTTSService import AzureTTSService
from dailyai.services.utils import Tee
from dailyai.services.utils import ReadySoundWav
# The chat history starts with a single system message carrying the
# initial prompt; user and assistant turns are added to it by the
# functions below.
initial_prompt = "You are a helpful assistant. Introduce yourself and ask how you can be helpful."
llm_messages = [{"role": "system", "content": initial_prompt}]

# Module-level service handles. main() assigns them; the callbacks below
# read them.
transport = llm = tts = mic = transcription = None
def main():
    """Create the services, wire up outputs and transcription, and start."""
    global transport, llm, tts, mic, transcription

    transport = DailyTransportService()
    llm = AzureLLMService()
    tts = AzureTTSService()

    # using Moishe's combined output queue rather than an audio-only queue
    mic = transport.create_output_queue(audio=True, video=False)

    # llm output goes both to the tts service and to the function that
    # records assistant messages in the chat history
    llm_sink = Tee(tts, accumulate_assistant_messages)
    llm.set_output(llm_sink)
    tts.set_output(mic)

    # DailyTransport implements transcription internally. we'll grab a handle
    # to this Transcription service, configure it to use silence-based
    # endpointing, and set the silence interval to 1.5 seconds
    transcription = transport.transcription_service()
    transcription.configure(endpointing_pause=1.5)

    def report_error(e):
        # errors are simply printed
        print(e)

    transport.on("error", report_error)
    transport.on("joined-meeting", llm_prompt)
    transport.start()
def llm_prompt():
    """Run the llm, then listen once the mic queue reports it is empty.

    NOTE(review): the prompt sent here is a fixed literal; neither
    `initial_prompt` nor the accumulated `llm_messages` history is passed
    to run_llm() in this function. Confirm that is intended.
    """
    greeting = """You are a friendly assistant. Introduce yourself and ask how you can be helpful"""
    llm.run_llm(greeting)
    # `once`, not `on`: listen is registered again on every call
    mic.once("audio-queue-empty", listen)
def listen():
    """Capture one user utterance, record it, and prompt the llm again.

    Queues the ready sound on the mic, reads the transcription until
    silence, appends the text to the chat history as a "user" message, and
    then calls llm_prompt().
    """
    mic.queue(ReadySoundWav)

    # ignore any transcription results that come in before we're ready
    transcription.read()

    user_text_input = transcription.read_until_silence()

    # Python lists have no push(); append() is the method that adds an item
    llm_messages.append({
        "role": "user",
        "content": user_text_input,
    })
    llm_prompt()
def accumulate_assistant_messages(completed_inference_text):
    """Record a completed llm reply in the chat history.

    Args:
        completed_inference_text: the text stored as the content of a new
            "assistant" message at the end of `llm_messages`.
    """
    # Python lists have no push(); append() is the method that adds an item
    llm_messages.append({
        "role": "assistant",
        "content": completed_inference_text,
    })

View File

@@ -0,0 +1,15 @@
-01 just say one thing and exit
-02 llm say one thing and exit
-03 send "still frame" of video
-04 manual intro utterance and then llm say one thing and exit
-05 generate images for the months of the year, synchronized with their spoken descriptions
-06 chat: llm speak and respond (ignoring transcription input while speaking)
-07 chat: llm speak and respond (interruptible)
-08 two llms arguing about a topic (in the same process)
-09 two llms arguing about a topic (two separate bots)
-10 listen for wake word before sending commands to llm
-11 06 plus sound effects queued from sound file
-12 06 plus background music played through a second "mic" device