Compare commits
5 Commits
hush/aggre
...
khk/minima
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
297b9402a8 | ||
|
|
36f4001877 | ||
|
|
4ee34ce796 | ||
|
|
0db2cf5a80 | ||
|
|
72aa034c85 |
55
src/khk-working/functional/just-say-one-thing.py
Normal file
55
src/khk-working/functional/just-say-one-thing.py
Normal file
@@ -0,0 +1,55 @@
|
||||
import argparse
|
||||
import time
|
||||
|
||||
from dailyai.orchestrator import OrchestratorConfig, Orchestrator
|
||||
from dailyai.message_handler.message_handler import MessageHandler
|
||||
from dailyai.services.ai_services import AIServiceConfig
|
||||
from dailyai.services.azure_ai_services import AzureTTSService, AzureLLMService
|
||||
|
||||
|
||||
# For now, use Azure service for the TTS. Todo: make tts service
|
||||
# and tts args (like which voice to use) configurable via command
|
||||
# line arguments.
|
||||
# Need the following environment variables:
|
||||
# - AZURE_SPEECH_SERVICE_KEY
|
||||
# - AZURE_SPEECH_SERVICE_REGION
|
||||
|
||||
|
||||
def add_bot_to_room(room_url, text) -> None:
    """Start an orchestrated bot in ``room_url`` that is prompted to say ``text``.

    Builds a MessageHandler whose prompt asks for exactly ``text``, an
    AIServiceConfig with Azure TTS and Azure LLM services (no image service),
    and an OrchestratorConfig for the room, then starts an Orchestrator with
    all three.

    Args:
        room_url: URL of the Daily room, passed through to OrchestratorConfig.
        text: The text appended to the "respond with only" prompt.
    """
    prompt = "Respond with only the following text: " + text
    handler = MessageHandler(prompt)

    tts_service = AzureTTSService()
    llm_service = AzureLLMService()
    ai_services = AIServiceConfig(tts=tts_service, image=None, llm=llm_service)

    # todo: token should be optional
    # todo: expiration should be optional
    expires_at = time.time() + 10
    config = OrchestratorConfig(
        room_url=room_url,
        token=None,
        bot_name="Minimal Speaking Bot",
        expiration=expires_at,
    )

    bot = Orchestrator(config, ai_services, handler)
    bot.start()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: the room URL and the text to speak are both
    # mandatory options.
    parser = argparse.ArgumentParser(description="Say one phrase and exit")
    parser.add_argument(
        "-u",
        "--url",
        type=str,
        required=True,
        help="URL of the Daily room",
    )
    parser.add_argument(
        "-t",
        "--text",
        type=str,
        required=True,
        help="text to send into the session as speech",
    )

    args: argparse.Namespace = parser.parse_args()

    add_bot_to_room(room_url=args.url, text=args.text)
|
||||
48
src/khk-working/theoretical/01-say-one-thing.py
Normal file
48
src/khk-working/theoretical/01-say-one-thing.py
Normal file
@@ -0,0 +1,48 @@
|
||||
from dailyai.services.transport.DailyTransport import DailyTransportService
|
||||
from dailyai.services.tts.AzureTTSService import AzureTTSService
|
||||
|
||||
|
||||
# Module-level service handles. main() assigns them; say_one_thing() and
# shutdown() read them.
transport = mic = tts = None
|
||||
|
||||
|
||||
def main():
    """Create the transport and TTS service, pipe TTS into an audio queue, and start.

    Assigns the module-level ``transport``, ``tts`` and ``mic`` handles, and
    registers ``say_one_thing`` for the transport's "joined-meeting" event.
    """
    global transport, mic, tts

    # create a transport service object using environment variables for
    # the transport service's API key, room url, and any other configuration.
    # services can all define and document the environment variables they use.
    # services all also take an optional config object that is used instead of
    # environment variables.
    #
    # the abstract transport service APIs presumably can map pretty closely
    # to the daily-python basic API
    transport = DailyTransportService()

    # similarly, create a tts service
    tts = AzureTTSService()

    # ask the transport to create a local audio "device"/queue for
    # chunks of audio to play sequentially. the "mic" object is a handle
    # we can use to inspect and control the queue if we need to. in this
    # case we will pipe into this queue from the tts service
    mic = transport.create_audio_queue()
    tts.set_output(mic)

    # errors are simply printed
    transport.on("error", lambda err: print(err))
    transport.on("joined-meeting", say_one_thing)
    transport.start()
|
||||
|
||||
|
||||
def say_one_thing():
    """Run TTS on a fixed greeting, then register ``shutdown`` for "audio-queue-empty" on the mic."""
    # say one thing, then leave
    greeting = "hello world"
    tts.run_tts(greeting)
    mic.on("audio-queue-empty", shutdown)
|
||||
|
||||
|
||||
def shutdown():
    """Stop the transport, then close the TTS service."""
    # transport first, tts second
    transport.stop()
    tts.close()
|
||||
35
src/khk-working/theoretical/02-llm-say-one-thing.py
Normal file
35
src/khk-working/theoretical/02-llm-say-one-thing.py
Normal file
@@ -0,0 +1,35 @@
|
||||
from dailyai.services.transport.DailyTransport import DailyTransportService
|
||||
from dailyai.services.llm.AzureLLMService import AzureLLMService
|
||||
from dailyai.services.tts.AzureTTSService import AzureTTSService
|
||||
|
||||
# Module-level service handles. main() assigns them; the callbacks below
# read them.
transport = llm = tts = None
|
||||
|
||||
|
||||
def main():
    """Create transport, LLM and TTS services, chain LLM -> TTS -> audio queue, and start.

    Assigns the module-level ``transport``, ``llm`` and ``tts`` handles. The
    audio queue is kept only as a local here.
    """
    global transport, llm, tts

    transport = DailyTransportService()
    llm = AzureLLMService()
    tts = AzureTTSService()

    # output chain: llm feeds tts, tts feeds the transport's audio queue
    audio_queue = transport.create_audio_queue()
    tts.set_output(audio_queue)
    llm.set_output(tts)

    # errors are simply printed
    transport.on("error", lambda err: print(err))
    transport.on("joined-meeting", make_one_inference_call)
    transport.start()
|
||||
|
||||
|
||||
def make_one_inference_call():
    """Run one LLM prompt, then register ``shutdown`` for the transport's "audio-queue-empty" event."""
    # ask our llm to say one thing, then leave
    prompt = "tell me a joke about llamas"
    llm.run_llm(prompt)
    transport.on("audio-queue-empty", shutdown)
|
||||
|
||||
|
||||
def shutdown():
    """Stop the transport, then close the TTS service."""
    # transport first, tts second
    transport.stop()
    tts.close()
|
||||
27
src/khk-working/theoretical/03-generate-one-video-frame.py
Normal file
27
src/khk-working/theoretical/03-generate-one-video-frame.py
Normal file
@@ -0,0 +1,27 @@
|
||||
from dailyai.services.transport.DailyTransport import DailyTransportService
|
||||
from dailyai.services.genimage.AzureDalleService import AzureDalleService
|
||||
|
||||
# Image-generation service handle; assigned in main() and read by
# say_one_thing().
dalle = None
|
||||
|
||||
|
||||
def main():
    """Create the transport and image service, pipe images into a video queue, and start.

    Only ``dalle`` is stored at module level; the transport and the video
    queue are locals of this function.
    """
    global dalle

    transport = DailyTransportService()
    dalle = AzureDalleService()

    # create_video_queue() could presumably take configuration parameters that
    # correspond to Daily video settings (resolution, framerate, target
    # bitrate, etc.)
    video_queue = transport.create_video_queue()
    dalle.set_output(video_queue)

    # errors are simply printed
    transport.on("error", lambda err: print(err))
    transport.on("joined-meeting", say_one_thing)
    transport.start()
|
||||
|
||||
|
||||
def say_one_thing():
    """Request one generated image from the ``dalle`` service.

    Despite the name, nothing is spoken here; the function only calls
    ``dalle.generate_image`` with a fixed description.
    """
    # make one image, send it to the video queue, then just hang out.
    # for simplicity we have not implemented graceful shutdown :-)
    description = "an astronaut riding a skateboard"
    dalle.generate_image(description)
|
||||
37
src/khk-working/theoretical/04-say-two-things.py
Normal file
37
src/khk-working/theoretical/04-say-two-things.py
Normal file
@@ -0,0 +1,37 @@
|
||||
from dailyai.services.transport.DailyTransport import DailyTransportService
|
||||
from dailyai.services.llm.AzureLLMService import AzureLLMService
|
||||
from dailyai.services.tts.AzureTTSService import AzureTTSService
|
||||
|
||||
# Module-level service handles. main() assigns them; say_two_things() and
# shutdown() read them.
transport = llm = tts = None
|
||||
|
||||
|
||||
def main():
    """Create transport, LLM and TTS services, chain LLM -> TTS -> audio queue, and start.

    Assigns the module-level ``transport``, ``llm`` and ``tts`` handles and
    registers ``say_two_things`` for the "joined-meeting" event.
    """
    global transport, llm, tts

    transport = DailyTransportService()
    llm = AzureLLMService()
    tts = AzureTTSService()

    # output chain: llm feeds tts, tts feeds the transport's audio queue
    audio_queue = transport.create_audio_queue()
    tts.set_output(audio_queue)
    llm.set_output(tts)

    # errors are simply printed
    transport.on("error", lambda err: print(err))
    transport.on("joined-meeting", say_two_things)
    transport.start()
|
||||
|
||||
|
||||
def say_two_things():
    """Run TTS on a fixed introduction, run one LLM prompt, then arrange shutdown.

    ``shutdown`` is registered for the transport's "audio-queue-empty" event
    after both calls have been made.
    """
    # queue two pieces of speech: one specified as a text literal,
    # and one generated by an llm
    introduction = "My friend the LLM is now going to tell a joke about llamas."
    joke_prompt = "tell me a joke about llamas"

    tts.run_tts(introduction)
    llm.run_llm(joke_prompt)
    transport.on("audio-queue-empty", shutdown)
|
||||
|
||||
|
||||
def shutdown():
    """Stop the transport, then close the TTS service."""
    # transport first, tts second
    transport.stop()
    tts.close()
|
||||
101
src/khk-working/theoretical/05-llm-speech-and-images.py
Normal file
101
src/khk-working/theoretical/05-llm-speech-and-images.py
Normal file
@@ -0,0 +1,101 @@
|
||||
from dailyai.services.transport.DailyTransport import DailyTransportService
|
||||
from dailyai.services.llm.AzureLLMService import AzureLLMService
|
||||
from dailyai.services.tts.AzureTTSService import AzureTTSService
|
||||
from dailyai.services.genimage.AzureDalleService import AzureDalleService
|
||||
from dailyai.services.utils.AudioImageSynchronizedPair import AudioImageSynchronizedPair
|
||||
|
||||
# Module-level handles for the services (transport, llm, tts, dalle) and the
# output queues (mic, cam). All start unset.
transport = llm = tts = dalle = None
mic = cam = None
|
||||
|
||||
|
||||
def main():
    """Create the transport and AI services, open the audio/video queues, and start.

    Assigns the module-level ``transport``, ``llm``, ``tts``, ``dalle``,
    ``mic`` and ``cam`` handles and registers ``narrate_calendar_images`` for
    the transport's "joined-meeting" event.
    """
    # mic and cam must be declared global as well: narrate_calendar_images()
    # reads the module-level names, and without the declaration the
    # assignments below would only create locals and leave those at None.
    global transport, llm, tts, dalle, mic, cam

    transport = DailyTransportService()
    llm = AzureLLMService()
    tts = AzureTTSService()
    dalle = AzureDalleService()

    # set up mic and cam. but don't wire up automatic output to the mic
    # and cam from our AI services because we need to manage synchronization
    # of image/speech pairings
    mic = transport.create_audio_queue()
    cam = transport.create_video_queue()

    transport.on("error", lambda e: print(e))
    transport.on("joined-meeting", narrate_calendar_images)
    transport.start()
|
||||
|
||||
|
||||
def narrate_calendar_images():
    """Ask the LLM for one calendar-photo description per month and fan each out.

    For every month a fresh AudioImageSynchronizedPair is created from the
    module-level ``mic`` and ``cam``, and ``llm.run_llm`` is called with a
    prompt for that month plus an ``output`` callback that sends the
    resulting text to both image generation and TTS.
    """
    # let's loop over the months of the year. for each month name, we will have
    # our llm generate a description of a nice photograph for that month's page
    # in a calendar.
    #
    # then we'll take the text description and:
    #   1. turn it into speech that we send into the session as audio
    #   2. turn it into an image that we send into the session as video
    # we want the audio and video to be synchronized, so we'll use a helper
    # class to manage that.
    #
    # each `run_llm()` call is given a callback to process its output.
    #
    # the design idea here is that output can be piped into a function that
    # takes inference completion text as its argument. *or* output can be
    # piped into an object that has more options (maybe a callback for streaming
    # results, or a callback for inference completion, or both).
    #
    # note that we might queue up the month outputs out of order, but that's
    # okay for this demo
    #
    months = [
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    ]
    for month in months:
        synchronizer = AudioImageSynchronizedPair(
            audio_output=mic, video_output=cam)
        llm.run_llm(
            f"""
            Describe a nature photograph suitable for use in a calendar,
            for the month of {month}. Include only the image description
            with no preamble.
            """,
            # Bind this iteration's synchronizer explicitly. A closure over
            # the loop variable would see whichever synchronizer is current
            # when the callback finally runs, not the one created for this
            # month.
            output=_synchronized_output(synchronizer),
        )


def _synchronized_output(synchronizer):
    """Return an inference-text callback that feeds image and speech into ``synchronizer``."""
    def handle_inference(inference_text):
        dalle.generate_image(inference_text, output=synchronizer)
        tts.run_tts(inference_text, output=synchronizer)

    return handle_inference
|
||||
|
||||
|
||||
# the AudioImageSynchronizedPair class seems useful enough that I've listed
|
||||
# it above as a standard utility we can import. but here's a theoretical
|
||||
# implementation
|
||||
|
||||
class TheoreticalAudioImageSynchronizedPair:
    """Hold one image and one audio clip and forward them together.

    Nothing is queued until both ``image_generation_complete`` and
    ``tts_complete`` have supplied a non-None value. Once both are present the
    image goes to ``video_output.queue_frame`` and then the audio goes to
    ``audio_output.queue_audio``.
    """

    def __init__(self, audio_output, video_output):
        """Remember the two outputs; no image or audio has arrived yet."""
        self.audio_output, self.video_output = audio_output, video_output
        self.image = self.audio = None

    def image_generation_complete(self, image):
        """Store the finished image and send the pair if the audio is already in."""
        self.image = image
        self._maybe_send()

    def tts_complete(self, audio):
        """Store the finished audio and send the pair if the image is already in."""
        self.audio = audio
        self._maybe_send()

    def _maybe_send(self):
        """Queue frame then audio, but only once both halves are available."""
        if self.image is None or self.audio is None:
            return
        self.video_output.queue_frame(self.image)
        self.audio_output.queue_audio(self.audio)
|
||||
|
||||
|
||||
def shutdown():
    """Stop the transport, then close the TTS service."""
    # transport first, tts second
    transport.stop()
    tts.close()
|
||||
72
src/khk-working/theoretical/06-llm-voice-chat.py
Normal file
72
src/khk-working/theoretical/06-llm-voice-chat.py
Normal file
@@ -0,0 +1,72 @@
|
||||
from dailyai.services.transport.DailyTransport import DailyTransportService
|
||||
from dailyai.services.llm.AzureLLMService import AzureLLMService
|
||||
from dailyai.services.tts.AzureTTSService import AzureTTSService
|
||||
from dailyai.services.utils import Tee
|
||||
from dailyai.services.utils import ReadySoundWav
|
||||
|
||||
# Text of the system message that seeds the conversation history.
initial_prompt = "You are a helpful assistant. Introduce yourself and ask how you can be helpful."

# Running chat history as role/content dicts, starting with the system prompt.
llm_messages = [{"role": "system", "content": initial_prompt}]
|
||||
|
||||
|
||||
# Module-level handles assigned by main(): the services, the output queue,
# and the transcription handle.
transport = llm = tts = None
mic = transcription = None
|
||||
|
||||
|
||||
def main():
    """Create the services, wire LLM output to TTS and history, configure transcription, and start.

    Assigns the module-level ``transport``, ``llm``, ``tts``, ``mic`` and
    ``transcription`` handles and registers ``llm_prompt`` for the
    transport's "joined-meeting" event.
    """
    global transport, llm, tts, mic, transcription

    transport = DailyTransportService()
    llm = AzureLLMService()
    tts = AzureTTSService()

    # using Moishe's combined output queue rather than an audio-only queue
    mic = transport.create_output_queue(audio=True, video=False)

    # llm output goes both to tts and to the message-history accumulator
    llm_fan_out = Tee(tts, accumulate_assistant_messages)
    llm.set_output(llm_fan_out)
    tts.set_output(mic)

    # DailyTransport implements transcription internally. we'll grab a handle to this
    # Transcription service, configure it to use silence-based endpointing, and
    # set the silence interval to 1.5 seconds
    transcription = transport.transcription_service()
    transcription.configure(endpointing_pause=1.5)

    # errors are simply printed
    transport.on("error", lambda err: print(err))
    transport.on("joined-meeting", llm_prompt)
    transport.start()
|
||||
|
||||
|
||||
def llm_prompt():
    """Run the introduction prompt through the LLM, then listen once the audio queue empties.

    ``listen`` is registered with ``mic.once`` for the "audio-queue-empty"
    event.
    """
    # NOTE(review): the prompt is a fixed literal and llm_messages is not
    # passed to run_llm(), so the accumulated history is not sent from here.
    # confirm whether that is intended.
    introduction = """You are a friendly assistant. Introduce yourself and ask how you can be helpful"""
    llm.run_llm(introduction)
    mic.once("audio-queue-empty", listen)
|
||||
|
||||
|
||||
def listen():
    """Play the ready sound, capture one user utterance, record it, and prompt the LLM again.

    The utterance returned by ``transcription.read_until_silence()`` is
    appended to ``llm_messages`` as a "user" message before ``llm_prompt()``
    is called.
    """
    mic.queue(ReadySoundWav)

    # ignore any transcription results that come in before we're ready
    transcription.read()

    user_text_input = transcription.read_until_silence()

    # llm_messages is a Python list, which has append() rather than push()
    llm_messages.append({
        "role": "user",
        "content": user_text_input
    })
    llm_prompt()
|
||||
|
||||
|
||||
def accumulate_assistant_messages(completed_inference_text):
    """Append a completed LLM response to ``llm_messages`` as an "assistant" message.

    Args:
        completed_inference_text: The text stored as the message content.
    """
    # llm_messages is a Python list, which has append() rather than push()
    llm_messages.append({
        "role": "assistant",
        "content": completed_inference_text
    })
|
||||
15
src/khk-working/theoretical/notes.txt
Normal file
15
src/khk-working/theoretical/notes.txt
Normal file
@@ -0,0 +1,15 @@
|
||||
|
||||
-01 just say one thing and exit
|
||||
-02 llm say one thing and exit
|
||||
-03 send "still frame" of video
|
||||
-04 manual intro utterance and then llm say one thing and exit
|
||||
-05 generate images for the months of the year, synchronized with their spoken descriptions
|
||||
-06 chat: llm speak and respond (ignoring transcription input while speaking)
|
||||
-07 chat: llm speak and respond (interruptible)
|
||||
-08 two llms arguing about a topic (in the same process)
|
||||
-09 two llms arguing about a topic (two separate bots)
|
||||
-10 listen for wake word before sending commands to llm
|
||||
-11 06 plus sound effects queued from sound file
|
||||
-12 06 plus background music played through a second "mic" device
|
||||
|
||||
|
||||
Reference in New Issue
Block a user