diff --git a/src/dailyai/queue_frame.py b/src/dailyai/queue_frame.py index 75a524813..3e249b38c 100644 --- a/src/dailyai/queue_frame.py +++ b/src/dailyai/queue_frame.py @@ -18,6 +18,8 @@ class StartStreamQueueFrame(ControlQueueFrame): class EndStreamQueueFrame(ControlQueueFrame): pass +class LLMResponseEndQueueFrame(QueueFrame): + pass @dataclass() class AudioQueueFrame(QueueFrame): diff --git a/src/dailyai/services/ai_services.py b/src/dailyai/services/ai_services.py index 263cf186e..a7a301ca5 100644 --- a/src/dailyai/services/ai_services.py +++ b/src/dailyai/services/ai_services.py @@ -9,6 +9,7 @@ from dailyai.queue_frame import ( EndStreamQueueFrame, ImageQueueFrame, LLMMessagesQueueFrame, + LLMResponseEndQueueFrame, QueueFrame, TextQueueFrame, ) @@ -89,6 +90,9 @@ class LLMService(AIService): if isinstance(frame, LLMMessagesQueueFrame): async for text_chunk in self.run_llm_async(frame.messages): yield TextQueueFrame(text_chunk) + yield LLMResponseEndQueueFrame() + else: + yield frame class TTSService(AIService): @@ -186,6 +190,18 @@ class STTService(AIService): text = await self.run_stt(content) yield TextQueueFrame(text) +class FrameLogger(AIService): + def __init__(self, prefix="Frame", **kwargs): + super().__init__(**kwargs) + self.prefix = prefix + + async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]: + if isinstance(frame, (AudioQueueFrame, ImageQueueFrame)): + self.logger.info(f"{self.prefix}: {type(frame)}") + else: + print(f"{self.prefix}: {frame}") + + yield frame @dataclass class AIServiceConfig: diff --git a/src/dailyai/services/daily_transport_service.py b/src/dailyai/services/daily_transport_service.py index 258bd7da2..f253db9f6 100644 --- a/src/dailyai/services/daily_transport_service.py +++ b/src/dailyai/services/daily_transport_service.py @@ -305,6 +305,12 @@ class DailyTransportService(EventHandler): t = Thread(target=self._receive_audio, daemon=True) t.start() + def dialout(self, number): + 
self.client.start_dialout({"phoneNumber": number}) + + def start_recording(self): + self.client.start_recording() + def on_error(self, error): self._logger.error(f"on_error: {error}") diff --git a/src/samples/foundational/06a-image-sync.py b/src/samples/foundational/06a-image-sync.py index 5967cecaa..f8898ebe4 100644 --- a/src/samples/foundational/06a-image-sync.py +++ b/src/samples/foundational/06a-image-sync.py @@ -79,8 +79,8 @@ async def main(room_url: str, token): messages, transport.my_participant_id ) image_sync_aggregator = ImageSyncAggregator( - os.path.join(os.path.dirname(__file__), "images", "speaking.png"), - os.path.join(os.path.dirname(__file__), "images", "waiting.png"), + os.path.join(os.path.dirname(__file__), "assets", "speaking.png"), + os.path.join(os.path.dirname(__file__), "assets", "waiting.png"), ) await tts.run_to_queue( transport.send_queue, diff --git a/src/samples/foundational/08b-debate-generator.py b/src/samples/foundational/08b-debate-generator.py index 2e74ac90c..4366f836d 100644 --- a/src/samples/foundational/08b-debate-generator.py +++ b/src/samples/foundational/08b-debate-generator.py @@ -36,9 +36,9 @@ async def main(room_url:str): affirmative = "A woman dressed as a cowboy, outside on a ranch" negative = "Pikachu in a business suit" - topic = "Is a hot dog a sandwich?" - affirmative = "A woman conservatively dressed as a librarian in a library surrounded by books" - negative = "A cat dressed in a hot dog costume" + # topic = "Is a hot dog a sandwich?" 
+ # affirmative = "A woman conservatively dressed as a librarian in a library surrounded by books" + # negative = "A cat dressed in a hot dog costume" diff --git a/src/samples/foundational/10-wake-word.py b/src/samples/foundational/10-wake-word.py index 54575091f..6125a2a56 100644 --- a/src/samples/foundational/10-wake-word.py +++ b/src/samples/foundational/10-wake-word.py @@ -39,7 +39,7 @@ script_dir = os.path.dirname(__file__) for file in image_files: # Build the full path to the image file - full_path = os.path.join(script_dir, "images", file) + full_path = os.path.join(script_dir, "assets", file) # Get the filename without the extension to use as the dictionary key filename = os.path.splitext(os.path.basename(full_path))[0] # Open the image and convert it to bytes diff --git a/src/samples/foundational/11-sound-effects.py b/src/samples/foundational/11-sound-effects.py new file mode 100644 index 000000000..ee913fe56 --- /dev/null +++ b/src/samples/foundational/11-sound-effects.py @@ -0,0 +1,159 @@ +import argparse +import asyncio +import logging +import os +import wave +import requests +import time +import urllib.parse + +from dailyai.services.daily_transport_service import DailyTransportService +from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService +from dailyai.queue_aggregators import LLMContextAggregator, LLMUserContextAggregator, LLMAssistantContextAggregator +from dailyai.services.ai_services import AIService, FrameLogger +from dailyai.queue_frame import QueueFrame, AudioQueueFrame, LLMResponseEndQueueFrame, LLMMessagesQueueFrame +from typing import AsyncGenerator + +logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s") # or whatever +logger = logging.getLogger("dailyai") +logger.setLevel(logging.DEBUG) + +sounds = {} +sound_files = [ + 'ding1.wav', + 'ding2.wav' +] + +script_dir = os.path.dirname(__file__) + +for file in sound_files: + # Build the full path to the sound file + full_path = os.path.join(script_dir, 
"assets", file) + # Get the filename without the extension (unused below; sounds is keyed by the full file name) + filename = os.path.splitext(os.path.basename(full_path))[0] + # Open the WAV file and read its audio frames as bytes + with wave.open(full_path) as audio_file: + sounds[file] = audio_file.readframes(-1) + + + + +class OutboundSoundEffectWrapper(AIService): + def __init__(self): + pass + + async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]: + if isinstance(frame, LLMResponseEndQueueFrame): + yield AudioQueueFrame(sounds["ding1.wav"]) + # In case anything else up the stack needs it + yield frame + else: + yield frame + +class InboundSoundEffectWrapper(AIService): + def __init__(self): + pass + + async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]: + if isinstance(frame, LLMMessagesQueueFrame): + yield AudioQueueFrame(sounds["ding2.wav"]) + # In case anything else up the stack needs it + yield frame + else: + yield frame + + +async def main(room_url: str, token): + global transport + global llm + global tts + + transport = DailyTransportService( + room_url, + token, + "Respond bot", + 5, + ) + transport.mic_enabled = True + transport.mic_sample_rate = 16000 + transport.camera_enabled = False + + llm = AzureLLMService() + tts = AzureTTSService() + + @transport.event_handler("on_first_other_participant_joined") + async def on_first_other_participant_joined(transport): + await tts.say("Hi, I'm listening!", transport.send_queue) + await transport.send_queue.put(AudioQueueFrame(sounds["ding1.wav"])) + async def handle_transcriptions(): + messages = [ + {"role": "system", "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. 
Respond to what the user said in a creative and helpful way."}, + ] + + tma_in = LLMUserContextAggregator( + messages, transport.my_participant_id + ) + tma_out = LLMAssistantContextAggregator( + messages, transport.my_participant_id + ) + out_sound = OutboundSoundEffectWrapper() + in_sound = InboundSoundEffectWrapper() + fl = FrameLogger("LLM Out") + fl2 = FrameLogger("Transcription In") + await out_sound.run_to_queue( + transport.send_queue, + tts.run( + fl.run( + tma_out.run( + llm.run( + fl2.run( + in_sound.run( + tma_in.run( + transport.get_receive_frames() + ) + ) + ) + ) + ) + ) + ) + ) + + + transport.transcription_settings["extra"]["punctuate"] = True + await asyncio.gather(transport.run(), handle_transcriptions()) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Simple Daily Bot Sample") + parser.add_argument( + "-u", "--url", type=str, required=True, help="URL of the Daily room to join" + ) + parser.add_argument( + "-k", + "--apikey", + type=str, + required=True, + help="Daily API Key (needed to create token)", + ) + + args, unknown = parser.parse_known_args() + + # Create a meeting token for the given room with an expiration 1 hour in the future. 
+ room_name: str = urllib.parse.urlparse(args.url).path[1:] + expiration: float = time.time() + 60 * 60 + + res: requests.Response = requests.post( + f"https://api.daily.co/v1/meeting-tokens", + headers={"Authorization": f"Bearer {args.apikey}"}, + json={ + "properties": {"room_name": room_name, "is_owner": True, "exp": expiration} + }, + ) + + if res.status_code != 200: + raise Exception(f"Failed to create meeting token: {res.status_code} {res.text}") + + token: str = res.json()["token"] + + asyncio.run(main(args.url, token)) diff --git a/src/samples/foundational/11a-dial-out.py b/src/samples/foundational/11a-dial-out.py new file mode 100644 index 000000000..95beb586e --- /dev/null +++ b/src/samples/foundational/11a-dial-out.py @@ -0,0 +1,165 @@ +import argparse +import asyncio +import os +import wave +import requests +import time +import urllib.parse + +from dailyai.services.daily_transport_service import DailyTransportService +from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService +from dailyai.queue_aggregators import LLMContextAggregator +from dailyai.services.ai_services import AIService, FrameLogger +from dailyai.queue_frame import QueueFrame, AudioQueueFrame, LLMResponseEndQueueFrame, LLMMessagesQueueFrame +from typing import AsyncGenerator + +sounds = {} +sound_files = [ + 'ding1.wav', + 'ding2.wav' +] + +script_dir = os.path.dirname(__file__) + +for file in sound_files: + # Build the full path to the sound file + full_path = os.path.join(script_dir, "assets", file) + # Get the filename without the extension (unused below; sounds is keyed by the full file name) + filename = os.path.splitext(os.path.basename(full_path))[0] + # Open the WAV file and read its audio frames as bytes + with wave.open(full_path) as audio_file: + sounds[file] = audio_file.readframes(-1) + + + + +class OutboundSoundEffectWrapper(AIService): + def __init__(self): + pass + + async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]: + if isinstance(frame, 
LLMResponseEndQueueFrame): + yield AudioQueueFrame(sounds["ding1.wav"]) + # In case anything else up the stack needs it + yield frame + else: + yield frame + +class InboundSoundEffectWrapper(AIService): + def __init__(self): + pass + + async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]: + if isinstance(frame, LLMMessagesQueueFrame): + yield AudioQueueFrame(sounds["ding2.wav"]) + # In case anything else up the stack needs it + yield frame + else: + yield frame + + +async def main(room_url: str, token, phone): + global transport + global llm + global tts + + transport = DailyTransportService( + room_url, + token, + "Respond bot", + 300, + ) + transport.mic_enabled = True + transport.mic_sample_rate = 16000 + transport.camera_enabled = False + + llm = AzureLLMService() + tts = AzureTTSService() + + @transport.event_handler("on_first_other_participant_joined") + async def on_first_other_participant_joined(transport): + await tts.say("Hi, I'm listening!", transport.send_queue) + await transport.send_queue.put(AudioQueueFrame(sounds["ding1.wav"])) + async def handle_transcriptions(): + messages = [ + {"role": "system", "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. 
Respond to what the user said in a creative and helpful way."}, + ] + + tma_in = LLMContextAggregator( + messages, "user", transport.my_participant_id + ) + tma_out = LLMContextAggregator( + messages, "assistant", transport.my_participant_id + ) + out_sound = OutboundSoundEffectWrapper() + in_sound = InboundSoundEffectWrapper() + fl = FrameLogger("LLM Out") + fl2 = FrameLogger("Transcription In") + await out_sound.run_to_queue( + transport.send_queue, + tts.run( + tma_out.run( + llm.run( + fl2.run( + in_sound.run( + tma_in.run( + transport.get_receive_frames() + ) + ) + ) + ) + ) + ) + ) + + @transport.event_handler("on_participant_joined") + async def pax_joined(transport, pax): + print(f"PARTICIPANT JOINED: {pax}") + + @transport.event_handler("on_call_state_updated") + async def on_call_state_updated(transport, state): + if (state == "joined"): + if (phone): + transport.start_recording() + transport.dialout(phone) + + + transport.transcription_settings["extra"]["punctuate"] = True + + await asyncio.gather(transport.run(), handle_transcriptions()) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Simple Daily Bot Sample") + parser.add_argument( + "-u", "--url", type=str, required=True, help="URL of the Daily room to join" + ) + parser.add_argument( + "-k", + "--apikey", + type=str, + required=True, + help="Daily API Key (needed to create token)", + ) + + parser.add_argument("-p", "--phone", type=str, required=False, help="A phone number to call when the bot joins the room") + + args, unknown = parser.parse_known_args() + + # Create a meeting token for the given room with an expiration 1 hour in the future. 
+ room_name: str = urllib.parse.urlparse(args.url).path[1:] + expiration: float = time.time() + 60 * 60 + + res: requests.Response = requests.post( + f"https://api.staging.daily.co/v1/meeting-tokens", + headers={"Authorization": f"Bearer {args.apikey}"}, + json={ + "properties": {"room_name": room_name, "is_owner": True, "exp": expiration} + }, + ) + + if res.status_code != 200: + raise Exception(f"Failed to create meeting token: {res.status_code} {res.text}") + + token: str = res.json()["token"] + asyncio.run(main(args.url, token, args.phone)) diff --git a/src/samples/foundational/assets/ding1.wav b/src/samples/foundational/assets/ding1.wav new file mode 100644 index 000000000..b508dae9a Binary files /dev/null and b/src/samples/foundational/assets/ding1.wav differ diff --git a/src/samples/foundational/assets/ding2.wav b/src/samples/foundational/assets/ding2.wav new file mode 100644 index 000000000..31871f089 Binary files /dev/null and b/src/samples/foundational/assets/ding2.wav differ diff --git a/src/samples/foundational/images/sc-default.png b/src/samples/foundational/assets/sc-default.png similarity index 100% rename from src/samples/foundational/images/sc-default.png rename to src/samples/foundational/assets/sc-default.png diff --git a/src/samples/foundational/images/sc-listen-1.png b/src/samples/foundational/assets/sc-listen-1.png similarity index 100% rename from src/samples/foundational/images/sc-listen-1.png rename to src/samples/foundational/assets/sc-listen-1.png diff --git a/src/samples/foundational/images/sc-listen-2.png b/src/samples/foundational/assets/sc-listen-2.png similarity index 100% rename from src/samples/foundational/images/sc-listen-2.png rename to src/samples/foundational/assets/sc-listen-2.png diff --git a/src/samples/foundational/images/sc-talk.png b/src/samples/foundational/assets/sc-talk.png similarity index 100% rename from src/samples/foundational/images/sc-talk.png rename to src/samples/foundational/assets/sc-talk.png diff --git 
a/src/samples/foundational/images/sc-think-1.png b/src/samples/foundational/assets/sc-think-1.png similarity index 100% rename from src/samples/foundational/images/sc-think-1.png rename to src/samples/foundational/assets/sc-think-1.png diff --git a/src/samples/foundational/images/sc-think-2.png b/src/samples/foundational/assets/sc-think-2.png similarity index 100% rename from src/samples/foundational/images/sc-think-2.png rename to src/samples/foundational/assets/sc-think-2.png diff --git a/src/samples/foundational/images/sc-think-3.png b/src/samples/foundational/assets/sc-think-3.png similarity index 100% rename from src/samples/foundational/images/sc-think-3.png rename to src/samples/foundational/assets/sc-think-3.png diff --git a/src/samples/foundational/images/sc-think-4.png b/src/samples/foundational/assets/sc-think-4.png similarity index 100% rename from src/samples/foundational/images/sc-think-4.png rename to src/samples/foundational/assets/sc-think-4.png diff --git a/src/samples/foundational/images/speaking.png b/src/samples/foundational/assets/speaking.png similarity index 100% rename from src/samples/foundational/images/speaking.png rename to src/samples/foundational/assets/speaking.png diff --git a/src/samples/foundational/images/waiting.png b/src/samples/foundational/assets/waiting.png similarity index 100% rename from src/samples/foundational/images/waiting.png rename to src/samples/foundational/assets/waiting.png