Added sound effect example (#18)

* added sound effect example * added dialout to this branch too * fixup * fixup for more dialout testing * cleanup
2024-02-01 10:26:50 -06:00
parent 4e9586595d
commit 0d96f91cde
20 changed files with 354 additions and 6 deletions
--- a/src/dailyai/queue_frame.py
+++ b/src/dailyai/queue_frame.py
@@ -18,6 +18,8 @@ class StartStreamQueueFrame(ControlQueueFrame):
 class EndStreamQueueFrame(ControlQueueFrame):
    pass

+class LLMResponseEndQueueFrame(QueueFrame):
+    """Marker frame with no payload of its own.
+
+    LLMService.process_frame yields one of these after the last TextQueueFrame
+    it produces for an LLMMessagesQueueFrame.
+    """
+    pass

@dataclass()
 class AudioQueueFrame(QueueFrame):
--- a/src/dailyai/services/ai_services.py
+++ b/src/dailyai/services/ai_services.py
@@ -9,6 +9,7 @@ from dailyai.queue_frame import (
    EndStreamQueueFrame,
    ImageQueueFrame,
    LLMMessagesQueueFrame,
+    LLMResponseEndQueueFrame,
    QueueFrame,
    TextQueueFrame,
 )
@@ -89,6 +90,9 @@ class LLMService(AIService):
        if isinstance(frame, LLMMessagesQueueFrame):
            async for text_chunk in self.run_llm_async(frame.messages):
                yield TextQueueFrame(text_chunk)
+            yield LLMResponseEndQueueFrame()
+        else:
+            yield frame


 class TTSService(AIService):
@@ -186,6 +190,18 @@ class STTService(AIService):
        text = await self.run_stt(content)
        yield TextQueueFrame(text)

+class FrameLogger(AIService):
+    """Pass-through service that reports each frame it sees and re-yields it.
+
+    Audio and image frames are reported by type only, through ``self.logger``;
+    every other frame is printed in full to stdout.
+    """
+
+    def __init__(self, prefix="Frame", **kwargs):
+        # Any remaining keyword arguments go to the AIService constructor as-is.
+        super().__init__(**kwargs)
+        # Label placed in front of every reported frame.
+        self.prefix = prefix
+
+    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
+        is_media = isinstance(frame, (AudioQueueFrame, ImageQueueFrame))
+        if not is_media:
+            # NOTE(review): this branch writes to stdout while the media branch
+            # uses self.logger (presumably set up by AIService) -- confirm the
+            # split is intentional.
+            print(f"{self.prefix}: {frame}")
+        else:
+            # Only the frame's type is reported for media frames.
+            self.logger.info(f"{self.prefix}: {type(frame)}")
+
+        # The frame itself always continues down the pipeline untouched.
+        yield frame

@dataclass
 class AIServiceConfig:
--- a/src/dailyai/services/daily_transport_service.py
+++ b/src/dailyai/services/daily_transport_service.py
@@ -305,6 +305,12 @@ class DailyTransportService(EventHandler):
            t = Thread(target=self._receive_audio, daemon=True)
            t.start()

+    def dialout(self, number):
+        """Ask the client to start a dial-out to ``number``.
+
+        The number is forwarded to ``self.client.start_dialout`` wrapped as
+        ``{"phoneNumber": number}``; nothing is returned.
+        """
+        self.client.start_dialout({"phoneNumber": number})
+
+    def start_recording(self):
+        """Ask the client to start recording; forwards to ``self.client.start_recording()``."""
+        self.client.start_recording()
+
    def on_error(self, error):
        self._logger.error(f"on_error: {error}")

--- a/src/samples/foundational/06a-image-sync.py
+++ b/src/samples/foundational/06a-image-sync.py
@@ -79,8 +79,8 @@ async def main(room_url: str, token):
                messages, transport.my_participant_id
            )
            image_sync_aggregator = ImageSyncAggregator(
-                os.path.join(os.path.dirname(__file__), "images", "speaking.png"),
-                os.path.join(os.path.dirname(__file__), "images", "waiting.png"),
+                os.path.join(os.path.dirname(__file__), "assets", "speaking.png"),
+                os.path.join(os.path.dirname(__file__), "assets", "waiting.png"),
            )
            await tts.run_to_queue(
                transport.send_queue,
--- a/src/samples/foundational/08b-debate-generator.py
+++ b/src/samples/foundational/08b-debate-generator.py
@@ -36,9 +36,9 @@ async def main(room_url:str):
        affirmative = "A woman dressed as a cowboy, outside on a ranch"
        negative = "Pikachu in a business suit"

-        topic = "Is a hot dog a sandwich?"
-        affirmative = "A woman conservatively dressed as a librarian in a library surrounded by books"
-        negative = "A cat dressed in a hot dog costume"
+        # topic = "Is a hot dog a sandwich?"
+        # affirmative = "A woman conservatively dressed as a librarian in a library surrounded by books"
+        # negative = "A cat dressed in a hot dog costume"



--- a/src/samples/foundational/10-wake-word.py
+++ b/src/samples/foundational/10-wake-word.py
@@ -39,7 +39,7 @@ script_dir = os.path.dirname(__file__)

 for file in image_files:
    # Build the full path to the image file
-    full_path = os.path.join(script_dir, "images", file)
+    full_path = os.path.join(script_dir, "assets", file)
    # Get the filename without the extension to use as the dictionary key
    filename = os.path.splitext(os.path.basename(full_path))[0]
    # Open the image and convert it to bytes
--- a/src/samples/foundational/11-sound-effects.py
+++ b/src/samples/foundational/11-sound-effects.py
@@ -0,0 +1,159 @@
+import argparse
+import asyncio
+import logging
+import os
+import wave
+import requests
+import time
+import urllib.parse
+
+from dailyai.services.daily_transport_service import DailyTransportService
+from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
+from dailyai.queue_aggregators import LLMContextAggregator, LLMUserContextAggregator, LLMAssistantContextAggregator
+from dailyai.services.ai_services import AIService, FrameLogger
+from dailyai.queue_frame import QueueFrame, AudioQueueFrame, LLMResponseEndQueueFrame, LLMMessagesQueueFrame
+from typing import AsyncGenerator
+
+# Root log format: numeric level, timestamp, then the message. The %-style
+# placeholders are filled in by the logging module, so a plain string is used.
+logging.basicConfig(format="%(levelno)s %(asctime)s %(message)s")
+# Emit everything from DEBUG upwards for the "dailyai" logger.
+logger = logging.getLogger("dailyai")
+logger.setLevel(logging.DEBUG)
+
+# Raw audio frames of each sound effect, keyed by file name (extension included).
+sounds = {}
+sound_files = [
+    'ding1.wav',
+    'ding2.wav'
+]
+
+# The sound files are read from the "assets" directory next to this script.
+script_dir = os.path.dirname(__file__)
+
+for file in sound_files:
+    # Build the full path to the sound file
+    full_path = os.path.join(script_dir, "assets", file)
+    # Open the WAV file and read all of its audio frames as bytes
+    with wave.open(full_path) as audio_file:
+        sounds[file] = audio_file.readframes(audio_file.getnframes())
+
+
+
+
+class OutboundSoundEffectWrapper(AIService):
+    """Pass-through service that adds a sound when an LLM response ends.
+
+    Each LLMResponseEndQueueFrame is preceded by an AudioQueueFrame carrying
+    the "ding1.wav" audio. Every incoming frame, the marker included, is
+    re-yielded unchanged.
+    """
+
+    def __init__(self):
+        # Run the AIService constructor so base-class state is initialized.
+        super().__init__()
+
+    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
+        if isinstance(frame, LLMResponseEndQueueFrame):
+            yield AudioQueueFrame(sounds["ding1.wav"])
+
+        # Always forward the original frame, in case anything else up the stack needs it
+        yield frame
+
+class InboundSoundEffectWrapper(AIService):
+    """Pass-through service that adds a sound before messages reach the LLM.
+
+    Each LLMMessagesQueueFrame is preceded by an AudioQueueFrame carrying the
+    "ding2.wav" audio. Every incoming frame is re-yielded unchanged.
+    """
+
+    def __init__(self):
+        # Run the AIService constructor so base-class state is initialized.
+        super().__init__()
+
+    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
+        if isinstance(frame, LLMMessagesQueueFrame):
+            yield AudioQueueFrame(sounds["ding2.wav"])
+
+        # Always forward the original frame, in case anything else up the stack needs it
+        yield frame
+
+
+async def main(room_url: str, token):
+    """Run the sound-effect bot in the given room until the transport stops.
+
+    Sets up the transport and the Azure LLM/TTS services, registers a greeting
+    for the first other participant, and runs the transport concurrently with
+    the transcription -> LLM -> TTS pipeline.
+
+    Args:
+        room_url: URL of the room, passed to DailyTransportService.
+        token: Meeting token, passed to DailyTransportService.
+    """
+    # These stay module-level globals; the nested coroutines read them as such.
+    global transport, llm, tts
+
+    # NOTE(review): the 4th positional argument (5) is not named here --
+    # presumably a duration limit; confirm against DailyTransportService.
+    transport = DailyTransportService(room_url, token, "Respond bot", 5)
+    transport.mic_enabled = True
+    transport.mic_sample_rate = 16000
+    transport.camera_enabled = False
+
+    llm = AzureLLMService()
+    tts = AzureTTSService()
+
+    @transport.event_handler("on_first_other_participant_joined")
+    async def on_first_other_participant_joined(transport):
+        # Speak the greeting first, then queue the ding behind it.
+        await tts.say("Hi, I'm listening!", transport.send_queue)
+        await transport.send_queue.put(AudioQueueFrame(sounds["ding1.wav"]))
+
+    async def handle_transcriptions():
+        system_prompt = {
+            "role": "system",
+            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
+        }
+        # Both aggregators receive this same list object.
+        messages = [system_prompt]
+
+        user_context = LLMUserContextAggregator(messages, transport.my_participant_id)
+        assistant_context = LLMAssistantContextAggregator(messages, transport.my_participant_id)
+        response_end_sound = OutboundSoundEffectWrapper()
+        user_turn_sound = InboundSoundEffectWrapper()
+        llm_logger = FrameLogger("LLM Out")
+        transcription_logger = FrameLogger("Transcription In")
+
+        # Chain the stages from the source outwards; each stage consumes the
+        # frames produced by the one before it.
+        frames = transport.get_receive_frames()
+        frames = user_context.run(frames)
+        frames = user_turn_sound.run(frames)
+        frames = transcription_logger.run(frames)
+        frames = llm.run(frames)
+        frames = assistant_context.run(frames)
+        frames = llm_logger.run(frames)
+        frames = tts.run(frames)
+
+        # The outbound sound wrapper is the last stage and feeds the send queue.
+        await response_end_sound.run_to_queue(transport.send_queue, frames)
+
+    transport.transcription_settings["extra"]["punctuate"] = True
+    await asyncio.gather(transport.run(), handle_transcriptions())
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse arguments, create a meeting token, run the bot.
+    parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
+    parser.add_argument(
+        "-u", "--url", type=str, required=True, help="URL of the Daily room to join"
+    )
+    parser.add_argument(
+        "-k",
+        "--apikey",
+        type=str,
+        required=True,
+        help="Daily API Key (needed to create token)",
+    )
+
+    # parse_known_args tolerates extra arguments; they are collected and ignored.
+    args, unknown = parser.parse_known_args()
+
+    # Create a meeting token for the given room with an expiration 1 hour in the future.
+    # The room name is the URL path without its leading slash.
+    room_name: str = urllib.parse.urlparse(args.url).path[1:]
+    expiration: float = time.time() + 60 * 60
+
+    res: requests.Response = requests.post(
+        "https://api.daily.co/v1/meeting-tokens",
+        headers={"Authorization": f"Bearer {args.apikey}"},
+        json={
+            "properties": {"room_name": room_name, "is_owner": True, "exp": expiration}
+        },
+        # Without a timeout, requests can wait indefinitely on an unresponsive server.
+        timeout=30,
+    )
+
+    if res.status_code != 200:
+        raise Exception(f"Failed to create meeting token: {res.status_code} {res.text}")
+
+    token: str = res.json()["token"]
+
+    asyncio.run(main(args.url, token))
--- a/src/samples/foundational/11a-dial-out.py
+++ b/src/samples/foundational/11a-dial-out.py
@@ -0,0 +1,165 @@
+import argparse
+import asyncio
+import os
+import wave
+import requests
+import time
+import urllib.parse
+
+from dailyai.services.daily_transport_service import DailyTransportService
+from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
+from dailyai.queue_aggregators import LLMContextAggregator
+from dailyai.services.ai_services import AIService, FrameLogger
+from dailyai.queue_frame import QueueFrame, AudioQueueFrame, LLMResponseEndQueueFrame, LLMMessagesQueueFrame
+from typing import AsyncGenerator
+
+# Raw audio frames of each sound effect, keyed by file name (extension included).
+sounds = {}
+sound_files = [
+    'ding1.wav',
+    'ding2.wav'
+]
+
+# The sound files are read from the "assets" directory next to this script.
+script_dir = os.path.dirname(__file__)
+
+for file in sound_files:
+    # Build the full path to the sound file
+    full_path = os.path.join(script_dir, "assets", file)
+    # Open the WAV file and read all of its audio frames as bytes
+    with wave.open(full_path) as audio_file:
+        sounds[file] = audio_file.readframes(audio_file.getnframes())
+
+
+
+
+class OutboundSoundEffectWrapper(AIService):
+    """Pass-through service that adds a sound when an LLM response ends.
+
+    Each LLMResponseEndQueueFrame is preceded by an AudioQueueFrame carrying
+    the "ding1.wav" audio. Every incoming frame, the marker included, is
+    re-yielded unchanged.
+    """
+
+    def __init__(self):
+        # Run the AIService constructor so base-class state is initialized.
+        super().__init__()
+
+    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
+        if isinstance(frame, LLMResponseEndQueueFrame):
+            yield AudioQueueFrame(sounds["ding1.wav"])
+
+        # Always forward the original frame, in case anything else up the stack needs it
+        yield frame
+
+class InboundSoundEffectWrapper(AIService):
+    """Pass-through service that adds a sound before messages reach the LLM.
+
+    Each LLMMessagesQueueFrame is preceded by an AudioQueueFrame carrying the
+    "ding2.wav" audio. Every incoming frame is re-yielded unchanged.
+    """
+
+    def __init__(self):
+        # Run the AIService constructor so base-class state is initialized.
+        super().__init__()
+
+    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
+        if isinstance(frame, LLMMessagesQueueFrame):
+            yield AudioQueueFrame(sounds["ding2.wav"])
+
+        # Always forward the original frame, in case anything else up the stack needs it
+        yield frame
+
+
+async def main(room_url: str, token, phone):
+    """Run the dial-out bot in the given room until the transport stops.
+
+    Sets up the transport and the Azure LLM/TTS services, registers the event
+    handlers, and runs the transport concurrently with the
+    transcription -> LLM -> TTS pipeline. Once the call state becomes
+    "joined" and ``phone`` is truthy, recording is started and the number is
+    dialed.
+
+    Args:
+        room_url: URL of the room, passed to DailyTransportService.
+        token: Meeting token, passed to DailyTransportService.
+        phone: Phone number to dial after joining; falsy to skip dial-out.
+    """
+    # These stay module-level globals; the nested coroutines read them as such.
+    global transport, llm, tts
+
+    # NOTE(review): the 4th positional argument (300) is not named here --
+    # presumably a duration limit; confirm against DailyTransportService.
+    transport = DailyTransportService(room_url, token, "Respond bot", 300)
+    transport.mic_enabled = True
+    transport.mic_sample_rate = 16000
+    transport.camera_enabled = False
+
+    llm = AzureLLMService()
+    tts = AzureTTSService()
+
+    @transport.event_handler("on_first_other_participant_joined")
+    async def on_first_other_participant_joined(transport):
+        # Speak the greeting first, then queue the ding behind it.
+        await tts.say("Hi, I'm listening!", transport.send_queue)
+        await transport.send_queue.put(AudioQueueFrame(sounds["ding1.wav"]))
+
+    async def handle_transcriptions():
+        system_prompt = {
+            "role": "system",
+            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
+        }
+        # Both aggregators receive this same list object.
+        messages = [system_prompt]
+
+        user_context = LLMContextAggregator(messages, "user", transport.my_participant_id)
+        assistant_context = LLMContextAggregator(messages, "assistant", transport.my_participant_id)
+        response_end_sound = OutboundSoundEffectWrapper()
+        user_turn_sound = InboundSoundEffectWrapper()
+        # NOTE(review): this logger is constructed but never placed in the
+        # pipeline below -- confirm whether it should follow the assistant
+        # aggregator.
+        llm_logger = FrameLogger("LLM Out")
+        transcription_logger = FrameLogger("Transcription In")
+
+        # Chain the stages from the source outwards; each stage consumes the
+        # frames produced by the one before it.
+        frames = transport.get_receive_frames()
+        frames = user_context.run(frames)
+        frames = user_turn_sound.run(frames)
+        frames = transcription_logger.run(frames)
+        frames = llm.run(frames)
+        frames = assistant_context.run(frames)
+        frames = tts.run(frames)
+
+        # The outbound sound wrapper is the last stage and feeds the send queue.
+        await response_end_sound.run_to_queue(transport.send_queue, frames)
+
+    @transport.event_handler("on_participant_joined")
+    async def pax_joined(transport, pax):
+        print(f"PARTICIPANT JOINED: {pax}")
+
+    @transport.event_handler("on_call_state_updated")
+    async def on_call_state_updated(transport, state):
+        # Recording is started before dialing, and only when a number was given.
+        if state == "joined" and phone:
+            transport.start_recording()
+            transport.dialout(phone)
+
+    transport.transcription_settings["extra"]["punctuate"] = True
+
+    await asyncio.gather(transport.run(), handle_transcriptions())
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse arguments, create a meeting token, run the bot.
+    parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
+    parser.add_argument(
+        "-u", "--url", type=str, required=True, help="URL of the Daily room to join"
+    )
+    parser.add_argument(
+        "-k",
+        "--apikey",
+        type=str,
+        required=True,
+        help="Daily API Key (needed to create token)",
+    )
+
+    parser.add_argument("-p", "--phone", type=str, required=False, help="A phone number to call when the bot joins the room")
+
+    # parse_known_args tolerates extra arguments; they are collected and ignored.
+    args, unknown = parser.parse_known_args()
+
+    # Create a meeting token for the given room with an expiration 1 hour in the future.
+    # The room name is the URL path without its leading slash.
+    room_name: str = urllib.parse.urlparse(args.url).path[1:]
+    expiration: float = time.time() + 60 * 60
+
+    # NOTE(review): this targets the *staging* API host -- confirm whether the
+    # sample should use the production host (api.daily.co) instead.
+    res: requests.Response = requests.post(
+        "https://api.staging.daily.co/v1/meeting-tokens",
+        headers={"Authorization": f"Bearer {args.apikey}"},
+        json={
+            "properties": {"room_name": room_name, "is_owner": True, "exp": expiration}
+        },
+        # Without a timeout, requests can wait indefinitely on an unresponsive server.
+        timeout=30,
+    )
+
+    if res.status_code != 200:
+        raise Exception(f"Failed to create meeting token: {res.status_code} {res.text}")
+
+    token: str = res.json()["token"]
+    asyncio.run(main(args.url, token, args.phone))
--- a/src/samples/foundational/assets/ding1.wav
+++ b/src/samples/foundational/assets/ding1.wav
--- a/src/samples/foundational/assets/ding2.wav
+++ b/src/samples/foundational/assets/ding2.wav
--- a/src/samples/foundational/assets/sc-default.png
+++ b/src/samples/foundational/assets/sc-default.png
--- a/src/samples/foundational/assets/sc-listen-1.png
+++ b/src/samples/foundational/assets/sc-listen-1.png
--- a/src/samples/foundational/assets/sc-listen-2.png
+++ b/src/samples/foundational/assets/sc-listen-2.png
--- a/src/samples/foundational/assets/sc-talk.png
+++ b/src/samples/foundational/assets/sc-talk.png
--- a/src/samples/foundational/assets/sc-think-1.png
+++ b/src/samples/foundational/assets/sc-think-1.png
--- a/src/samples/foundational/assets/sc-think-2.png
+++ b/src/samples/foundational/assets/sc-think-2.png
--- a/src/samples/foundational/assets/sc-think-3.png
+++ b/src/samples/foundational/assets/sc-think-3.png
--- a/src/samples/foundational/assets/sc-think-4.png
+++ b/src/samples/foundational/assets/sc-think-4.png
--- a/src/samples/foundational/assets/speaking.png
+++ b/src/samples/foundational/assets/speaking.png
--- a/src/samples/foundational/assets/waiting.png
+++ b/src/samples/foundational/assets/waiting.png