Added sound effect example (#18)
* added sound effect example * added dialout to this branch too * fixup * fixup for more dialout testing * cleanup
@@ -18,6 +18,8 @@ class StartStreamQueueFrame(ControlQueueFrame):
|
||||
class EndStreamQueueFrame(ControlQueueFrame):
|
||||
pass
|
||||
|
||||
class LLMResponseEndQueueFrame(QueueFrame):
|
||||
pass
|
||||
|
||||
@dataclass()
|
||||
class AudioQueueFrame(QueueFrame):
|
||||
|
||||
@@ -9,6 +9,7 @@ from dailyai.queue_frame import (
|
||||
EndStreamQueueFrame,
|
||||
ImageQueueFrame,
|
||||
LLMMessagesQueueFrame,
|
||||
LLMResponseEndQueueFrame,
|
||||
QueueFrame,
|
||||
TextQueueFrame,
|
||||
)
|
||||
@@ -89,6 +90,9 @@ class LLMService(AIService):
|
||||
if isinstance(frame, LLMMessagesQueueFrame):
|
||||
async for text_chunk in self.run_llm_async(frame.messages):
|
||||
yield TextQueueFrame(text_chunk)
|
||||
yield LLMResponseEndQueueFrame()
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
class TTSService(AIService):
|
||||
@@ -186,6 +190,18 @@ class STTService(AIService):
|
||||
text = await self.run_stt(content)
|
||||
yield TextQueueFrame(text)
|
||||
|
||||
class FrameLogger(AIService):
|
||||
def __init__(self, prefix="Frame", **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.prefix = prefix
|
||||
|
||||
async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
|
||||
if isinstance(frame, (AudioQueueFrame, ImageQueueFrame)):
|
||||
self.logger.info(f"{self.prefix}: {type(frame)}")
|
||||
else:
|
||||
print(f"{self.prefix}: {frame}")
|
||||
|
||||
yield frame
|
||||
|
||||
@dataclass
|
||||
class AIServiceConfig:
|
||||
|
||||
@@ -305,6 +305,12 @@ class DailyTransportService(EventHandler):
|
||||
t = Thread(target=self._receive_audio, daemon=True)
|
||||
t.start()
|
||||
|
||||
def dialout(self, number):
|
||||
self.client.start_dialout({"phoneNumber": number})
|
||||
|
||||
def start_recording(self):
|
||||
self.client.start_recording()
|
||||
|
||||
def on_error(self, error):
|
||||
self._logger.error(f"on_error: {error}")
|
||||
|
||||
|
||||
@@ -79,8 +79,8 @@ async def main(room_url: str, token):
|
||||
messages, transport.my_participant_id
|
||||
)
|
||||
image_sync_aggregator = ImageSyncAggregator(
|
||||
os.path.join(os.path.dirname(__file__), "images", "speaking.png"),
|
||||
os.path.join(os.path.dirname(__file__), "images", "waiting.png"),
|
||||
os.path.join(os.path.dirname(__file__), "assets", "speaking.png"),
|
||||
os.path.join(os.path.dirname(__file__), "assets", "waiting.png"),
|
||||
)
|
||||
await tts.run_to_queue(
|
||||
transport.send_queue,
|
||||
|
||||
@@ -36,9 +36,9 @@ async def main(room_url:str):
|
||||
affirmative = "A woman dressed as a cowboy, outside on a ranch"
|
||||
negative = "Pikachu in a business suit"
|
||||
|
||||
topic = "Is a hot dog a sandwich?"
|
||||
affirmative = "A woman conservatively dressed as a librarian in a library surrounded by books"
|
||||
negative = "A cat dressed in a hot dog costume"
|
||||
# topic = "Is a hot dog a sandwich?"
|
||||
# affirmative = "A woman conservatively dressed as a librarian in a library surrounded by books"
|
||||
# negative = "A cat dressed in a hot dog costume"
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@ script_dir = os.path.dirname(__file__)
|
||||
|
||||
for file in image_files:
|
||||
# Build the full path to the image file
|
||||
full_path = os.path.join(script_dir, "images", file)
|
||||
full_path = os.path.join(script_dir, "assets", file)
|
||||
# Get the filename without the extension to use as the dictionary key
|
||||
filename = os.path.splitext(os.path.basename(full_path))[0]
|
||||
# Open the image and convert it to bytes
|
||||
|
||||
159
src/samples/foundational/11-sound-effects.py
Normal file
@@ -0,0 +1,159 @@
|
||||
import argparse
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import wave
|
||||
import requests
|
||||
import time
|
||||
import urllib.parse
|
||||
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
|
||||
from dailyai.queue_aggregators import LLMContextAggregator, LLMUserContextAggregator, LLMAssistantContextAggregator
|
||||
from dailyai.services.ai_services import AIService, FrameLogger
|
||||
from dailyai.queue_frame import QueueFrame, AudioQueueFrame, LLMResponseEndQueueFrame, LLMMessagesQueueFrame
|
||||
from typing import AsyncGenerator
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s") # or whatever
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
sounds = {}
|
||||
sound_files = [
|
||||
'ding1.wav',
|
||||
'ding2.wav'
|
||||
]
|
||||
|
||||
script_dir = os.path.dirname(__file__)
|
||||
|
||||
for file in sound_files:
|
||||
# Build the full path to the image file
|
||||
full_path = os.path.join(script_dir, "assets", file)
|
||||
# Get the filename without the extension to use as the dictionary key
|
||||
filename = os.path.splitext(os.path.basename(full_path))[0]
|
||||
# Open the image and convert it to bytes
|
||||
with wave.open(full_path) as audio_file:
|
||||
sounds[file] = audio_file.readframes(-1)
|
||||
|
||||
|
||||
|
||||
|
||||
class OutboundSoundEffectWrapper(AIService):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
|
||||
if isinstance(frame, LLMResponseEndQueueFrame):
|
||||
yield AudioQueueFrame(sounds["ding1.wav"])
|
||||
# In case anything else up the stack needs it
|
||||
yield frame
|
||||
else:
|
||||
yield frame
|
||||
|
||||
class InboundSoundEffectWrapper(AIService):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
|
||||
if isinstance(frame, LLMMessagesQueueFrame):
|
||||
yield AudioQueueFrame(sounds["ding2.wav"])
|
||||
# In case anything else up the stack needs it
|
||||
yield frame
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
global transport
|
||||
global llm
|
||||
global tts
|
||||
|
||||
transport = DailyTransportService(
|
||||
room_url,
|
||||
token,
|
||||
"Respond bot",
|
||||
5,
|
||||
)
|
||||
transport.mic_enabled = True
|
||||
transport.mic_sample_rate = 16000
|
||||
transport.camera_enabled = False
|
||||
|
||||
llm = AzureLLMService()
|
||||
tts = AzureTTSService()
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
await tts.say("Hi, I'm listening!", transport.send_queue)
|
||||
await transport.send_queue.put(AudioQueueFrame(sounds["ding1.wav"]))
|
||||
async def handle_transcriptions():
|
||||
messages = [
|
||||
{"role": "system", "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way."},
|
||||
]
|
||||
|
||||
tma_in = LLMUserContextAggregator(
|
||||
messages, transport.my_participant_id
|
||||
)
|
||||
tma_out = LLMAssistantContextAggregator(
|
||||
messages, transport.my_participant_id
|
||||
)
|
||||
out_sound = OutboundSoundEffectWrapper()
|
||||
in_sound = InboundSoundEffectWrapper()
|
||||
fl = FrameLogger("LLM Out")
|
||||
fl2 = FrameLogger("Transcription In")
|
||||
await out_sound.run_to_queue(
|
||||
transport.send_queue,
|
||||
tts.run(
|
||||
fl.run(
|
||||
tma_out.run(
|
||||
llm.run(
|
||||
fl2.run(
|
||||
in_sound.run(
|
||||
tma_in.run(
|
||||
transport.get_receive_frames()
|
||||
)
|
||||
)
|
||||
)
|
||||
)
|
||||
)
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
transport.transcription_settings["extra"]["punctuate"] = True
|
||||
await asyncio.gather(transport.run(), handle_transcriptions())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
|
||||
parser.add_argument(
|
||||
"-u", "--url", type=str, required=True, help="URL of the Daily room to join"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-k",
|
||||
"--apikey",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Daily API Key (needed to create token)",
|
||||
)
|
||||
|
||||
args, unknown = parser.parse_known_args()
|
||||
|
||||
# Create a meeting token for the given room with an expiration 1 hour in the future.
|
||||
room_name: str = urllib.parse.urlparse(args.url).path[1:]
|
||||
expiration: float = time.time() + 60 * 60
|
||||
|
||||
res: requests.Response = requests.post(
|
||||
f"https://api.daily.co/v1/meeting-tokens",
|
||||
headers={"Authorization": f"Bearer {args.apikey}"},
|
||||
json={
|
||||
"properties": {"room_name": room_name, "is_owner": True, "exp": expiration}
|
||||
},
|
||||
)
|
||||
|
||||
if res.status_code != 200:
|
||||
raise Exception(f"Failed to create meeting token: {res.status_code} {res.text}")
|
||||
|
||||
token: str = res.json()["token"]
|
||||
|
||||
asyncio.run(main(args.url, token))
|
||||
165
src/samples/foundational/11a-dial-out.py
Normal file
@@ -0,0 +1,165 @@
|
||||
import argparse
|
||||
import asyncio
|
||||
import os
|
||||
import wave
|
||||
import requests
|
||||
import time
|
||||
import urllib.parse
|
||||
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
|
||||
from dailyai.queue_aggregators import LLMContextAggregator
|
||||
from dailyai.services.ai_services import AIService, FrameLogger
|
||||
from dailyai.queue_frame import QueueFrame, AudioQueueFrame, LLMResponseEndQueueFrame, LLMMessagesQueueFrame
|
||||
from typing import AsyncGenerator
|
||||
|
||||
sounds = {}
|
||||
sound_files = [
|
||||
'ding1.wav',
|
||||
'ding2.wav'
|
||||
]
|
||||
|
||||
script_dir = os.path.dirname(__file__)
|
||||
|
||||
for file in sound_files:
|
||||
# Build the full path to the image file
|
||||
full_path = os.path.join(script_dir, "assets", file)
|
||||
# Get the filename without the extension to use as the dictionary key
|
||||
filename = os.path.splitext(os.path.basename(full_path))[0]
|
||||
# Open the image and convert it to bytes
|
||||
with wave.open(full_path) as audio_file:
|
||||
sounds[file] = audio_file.readframes(-1)
|
||||
|
||||
|
||||
|
||||
|
||||
class OutboundSoundEffectWrapper(AIService):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
|
||||
if isinstance(frame, LLMResponseEndQueueFrame):
|
||||
yield AudioQueueFrame(sounds["ding1.wav"])
|
||||
# In case anything else up the stack needs it
|
||||
yield frame
|
||||
else:
|
||||
yield frame
|
||||
|
||||
class InboundSoundEffectWrapper(AIService):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
|
||||
if isinstance(frame, LLMMessagesQueueFrame):
|
||||
yield AudioQueueFrame(sounds["ding2.wav"])
|
||||
# In case anything else up the stack needs it
|
||||
yield frame
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
async def main(room_url: str, token, phone):
|
||||
global transport
|
||||
global llm
|
||||
global tts
|
||||
|
||||
transport = DailyTransportService(
|
||||
room_url,
|
||||
token,
|
||||
"Respond bot",
|
||||
300,
|
||||
)
|
||||
transport.mic_enabled = True
|
||||
transport.mic_sample_rate = 16000
|
||||
transport.camera_enabled = False
|
||||
|
||||
llm = AzureLLMService()
|
||||
tts = AzureTTSService()
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
await tts.say("Hi, I'm listening!", transport.send_queue)
|
||||
await transport.send_queue.put(AudioQueueFrame(sounds["ding1.wav"]))
|
||||
async def handle_transcriptions():
|
||||
messages = [
|
||||
{"role": "system", "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way."},
|
||||
]
|
||||
|
||||
tma_in = LLMContextAggregator(
|
||||
messages, "user", transport.my_participant_id
|
||||
)
|
||||
tma_out = LLMContextAggregator(
|
||||
messages, "assistant", transport.my_participant_id
|
||||
)
|
||||
out_sound = OutboundSoundEffectWrapper()
|
||||
in_sound = InboundSoundEffectWrapper()
|
||||
fl = FrameLogger("LLM Out")
|
||||
fl2 = FrameLogger("Transcription In")
|
||||
await out_sound.run_to_queue(
|
||||
transport.send_queue,
|
||||
tts.run(
|
||||
tma_out.run(
|
||||
llm.run(
|
||||
fl2.run(
|
||||
in_sound.run(
|
||||
tma_in.run(
|
||||
transport.get_receive_frames()
|
||||
)
|
||||
)
|
||||
)
|
||||
)
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
@transport.event_handler("on_participant_joined")
|
||||
async def pax_joined(transport, pax):
|
||||
print(f"PARTICIPANT JOINED: {pax}")
|
||||
|
||||
@transport.event_handler("on_call_state_updated")
|
||||
async def on_call_state_updated(transport, state):
|
||||
if (state == "joined"):
|
||||
if (phone):
|
||||
transport.start_recording()
|
||||
transport.dialout(phone)
|
||||
|
||||
|
||||
transport.transcription_settings["extra"]["punctuate"] = True
|
||||
|
||||
await asyncio.gather(transport.run(), handle_transcriptions())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
|
||||
parser.add_argument(
|
||||
"-u", "--url", type=str, required=True, help="URL of the Daily room to join"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-k",
|
||||
"--apikey",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Daily API Key (needed to create token)",
|
||||
)
|
||||
|
||||
parser.add_argument("-p", "--phone", type=str, required=False, help="A phone number to call when the bot joins the room")
|
||||
|
||||
args, unknown = parser.parse_known_args()
|
||||
|
||||
# Create a meeting token for the given room with an expiration 1 hour in the future.
|
||||
room_name: str = urllib.parse.urlparse(args.url).path[1:]
|
||||
expiration: float = time.time() + 60 * 60
|
||||
|
||||
res: requests.Response = requests.post(
|
||||
f"https://api.staging.daily.co/v1/meeting-tokens",
|
||||
headers={"Authorization": f"Bearer {args.apikey}"},
|
||||
json={
|
||||
"properties": {"room_name": room_name, "is_owner": True, "exp": expiration}
|
||||
},
|
||||
)
|
||||
|
||||
if res.status_code != 200:
|
||||
raise Exception(f"Failed to create meeting token: {res.status_code} {res.text}")
|
||||
|
||||
token: str = res.json()["token"]
|
||||
asyncio.run(main(args.url, token, args.phone))
|
||||
BIN
src/samples/foundational/assets/ding1.wav
Normal file
BIN
src/samples/foundational/assets/ding2.wav
Normal file
|
Before Width: | Height: | Size: 871 KiB After Width: | Height: | Size: 871 KiB |
|
Before Width: | Height: | Size: 868 KiB After Width: | Height: | Size: 868 KiB |
|
Before Width: | Height: | Size: 868 KiB After Width: | Height: | Size: 868 KiB |
|
Before Width: | Height: | Size: 870 KiB After Width: | Height: | Size: 870 KiB |
|
Before Width: | Height: | Size: 871 KiB After Width: | Height: | Size: 871 KiB |
|
Before Width: | Height: | Size: 871 KiB After Width: | Height: | Size: 871 KiB |
|
Before Width: | Height: | Size: 872 KiB After Width: | Height: | Size: 872 KiB |
|
Before Width: | Height: | Size: 868 KiB After Width: | Height: | Size: 868 KiB |
|
Before Width: | Height: | Size: 33 KiB After Width: | Height: | Size: 33 KiB |
|
Before Width: | Height: | Size: 30 KiB After Width: | Height: | Size: 30 KiB |