Compare commits

...

52 Commits

Author SHA1 Message Date
Aleix Conchillo Flaqué
db05a9b29b Merge pull request #116 from daily-co/moondream-use-cpu
moondream: allow passing use_cpu
2024-04-11 09:08:11 +08:00
Aleix Conchillo Flaqué
130e418800 moondream: allow passing use_cpu 2024-04-10 17:43:44 -07:00
Aleix Conchillo Flaqué
1a0a66e503 Merge pull request #114 from daily-co/jpt/fal-updates
Updated Fal.ai service to take a params model and allow for model string param
2024-04-11 00:47:33 +08:00
Aleix Conchillo Flaqué
e22babbae2 examples: update with new FalImageGenService parameters 2024-04-10 09:45:08 -07:00
Aleix Conchillo Flaqué
bfe2e0f36e services: don't use image_size in ImageGenService 2024-04-10 09:44:42 -07:00
Aleix Conchillo Flaqué
26d401e5de Merge pull request #115 from daily-co/add-vision-and-moondream-service
add vision and moondream service
2024-04-11 00:22:26 +08:00
Aleix Conchillo Flaqué
3c20f9153d added VisionImageFrame and VisionImageFrameAggregator 2024-04-10 09:19:34 -07:00
Aleix Conchillo Flaqué
2f9899af5a update macos-py3.10 requirements 2024-04-09 22:39:04 -07:00
Aleix Conchillo Flaqué
5ef5cf30f4 update linux-py3.10 requirements 2024-04-09 22:36:35 -07:00
Aleix Conchillo Flaqué
34a6c5691b examples: added 12-describe-video 2024-04-09 22:36:35 -07:00
Aleix Conchillo Flaqué
18bf09c704 services: added MoondreamService 2024-04-09 22:36:35 -07:00
Aleix Conchillo Flaqué
84cfa7cc95 services: added VisionService 2024-04-09 22:36:35 -07:00
Aleix Conchillo Flaqué
a5eba0106b transport: allow requesting a user frame 2024-04-09 22:36:35 -07:00
Aleix Conchillo Flaqué
b117a185e3 frames: added UserImageRequestFrame 2024-04-09 22:14:54 -07:00
Aleix Conchillo Flaqué
0219230827 Merge pull request #113 from daily-co/aleix/only-subcribe-to-participant
only subcribe to participant
2024-04-10 10:47:29 +08:00
Aleix Conchillo Flaqué
9fcbb36997 examples: add 14a-local-render-remote-participant 2024-04-09 19:46:10 -07:00
Aleix Conchillo Flaqué
0bf15fd6eb daily: only subscribe to participant video source 2024-04-09 19:46:10 -07:00
Aleix Conchillo Flaqué
989252bb52 daily: always check camera/mic/speaker enabled 2024-04-09 19:46:10 -07:00
Jon Taylor
7b44a79a5b added params and model attribute to fal service 2024-04-09 17:43:27 -07:00
Aleix Conchillo Flaqué
4bd29b0080 Merge pull request #110 from daily-co/compatible-versions
pyproject: use compatible version
2024-04-10 00:41:22 +08:00
Aleix Conchillo Flaqué
ebb76fdae9 update macos-py3.10 requirements 2024-04-09 08:52:37 -07:00
Aleix Conchillo Flaqué
5d52def0fe update linux-py3.10 requirements 2024-04-09 08:49:41 -07:00
Aleix Conchillo Flaqué
9ada56d0b0 pyproject: use compatible version 2024-04-09 08:41:54 -07:00
Aleix Conchillo Flaqué
8d73cdb2ee Merge pull request #111 from daily-co/user-transcription-aggregator
pipeline: add UserTranscriptionAggregator
2024-04-09 23:34:52 +08:00
Aleix Conchillo Flaqué
4f04b10202 Merge pull request #112 from daily-co/user-image-frame
user image frames and other updates
2024-04-09 23:34:32 +08:00
Aleix Conchillo Flaqué
97b923e37e llm user and assistant aggregator renames 2024-04-09 08:31:48 -07:00
Aleix Conchillo Flaqué
57aabea0a3 examples: added 14-render-remote-participant 2024-04-09 08:01:14 -07:00
Aleix Conchillo Flaqué
319b8e7816 updated ImageFrame and added URLImageFrame and UserImageFrame 2024-04-08 23:23:33 -07:00
Aleix Conchillo Flaqué
96950ca6df daily: on_first_other_participant_joined now gets the participant 2024-04-08 23:23:33 -07:00
Aleix Conchillo Flaqué
d7b2e67c35 pipeline: add UserTranscriptionAggregator 2024-04-08 17:15:14 -07:00
Aleix Conchillo Flaqué
53930b47a5 github: just some rewording 2024-04-06 18:03:53 -07:00
Aleix Conchillo Flaqué
86c8ab02cc github: also publish stables releases to test pypi 2024-04-06 17:58:13 -07:00
Aleix Conchillo Flaqué
b678097f6d Merge pull request #109 from daily-co/only-use-fps
transport: only use fps to set maxFramerate
2024-04-07 07:02:44 +08:00
Aleix Conchillo Flaqué
eb455043c4 transport: use camera_bitrate and camera_framerate 2024-04-06 12:27:05 -07:00
Aleix Conchillo Flaqué
dd696be04c Merge pull request #108 from daily-co/add-camera-max-framerate
transport: add camera_max_framerate argument
2024-04-06 11:18:42 +08:00
Aleix Conchillo Flaqué
96b2337183 transport: add camera_max_framerate argument 2024-04-05 20:16:03 -07:00
Aleix Conchillo Flaqué
ea52e73f57 Merge pull request #107 from daily-co/increase-max-framerate
transport: increase daily maxFramerate to 30
2024-04-06 11:08:21 +08:00
Aleix Conchillo Flaqué
88404e4739 Merge pull request #106 from daily-co/updated-to-be-updated-examples
examples: updated to_be_updated examples
2024-04-06 11:06:30 +08:00
Aleix Conchillo Flaqué
0fd323714e transport: add camera_max_bitrate argument 2024-04-05 20:05:58 -07:00
Aleix Conchillo Flaqué
a362ca4d3d transport: increase daily maxFramerate to 30 2024-04-05 19:44:25 -07:00
Aleix Conchillo Flaqué
02b5c3dd5f update dot-env.template 2024-04-05 16:16:56 -07:00
Aleix Conchillo Flaqué
497a09cbc8 examples: updated to_be_updated examples 2024-04-05 16:01:23 -07:00
Aleix Conchillo Flaqué
172a14245d Merge pull request #104 from daily-co/threaded-transport-allow-sink-override
examples: fix whisper examples
2024-04-06 04:46:12 +08:00
Aleix Conchillo Flaqué
302246399b Merge pull request #105 from daily-co/local-tranport-read-audio-frames
transports: fix local transport read_audio_frames
2024-04-06 04:44:37 +08:00
Aleix Conchillo Flaqué
9590cc2fbc examples: fix whisper examples 2024-04-05 13:43:51 -07:00
Aleix Conchillo Flaqué
09e4044c72 transports: fix local transport read_audio_frames 2024-04-05 13:34:01 -07:00
Aleix Conchillo Flaqué
efdfb74dc3 github: increase fetch-depth to 100 for test publish 2024-04-05 08:32:29 -07:00
Aleix Conchillo Flaqué
158de6f20b github: fetch-tags and increase fetch-depth for test publish 2024-04-05 08:25:37 -07:00
Aleix Conchillo Flaqué
47f68b742d pyproject: user proper environment for test pypi 2024-04-05 08:02:45 -07:00
Aleix Conchillo Flaqué
2654ca1f62 pyproject: don't use local version for test pypi 2024-04-05 07:51:52 -07:00
Aleix Conchillo Flaqué
4263827ee8 README: use double-quotes with optional dependencies 2024-04-04 17:47:16 -07:00
Aleix Conchillo Flaqué
97fe529b0e github: update test publish workflow 2024-04-04 17:41:31 -07:00
43 changed files with 1007 additions and 359 deletions

View File

@@ -60,3 +60,25 @@ jobs:
with:
verbose: true
print-hash: true
publish-to-test-pypi:
name: "Publish to Test PyPI"
runs-on: ubuntu-latest
needs: [ build ]
environment:
name: testpypi
url: https://pypi.org/p/dailyai
permissions:
id-token: write
steps:
- name: Download wheels
uses: actions/download-artifact@v4
with:
name: wheels
path: ./dist
- name: Publish to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
verbose: true
print-hash: true
repository-url: https://test.pypi.org/legacy/

View File

@@ -15,6 +15,8 @@ jobs:
uses: actions/checkout@v4
with:
ref: ${{ github.event.inputs.gitref }}
fetch-tags: true
fetch-depth: 100
- name: Set up Python
id: setup_python
uses: actions/setup-python@v4
@@ -35,14 +37,15 @@ jobs:
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels
path: ./dist
publish-to-pypi:
name: "Test publish to PyPI"
name: "Publish to Test PyPI"
runs-on: ubuntu-latest
needs: [ build ]
environment:
name: pypi
name: testpypi
url: https://pypi.org/p/dailyai
permissions:
id-token: write
@@ -50,6 +53,7 @@ jobs:
- name: Download wheels
uses: actions/download-artifact@v4
with:
name: wheels
path: ./dist
- name: Publish to PyPI
uses: pypa/gh-action-pypi-publish@release/v1

1
.gitignore vendored
View File

@@ -3,6 +3,7 @@ env/
__pycache__/
*~
venv
.venv
#*#
# Distribution / packaging

View File

@@ -39,6 +39,8 @@ Currently implemented services:
- Transport
- Daily
- Local (in progress, intended as a quick start example service)
- Vision
- Moondream
If you'd like to [implement a service]((https://github.com/daily-co/daily-ai-sdk/tree/main/src/dailyai/services)), we welcome PRs! Our goal is to support lots of services in all of the above categories, plus new categories (like real-time video) as they emerge.
@@ -58,12 +60,12 @@ By default, in order to minimize dependencies, only the basic framework function
dependencies that you can install with:
```
pip install dailyai[option,...]
pip install "dailyai[option,...]"
```
Your project may or may not need these, so they're made available as optional requirements. Here is a list:
- **AI services**: `anthropic`, `azure`, `fal`, `openai`, `playht`, `silero`, `whisper`
- **AI services**: `anthropic`, `azure`, `fal`, `moondream`, `openai`, `playht`, `silero`, `whisper`
- **Transports**: `daily`, `local`, `websocket`
## Code examples

View File

@@ -2,8 +2,16 @@
ANTHROPIC_API_KEY=...
# Azure
SPEECH_KEY=...
SPEECH_REGION=...
AZURE_SPEECH_REGION=...
AZURE_SPEECH_API_KEY=...
AZURE_CHATGPT_API_KEY=...
AZURE_CHATGPT_ENDPOINT=https://...
AZURE_CHATGPT_MODEL=...
AZURE_DALLE_API_KEY=...
AZURE_DALLE_ENDPOINT=https://...
AZURE_DALLE_MODEL=...
# Daily
DAILY_API_KEY=...

View File

@@ -48,7 +48,7 @@ async def main(room_url):
pipeline = Pipeline([llm, tts])
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
async def on_first_other_participant_joined(transport, participant):
await pipeline.queue_frames([LLMMessagesFrame(messages), EndFrame()])
await transport.run(pipeline)

View File

@@ -31,7 +31,9 @@ async def main(room_url):
)
imagegen = FalImageGenService(
image_size="square_hd",
params=FalImageGenService.InputParams(
image_size="square_hd"
),
aiohttp_session=session,
key_id=os.getenv("FAL_KEY_ID"),
key_secret=os.getenv("FAL_KEY_SECRET"),
@@ -40,7 +42,7 @@ async def main(room_url):
pipeline = Pipeline([imagegen])
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
async def on_first_other_participant_joined(transport, participant):
# Note that we do not put an EndFrame() item in the pipeline for this demo.
# This means that the bot will stay in the channel until it times out.
# An EndFrame() in the pipeline would cause the transport to shut

View File

@@ -5,7 +5,7 @@ import os
import tkinter as tk
from dailyai.pipeline.frames import TextFrame, EndFrame
from dailyai.pipeline.frames import TextFrame
from dailyai.pipeline.pipeline import Pipeline
from dailyai.services.fal_ai_services import FalImageGenService
from dailyai.transports.local_transport import LocalTransport
@@ -35,7 +35,9 @@ async def main():
)
imagegen = FalImageGenService(
image_size="square_hd",
params=FalImageGenService.InputParams(
image_size="square_hd"
),
aiohttp_session=session,
key_id=os.getenv("FAL_KEY_ID"),
key_secret=os.getenv("FAL_KEY_SECRET"),

View File

@@ -85,7 +85,9 @@ async def main(room_url):
model="gpt-4-turbo-preview")
imagegen = FalImageGenService(
image_size="square_hd",
params=FalImageGenService.InputParams(
image_size="square_hd"
),
aiohttp_session=session,
key_id=os.getenv("FAL_KEY_ID"),
key_secret=os.getenv("FAL_KEY_SECRET"),

View File

@@ -3,8 +3,9 @@ import asyncio
import logging
import tkinter as tk
import os
from dailyai.pipeline.aggregators import LLMFullResponseAggregator
from dailyai.pipeline.frames import AudioFrame, ImageFrame
from dailyai.pipeline.frames import AudioFrame, URLImageFrame, LLMMessagesFrame, TextFrame
from dailyai.services.open_ai_services import OpenAILLMService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.fal_ai_services import FalImageGenService
@@ -22,7 +23,7 @@ async def main():
async with aiohttp.ClientSession() as session:
meeting_duration_minutes = 5
tk_root = tk.Tk()
tk_root.title("Calendar")
tk_root.title("dailyai")
transport = LocalTransport(
mic_enabled=True,
@@ -43,8 +44,10 @@ async def main():
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4-turbo-preview")
dalle = FalImageGenService(
image_size="1024x1024",
imagegen = FalImageGenService(
params=FalImageGenService.InputParams(
image_size="1024x1024"
),
aiohttp_session=session,
key_id=os.getenv("FAL_KEY_ID"),
key_secret=os.getenv("FAL_KEY_SECRET"),
@@ -60,18 +63,32 @@ async def main():
return all_audio
async def get_month_data(month):
messages = [{"role": "system", "content": f"Describe a nature photograph suitable for use in a calendar, for the month of {
month}. Include only the image description with no preamble. Limit the description to one sentence, please.", }]
async def get_month_description(aggregator, frame):
async for frame in aggregator.process_frame(frame):
if isinstance(frame, TextFrame):
return frame.text
async def get_month_data(month):
messages = [{"role": "system", "content": f"Describe a nature photograph suitable for use in a calendar, for the month of {month}. Include only the image description with no preamble. Limit the description to one sentence, please.", }]
messages_frame = LLMMessagesFrame(messages)
llm_full_response_aggregator = LLMFullResponseAggregator()
image_description = None
async for frame in llm.process_frame(messages_frame):
result = await get_month_description(llm_full_response_aggregator, frame)
if result:
image_description = result
break
image_description = await llm.run_llm(messages)
if not image_description:
return
to_speak = f"{month}: {image_description}"
audio_task = asyncio.create_task(get_all_audio(to_speak))
image_task = asyncio.create_task(
dalle.run_image_gen(image_description))
imagegen.run_image_gen(image_description))
(audio, image_data) = await asyncio.gather(audio_task, image_task)
return {
@@ -79,22 +96,18 @@ async def main():
"text": image_description,
"image_url": image_data[0],
"image": image_data[1],
"image_size": image_data[2],
"audio": audio,
}
# We only specify 5 months as we create tasks all at once and we might
# get rate limited otherwise.
months: list[str] = [
"January",
"February",
"March",
"April",
"May",
"June",
"July",
"August",
"September",
"October",
"November",
"December",
]
async def show_images():
@@ -107,7 +120,7 @@ async def main():
if data:
await transport.send_queue.put(
[
ImageFrame(data["image_url"], data["image"]),
URLImageFrame(data["image_url"], data["image"], data["image_size"]),
AudioFrame(data["audio"]),
]
)

View File

@@ -72,7 +72,7 @@ async def main(room_url: str, token):
)
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
async def on_first_other_participant_joined(transport, participant):
# Kick off the conversation.
messages.append(
{"role": "system", "content": "Please introduce yourself to the user."})

View File

@@ -0,0 +1,96 @@
import asyncio
import os
import logging
from typing import AsyncGenerator
import aiohttp
from PIL import Image
from dailyai.pipeline.frames import ImageFrame, Frame, TextFrame
from dailyai.pipeline.pipeline import Pipeline
from dailyai.transports.daily_transport import DailyTransport
from dailyai.services.ai_services import AIService
from dailyai.pipeline.aggregators import (
LLMAssistantContextAggregator,
LLMUserContextAggregator,
)
from dailyai.services.open_ai_services import OpenAILLMService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from runner import configure
from dotenv import load_dotenv
load_dotenv(override=True)
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
class ImageSyncAggregator(AIService):
def __init__(self, speaking_path: str, waiting_path: str):
self._speaking_image = Image.open(speaking_path)
self._speaking_image_bytes = self._speaking_image.tobytes()
self._waiting_image = Image.open(waiting_path)
self._waiting_image_bytes = self._waiting_image.tobytes()
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
yield ImageFrame(self._speaking_image_bytes, (1024, 1024))
yield frame
yield ImageFrame(self._waiting_image_bytes, (1024, 1024))
async def main(room_url: str, token):
async with aiohttp.ClientSession() as session:
transport = DailyTransport(
room_url,
token,
"Respond bot",
5,
)
transport._camera_enabled = True
transport._camera_width = 1024
transport._camera_height = 1024
transport._mic_enabled = True
transport._mic_sample_rate = 16000
transport.transcription_settings["extra"]["punctuate"] = True
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4-turbo-preview")
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so it should not include any special characters. Respond to what the user said in a creative and helpful way.",
},
]
tma_in = LLMUserContextAggregator(
messages, transport._my_participant_id)
tma_out = LLMAssistantContextAggregator(
messages, transport._my_participant_id
)
image_sync_aggregator = ImageSyncAggregator(
os.path.join(os.path.dirname(__file__), "assets", "speaking.png"),
os.path.join(os.path.dirname(__file__), "assets", "waiting.png"),
)
pipeline = Pipeline([image_sync_aggregator, tma_in, llm, tma_out, tts])
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport, participant):
await pipeline.queue_frames([TextFrame("Hi, I'm listening!")])
await transport.run(pipeline)
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))

View File

@@ -3,8 +3,8 @@ import aiohttp
import logging
import os
from dailyai.pipeline.aggregators import (
LLMResponseAggregator,
UserResponseAggregator,
LLMAssistantResponseAggregator,
LLMUserResponseAggregator,
)
from dailyai.pipeline.pipeline import Pipeline
@@ -50,7 +50,7 @@ async def main(room_url: str, token):
pipeline = Pipeline([FrameLogger(), llm, FrameLogger(), tts])
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
async def on_first_other_participant_joined(transport, participant):
await transport.say("Hi, I'm listening!", tts)
async def run_conversation():
@@ -63,8 +63,8 @@ async def main(room_url: str, token):
await transport.run_interruptible_pipeline(
pipeline,
post_processor=LLMResponseAggregator(messages),
pre_processor=UserResponseAggregator(messages),
post_processor=LLMAssistantResponseAggregator(messages),
pre_processor=LLMUserResponseAggregator(messages),
)
transport.transcription_settings["extra"]["punctuate"] = False

View File

@@ -51,7 +51,9 @@ async def main(room_url: str):
voice_id="jBpfuIE2acCO8z3wKNLl",
)
dalle = FalImageGenService(
image_size="1024x1024",
params=FalImageGenService.InputParams(
image_size="1024x1024"
),
aiohttp_session=session,
key_id=os.getenv("FAL_KEY_ID"),
key_secret=os.getenv("FAL_KEY_SECRET"),
@@ -122,7 +124,7 @@ async def main(room_url: str):
)
await transport.send_queue.put(
[
ImageFrame(None, image_data1[1]),
ImageFrame(image_data1[1], image_data1[2]),
AudioFrame(audio1),
]
)
@@ -134,7 +136,7 @@ async def main(room_url: str):
)
await transport.send_queue.put(
[
ImageFrame(None, image_data2[1]),
ImageFrame(image_data2[1], image_data2[2]),
AudioFrame(audio2),
]
)

View File

@@ -5,6 +5,7 @@ import os
import random
from typing import AsyncGenerator
from PIL import Image
from dailyai.pipeline.pipeline import Pipeline
from dailyai.transports.daily_transport import DailyTransport
from dailyai.services.open_ai_services import OpenAILLMService
@@ -54,7 +55,7 @@ for file in image_files:
sprites[file] = img.tobytes()
# When the bot isn't talking, show a static image of the cat listening
quiet_frame = ImageFrame("", sprites["sc-listen-1.png"])
quiet_frame = ImageFrame(sprites["sc-listen-1.png"], (720, 1280))
# When the bot is talking, build an animation from two sprites
talking_list = [sprites["sc-default.png"], sprites["sc-talk.png"]]
talking = [random.choice(talking_list) for x in range(30)]
@@ -133,6 +134,7 @@ async def main(room_url: str, token):
transport._camera_enabled = True
transport._camera_width = 720
transport._camera_height = 1280
transport.transcription_settings["extra"]["punctuate"] = True
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
@@ -145,45 +147,34 @@ async def main(room_url: str, token):
)
isa = ImageSyncAggregator()
messages = [
{
"role": "system",
"content": "You are Santa Cat, a cat that lives in Santa's workshop at the North Pole. You should be clever, and a bit sarcastic. You should also tell jokes every once in a while. Your responses should only be a few sentences long.",
},
]
tma_in = LLMUserContextAggregator(
messages, transport._my_participant_id)
tma_out = LLMAssistantContextAggregator(
messages, transport._my_participant_id
)
tf = TranscriptFilter(transport._my_participant_id)
ncf = NameCheckFilter(["Santa Cat", "Santa"])
pipeline = Pipeline([isa, tf, ncf, tma_in, llm, tma_out, tts])
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
await tts.say(
async def on_first_other_participant_joined(transport, participant):
await transport.say(
"Hi! If you want to talk to me, just say 'hey Santa Cat'.",
transport.send_queue,
)
async def handle_transcriptions():
messages = [
{
"role": "system",
"content": "You are Santa Cat, a cat that lives in Santa's workshop at the North Pole. You should be clever, and a bit sarcastic. You should also tell jokes every once in a while. Your responses should only be a few sentences long.",
},
]
tma_in = LLMUserContextAggregator(
messages, transport._my_participant_id)
tma_out = LLMAssistantContextAggregator(
messages, transport._my_participant_id
)
tf = TranscriptFilter(transport._my_participant_id)
ncf = NameCheckFilter(["Santa Cat", "Santa"])
await tts.run_to_queue(
transport.send_queue,
isa.run(
tma_out.run(
llm.run(
tma_in.run(
ncf.run(tf.run(transport.get_receive_frames())))
)
)
),
tts,
)
async def starting_image():
await transport.send_queue.put(quiet_frame)
transport.transcription_settings["extra"]["punctuate"] = True
await asyncio.gather(transport.run(), handle_transcriptions(), starting_image())
await asyncio.gather(transport.run(pipeline), starting_image())
if __name__ == "__main__":

View File

@@ -3,6 +3,7 @@ import asyncio
import logging
import os
import wave
from dailyai.pipeline.pipeline import Pipeline
from dailyai.transports.daily_transport import DailyTransport
from dailyai.services.open_ai_services import OpenAILLMService
@@ -81,6 +82,7 @@ async def main(room_url: str, token):
mic_sample_rate=16000,
camera_enabled=False,
)
transport.transcription_settings["extra"]["punctuate"] = True
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
@@ -92,47 +94,31 @@ async def main(room_url: str, token):
voice_id="ErXwobaYiN019PkySvjV",
)
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
},
]
tma_in = LLMUserContextAggregator(
messages, transport._my_participant_id)
tma_out = LLMAssistantContextAggregator(
messages, transport._my_participant_id
)
out_sound = OutboundSoundEffectWrapper()
in_sound = InboundSoundEffectWrapper()
fl = FrameLogger("LLM Out")
fl2 = FrameLogger("Transcription In")
pipeline = Pipeline([tma_in, in_sound, fl2, llm, tma_out, fl, tts, out_sound])
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
await tts.say("Hi, I'm listening!", transport.send_queue)
async def on_first_other_participant_joined(transport, participant):
await transport.say("Hi, I'm listening!", tts)
await transport.send_queue.put(AudioFrame(sounds["ding1.wav"]))
async def handle_transcriptions():
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
},
]
tma_in = LLMUserContextAggregator(
messages, transport._my_participant_id)
tma_out = LLMAssistantContextAggregator(
messages, transport._my_participant_id
)
out_sound = OutboundSoundEffectWrapper()
in_sound = InboundSoundEffectWrapper()
fl = FrameLogger("LLM Out")
fl2 = FrameLogger("Transcription In")
await out_sound.run_to_queue(
transport.send_queue,
tts.run(
fl.run(
tma_out.run(
llm.run(
fl2.run(
in_sound.run(
tma_in.run(transport.get_receive_frames())
)
)
)
)
)
),
)
transport.transcription_settings["extra"]["punctuate"] = True
await asyncio.gather(transport.run(), handle_transcriptions())
await asyncio.gather(transport.run(pipeline))
if __name__ == "__main__":

View File

@@ -0,0 +1,85 @@
import asyncio
import aiohttp
import logging
import os
from typing import AsyncGenerator
from dailyai.pipeline.aggregators import FrameProcessor, UserResponseAggregator, VisionImageFrameAggregator
from dailyai.pipeline.frames import Frame, TextFrame, UserImageRequestFrame
from dailyai.pipeline.pipeline import Pipeline
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.moondream_ai_service import MoondreamService
from dailyai.transports.daily_transport import DailyTransport
from runner import configure
from dotenv import load_dotenv
load_dotenv(override=True)
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
class UserImageRequester(FrameProcessor):
participant_id: str
def set_participant_id(self, participant_id: str):
self.participant_id = participant_id
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if self.participant_id and isinstance(frame, TextFrame):
yield UserImageRequestFrame(self.participant_id)
yield frame
async def main(room_url: str, token):
async with aiohttp.ClientSession() as session:
transport = DailyTransport(
room_url,
token,
"Describe participant video",
duration_minutes=5,
mic_enabled=True,
mic_sample_rate=16000,
vad_enabled=True,
start_transcription=True,
video_rendering_enabled=True
)
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
)
user_response = UserResponseAggregator()
image_requester = UserImageRequester()
vision_aggregator = VisionImageFrameAggregator()
# If you run into weird description, try with use_cpu=True
moondream = MoondreamService()
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
)
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport, participant):
await transport.say("Hi there! Feel free to ask me what I see.", tts)
transport.render_participant_video(participant["id"], framerate=0)
image_requester.set_participant_id(participant["id"])
pipeline = Pipeline([user_response, image_requester, vision_aggregator, moondream, tts])
await transport.run(pipeline)
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))

View File

@@ -1,12 +1,16 @@
import asyncio
import logging
from dailyai.pipeline.frames import EndFrame, TranscriptionFrame
from dailyai.transports.daily_transport import DailyTransport
from dailyai.services.whisper_ai_services import WhisperSTTService
from dailyai.pipeline.pipeline import Pipeline
from runner import configure
from dotenv import load_dotenv
load_dotenv(override=True)
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
@@ -26,17 +30,27 @@ async def main(room_url: str):
stt = WhisperSTTService()
transcription_output_queue = asyncio.Queue()
transport_done = asyncio.Event()
pipeline = Pipeline([stt])
pipeline.set_sink(transcription_output_queue)
pipeline = Pipeline([stt], source=transport.receive_queue, sink=transcription_output_queue)
async def handle_transcription():
print("`````````TRANSCRIPTION`````````")
while True:
while not transport_done.is_set():
item = await transcription_output_queue.get()
print(item.text)
print("got item from queue", item)
if isinstance(item, TranscriptionFrame):
print(item.text)
elif isinstance(item, EndFrame):
break
print("handle_transcription done")
await asyncio.gather(transport.run(pipeline), handle_transcription())
async def run_until_done():
await transport.run()
transport_done.set()
print("run_until_done done")
await asyncio.gather(run_until_done(), pipeline.run_pipeline(), handle_transcription())
if __name__ == "__main__":

View File

@@ -15,11 +15,10 @@ async def main():
meeting_duration_minutes = 1
transport = LocalTransport(
mic_enabled=False,
mic_enabled=True,
camera_enabled=False,
speaker_enabled=True,
duration_minutes=meeting_duration_minutes,
start_transcription=False,
)
stt = WhisperSTTService()
@@ -27,8 +26,7 @@ async def main():
transcription_output_queue = asyncio.Queue()
transport_done = asyncio.Event()
pipeline = Pipeline([stt])
pipeline.set_sink(transcription_output_queue)
pipeline = Pipeline([stt], source=transport.receive_queue, sink=transcription_output_queue)
async def handle_transcription():
print("`````````TRANSCRIPTION`````````")
@@ -42,11 +40,11 @@ async def main():
print("handle_transcription done")
async def run_until_done():
await transport.run(pipeline)
await transport.run()
transport_done.set()
print("run_until_done done")
await asyncio.gather(run_until_done(), handle_transcription())
await asyncio.gather(run_until_done(), pipeline.run_pipeline(), handle_transcription())
if __name__ == "__main__":

View File

@@ -0,0 +1,52 @@
import asyncio
import logging
from typing import AsyncGenerator
from dailyai.pipeline.aggregators import FrameProcessor
from dailyai.pipeline.frames import ImageFrame, Frame, UserImageFrame
from dailyai.pipeline.pipeline import Pipeline
from dailyai.transports.daily_transport import DailyTransport
from runner import configure
from dotenv import load_dotenv
load_dotenv(override=True)
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
class UserImageProcessor(FrameProcessor):
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, UserImageFrame):
yield ImageFrame(frame.image, frame.size)
else:
yield frame
async def main(room_url: str, token):
transport = DailyTransport(
room_url,
token,
"Render participant video",
camera_width=1280,
camera_height=720,
camera_enabled=True,
video_rendering_enabled=True
)
@ transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport, participant):
transport.render_participant_video(participant["id"])
pipeline = Pipeline([UserImageProcessor()])
await asyncio.gather(transport.run(pipeline))
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))

View File

@@ -0,0 +1,71 @@
import asyncio
import logging
import tkinter as tk
from typing import AsyncGenerator
from dailyai.pipeline.aggregators import FrameProcessor
from dailyai.pipeline.frames import ImageFrame, Frame, UserImageFrame
from dailyai.pipeline.pipeline import Pipeline
from dailyai.transports.daily_transport import DailyTransport
from dailyai.transports.local_transport import LocalTransport
from runner import configure
from dotenv import load_dotenv
load_dotenv(override=True)
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
class UserImageProcessor(FrameProcessor):
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, UserImageFrame):
yield ImageFrame(frame.image, frame.size)
else:
yield frame
async def main(room_url: str, token):
tk_root = tk.Tk()
tk_root.title("dailyai")
local_transport = LocalTransport(
tk_root=tk_root,
camera_enabled=True,
camera_width=1280,
camera_height=720
)
transport = DailyTransport(
room_url,
token,
"Render participant video",
video_rendering_enabled=True
)
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport, participant):
transport.render_participant_video(participant["id"])
async def run_tk():
while not transport._stop_threads.is_set():
tk_root.update()
tk_root.update_idletasks()
await asyncio.sleep(0.1)
local_pipeline = Pipeline([UserImageProcessor()], source=transport.receive_queue)
await asyncio.gather(
transport.run(),
local_transport.run(local_pipeline, override_pipeline_source_queue=False),
run_tk()
)
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))

View File

@@ -1,122 +0,0 @@
import asyncio
import os
import logging
from typing import AsyncGenerator
import aiohttp
from PIL import Image
from dailyai.pipeline.frames import ImageFrame, Frame
from dailyai.transports.daily_transport import DailyTransport
from dailyai.services.ai_services import AIService
from dailyai.pipeline.aggregators import (
LLMAssistantContextAggregator,
LLMUserContextAggregator,
)
from dailyai.services.open_ai_services import OpenAILLMService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.fal_ai_services import FalImageGenService
from runner import configure
from dotenv import load_dotenv
load_dotenv(override=True)
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
class ImageSyncAggregator(AIService):
def __init__(self, speaking_path: str, waiting_path: str):
self._speaking_image = Image.open(speaking_path)
self._speaking_image_bytes = self._speaking_image.tobytes()
self._waiting_image = Image.open(waiting_path)
self._waiting_image_bytes = self._waiting_image.tobytes()
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
yield ImageFrame(None, self._speaking_image_bytes)
yield frame
yield ImageFrame(None, self._waiting_image_bytes)
async def main(room_url: str, token):
async with aiohttp.ClientSession() as session:
transport = DailyTransport(
room_url,
token,
"Respond bot",
5,
)
transport._camera_enabled = True
transport._camera_width = 1024
transport._camera_height = 1024
transport._mic_enabled = True
transport._mic_sample_rate = 16000
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4-turbo-preview")
img = FalImageGenService(
image_size="1024x1024",
aiohttp_session=session,
key_id=os.getenv("FAL_KEY_ID"),
key_secret=os.getenv("FAL_KEY_SECRET"),
)
async def get_images():
get_speaking_task = asyncio.create_task(
img.run_image_gen("An image of a cat speaking")
)
get_waiting_task = asyncio.create_task(
img.run_image_gen("An image of a cat waiting")
)
(speaking_data, waiting_data) = await asyncio.gather(
get_speaking_task, get_waiting_task
)
return speaking_data, waiting_data
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
await tts.say("Hi, I'm listening!", transport.send_queue)
async def handle_transcriptions():
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
},
]
tma_in = LLMUserContextAggregator(
messages, transport._my_participant_id)
tma_out = LLMAssistantContextAggregator(
messages, transport._my_participant_id
)
image_sync_aggregator = ImageSyncAggregator(
os.path.join(
os.path.dirname(__file__), "assets", "speaking.png"), os.path.join(
os.path.dirname(__file__), "assets", "waiting.png"), )
await tts.run_to_queue(
transport.send_queue,
image_sync_aggregator.run(
tma_out.run(llm.run(tma_in.run(transport.get_receive_frames())))
),
)
transport.transcription_settings["extra"]["punctuate"] = True
await asyncio.gather(transport.run(), handle_transcriptions())
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))

View File

@@ -80,7 +80,7 @@ async def main(room_url: str, token, phone):
tts = AzureTTSService()
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
async def on_first_other_participant_joined(transport, participant):
await tts.say("Hi, I'm listening!", transport.send_queue)
await transport.send_queue.put(AudioFrame(sounds["ding1.wav"]))

View File

@@ -6,8 +6,8 @@ from PIL import Image
from typing import AsyncGenerator
from dailyai.pipeline.aggregators import (
LLMResponseAggregator,
UserResponseAggregator,
LLMAssistantResponseAggregator,
LLMUserResponseAggregator,
)
from dailyai.pipeline.frames import (
ImageFrame,
@@ -48,7 +48,7 @@ for i in range(1, 26):
flipped = sprites[::-1]
sprites.extend(flipped)
# When the bot isn't talking, show a static image of the cat listening
quiet_frame = ImageFrame("", sprites[0])
quiet_frame = ImageFrame(sprites[0], (1024, 576))
talking_frame = SpriteFrame(images=sprites)
@@ -127,7 +127,7 @@ async def main(room_url: str, token):
]
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
async def on_first_other_participant_joined(transport, participant):
print(f"!!! in here, pipeline.source is {pipeline.source}")
await pipeline.queue_frames([LLMMessagesFrame(messages)])
@@ -135,8 +135,8 @@ async def main(room_url: str, token):
await transport.run_interruptible_pipeline(
pipeline,
post_processor=LLMResponseAggregator(messages),
pre_processor=UserResponseAggregator(messages),
post_processor=LLMAssistantResponseAggregator(messages),
pre_processor=LLMUserResponseAggregator(messages),
)
transport.transcription_settings["extra"]["endpointing"] = True

View File

@@ -330,7 +330,7 @@ async def main(room_url: str, token):
pipeline = Pipeline(processors=[fl, llm, fl2, checklist, tts])
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
async def on_first_other_participant_joined(transport, participant):
await pipeline.queue_frames([OpenAILLMContextFrame(context)])
async def handle_intake():

View File

@@ -19,8 +19,8 @@ from dailyai.services.deepgram_ai_services import DeepgramTTSService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.pipeline.aggregators import (
LLMAssistantContextAggregator,
UserResponseAggregator,
LLMResponseAggregator,
LLMAssistantResponseAggregator,
LLMUserResponseAggregator,
)
from dailyai.pipeline.frames import (
EndPipeFrame,
@@ -99,7 +99,7 @@ class StoryProcessor(FrameProcessor):
1. Catch the frames that are generated by the LLM service
"""
if isinstance(frame, UserStoppedSpeakingFrame):
yield ImageFrame(None, images["grandma-writing.png"])
yield ImageFrame(images["grandma-writing.png"], (1024, 1024))
yield AudioFrame(sounds["talking.wav"])
elif isinstance(frame, TextFrame):
@@ -112,7 +112,7 @@ class StoryProcessor(FrameProcessor):
self._text = self._text.replace("\n", " ")
if len(self._text) > 2:
yield ImageFrame(None, images["grandma-writing.png"])
yield ImageFrame(images["grandma-writing.png"], (1024, 1024))
yield StoryStartFrame(self._text)
yield AudioFrame(sounds["ding3.wav"])
self._text = ""
@@ -146,11 +146,11 @@ class StoryProcessor(FrameProcessor):
# last bit
pass
elif isinstance(frame, LLMResponseEndFrame):
yield ImageFrame(None, images["grandma-writing.png"])
yield ImageFrame(images["grandma-writing.png"], (1024, 1024))
yield StoryPromptFrame(self._text)
self._text = ""
yield frame
yield ImageFrame(None, images["grandma-listening.png"])
yield ImageFrame(images["grandma-listening.png"], (1024, 1024))
yield AudioFrame(sounds["listening.wav"])
else:
@@ -204,13 +204,15 @@ async def main(room_url: str, token):
voice_id="Xb7hH8MSUJpSbSDYk0k2",
) # matilda
img = FalImageGenService(
image_size="1024x1024",
params={
image_size = "1024x1024",
},
aiohttp_session=session,
key_id=os.getenv("FAL_KEY_ID"),
key_secret=os.getenv("FAL_KEY_SECRET"),
)
lra = LLMResponseAggregator(messages)
ura = UserResponseAggregator(messages)
lra = LLMAssistantResponseAggregator(messages)
ura = LLMUserResponseAggregator(messages)
sp = StoryProcessor(messages, story)
sig = StoryImageGenerator(story, llm, img)
@@ -232,7 +234,7 @@ async def main(room_url: str, token):
start_story_event = asyncio.Event()
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
async def on_first_other_participant_joined(transport, participant):
start_story_event.set()
async def storytime():
@@ -252,7 +254,7 @@ async def main(room_url: str, token):
[llm, lca, tts], sink=transport.send_queue)
await local_pipeline.queue_frames(
[
ImageFrame(None, images["grandma-listening.png"]),
ImageFrame(images["grandma-listening.png"], (1024, 1024)),
LLMMessagesFrame(intro_messages),
AudioFrame(sounds["listening.wav"]),
EndPipeFrame(),

View File

@@ -33,19 +33,24 @@ certifi==2024.2.2
# httpcore
# httpx
# requests
cffi==1.16.0
# via cryptography
charset-normalizer==3.3.2
# via requests
click==8.1.7
# via
# fal
# flask
# rich-click
colorama==0.4.6
# via fal
coloredlogs==15.0.1
# via onnxruntime
cryptography==42.0.5
# via pyjwt
ctranslate2==4.1.0
# via faster-whisper
daily-python==0.7.2
daily-python==0.7.3
# via dailyai (pyproject.toml)
deprecated==1.2.14
# via opentelemetry-api
@@ -57,22 +62,25 @@ distro==1.9.0
# via
# anthropic
# openai
einops==0.7.0
# via dailyai (pyproject.toml)
exceptiongroup==1.2.0
# via anyio
fal==0.12.3
fal==0.12.7
# via dailyai (pyproject.toml)
fastapi==0.99.1
# via fal
faster-whisper==1.0.1
# via dailyai (pyproject.toml)
filelock==3.13.3
filelock==3.13.4
# via
# huggingface-hub
# pyht
# torch
# transformers
# triton
# virtualenv
flask==3.0.2
flask==3.0.3
# via
# dailyai (pyproject.toml)
# flask-cors
@@ -109,7 +117,9 @@ httpx==0.27.0
huggingface-hub==0.22.2
# via
# faster-whisper
# timm
# tokenizers
# transformers
humanfriendly==10.0
# via coloredlogs
idna==3.6
@@ -124,7 +134,7 @@ isolate[build]==0.12.7
# via
# fal
# isolate-proto
isolate-proto==0.3.3
isolate-proto==0.3.4
# via fal
itsdangerous==2.1.2
# via flask
@@ -148,13 +158,15 @@ multidict==6.0.5
# via
# aiohttp
# yarl
networkx==3.2.1
networkx==3.3
# via torch
numpy==1.26.4
# via
# ctranslate2
# dailyai (pyproject.toml)
# onnxruntime
# torchvision
# transformers
nvidia-cublas-cu12==12.1.3.1
# via
# nvidia-cudnn-cu12
@@ -188,7 +200,7 @@ nvidia-nvtx-cu12==12.1.105
# via torch
onnxruntime==1.17.1
# via faster-whisper
openai==1.14.2
openai==1.14.3
# via dailyai (pyproject.toml)
opentelemetry-api==1.24.0
# via
@@ -203,12 +215,14 @@ packaging==24.0
# fal
# huggingface-hub
# onnxruntime
# transformers
pathspec==0.11.2
# via fal
pillow==10.2.0
# via
# dailyai (pyproject.toml)
# fal
# torchvision
platformdirs==4.2.0
# via
# isolate
@@ -223,6 +237,8 @@ protobuf==4.25.3
# pyht
pyaudio==0.2.14
# via dailyai (pyproject.toml)
pycparser==2.22
# via cffi
pydantic==1.10.15
# via
# anthropic
@@ -231,9 +247,9 @@ pydantic==1.10.15
# openai
pygments==2.17.2
# via rich
pyht==0.0.26
pyht==0.0.27
# via dailyai (pyproject.toml)
pyjwt==2.8.0
pyjwt[crypto]==2.8.0
# via fal
python-dateutil==2.9.0.post0
# via fal
@@ -244,12 +260,25 @@ pyyaml==6.0.1
# ctranslate2
# huggingface-hub
# isolate
# timm
# transformers
regex==2023.12.25
# via transformers
requests==2.31.0
# via
# huggingface-hub
# pyht
# transformers
rich==13.7.1
# via
# fal
# rich-click
rich-click==1.7.4
# via fal
safetensors==0.4.2
# via
# timm
# transformers
six==1.16.0
# via python-dateutil
sniffio==1.3.1
@@ -268,20 +297,30 @@ sympy==1.12
# torch
tblib==3.0.0
# via isolate
timm==0.9.16
# via dailyai (pyproject.toml)
tokenizers==0.15.2
# via
# anthropic
# faster-whisper
torch==2.2.1
# transformers
torch==2.2.2
# via
# dailyai (pyproject.toml)
# timm
# torchaudio
torchaudio==2.2.1
# torchvision
torchaudio==2.2.2
# via dailyai (pyproject.toml)
torchvision==0.17.2
# via timm
tqdm==4.66.2
# via
# huggingface-hub
# openai
# transformers
transformers==4.39.3
# via dailyai (pyproject.toml)
triton==2.2.0
# via torch
types-python-dateutil==2.9.0.20240316
@@ -297,6 +336,7 @@ typing-extensions==4.10.0
# openai
# opentelemetry-sdk
# pydantic
# rich-click
# torch
urllib3==2.2.1
# via requests

View File

@@ -33,19 +33,24 @@ certifi==2024.2.2
# httpcore
# httpx
# requests
cffi==1.16.0
# via cryptography
charset-normalizer==3.3.2
# via requests
click==8.1.7
# via
# fal
# flask
# rich-click
colorama==0.4.6
# via fal
coloredlogs==15.0.1
# via onnxruntime
cryptography==42.0.5
# via pyjwt
ctranslate2==4.1.0
# via faster-whisper
daily-python==0.7.2
daily-python==0.7.3
# via dailyai (pyproject.toml)
deprecated==1.2.14
# via opentelemetry-api
@@ -57,21 +62,24 @@ distro==1.9.0
# via
# anthropic
# openai
einops==0.7.0
# via dailyai (pyproject.toml)
exceptiongroup==1.2.0
# via anyio
fal==0.12.3
fal==0.12.7
# via dailyai (pyproject.toml)
fastapi==0.99.1
# via fal
faster-whisper==1.0.1
# via dailyai (pyproject.toml)
filelock==3.13.3
filelock==3.13.4
# via
# huggingface-hub
# pyht
# torch
# transformers
# virtualenv
flask==3.0.2
flask==3.0.3
# via
# dailyai (pyproject.toml)
# flask-cors
@@ -108,7 +116,9 @@ httpx==0.27.0
huggingface-hub==0.22.2
# via
# faster-whisper
# timm
# tokenizers
# transformers
humanfriendly==10.0
# via coloredlogs
idna==3.6
@@ -123,7 +133,7 @@ isolate[build]==0.12.7
# via
# fal
# isolate-proto
isolate-proto==0.3.3
isolate-proto==0.3.4
# via fal
itsdangerous==2.1.2
# via flask
@@ -147,16 +157,18 @@ multidict==6.0.5
# via
# aiohttp
# yarl
networkx==3.2.1
networkx==3.3
# via torch
numpy==1.26.4
# via
# ctranslate2
# dailyai (pyproject.toml)
# onnxruntime
# torchvision
# transformers
onnxruntime==1.17.1
# via faster-whisper
openai==1.14.2
openai==1.14.3
# via dailyai (pyproject.toml)
opentelemetry-api==1.24.0
# via
@@ -171,12 +183,14 @@ packaging==24.0
# fal
# huggingface-hub
# onnxruntime
# transformers
pathspec==0.11.2
# via fal
pillow==10.2.0
# via
# dailyai (pyproject.toml)
# fal
# torchvision
platformdirs==4.2.0
# via
# isolate
@@ -191,6 +205,8 @@ protobuf==4.25.3
# pyht
pyaudio==0.2.14
# via dailyai (pyproject.toml)
pycparser==2.22
# via cffi
pydantic==1.10.15
# via
# anthropic
@@ -199,9 +215,9 @@ pydantic==1.10.15
# openai
pygments==2.17.2
# via rich
pyht==0.0.26
pyht==0.0.27
# via dailyai (pyproject.toml)
pyjwt==2.8.0
pyjwt[crypto]==2.8.0
# via fal
python-dateutil==2.9.0.post0
# via fal
@@ -212,12 +228,25 @@ pyyaml==6.0.1
# ctranslate2
# huggingface-hub
# isolate
# timm
# transformers
regex==2023.12.25
# via transformers
requests==2.31.0
# via
# huggingface-hub
# pyht
# transformers
rich==13.7.1
# via
# fal
# rich-click
rich-click==1.7.4
# via fal
safetensors==0.4.2
# via
# timm
# transformers
six==1.16.0
# via python-dateutil
sniffio==1.3.1
@@ -236,20 +265,30 @@ sympy==1.12
# torch
tblib==3.0.0
# via isolate
timm==0.9.16
# via dailyai (pyproject.toml)
tokenizers==0.15.2
# via
# anthropic
# faster-whisper
torch==2.2.1
# transformers
torch==2.2.2
# via
# dailyai (pyproject.toml)
# timm
# torchaudio
torchaudio==2.2.1
# torchvision
torchaudio==2.2.2
# via dailyai (pyproject.toml)
torchvision==0.17.2
# via timm
tqdm==4.66.2
# via
# huggingface-hub
# openai
# transformers
transformers==4.39.3
# via dailyai (pyproject.toml)
types-python-dateutil==2.9.0.20240316
# via fal
typing-extensions==4.10.0
@@ -263,6 +302,7 @@ typing-extensions==4.10.0
# openai
# opentelemetry-sdk
# pydantic
# rich-click
# torch
urllib3==2.2.1
# via requests

View File

@@ -20,10 +20,10 @@ classifiers = [
"Topic :: Scientific/Engineering :: Artificial Intelligence"
]
dependencies = [
"aiohttp==3.9.3",
"numpy==1.26.4",
"Pillow==10.2.0",
"typing-extensions==4.10.0",
"aiohttp~=3.9.0",
"numpy~=1.26.0",
"Pillow~=10.2.0",
"typing-extensions~=4.10.0",
]
[project.urls]
@@ -31,17 +31,18 @@ Source = "https://github.com/daily-co/dailyai"
Website = "https://daily.co"
[project.optional-dependencies]
anthropic = [ "anthropic==0.20.0" ]
azure = [ "azure-cognitiveservices-speech==1.36.0" ]
daily = [ "daily-python==0.7.2" ]
examples = [ "python-dotenv==1.0.1", "flask==3.0.2", "flask_cors==4.0.0" ]
fal = [ "fal==0.12.3" ]
local = [ "pyaudio==0.2.14" ]
openai = [ "openai==1.14.2" ]
playht = [ "pyht==0.0.26" ]
silero = [ "torch==2.2.1", "torchaudio==2.2.1" ]
websocket = [ "websockets==12.0" ]
whisper = [ "faster_whisper==1.0.1" ]
anthropic = [ "anthropic~=0.20.0" ]
azure = [ "azure-cognitiveservices-speech~=1.36.0" ]
daily = [ "daily-python~=0.7.0" ]
examples = [ "python-dotenv~=1.0.0", "flask~=3.0.0", "flask_cors~=4.0.0" ]
fal = [ "fal~=0.12.0" ]
local = [ "pyaudio~=0.2.0" ]
moondream = [ "einops~=0.7.0", "timm~=0.9.0", "transformers~=4.39.0" ]
openai = [ "openai~=1.14.0" ]
playht = [ "pyht~=0.0.26" ]
silero = [ "torch~=2.2.0", "torchaudio~=2.2.0" ]
websocket = [ "websockets~=12.0" ]
whisper = [ "faster_whisper~=1.0.0" ]
[tool.setuptools.packages.find]
# All the following settings are optional:
@@ -51,4 +52,4 @@ where = ["src"]
pythonpath = ["src"]
[tool.setuptools_scm]
# Empty
local_scheme = "no-local-version"

View File

@@ -7,6 +7,7 @@ from dailyai.pipeline.frames import (
EndFrame,
EndPipeFrame,
Frame,
ImageFrame,
LLMMessagesFrame,
LLMResponseEndFrame,
LLMResponseStartFrame,
@@ -14,6 +15,7 @@ from dailyai.pipeline.frames import (
TranscriptionFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame,
VisionImageFrame,
)
from dailyai.pipeline.pipeline import Pipeline
from dailyai.services.ai_services import AIService
@@ -22,6 +24,79 @@ from typing import AsyncGenerator, Coroutine, List
class ResponseAggregator(FrameProcessor):
"""This frame processor aggregates frames between a start and an end frame
into complete text frame sentences.
For example, frame input/output:
UserStartedSpeakingFrame() -> None
TranscriptionFrame("Hello,") -> None
TranscriptionFrame(" world.") -> None
UserStoppedSpeakingFrame() -> TextFrame("Hello world.")
Doctest:
>>> async def print_frames(aggregator, frame):
... async for frame in aggregator.process_frame(frame):
... if isinstance(frame, TextFrame):
... print(frame.text)
>>> aggregator = ResponseAggregator(start_frame = UserStartedSpeakingFrame,
... end_frame=UserStoppedSpeakingFrame,
... accumulator_frame=TranscriptionFrame,
... pass_through=False)
>>> asyncio.run(print_frames(aggregator, UserStartedSpeakingFrame()))
>>> asyncio.run(print_frames(aggregator, TranscriptionFrame("Hello,", 1, 1)))
>>> asyncio.run(print_frames(aggregator, TranscriptionFrame("world.", 1, 2)))
>>> asyncio.run(print_frames(aggregator, UserStoppedSpeakingFrame()))
Hello, world.
"""
def __init__(
self,
*,
start_frame,
end_frame,
accumulator_frame,
pass_through=True,
):
self.aggregation = ""
self.aggregating = False
self._start_frame = start_frame
self._end_frame = end_frame
self._accumulator_frame = accumulator_frame
self._pass_through = pass_through
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, self._start_frame):
self.aggregating = True
elif isinstance(frame, self._end_frame):
self.aggregating = False
# Sometimes VAD triggers quickly on and off. If we don't get any transcription,
# it creates empty LLM message queue frames
if len(self.aggregation) > 0:
output = self.aggregation
self.aggregation = ""
yield self._end_frame()
yield TextFrame(output.strip())
elif isinstance(frame, self._accumulator_frame) and self.aggregating:
self.aggregation += f" {frame.text}"
if self._pass_through:
yield frame
else:
yield frame
class UserResponseAggregator(ResponseAggregator):
def __init__(self):
super().__init__(
start_frame=UserStartedSpeakingFrame,
end_frame=UserStoppedSpeakingFrame,
accumulator_frame=TranscriptionFrame,
pass_through=False,
)
class LLMResponseAggregator(FrameProcessor):
def __init__(
self,
@@ -66,7 +141,7 @@ class ResponseAggregator(FrameProcessor):
yield frame
class LLMResponseAggregator(ResponseAggregator):
class LLMAssistantResponseAggregator(LLMResponseAggregator):
def __init__(self, messages: list[dict]):
super().__init__(
messages=messages,
@@ -77,7 +152,7 @@ class LLMResponseAggregator(ResponseAggregator):
)
class UserResponseAggregator(ResponseAggregator):
class LLMUserResponseAggregator(LLMResponseAggregator):
def __init__(self, messages: list[dict]):
super().__init__(
messages=messages,
@@ -360,7 +435,7 @@ class GatedAggregator(FrameProcessor):
... start_open=False)
>>> asyncio.run(print_frames(aggregator, TextFrame("Hello")))
>>> asyncio.run(print_frames(aggregator, TextFrame("Hello again.")))
>>> asyncio.run(print_frames(aggregator, ImageFrame(url='', image=bytes([]))))
>>> asyncio.run(print_frames(aggregator, ImageFrame(image=bytes([]), size=(0, 0))))
ImageFrame
Hello
Hello again.
@@ -390,3 +465,37 @@ class GatedAggregator(FrameProcessor):
self.accumulator = []
else:
self.accumulator.append(frame)
class VisionImageFrameAggregator(FrameProcessor):
"""This aggregator waits for a consecutive TextFrame and an
ImageFrame. After the ImageFrame arrives it will output a VisionImageFrame.
>>> from dailyai.pipeline.frames import ImageFrame
>>> async def print_frames(aggregator, frame):
... async for frame in aggregator.process_frame(frame):
... print(frame)
>>> aggregator = VisionImageFrameAggregator()
>>> asyncio.run(print_frames(aggregator, TextFrame("What do you see?")))
>>> asyncio.run(print_frames(aggregator, ImageFrame(image=bytes([]), size=(0, 0))))
VisionImageFrame, text: What do you see?, image size: 0x0, buffer size: 0 B
"""
def __init__(self, **kwargs):
super().__init__(**kwargs)
self._describe_text = None
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, TextFrame):
self._describe_text = frame.text
elif isinstance(frame, ImageFrame):
if self._describe_text:
yield VisionImageFrame(self._describe_text, frame.image, frame.size)
self._describe_text = None
else:
yield frame
else:
yield frame

View File

@@ -70,11 +70,66 @@ class AudioFrame(Frame):
class ImageFrame(Frame):
"""An image. Will be shown by the transport if the transport's camera is
enabled."""
url: str | None
image: bytes
size: tuple[int, int]
def __str__(self):
return f"{self.__class__.__name__}, url: {self.url}, image size: {len(self.image)} B"
return f"{self.__class__.__name__}, image size: {self.size[0]}x{self.size[1]} buffer size: {len(self.image)} B"
@dataclass()
class URLImageFrame(ImageFrame):
"""An image with an associated URL. Will be shown by the transport if the
transport's camera is enabled.
"""
url: str | None
def __init__(self, url, image, size):
super().__init__(image, size)
self.url = url
def __str__(self):
return f"{self.__class__.__name__}, url: {self.url}, image size: {self.size[0]}x{self.size[1]}, buffer size: {len(self.image)} B"
@dataclass()
class VisionImageFrame(ImageFrame):
"""An image with an associated text to ask for a description of it. Will be shown by the
transport if the transport's camera is enabled.
"""
text: str | None
def __init__(self, text, image, size):
super().__init__(image, size)
self.text = text
def __str__(self):
return f"{self.__class__.__name__}, text: {self.text}, image size: {self.size[0]}x{self.size[1]}, buffer size: {len(self.image)} B"
@dataclass()
class UserImageFrame(ImageFrame):
"""An image associated to a user. Will be shown by the transport if the transport's camera is
enabled."""
user_id: str
def __init__(self, user_id, image, size):
super().__init__(image, size)
self.user_id = user_id
def __str__(self):
return f"{self.__class__.__name__}, user: {self.user_id}, image size: {self.size[0]}x{self.size[1]}, buffer size: {len(self.image)} B"
@dataclass()
class UserImageRequestFrame(Frame):
"""A frame user to request an image from the given user."""
user_id: str
def __str__(self):
return f"{self.__class__.__name__}, user: {self.user_id}"
@dataclass()
@@ -144,10 +199,10 @@ class ReceivedAppMessageFrame(Frame):
@dataclass()
class SendAppMessageFrame(Frame):
message: Any
participantId: str | None
participant_id: str | None
def __str__(self):
return f"SendAppMessageFrame: participantId: {self.participantId}, message: {self.message}"
return f"SendAppMessageFrame: participant: {self.participant_id}, message: {self.message}"
class UserStartedSpeakingFrame(Frame):

View File

@@ -14,6 +14,8 @@ from dailyai.pipeline.frames import (
TTSStartFrame,
TextFrame,
TranscriptionFrame,
URLImageFrame,
VisionImageFrame,
)
from abc import abstractmethod
@@ -81,13 +83,12 @@ class TTSService(AIService):
class ImageGenService(AIService):
def __init__(self, image_size, **kwargs):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.image_size = image_size
# Renders the image. Returns an Image object.
@abstractmethod
async def run_image_gen(self, sentence: str) -> tuple[str, bytes]:
async def run_image_gen(self, prompt: str) -> tuple[str, bytes, tuple[int, int]]:
pass
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
@@ -95,8 +96,27 @@ class ImageGenService(AIService):
yield frame
return
(url, image_data) = await self.run_image_gen(frame.text)
yield ImageFrame(url, image_data)
(url, image_data, image_size) = await self.run_image_gen(frame.text)
yield URLImageFrame(url, image_data, image_size)
class VisionService(AIService):
"""VisionService is a base class for vision services."""
def __init__(self, **kwargs):
super().__init__(**kwargs)
self._describe_text = None
@abstractmethod
async def run_vision(self, frame: VisionImageFrame) -> str:
pass
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, VisionImageFrame):
description = await self.run_vision(frame)
yield TextFrame(description)
else:
yield frame
class STTService(AIService):

View File

@@ -19,7 +19,7 @@ try:
except ModuleNotFoundError as e:
print(f"Exception: {e}")
print(
"In order to use Azure TTS, you need to `pip install dailyai[azure]`. Also, set `SPEECH_KEY` and `SPEECH_REGION` environment variables.")
"In order to use Azure TTS, you need to `pip install dailyai[azure]`. Also, set `AZURE_SPEECH_API_KEY` and `AZURE_SPEECH_REGION` environment variables.")
raise Exception(f"Missing module: {e}")
from dailyai.services.openai_api_llm_service import BaseOpenAILLMService
@@ -97,23 +97,24 @@ class AzureImageGenServiceREST(ImageGenService):
endpoint,
model,
):
super().__init__(image_size=image_size)
super().__init__()
self._api_key = api_key
self._azure_endpoint = endpoint
self._api_version = api_version
self._model = model
self._aiohttp_session = aiohttp_session
self._image_size = image_size
async def run_image_gen(self, sentence) -> tuple[str, bytes]:
async def run_image_gen(self, prompt: str) -> tuple[str, bytes, tuple[int, int]]:
url = f"{self._azure_endpoint}openai/images/generations:submit?api-version={self._api_version}"
headers = {
"api-key": self._api_key,
"Content-Type": "application/json"}
body = {
# Enter your prompt text here
"prompt": sentence,
"size": self.image_size,
"prompt": prompt,
"size": self._image_size,
"n": 1,
}
async with self._aiohttp_session.post(
@@ -146,4 +147,4 @@ class AzureImageGenServiceREST(ImageGenService):
async with self._aiohttp_session.get(image_url) as response:
image_stream = io.BytesIO(await response.content.read())
image = Image.open(image_stream)
return (image_url, image.tobytes())
return (image_url, image.tobytes(), image.size)

View File

@@ -3,6 +3,8 @@ import asyncio
import io
import os
from PIL import Image
from pydantic import BaseModel
from typing import Optional, Union, Dict
from dailyai.services.ai_services import ImageGenService
@@ -16,30 +18,44 @@ except ModuleNotFoundError as e:
class FalImageGenService(ImageGenService):
class InputParams(BaseModel):
seed: Optional[int] = None
num_inference_steps: int = 4
num_images: int = 1
image_size: Union[str, Dict[str, int]] = "square_hd"
expand_prompt: bool = False
enable_safety_checker: bool = True
format: str = "png"
def __init__(
self,
*,
image_size,
aiohttp_session: aiohttp.ClientSession,
params: InputParams,
model="fal-ai/fast-sdxl",
key_id=None,
key_secret=None
):
super().__init__(image_size)
super().__init__()
self._model = model
self._params = params
self._aiohttp_session = aiohttp_session
if key_id:
os.environ["FAL_KEY_ID"] = key_id
if key_secret:
os.environ["FAL_KEY_SECRET"] = key_secret
async def run_image_gen(self, sentence) -> tuple[str, bytes]:
def get_image_url(sentence, size):
handler = fal.apps.submit(
"110602490-fast-sdxl",
# "fal-ai/fast-sdxl",
arguments={"prompt": sentence},
async def run_image_gen(self, prompt: str) -> tuple[str, bytes, tuple[int, int]]:
def get_image_url(prompt):
handler = fal.apps.submit( # type: ignore
self._model,
arguments={
"prompt": prompt,
**self._params.dict(),
},
)
for event in handler.iter_events():
if isinstance(event, fal.apps.InProgress):
if isinstance(event, fal.apps.InProgress): # type: ignore
pass
result = handler.get()
@@ -50,9 +66,10 @@ class FalImageGenService(ImageGenService):
return image_url
image_url = await asyncio.to_thread(get_image_url, sentence, self.image_size)
image_url = await asyncio.to_thread(get_image_url, prompt)
# Load the image from the url
async with self._aiohttp_session.get(image_url) as response:
image_stream = io.BytesIO(await response.content.read())
image = Image.open(image_stream)
return (image_url, image.tobytes())
return (image_url, image.tobytes(), image.size)

View File

@@ -0,0 +1,52 @@
from dailyai.pipeline.frames import ImageFrame, VisionImageFrame
from dailyai.services.ai_services import VisionService
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
def detect_device():
"""
Detects the appropriate device to run on, and return the device and dtype.
"""
if torch.cuda.is_available():
return torch.device("cuda"), torch.float16
elif torch.backends.mps.is_available():
return torch.device("mps"), torch.float16
else:
return torch.device("cpu"), torch.float32
class MoondreamService(VisionService):
def __init__(
self,
model_id="vikhyatk/moondream2",
revision="2024-04-02",
use_cpu=False
):
super().__init__()
if not use_cpu:
device, dtype = detect_device()
else:
device = torch.device("cpu")
dtype = torch.float32
self._tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
self._model = AutoModelForCausalLM.from_pretrained(
model_id, trust_remote_code=True, revision=revision
).to(device=device, dtype=dtype)
self._model.eval()
async def run_vision(self, frame: VisionImageFrame) -> str:
image = Image.frombytes("RGB", (frame.size[0], frame.size[1]), frame.image)
image_embeds = self._model.encode_image(image)
description = self._model.answer_question(
image_embeds=image_embeds,
question=frame.text,
tokenizer=self._tokenizer)
return description

View File

@@ -1,3 +1,4 @@
from typing import Literal
import aiohttp
from PIL import Image
import io
@@ -26,24 +27,25 @@ class OpenAIImageGenService(ImageGenService):
def __init__(
self,
*,
image_size: str,
image_size: Literal["256x256", "512x512", "1024x1024", "1792x1024", "1024x1792"],
aiohttp_session: aiohttp.ClientSession,
api_key,
model="dall-e-3",
):
super().__init__(image_size=image_size)
super().__init__()
self._model = model
self._image_size = image_size
self._client = AsyncOpenAI(api_key=api_key)
self._aiohttp_session = aiohttp_session
async def run_image_gen(self, sentence) -> tuple[str, bytes]:
self.logger.info("Generating OpenAI image", sentence)
async def run_image_gen(self, prompt: str) -> tuple[str, bytes, tuple[int, int]]:
self.logger.info("Generating OpenAI image", prompt)
image = await self._client.images.generate(
prompt=sentence,
prompt=prompt,
model=self._model,
n=1,
size=self.image_size
size=self._image_size
)
image_url = image.data[0].url
if not image_url:
@@ -53,4 +55,4 @@ class OpenAIImageGenService(ImageGenService):
async with self._aiohttp_session.get(image_url) as response:
image_stream = io.BytesIO(await response.content.read())
image = Image.open(image_stream)
return (image_url, image.tobytes())
return (image_url, image.tobytes(), image.size)

View File

@@ -19,7 +19,7 @@ class MockAIService(AIService):
image_stream = io.BytesIO(response.content)
image = Image.open(image_stream)
time.sleep(1)
return (image_url, image)
return (image_url, image.tobytes(), image.size)
def run_llm(self, messages, latest_user_message=None, stream=True):
for i in range(5):

View File

@@ -21,9 +21,10 @@ class AbstractTransport:
self._camera_enabled = kwargs.get("camera_enabled") or False
self._camera_width = kwargs.get("camera_width") or 1024
self._camera_height = kwargs.get("camera_height") or 768
self._camera_bitrate = kwargs.get("camera_bitrate") or 250000
self._camera_framerate = kwargs.get("camera_framerate") or 10
self._speaker_enabled = kwargs.get("speaker_enabled") or False
self._speaker_sample_rate = kwargs.get("speaker_sample_rate") or 16000
self._fps = kwargs.get("fps") or 8
self._logger: logging.Logger = logging.getLogger("dailyai.transport")

View File

@@ -2,6 +2,7 @@ import asyncio
import inspect
import logging
import signal
import time
import threading
import types
@@ -11,6 +12,7 @@ from typing import Any
from dailyai.pipeline.frames import (
ReceivedAppMessageFrame,
TranscriptionFrame,
UserImageFrame,
)
from threading import Event
@@ -58,6 +60,7 @@ class DailyTransport(ThreadedTransport, EventHandler):
bot_name: str,
min_others_count: int = 1,
start_transcription: bool = False,
video_rendering_enabled: bool = False,
**kwargs,
):
kwargs['has_webrtc_vad'] = True
@@ -69,6 +72,7 @@ class DailyTransport(ThreadedTransport, EventHandler):
self._token: str | None = token
self._min_others_count = min_others_count
self._start_transcription = start_transcription
self._video_rendering_enabled = video_rendering_enabled
self._is_interrupted = Event()
self._stop_threads = Event()
@@ -76,6 +80,8 @@ class DailyTransport(ThreadedTransport, EventHandler):
self._other_participant_has_joined = False
self._my_participant_id = None
self._video_renderers = {}
self.transcription_settings = {
"language": "en",
"tier": "nova",
@@ -156,16 +162,24 @@ class DailyTransport(ThreadedTransport, EventHandler):
return decorator
def write_frame_to_camera(self, frame: bytes):
self.camera.write_frame(frame)
if self._camera_enabled:
self.camera.write_frame(frame)
def write_frame_to_mic(self, frame: bytes):
self.mic.write_frames(frame)
if self._mic_enabled:
self.mic.write_frames(frame)
def send_app_message(self, message: Any, participantId: str | None):
self.client.send_app_message(message, participantId)
def request_participant_image(self, participant_id: str):
if participant_id in self._video_renderers:
self._video_renderers[participant_id]["render_next_frame"] = True
def send_app_message(self, message: Any, participant_id: str | None):
self.client.send_app_message(message, participant_id)
def read_audio_frames(self, desired_frame_count):
bytes = self._speaker.read_frames(desired_frame_count)
bytes = b""
if self._speaker_enabled or self._vad_enabled:
bytes = self._speaker.read_frames(desired_frame_count)
return bytes
def _prerun(self):
@@ -222,9 +236,9 @@ class DailyTransport(ThreadedTransport, EventHandler):
"maxQuality": "low",
"encodings": {
"low": {
"maxBitrate": 250000,
"maxBitrate": self._camera_bitrate,
"scaleResolutionDownBy": 1.333,
"maxFramerate": 8,
"maxFramerate": self._camera_framerate,
}
},
}
@@ -234,9 +248,12 @@ class DailyTransport(ThreadedTransport, EventHandler):
)
self._my_participant_id = self.client.participants()["local"]["id"]
# For performance reasons, never subscribe to video streams (unless a
# video renderer is registered).
self.client.update_subscription_profiles({
"base": {
"camera": "unsubscribed",
"screenVideo": "unsubscribed"
}
})
@@ -255,7 +272,7 @@ class DailyTransport(ThreadedTransport, EventHandler):
self.client.leave()
self.client.release()
def on_first_other_participant_joined(self):
def on_first_other_participant_joined(self, participant):
pass
def call_joined(self, join_data, client_error):
@@ -268,6 +285,59 @@ class DailyTransport(ThreadedTransport, EventHandler):
def start_recording(self):
self.client.start_recording()
def render_participant_video(self,
participant_id,
framerate=10,
video_source="camera",
color_format="RGB") -> None:
if not self._video_rendering_enabled:
self._logger.warn("Video rendering is not enabled")
return
# Only enable camera subscription on this participant
self.client.update_subscriptions(participant_settings={
participant_id: {
"media": {
video_source: "subscribed"
}
}
})
self._video_renderers[participant_id] = {
"framerate": framerate,
"timestamp": 0,
"render_next_frame": False,
}
self.client.set_video_renderer(
participant_id,
self.on_participant_video_frame,
video_source=video_source,
color_format=color_format)
def on_participant_video_frame(self, participant_id, video_frame):
if not self._loop:
return
render_frame = False
curr_time = time.time()
framerate = self._video_renderers[participant_id]["framerate"]
if framerate > 0:
prev_time = self._video_renderers[participant_id]["timestamp"]
next_time = prev_time + 1 / framerate
render_frame = curr_time > next_time
elif self._video_renderers[participant_id]["render_next_frame"]:
self._video_renderers[participant_id]["render_next_frame"] = False
render_frame = True
if render_frame:
frame = UserImageFrame(participant_id, video_frame.buffer,
(video_frame.width, video_frame.height))
asyncio.run_coroutine_threadsafe(self.receive_queue.put(frame), self._loop)
self._video_renderers[participant_id]["timestamp"] = curr_time
def on_error(self, error):
self._logger.error(f"on_error: {error}")
@@ -277,7 +347,7 @@ class DailyTransport(ThreadedTransport, EventHandler):
def on_participant_joined(self, participant):
if not self._other_participant_has_joined and participant["id"] != self._my_participant_id:
self._other_participant_has_joined = True
self.on_first_other_participant_joined()
self.on_first_other_participant_joined(participant)
def on_participant_left(self, participant, reason):
if len(self.client.participants()) < self._min_others_count + 1:
@@ -286,7 +356,6 @@ class DailyTransport(ThreadedTransport, EventHandler):
def on_app_message(self, message: Any, sender: str):
if self._loop:
frame = ReceivedAppMessageFrame(message, sender)
print(frame)
asyncio.run_coroutine_threadsafe(
self.receive_queue.put(frame), self._loop
)

View File

@@ -19,6 +19,7 @@ class LocalTransport(ThreadedTransport):
self._sample_width = kwargs.get("sample_width") or 2
self._n_channels = kwargs.get("n_channels") or 1
self._tk_root = kwargs.get("tk_root") or None
self._pyaudio = None
if self._camera_enabled and not self._tk_root:
raise ValueError(
@@ -51,7 +52,7 @@ class LocalTransport(ThreadedTransport):
if self._mic_enabled:
self._audio_stream.write(frame)
def read_frames(self, desired_frame_count):
def read_audio_frames(self, desired_frame_count):
bytes = b""
if self._speaker_enabled:
bytes = self._speaker_stream.read(
@@ -62,7 +63,8 @@ class LocalTransport(ThreadedTransport):
def _prerun(self):
if self._mic_enabled:
self._pyaudio = pyaudio.PyAudio()
if not self._pyaudio:
self._pyaudio = pyaudio.PyAudio()
self._audio_stream = self._pyaudio.open(
format=self._pyaudio.get_format_from_width(self._sample_width),
channels=self._n_channels,
@@ -84,6 +86,8 @@ class LocalTransport(ThreadedTransport):
self._image_label.pack()
if self._speaker_enabled:
if not self._pyaudio:
self._pyaudio = pyaudio.PyAudio()
self._speaker_stream = self._pyaudio.open(
format=self._pyaudio.get_format_from_width(self._sample_width),
channels=self._n_channels,

View File

@@ -20,6 +20,7 @@ from dailyai.pipeline.frames import (
SpriteFrame,
StartFrame,
TextFrame,
UserImageRequestFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame,
)
@@ -382,7 +383,11 @@ class ThreadedTransport(AbstractTransport):
def _set_images(self, images: list[bytes], start_frame=0):
self._images = itertools.cycle(images)
def send_app_message(self, message: Any, participantId: str | None):
def request_participant_image(self, participant_id: str):
""" Child classes should override this to force an image from a user. """
pass
def send_app_message(self, message: Any, participant_id: str | None):
""" Child classes should override this to send a custom message to the room. """
pass
@@ -393,7 +398,7 @@ class ThreadedTransport(AbstractTransport):
this_frame = next(self._images)
self.write_frame_to_camera(this_frame)
time.sleep(1.0 / self._fps)
time.sleep(1.0 / self._camera_framerate)
except Exception as e:
self._logger.error(f"Exception {e} in camera thread.")
raise e
@@ -458,9 +463,10 @@ class ThreadedTransport(AbstractTransport):
self._set_image(frame.image)
elif isinstance(frame, SpriteFrame):
self._set_images(frame.images)
elif isinstance(frame, UserImageRequestFrame):
self.request_participant_image(frame.user_id)
elif isinstance(frame, SendAppMessageFrame):
self.send_app_message(
frame.message, frame.participantId)
self.send_app_message(frame.message, frame.participant_id)
elif len(b):
self.write_frame_to_mic(bytes(b))
b = bytearray()

View File

@@ -54,13 +54,13 @@ class TestDailyFrameAggregators(unittest.IsolatedAsyncioTestCase):
TextFrame("Hello, "),
TextFrame("world."),
AudioFrame(b"hello"),
ImageFrame("image", b"image"),
ImageFrame(b"image", (0, 0)),
AudioFrame(b"world"),
LLMResponseEndFrame(),
]
expected_output_frames = [
ImageFrame("image", b"image"),
ImageFrame(b"image", (0, 0)),
LLMResponseStartFrame(),
TextFrame("Hello, "),
TextFrame("world."),

View File

@@ -11,11 +11,11 @@ class TestDailyTransport(unittest.IsolatedAsyncioTestCase):
was_called = False
@transport.event_handler("on_first_other_participant_joined")
def test_event_handler(transport):
def test_event_handler(transport, participant):
nonlocal was_called
was_called = True
transport.on_first_other_participant_joined()
transport.on_first_other_participant_joined({"id": "user-id"})
self.assertTrue(was_called)
@@ -29,7 +29,7 @@ class TestDailyTransport(unittest.IsolatedAsyncioTestCase):
event = asyncio.Event()
@transport.event_handler("on_first_other_participant_joined")
async def test_event_handler(transport):
async def test_event_handler(transport, participant):
nonlocal event
print("sleeping")
await asyncio.sleep(0.1)
@@ -68,7 +68,7 @@ class TestDailyTransport(unittest.IsolatedAsyncioTestCase):
await transport.send_queue.put(AudioFrame(bytes([0] * 3300)))
async def send_video_frame():
await transport.send_queue.put(ImageFrame(None, b"test"))
await transport.send_queue.put(ImageFrame(b"test", (0, 0)))
await asyncio.gather(transport.run(), send_audio_frame(), send_video_frame())