Compare commits
52 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
db05a9b29b | ||
|
|
130e418800 | ||
|
|
1a0a66e503 | ||
|
|
e22babbae2 | ||
|
|
bfe2e0f36e | ||
|
|
26d401e5de | ||
|
|
3c20f9153d | ||
|
|
2f9899af5a | ||
|
|
5ef5cf30f4 | ||
|
|
34a6c5691b | ||
|
|
18bf09c704 | ||
|
|
84cfa7cc95 | ||
|
|
a5eba0106b | ||
|
|
b117a185e3 | ||
|
|
0219230827 | ||
|
|
9fcbb36997 | ||
|
|
0bf15fd6eb | ||
|
|
989252bb52 | ||
|
|
7b44a79a5b | ||
|
|
4bd29b0080 | ||
|
|
ebb76fdae9 | ||
|
|
5d52def0fe | ||
|
|
9ada56d0b0 | ||
|
|
8d73cdb2ee | ||
|
|
4f04b10202 | ||
|
|
97b923e37e | ||
|
|
57aabea0a3 | ||
|
|
319b8e7816 | ||
|
|
96950ca6df | ||
|
|
d7b2e67c35 | ||
|
|
53930b47a5 | ||
|
|
86c8ab02cc | ||
|
|
b678097f6d | ||
|
|
eb455043c4 | ||
|
|
dd696be04c | ||
|
|
96b2337183 | ||
|
|
ea52e73f57 | ||
|
|
88404e4739 | ||
|
|
0fd323714e | ||
|
|
a362ca4d3d | ||
|
|
02b5c3dd5f | ||
|
|
497a09cbc8 | ||
|
|
172a14245d | ||
|
|
302246399b | ||
|
|
9590cc2fbc | ||
|
|
09e4044c72 | ||
|
|
efdfb74dc3 | ||
|
|
158de6f20b | ||
|
|
47f68b742d | ||
|
|
2654ca1f62 | ||
|
|
4263827ee8 | ||
|
|
97fe529b0e |
22
.github/workflows/publish.yaml
vendored
22
.github/workflows/publish.yaml
vendored
@@ -60,3 +60,25 @@ jobs:
|
||||
with:
|
||||
verbose: true
|
||||
print-hash: true
|
||||
|
||||
publish-to-test-pypi:
|
||||
name: "Publish to Test PyPI"
|
||||
runs-on: ubuntu-latest
|
||||
needs: [ build ]
|
||||
environment:
|
||||
name: testpypi
|
||||
url: https://pypi.org/p/dailyai
|
||||
permissions:
|
||||
id-token: write
|
||||
steps:
|
||||
- name: Download wheels
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: wheels
|
||||
path: ./dist
|
||||
- name: Publish to PyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
verbose: true
|
||||
print-hash: true
|
||||
repository-url: https://test.pypi.org/legacy/
|
||||
|
||||
8
.github/workflows/publish_test.yaml
vendored
8
.github/workflows/publish_test.yaml
vendored
@@ -15,6 +15,8 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event.inputs.gitref }}
|
||||
fetch-tags: true
|
||||
fetch-depth: 100
|
||||
- name: Set up Python
|
||||
id: setup_python
|
||||
uses: actions/setup-python@v4
|
||||
@@ -35,14 +37,15 @@ jobs:
|
||||
- name: Upload wheels
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: wheels
|
||||
path: ./dist
|
||||
|
||||
publish-to-pypi:
|
||||
name: "Test publish to PyPI"
|
||||
name: "Publish to Test PyPI"
|
||||
runs-on: ubuntu-latest
|
||||
needs: [ build ]
|
||||
environment:
|
||||
name: pypi
|
||||
name: testpypi
|
||||
url: https://pypi.org/p/dailyai
|
||||
permissions:
|
||||
id-token: write
|
||||
@@ -50,6 +53,7 @@ jobs:
|
||||
- name: Download wheels
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: wheels
|
||||
path: ./dist
|
||||
- name: Publish to PyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -3,6 +3,7 @@ env/
|
||||
__pycache__/
|
||||
*~
|
||||
venv
|
||||
.venv
|
||||
#*#
|
||||
|
||||
# Distribution / packaging
|
||||
|
||||
@@ -39,6 +39,8 @@ Currently implemented services:
|
||||
- Transport
|
||||
- Daily
|
||||
- Local (in progress, intended as a quick start example service)
|
||||
- Vision
|
||||
- Moondream
|
||||
|
||||
If you'd like to [implement a service]((https://github.com/daily-co/daily-ai-sdk/tree/main/src/dailyai/services)), we welcome PRs! Our goal is to support lots of services in all of the above categories, plus new categories (like real-time video) as they emerge.
|
||||
|
||||
@@ -58,12 +60,12 @@ By default, in order to minimize dependencies, only the basic framework function
|
||||
dependencies that you can install with:
|
||||
|
||||
```
|
||||
pip install dailyai[option,...]
|
||||
pip install "dailyai[option,...]"
|
||||
```
|
||||
|
||||
Your project may or may not need these, so they're made available as optional requirements. Here is a list:
|
||||
|
||||
- **AI services**: `anthropic`, `azure`, `fal`, `openai`, `playht`, `silero`, `whisper`
|
||||
- **AI services**: `anthropic`, `azure`, `fal`, `moondream`, `openai`, `playht`, `silero`, `whisper`
|
||||
- **Transports**: `daily`, `local`, `websocket`
|
||||
|
||||
## Code examples
|
||||
|
||||
@@ -2,8 +2,16 @@
|
||||
ANTHROPIC_API_KEY=...
|
||||
|
||||
# Azure
|
||||
SPEECH_KEY=...
|
||||
SPEECH_REGION=...
|
||||
AZURE_SPEECH_REGION=...
|
||||
AZURE_SPEECH_API_KEY=...
|
||||
|
||||
AZURE_CHATGPT_API_KEY=...
|
||||
AZURE_CHATGPT_ENDPOINT=https://...
|
||||
AZURE_CHATGPT_MODEL=...
|
||||
|
||||
AZURE_DALLE_API_KEY=...
|
||||
AZURE_DALLE_ENDPOINT=https://...
|
||||
AZURE_DALLE_MODEL=...
|
||||
|
||||
# Daily
|
||||
DAILY_API_KEY=...
|
||||
|
||||
@@ -48,7 +48,7 @@ async def main(room_url):
|
||||
pipeline = Pipeline([llm, tts])
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
async def on_first_other_participant_joined(transport, participant):
|
||||
await pipeline.queue_frames([LLMMessagesFrame(messages), EndFrame()])
|
||||
|
||||
await transport.run(pipeline)
|
||||
|
||||
@@ -31,7 +31,9 @@ async def main(room_url):
|
||||
)
|
||||
|
||||
imagegen = FalImageGenService(
|
||||
image_size="square_hd",
|
||||
params=FalImageGenService.InputParams(
|
||||
image_size="square_hd"
|
||||
),
|
||||
aiohttp_session=session,
|
||||
key_id=os.getenv("FAL_KEY_ID"),
|
||||
key_secret=os.getenv("FAL_KEY_SECRET"),
|
||||
@@ -40,7 +42,7 @@ async def main(room_url):
|
||||
pipeline = Pipeline([imagegen])
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
async def on_first_other_participant_joined(transport, participant):
|
||||
# Note that we do not put an EndFrame() item in the pipeline for this demo.
|
||||
# This means that the bot will stay in the channel until it times out.
|
||||
# An EndFrame() in the pipeline would cause the transport to shut
|
||||
|
||||
@@ -5,7 +5,7 @@ import os
|
||||
|
||||
import tkinter as tk
|
||||
|
||||
from dailyai.pipeline.frames import TextFrame, EndFrame
|
||||
from dailyai.pipeline.frames import TextFrame
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
from dailyai.services.fal_ai_services import FalImageGenService
|
||||
from dailyai.transports.local_transport import LocalTransport
|
||||
@@ -35,7 +35,9 @@ async def main():
|
||||
)
|
||||
|
||||
imagegen = FalImageGenService(
|
||||
image_size="square_hd",
|
||||
params=FalImageGenService.InputParams(
|
||||
image_size="square_hd"
|
||||
),
|
||||
aiohttp_session=session,
|
||||
key_id=os.getenv("FAL_KEY_ID"),
|
||||
key_secret=os.getenv("FAL_KEY_SECRET"),
|
||||
|
||||
@@ -85,7 +85,9 @@ async def main(room_url):
|
||||
model="gpt-4-turbo-preview")
|
||||
|
||||
imagegen = FalImageGenService(
|
||||
image_size="square_hd",
|
||||
params=FalImageGenService.InputParams(
|
||||
image_size="square_hd"
|
||||
),
|
||||
aiohttp_session=session,
|
||||
key_id=os.getenv("FAL_KEY_ID"),
|
||||
key_secret=os.getenv("FAL_KEY_SECRET"),
|
||||
|
||||
@@ -3,8 +3,9 @@ import asyncio
|
||||
import logging
|
||||
import tkinter as tk
|
||||
import os
|
||||
from dailyai.pipeline.aggregators import LLMFullResponseAggregator
|
||||
|
||||
from dailyai.pipeline.frames import AudioFrame, ImageFrame
|
||||
from dailyai.pipeline.frames import AudioFrame, URLImageFrame, LLMMessagesFrame, TextFrame
|
||||
from dailyai.services.open_ai_services import OpenAILLMService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.services.fal_ai_services import FalImageGenService
|
||||
@@ -22,7 +23,7 @@ async def main():
|
||||
async with aiohttp.ClientSession() as session:
|
||||
meeting_duration_minutes = 5
|
||||
tk_root = tk.Tk()
|
||||
tk_root.title("Calendar")
|
||||
tk_root.title("dailyai")
|
||||
|
||||
transport = LocalTransport(
|
||||
mic_enabled=True,
|
||||
@@ -43,8 +44,10 @@ async def main():
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4-turbo-preview")
|
||||
|
||||
dalle = FalImageGenService(
|
||||
image_size="1024x1024",
|
||||
imagegen = FalImageGenService(
|
||||
params=FalImageGenService.InputParams(
|
||||
image_size="1024x1024"
|
||||
),
|
||||
aiohttp_session=session,
|
||||
key_id=os.getenv("FAL_KEY_ID"),
|
||||
key_secret=os.getenv("FAL_KEY_SECRET"),
|
||||
@@ -60,18 +63,32 @@ async def main():
|
||||
|
||||
return all_audio
|
||||
|
||||
async def get_month_data(month):
|
||||
messages = [{"role": "system", "content": f"Describe a nature photograph suitable for use in a calendar, for the month of {
|
||||
month}. Include only the image description with no preamble. Limit the description to one sentence, please.", }]
|
||||
async def get_month_description(aggregator, frame):
|
||||
async for frame in aggregator.process_frame(frame):
|
||||
if isinstance(frame, TextFrame):
|
||||
return frame.text
|
||||
|
||||
async def get_month_data(month):
|
||||
messages = [{"role": "system", "content": f"Describe a nature photograph suitable for use in a calendar, for the month of {month}. Include only the image description with no preamble. Limit the description to one sentence, please.", }]
|
||||
|
||||
messages_frame = LLMMessagesFrame(messages)
|
||||
|
||||
llm_full_response_aggregator = LLMFullResponseAggregator()
|
||||
|
||||
image_description = None
|
||||
async for frame in llm.process_frame(messages_frame):
|
||||
result = await get_month_description(llm_full_response_aggregator, frame)
|
||||
if result:
|
||||
image_description = result
|
||||
break
|
||||
|
||||
image_description = await llm.run_llm(messages)
|
||||
if not image_description:
|
||||
return
|
||||
|
||||
to_speak = f"{month}: {image_description}"
|
||||
audio_task = asyncio.create_task(get_all_audio(to_speak))
|
||||
image_task = asyncio.create_task(
|
||||
dalle.run_image_gen(image_description))
|
||||
imagegen.run_image_gen(image_description))
|
||||
(audio, image_data) = await asyncio.gather(audio_task, image_task)
|
||||
|
||||
return {
|
||||
@@ -79,22 +96,18 @@ async def main():
|
||||
"text": image_description,
|
||||
"image_url": image_data[0],
|
||||
"image": image_data[1],
|
||||
"image_size": image_data[2],
|
||||
"audio": audio,
|
||||
}
|
||||
|
||||
# We only specify 5 months as we create tasks all at once and we might
|
||||
# get rate limited otherwise.
|
||||
months: list[str] = [
|
||||
"January",
|
||||
"February",
|
||||
"March",
|
||||
"April",
|
||||
"May",
|
||||
"June",
|
||||
"July",
|
||||
"August",
|
||||
"September",
|
||||
"October",
|
||||
"November",
|
||||
"December",
|
||||
]
|
||||
|
||||
async def show_images():
|
||||
@@ -107,7 +120,7 @@ async def main():
|
||||
if data:
|
||||
await transport.send_queue.put(
|
||||
[
|
||||
ImageFrame(data["image_url"], data["image"]),
|
||||
URLImageFrame(data["image_url"], data["image"], data["image_size"]),
|
||||
AudioFrame(data["audio"]),
|
||||
]
|
||||
)
|
||||
@@ -72,7 +72,7 @@ async def main(room_url: str, token):
|
||||
)
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
async def on_first_other_participant_joined(transport, participant):
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{"role": "system", "content": "Please introduce yourself to the user."})
|
||||
|
||||
96
examples/foundational/06a-image-sync.py
Normal file
96
examples/foundational/06a-image-sync.py
Normal file
@@ -0,0 +1,96 @@
|
||||
import asyncio
|
||||
import os
|
||||
import logging
|
||||
from typing import AsyncGenerator
|
||||
import aiohttp
|
||||
from PIL import Image
|
||||
|
||||
from dailyai.pipeline.frames import ImageFrame, Frame, TextFrame
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
from dailyai.transports.daily_transport import DailyTransport
|
||||
from dailyai.services.ai_services import AIService
|
||||
from dailyai.pipeline.aggregators import (
|
||||
LLMAssistantContextAggregator,
|
||||
LLMUserContextAggregator,
|
||||
)
|
||||
from dailyai.services.open_ai_services import OpenAILLMService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
|
||||
from runner import configure
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
class ImageSyncAggregator(AIService):
|
||||
def __init__(self, speaking_path: str, waiting_path: str):
|
||||
self._speaking_image = Image.open(speaking_path)
|
||||
self._speaking_image_bytes = self._speaking_image.tobytes()
|
||||
|
||||
self._waiting_image = Image.open(waiting_path)
|
||||
self._waiting_image_bytes = self._waiting_image.tobytes()
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
yield ImageFrame(self._speaking_image_bytes, (1024, 1024))
|
||||
yield frame
|
||||
yield ImageFrame(self._waiting_image_bytes, (1024, 1024))
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Respond bot",
|
||||
5,
|
||||
)
|
||||
transport._camera_enabled = True
|
||||
transport._camera_width = 1024
|
||||
transport._camera_height = 1024
|
||||
transport._mic_enabled = True
|
||||
transport._mic_sample_rate = 16000
|
||||
transport.transcription_settings["extra"]["punctuate"] = True
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4-turbo-preview")
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so it should not include any special characters. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
tma_in = LLMUserContextAggregator(
|
||||
messages, transport._my_participant_id)
|
||||
tma_out = LLMAssistantContextAggregator(
|
||||
messages, transport._my_participant_id
|
||||
)
|
||||
image_sync_aggregator = ImageSyncAggregator(
|
||||
os.path.join(os.path.dirname(__file__), "assets", "speaking.png"),
|
||||
os.path.join(os.path.dirname(__file__), "assets", "waiting.png"),
|
||||
)
|
||||
|
||||
pipeline = Pipeline([image_sync_aggregator, tma_in, llm, tma_out, tts])
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport, participant):
|
||||
await pipeline.queue_frames([TextFrame("Hi, I'm listening!")])
|
||||
|
||||
await transport.run(pipeline)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
@@ -3,8 +3,8 @@ import aiohttp
|
||||
import logging
|
||||
import os
|
||||
from dailyai.pipeline.aggregators import (
|
||||
LLMResponseAggregator,
|
||||
UserResponseAggregator,
|
||||
LLMAssistantResponseAggregator,
|
||||
LLMUserResponseAggregator,
|
||||
)
|
||||
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
@@ -50,7 +50,7 @@ async def main(room_url: str, token):
|
||||
pipeline = Pipeline([FrameLogger(), llm, FrameLogger(), tts])
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
async def on_first_other_participant_joined(transport, participant):
|
||||
await transport.say("Hi, I'm listening!", tts)
|
||||
|
||||
async def run_conversation():
|
||||
@@ -63,8 +63,8 @@ async def main(room_url: str, token):
|
||||
|
||||
await transport.run_interruptible_pipeline(
|
||||
pipeline,
|
||||
post_processor=LLMResponseAggregator(messages),
|
||||
pre_processor=UserResponseAggregator(messages),
|
||||
post_processor=LLMAssistantResponseAggregator(messages),
|
||||
pre_processor=LLMUserResponseAggregator(messages),
|
||||
)
|
||||
|
||||
transport.transcription_settings["extra"]["punctuate"] = False
|
||||
|
||||
@@ -51,7 +51,9 @@ async def main(room_url: str):
|
||||
voice_id="jBpfuIE2acCO8z3wKNLl",
|
||||
)
|
||||
dalle = FalImageGenService(
|
||||
image_size="1024x1024",
|
||||
params=FalImageGenService.InputParams(
|
||||
image_size="1024x1024"
|
||||
),
|
||||
aiohttp_session=session,
|
||||
key_id=os.getenv("FAL_KEY_ID"),
|
||||
key_secret=os.getenv("FAL_KEY_SECRET"),
|
||||
@@ -122,7 +124,7 @@ async def main(room_url: str):
|
||||
)
|
||||
await transport.send_queue.put(
|
||||
[
|
||||
ImageFrame(None, image_data1[1]),
|
||||
ImageFrame(image_data1[1], image_data1[2]),
|
||||
AudioFrame(audio1),
|
||||
]
|
||||
)
|
||||
@@ -134,7 +136,7 @@ async def main(room_url: str):
|
||||
)
|
||||
await transport.send_queue.put(
|
||||
[
|
||||
ImageFrame(None, image_data2[1]),
|
||||
ImageFrame(image_data2[1], image_data2[2]),
|
||||
AudioFrame(audio2),
|
||||
]
|
||||
)
|
||||
|
||||
@@ -5,6 +5,7 @@ import os
|
||||
import random
|
||||
from typing import AsyncGenerator
|
||||
from PIL import Image
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
|
||||
from dailyai.transports.daily_transport import DailyTransport
|
||||
from dailyai.services.open_ai_services import OpenAILLMService
|
||||
@@ -54,7 +55,7 @@ for file in image_files:
|
||||
sprites[file] = img.tobytes()
|
||||
|
||||
# When the bot isn't talking, show a static image of the cat listening
|
||||
quiet_frame = ImageFrame("", sprites["sc-listen-1.png"])
|
||||
quiet_frame = ImageFrame(sprites["sc-listen-1.png"], (720, 1280))
|
||||
# When the bot is talking, build an animation from two sprites
|
||||
talking_list = [sprites["sc-default.png"], sprites["sc-talk.png"]]
|
||||
talking = [random.choice(talking_list) for x in range(30)]
|
||||
@@ -133,6 +134,7 @@ async def main(room_url: str, token):
|
||||
transport._camera_enabled = True
|
||||
transport._camera_width = 720
|
||||
transport._camera_height = 1280
|
||||
transport.transcription_settings["extra"]["punctuate"] = True
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
@@ -145,45 +147,34 @@ async def main(room_url: str, token):
|
||||
)
|
||||
isa = ImageSyncAggregator()
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are Santa Cat, a cat that lives in Santa's workshop at the North Pole. You should be clever, and a bit sarcastic. You should also tell jokes every once in a while. Your responses should only be a few sentences long.",
|
||||
},
|
||||
]
|
||||
|
||||
tma_in = LLMUserContextAggregator(
|
||||
messages, transport._my_participant_id)
|
||||
tma_out = LLMAssistantContextAggregator(
|
||||
messages, transport._my_participant_id
|
||||
)
|
||||
tf = TranscriptFilter(transport._my_participant_id)
|
||||
ncf = NameCheckFilter(["Santa Cat", "Santa"])
|
||||
|
||||
pipeline = Pipeline([isa, tf, ncf, tma_in, llm, tma_out, tts])
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
await tts.say(
|
||||
async def on_first_other_participant_joined(transport, participant):
|
||||
await transport.say(
|
||||
"Hi! If you want to talk to me, just say 'hey Santa Cat'.",
|
||||
transport.send_queue,
|
||||
)
|
||||
|
||||
async def handle_transcriptions():
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are Santa Cat, a cat that lives in Santa's workshop at the North Pole. You should be clever, and a bit sarcastic. You should also tell jokes every once in a while. Your responses should only be a few sentences long.",
|
||||
},
|
||||
]
|
||||
|
||||
tma_in = LLMUserContextAggregator(
|
||||
messages, transport._my_participant_id)
|
||||
tma_out = LLMAssistantContextAggregator(
|
||||
messages, transport._my_participant_id
|
||||
)
|
||||
tf = TranscriptFilter(transport._my_participant_id)
|
||||
ncf = NameCheckFilter(["Santa Cat", "Santa"])
|
||||
await tts.run_to_queue(
|
||||
transport.send_queue,
|
||||
isa.run(
|
||||
tma_out.run(
|
||||
llm.run(
|
||||
tma_in.run(
|
||||
ncf.run(tf.run(transport.get_receive_frames())))
|
||||
)
|
||||
)
|
||||
),
|
||||
tts,
|
||||
)
|
||||
|
||||
async def starting_image():
|
||||
await transport.send_queue.put(quiet_frame)
|
||||
|
||||
transport.transcription_settings["extra"]["punctuate"] = True
|
||||
await asyncio.gather(transport.run(), handle_transcriptions(), starting_image())
|
||||
await asyncio.gather(transport.run(pipeline), starting_image())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@@ -3,6 +3,7 @@ import asyncio
|
||||
import logging
|
||||
import os
|
||||
import wave
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
|
||||
from dailyai.transports.daily_transport import DailyTransport
|
||||
from dailyai.services.open_ai_services import OpenAILLMService
|
||||
@@ -81,6 +82,7 @@ async def main(room_url: str, token):
|
||||
mic_sample_rate=16000,
|
||||
camera_enabled=False,
|
||||
)
|
||||
transport.transcription_settings["extra"]["punctuate"] = True
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
@@ -92,47 +94,31 @@ async def main(room_url: str, token):
|
||||
voice_id="ErXwobaYiN019PkySvjV",
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
tma_in = LLMUserContextAggregator(
|
||||
messages, transport._my_participant_id)
|
||||
tma_out = LLMAssistantContextAggregator(
|
||||
messages, transport._my_participant_id
|
||||
)
|
||||
out_sound = OutboundSoundEffectWrapper()
|
||||
in_sound = InboundSoundEffectWrapper()
|
||||
fl = FrameLogger("LLM Out")
|
||||
fl2 = FrameLogger("Transcription In")
|
||||
|
||||
pipeline = Pipeline([tma_in, in_sound, fl2, llm, tma_out, fl, tts, out_sound])
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
await tts.say("Hi, I'm listening!", transport.send_queue)
|
||||
async def on_first_other_participant_joined(transport, participant):
|
||||
await transport.say("Hi, I'm listening!", tts)
|
||||
await transport.send_queue.put(AudioFrame(sounds["ding1.wav"]))
|
||||
|
||||
async def handle_transcriptions():
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
tma_in = LLMUserContextAggregator(
|
||||
messages, transport._my_participant_id)
|
||||
tma_out = LLMAssistantContextAggregator(
|
||||
messages, transport._my_participant_id
|
||||
)
|
||||
out_sound = OutboundSoundEffectWrapper()
|
||||
in_sound = InboundSoundEffectWrapper()
|
||||
fl = FrameLogger("LLM Out")
|
||||
fl2 = FrameLogger("Transcription In")
|
||||
await out_sound.run_to_queue(
|
||||
transport.send_queue,
|
||||
tts.run(
|
||||
fl.run(
|
||||
tma_out.run(
|
||||
llm.run(
|
||||
fl2.run(
|
||||
in_sound.run(
|
||||
tma_in.run(transport.get_receive_frames())
|
||||
)
|
||||
)
|
||||
)
|
||||
)
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
transport.transcription_settings["extra"]["punctuate"] = True
|
||||
await asyncio.gather(transport.run(), handle_transcriptions())
|
||||
await asyncio.gather(transport.run(pipeline))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
85
examples/foundational/12-describe-video.py
Normal file
85
examples/foundational/12-describe-video.py
Normal file
@@ -0,0 +1,85 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import logging
|
||||
import os
|
||||
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from dailyai.pipeline.aggregators import FrameProcessor, UserResponseAggregator, VisionImageFrameAggregator
|
||||
|
||||
from dailyai.pipeline.frames import Frame, TextFrame, UserImageRequestFrame
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.services.moondream_ai_service import MoondreamService
|
||||
from dailyai.transports.daily_transport import DailyTransport
|
||||
|
||||
from runner import configure
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
class UserImageRequester(FrameProcessor):
|
||||
participant_id: str
|
||||
|
||||
def set_participant_id(self, participant_id: str):
|
||||
self.participant_id = participant_id
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if self.participant_id and isinstance(frame, TextFrame):
|
||||
yield UserImageRequestFrame(self.participant_id)
|
||||
yield frame
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Describe participant video",
|
||||
duration_minutes=5,
|
||||
mic_enabled=True,
|
||||
mic_sample_rate=16000,
|
||||
vad_enabled=True,
|
||||
start_transcription=True,
|
||||
video_rendering_enabled=True
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
)
|
||||
|
||||
user_response = UserResponseAggregator()
|
||||
|
||||
image_requester = UserImageRequester()
|
||||
|
||||
vision_aggregator = VisionImageFrameAggregator()
|
||||
|
||||
# If you run into weird description, try with use_cpu=True
|
||||
moondream = MoondreamService()
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
)
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport, participant):
|
||||
await transport.say("Hi there! Feel free to ask me what I see.", tts)
|
||||
transport.render_participant_video(participant["id"], framerate=0)
|
||||
image_requester.set_participant_id(participant["id"])
|
||||
|
||||
pipeline = Pipeline([user_response, image_requester, vision_aggregator, moondream, tts])
|
||||
|
||||
await transport.run(pipeline)
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
@@ -1,12 +1,16 @@
|
||||
import asyncio
|
||||
import logging
|
||||
|
||||
from dailyai.pipeline.frames import EndFrame, TranscriptionFrame
|
||||
from dailyai.transports.daily_transport import DailyTransport
|
||||
from dailyai.services.whisper_ai_services import WhisperSTTService
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
|
||||
from runner import configure
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
@@ -26,17 +30,27 @@ async def main(room_url: str):
|
||||
stt = WhisperSTTService()
|
||||
|
||||
transcription_output_queue = asyncio.Queue()
|
||||
transport_done = asyncio.Event()
|
||||
|
||||
pipeline = Pipeline([stt])
|
||||
pipeline.set_sink(transcription_output_queue)
|
||||
pipeline = Pipeline([stt], source=transport.receive_queue, sink=transcription_output_queue)
|
||||
|
||||
async def handle_transcription():
|
||||
print("`````````TRANSCRIPTION`````````")
|
||||
while True:
|
||||
while not transport_done.is_set():
|
||||
item = await transcription_output_queue.get()
|
||||
print(item.text)
|
||||
print("got item from queue", item)
|
||||
if isinstance(item, TranscriptionFrame):
|
||||
print(item.text)
|
||||
elif isinstance(item, EndFrame):
|
||||
break
|
||||
print("handle_transcription done")
|
||||
|
||||
await asyncio.gather(transport.run(pipeline), handle_transcription())
|
||||
async def run_until_done():
|
||||
await transport.run()
|
||||
transport_done.set()
|
||||
print("run_until_done done")
|
||||
|
||||
await asyncio.gather(run_until_done(), pipeline.run_pipeline(), handle_transcription())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -15,11 +15,10 @@ async def main():
|
||||
meeting_duration_minutes = 1
|
||||
|
||||
transport = LocalTransport(
|
||||
mic_enabled=False,
|
||||
mic_enabled=True,
|
||||
camera_enabled=False,
|
||||
speaker_enabled=True,
|
||||
duration_minutes=meeting_duration_minutes,
|
||||
start_transcription=False,
|
||||
)
|
||||
|
||||
stt = WhisperSTTService()
|
||||
@@ -27,8 +26,7 @@ async def main():
|
||||
transcription_output_queue = asyncio.Queue()
|
||||
transport_done = asyncio.Event()
|
||||
|
||||
pipeline = Pipeline([stt])
|
||||
pipeline.set_sink(transcription_output_queue)
|
||||
pipeline = Pipeline([stt], source=transport.receive_queue, sink=transcription_output_queue)
|
||||
|
||||
async def handle_transcription():
|
||||
print("`````````TRANSCRIPTION`````````")
|
||||
@@ -42,11 +40,11 @@ async def main():
|
||||
print("handle_transcription done")
|
||||
|
||||
async def run_until_done():
|
||||
await transport.run(pipeline)
|
||||
await transport.run()
|
||||
transport_done.set()
|
||||
print("run_until_done done")
|
||||
|
||||
await asyncio.gather(run_until_done(), handle_transcription())
|
||||
await asyncio.gather(run_until_done(), pipeline.run_pipeline(), handle_transcription())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
52
examples/foundational/14-render-remote-participant.py
Normal file
52
examples/foundational/14-render-remote-participant.py
Normal file
@@ -0,0 +1,52 @@
|
||||
import asyncio
|
||||
import logging
|
||||
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from dailyai.pipeline.aggregators import FrameProcessor
|
||||
|
||||
from dailyai.pipeline.frames import ImageFrame, Frame, UserImageFrame
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
from dailyai.transports.daily_transport import DailyTransport
|
||||
|
||||
from runner import configure
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
class UserImageProcessor(FrameProcessor):
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, UserImageFrame):
|
||||
yield ImageFrame(frame.image, frame.size)
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Render participant video",
|
||||
camera_width=1280,
|
||||
camera_height=720,
|
||||
camera_enabled=True,
|
||||
video_rendering_enabled=True
|
||||
)
|
||||
|
||||
@ transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport, participant):
|
||||
transport.render_participant_video(participant["id"])
|
||||
|
||||
pipeline = Pipeline([UserImageProcessor()])
|
||||
|
||||
await asyncio.gather(transport.run(pipeline))
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
71
examples/foundational/14a-local-render-remote-participant.py
Normal file
71
examples/foundational/14a-local-render-remote-participant.py
Normal file
@@ -0,0 +1,71 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import tkinter as tk
|
||||
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from dailyai.pipeline.aggregators import FrameProcessor
|
||||
|
||||
from dailyai.pipeline.frames import ImageFrame, Frame, UserImageFrame
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
from dailyai.transports.daily_transport import DailyTransport
|
||||
|
||||
from dailyai.transports.local_transport import LocalTransport
|
||||
from runner import configure
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
class UserImageProcessor(FrameProcessor):
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, UserImageFrame):
|
||||
yield ImageFrame(frame.image, frame.size)
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
tk_root = tk.Tk()
|
||||
tk_root.title("dailyai")
|
||||
|
||||
local_transport = LocalTransport(
|
||||
tk_root=tk_root,
|
||||
camera_enabled=True,
|
||||
camera_width=1280,
|
||||
camera_height=720
|
||||
)
|
||||
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Render participant video",
|
||||
video_rendering_enabled=True
|
||||
)
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport, participant):
|
||||
transport.render_participant_video(participant["id"])
|
||||
|
||||
async def run_tk():
|
||||
while not transport._stop_threads.is_set():
|
||||
tk_root.update()
|
||||
tk_root.update_idletasks()
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
local_pipeline = Pipeline([UserImageProcessor()], source=transport.receive_queue)
|
||||
|
||||
await asyncio.gather(
|
||||
transport.run(),
|
||||
local_transport.run(local_pipeline, override_pipeline_source_queue=False),
|
||||
run_tk()
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
@@ -1,122 +0,0 @@
|
||||
import asyncio
|
||||
import os
|
||||
import logging
|
||||
from typing import AsyncGenerator
|
||||
import aiohttp
|
||||
from PIL import Image
|
||||
|
||||
from dailyai.pipeline.frames import ImageFrame, Frame
|
||||
from dailyai.transports.daily_transport import DailyTransport
|
||||
from dailyai.services.ai_services import AIService
|
||||
from dailyai.pipeline.aggregators import (
|
||||
LLMAssistantContextAggregator,
|
||||
LLMUserContextAggregator,
|
||||
)
|
||||
from dailyai.services.open_ai_services import OpenAILLMService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.services.fal_ai_services import FalImageGenService
|
||||
|
||||
from runner import configure
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
class ImageSyncAggregator(AIService):
|
||||
def __init__(self, speaking_path: str, waiting_path: str):
|
||||
self._speaking_image = Image.open(speaking_path)
|
||||
self._speaking_image_bytes = self._speaking_image.tobytes()
|
||||
|
||||
self._waiting_image = Image.open(waiting_path)
|
||||
self._waiting_image_bytes = self._waiting_image.tobytes()
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
yield ImageFrame(None, self._speaking_image_bytes)
|
||||
yield frame
|
||||
yield ImageFrame(None, self._waiting_image_bytes)
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Respond bot",
|
||||
5,
|
||||
)
|
||||
transport._camera_enabled = True
|
||||
transport._camera_width = 1024
|
||||
transport._camera_height = 1024
|
||||
transport._mic_enabled = True
|
||||
transport._mic_sample_rate = 16000
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4-turbo-preview")
|
||||
|
||||
img = FalImageGenService(
|
||||
image_size="1024x1024",
|
||||
aiohttp_session=session,
|
||||
key_id=os.getenv("FAL_KEY_ID"),
|
||||
key_secret=os.getenv("FAL_KEY_SECRET"),
|
||||
)
|
||||
|
||||
async def get_images():
|
||||
get_speaking_task = asyncio.create_task(
|
||||
img.run_image_gen("An image of a cat speaking")
|
||||
)
|
||||
get_waiting_task = asyncio.create_task(
|
||||
img.run_image_gen("An image of a cat waiting")
|
||||
)
|
||||
|
||||
(speaking_data, waiting_data) = await asyncio.gather(
|
||||
get_speaking_task, get_waiting_task
|
||||
)
|
||||
|
||||
return speaking_data, waiting_data
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
await tts.say("Hi, I'm listening!", transport.send_queue)
|
||||
|
||||
async def handle_transcriptions():
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
tma_in = LLMUserContextAggregator(
|
||||
messages, transport._my_participant_id)
|
||||
tma_out = LLMAssistantContextAggregator(
|
||||
messages, transport._my_participant_id
|
||||
)
|
||||
image_sync_aggregator = ImageSyncAggregator(
|
||||
os.path.join(
|
||||
os.path.dirname(__file__), "assets", "speaking.png"), os.path.join(
|
||||
os.path.dirname(__file__), "assets", "waiting.png"), )
|
||||
await tts.run_to_queue(
|
||||
transport.send_queue,
|
||||
image_sync_aggregator.run(
|
||||
tma_out.run(llm.run(tma_in.run(transport.get_receive_frames())))
|
||||
),
|
||||
)
|
||||
|
||||
transport.transcription_settings["extra"]["punctuate"] = True
|
||||
await asyncio.gather(transport.run(), handle_transcriptions())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
@@ -80,7 +80,7 @@ async def main(room_url: str, token, phone):
|
||||
tts = AzureTTSService()
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
async def on_first_other_participant_joined(transport, participant):
|
||||
await tts.say("Hi, I'm listening!", transport.send_queue)
|
||||
await transport.send_queue.put(AudioFrame(sounds["ding1.wav"]))
|
||||
|
||||
|
||||
@@ -6,8 +6,8 @@ from PIL import Image
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from dailyai.pipeline.aggregators import (
|
||||
LLMResponseAggregator,
|
||||
UserResponseAggregator,
|
||||
LLMAssistantResponseAggregator,
|
||||
LLMUserResponseAggregator,
|
||||
)
|
||||
from dailyai.pipeline.frames import (
|
||||
ImageFrame,
|
||||
@@ -48,7 +48,7 @@ for i in range(1, 26):
|
||||
flipped = sprites[::-1]
|
||||
sprites.extend(flipped)
|
||||
# When the bot isn't talking, show a static image of the cat listening
|
||||
quiet_frame = ImageFrame("", sprites[0])
|
||||
quiet_frame = ImageFrame(sprites[0], (1024, 576))
|
||||
talking_frame = SpriteFrame(images=sprites)
|
||||
|
||||
|
||||
@@ -127,7 +127,7 @@ async def main(room_url: str, token):
|
||||
]
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
async def on_first_other_participant_joined(transport, participant):
|
||||
print(f"!!! in here, pipeline.source is {pipeline.source}")
|
||||
await pipeline.queue_frames([LLMMessagesFrame(messages)])
|
||||
|
||||
@@ -135,8 +135,8 @@ async def main(room_url: str, token):
|
||||
|
||||
await transport.run_interruptible_pipeline(
|
||||
pipeline,
|
||||
post_processor=LLMResponseAggregator(messages),
|
||||
pre_processor=UserResponseAggregator(messages),
|
||||
post_processor=LLMAssistantResponseAggregator(messages),
|
||||
pre_processor=LLMUserResponseAggregator(messages),
|
||||
)
|
||||
|
||||
transport.transcription_settings["extra"]["endpointing"] = True
|
||||
|
||||
@@ -330,7 +330,7 @@ async def main(room_url: str, token):
|
||||
pipeline = Pipeline(processors=[fl, llm, fl2, checklist, tts])
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
async def on_first_other_participant_joined(transport, participant):
|
||||
await pipeline.queue_frames([OpenAILLMContextFrame(context)])
|
||||
|
||||
async def handle_intake():
|
||||
|
||||
@@ -19,8 +19,8 @@ from dailyai.services.deepgram_ai_services import DeepgramTTSService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.pipeline.aggregators import (
|
||||
LLMAssistantContextAggregator,
|
||||
UserResponseAggregator,
|
||||
LLMResponseAggregator,
|
||||
LLMAssistantResponseAggregator,
|
||||
LLMUserResponseAggregator,
|
||||
)
|
||||
from dailyai.pipeline.frames import (
|
||||
EndPipeFrame,
|
||||
@@ -99,7 +99,7 @@ class StoryProcessor(FrameProcessor):
|
||||
1. Catch the frames that are generated by the LLM service
|
||||
"""
|
||||
if isinstance(frame, UserStoppedSpeakingFrame):
|
||||
yield ImageFrame(None, images["grandma-writing.png"])
|
||||
yield ImageFrame(images["grandma-writing.png"], (1024, 1024))
|
||||
yield AudioFrame(sounds["talking.wav"])
|
||||
|
||||
elif isinstance(frame, TextFrame):
|
||||
@@ -112,7 +112,7 @@ class StoryProcessor(FrameProcessor):
|
||||
|
||||
self._text = self._text.replace("\n", " ")
|
||||
if len(self._text) > 2:
|
||||
yield ImageFrame(None, images["grandma-writing.png"])
|
||||
yield ImageFrame(images["grandma-writing.png"], (1024, 1024))
|
||||
yield StoryStartFrame(self._text)
|
||||
yield AudioFrame(sounds["ding3.wav"])
|
||||
self._text = ""
|
||||
@@ -146,11 +146,11 @@ class StoryProcessor(FrameProcessor):
|
||||
# last bit
|
||||
pass
|
||||
elif isinstance(frame, LLMResponseEndFrame):
|
||||
yield ImageFrame(None, images["grandma-writing.png"])
|
||||
yield ImageFrame(images["grandma-writing.png"], (1024, 1024))
|
||||
yield StoryPromptFrame(self._text)
|
||||
self._text = ""
|
||||
yield frame
|
||||
yield ImageFrame(None, images["grandma-listening.png"])
|
||||
yield ImageFrame(images["grandma-listening.png"], (1024, 1024))
|
||||
yield AudioFrame(sounds["listening.wav"])
|
||||
|
||||
else:
|
||||
@@ -204,13 +204,15 @@ async def main(room_url: str, token):
|
||||
voice_id="Xb7hH8MSUJpSbSDYk0k2",
|
||||
) # matilda
|
||||
img = FalImageGenService(
|
||||
image_size="1024x1024",
|
||||
params={
|
||||
image_size = "1024x1024",
|
||||
},
|
||||
aiohttp_session=session,
|
||||
key_id=os.getenv("FAL_KEY_ID"),
|
||||
key_secret=os.getenv("FAL_KEY_SECRET"),
|
||||
)
|
||||
lra = LLMResponseAggregator(messages)
|
||||
ura = UserResponseAggregator(messages)
|
||||
lra = LLMAssistantResponseAggregator(messages)
|
||||
ura = LLMUserResponseAggregator(messages)
|
||||
sp = StoryProcessor(messages, story)
|
||||
sig = StoryImageGenerator(story, llm, img)
|
||||
|
||||
@@ -232,7 +234,7 @@ async def main(room_url: str, token):
|
||||
start_story_event = asyncio.Event()
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
async def on_first_other_participant_joined(transport, participant):
|
||||
start_story_event.set()
|
||||
|
||||
async def storytime():
|
||||
@@ -252,7 +254,7 @@ async def main(room_url: str, token):
|
||||
[llm, lca, tts], sink=transport.send_queue)
|
||||
await local_pipeline.queue_frames(
|
||||
[
|
||||
ImageFrame(None, images["grandma-listening.png"]),
|
||||
ImageFrame(images["grandma-listening.png"], (1024, 1024)),
|
||||
LLMMessagesFrame(intro_messages),
|
||||
AudioFrame(sounds["listening.wav"]),
|
||||
EndPipeFrame(),
|
||||
|
||||
@@ -33,19 +33,24 @@ certifi==2024.2.2
|
||||
# httpcore
|
||||
# httpx
|
||||
# requests
|
||||
cffi==1.16.0
|
||||
# via cryptography
|
||||
charset-normalizer==3.3.2
|
||||
# via requests
|
||||
click==8.1.7
|
||||
# via
|
||||
# fal
|
||||
# flask
|
||||
# rich-click
|
||||
colorama==0.4.6
|
||||
# via fal
|
||||
coloredlogs==15.0.1
|
||||
# via onnxruntime
|
||||
cryptography==42.0.5
|
||||
# via pyjwt
|
||||
ctranslate2==4.1.0
|
||||
# via faster-whisper
|
||||
daily-python==0.7.2
|
||||
daily-python==0.7.3
|
||||
# via dailyai (pyproject.toml)
|
||||
deprecated==1.2.14
|
||||
# via opentelemetry-api
|
||||
@@ -57,22 +62,25 @@ distro==1.9.0
|
||||
# via
|
||||
# anthropic
|
||||
# openai
|
||||
einops==0.7.0
|
||||
# via dailyai (pyproject.toml)
|
||||
exceptiongroup==1.2.0
|
||||
# via anyio
|
||||
fal==0.12.3
|
||||
fal==0.12.7
|
||||
# via dailyai (pyproject.toml)
|
||||
fastapi==0.99.1
|
||||
# via fal
|
||||
faster-whisper==1.0.1
|
||||
# via dailyai (pyproject.toml)
|
||||
filelock==3.13.3
|
||||
filelock==3.13.4
|
||||
# via
|
||||
# huggingface-hub
|
||||
# pyht
|
||||
# torch
|
||||
# transformers
|
||||
# triton
|
||||
# virtualenv
|
||||
flask==3.0.2
|
||||
flask==3.0.3
|
||||
# via
|
||||
# dailyai (pyproject.toml)
|
||||
# flask-cors
|
||||
@@ -109,7 +117,9 @@ httpx==0.27.0
|
||||
huggingface-hub==0.22.2
|
||||
# via
|
||||
# faster-whisper
|
||||
# timm
|
||||
# tokenizers
|
||||
# transformers
|
||||
humanfriendly==10.0
|
||||
# via coloredlogs
|
||||
idna==3.6
|
||||
@@ -124,7 +134,7 @@ isolate[build]==0.12.7
|
||||
# via
|
||||
# fal
|
||||
# isolate-proto
|
||||
isolate-proto==0.3.3
|
||||
isolate-proto==0.3.4
|
||||
# via fal
|
||||
itsdangerous==2.1.2
|
||||
# via flask
|
||||
@@ -148,13 +158,15 @@ multidict==6.0.5
|
||||
# via
|
||||
# aiohttp
|
||||
# yarl
|
||||
networkx==3.2.1
|
||||
networkx==3.3
|
||||
# via torch
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# ctranslate2
|
||||
# dailyai (pyproject.toml)
|
||||
# onnxruntime
|
||||
# torchvision
|
||||
# transformers
|
||||
nvidia-cublas-cu12==12.1.3.1
|
||||
# via
|
||||
# nvidia-cudnn-cu12
|
||||
@@ -188,7 +200,7 @@ nvidia-nvtx-cu12==12.1.105
|
||||
# via torch
|
||||
onnxruntime==1.17.1
|
||||
# via faster-whisper
|
||||
openai==1.14.2
|
||||
openai==1.14.3
|
||||
# via dailyai (pyproject.toml)
|
||||
opentelemetry-api==1.24.0
|
||||
# via
|
||||
@@ -203,12 +215,14 @@ packaging==24.0
|
||||
# fal
|
||||
# huggingface-hub
|
||||
# onnxruntime
|
||||
# transformers
|
||||
pathspec==0.11.2
|
||||
# via fal
|
||||
pillow==10.2.0
|
||||
# via
|
||||
# dailyai (pyproject.toml)
|
||||
# fal
|
||||
# torchvision
|
||||
platformdirs==4.2.0
|
||||
# via
|
||||
# isolate
|
||||
@@ -223,6 +237,8 @@ protobuf==4.25.3
|
||||
# pyht
|
||||
pyaudio==0.2.14
|
||||
# via dailyai (pyproject.toml)
|
||||
pycparser==2.22
|
||||
# via cffi
|
||||
pydantic==1.10.15
|
||||
# via
|
||||
# anthropic
|
||||
@@ -231,9 +247,9 @@ pydantic==1.10.15
|
||||
# openai
|
||||
pygments==2.17.2
|
||||
# via rich
|
||||
pyht==0.0.26
|
||||
pyht==0.0.27
|
||||
# via dailyai (pyproject.toml)
|
||||
pyjwt==2.8.0
|
||||
pyjwt[crypto]==2.8.0
|
||||
# via fal
|
||||
python-dateutil==2.9.0.post0
|
||||
# via fal
|
||||
@@ -244,12 +260,25 @@ pyyaml==6.0.1
|
||||
# ctranslate2
|
||||
# huggingface-hub
|
||||
# isolate
|
||||
# timm
|
||||
# transformers
|
||||
regex==2023.12.25
|
||||
# via transformers
|
||||
requests==2.31.0
|
||||
# via
|
||||
# huggingface-hub
|
||||
# pyht
|
||||
# transformers
|
||||
rich==13.7.1
|
||||
# via
|
||||
# fal
|
||||
# rich-click
|
||||
rich-click==1.7.4
|
||||
# via fal
|
||||
safetensors==0.4.2
|
||||
# via
|
||||
# timm
|
||||
# transformers
|
||||
six==1.16.0
|
||||
# via python-dateutil
|
||||
sniffio==1.3.1
|
||||
@@ -268,20 +297,30 @@ sympy==1.12
|
||||
# torch
|
||||
tblib==3.0.0
|
||||
# via isolate
|
||||
timm==0.9.16
|
||||
# via dailyai (pyproject.toml)
|
||||
tokenizers==0.15.2
|
||||
# via
|
||||
# anthropic
|
||||
# faster-whisper
|
||||
torch==2.2.1
|
||||
# transformers
|
||||
torch==2.2.2
|
||||
# via
|
||||
# dailyai (pyproject.toml)
|
||||
# timm
|
||||
# torchaudio
|
||||
torchaudio==2.2.1
|
||||
# torchvision
|
||||
torchaudio==2.2.2
|
||||
# via dailyai (pyproject.toml)
|
||||
torchvision==0.17.2
|
||||
# via timm
|
||||
tqdm==4.66.2
|
||||
# via
|
||||
# huggingface-hub
|
||||
# openai
|
||||
# transformers
|
||||
transformers==4.39.3
|
||||
# via dailyai (pyproject.toml)
|
||||
triton==2.2.0
|
||||
# via torch
|
||||
types-python-dateutil==2.9.0.20240316
|
||||
@@ -297,6 +336,7 @@ typing-extensions==4.10.0
|
||||
# openai
|
||||
# opentelemetry-sdk
|
||||
# pydantic
|
||||
# rich-click
|
||||
# torch
|
||||
urllib3==2.2.1
|
||||
# via requests
|
||||
|
||||
@@ -33,19 +33,24 @@ certifi==2024.2.2
|
||||
# httpcore
|
||||
# httpx
|
||||
# requests
|
||||
cffi==1.16.0
|
||||
# via cryptography
|
||||
charset-normalizer==3.3.2
|
||||
# via requests
|
||||
click==8.1.7
|
||||
# via
|
||||
# fal
|
||||
# flask
|
||||
# rich-click
|
||||
colorama==0.4.6
|
||||
# via fal
|
||||
coloredlogs==15.0.1
|
||||
# via onnxruntime
|
||||
cryptography==42.0.5
|
||||
# via pyjwt
|
||||
ctranslate2==4.1.0
|
||||
# via faster-whisper
|
||||
daily-python==0.7.2
|
||||
daily-python==0.7.3
|
||||
# via dailyai (pyproject.toml)
|
||||
deprecated==1.2.14
|
||||
# via opentelemetry-api
|
||||
@@ -57,21 +62,24 @@ distro==1.9.0
|
||||
# via
|
||||
# anthropic
|
||||
# openai
|
||||
einops==0.7.0
|
||||
# via dailyai (pyproject.toml)
|
||||
exceptiongroup==1.2.0
|
||||
# via anyio
|
||||
fal==0.12.3
|
||||
fal==0.12.7
|
||||
# via dailyai (pyproject.toml)
|
||||
fastapi==0.99.1
|
||||
# via fal
|
||||
faster-whisper==1.0.1
|
||||
# via dailyai (pyproject.toml)
|
||||
filelock==3.13.3
|
||||
filelock==3.13.4
|
||||
# via
|
||||
# huggingface-hub
|
||||
# pyht
|
||||
# torch
|
||||
# transformers
|
||||
# virtualenv
|
||||
flask==3.0.2
|
||||
flask==3.0.3
|
||||
# via
|
||||
# dailyai (pyproject.toml)
|
||||
# flask-cors
|
||||
@@ -108,7 +116,9 @@ httpx==0.27.0
|
||||
huggingface-hub==0.22.2
|
||||
# via
|
||||
# faster-whisper
|
||||
# timm
|
||||
# tokenizers
|
||||
# transformers
|
||||
humanfriendly==10.0
|
||||
# via coloredlogs
|
||||
idna==3.6
|
||||
@@ -123,7 +133,7 @@ isolate[build]==0.12.7
|
||||
# via
|
||||
# fal
|
||||
# isolate-proto
|
||||
isolate-proto==0.3.3
|
||||
isolate-proto==0.3.4
|
||||
# via fal
|
||||
itsdangerous==2.1.2
|
||||
# via flask
|
||||
@@ -147,16 +157,18 @@ multidict==6.0.5
|
||||
# via
|
||||
# aiohttp
|
||||
# yarl
|
||||
networkx==3.2.1
|
||||
networkx==3.3
|
||||
# via torch
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# ctranslate2
|
||||
# dailyai (pyproject.toml)
|
||||
# onnxruntime
|
||||
# torchvision
|
||||
# transformers
|
||||
onnxruntime==1.17.1
|
||||
# via faster-whisper
|
||||
openai==1.14.2
|
||||
openai==1.14.3
|
||||
# via dailyai (pyproject.toml)
|
||||
opentelemetry-api==1.24.0
|
||||
# via
|
||||
@@ -171,12 +183,14 @@ packaging==24.0
|
||||
# fal
|
||||
# huggingface-hub
|
||||
# onnxruntime
|
||||
# transformers
|
||||
pathspec==0.11.2
|
||||
# via fal
|
||||
pillow==10.2.0
|
||||
# via
|
||||
# dailyai (pyproject.toml)
|
||||
# fal
|
||||
# torchvision
|
||||
platformdirs==4.2.0
|
||||
# via
|
||||
# isolate
|
||||
@@ -191,6 +205,8 @@ protobuf==4.25.3
|
||||
# pyht
|
||||
pyaudio==0.2.14
|
||||
# via dailyai (pyproject.toml)
|
||||
pycparser==2.22
|
||||
# via cffi
|
||||
pydantic==1.10.15
|
||||
# via
|
||||
# anthropic
|
||||
@@ -199,9 +215,9 @@ pydantic==1.10.15
|
||||
# openai
|
||||
pygments==2.17.2
|
||||
# via rich
|
||||
pyht==0.0.26
|
||||
pyht==0.0.27
|
||||
# via dailyai (pyproject.toml)
|
||||
pyjwt==2.8.0
|
||||
pyjwt[crypto]==2.8.0
|
||||
# via fal
|
||||
python-dateutil==2.9.0.post0
|
||||
# via fal
|
||||
@@ -212,12 +228,25 @@ pyyaml==6.0.1
|
||||
# ctranslate2
|
||||
# huggingface-hub
|
||||
# isolate
|
||||
# timm
|
||||
# transformers
|
||||
regex==2023.12.25
|
||||
# via transformers
|
||||
requests==2.31.0
|
||||
# via
|
||||
# huggingface-hub
|
||||
# pyht
|
||||
# transformers
|
||||
rich==13.7.1
|
||||
# via
|
||||
# fal
|
||||
# rich-click
|
||||
rich-click==1.7.4
|
||||
# via fal
|
||||
safetensors==0.4.2
|
||||
# via
|
||||
# timm
|
||||
# transformers
|
||||
six==1.16.0
|
||||
# via python-dateutil
|
||||
sniffio==1.3.1
|
||||
@@ -236,20 +265,30 @@ sympy==1.12
|
||||
# torch
|
||||
tblib==3.0.0
|
||||
# via isolate
|
||||
timm==0.9.16
|
||||
# via dailyai (pyproject.toml)
|
||||
tokenizers==0.15.2
|
||||
# via
|
||||
# anthropic
|
||||
# faster-whisper
|
||||
torch==2.2.1
|
||||
# transformers
|
||||
torch==2.2.2
|
||||
# via
|
||||
# dailyai (pyproject.toml)
|
||||
# timm
|
||||
# torchaudio
|
||||
torchaudio==2.2.1
|
||||
# torchvision
|
||||
torchaudio==2.2.2
|
||||
# via dailyai (pyproject.toml)
|
||||
torchvision==0.17.2
|
||||
# via timm
|
||||
tqdm==4.66.2
|
||||
# via
|
||||
# huggingface-hub
|
||||
# openai
|
||||
# transformers
|
||||
transformers==4.39.3
|
||||
# via dailyai (pyproject.toml)
|
||||
types-python-dateutil==2.9.0.20240316
|
||||
# via fal
|
||||
typing-extensions==4.10.0
|
||||
@@ -263,6 +302,7 @@ typing-extensions==4.10.0
|
||||
# openai
|
||||
# opentelemetry-sdk
|
||||
# pydantic
|
||||
# rich-click
|
||||
# torch
|
||||
urllib3==2.2.1
|
||||
# via requests
|
||||
|
||||
@@ -20,10 +20,10 @@ classifiers = [
|
||||
"Topic :: Scientific/Engineering :: Artificial Intelligence"
|
||||
]
|
||||
dependencies = [
|
||||
"aiohttp==3.9.3",
|
||||
"numpy==1.26.4",
|
||||
"Pillow==10.2.0",
|
||||
"typing-extensions==4.10.0",
|
||||
"aiohttp~=3.9.0",
|
||||
"numpy~=1.26.0",
|
||||
"Pillow~=10.2.0",
|
||||
"typing-extensions~=4.10.0",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
@@ -31,17 +31,18 @@ Source = "https://github.com/daily-co/dailyai"
|
||||
Website = "https://daily.co"
|
||||
|
||||
[project.optional-dependencies]
|
||||
anthropic = [ "anthropic==0.20.0" ]
|
||||
azure = [ "azure-cognitiveservices-speech==1.36.0" ]
|
||||
daily = [ "daily-python==0.7.2" ]
|
||||
examples = [ "python-dotenv==1.0.1", "flask==3.0.2", "flask_cors==4.0.0" ]
|
||||
fal = [ "fal==0.12.3" ]
|
||||
local = [ "pyaudio==0.2.14" ]
|
||||
openai = [ "openai==1.14.2" ]
|
||||
playht = [ "pyht==0.0.26" ]
|
||||
silero = [ "torch==2.2.1", "torchaudio==2.2.1" ]
|
||||
websocket = [ "websockets==12.0" ]
|
||||
whisper = [ "faster_whisper==1.0.1" ]
|
||||
anthropic = [ "anthropic~=0.20.0" ]
|
||||
azure = [ "azure-cognitiveservices-speech~=1.36.0" ]
|
||||
daily = [ "daily-python~=0.7.0" ]
|
||||
examples = [ "python-dotenv~=1.0.0", "flask~=3.0.0", "flask_cors~=4.0.0" ]
|
||||
fal = [ "fal~=0.12.0" ]
|
||||
local = [ "pyaudio~=0.2.0" ]
|
||||
moondream = [ "einops~=0.7.0", "timm~=0.9.0", "transformers~=4.39.0" ]
|
||||
openai = [ "openai~=1.14.0" ]
|
||||
playht = [ "pyht~=0.0.26" ]
|
||||
silero = [ "torch~=2.2.0", "torchaudio~=2.2.0" ]
|
||||
websocket = [ "websockets~=12.0" ]
|
||||
whisper = [ "faster_whisper~=1.0.0" ]
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
# All the following settings are optional:
|
||||
@@ -51,4 +52,4 @@ where = ["src"]
|
||||
pythonpath = ["src"]
|
||||
|
||||
[tool.setuptools_scm]
|
||||
# Empty
|
||||
local_scheme = "no-local-version"
|
||||
|
||||
@@ -7,6 +7,7 @@ from dailyai.pipeline.frames import (
|
||||
EndFrame,
|
||||
EndPipeFrame,
|
||||
Frame,
|
||||
ImageFrame,
|
||||
LLMMessagesFrame,
|
||||
LLMResponseEndFrame,
|
||||
LLMResponseStartFrame,
|
||||
@@ -14,6 +15,7 @@ from dailyai.pipeline.frames import (
|
||||
TranscriptionFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
VisionImageFrame,
|
||||
)
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
from dailyai.services.ai_services import AIService
|
||||
@@ -22,6 +24,79 @@ from typing import AsyncGenerator, Coroutine, List
|
||||
|
||||
|
||||
class ResponseAggregator(FrameProcessor):
|
||||
"""This frame processor aggregates frames between a start and an end frame
|
||||
into complete text frame sentences.
|
||||
|
||||
For example, frame input/output:
|
||||
UserStartedSpeakingFrame() -> None
|
||||
TranscriptionFrame("Hello,") -> None
|
||||
TranscriptionFrame(" world.") -> None
|
||||
UserStoppedSpeakingFrame() -> TextFrame("Hello world.")
|
||||
|
||||
Doctest:
|
||||
>>> async def print_frames(aggregator, frame):
|
||||
... async for frame in aggregator.process_frame(frame):
|
||||
... if isinstance(frame, TextFrame):
|
||||
... print(frame.text)
|
||||
|
||||
>>> aggregator = ResponseAggregator(start_frame = UserStartedSpeakingFrame,
|
||||
... end_frame=UserStoppedSpeakingFrame,
|
||||
... accumulator_frame=TranscriptionFrame,
|
||||
... pass_through=False)
|
||||
>>> asyncio.run(print_frames(aggregator, UserStartedSpeakingFrame()))
|
||||
>>> asyncio.run(print_frames(aggregator, TranscriptionFrame("Hello,", 1, 1)))
|
||||
>>> asyncio.run(print_frames(aggregator, TranscriptionFrame("world.", 1, 2)))
|
||||
>>> asyncio.run(print_frames(aggregator, UserStoppedSpeakingFrame()))
|
||||
Hello, world.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
start_frame,
|
||||
end_frame,
|
||||
accumulator_frame,
|
||||
pass_through=True,
|
||||
):
|
||||
self.aggregation = ""
|
||||
self.aggregating = False
|
||||
self._start_frame = start_frame
|
||||
self._end_frame = end_frame
|
||||
self._accumulator_frame = accumulator_frame
|
||||
self._pass_through = pass_through
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, self._start_frame):
|
||||
self.aggregating = True
|
||||
elif isinstance(frame, self._end_frame):
|
||||
self.aggregating = False
|
||||
# Sometimes VAD triggers quickly on and off. If we don't get any transcription,
|
||||
# it creates empty LLM message queue frames
|
||||
if len(self.aggregation) > 0:
|
||||
output = self.aggregation
|
||||
self.aggregation = ""
|
||||
yield self._end_frame()
|
||||
yield TextFrame(output.strip())
|
||||
elif isinstance(frame, self._accumulator_frame) and self.aggregating:
|
||||
self.aggregation += f" {frame.text}"
|
||||
if self._pass_through:
|
||||
yield frame
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
class UserResponseAggregator(ResponseAggregator):
|
||||
def __init__(self):
|
||||
super().__init__(
|
||||
start_frame=UserStartedSpeakingFrame,
|
||||
end_frame=UserStoppedSpeakingFrame,
|
||||
accumulator_frame=TranscriptionFrame,
|
||||
pass_through=False,
|
||||
)
|
||||
|
||||
|
||||
class LLMResponseAggregator(FrameProcessor):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -66,7 +141,7 @@ class ResponseAggregator(FrameProcessor):
|
||||
yield frame
|
||||
|
||||
|
||||
class LLMResponseAggregator(ResponseAggregator):
|
||||
class LLMAssistantResponseAggregator(LLMResponseAggregator):
|
||||
def __init__(self, messages: list[dict]):
|
||||
super().__init__(
|
||||
messages=messages,
|
||||
@@ -77,7 +152,7 @@ class LLMResponseAggregator(ResponseAggregator):
|
||||
)
|
||||
|
||||
|
||||
class UserResponseAggregator(ResponseAggregator):
|
||||
class LLMUserResponseAggregator(LLMResponseAggregator):
|
||||
def __init__(self, messages: list[dict]):
|
||||
super().__init__(
|
||||
messages=messages,
|
||||
@@ -360,7 +435,7 @@ class GatedAggregator(FrameProcessor):
|
||||
... start_open=False)
|
||||
>>> asyncio.run(print_frames(aggregator, TextFrame("Hello")))
|
||||
>>> asyncio.run(print_frames(aggregator, TextFrame("Hello again.")))
|
||||
>>> asyncio.run(print_frames(aggregator, ImageFrame(url='', image=bytes([]))))
|
||||
>>> asyncio.run(print_frames(aggregator, ImageFrame(image=bytes([]), size=(0, 0))))
|
||||
ImageFrame
|
||||
Hello
|
||||
Hello again.
|
||||
@@ -390,3 +465,37 @@ class GatedAggregator(FrameProcessor):
|
||||
self.accumulator = []
|
||||
else:
|
||||
self.accumulator.append(frame)
|
||||
|
||||
|
||||
class VisionImageFrameAggregator(FrameProcessor):
|
||||
"""This aggregator waits for a consecutive TextFrame and an
|
||||
ImageFrame. After the ImageFrame arrives it will output a VisionImageFrame.
|
||||
|
||||
>>> from dailyai.pipeline.frames import ImageFrame
|
||||
|
||||
>>> async def print_frames(aggregator, frame):
|
||||
... async for frame in aggregator.process_frame(frame):
|
||||
... print(frame)
|
||||
|
||||
>>> aggregator = VisionImageFrameAggregator()
|
||||
>>> asyncio.run(print_frames(aggregator, TextFrame("What do you see?")))
|
||||
>>> asyncio.run(print_frames(aggregator, ImageFrame(image=bytes([]), size=(0, 0))))
|
||||
VisionImageFrame, text: What do you see?, image size: 0x0, buffer size: 0 B
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self._describe_text = None
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, TextFrame):
|
||||
self._describe_text = frame.text
|
||||
elif isinstance(frame, ImageFrame):
|
||||
if self._describe_text:
|
||||
yield VisionImageFrame(self._describe_text, frame.image, frame.size)
|
||||
self._describe_text = None
|
||||
else:
|
||||
yield frame
|
||||
else:
|
||||
yield frame
|
||||
|
||||
@@ -70,11 +70,66 @@ class AudioFrame(Frame):
|
||||
class ImageFrame(Frame):
|
||||
"""An image. Will be shown by the transport if the transport's camera is
|
||||
enabled."""
|
||||
url: str | None
|
||||
image: bytes
|
||||
size: tuple[int, int]
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.__class__.__name__}, url: {self.url}, image size: {len(self.image)} B"
|
||||
return f"{self.__class__.__name__}, image size: {self.size[0]}x{self.size[1]} buffer size: {len(self.image)} B"
|
||||
|
||||
|
||||
@dataclass()
|
||||
class URLImageFrame(ImageFrame):
|
||||
"""An image with an associated URL. Will be shown by the transport if the
|
||||
transport's camera is enabled.
|
||||
|
||||
"""
|
||||
url: str | None
|
||||
|
||||
def __init__(self, url, image, size):
|
||||
super().__init__(image, size)
|
||||
self.url = url
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.__class__.__name__}, url: {self.url}, image size: {self.size[0]}x{self.size[1]}, buffer size: {len(self.image)} B"
|
||||
|
||||
|
||||
@dataclass()
|
||||
class VisionImageFrame(ImageFrame):
|
||||
"""An image with an associated text to ask for a description of it. Will be shown by the
|
||||
transport if the transport's camera is enabled.
|
||||
|
||||
"""
|
||||
text: str | None
|
||||
|
||||
def __init__(self, text, image, size):
|
||||
super().__init__(image, size)
|
||||
self.text = text
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.__class__.__name__}, text: {self.text}, image size: {self.size[0]}x{self.size[1]}, buffer size: {len(self.image)} B"
|
||||
|
||||
|
||||
@dataclass()
|
||||
class UserImageFrame(ImageFrame):
|
||||
"""An image associated to a user. Will be shown by the transport if the transport's camera is
|
||||
enabled."""
|
||||
user_id: str
|
||||
|
||||
def __init__(self, user_id, image, size):
|
||||
super().__init__(image, size)
|
||||
self.user_id = user_id
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.__class__.__name__}, user: {self.user_id}, image size: {self.size[0]}x{self.size[1]}, buffer size: {len(self.image)} B"
|
||||
|
||||
|
||||
@dataclass()
|
||||
class UserImageRequestFrame(Frame):
|
||||
"""A frame user to request an image from the given user."""
|
||||
user_id: str
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.__class__.__name__}, user: {self.user_id}"
|
||||
|
||||
|
||||
@dataclass()
|
||||
@@ -144,10 +199,10 @@ class ReceivedAppMessageFrame(Frame):
|
||||
@dataclass()
|
||||
class SendAppMessageFrame(Frame):
|
||||
message: Any
|
||||
participantId: str | None
|
||||
participant_id: str | None
|
||||
|
||||
def __str__(self):
|
||||
return f"SendAppMessageFrame: participantId: {self.participantId}, message: {self.message}"
|
||||
return f"SendAppMessageFrame: participant: {self.participant_id}, message: {self.message}"
|
||||
|
||||
|
||||
class UserStartedSpeakingFrame(Frame):
|
||||
|
||||
@@ -14,6 +14,8 @@ from dailyai.pipeline.frames import (
|
||||
TTSStartFrame,
|
||||
TextFrame,
|
||||
TranscriptionFrame,
|
||||
URLImageFrame,
|
||||
VisionImageFrame,
|
||||
)
|
||||
|
||||
from abc import abstractmethod
|
||||
@@ -81,13 +83,12 @@ class TTSService(AIService):
|
||||
|
||||
|
||||
class ImageGenService(AIService):
|
||||
def __init__(self, image_size, **kwargs):
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.image_size = image_size
|
||||
|
||||
# Renders the image. Returns an Image object.
|
||||
@abstractmethod
|
||||
async def run_image_gen(self, sentence: str) -> tuple[str, bytes]:
|
||||
async def run_image_gen(self, prompt: str) -> tuple[str, bytes, tuple[int, int]]:
|
||||
pass
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
@@ -95,8 +96,27 @@ class ImageGenService(AIService):
|
||||
yield frame
|
||||
return
|
||||
|
||||
(url, image_data) = await self.run_image_gen(frame.text)
|
||||
yield ImageFrame(url, image_data)
|
||||
(url, image_data, image_size) = await self.run_image_gen(frame.text)
|
||||
yield URLImageFrame(url, image_data, image_size)
|
||||
|
||||
|
||||
class VisionService(AIService):
|
||||
"""VisionService is a base class for vision services."""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self._describe_text = None
|
||||
|
||||
@abstractmethod
|
||||
async def run_vision(self, frame: VisionImageFrame) -> str:
|
||||
pass
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, VisionImageFrame):
|
||||
description = await self.run_vision(frame)
|
||||
yield TextFrame(description)
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
class STTService(AIService):
|
||||
|
||||
@@ -19,7 +19,7 @@ try:
|
||||
except ModuleNotFoundError as e:
|
||||
print(f"Exception: {e}")
|
||||
print(
|
||||
"In order to use Azure TTS, you need to `pip install dailyai[azure]`. Also, set `SPEECH_KEY` and `SPEECH_REGION` environment variables.")
|
||||
"In order to use Azure TTS, you need to `pip install dailyai[azure]`. Also, set `AZURE_SPEECH_API_KEY` and `AZURE_SPEECH_REGION` environment variables.")
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
from dailyai.services.openai_api_llm_service import BaseOpenAILLMService
|
||||
@@ -97,23 +97,24 @@ class AzureImageGenServiceREST(ImageGenService):
|
||||
endpoint,
|
||||
model,
|
||||
):
|
||||
super().__init__(image_size=image_size)
|
||||
super().__init__()
|
||||
|
||||
self._api_key = api_key
|
||||
self._azure_endpoint = endpoint
|
||||
self._api_version = api_version
|
||||
self._model = model
|
||||
self._aiohttp_session = aiohttp_session
|
||||
self._image_size = image_size
|
||||
|
||||
async def run_image_gen(self, sentence) -> tuple[str, bytes]:
|
||||
async def run_image_gen(self, prompt: str) -> tuple[str, bytes, tuple[int, int]]:
|
||||
url = f"{self._azure_endpoint}openai/images/generations:submit?api-version={self._api_version}"
|
||||
headers = {
|
||||
"api-key": self._api_key,
|
||||
"Content-Type": "application/json"}
|
||||
body = {
|
||||
# Enter your prompt text here
|
||||
"prompt": sentence,
|
||||
"size": self.image_size,
|
||||
"prompt": prompt,
|
||||
"size": self._image_size,
|
||||
"n": 1,
|
||||
}
|
||||
async with self._aiohttp_session.post(
|
||||
@@ -146,4 +147,4 @@ class AzureImageGenServiceREST(ImageGenService):
|
||||
async with self._aiohttp_session.get(image_url) as response:
|
||||
image_stream = io.BytesIO(await response.content.read())
|
||||
image = Image.open(image_stream)
|
||||
return (image_url, image.tobytes())
|
||||
return (image_url, image.tobytes(), image.size)
|
||||
|
||||
@@ -3,6 +3,8 @@ import asyncio
|
||||
import io
|
||||
import os
|
||||
from PIL import Image
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional, Union, Dict
|
||||
|
||||
from dailyai.services.ai_services import ImageGenService
|
||||
|
||||
@@ -16,30 +18,44 @@ except ModuleNotFoundError as e:
|
||||
|
||||
|
||||
class FalImageGenService(ImageGenService):
|
||||
class InputParams(BaseModel):
|
||||
seed: Optional[int] = None
|
||||
num_inference_steps: int = 4
|
||||
num_images: int = 1
|
||||
image_size: Union[str, Dict[str, int]] = "square_hd"
|
||||
expand_prompt: bool = False
|
||||
enable_safety_checker: bool = True
|
||||
format: str = "png"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
image_size,
|
||||
aiohttp_session: aiohttp.ClientSession,
|
||||
params: InputParams,
|
||||
model="fal-ai/fast-sdxl",
|
||||
key_id=None,
|
||||
key_secret=None
|
||||
):
|
||||
super().__init__(image_size)
|
||||
super().__init__()
|
||||
self._model = model
|
||||
self._params = params
|
||||
self._aiohttp_session = aiohttp_session
|
||||
if key_id:
|
||||
os.environ["FAL_KEY_ID"] = key_id
|
||||
if key_secret:
|
||||
os.environ["FAL_KEY_SECRET"] = key_secret
|
||||
|
||||
async def run_image_gen(self, sentence) -> tuple[str, bytes]:
|
||||
def get_image_url(sentence, size):
|
||||
handler = fal.apps.submit(
|
||||
"110602490-fast-sdxl",
|
||||
# "fal-ai/fast-sdxl",
|
||||
arguments={"prompt": sentence},
|
||||
async def run_image_gen(self, prompt: str) -> tuple[str, bytes, tuple[int, int]]:
|
||||
def get_image_url(prompt):
|
||||
handler = fal.apps.submit( # type: ignore
|
||||
self._model,
|
||||
arguments={
|
||||
"prompt": prompt,
|
||||
**self._params.dict(),
|
||||
},
|
||||
)
|
||||
for event in handler.iter_events():
|
||||
if isinstance(event, fal.apps.InProgress):
|
||||
if isinstance(event, fal.apps.InProgress): # type: ignore
|
||||
pass
|
||||
|
||||
result = handler.get()
|
||||
@@ -50,9 +66,10 @@ class FalImageGenService(ImageGenService):
|
||||
|
||||
return image_url
|
||||
|
||||
image_url = await asyncio.to_thread(get_image_url, sentence, self.image_size)
|
||||
image_url = await asyncio.to_thread(get_image_url, prompt)
|
||||
|
||||
# Load the image from the url
|
||||
async with self._aiohttp_session.get(image_url) as response:
|
||||
image_stream = io.BytesIO(await response.content.read())
|
||||
image = Image.open(image_stream)
|
||||
return (image_url, image.tobytes())
|
||||
return (image_url, image.tobytes(), image.size)
|
||||
|
||||
52
src/dailyai/services/moondream_ai_service.py
Normal file
52
src/dailyai/services/moondream_ai_service.py
Normal file
@@ -0,0 +1,52 @@
|
||||
from dailyai.pipeline.frames import ImageFrame, VisionImageFrame
|
||||
from dailyai.services.ai_services import VisionService
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
def detect_device():
|
||||
"""
|
||||
Detects the appropriate device to run on, and return the device and dtype.
|
||||
"""
|
||||
if torch.cuda.is_available():
|
||||
return torch.device("cuda"), torch.float16
|
||||
elif torch.backends.mps.is_available():
|
||||
return torch.device("mps"), torch.float16
|
||||
else:
|
||||
return torch.device("cpu"), torch.float32
|
||||
|
||||
|
||||
class MoondreamService(VisionService):
|
||||
def __init__(
|
||||
self,
|
||||
model_id="vikhyatk/moondream2",
|
||||
revision="2024-04-02",
|
||||
use_cpu=False
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
if not use_cpu:
|
||||
device, dtype = detect_device()
|
||||
else:
|
||||
device = torch.device("cpu")
|
||||
dtype = torch.float32
|
||||
|
||||
self._tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
|
||||
|
||||
self._model = AutoModelForCausalLM.from_pretrained(
|
||||
model_id, trust_remote_code=True, revision=revision
|
||||
).to(device=device, dtype=dtype)
|
||||
self._model.eval()
|
||||
|
||||
async def run_vision(self, frame: VisionImageFrame) -> str:
|
||||
image = Image.frombytes("RGB", (frame.size[0], frame.size[1]), frame.image)
|
||||
image_embeds = self._model.encode_image(image)
|
||||
description = self._model.answer_question(
|
||||
image_embeds=image_embeds,
|
||||
question=frame.text,
|
||||
tokenizer=self._tokenizer)
|
||||
return description
|
||||
@@ -1,3 +1,4 @@
|
||||
from typing import Literal
|
||||
import aiohttp
|
||||
from PIL import Image
|
||||
import io
|
||||
@@ -26,24 +27,25 @@ class OpenAIImageGenService(ImageGenService):
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
image_size: str,
|
||||
image_size: Literal["256x256", "512x512", "1024x1024", "1792x1024", "1024x1792"],
|
||||
aiohttp_session: aiohttp.ClientSession,
|
||||
api_key,
|
||||
model="dall-e-3",
|
||||
):
|
||||
super().__init__(image_size=image_size)
|
||||
super().__init__()
|
||||
self._model = model
|
||||
self._image_size = image_size
|
||||
self._client = AsyncOpenAI(api_key=api_key)
|
||||
self._aiohttp_session = aiohttp_session
|
||||
|
||||
async def run_image_gen(self, sentence) -> tuple[str, bytes]:
|
||||
self.logger.info("Generating OpenAI image", sentence)
|
||||
async def run_image_gen(self, prompt: str) -> tuple[str, bytes, tuple[int, int]]:
|
||||
self.logger.info("Generating OpenAI image", prompt)
|
||||
|
||||
image = await self._client.images.generate(
|
||||
prompt=sentence,
|
||||
prompt=prompt,
|
||||
model=self._model,
|
||||
n=1,
|
||||
size=self.image_size
|
||||
size=self._image_size
|
||||
)
|
||||
image_url = image.data[0].url
|
||||
if not image_url:
|
||||
@@ -53,4 +55,4 @@ class OpenAIImageGenService(ImageGenService):
|
||||
async with self._aiohttp_session.get(image_url) as response:
|
||||
image_stream = io.BytesIO(await response.content.read())
|
||||
image = Image.open(image_stream)
|
||||
return (image_url, image.tobytes())
|
||||
return (image_url, image.tobytes(), image.size)
|
||||
|
||||
@@ -19,7 +19,7 @@ class MockAIService(AIService):
|
||||
image_stream = io.BytesIO(response.content)
|
||||
image = Image.open(image_stream)
|
||||
time.sleep(1)
|
||||
return (image_url, image)
|
||||
return (image_url, image.tobytes(), image.size)
|
||||
|
||||
def run_llm(self, messages, latest_user_message=None, stream=True):
|
||||
for i in range(5):
|
||||
|
||||
@@ -21,9 +21,10 @@ class AbstractTransport:
|
||||
self._camera_enabled = kwargs.get("camera_enabled") or False
|
||||
self._camera_width = kwargs.get("camera_width") or 1024
|
||||
self._camera_height = kwargs.get("camera_height") or 768
|
||||
self._camera_bitrate = kwargs.get("camera_bitrate") or 250000
|
||||
self._camera_framerate = kwargs.get("camera_framerate") or 10
|
||||
self._speaker_enabled = kwargs.get("speaker_enabled") or False
|
||||
self._speaker_sample_rate = kwargs.get("speaker_sample_rate") or 16000
|
||||
self._fps = kwargs.get("fps") or 8
|
||||
|
||||
self._logger: logging.Logger = logging.getLogger("dailyai.transport")
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@ import asyncio
|
||||
import inspect
|
||||
import logging
|
||||
import signal
|
||||
import time
|
||||
import threading
|
||||
import types
|
||||
|
||||
@@ -11,6 +12,7 @@ from typing import Any
|
||||
from dailyai.pipeline.frames import (
|
||||
ReceivedAppMessageFrame,
|
||||
TranscriptionFrame,
|
||||
UserImageFrame,
|
||||
)
|
||||
|
||||
from threading import Event
|
||||
@@ -58,6 +60,7 @@ class DailyTransport(ThreadedTransport, EventHandler):
|
||||
bot_name: str,
|
||||
min_others_count: int = 1,
|
||||
start_transcription: bool = False,
|
||||
video_rendering_enabled: bool = False,
|
||||
**kwargs,
|
||||
):
|
||||
kwargs['has_webrtc_vad'] = True
|
||||
@@ -69,6 +72,7 @@ class DailyTransport(ThreadedTransport, EventHandler):
|
||||
self._token: str | None = token
|
||||
self._min_others_count = min_others_count
|
||||
self._start_transcription = start_transcription
|
||||
self._video_rendering_enabled = video_rendering_enabled
|
||||
|
||||
self._is_interrupted = Event()
|
||||
self._stop_threads = Event()
|
||||
@@ -76,6 +80,8 @@ class DailyTransport(ThreadedTransport, EventHandler):
|
||||
self._other_participant_has_joined = False
|
||||
self._my_participant_id = None
|
||||
|
||||
self._video_renderers = {}
|
||||
|
||||
self.transcription_settings = {
|
||||
"language": "en",
|
||||
"tier": "nova",
|
||||
@@ -156,16 +162,24 @@ class DailyTransport(ThreadedTransport, EventHandler):
|
||||
return decorator
|
||||
|
||||
def write_frame_to_camera(self, frame: bytes):
|
||||
self.camera.write_frame(frame)
|
||||
if self._camera_enabled:
|
||||
self.camera.write_frame(frame)
|
||||
|
||||
def write_frame_to_mic(self, frame: bytes):
|
||||
self.mic.write_frames(frame)
|
||||
if self._mic_enabled:
|
||||
self.mic.write_frames(frame)
|
||||
|
||||
def send_app_message(self, message: Any, participantId: str | None):
|
||||
self.client.send_app_message(message, participantId)
|
||||
def request_participant_image(self, participant_id: str):
|
||||
if participant_id in self._video_renderers:
|
||||
self._video_renderers[participant_id]["render_next_frame"] = True
|
||||
|
||||
def send_app_message(self, message: Any, participant_id: str | None):
|
||||
self.client.send_app_message(message, participant_id)
|
||||
|
||||
def read_audio_frames(self, desired_frame_count):
|
||||
bytes = self._speaker.read_frames(desired_frame_count)
|
||||
bytes = b""
|
||||
if self._speaker_enabled or self._vad_enabled:
|
||||
bytes = self._speaker.read_frames(desired_frame_count)
|
||||
return bytes
|
||||
|
||||
def _prerun(self):
|
||||
@@ -222,9 +236,9 @@ class DailyTransport(ThreadedTransport, EventHandler):
|
||||
"maxQuality": "low",
|
||||
"encodings": {
|
||||
"low": {
|
||||
"maxBitrate": 250000,
|
||||
"maxBitrate": self._camera_bitrate,
|
||||
"scaleResolutionDownBy": 1.333,
|
||||
"maxFramerate": 8,
|
||||
"maxFramerate": self._camera_framerate,
|
||||
}
|
||||
},
|
||||
}
|
||||
@@ -234,9 +248,12 @@ class DailyTransport(ThreadedTransport, EventHandler):
|
||||
)
|
||||
self._my_participant_id = self.client.participants()["local"]["id"]
|
||||
|
||||
# For performance reasons, never subscribe to video streams (unless a
|
||||
# video renderer is registered).
|
||||
self.client.update_subscription_profiles({
|
||||
"base": {
|
||||
"camera": "unsubscribed",
|
||||
"screenVideo": "unsubscribed"
|
||||
}
|
||||
})
|
||||
|
||||
@@ -255,7 +272,7 @@ class DailyTransport(ThreadedTransport, EventHandler):
|
||||
self.client.leave()
|
||||
self.client.release()
|
||||
|
||||
def on_first_other_participant_joined(self):
|
||||
def on_first_other_participant_joined(self, participant):
|
||||
pass
|
||||
|
||||
def call_joined(self, join_data, client_error):
|
||||
@@ -268,6 +285,59 @@ class DailyTransport(ThreadedTransport, EventHandler):
|
||||
def start_recording(self):
|
||||
self.client.start_recording()
|
||||
|
||||
def render_participant_video(self,
|
||||
participant_id,
|
||||
framerate=10,
|
||||
video_source="camera",
|
||||
color_format="RGB") -> None:
|
||||
if not self._video_rendering_enabled:
|
||||
self._logger.warn("Video rendering is not enabled")
|
||||
return
|
||||
|
||||
# Only enable camera subscription on this participant
|
||||
self.client.update_subscriptions(participant_settings={
|
||||
participant_id: {
|
||||
"media": {
|
||||
video_source: "subscribed"
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
self._video_renderers[participant_id] = {
|
||||
"framerate": framerate,
|
||||
"timestamp": 0,
|
||||
"render_next_frame": False,
|
||||
}
|
||||
self.client.set_video_renderer(
|
||||
participant_id,
|
||||
self.on_participant_video_frame,
|
||||
video_source=video_source,
|
||||
color_format=color_format)
|
||||
|
||||
def on_participant_video_frame(self, participant_id, video_frame):
|
||||
if not self._loop:
|
||||
return
|
||||
|
||||
render_frame = False
|
||||
|
||||
curr_time = time.time()
|
||||
framerate = self._video_renderers[participant_id]["framerate"]
|
||||
|
||||
if framerate > 0:
|
||||
prev_time = self._video_renderers[participant_id]["timestamp"]
|
||||
next_time = prev_time + 1 / framerate
|
||||
render_frame = curr_time > next_time
|
||||
elif self._video_renderers[participant_id]["render_next_frame"]:
|
||||
self._video_renderers[participant_id]["render_next_frame"] = False
|
||||
render_frame = True
|
||||
|
||||
if render_frame:
|
||||
frame = UserImageFrame(participant_id, video_frame.buffer,
|
||||
(video_frame.width, video_frame.height))
|
||||
asyncio.run_coroutine_threadsafe(self.receive_queue.put(frame), self._loop)
|
||||
|
||||
self._video_renderers[participant_id]["timestamp"] = curr_time
|
||||
|
||||
def on_error(self, error):
|
||||
self._logger.error(f"on_error: {error}")
|
||||
|
||||
@@ -277,7 +347,7 @@ class DailyTransport(ThreadedTransport, EventHandler):
|
||||
def on_participant_joined(self, participant):
|
||||
if not self._other_participant_has_joined and participant["id"] != self._my_participant_id:
|
||||
self._other_participant_has_joined = True
|
||||
self.on_first_other_participant_joined()
|
||||
self.on_first_other_participant_joined(participant)
|
||||
|
||||
def on_participant_left(self, participant, reason):
|
||||
if len(self.client.participants()) < self._min_others_count + 1:
|
||||
@@ -286,7 +356,6 @@ class DailyTransport(ThreadedTransport, EventHandler):
|
||||
def on_app_message(self, message: Any, sender: str):
|
||||
if self._loop:
|
||||
frame = ReceivedAppMessageFrame(message, sender)
|
||||
print(frame)
|
||||
asyncio.run_coroutine_threadsafe(
|
||||
self.receive_queue.put(frame), self._loop
|
||||
)
|
||||
|
||||
@@ -19,6 +19,7 @@ class LocalTransport(ThreadedTransport):
|
||||
self._sample_width = kwargs.get("sample_width") or 2
|
||||
self._n_channels = kwargs.get("n_channels") or 1
|
||||
self._tk_root = kwargs.get("tk_root") or None
|
||||
self._pyaudio = None
|
||||
|
||||
if self._camera_enabled and not self._tk_root:
|
||||
raise ValueError(
|
||||
@@ -51,7 +52,7 @@ class LocalTransport(ThreadedTransport):
|
||||
if self._mic_enabled:
|
||||
self._audio_stream.write(frame)
|
||||
|
||||
def read_frames(self, desired_frame_count):
|
||||
def read_audio_frames(self, desired_frame_count):
|
||||
bytes = b""
|
||||
if self._speaker_enabled:
|
||||
bytes = self._speaker_stream.read(
|
||||
@@ -62,7 +63,8 @@ class LocalTransport(ThreadedTransport):
|
||||
|
||||
def _prerun(self):
|
||||
if self._mic_enabled:
|
||||
self._pyaudio = pyaudio.PyAudio()
|
||||
if not self._pyaudio:
|
||||
self._pyaudio = pyaudio.PyAudio()
|
||||
self._audio_stream = self._pyaudio.open(
|
||||
format=self._pyaudio.get_format_from_width(self._sample_width),
|
||||
channels=self._n_channels,
|
||||
@@ -84,6 +86,8 @@ class LocalTransport(ThreadedTransport):
|
||||
self._image_label.pack()
|
||||
|
||||
if self._speaker_enabled:
|
||||
if not self._pyaudio:
|
||||
self._pyaudio = pyaudio.PyAudio()
|
||||
self._speaker_stream = self._pyaudio.open(
|
||||
format=self._pyaudio.get_format_from_width(self._sample_width),
|
||||
channels=self._n_channels,
|
||||
|
||||
@@ -20,6 +20,7 @@ from dailyai.pipeline.frames import (
|
||||
SpriteFrame,
|
||||
StartFrame,
|
||||
TextFrame,
|
||||
UserImageRequestFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
)
|
||||
@@ -382,7 +383,11 @@ class ThreadedTransport(AbstractTransport):
|
||||
def _set_images(self, images: list[bytes], start_frame=0):
|
||||
self._images = itertools.cycle(images)
|
||||
|
||||
def send_app_message(self, message: Any, participantId: str | None):
|
||||
def request_participant_image(self, participant_id: str):
|
||||
""" Child classes should override this to force an image from a user. """
|
||||
pass
|
||||
|
||||
def send_app_message(self, message: Any, participant_id: str | None):
|
||||
""" Child classes should override this to send a custom message to the room. """
|
||||
pass
|
||||
|
||||
@@ -393,7 +398,7 @@ class ThreadedTransport(AbstractTransport):
|
||||
this_frame = next(self._images)
|
||||
self.write_frame_to_camera(this_frame)
|
||||
|
||||
time.sleep(1.0 / self._fps)
|
||||
time.sleep(1.0 / self._camera_framerate)
|
||||
except Exception as e:
|
||||
self._logger.error(f"Exception {e} in camera thread.")
|
||||
raise e
|
||||
@@ -458,9 +463,10 @@ class ThreadedTransport(AbstractTransport):
|
||||
self._set_image(frame.image)
|
||||
elif isinstance(frame, SpriteFrame):
|
||||
self._set_images(frame.images)
|
||||
elif isinstance(frame, UserImageRequestFrame):
|
||||
self.request_participant_image(frame.user_id)
|
||||
elif isinstance(frame, SendAppMessageFrame):
|
||||
self.send_app_message(
|
||||
frame.message, frame.participantId)
|
||||
self.send_app_message(frame.message, frame.participant_id)
|
||||
elif len(b):
|
||||
self.write_frame_to_mic(bytes(b))
|
||||
b = bytearray()
|
||||
|
||||
@@ -54,13 +54,13 @@ class TestDailyFrameAggregators(unittest.IsolatedAsyncioTestCase):
|
||||
TextFrame("Hello, "),
|
||||
TextFrame("world."),
|
||||
AudioFrame(b"hello"),
|
||||
ImageFrame("image", b"image"),
|
||||
ImageFrame(b"image", (0, 0)),
|
||||
AudioFrame(b"world"),
|
||||
LLMResponseEndFrame(),
|
||||
]
|
||||
|
||||
expected_output_frames = [
|
||||
ImageFrame("image", b"image"),
|
||||
ImageFrame(b"image", (0, 0)),
|
||||
LLMResponseStartFrame(),
|
||||
TextFrame("Hello, "),
|
||||
TextFrame("world."),
|
||||
|
||||
@@ -11,11 +11,11 @@ class TestDailyTransport(unittest.IsolatedAsyncioTestCase):
|
||||
was_called = False
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
def test_event_handler(transport):
|
||||
def test_event_handler(transport, participant):
|
||||
nonlocal was_called
|
||||
was_called = True
|
||||
|
||||
transport.on_first_other_participant_joined()
|
||||
transport.on_first_other_participant_joined({"id": "user-id"})
|
||||
|
||||
self.assertTrue(was_called)
|
||||
|
||||
@@ -29,7 +29,7 @@ class TestDailyTransport(unittest.IsolatedAsyncioTestCase):
|
||||
event = asyncio.Event()
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def test_event_handler(transport):
|
||||
async def test_event_handler(transport, participant):
|
||||
nonlocal event
|
||||
print("sleeping")
|
||||
await asyncio.sleep(0.1)
|
||||
@@ -68,7 +68,7 @@ class TestDailyTransport(unittest.IsolatedAsyncioTestCase):
|
||||
await transport.send_queue.put(AudioFrame(bytes([0] * 3300)))
|
||||
|
||||
async def send_video_frame():
|
||||
await transport.send_queue.put(ImageFrame(None, b"test"))
|
||||
await transport.send_queue.put(ImageFrame(b"test", (0, 0)))
|
||||
|
||||
await asyncio.gather(transport.run(), send_audio_frame(), send_video_frame())
|
||||
|
||||
|
||||
Reference in New Issue
Block a user