Compare commits

...

20 Commits

Author SHA1 Message Date
Aleix Conchillo Flaqué
b678097f6d Merge pull request #109 from daily-co/only-use-fps
transport: only use fps to set maxFramerate
2024-04-07 07:02:44 +08:00
Aleix Conchillo Flaqué
eb455043c4 transport: use camera_bitrate and camera_framerate 2024-04-06 12:27:05 -07:00
Aleix Conchillo Flaqué
dd696be04c Merge pull request #108 from daily-co/add-camera-max-framerate
transport: add camera_max_framerate argument
2024-04-06 11:18:42 +08:00
Aleix Conchillo Flaqué
96b2337183 transport: add camera_max_framerate argument 2024-04-05 20:16:03 -07:00
Aleix Conchillo Flaqué
ea52e73f57 Merge pull request #107 from daily-co/increase-max-framerate
transport: increase daily maxFramerate to 30
2024-04-06 11:08:21 +08:00
Aleix Conchillo Flaqué
88404e4739 Merge pull request #106 from daily-co/updated-to-be-updated-examples
examples: updated to_be_updated examples
2024-04-06 11:06:30 +08:00
Aleix Conchillo Flaqué
0fd323714e transport: add camera_max_bitrate argument 2024-04-05 20:05:58 -07:00
Aleix Conchillo Flaqué
a362ca4d3d transport: increase daily maxFramerate to 30 2024-04-05 19:44:25 -07:00
Aleix Conchillo Flaqué
02b5c3dd5f update dot-env.template 2024-04-05 16:16:56 -07:00
Aleix Conchillo Flaqué
497a09cbc8 examples: updated to_be_updated examples 2024-04-05 16:01:23 -07:00
Aleix Conchillo Flaqué
172a14245d Merge pull request #104 from daily-co/threaded-transport-allow-sink-override
examples: fix whisper examples
2024-04-06 04:46:12 +08:00
Aleix Conchillo Flaqué
302246399b Merge pull request #105 from daily-co/local-tranport-read-audio-frames
transports: fix local transport read_audio_frames
2024-04-06 04:44:37 +08:00
Aleix Conchillo Flaqué
9590cc2fbc examples: fix whisper examples 2024-04-05 13:43:51 -07:00
Aleix Conchillo Flaqué
09e4044c72 transports: fix local transport read_audio_frames 2024-04-05 13:34:01 -07:00
Aleix Conchillo Flaqué
efdfb74dc3 github: increase fetch-depth to 100 for test publish 2024-04-05 08:32:29 -07:00
Aleix Conchillo Flaqué
158de6f20b github: fetch-tags and increase fetch-depth for test publish 2024-04-05 08:25:37 -07:00
Aleix Conchillo Flaqué
47f68b742d pyproject: use proper environment for test pypi 2024-04-05 08:02:45 -07:00
Aleix Conchillo Flaqué
2654ca1f62 pyproject: don't use local version for test pypi 2024-04-05 07:51:52 -07:00
Aleix Conchillo Flaqué
4263827ee8 README: use double-quotes with optional dependencies 2024-04-04 17:47:16 -07:00
Aleix Conchillo Flaqué
97fe529b0e github: update test publish workflow 2024-04-04 17:41:31 -07:00
16 changed files with 142 additions and 150 deletions

View File

@@ -15,6 +15,8 @@ jobs:
uses: actions/checkout@v4
with:
ref: ${{ github.event.inputs.gitref }}
fetch-tags: true
fetch-depth: 100
- name: Set up Python
id: setup_python
uses: actions/setup-python@v4
@@ -35,6 +37,7 @@ jobs:
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels
path: ./dist
publish-to-pypi:
@@ -42,7 +45,7 @@ jobs:
runs-on: ubuntu-latest
needs: [ build ]
environment:
name: pypi
name: testpypi
url: https://pypi.org/p/dailyai
permissions:
id-token: write
@@ -50,6 +53,7 @@ jobs:
- name: Download wheels
uses: actions/download-artifact@v4
with:
name: wheels
path: ./dist
- name: Publish to PyPI
uses: pypa/gh-action-pypi-publish@release/v1

1
.gitignore vendored
View File

@@ -3,6 +3,7 @@ env/
__pycache__/
*~
venv
.venv
#*#
# Distribution / packaging

View File

@@ -58,7 +58,7 @@ By default, in order to minimize dependencies, only the basic framework function
dependencies that you can install with:
```
pip install dailyai[option,...]
pip install "dailyai[option,...]"
```
Your project may or may not need these, so they're made available as optional requirements. Here is a list:

View File

@@ -2,8 +2,16 @@
ANTHROPIC_API_KEY=...
# Azure
SPEECH_KEY=...
SPEECH_REGION=...
AZURE_SPEECH_REGION=...
AZURE_SPEECH_API_KEY=...
AZURE_CHATGPT_API_KEY=...
AZURE_CHATGPT_ENDPOINT=https://...
AZURE_CHATGPT_MODEL=...
AZURE_DALLE_API_KEY=...
AZURE_DALLE_ENDPOINT=https://...
AZURE_DALLE_MODEL=...
# Daily
DAILY_API_KEY=...

View File

@@ -3,8 +3,9 @@ import asyncio
import logging
import tkinter as tk
import os
from dailyai.pipeline.aggregators import LLMFullResponseAggregator
from dailyai.pipeline.frames import AudioFrame, ImageFrame
from dailyai.pipeline.frames import AudioFrame, ImageFrame, LLMMessagesFrame, TextFrame
from dailyai.services.open_ai_services import OpenAILLMService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.fal_ai_services import FalImageGenService
@@ -22,7 +23,7 @@ async def main():
async with aiohttp.ClientSession() as session:
meeting_duration_minutes = 5
tk_root = tk.Tk()
tk_root.title("Calendar")
tk_root.title("dailyai")
transport = LocalTransport(
mic_enabled=True,
@@ -43,7 +44,7 @@ async def main():
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4-turbo-preview")
dalle = FalImageGenService(
imagegen = FalImageGenService(
image_size="1024x1024",
aiohttp_session=session,
key_id=os.getenv("FAL_KEY_ID"),
@@ -60,18 +61,33 @@ async def main():
return all_audio
async def get_month_description(aggregator, frame):
async for frame in aggregator.process_frame(frame):
if isinstance(frame, TextFrame):
return frame.text
async def get_month_data(month):
messages = [{"role": "system", "content": f"Describe a nature photograph suitable for use in a calendar, for the month of {
month}. Include only the image description with no preamble. Limit the description to one sentence, please.", }]
image_description = await llm.run_llm(messages)
messages_frame = LLMMessagesFrame(messages)
llm_full_response_aggregator = LLMFullResponseAggregator()
image_description = None
async for frame in llm.process_frame(messages_frame):
result = await get_month_description(llm_full_response_aggregator, frame)
if result:
image_description = result
break
if not image_description:
return
to_speak = f"{month}: {image_description}"
audio_task = asyncio.create_task(get_all_audio(to_speak))
image_task = asyncio.create_task(
dalle.run_image_gen(image_description))
imagegen.run_image_gen(image_description))
(audio, image_data) = await asyncio.gather(audio_task, image_task)
return {
@@ -82,19 +98,14 @@ async def main():
"audio": audio,
}
# We only specify 5 months as we create tasks all at once and we might
# get rate limited otherwise.
months: list[str] = [
"January",
"February",
"March",
"April",
"May",
"June",
"July",
"August",
"September",
"October",
"November",
"December",
]
async def show_images():

View File

@@ -5,7 +5,8 @@ from typing import AsyncGenerator
import aiohttp
from PIL import Image
from dailyai.pipeline.frames import ImageFrame, Frame
from dailyai.pipeline.frames import ImageFrame, Frame, TextFrame
from dailyai.pipeline.pipeline import Pipeline
from dailyai.transports.daily_transport import DailyTransport
from dailyai.services.ai_services import AIService
from dailyai.pipeline.aggregators import (
@@ -14,7 +15,6 @@ from dailyai.pipeline.aggregators import (
)
from dailyai.services.open_ai_services import OpenAILLMService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.fal_ai_services import FalImageGenService
from runner import configure
@@ -53,6 +53,7 @@ async def main(room_url: str, token):
transport._camera_height = 1024
transport._mic_enabled = True
transport._mic_sample_rate = 16000
transport.transcription_settings["extra"]["punctuate"] = True
tts = ElevenLabsTTSService(
aiohttp_session=session,
@@ -64,57 +65,30 @@ async def main(room_url: str, token):
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4-turbo-preview")
img = FalImageGenService(
image_size="1024x1024",
aiohttp_session=session,
key_id=os.getenv("FAL_KEY_ID"),
key_secret=os.getenv("FAL_KEY_SECRET"),
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so it should not include any special characters. Respond to what the user said in a creative and helpful way.",
},
]
tma_in = LLMUserContextAggregator(
messages, transport._my_participant_id)
tma_out = LLMAssistantContextAggregator(
messages, transport._my_participant_id
)
image_sync_aggregator = ImageSyncAggregator(
os.path.join(os.path.dirname(__file__), "assets", "speaking.png"),
os.path.join(os.path.dirname(__file__), "assets", "waiting.png"),
)
async def get_images():
get_speaking_task = asyncio.create_task(
img.run_image_gen("An image of a cat speaking")
)
get_waiting_task = asyncio.create_task(
img.run_image_gen("An image of a cat waiting")
)
(speaking_data, waiting_data) = await asyncio.gather(
get_speaking_task, get_waiting_task
)
return speaking_data, waiting_data
pipeline = Pipeline([image_sync_aggregator, tma_in, llm, tma_out, tts])
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
await tts.say("Hi, I'm listening!", transport.send_queue)
await pipeline.queue_frames([TextFrame("Hi, I'm listening!")])
async def handle_transcriptions():
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
},
]
tma_in = LLMUserContextAggregator(
messages, transport._my_participant_id)
tma_out = LLMAssistantContextAggregator(
messages, transport._my_participant_id
)
image_sync_aggregator = ImageSyncAggregator(
os.path.join(
os.path.dirname(__file__), "assets", "speaking.png"), os.path.join(
os.path.dirname(__file__), "assets", "waiting.png"), )
await tts.run_to_queue(
transport.send_queue,
image_sync_aggregator.run(
tma_out.run(llm.run(tma_in.run(transport.get_receive_frames())))
),
)
transport.transcription_settings["extra"]["punctuate"] = True
await asyncio.gather(transport.run(), handle_transcriptions())
await transport.run(pipeline)
if __name__ == "__main__":

View File

@@ -5,6 +5,7 @@ import os
import random
from typing import AsyncGenerator
from PIL import Image
from dailyai.pipeline.pipeline import Pipeline
from dailyai.transports.daily_transport import DailyTransport
from dailyai.services.open_ai_services import OpenAILLMService
@@ -133,6 +134,7 @@ async def main(room_url: str, token):
transport._camera_enabled = True
transport._camera_width = 720
transport._camera_height = 1280
transport.transcription_settings["extra"]["punctuate"] = True
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
@@ -145,45 +147,34 @@ async def main(room_url: str, token):
)
isa = ImageSyncAggregator()
messages = [
{
"role": "system",
"content": "You are Santa Cat, a cat that lives in Santa's workshop at the North Pole. You should be clever, and a bit sarcastic. You should also tell jokes every once in a while. Your responses should only be a few sentences long.",
},
]
tma_in = LLMUserContextAggregator(
messages, transport._my_participant_id)
tma_out = LLMAssistantContextAggregator(
messages, transport._my_participant_id
)
tf = TranscriptFilter(transport._my_participant_id)
ncf = NameCheckFilter(["Santa Cat", "Santa"])
pipeline = Pipeline([isa, tf, ncf, tma_in, llm, tma_out, tts])
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
await tts.say(
await transport.say(
"Hi! If you want to talk to me, just say 'hey Santa Cat'.",
transport.send_queue,
)
async def handle_transcriptions():
messages = [
{
"role": "system",
"content": "You are Santa Cat, a cat that lives in Santa's workshop at the North Pole. You should be clever, and a bit sarcastic. You should also tell jokes every once in a while. Your responses should only be a few sentences long.",
},
]
tma_in = LLMUserContextAggregator(
messages, transport._my_participant_id)
tma_out = LLMAssistantContextAggregator(
messages, transport._my_participant_id
)
tf = TranscriptFilter(transport._my_participant_id)
ncf = NameCheckFilter(["Santa Cat", "Santa"])
await tts.run_to_queue(
transport.send_queue,
isa.run(
tma_out.run(
llm.run(
tma_in.run(
ncf.run(tf.run(transport.get_receive_frames())))
)
)
),
tts,
)
async def starting_image():
await transport.send_queue.put(quiet_frame)
transport.transcription_settings["extra"]["punctuate"] = True
await asyncio.gather(transport.run(), handle_transcriptions(), starting_image())
await asyncio.gather(transport.run(pipeline), starting_image())
if __name__ == "__main__":

View File

@@ -3,6 +3,7 @@ import asyncio
import logging
import os
import wave
from dailyai.pipeline.pipeline import Pipeline
from dailyai.transports.daily_transport import DailyTransport
from dailyai.services.open_ai_services import OpenAILLMService
@@ -81,6 +82,7 @@ async def main(room_url: str, token):
mic_sample_rate=16000,
camera_enabled=False,
)
transport.transcription_settings["extra"]["punctuate"] = True
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
@@ -92,47 +94,31 @@ async def main(room_url: str, token):
voice_id="ErXwobaYiN019PkySvjV",
)
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
},
]
tma_in = LLMUserContextAggregator(
messages, transport._my_participant_id)
tma_out = LLMAssistantContextAggregator(
messages, transport._my_participant_id
)
out_sound = OutboundSoundEffectWrapper()
in_sound = InboundSoundEffectWrapper()
fl = FrameLogger("LLM Out")
fl2 = FrameLogger("Transcription In")
pipeline = Pipeline([tma_in, in_sound, fl2, llm, tma_out, fl, tts, out_sound])
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
await tts.say("Hi, I'm listening!", transport.send_queue)
await transport.say("Hi, I'm listening!", tts)
await transport.send_queue.put(AudioFrame(sounds["ding1.wav"]))
async def handle_transcriptions():
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
},
]
tma_in = LLMUserContextAggregator(
messages, transport._my_participant_id)
tma_out = LLMAssistantContextAggregator(
messages, transport._my_participant_id
)
out_sound = OutboundSoundEffectWrapper()
in_sound = InboundSoundEffectWrapper()
fl = FrameLogger("LLM Out")
fl2 = FrameLogger("Transcription In")
await out_sound.run_to_queue(
transport.send_queue,
tts.run(
fl.run(
tma_out.run(
llm.run(
fl2.run(
in_sound.run(
tma_in.run(transport.get_receive_frames())
)
)
)
)
)
),
)
transport.transcription_settings["extra"]["punctuate"] = True
await asyncio.gather(transport.run(), handle_transcriptions())
await asyncio.gather(transport.run(pipeline))
if __name__ == "__main__":

View File

@@ -1,12 +1,16 @@
import asyncio
import logging
from dailyai.pipeline.frames import EndFrame, TranscriptionFrame
from dailyai.transports.daily_transport import DailyTransport
from dailyai.services.whisper_ai_services import WhisperSTTService
from dailyai.pipeline.pipeline import Pipeline
from runner import configure
from dotenv import load_dotenv
load_dotenv(override=True)
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
@@ -26,17 +30,27 @@ async def main(room_url: str):
stt = WhisperSTTService()
transcription_output_queue = asyncio.Queue()
transport_done = asyncio.Event()
pipeline = Pipeline([stt])
pipeline.set_sink(transcription_output_queue)
pipeline = Pipeline([stt], source=transport.receive_queue, sink=transcription_output_queue)
async def handle_transcription():
print("`````````TRANSCRIPTION`````````")
while True:
while not transport_done.is_set():
item = await transcription_output_queue.get()
print(item.text)
print("got item from queue", item)
if isinstance(item, TranscriptionFrame):
print(item.text)
elif isinstance(item, EndFrame):
break
print("handle_transcription done")
await asyncio.gather(transport.run(pipeline), handle_transcription())
async def run_until_done():
await transport.run()
transport_done.set()
print("run_until_done done")
await asyncio.gather(run_until_done(), pipeline.run_pipeline(), handle_transcription())
if __name__ == "__main__":

View File

@@ -15,11 +15,10 @@ async def main():
meeting_duration_minutes = 1
transport = LocalTransport(
mic_enabled=False,
mic_enabled=True,
camera_enabled=False,
speaker_enabled=True,
duration_minutes=meeting_duration_minutes,
start_transcription=False,
)
stt = WhisperSTTService()
@@ -27,8 +26,7 @@ async def main():
transcription_output_queue = asyncio.Queue()
transport_done = asyncio.Event()
pipeline = Pipeline([stt])
pipeline.set_sink(transcription_output_queue)
pipeline = Pipeline([stt], source=transport.receive_queue, sink=transcription_output_queue)
async def handle_transcription():
print("`````````TRANSCRIPTION`````````")
@@ -42,11 +40,11 @@ async def main():
print("handle_transcription done")
async def run_until_done():
await transport.run(pipeline)
await transport.run()
transport_done.set()
print("run_until_done done")
await asyncio.gather(run_until_done(), handle_transcription())
await asyncio.gather(run_until_done(), pipeline.run_pipeline(), handle_transcription())
if __name__ == "__main__":

View File

@@ -51,4 +51,4 @@ where = ["src"]
pythonpath = ["src"]
[tool.setuptools_scm]
# Empty
local_scheme = "no-local-version"

View File

@@ -19,7 +19,7 @@ try:
except ModuleNotFoundError as e:
print(f"Exception: {e}")
print(
"In order to use Azure TTS, you need to `pip install dailyai[azure]`. Also, set `SPEECH_KEY` and `SPEECH_REGION` environment variables.")
"In order to use Azure TTS, you need to `pip install dailyai[azure]`. Also, set `AZURE_SPEECH_API_KEY` and `AZURE_SPEECH_REGION` environment variables.")
raise Exception(f"Missing module: {e}")
from dailyai.services.openai_api_llm_service import BaseOpenAILLMService

View File

@@ -21,9 +21,10 @@ class AbstractTransport:
self._camera_enabled = kwargs.get("camera_enabled") or False
self._camera_width = kwargs.get("camera_width") or 1024
self._camera_height = kwargs.get("camera_height") or 768
self._camera_bitrate = kwargs.get("camera_bitrate") or 250000
self._camera_framerate = kwargs.get("camera_framerate") or 10
self._speaker_enabled = kwargs.get("speaker_enabled") or False
self._speaker_sample_rate = kwargs.get("speaker_sample_rate") or 16000
self._fps = kwargs.get("fps") or 8
self._logger: logging.Logger = logging.getLogger("dailyai.transport")

View File

@@ -222,9 +222,9 @@ class DailyTransport(ThreadedTransport, EventHandler):
"maxQuality": "low",
"encodings": {
"low": {
"maxBitrate": 250000,
"maxBitrate": self._camera_bitrate,
"scaleResolutionDownBy": 1.333,
"maxFramerate": 8,
"maxFramerate": self._camera_framerate,
}
},
}

View File

@@ -19,6 +19,7 @@ class LocalTransport(ThreadedTransport):
self._sample_width = kwargs.get("sample_width") or 2
self._n_channels = kwargs.get("n_channels") or 1
self._tk_root = kwargs.get("tk_root") or None
self._pyaudio = None
if self._camera_enabled and not self._tk_root:
raise ValueError(
@@ -51,7 +52,7 @@ class LocalTransport(ThreadedTransport):
if self._mic_enabled:
self._audio_stream.write(frame)
def read_frames(self, desired_frame_count):
def read_audio_frames(self, desired_frame_count):
bytes = b""
if self._speaker_enabled:
bytes = self._speaker_stream.read(
@@ -62,7 +63,8 @@ class LocalTransport(ThreadedTransport):
def _prerun(self):
if self._mic_enabled:
self._pyaudio = pyaudio.PyAudio()
if not self._pyaudio:
self._pyaudio = pyaudio.PyAudio()
self._audio_stream = self._pyaudio.open(
format=self._pyaudio.get_format_from_width(self._sample_width),
channels=self._n_channels,
@@ -84,6 +86,8 @@ class LocalTransport(ThreadedTransport):
self._image_label.pack()
if self._speaker_enabled:
if not self._pyaudio:
self._pyaudio = pyaudio.PyAudio()
self._speaker_stream = self._pyaudio.open(
format=self._pyaudio.get_format_from_width(self._sample_width),
channels=self._n_channels,

View File

@@ -393,7 +393,7 @@ class ThreadedTransport(AbstractTransport):
this_frame = next(self._images)
self.write_frame_to_camera(this_frame)
time.sleep(1.0 / self._fps)
time.sleep(1.0 / self._camera_framerate)
except Exception as e:
self._logger.error(f"Exception {e} in camera thread.")
raise e