Compare commits

...

65 Commits

Author SHA1 Message Date
Moishe Lettvin
646db8b9bd cleanup continues 2024-01-26 07:57:41 -05:00
Moishe Lettvin
42c142aff0 ... 2024-01-25 14:55:51 -05:00
Moishe Lettvin
6da78dbf9c getting started on cleanup 2024-01-25 13:50:10 -05:00
Moishe Lettvin
f0d9b0613e Add faster_whisper to module dependencies; remove unneeded import 2024-01-25 11:27:00 -05:00
Moishe Lettvin
a661905d7f Merge pull request #9 from daily-co/interruptions
Interruptable conversation wrapper
2024-01-25 11:24:57 -05:00
Moishe Lettvin
c9c2e5f561 Remove unnecessary try/except 2024-01-25 11:18:55 -05:00
Moishe Lettvin
795a339542 Add InterruptibleConversationWrapper 2024-01-25 11:15:04 -05:00
Liza
31db156dfc Local Whisper transcription (#10)
* First pass at Whisper transcription

* deletions

* Revise based on feedback, add autopep8
2024-01-25 13:43:25 +01:00
Moishe Lettvin
690cf2e47d Merge pull request #8 from daily-co/queueframe-refactor
Refactor QueueFrame
2024-01-23 13:15:11 -05:00
Moishe Lettvin
ba89e41c5b remove commented-out code 2024-01-23 09:37:15 -05:00
Moishe Lettvin
c134598a77 Refactor QueueFrame 2024-01-23 09:33:51 -05:00
Liza
b51abd2969 facilitate manual call management (#7) 2024-01-23 14:33:27 +01:00
Moishe Lettvin
3fda9b0ecb Use more flexibile aggregator 2024-01-22 16:02:35 -05:00
Moishe Lettvin
95c92e5304 Aggregators for LLM messages 2024-01-22 10:59:13 -05:00
Moishe Lettvin
b443fbdb60 Very rough draft at intro/overview in README 2024-01-19 16:20:08 -05:00
Moishe Lettvin
ccd2fa31e5 Rename 'theoretical-to-real' samples to 'foundational' 2024-01-19 13:57:52 -05:00
Moishe Lettvin
9b65286216 Merge pull request #6 from daily-co/rm-sentence-aggregator
Cleanup: no more sentence aggregator
2024-01-19 13:42:27 -05:00
Moishe Lettvin
6ae733ebfe Cleanup: no more sentence aggregator, let the TTS service deal with that; also removed the queue typing stuff from ai_services 2024-01-19 13:06:15 -05:00
Liza
1071dede1a Only initialize Daily once (#5) 2024-01-19 14:59:48 +01:00
Moishe Lettvin
ceeb93dd46 Clean up example 5 2024-01-18 11:58:08 -05:00
Moishe Lettvin
f55333844e Merge pull request #4 from daily-co/service-refactor
Service refactor: Generators over queues
2024-01-18 11:30:59 -05:00
Moishe Lettvin
0245f98eb5 update fal image_gen to use size in constructor 2024-01-18 11:29:13 -05:00
Moishe Lettvin
f9f2e2d7ea stop_when_done 2024-01-18 11:22:06 -05:00
Moishe Lettvin
0d21768d00 Fix example 5 2024-01-18 11:22:02 -05:00
Moishe Lettvin
13f2f792af refactor party tonight 2024-01-18 11:21:38 -05:00
Moishe Lettvin
a3ac0d84e8 working on making services more consistent/terse/easy 2024-01-18 11:20:56 -05:00
chadbailey59
6e8ebbd34c fal.ai integration (#3)
* fal.ai image gen

* some sample and readme updates

* holy cow this is fast

* basic image-gen working

* starting audio prompt and reset

* short confirmation words

* moved fal module to pyproject.toml

---------

Co-authored-by: Moishe Lettvin <moishel@gmail.com>
2024-01-17 12:08:00 -06:00
Chad Bailey
b1e6a89acd README tweaks 2024-01-17 16:56:17 +00:00
Moishe Lettvin
115b744861 input/output -> receive/send queues, for transport service 2024-01-16 21:11:38 -05:00
Moishe Lettvin
755059c358 a little cleanup 2024-01-16 19:58:11 -05:00
Moishe Lettvin
cfaccefe9c some sample and readme updates 2024-01-12 16:22:03 -05:00
Moishe Lettvin
5b49597854 Pass entire transcription message on queue; don't respond to yourself in 06- 2024-01-12 13:54:55 -05:00
Moishe Lettvin
869d557ded remove mistakenly added import: 2024-01-12 13:49:34 -05:00
Moishe Lettvin
a42f6bc531 Merge pull request #2 from daily-co/queueing
Adding queue transportation to services
2024-01-12 13:40:51 -05:00
Moishe Lettvin
9bbfc8ad05 Updating 06- sample 2024-01-12 11:40:49 -05:00
Moishe Lettvin
ad427bea3a Update 05- sample, sticking with generators here because they give us a lot more control over the order that things get queued. 2024-01-12 10:47:03 -05:00
Moishe Lettvin
ec1f2362c5 add on_first_other_participant_joined event 2024-01-12 08:39:56 -05:00
Moishe Lettvin
37207745c9 Remove some unused variables 2024-01-11 19:40:19 -05:00
Moishe Lettvin
b9b82695c6 Adding queue transportation to services 2024-01-11 19:14:19 -05:00
Moishe Lettvin
7ca7764be3 a little cleanup 2024-01-10 21:14:55 -05:00
Moishe Lettvin
df1bbef653 something is up with transcription, trying to figure it out 2024-01-10 17:12:40 -05:00
Moishe Lettvin
7229fc806e getting started on 06- 2024-01-10 13:00:36 -05:00
Moishe Lettvin
cd204ebd21 Add sample 04- 2024-01-09 14:19:27 -05:00
Moishe Lettvin
cb63307ddf Improve pipeline of data gathering for 05- sample (I think it can be better, though) 2024-01-09 12:56:56 -05:00
Moishe Lettvin
37f48d9f04 different take on 05 maybe 2024-01-09 11:59:21 -05:00
Moishe Lettvin
a29cb1a5ad Update 05 sample to use decorators and exit correctly 2024-01-09 06:40:48 -05:00
Moishe Lettvin
20a9c6a938 simplify the join, no need for END_STREAM 2024-01-09 06:12:24 -05:00
Moishe Lettvin
534a4e2fa1 Exit meeting after saying something for samples 1 and 2 2024-01-09 06:05:24 -05:00
Moishe Lettvin
11a553240f Sample 3 2024-01-09 05:45:49 -05:00
Moishe Lettvin
4efeae46bc stop using my random room in the demos 2024-01-09 05:32:42 -05:00
Moishe Lettvin
a59cec526f elevenlabs test in 02- 2024-01-08 20:28:53 -05:00
Moishe Lettvin
ac0e4b0c27 remove leftover prints 2024-01-08 17:56:21 -05:00
Moishe Lettvin
6cace129fd shut down correctly when a participant leaves 2024-01-08 17:55:19 -05:00
chadbailey59
290c1e7efa Added async OpenAI services (#1)
* added async openai services

* added async openai services

* added Deepgram service with 05 example

* modernized the 'say one thing' example

* async all the things

* cleanup and user greeting

* more cleanup
2024-01-08 16:07:21 -06:00
Moishe Lettvin
d95bca479d update 01 sample 2024-01-08 15:58:11 -05:00
Moishe Lettvin
0451ae498f Make transportservice.run async, and add 02- sample 2024-01-08 15:38:32 -05:00
Moishe Lettvin
712ab97f88 Support coroutine callbacks 2024-01-08 11:27:58 -05:00
Moishe Lettvin
b5c7e30efa Use a decorator for callbacks 2024-01-06 14:09:28 -05:00
Moishe Lettvin
7f51c0c9b2 A little cleanup 2024-01-05 20:41:52 -05:00
Moishe Lettvin
b48a377b17 a little cleanup, uglier-than-I'd-like 01-say-one-thing sample added 2024-01-05 14:19:18 -05:00
Moishe Lettvin
5b4c085cd2 even more demo cleanup, making run_llm play well with asyncio 2024-01-04 18:14:28 -05:00
Moishe Lettvin
cd2c9700ad more demo cleanup, allow bundled frames in output_queue 2024-01-04 17:54:13 -05:00
Moishe Lettvin
fcd9a248d9 use inference text in demo, clean up image generation 2024-01-04 17:26:13 -05:00
Moishe Lettvin
c68703749b remove unnecessary file 2024-01-04 15:49:12 -05:00
Moishe Lettvin
aab06f4441 a few tweaks to the prompt 2024-01-02 10:46:48 -05:00
47 changed files with 2013 additions and 1395 deletions

5
.gitignore vendored
View File

@@ -2,6 +2,7 @@
env/
__pycache__/
*~
venv
#*#
# Distribution / packaging
@@ -22,4 +23,6 @@ share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
MANIFEST
.DS_Store
.env

View File

@@ -5,12 +5,14 @@ This SDK can help you build applications that participate in WebRTC meetings and
## Build/Install
_Note that you may need to set up a virtual environment before following the instructions below. For instance, you might need to run the following from the root of the repo:_
```
python3 -m venv env
source env/bin/activate
```
From the root of this repo, run the following:
```
pip install -r requirements.txt
python -m build
@@ -19,10 +21,11 @@ python -m build
This builds the package. To use the package locally (eg to run sample files), run
```
pip install .
pip install --editable .
```
If you want to use this package from another directory, you can run:
```
pip install path_to_this_repo
```
@@ -32,7 +35,7 @@ pip install path_to_this_repo
Tou can run the simple sample like so:
```
python src/samples/simple-sample/simple-sample.py -u your_room_url -k your_daily_api_key
python src/samples/theoretical-to-real/01-say-one-thing.py -u <url of your Daily meeting> -k <your Daily API Key>
```
Note that the sample uses Azure's TTS and LLM services. You'll need to set the following environment variables for the sample to work:
@@ -44,3 +47,94 @@ AZURE_CHATGPT_KEY
AZURE_CHATGPT_ENDPOINT
AZURE_CHATGPT_DEPLOYMENT_ID
```
If you have those environment variables stored in an .env file, you can quickly load them into your terminal's environment by running this:
```bash
export $(grep -v '^#' .env | xargs)
```
## Overview
The Daily AI SDK allows you to build applications that can participate in WebRTC sessions and interact with AI Services. Some examples of what you can build with this:
* conversational bots that interact 1:1 with a user, using voice recognition and text-to-speech
* assistant bots that aggregate transcriptions from multiple participants in a meeting and provide realtime summaries or other AI-generated output.
* image-recognition bots
* etc
## Concepts
### Transport Service
The SDK provides one “transport service”, which is a wrapper around Dailys `daily-python` client (tk add link). You can use this service to listen for events related to a WebRTC session, such as “a participant joined the meeting”.
The transport service also exposes a send queue, and a receive queue. You can use the send queue to send audio and video to the WebRTC session, and you can listen to the receive queue to see audio, video and transcription data from the WebRTC session.
### AI Services
The AI Service classes provide wrappers around various AI providers, and allow you to query LLMs, convert text to speech and make images from text. The audio and images can then be placed on the transport services send queue, where theyll be sent to the WebRTC session.
### Queue Frames
Communication between the transport service and AI services, and between various AI services, takes place in Queue Frames. These frames contain an indication of the type of data as well as the data itself.
## Using Transports, AI Services and Frames
AI Services all define a `.run` method. This method consumes and generates `QueueFrame` frames. The kind of frames that can be consumed and generated depend on the kind of service. For instance, an LLM AI Service consumes `LLM_MESSAGE` frames (which define a history of interaction with an LLM) and emit `TEXT` frames (the response from the LLM).
The `.run` method is an `AsyncIterable`, and it takes an `iterable`, `AsyncIterable` or `asyncio.Queue` that produces QueueFrames as a parameter. This makes it easy to chain AI Services, and consume input from the Transports `receive_queue` .
AI Services also have a `.run_to_queue` method. This method is not an AsyncIterable, but instead sends processed QueueFrames to a queue. This makes it easy to send the output of an AI Service to the Transports `send_queue`.
AI Services also define convenience functions that let you bypass creating QueueFrames for some simple cases (eg. using the TTS service to convert a string to audio output and send that audio to the transports `send_queue`). See below for examples.
## Examples
### Say Something
The base TTS AI service exposes a `.say` method. After creating a transport and TTS service, you can use this method like so:
```
transport = DailyTransportService(...)
tts = AzureTTSService()
await tts.say("hello world", transport.send_queue)
```
This will call the TTS service to render the text to audio frames, then put the audio frames on the transports send queue. The transport will then send those frames along to the WebRTC session.
### Speak an LLM response
Given a system prompt contained in a `messages` array, you can emit the LLMs response as audio with a chain like this:
```
transport = DailyTransportService(...) # setup parameters omitted
tts = AzureTTSService()
llm = AzureLLMService()
messages = [...] # system prompt omitted for brevity
await tts.run_to_queue(
transport.send_queue,
llm.run([QueueFrame.LLM_MESSAGES, messages])
)
```
In this code, the LLM service object sends the messages to Azures OpenAI implementation, which streams chunks back asynchronously. Those chunks are aggregated by the TTS Service to ensure the best audio response (TTS works best when it gets complete sentence, so it can inflect correctly), then sent to Azures TTS service, converted to audio frames, and sent to the WebRTC session via the Daily transport.
### Pre-cache an LLM response
Sometimes LLMs can be slower than wed like for natural-feeling communication. Heres an example where we take advantage of the time it takes to speak some pre-defined text to get a head start on the LLM response:
(TK link to 04- sample)
In this sample, we set up a buffer queue to receive the audio frames from the LLM response before while we are joining the call and start an asynchronous task to start filling this buffer:
```
buffer_queue = asyncio.Queue()
llm_response_task = asyncio.create_task(
elevenlabs_tts.run_to_queue(
buffer_queue,
llm.run([QueueFrame(FrameType.LLM_MESSAGE, messages)]),
True,
)
)
```
Then, when weve joined the call, we speak the static text:
```
await azure_tts.say("My friend...", transport.send_queue)
```
As that text is being spoken, the asynchronous LLM task continues in the background. When the text is done, we pull the frames off the buffer queue and put them in the transports `send_queue`:
```
async def buffer_to_send_queue():
while True:
frame = await buffer_queue.get()
await transport.send_queue.put(frame)
buffer_queue.task_done()
if frame.frame_type == FrameType.END_STREAM:
break
await asyncio.gather(llm_response_task, buffer_to_send_queue())
```
One thing to note here is the last parameter to `run_to_queue` in the first code clause above: this causes the `run_to_queue` method to send an `END_STREAM` frame when its done rendering. This lets us know when to stop our `buffer_to_send_queue` task above.

View File

@@ -14,7 +14,10 @@ dependencies = [
"google-cloud-texttospeech",
"azure-cognitiveservices-speech",
"pyht",
"opentelemetry-sdk"
"opentelemetry-sdk",
"aiohttp",
"fal",
"faster_whisper"
]
[tool.setuptools.packages.find]

View File

@@ -1,3 +1,4 @@
autopep8==2.0.4
build==1.0.3
packaging==23.2
pyproject_hooks==1.0.0
pyproject_hooks==1.0.0

View File

@@ -1,347 +0,0 @@
import json
import logging
import re
from collections import defaultdict
from dataclasses import dataclass, field
from enum import Enum
from queue import Queue, PriorityQueue, Empty
from threading import Event, Semaphore, Thread
from typing import Any, Generator, Iterator, Optional, Type
from dailyai.output_queue import OutputQueueFrame, FrameType
from dailyai.message_handler.message_handler import MessageHandler
from dailyai.services.ai_services import AIServiceConfig
class AsyncProcessorState:
# Setting class variables, other synchronous activities
INIT = 0
# Making asynchronous requests to LLM and other services to render response
PREPARING = 1
# Ready to start presenting to user (but may not have all data yet)
READY = 2
# Playing response
PLAYING = 3
# An interrupt has been requested and the response is shutting down in-flight processing
INTERRUPTING = 4
# An interrupt has been requested and the response is finished stopping in-flight processing
INTERRUPTED = 5
# Response has been played or interrupted
DONE = 6
# Response is being finalized (updating records of speech, updating LLM context, etc.)
FINALIZING = 7
# Response is complete. This could mean that everything is updated, or that the response
# was interrupted.
FINALIZED = 8
state_transitions = {
INIT: [PREPARING, INTERRUPTING],
PREPARING: [READY, INTERRUPTING],
READY: [PLAYING, INTERRUPTING],
PLAYING: [DONE, INTERRUPTING],
INTERRUPTING: [INTERRUPTED],
INTERRUPTED: [DONE],
DONE: [FINALIZING],
FINALIZING: [FINALIZED],
FINALIZED: [FINALIZED],
}
@dataclass(order=True)
class StateTransitionItem:
state: int
evt: Event = field(compare=False)
class AsyncProcessor:
def __init__(
self,
services: AIServiceConfig
) -> None:
self.state = AsyncProcessorState.INIT
self.prepare_thread = None
self.play_thread = None
self.finalize_thread = None
self.services: AIServiceConfig = services
self.state_transition_semaphore = Semaphore()
self.waiting_for_state_changes = PriorityQueue()
self.state_queue = Queue()
self.state_change_callbacks = defaultdict(list)
self.was_interrupted = False
self.logger: logging.Logger = logging.getLogger("dailyai")
def set_state(self, state: int) -> None:
if state in AsyncProcessorState.state_transitions[self.state]:
self.state_transition_semaphore.acquire()
self.state: int = state
self.state_transition_semaphore.release()
# wake up any threads waiting for this state transition
try:
while True:
waiter = self.waiting_for_state_changes.get_nowait()
if waiter.state <= state:
waiter.evt.set()
else:
self.waiting_for_state_changes.put(waiter)
break
except Empty:
pass
# make all the callbacks for this state
for callback in self.state_change_callbacks[state]:
callback(self)
else:
self.logger.error(
f"Invalid state transition from {self.state} to {state} in {self.__class__.__name__}"
)
raise Exception(f"Invalid state transition from {self.state} to {state}")
#
# This is used for state transitions that could be blocked by an interruption.
# If we are interrupted, we silently fail this call. Use only if you know that
# this state transition should fail if the processor has been interrupted.
#
def maybe_set_state(self, state: int) -> bool:
if state in AsyncProcessorState.state_transitions[self.state]:
self.set_state(state)
return True
else:
return False
def wait_for_state_transition(self, state: int) -> None:
if self.state >= state:
return
self.state_transition_semaphore.acquire()
evt = Event()
self.waiting_for_state_changes.put(StateTransitionItem(state, evt))
self.state_transition_semaphore.release()
result = evt.wait(120.0)
if not result:
self.logger.error(
f"Timed out waiting for state transition to {state} from {self.state}"
)
def set_state_callback(self, state: int, callback: callable) -> None:
self.state_change_callbacks[state].append(callback)
def prepare(self) -> None:
self.prepare_thread = Thread(target=self.async_prepare, daemon=True)
self.prepare_thread.start()
self.wait_for_state_transition(AsyncProcessorState.READY)
def play(self) -> None:
self.wait_for_state_transition(AsyncProcessorState.READY)
self.play_thread = Thread(target=self.async_play, daemon=True)
self.play_thread.start()
self.wait_for_state_transition(AsyncProcessorState.PLAYING)
def finalize(self) -> None:
# don't finalize until we're done playing.
self.wait_for_state_transition(AsyncProcessorState.DONE)
self.set_state(AsyncProcessorState.FINALIZING)
self.do_finalization()
self.set_state(AsyncProcessorState.FINALIZED)
def interrupt(self) -> None:
# nothing to interrupt if we're already finalizing or finalized, no-op
if self.state in [
AsyncProcessorState.FINALIZING,
AsyncProcessorState.FINALIZED,
]:
return
self.set_state(AsyncProcessorState.INTERRUPTING)
self.was_interrupted = True
self.do_interruption()
self.set_state(AsyncProcessorState.INTERRUPTED)
self.set_state(AsyncProcessorState.DONE)
def async_play(self) -> None:
self.logger.info(f"Starting to play")
if self.maybe_set_state(AsyncProcessorState.PLAYING):
self.do_play()
self.maybe_set_state(AsyncProcessorState.DONE)
def async_prepare(self) -> None:
self.set_state(AsyncProcessorState.PREPARING)
self.start_preparation()
self.set_state(AsyncProcessorState.READY)
self.continue_preparation()
self.logger.info(f"Preparation done for {self.__class__.__name__}")
self.preparation_done()
def start_preparation(self) -> None:
pass
def continue_preparation(self) -> None:
pass
def preparation_done(self):
pass
def get_preparation_iterator(self) -> Iterator:
yield None
def process_chunk(self, chunk) -> None:
pass
def do_interruption(self) -> None:
pass
def do_play(self) -> None:
pass
def do_finalization(self) -> None:
pass
# A common class for responses that use a message queue and
# an output queue.
class OrchestratorResponse(AsyncProcessor):
def __init__(
self,
services,
message_handler,
output_queue,
) -> None:
super().__init__(services)
self.message_handler: MessageHandler = message_handler
self.output_queue: Queue = output_queue
class LLMResponse(OrchestratorResponse):
def __init__(
self,
services,
message_handler,
output_queue,
) -> None:
super().__init__(services, message_handler, output_queue)
self.has_sent_first_frame = False
self.chunks_in_preparation = Queue()
self.llm_responses: list[str] = []
def get_preparation_iterator(self) -> Iterator:
messages_for_llm = self.message_handler.get_llm_messages()
self.logger.debug(f"Messages for llm: {json.dumps(messages_for_llm, indent=2)}")
return self.clauses_from_chunks(
self.services.llm.run_llm_async(messages_for_llm)
)
def clauses_from_chunks(self, chunks) -> Iterator:
out = ""
for chunk in chunks:
if self.state not in [
AsyncProcessorState.READY,
AsyncProcessorState.PLAYING,
]:
break
out += chunk
if re.match(r"^.*[.!?]$", out): # it looks like a sentence
yield out.strip()
out = ""
if out.strip():
yield out.strip()
def get_frames_from_tts_response(self, audio_frame) -> list[OutputQueueFrame]:
return [OutputQueueFrame(FrameType.AUDIO_FRAME, audio_frame)]
def get_frames_from_chunk(self, chunk) -> Generator[list[OutputQueueFrame], Any, None]:
for audio_frame in self.services.tts.run_tts(chunk):
yield self.get_frames_from_tts_response(audio_frame)
def start_preparation(self) -> None:
self.preparation_iterator = self.get_preparation_iterator()
def continue_preparation(self) -> None:
for chunk in self.preparation_iterator:
if self.state not in [
AsyncProcessorState.READY,
AsyncProcessorState.PLAYING,
]:
break
self.process_chunk(chunk)
def process_chunk(self, chunk) -> None:
self.chunks_in_preparation.put((chunk, self.get_frames_from_chunk(chunk)))
def preparation_done(self):
self.chunks_in_preparation.put((None, None))
def do_play(self) -> None:
while True:
if self.state not in [
AsyncProcessorState.READY,
AsyncProcessorState.PLAYING,
]:
break
prepared_chunk = self.chunks_in_preparation.get()
if prepared_chunk[0] == None:
return
self.play_prepared_chunk(prepared_chunk)
def play_prepared_chunk(self, prepared_chunk) -> None:
chunk, tts_generator = prepared_chunk
for frames in tts_generator:
if self.state not in [
AsyncProcessorState.READY,
AsyncProcessorState.PLAYING,
]:
break
if not self.has_sent_first_frame:
self.output_queue.put(OutputQueueFrame(FrameType.START_STREAM, None))
self.has_sent_first_frame = True
for frame in frames:
self.output_queue.put(frame)
self.output_queue.join()
self.llm_responses.append(chunk)
def do_finalization(self) -> None:
self.message_handler.add_assistant_messages(self.llm_responses)
def do_interruption(self) -> None:
self.chunks_in_preparation.put((None, None))
if self.prepare_thread and self.prepare_thread.is_alive():
self.prepare_thread.join()
if self.play_thread and self.play_thread.is_alive():
self.play_thread.join()
@dataclass(frozen=True)
class ConversationProcessorCollection:
introduction: Optional[Type[OrchestratorResponse]] = None
waiting: Optional[Type[OrchestratorResponse]] = None
response: Optional[Type[OrchestratorResponse]] = None
goodbye: Optional[Type[OrchestratorResponse]] = None

View File

@@ -0,0 +1,76 @@
import asyncio
import copy
import functools
from typing import AsyncGenerator, Awaitable, Callable
from dailyai.queue_aggregators import LLMContextAggregator
from dailyai.queue_frame import EndStreamQueueFrame, QueueFrame, TranscriptionQueueFrame
class InterruptibleConversationWrapper:
def __init__(
self,
frame_generator: Callable[[], AsyncGenerator[QueueFrame, None]],
runner: Callable[
[str, LLMContextAggregator, LLMContextAggregator], Awaitable[None]
],
interrupt: Callable[[], None],
my_participant_id: str|None,
llm_messages: list[dict[str, str]],
llm_context_aggregator_in=LLMContextAggregator,
llm_context_aggregator_out=LLMContextAggregator,
delay_before_speech_seconds: float = 1.0,
):
self._frame_generator: Callable[[], AsyncGenerator[QueueFrame, None]] = frame_generator
self._runner: Callable[
[str, LLMContextAggregator, LLMContextAggregator], Awaitable[None]
] = runner
self._interrupt: Callable[[], None] = interrupt
self._my_participant_id = my_participant_id
self._messages: list[dict[str, str]] = llm_messages
self._delay_before_speech_seconds = delay_before_speech_seconds
self._llm_context_aggregator_in = llm_context_aggregator_in
self._llm_context_aggregator_out = llm_context_aggregator_out
self._current_phrase = ""
def update_messages(self, new_messages: list[dict[str, str]], task: asyncio.Task | None):
if task:
if not task.cancelled():
self._current_phrase = ""
self._messages = new_messages
async def speak_after_delay(self, user_speech, messages):
await asyncio.sleep(self._delay_before_speech_seconds)
tma_in = self._llm_context_aggregator_in(
messages, "user", self._my_participant_id, False
)
tma_out = self._llm_context_aggregator_out(
messages, "assistant", self._my_participant_id
)
await self._runner(user_speech, tma_in, tma_out)
async def run_conversation(self):
current_response_task = None
async for frame in self._frame_generator():
if isinstance(frame, EndStreamQueueFrame):
break
elif not isinstance(frame, TranscriptionQueueFrame):
continue
if frame.participantId == self._my_participant_id:
continue
if current_response_task:
current_response_task.cancel()
self._interrupt()
self._current_phrase += " " + frame.text
current_llm_messages = copy.deepcopy(self._messages)
current_response_task = asyncio.create_task(
self.speak_after_delay(self._current_phrase, current_llm_messages)
)
current_response_task.add_done_callback(
functools.partial(self.update_messages, current_llm_messages)
)

View File

@@ -1,127 +0,0 @@
import logging
import time
from dataclasses import dataclass
from queue import Queue, Empty
from threading import Thread
from dailyai.storage.search import SearchIndexer
from dailyai.services.ai_services import AIServiceConfig
@dataclass
class Message:
type: str
timestamp: float
message: str
class MessageHandler:
def __init__(self, intro):
self.messages: list[Message] = [Message("system", time.time(), intro)]
self.last_user_message_idx:int | None = None
self.finalized_user_message_idx: int | None = None
def add_user_message(self, message) -> None:
if self.last_user_message_idx is not None and self.last_user_message_idx != self.finalized_user_message_idx:
previous_message: str = self.messages[self.last_user_message_idx].message
self.messages[self.last_user_message_idx] = Message(
"user", time.time(), ' '.join([previous_message, message])
)
self.messages = self.messages[: self.last_user_message_idx + 1]
else:
self.messages.append(Message("user", time.time(), message))
self.last_user_message_idx = len(self.messages) - 1
def add_assistant_message(self, message) -> None:
if self.messages[-1].type == "assistant":
self.messages[-1].message += " " + message
else:
self.messages.append(Message("assistant", time.time(), message))
def add_assistant_messages(self, messages) -> None:
self.messages.append(Message("assistant", time.time(), " ".join(messages)))
def get_llm_messages(self) -> list[dict[str, str]]:
return [{"role": m.type, "content": m.message} for m in self.messages]
def finalize_user_message(self) -> None:
self.finalized_user_message_idx = self.last_user_message_idx
def shutdown(self) -> None:
pass
class IndexingMessageHandler(MessageHandler):
def __init__(
self, intro, services: AIServiceConfig, indexer: SearchIndexer
) -> None:
super().__init__(intro)
self.services = services
self.search_indexer = indexer
self.last_written_idx = 0
self.storage_message_queue = Queue()
self.index_writer_thread = Thread(target=self.storage_writer, daemon=True)
self.index_writer_thread.start()
self.logger = logging.getLogger("dailyai")
def shutdown(self):
self.finalize_user_message()
self.storage_message_queue.put(None)
self.index_writer_thread.join()
def storage_writer(self) -> None:
while True:
try:
message_idx = self.storage_message_queue.get()
self.storage_message_queue.task_done()
if message_idx is None:
return
if message_idx <= self.last_written_idx:
continue
self.last_written_idx = message_idx
message = self.messages[message_idx]
content = message.message
if message.type == "user":
content = self.cleanup_user_message(content)
# sometimes the LLM returns a string wrapped in quotes and sometimes it doesn't.
# if it didn't, wrap it in quotes
if content[0] != '"':
content = '"' + content + '"'
self.search_indexer.index_text(content)
except Empty:
pass
def cleanup_user_message(self, user_message) -> str:
return user_message
def finalize_user_message(self):
super().finalize_user_message()
self.write_messages_to_storage()
def write_messages_to_storage(self):
if self.finalized_user_message_idx is None:
return
for idx in range(self.last_written_idx, len(self.messages)):
self.logger.info(
f"Writing to storage: {self.messages[idx].type} {self.messages[idx].message}"
)
if (
self.messages[idx].type == "user"
and idx > self.finalized_user_message_idx
):
break
if self.messages[idx].type != "system":
self.storage_message_queue.put(idx)

View File

@@ -1,406 +0,0 @@
import logging
import os
import time
import wave
from dataclasses import dataclass
from enum import Enum
from queue import Queue, Empty
from opentelemetry import trace, context
from dailyai.async_processor.async_processor import (
AsyncProcessor,
AsyncProcessorState,
ConversationProcessorCollection,
OrchestratorResponse,
LLMResponse,
)
from dailyai.output_queue import OutputQueueFrame, FrameType
from dailyai.services.ai_services import AIServiceConfig
from dailyai.message_handler.message_handler import MessageHandler
from threading import Thread, Semaphore, Event, Timer
from opentelemetry import context
from opentelemetry.context.context import Context
from daily import (
EventHandler,
CallClient,
Daily,
VirtualCameraDevice,
VirtualMicrophoneDevice,
VirtualSpeakerDevice,
)
@dataclass
class OrchestratorConfig:
room_url: str
token: str
bot_name: str
expiration: float
# Note that we use this as a default parameter value in the Orchestrator
# constructor. The dataclass is defined with Frozen=True, so this should
# be safe.
default_conversation_collection = ConversationProcessorCollection(
introduction=LLMResponse,
waiting=None,
response=LLMResponse,
goodbye=None,
)
class Orchestrator(EventHandler):
def __init__(
self,
daily_config: OrchestratorConfig,
ai_service_config: AIServiceConfig,
message_handler: MessageHandler,
conversation_processors: ConversationProcessorCollection = default_conversation_collection,
tracer=None,
):
self.bot_name: str = daily_config.bot_name
self.room_url: str = daily_config.room_url
self.token: str = daily_config.token
self.expiration: float = daily_config.expiration
self.logger: logging.Logger = logging.getLogger("dailyai")
self.tracer = tracer or trace.get_tracer("orchestrator")
self.ctx: Context = context.get_current()
self.transcription = ""
self.last_fragment_at = None
self.talked_at = None
self.paused_at = None
self.logger.info(f"Creating Response for introductions")
self.services: AIServiceConfig = ai_service_config
self.output_queue = Queue()
self.is_interrupted = Event()
self.stop_threads = Event()
self.story_started = False
self.message_handler = message_handler
self.conversation_processors: ConversationProcessorCollection = conversation_processors
if conversation_processors.introduction is not None:
intro = conversation_processors.introduction(
services=self.services, message_handler=self.message_handler, output_queue=self.output_queue
)
intro.prepare()
intro.set_state_callback(AsyncProcessorState.DONE, self.on_intro_played)
intro.set_state_callback(AsyncProcessorState.FINALIZED, self.on_intro_finished)
self.logger.info(f"Introduction is preparing")
self.current_response: AsyncProcessor = intro
self.can_interrupt = False
# self.response_event.set()
self.response_semaphore = Semaphore()
self.speech_timeout = None
self.interrupt_time = None
self.logger.info("Configuring daily")
self.configure_daily()
def configure_daily(self):
Daily.init()
self.client = CallClient(event_handler=self)
self.logger.info(f"Mic sample rate: {self.services.tts.get_mic_sample_rate()}")
self.mic: VirtualMicrophoneDevice = Daily.create_microphone_device(
"mic", sample_rate=self.services.tts.get_mic_sample_rate(), channels=1
)
self.speaker: VirtualSpeakerDevice = Daily.create_speaker_device(
"speaker", sample_rate=16000, channels=1
)
self.camera: VirtualCameraDevice = Daily.create_camera_device(
"camera", width=720, height=1280, color_format="RGB"
)
Daily.select_speaker_device("speaker")
self.client.set_user_name(self.bot_name)
self.client.join(self.room_url, self.token, completion=self.call_joined)
self.client.update_inputs(
{
"camera": {
"isEnabled": True,
"settings": {
"deviceId": "camera",
},
},
"microphone": {
"isEnabled": True,
"settings": {
"deviceId": "mic",
"customConstraints": {
"autoGainControl": {"exact": False},
"echoCancellation": {"exact": False},
"noiseSuppression": {"exact": False},
},
},
},
}
)
self.client.update_publishing(
{
"camera": {
"sendSettings": {
"maxQuality": "low",
"encodings": {
"low": {
"maxBitrate": 250000,
"scaleResolutionDownBy": 1.333,
"maxFramerate": 8,
}
},
}
}
}
)
self.my_participant_id = self.client.participants()["local"]["id"]
def start(self) -> None:
# TODO: this loop could, I think, be replaced with a timer and an event
self.participant_left = False
try:
participant_count: int = len(self.client.participants())
self.logger.info(f"{participant_count} participants in room")
while time.time() < self.expiration and not self.participant_left:
# all handling of incoming transcriptions happens in on_transcription_message
time.sleep(1)
except Exception as e:
self.logger.error(f"Exception {e}")
finally:
self.client.leave()
def stop(self):
self.logger.info("Stop current response")
if self.current_response:
if self.current_response.state < AsyncProcessorState.INTERRUPTED:
self.current_response.interrupt()
self.logger.info("Wait for state transition")
self.current_response.wait_for_state_transition(AsyncProcessorState.FINALIZED)
self.stop_threads.set()
self.camera_thread.join()
self.logger.info("Camera thread stopped")
self.logger.info("Put stop in output queue")
self.output_queue.put(OutputQueueFrame(FrameType.END_STREAM, None))
self.frame_consumer_thread.join()
self.logger.info("Orchestrator stopped.")
def on_intro_played(self, intro):
self.logger.info(f"Introduction has played")
self.can_interrupt = True
intro.finalize()
def on_intro_finished(self, intro):
self.logger.info(f"Introduction has finished")
def on_response_played(self, response):
response.finalize()
def on_response_finished(self, response):
if not response.was_interrupted:
self.message_handler.finalize_user_message()
def call_joined(self, join_data, client_error):
self.logger.info(f"Call_joined: {join_data}, {client_error}")
self.client.start_transcription(
{
"language": "en",
"tier": "nova",
"model": "2-conversationalai",
"profanity_filter": True,
"redact": False,
"extra": {
"endpointing": True,
"punctuate": False,
}
}
)
def on_participant_joined(self, participant):
with self.tracer.start_as_current_span("on_participant_joined", context=self.ctx):
self.logger.info(f"on_participant_joined: {participant}")
# TODO: figure out the architecture to get the story id to the client
# self.client.send_app_message({"event": "story-id", "storyID": self.story_id})
time.sleep(2)
if not self.story_started:
self.action()
self.story_started = True
def on_participant_left(self, participant, reason):
self.logger.info(f"Participant {participant} left")
if len(self.client.participants()) < 2:
self.participant_left = True
def on_app_message(self, message, sender):
with self.tracer.start_as_current_span("on_app_message", context=self.ctx):
self.logger.info(f"on_app_message {message} from {sender}")
if "isSpeaking" in message and message["isSpeaking"] == True:
self.handle_user_started_talking()
if "isSpeaking" in message and message["isSpeaking"] == False:
self.handle_user_stopped_talking()
def on_transcription_message(self, message):
with self.tracer.start_as_current_span("on_transcription_message", context=self.ctx):
if message["session_id"] != self.my_participant_id:
self.handle_transcription_fragment(message['text'])
def on_transcription_stopped(self, stopped_by, stopped_by_error):
self.logger.info(f"Transcription stopped {stopped_by}, {stopped_by_error}")
def on_transcription_error(self, message):
self.logger.error(f"Transcription error {message}")
def on_transcription_started(self, status):
self.logger.info(f"Transcription started {status}")
def set_image(self, image: bytes):
self.image: bytes | None = image
def run_camera(self):
try:
while not self.stop_threads.is_set():
if self.image:
self.camera.write_frame(self.image)
time.sleep(1.0 / 8.0) # 8 fps
except Exception as e:
self.logger.error(f"Exception {e} in camera thread.")
def handle_user_started_talking(self):
# TODO: allow configuration of the timer timeout
self.logger.error("user started talking")
self.speech_timeout = Timer(1.0, self.utterance_interrupt)
def handle_user_stopped_talking(self):
self.logger.error("user stopped talking, canceling utterance interrupt")
if self.speech_timeout:
self.speech_timeout.cancel()
def utterance_interrupt(self):
self.logger.error("utterance interrupt")
self.is_interrupted.set()
def handle_transcription_fragment(self, fragment):
if not self.can_interrupt:
return
# start generating a new response. We'll do the fast parts of the interrupt
# now but wait for the state transition after we've kicked off the prepare
# on the new response.
if (
self.current_response
and self.current_response.state < AsyncProcessorState.INTERRUPTED
):
self.interrupt_time = time.perf_counter()
self.is_interrupted.set()
self.current_response.interrupt()
self.message_handler.add_user_message(fragment)
response_type: type[OrchestratorResponse] | type[LLMResponse] = self.conversation_processors.response or LLMResponse
new_response: OrchestratorResponse = response_type(
self.services, self.message_handler, self.output_queue
)
new_response.set_state_callback(
AsyncProcessorState.DONE, self.on_response_played
)
new_response.set_state_callback(
AsyncProcessorState.FINALIZED, self.on_response_finished
)
new_response.prepare()
self.response_semaphore.acquire()
if (
self.current_response
and self.current_response.state < AsyncProcessorState.INTERRUPTED
):
self.current_response.wait_for_state_transition(
AsyncProcessorState.FINALIZED
)
self.current_response = new_response
self.current_response.play()
self.response_semaphore.release()
def action(self):
self.logger.info("Starting camera thread")
self.image: bytes | None = None
self.camera_thread = Thread(target=self.run_camera, daemon=True)
self.camera_thread.start()
self.logger.info("Starting frame consumer thread")
self.frame_consumer_thread = Thread(target=self.frame_consumer, daemon=True)
self.frame_consumer_thread.start()
self.logger.info("Playing introduction")
self.can_interrupt = False
self.current_response.play()
def frame_consumer(self):
self.logger.info("🎬 Starting frame consumer thread")
b = bytearray()
smallest_write_size = 3200
all_audio_frames = bytearray()
while True:
try:
frame:OutputQueueFrame = self.output_queue.get()
if frame.frame_type == FrameType.END_STREAM:
self.logger.info("Stopping frame consumer thread")
return
# if interrupted, we just pull frames off the queue and discard them
if not self.is_interrupted.is_set():
if frame:
if frame.frame_type == FrameType.AUDIO_FRAME:
chunk = frame.frame_data
all_audio_frames.extend(chunk)
b.extend(chunk)
l = len(b) - (len(b) % smallest_write_size)
if l:
self.mic.write_frames(bytes(b[:l]))
b = b[l:]
elif frame.frame_type == FrameType.IMAGE_FRAME:
self.set_image(frame.frame_data)
elif len(b):
self.mic.write_frames(bytes(b))
b = bytearray()
else:
if self.interrupt_time:
self.logger.info(f"Lag to stop stream after interruption {time.perf_counter() - self.interrupt_time}")
self.interrupt_time = None
if frame.frame_type == FrameType.START_STREAM:
self.is_interrupted.clear()
self.output_queue.task_done()
except Empty:
try:
if len(b):
self.mic.write_frames(bytes(b))
except Exception as e:
self.logger.error(f"Exception in frame_consumer: {e}, {len(b)}")
b = bytearray()

View File

@@ -1,14 +0,0 @@
from enum import Enum
from dataclasses import dataclass
class FrameType(Enum):
AUDIO_FRAME = 1
IMAGE_FRAME = 2
START_STREAM = 3
END_STREAM = 4
@dataclass(frozen=True)
class OutputQueueFrame:
frame_type: FrameType
frame_data: bytes

View File

@@ -0,0 +1,48 @@
import asyncio
from dailyai.queue_frame import LLMMessagesQueueFrame, QueueFrame, TextQueueFrame
from dailyai.services.ai_services import AIService
from typing import AsyncGenerator, List
class QueueTee:
async def run_to_queue_and_generate(
self,
output_queue: asyncio.Queue,
generator: AsyncGenerator[QueueFrame, None]
) -> AsyncGenerator[QueueFrame, None]:
async for frame in generator:
await output_queue.put(frame)
yield frame
async def run_to_queues(
self,
output_queues: List[asyncio.Queue],
generator: AsyncGenerator[QueueFrame, None]
):
async for frame in generator:
for queue in output_queues:
await queue.put(frame)
class LLMContextAggregator(AIService):
def __init__(self, messages: list[dict], role:str, bot_participant_id=None, complete_sentences=True):
self.messages = messages
self.bot_participant_id = bot_participant_id
self.role = role
self.sentence = ""
self.complete_sentences = complete_sentences
async def process_frame(self, frame:QueueFrame) -> AsyncGenerator[QueueFrame, None]:
# TODO: split up transcription by participant
if isinstance(frame, TextQueueFrame):
if self.complete_sentences:
self.sentence += frame.text
if self.sentence.endswith((".", "?", "!")):
self.messages.append({"role": self.role, "content": self.sentence})
self.sentence = ""
yield LLMMessagesQueueFrame(self.messages)
else:
self.messages.append({"role": self.role, "content": frame.text})
yield LLMMessagesQueueFrame(self.messages)
yield frame

View File

@@ -0,0 +1,41 @@
from enum import Enum
from dataclasses import dataclass
from typing import Any
class QueueFrame:
pass
class ControlQueueFrame(QueueFrame):
pass
class StartStreamQueueFrame(ControlQueueFrame):
pass
class EndStreamQueueFrame(ControlQueueFrame):
pass
@dataclass()
class AudioQueueFrame(QueueFrame):
data: bytes
@dataclass()
class ImageQueueFrame(QueueFrame):
url: str | None
image: bytes
@dataclass()
class TextQueueFrame(QueueFrame):
text: str
@dataclass()
class TranscriptionQueueFrame(TextQueueFrame):
participantId: str
timestamp: str
@dataclass()
class LLMMessagesQueueFrame(QueueFrame):
messages: list[dict[str,str]] # TODO: define this more concretely!
class AppMessageQueueFrame(QueueFrame):
message: Any
participantId: str

View File

@@ -1,2 +1,3 @@
Pillow==10.1.0
typing_extensions==4.9.0
faster-whisper==0.10.0

View File

@@ -1,56 +1,196 @@
import asyncio
import io
import logging
import wave
from dailyai.queue_frame import (
AudioQueueFrame,
ControlQueueFrame,
EndStreamQueueFrame,
ImageQueueFrame,
LLMMessagesQueueFrame,
QueueFrame,
TextQueueFrame,
)
from abc import abstractmethod
from typing import AsyncGenerator, AsyncIterable, BinaryIO, Iterable
from dataclasses import dataclass
from typing import Generator
from PIL import Image
class AIService:
def __init__(self):
self.logger = logging.getLogger("dailyai")
def close(self):
def stop(self):
pass
async def run_to_queue(self, queue: asyncio.Queue, frames, add_end_of_stream=False) -> None:
async for frame in self.run(frames):
await queue.put(frame)
if add_end_of_stream:
await queue.put(EndStreamQueueFrame())
async def run(
self,
frames: Iterable[QueueFrame]
| AsyncIterable[QueueFrame]
| asyncio.Queue[QueueFrame],
) -> AsyncGenerator[QueueFrame, None]:
try:
if isinstance(frames, AsyncIterable):
async for frame in frames:
async for output_frame in self.process_frame(frame):
yield output_frame
elif isinstance(frames, Iterable):
for frame in frames:
async for output_frame in self.process_frame(frame):
yield output_frame
elif isinstance(frames, asyncio.Queue):
while True:
frame = await frames.get()
async for output_frame in self.process_frame(frame):
yield output_frame
if isinstance(frame, EndStreamQueueFrame):
break
else:
raise Exception("Frames must be an iterable or async iterable")
async for output_frame in self.finalize():
yield output_frame
except Exception as e:
self.logger.error("Exception occurred while running AI service", e)
raise e
@abstractmethod
async def process_frame(self, frame:QueueFrame) -> AsyncGenerator[QueueFrame, None]:
if isinstance(frame, ControlQueueFrame):
yield frame
@abstractmethod
async def finalize(self) -> AsyncGenerator[QueueFrame, None]:
# This is a trick for the interpreter (and linter) to know that this is a generator.
if False:
yield QueueFrame()
class LLMService(AIService):
# Generate a set of responses to a prompt. Yields a list of responses.
@abstractmethod
def run_llm_async(
self, messages
) -> Generator[str, None, None]:
async def run_llm_async(self, messages) -> AsyncGenerator[str, None]:
yield ""
@abstractmethod
async def run_llm(self, messages) -> str:
pass
# Generate a responses to a prompt. Returns the response
@abstractmethod
def run_llm(
self, messages
) -> str or None:
pass
async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
if isinstance(frame, ControlQueueFrame):
yield frame
elif isinstance(frame, LLMMessagesQueueFrame):
async for text_chunk in self.run_llm_async(frame.messages):
yield TextQueueFrame(text_chunk)
class TTSService(AIService):
def __init__(self, aggregate_sentences=True):
super().__init__()
self.aggregate_sentences: bool = aggregate_sentences
self.current_sentence: str = ""
# Some TTS services require a specific sample rate. We default to 16k
def get_mic_sample_rate(self):
return 16000
# Converts the sentence to audio. Yields a list of audio frames that can
# Converts the text to audio. Yields a list of audio frames that can
# be sent to the microphone device
@abstractmethod
def run_tts(self, sentence) -> Generator[bytes, None, None]:
pass
async def run_tts(self, text) -> AsyncGenerator[bytes, None]:
# yield empty bytes here, so linting can infer what this method does
yield bytes()
async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
if isinstance(frame, ControlQueueFrame):
yield frame
return
elif not isinstance(frame, TextQueueFrame):
return
text: str | None = None
if not self.aggregate_sentences:
text = frame.text
else:
self.current_sentence += frame.text
if self.current_sentence.endswith((".", "?", "!")):
text = self.current_sentence
self.current_sentence = ""
if text:
async for audio_chunk in self.run_tts(text):
yield AudioQueueFrame(audio_chunk)
async def finalize(self):
if self.current_sentence:
async for audio_chunk in self.run_tts(self.current_sentence):
yield AudioQueueFrame(audio_chunk)
# Convenience function to send the audio for a sentence to the given queue
async def say(self, sentence, queue: asyncio.Queue):
await self.run_to_queue(queue, [TextQueueFrame(sentence)])
class ImageGenService(AIService):
def __init__(self, image_size, **kwargs):
super().__init__(**kwargs)
self.image_size = image_size
# Renders the image. Returns an Image object.
@abstractmethod
def run_image_gen(self, sentence) -> tuple[str, Image.Image]:
async def run_image_gen(self, sentence:str) -> tuple[str, bytes]:
pass
async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
if not isinstance(frame, TextQueueFrame):
yield frame
return
(url, image_data) = await self.run_image_gen(frame.text)
yield ImageQueueFrame(url, image_data)
class STTService(AIService):
"""STTService is a base class for speech-to-text services."""
_frame_rate: int
def __init__(self, frame_rate: int = 16000, **kwargs):
super().__init__(**kwargs)
self._frame_rate = frame_rate
@abstractmethod
async def run_stt(self, audio: BinaryIO) -> str:
"""Returns transcript as a string"""
pass
async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
"""Processes a frame of audio data, either buffering or transcribing it."""
if not isinstance(frame, AudioQueueFrame):
return
data = frame.data
content = io.BufferedRandom(io.BytesIO())
ww = wave.open(self._content, "wb")
ww.setnchannels(1)
ww.setsampwidth(2)
ww.setframerate(self._frame_rate)
ww.writeframesraw(data)
ww.close()
content.seek(0)
text = await self.run_stt(content)
yield TextQueueFrame(text)
@dataclass
class AIServiceConfig:
tts: TTSService
image: ImageGenService
llm: LLMService
stt: STTService

View File

@@ -1,11 +1,12 @@
import json
import aiohttp
import asyncio
import io
from openai import AzureOpenAI
import json
from openai import AsyncAzureOpenAI
import os
import requests
from typing import Generator
from collections.abc import AsyncGenerator
from dailyai.services.ai_services import LLMService, TTSService, ImageGenService
from PIL import Image
@@ -14,7 +15,10 @@ from PIL import Image
from azure.cognitiveservices.speech import SpeechSynthesizer, SpeechConfig, ResultReason, CancellationReason
class AzureTTSService(TTSService):
def __init__(self, speech_key=None, speech_region=None):
def __init__(
self, speech_key=None, speech_region=None, voice_name="en-US-SaraNeural"
):
super().__init__()
speech_key = speech_key or os.getenv("AZURE_SPEECH_SERVICE_KEY")
@@ -23,17 +27,19 @@ class AzureTTSService(TTSService):
self.speech_config = SpeechConfig(subscription=speech_key, region=speech_region)
self.speech_synthesizer = SpeechSynthesizer(speech_config=self.speech_config, audio_config=None)
def run_tts(self, sentence) -> Generator[bytes, None, None]:
self.voice_name = voice_name
async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
self.logger.info("Running azure tts")
ssml = "<speak version='1.0' xml:lang='en-US' xmlns='http://www.w3.org/2001/10/synthesis' " \
ssml = f"<speak version='1.0' xml:lang='en-US' xmlns='http://www.w3.org/2001/10/synthesis' " \
"xmlns:mstts='http://www.w3.org/2001/mstts'>" \
"<voice name='en-US-SaraNeural'>" \
f"<voice name={self.voice_name}>" \
"<mstts:silence type='Sentenceboundary' value='20ms' />" \
"<mstts:express-as style='lyrical' styledegree='2' role='SeniorFemale'>" \
"<prosody rate='1.05'>" \
f"{sentence}" \
"</prosody></mstts:express-as></voice></speak> "
result = self.speech_synthesizer.speak_ssml(ssml)
result = await asyncio.to_thread(self.speech_synthesizer.speak_ssml, (ssml))
self.logger.info("Got azure tts result")
if result.reason == ResultReason.SynthesizingAudioCompleted:
self.logger.info("Returning result")
@@ -49,76 +55,99 @@ class AzureLLMService(LLMService):
def __init__(self, api_key=None, azure_endpoint=None, api_version=None, model=None):
super().__init__()
api_key = api_key or os.getenv("AZURE_CHATGPT_KEY")
azure_endpoint = azure_endpoint or os.getenv("AZURE_CHATGPT_ENDPOINT")
if not azure_endpoint:
raise Exception("No azure endpoint specified for Azure LLM, please set AZURE_CHATGPT_ENDPOINT in the environment or pass it to the AzureLLMService constructor")
model: str | None = model or os.getenv("AZURE_CHATGPT_DEPLOYMENT_ID")
if not model:
raise Exception("No model specified for Azure LLM, please set AZURE_CHATGPT_DEPLOYMENT_ID in the environment or pass it to the AzureLLMService constructor")
self.model: str = model
api_version = api_version or "2023-12-01-preview"
self.client = AzureOpenAI(
self.client = AsyncAzureOpenAI(
api_key=api_key,
azure_endpoint=azure_endpoint,
api_version=api_version,
)
self.model = model or os.getenv("AZURE_CHATGPT_DEPLOYMENT_ID")
def get_response(self, messages, stream):
return self.client.chat.completions.create(
stream=stream,
messages=messages,
model=self.model,
)
def run_llm_async(self, messages) -> Generator[str, None, None]:
async def run_llm_async(self, messages) -> AsyncGenerator[str, None]:
messages_for_log = json.dumps(messages)
self.logger.debug(f"Generating chat via azure: {messages_for_log}")
response = self.get_response(messages, stream=True)
for chunk in response:
chunks = await self.client.chat.completions.create(model=self.model, stream=True, messages=messages)
async for chunk in chunks:
if len(chunk.choices) == 0:
continue
if chunk.choices[0].delta.content:
yield chunk.choices[0].delta.content
def run_llm(self, messages) -> str | None:
async def run_llm(self, messages) -> str | None:
messages_for_log = json.dumps(messages)
self.logger.debug(f"Generating chat via azure: {messages_for_log}")
response = self.get_response(messages, stream=False)
response = await self.client.chat.completions.create(model=self.model, stream=False, messages=messages)
if response and len(response.choices) > 0:
return response.choices[0].message.content
else:
return None
class AzureImageGenServiceREST(ImageGenService):
class AzureImageGenService(ImageGenService):
def __init__(self, api_key=None, azure_endpoint=None, api_version=None, model=None):
super().__init__()
api_key = api_key or os.getenv("AZURE_DALLE_KEY")
azure_endpoint = azure_endpoint or os.getenv("AZURE_DALLE_ENDPOINT")
api_version = api_version or "2023-12-01-preview"
def __init__(
self,
image_size: str,
api_key: str | None = None,
azure_endpoint: str | None = None,
api_version: str | None = None,
model: str | None = None,
aiohttp_session: aiohttp.ClientSession | None=None,
timeout_seconds=120,
):
super().__init__(image_size=image_size)
self.api_key = api_key or os.getenv("AZURE_DALLE_KEY")
self.azure_endpoint = azure_endpoint or os.getenv("AZURE_DALLE_ENDPOINT")
self.api_version = api_version or "2023-06-01-preview"
self.model = model or os.getenv("AZURE_DALLE_DEPLOYMENT_ID")
self.client = AzureOpenAI(
api_key=api_key,
azure_endpoint=azure_endpoint,
api_version=api_version,
self.aiohttp_session: aiohttp.ClientSession = (
aiohttp_session or aiohttp.ClientSession()
)
self.timeout_seconds = timeout_seconds
def run_image_gen(self, sentence) -> tuple[str, Image.Image]:
self.logger.info("Generating azure image", sentence)
async def run_image_gen(self, sentence) -> tuple[str, bytes]:
url = f"{self.azure_endpoint}openai/images/generations:submit?api-version={self.api_version}"
headers= { "api-key": self.api_key, "Content-Type": "application/json" }
body = {
"prompt": sentence,
"size": self.image_size,
"n": 1,
}
async with self.aiohttp_session.post(
url, headers=headers, json=body
) as submission:
operation_location = submission.headers['operation-location']
image = self.client.images.generate(
model=self.model,
prompt=sentence,
n=1,
size=f"1024x1024",
)
status = ""
attempts_left = self.timeout_seconds
json_response = None
while status != "succeeded":
attempts_left -= 1
if attempts_left == 0:
raise Exception("Image generation timed out")
url = image["data"][0]["url"]
response = requests.get(url)
await asyncio.sleep(1)
response = await self.aiohttp_session.get(operation_location, headers=headers)
json_response = await response.json()
status = json_response["status"]
dalle_stream = io.BytesIO(response.content)
dalle_im = Image.open(dalle_stream)
image_url = json_response["result"]["data"][0]["url"] if json_response else None
if not image_url:
raise Exception("Image generation failed")
return (url, dalle_im)
# Load the image from the url
async with self.aiohttp_session.get(image_url) as response:
image_stream = io.BytesIO(await response.content.read())
image = Image.open(image_stream)
return (image_url, image.tobytes())

View File

@@ -0,0 +1,411 @@
import asyncio
import inspect
import logging
import threading
import time
import types
from functools import partial
from queue import Queue, Empty
from typing import AsyncGenerator
from dailyai.queue_frame import (
AudioQueueFrame,
EndStreamQueueFrame,
ImageQueueFrame,
QueueFrame,
StartStreamQueueFrame,
TextQueueFrame,
TranscriptionQueueFrame,
)
from threading import Thread, Event
from daily import (
EventHandler,
CallClient,
Daily,
VirtualCameraDevice,
VirtualMicrophoneDevice,
VirtualSpeakerDevice,
)
class DailyTransportService(EventHandler):
_daily_initialized = False
_lock = threading.Lock()
speaker_enabled: bool
speaker_sample_rate: int
def __init__(
self,
room_url: str,
token: str | None,
bot_name: str,
duration: float = 10,
min_others_count: int = 1,
start_transcription: bool = True,
speaker_enabled: bool = False,
speaker_sample_rate: int = 16000,
):
super().__init__()
self.bot_name: str = bot_name
self.room_url: str = room_url
self.token: str | None = token
self.duration: float = duration
self.expiration = time.time() + duration * 60
self.min_others_count = min_others_count
self.start_transcription = start_transcription
# This queue is used to marshal frames from the async send queue to the thread that emits audio & video.
# We need this to maintain the asynchronous behavior of asyncio queues -- to give async functions
# a chance to run while waiting for queue items -- but also to maintain thread safety and have a threaded
# handler to send frames, to ensure that sending isn't subject to pauses in the async thread.
self.threadsafe_send_queue = Queue()
self.is_interrupted = Event()
self.stop_threads = Event()
self.story_started = False
self.mic_enabled = False
self.mic_sample_rate = 16000
self.camera_width = 1024
self.camera_height = 768
self.camera_enabled = False
self.speaker_enabled = speaker_enabled
self.speaker_sample_rate = speaker_sample_rate
self.send_queue = asyncio.Queue()
self.receive_queue = asyncio.Queue()
self.other_participant_has_joined = False
self.my_participant_id = None
self.camera_thread = None
self.frame_consumer_thread = None
self.transcription_settings = {
"language": "en",
"tier": "nova",
"model": "2-conversationalai",
"profanity_filter": True,
"redact": False,
"extra": {
"endpointing": True,
"punctuate": False,
},
}
self.logger: logging.Logger = logging.getLogger("dailyai")
self.event_handlers = {}
try:
self.loop = asyncio.get_running_loop()
except RuntimeError:
self.loop = None
def patch_method(self, event_name, *args, **kwargs):
try:
for handler in self.event_handlers[event_name]:
if inspect.iscoroutinefunction(handler):
if self.loop:
asyncio.run_coroutine_threadsafe(handler(*args, **kwargs), self.loop)
else:
raise Exception("No event loop to run coroutine. In order to use async event handlers, you must run the DailyTransportService in an asyncio event loop.")
else:
handler(*args, **kwargs)
except Exception as e:
self.logger.error(f"Exception in event handler {event_name}: {e}")
raise e
def add_event_handler(self, event_name: str, handler):
if not event_name.startswith("on_"):
raise Exception(f"Event handler {event_name} must start with 'on_'")
methods = inspect.getmembers(self, predicate=inspect.ismethod)
if event_name not in [method[0] for method in methods]:
raise Exception(f"Event handler {event_name} not found")
if not event_name in self.event_handlers:
self.event_handlers[event_name] = [getattr(self, event_name), types.MethodType(handler, self)]
setattr(self, event_name, partial(self.patch_method, event_name))
else:
self.event_handlers[event_name].append(types.MethodType(handler, self))
def event_handler(self, event_name: str):
def decorator(handler):
self.add_event_handler(event_name, handler)
return handler
return decorator
def configure_daily(self):
# Only initialize Daily once
if not DailyTransportService._daily_initialized:
with DailyTransportService._lock:
Daily.init()
DailyTransportService._daily_initialized = True
self.client = CallClient(event_handler=self)
if self.mic_enabled:
self.mic: VirtualMicrophoneDevice = Daily.create_microphone_device(
"mic", sample_rate=self.mic_sample_rate, channels=1
)
if self.camera_enabled:
self.camera: VirtualCameraDevice = Daily.create_camera_device(
"camera", width=self.camera_width, height=self.camera_height, color_format="RGB"
)
if self.speaker_enabled:
self.speaker: VirtualSpeakerDevice = Daily.create_speaker_device(
"speaker", sample_rate=self.speaker_sample_rate, channels=1
)
Daily.select_speaker_device("speaker")
self.image: bytes | None = None
self.camera_thread = Thread(target=self.run_camera, daemon=True)
self.camera_thread.start()
self.logger.info("Starting frame consumer thread")
self.frame_consumer_thread = Thread(target=self.frame_consumer, daemon=True)
self.frame_consumer_thread.start()
self.client.set_user_name(self.bot_name)
self.client.join(self.room_url, self.token, completion=self.call_joined)
self.my_participant_id = self.client.participants()["local"]["id"]
self.client.update_inputs(
{
"camera": {
"isEnabled": True,
"settings": {
"deviceId": "camera",
},
},
"microphone": {
"isEnabled": True,
"settings": {
"deviceId": "mic",
"customConstraints": {
"autoGainControl": {"exact": False},
"echoCancellation": {"exact": False},
"noiseSuppression": {"exact": False},
},
},
},
}
)
self.client.update_publishing(
{
"camera": {
"sendSettings": {
"maxQuality": "low",
"encodings": {
"low": {
"maxBitrate": 250000,
"scaleResolutionDownBy": 1.333,
"maxFramerate": 8,
}
},
}
}
}
)
if self.token and self.start_transcription:
self.client.start_transcription(self.transcription_settings)
def _receive_audio(self):
"""Receive audio from the Daily call and put it on the receive queue"""
seconds = 1
desired_frame_count = self.speaker_sample_rate * seconds
while True:
buffer = self.speaker.read_frames(desired_frame_count)
if len(buffer) > 0:
frame = AudioQueueFrame(buffer)
if self.loop:
asyncio.run_coroutine_threadsafe(self.receive_queue.put(frame), self.loop)
def interrupt(self):
self.is_interrupted.set()
async def get_receive_frames(self) -> AsyncGenerator[QueueFrame, None]:
while True:
frame = await self.receive_queue.get()
yield frame
if isinstance(frame, EndStreamQueueFrame):
break
def get_async_send_queue(self):
return self.send_queue
async def marshal_frames(self):
while True:
frame: QueueFrame | list = await self.send_queue.get()
self.threadsafe_send_queue.put(frame)
self.send_queue.task_done()
if isinstance(frame, EndStreamQueueFrame):
break
async def wait_for_send_queue_to_empty(self):
await self.send_queue.join()
self.threadsafe_send_queue.join()
async def stop_when_done(self):
await self.wait_for_send_queue_to_empty()
self.stop()
async def run(self) -> None:
self.configure_daily()
self.do_shutdown = False
async_output_queue_marshal_task = asyncio.create_task(self.marshal_frames())
try:
participant_count: int = len(self.client.participants())
self.logger.info(f"{participant_count} participants in room")
while time.time() < self.expiration and not self.do_shutdown and not self.stop_threads.is_set():
await asyncio.sleep(1)
except Exception as e:
self.logger.error(f"Exception {e}")
raise e
finally:
self.client.leave()
self.stop_threads.set()
await self.receive_queue.put(EndStreamQueueFrame())
await self.send_queue.put(EndStreamQueueFrame())
await async_output_queue_marshal_task
if self.camera_thread and self.camera_thread.is_alive():
self.camera_thread.join()
if self.frame_consumer_thread and self.frame_consumer_thread.is_alive():
self.frame_consumer_thread.join()
def stop(self):
self.stop_threads.set()
def on_first_other_participant_joined(self):
pass
def call_joined(self, join_data, client_error):
self.logger.info(f"Call_joined: {join_data}, {client_error}")
if self.speaker_enabled:
t = Thread(target=self._receive_audio, daemon=True)
t.start()
def on_error(self, error):
self.logger.error(f"on_error: {error}")
def on_call_state_updated(self, state):
pass
def on_participant_joined(self, participant):
if not self.other_participant_has_joined and participant["id"] != self.my_participant_id:
self.other_participant_has_joined = True
self.on_first_other_participant_joined()
def on_participant_left(self, participant, reason):
if len(self.client.participants()) < self.min_others_count + 1:
self.do_shutdown = True
pass
def on_app_message(self, message, sender):
pass
def on_transcription_message(self, message:dict):
if self.loop:
participantId = ""
if "participantId" in message:
participantId = message["participantId"]
elif "session_id" in message:
participantId = message["session_id"]
frame = TranscriptionQueueFrame(message["text"], participantId, message["timestamp"])
asyncio.run_coroutine_threadsafe(self.receive_queue.put(frame), self.loop)
def on_transcription_stopped(self, stopped_by, stopped_by_error):
pass
def on_transcription_error(self, message):
pass
def on_transcription_started(self, status):
pass
def set_image(self, image: bytes):
self.image: bytes | None = image
def run_camera(self):
try:
while not self.stop_threads.is_set():
if self.image:
self.camera.write_frame(self.image)
time.sleep(1.0 / 8) # 8 fps
except Exception as e:
self.logger.error(f"Exception {e} in camera thread.")
raise e
def frame_consumer(self):
self.logger.info("🎬 Starting frame consumer thread")
b = bytearray()
smallest_write_size = 3200
all_audio_frames = bytearray()
while True:
try:
frames_or_frame: QueueFrame | list[QueueFrame] = self.threadsafe_send_queue.get()
if isinstance(frames_or_frame, QueueFrame):
frames: list[QueueFrame] = [frames_or_frame]
elif isinstance(frames_or_frame, list):
frames: list[QueueFrame] = frames_or_frame
else:
raise Exception("Unknown type in output queue")
for frame in frames:
if isinstance(frame, EndStreamQueueFrame):
self.logger.info("Stopping frame consumer thread")
self.threadsafe_send_queue.task_done()
return
# if interrupted, we just pull frames off the queue and discard them
if not self.is_interrupted.is_set():
if frame:
if isinstance(frame, AudioQueueFrame):
chunk = frame.data
all_audio_frames.extend(chunk)
b.extend(chunk)
l = len(b) - (len(b) % smallest_write_size)
if l:
self.mic.write_frames(bytes(b[:l]))
b = b[l:]
elif isinstance(frame, ImageQueueFrame):
self.set_image(frame.image)
elif len(b):
self.mic.write_frames(bytes(b))
b = bytearray()
else:
# if there are leftover audio bytes, write them now; failing to do so
# can cause static in the audio stream.
if len(b):
self.mic.write_frames(bytes(b))
b = bytearray()
if isinstance(frame, StartStreamQueueFrame):
self.is_interrupted.clear()
self.threadsafe_send_queue.task_done()
except Empty:
try:
if len(b):
self.mic.write_frames(bytes(b))
except Exception as e:
self.logger.error(f"Exception in frame_consumer: {e}, {len(b)}")
raise e
b = bytearray()

View File

@@ -0,0 +1,29 @@
import aiohttp
import asyncio
import os
import requests
from collections.abc import AsyncGenerator
from dailyai.services.ai_services import TTSService
class DeepgramTTSService(TTSService):
def __init__(self, speech_key=None, voice=None):
super().__init__()
self.voice = voice or os.getenv("DEEPGRAM_VOICE") or "alpha-asteria-en-v2"
self.speech_key = speech_key or os.getenv("DEEPGRAM_API_KEY")
def get_mic_sample_rate(self):
return 24000
async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
self.logger.info(f"Running deepgram tts for {sentence}")
base_url = "https://api.beta.deepgram.com/v1/speak"
request_url = f"{base_url}?model={self.voice}&encoding=linear16&container=none&sample_rate=16000"
headers = {"authorization": f"token {self.speech_key}"}
body = { "text": sentence }
async with aiohttp.ClientSession() as session:
async with session.post(request_url, headers=headers, json=body) as r:
async for data in r.content:
yield data

View File

@@ -1,20 +1,22 @@
import aiohttp
import os
import requests
import time
from typing import Generator
from typing import AsyncGenerator
from ..services.ai_services import TTSService
from dailyai.services.ai_services import TTSService
class ElevenLabsTTSService(TTSService):
def __init__(self):
def __init__(self, api_key=None, voice_id=None, aiohttp_session:aiohttp.ClientSession=None):
super().__init__()
self.api_key = os.getenv("ELEVENLABS_API_KEY")
self.voice_id = os.getenv("ELEVENLABS_VOICE_ID")
self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
self.voice_id = voice_id or os.getenv("ELEVENLABS_VOICE_ID")
self.aiohttp_session = aiohttp_session or aiohttp.ClientSession()
def run_tts(self, sentence) -> Generator[bytes, None, None]:
async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
url = f"https://api.elevenlabs.io/v1/text-to-speech/{self.voice_id}/stream"
payload = {"text": sentence, "model_id": "eleven_turbo_v2"}
querystring = {"output_format": "pcm_16000", "optimize_streaming_latency": 2}
@@ -22,17 +24,15 @@ class ElevenLabsTTSService(TTSService):
"xi-api-key": self.api_key,
"Content-Type": "application/json",
}
async with self.aiohttp_session.post(
url, json=payload, headers=headers, params=querystring
) as r:
if r.status != 200:
self.logger.error(
f"audio fetch status code: {r.status}, error: {r.text}"
)
return
r = requests.request(
"POST", url, json=payload, headers=headers, params=querystring, stream=True
)
if r.status_code != 200:
self.logger.error(
f"audio fetch status code: {r.status_code}, error: {r.text}"
)
return
for chunk in r.iter_content(chunk_size=3200):
if chunk:
yield chunk
async for chunk in r.content:
if chunk:
yield chunk

View File

@@ -0,0 +1,49 @@
import fal
import aiohttp
import asyncio
import io
import json
from PIL import Image
from dailyai.services.ai_services import LLMService, TTSService, ImageGenService
# Fal expects FAL_KEY_ID and FAL_KEY_SECRET to be set in the env
class FalImageGenService(ImageGenService):
def __init__(self, image_size):
super().__init__(image_size)
async def run_image_gen(self, sentence) -> tuple[str, bytes]:
def get_image_url(sentence, size):
print("starting fal submit...")
handler = fal.apps.submit(
"110602490-fast-sdxl",
arguments={
"prompt": sentence
},
)
print("past fal handler init, about to wait for iter_events...")
for event in handler.iter_events():
if isinstance(event, fal.apps.InProgress):
print('Request in progress')
print(event.logs)
result = handler.get()
image_url = result["images"][0]["url"] if result else None
if not image_url:
raise Exception("Image generation failed")
return image_url
print("fetching image url...")
image_url = await asyncio.to_thread(get_image_url, sentence, self.image_size)
print("got image url, downloading image...")
# Load the image from the url
async with aiohttp.ClientSession() as session:
async with session.get(image_url) as response:
print("got image response")
image_stream = io.BytesIO(await response.content.read())
print("read image stream")
image = Image.open(image_stream)
return (image_url, image.tobytes())
# return (image_url, dalle_im.tobytes())

View File

@@ -0,0 +1,72 @@
import array
import io
import math
from typing import AsyncGenerator
import wave
from dailyai.queue_frame import AudioQueueFrame, QueueFrame, TextQueueFrame
from dailyai.services.ai_services import STTService
class LocalSTTService(STTService):
_content: io.BufferedRandom
_wave: wave.Wave_write
_current_silence_frames: int
# Configuration
_min_rms: int
_max_silence_frames: int
_frame_rate: int
def __init__(self,
min_rms: int = 400,
max_silence_frames: int = 3,
frame_rate: int = 16000,
**kwargs):
super().__init__(frame_rate, **kwargs)
self._current_silence_frames = 0
self._min_rms = min_rms
self._max_silence_frames = max_silence_frames
self._frame_rate = frame_rate
self._new_wave()
def _new_wave(self):
"""Creates a new wave object and content buffer."""
self._content = io.BufferedRandom(io.BytesIO())
ww = wave.open(self._content, "wb")
ww.setnchannels(1)
ww.setsampwidth(2)
ww.setframerate(self._frame_rate)
self._wave = ww
async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
"""Processes a frame of audio data, either buffering or transcribing it."""
if not isinstance(frame, AudioQueueFrame):
return
data = frame.data
# Try to filter out empty background noise
# (Very rudimentary approach, can be improved)
rms = self._get_volume(data)
if rms >= self._min_rms:
# If volume is high enough, write new data to wave file
self._wave.writeframesraw(data)
# If buffer is not empty and we detect a 3-frame pause in speech,
# transcribe the audio gathered so far.
if self._content.tell() > 0 and self._current_silence_frames > self._max_silence_frames:
self._current_silence_frames = 0
self._wave.close()
self._content.seek(0)
text = await self.run_stt(self._content)
self._new_wave()
yield TextQueueFrame(text)
# If we get this far, this is a frame of silence
self._current_silence_frames += 1
def _get_volume(self, audio: bytes) -> float:
# https://docs.python.org/3/library/array.html
audio_array = array.array('h', audio)
squares = [sample**2 for sample in audio_array]
mean = sum(squares) / len(audio_array)
rms = math.sqrt(mean)
return rms

View File

@@ -1,67 +0,0 @@
from dailyai.services.ai_services import AIService, TTSService, LLMService, ImageGenService
from typing import Generator
import requests
from PIL import Image
import io
from openai import OpenAI
import os
import json
class OpenAILLMService(LLMService):
def __init__(self, api_key=None, model=None):
super().__init__()
api_key = api_key or os.getenv("OPEN_AI_KEY")
self.model = model or os.getenv("OPEN_AI_MODEL")
self.client = OpenAI(api_key=api_key)
def get_response(self, messages, stream):
return self.client.chat.completions.create(
stream=stream,
messages=messages,
model=self.model
)
def run_llm_async(self, messages) -> Generator[str, None, None]:
messages_for_log = json.dumps(messages)
self.logger.debug(f"Generating chat via openai: {messages_for_log}")
response = self.get_response(messages, stream=True)
for chunk in response:
if len(chunk.choices) == 0:
continue
if chunk.choices[0].delta.content:
yield chunk.choices[0].delta.content
def run_llm(self, messages) -> str | None:
messages_for_log = json.dumps(messages)
self.logger.debug(f"Generating chat via azure: {messages_for_log}")
response = self.get_response(messages, stream=False)
if response and len(response.choices) > 0:
return response.choices[0].message.content
else:
return None
class OpenAIImageGenService(ImageGenService):
def __init__(self, api_key=None, model=None):
super().__init__()
api_key = api_key or os.getenv("OPEN_AI_KEY")
self.model = model or os.getenv("OPEN_AI_MODEL")
self.client = OpenAI(api_key=api_key)
def run_image_gen(self, sentence) -> tuple[str, Image.Image]:
image = self.client.images.generate(
prompt=sentence,
n=1,
size=f"1024x1024"
)
image_url = image.data[0].url
response = requests.get(image_url)
dalle_stream = io.BytesIO(response.content)
dalle_im = Image.open(dalle_stream)
return (image_url, dalle_im)

View File

@@ -0,0 +1,82 @@
import aiohttp
from PIL import Image
import io
from openai import AsyncOpenAI
import os
import json
from collections.abc import AsyncGenerator
from dailyai.services.ai_services import LLMService, ImageGenService
class OpenAILLMService(LLMService):
def __init__(self, api_key=None, model=None):
super().__init__()
api_key = api_key or os.getenv("OPEN_AI_KEY")
self.model = model or os.getenv("OPEN_AI_LLM_MODEL") or "gpt-4"
self.client = AsyncOpenAI(api_key=api_key)
async def get_response(self, messages, stream):
return await self.client.chat.completions.create(
stream=stream,
messages=messages,
model=self.model
)
async def run_llm_async(self, messages) -> AsyncGenerator[str, None]:
messages_for_log = json.dumps(messages)
self.logger.debug(f"Generating chat via openai: {messages_for_log}")
response = await self.get_response(messages, stream=True)
for chunk in response:
if len(chunk.choices) == 0:
continue
if chunk.choices[0].delta.content:
yield chunk.choices[0].delta.content
async def run_llm(self, messages) -> str | None:
messages_for_log = json.dumps(messages)
self.logger.debug(f"Generating chat via openai: {messages_for_log}")
response = await self.get_response(messages, stream=False)
if response and len(response.choices) > 0:
return response.choices[0].message.content
else:
return None
class OpenAIImageGenService(ImageGenService):
def __init__(
self,
image_size: str,
api_key=None,
model=None,
aiohttp_session: aiohttp.ClientSession | None = None,
):
super().__init__(image_size=image_size)
api_key = api_key or os.getenv("OPEN_AI_KEY")
self.model = model or os.getenv("OPEN_AI_IMAGE_MODEL") or "dall-e-3"
self.client = AsyncOpenAI(api_key=api_key)
self.aiohttp_session=aiohttp_session or aiohttp.ClientSession()
async def run_image_gen(self, sentence) -> tuple[str, bytes]:
self.logger.info("Generating OpenAI image", sentence)
image = await self.client.images.generate(
prompt=sentence,
model=self.model,
n=1,
size=self.image_size
)
image_url = image.data[0].url
if not image_url:
raise Exception("No image provided in response", image)
# Load the image from the url
async with self.aiohttp_session.get(image_url) as response:
image_stream = io.BytesIO(await response.content.read())
image = Image.open(image_stream)
return (image_url, image.tobytes())

View File

@@ -20,7 +20,6 @@ class GoogleAIService(AIService):
)
def run_tts(self, sentence):
print("running google tts")
synthesis_input = texttospeech.SynthesisInput(text = sentence.strip())
result = self.client.synthesize_speech(input=synthesis_input, voice=self.voice, audio_config=self.audio_config)
return result

View File

@@ -13,7 +13,6 @@ class HuggingFaceAIService(AIService):
# available models at https://huggingface.co/Helsinki-NLP (**not all models use 2-character language codes**)
def run_text_translation(self, sentence, source_language, target_language):
translator = pipeline(f"translation", model=f"Helsinki-NLP/opus-mt-{source_language}-{target_language}")
print(translator(sentence))
return translator(sentence)[0]["translation_text"]

View File

@@ -0,0 +1,55 @@
"""This module implements Whisper transcription with a locally-downloaded model."""
import asyncio
from enum import Enum
import logging
from typing import BinaryIO
from faster_whisper import WhisperModel
from dailyai.services.local_stt_service import LocalSTTService
class Model(Enum):
"""Class of basic Whisper model selection options"""
TINY = "tiny"
BASE = "base"
MEDIUM = "medium"
LARGE = "large-v3"
DISTIL_LARGE_V2 = "Systran/faster-distil-whisper-large-v2"
DISTIL_MEDIUM_EN = "Systran/faster-distil-whisper-medium.en"
class WhisperSTTService(LocalSTTService):
"""Class to transcribe audio with a locally-downloaded Whisper model"""
_model: WhisperModel
# Model configuration
_model_name: Model
_device: str
_compute_type: str
def __init__(self, model_name: Model = Model.DISTIL_MEDIUM_EN,
device: str = "auto",
compute_type: str = "default"):
super().__init__()
self.logger: logging.Logger = logging.getLogger("dailyai")
self._model_name = model_name
self._device = device
self._compute_type = compute_type
self._load()
def _load(self):
"""Loads the Whisper model. Note that if this is the first time
this model is being run, it will take time to download."""
model = WhisperModel(
self._model_name.value,
device=self._device,
compute_type=self._compute_type)
self._model = model
async def run_stt(self, audio: BinaryIO = None) -> str:
"""Transcribes given audio using Whisper"""
segments, _ = await asyncio.to_thread(self._model.transcribe, audio)
res: str = ""
for segment in segments:
res += f"{segment.text} "
return res

View File

@@ -0,0 +1,48 @@
from re import A
import unittest
from typing import AsyncGenerator, Generator
from dailyai.services.ai_services import AIService
from dailyai.queue_frame import EndStreamQueueFrame, QueueFrame, TextQueueFrame
class SimpleAIService(AIService):
async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
yield frame
class TestBaseAIService(unittest.IsolatedAsyncioTestCase):
async def test_async_input(self):
service = SimpleAIService()
input_frames = [
TextQueueFrame("hello"),
EndStreamQueueFrame()
]
async def iterate_frames() -> AsyncGenerator[QueueFrame, None]:
for frame in input_frames:
yield frame
output_frames = []
async for frame in service.run(iterate_frames()):
output_frames.append(frame)
self.assertEqual(input_frames, output_frames)
async def test_nonasync_input(self):
service = SimpleAIService()
input_frames = [TextQueueFrame("hello"), EndStreamQueueFrame()]
def iterate_frames() -> Generator[QueueFrame, None, None]:
for frame in input_frames:
yield frame
output_frames = []
async for frame in service.run(iterate_frames()):
output_frames.append(frame)
self.assertEqual(input_frames, output_frames)
if __name__ == "__main__":
unittest.main()

View File

@@ -1,179 +0,0 @@
import time
import unittest
from queue import Queue, Empty
from threading import Thread, Event
from typing import Generator
from dailyai.async_processor.async_processor import (
AsyncProcessor,
AsyncProcessorState,
LLMResponse,
)
from dailyai.message_handler.message_handler import MessageHandler
from dailyai.output_queue import OutputQueueFrame, FrameType
from dailyai.services.ai_services import (
AIServiceConfig,
ImageGenService,
LLMService,
TTSService,
)
class MockTTSService(TTSService):
def run_tts(self, sentence):
for word in sentence.split(' '):
time.sleep(0.1)
yield bytes(word, "utf-8")
class MockLLMService(LLMService):
def run_llm_async(self, messages) -> Generator[str, None, None]:
for i in ["Hello ", "there.", "How are ", "you?", "I ", "hope ", "you ", "are ", "well."]:
time.sleep(0.1)
yield i
class MockImageService(ImageGenService):
def run_image_gen(self, sentence) -> None:
return None
class TestResponse(unittest.TestCase):
def test_base_state_transitions(self):
mock_tts_service = MockTTSService()
mock_llm_service = MockLLMService()
mock_image_service = MockImageService()
processor = AsyncProcessor(AIServiceConfig(tts=mock_tts_service, llm=mock_llm_service, image=mock_image_service))
processor.prepare()
processor.play()
processor.finalize()
self.assertEqual(processor.state, AsyncProcessorState.FINALIZED)
def test_state_transitions(self):
output_queue = Queue()
mock_tts_service = MockTTSService()
mock_llm_service = MockLLMService()
mock_image_service = MockImageService()
message_handler = MessageHandler("Hello World")
processor = LLMResponse(
AIServiceConfig(
tts=mock_tts_service, llm=mock_llm_service, image=mock_image_service
),
message_handler,
output_queue,
)
processor.prepare()
processor.play()
# Consume the output from the output queue. It's necessary to mark these tasks as done for the
# play function to return.
expected_words = ["Hello", "there.", "How", "are", "you?", "I", "hope", "you", "are", "well."]
# remove the "start_stream" message from the queue
output_queue.get()
output_queue.task_done()
while expected_words:
actual_word:OutputQueueFrame = output_queue.get()
word = expected_words.pop(0)
self.assertEqual(actual_word.frame_type, FrameType.AUDIO_FRAME)
self.assertEqual(actual_word.frame_data, bytes(word, "utf-8"))
output_queue.task_done()
processor.finalize()
self.assertEqual(processor.state, AsyncProcessorState.FINALIZED)
def test_interrupt_preparation(self):
output_queue = Queue()
mock_tts_service = MockTTSService()
mock_llm_service = MockLLMService()
mock_image_service = MockImageService()
message_handler = MessageHandler("System Message")
processor = LLMResponse(
AIServiceConfig(
tts=mock_tts_service, llm=mock_llm_service, image=mock_image_service
),
message_handler,
output_queue,
)
processor.prepare()
interrupt_request_at = time.perf_counter()
processor.interrupt()
processor.finalize()
finalized_at = time.perf_counter()
self.assertTrue(0.1 < finalized_at - interrupt_request_at < 0.2)
print(f"delta: {interrupt_request_at, finalized_at}")
self.assertEqual(processor.state, AsyncProcessorState.FINALIZED)
def test_interrupt_play(self):
output_queue = Queue()
mock_tts_service = MockTTSService()
mock_llm_service = MockLLMService()
mock_image_service = MockImageService()
message_handler = MessageHandler("System Message")
processor = LLMResponse(
AIServiceConfig(
tts=mock_tts_service, llm=mock_llm_service, image=mock_image_service
),
message_handler,
output_queue,
)
processor.prepare()
processor.play()
stop_processing_output_queue = Event()
def process_output_queue_async():
# Consume the output from the output queue. It's necessary to mark these tasks as done for the
# play function to return.
time.sleep(0.1)
expected_words = ["Hello", "there.", "How", "are", "you?", "I", "hope", "you", "are", "well."]
while expected_words and not stop_processing_output_queue.is_set():
try:
actual_word:OutputQueueFrame = output_queue.get_nowait()
if actual_word.frame_type == FrameType.AUDIO_FRAME:
time.sleep(0.1)
word = expected_words.pop(0)
self.assertEqual(actual_word.frame_type, FrameType.AUDIO_FRAME)
self.assertEqual(actual_word.frame_data, bytes(word, "utf-8"))
output_queue.task_done()
except Empty:
pass
process_output_queue = Thread(target=process_output_queue_async, daemon=True)
process_output_queue.start()
time.sleep(0.5)
processor.interrupt()
stop_processing_output_queue.set()
process_output_queue.join()
processor.finalize()
self.assertEqual(processor.state, AsyncProcessorState.FINALIZED)
def test_statechange_callback(self):
mock_tts_service = MockTTSService()
mock_llm_service = MockLLMService()
mock_image_service = MockImageService()
processor = AsyncProcessor(
AIServiceConfig(
tts=mock_tts_service, llm=mock_llm_service, image=mock_image_service
)
)
is_finalized = False
def set_is_finalized(async_processor:AsyncProcessor):
nonlocal is_finalized
is_finalized = True
processor.set_state_callback(
AsyncProcessorState.FINALIZED, set_is_finalized
)
processor.prepare()
self.assertFalse(is_finalized)
processor.play()
self.assertFalse(is_finalized)
processor.finalize()
self.assertTrue(is_finalized)
self.assertEqual(processor.state, AsyncProcessorState.FINALIZED)
if __name__ == '__main__':
unittest.main()

View File

@@ -1,147 +0,0 @@
import time
import unittest
from unittest.mock import MagicMock, call
from dailyai.message_handler.message_handler import MessageHandler, IndexingMessageHandler
from dailyai.services.ai_services import (
AIServiceConfig,
TTSService,
LLMService,
ImageGenService,
)
from ..storage.search import SearchIndexer
class TestMessageHandler(unittest.TestCase):
def test_simple_intro(self):
message_handler = MessageHandler("Hello world")
self.assertEqual(
message_handler.get_llm_messages(),
[{"role": "system", "content": "Hello world"}],
)
def test_simple_user_message(self):
message_handler = MessageHandler("System prompt")
message_handler.add_user_message("User message")
self.assertEqual(
message_handler.get_llm_messages(),
[
{"role": "system", "content": "System prompt"},
{"role": "user", "content": "User message"},
],
)
def test_simple_user_and_assistant_message(self):
message_handler = MessageHandler("System prompt")
message_handler.add_user_message("User message")
message_handler.add_assistant_message("Assistant message")
self.assertEqual(
message_handler.get_llm_messages(),
[
{"role": "system", "content": "System prompt"},
{"role": "user", "content": "User message"},
{"role": "assistant", "content": "Assistant message"},
],
)
def test_user_message_overwrite(self):
message_handler = MessageHandler("System prompt")
message_handler.add_user_message("User message")
message_handler.add_assistant_message("Assistant message")
message_handler.add_user_message("plus something else")
self.assertEqual(
message_handler.get_llm_messages(),
[
{"role": "system", "content": "System prompt"},
{"role": "user", "content": "User message plus something else"},
],
)
def test_user_message_after_assistant(self):
message_handler = MessageHandler("System prompt")
message_handler.add_user_message("User message")
message_handler.add_assistant_message("Assistant message")
message_handler.finalize_user_message()
message_handler.add_user_message("other user message")
self.assertEqual(
message_handler.get_llm_messages(),
[
{"role": "system", "content": "System prompt"},
{"role": "user", "content": "User message"},
{"role": "assistant", "content": "Assistant message"},
{"role": "user", "content": "other user message"},
],
)
class MockTTSService(TTSService):
def run_tts(self, sentence):
for word in sentence.split(" "):
time.sleep(0.1)
yield bytes(word, "utf-8")
class MockLLMService(LLMService):
def run_llm(self, messages) -> str:
return "Parsed user message."
class MockImageService(ImageGenService):
def run_image_gen(self, sentence) -> None:
return None
class TestStorageMessageHandler(unittest.TestCase):
def test_user_message_finalized(self):
mock_tts_service = MockTTSService()
mock_llm_service = MockLLMService()
mock_image_service = MockImageService()
service_config = AIServiceConfig(
tts=mock_tts_service, llm=mock_llm_service, image=mock_image_service
)
mock_indexer = MagicMock(spec=SearchIndexer)
message_handler = IndexingMessageHandler(
"Hello world", service_config, mock_indexer
)
message_handler.cleanup_user_message = MagicMock(return_value="Parsed user message.")
message_handler.add_user_message("User message")
message_handler.add_assistant_message("Assistant message will be ignored")
message_handler.add_user_message("plus something else")
message_handler.finalize_user_message()
message_handler.add_assistant_message(
"New assistant message will not be ignored"
)
message_handler.add_user_message("User message second time")
message_handler.add_assistant_message("Assistant message second time")
message_handler.write_messages_to_storage()
time.sleep(0.5)
message_handler.cleanup_user_message.assert_called_with("User message plus something else")
self.assertEqual(
mock_indexer.mock_calls,
[
call.index_text('"Parsed user message."'),
call.index_text("New assistant message will not be ignored"),
],
)
mock_indexer.reset_mock()
message_handler.finalize_user_message()
time.sleep(0.5)
self.assertEqual(
mock_indexer.mock_calls,
[
call.index_text('"Parsed user message."'),
call.index_text("Assistant message second time"),
],
)
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1 @@
These samples need to be updated! Don't rely on them.

View File

@@ -13,16 +13,18 @@ from dailyai.orchestrator import OrchestratorConfig, Orchestrator
from dailyai.message_handler.message_handler import MessageHandler
from dailyai.services.ai_services import AIServiceConfig
from dailyai.services.azure_ai_services import AzureImageGenService, AzureTTSService, AzureLLMService
from dailyai.services.deepgram_ai_services import DeepgramTTSService
def add_bot_to_room(room_url, token, expiration) -> None:
# A simple prompt for a simple sample.
message_handler = MessageHandler(
"""
You are a sample bot, meant to demonstrate how to use an LLM with transcription at TTS.
You are a sample bot in a WebRTC session. You'll receive input as transcriptions of user's
speech, and your responses will be converted to audio via a TTS service.
Answer user's questions and be friendly, and if you can, give some ideas about how someone
could use a bot like you in a more in-depth way. Because your responses will be spoken,
try to keep them short and sweet.
try to keep them short.
"""
)
@@ -33,12 +35,6 @@ def add_bot_to_room(room_url, token, expiration) -> None:
# - AZURE_CHATGPT_KEY
# - AZURE_CHATGPT_ENDPOINT
# - AZURE_CHATGPT_DEPLOYMENT_ID
#
# This demo doesn't use image generation, but if you extend it to do so,
# you'll also need to set:
# - AZURE_DALLE_KEY
# - AZURE_DALLE_ENDPOINT
# - AZURE_DALLE_DEPLOYMENT_ID
services = AIServiceConfig(
tts=AzureTTSService(), image=None, llm=AzureLLMService()

View File

@@ -15,7 +15,7 @@ from dailyai.async_processor.async_processor import (
OrchestratorResponse
)
from dailyai.orchestrator import OrchestratorConfig, Orchestrator
from dailyai.output_queue import OutputQueueFrame, FrameType
from dailyai.queue_frame import QueueFrame, FrameType
from dailyai.message_handler.message_handler import MessageHandler
from dailyai.services.ai_services import AIServiceConfig
from dailyai.services.azure_ai_services import AzureImageGenService, AzureTTSService, AzureLLMService
@@ -40,7 +40,7 @@ class StaticSpriteResponse(OrchestratorResponse):
self.image_bytes = img.tobytes()
def do_play(self) -> None:
self.output_queue.put(OutputQueueFrame(FrameType.IMAGE_FRAME, self.image_bytes))
self.output_queue.put(QueueFrame(FrameType.IMAGE, self.image_bytes))
class IntroSpriteResponse(StaticSpriteResponse):
@@ -71,10 +71,10 @@ class AnimatedSpriteLLMResponse(LLMResponse):
with Image.open(full_path) as img:
self.image_bytes.append(img.tobytes())
def get_frames_from_tts_response(self, audio_frame) -> list[OutputQueueFrame]:
def get_frames_from_tts_response(self, audio_frame) -> list[QueueFrame]:
return [
OutputQueueFrame(FrameType.AUDIO_FRAME, audio_frame),
OutputQueueFrame(FrameType.IMAGE_FRAME, random.choice(self.image_bytes))
QueueFrame(FrameType.AUDIO, audio_frame),
QueueFrame(FrameType.IMAGE, random.choice(self.image_bytes))
]
@@ -83,10 +83,11 @@ def add_bot_to_room(room_url, token, expiration) -> None:
# A simple prompt for a simple sample.
message_handler = MessageHandler(
"""
You are a sample bot, meant to demonstrate how to use an LLM with transcription at TTS.
You are a sample bot in a WebRTC session. You'll receive input as transcriptions of user's
speech, and your responses will be converted to audio via a TTS service.
Answer user's questions and be friendly, and if you can, give some ideas about how someone
could use a bot like you in a more in-depth way. Because your responses will be spoken,
try to keep them short and sweet.
try to keep them short.
"""
)

View File

Before

Width:  |  Height:  |  Size: 871 KiB

After

Width:  |  Height:  |  Size: 871 KiB

View File

Before

Width:  |  Height:  |  Size: 870 KiB

After

Width:  |  Height:  |  Size: 870 KiB

View File

Before

Width:  |  Height:  |  Size: 871 KiB

After

Width:  |  Height:  |  Size: 871 KiB

View File

Before

Width:  |  Height:  |  Size: 868 KiB

After

Width:  |  Height:  |  Size: 868 KiB

View File

@@ -0,0 +1,51 @@
import argparse
import asyncio
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
async def main(room_url):
# create a transport service object using environment variables for
# the transport service's API key, room url, and any other configuration.
# services can all define and document the environment variables they use.
# services all also take an optional config object that is used instead of
# environment variables.
#
# the abstract transport service APIs presumably can map pretty closely
# to the daily-python basic API
meeting_duration_minutes = 1
transport = DailyTransportService(
room_url,
None,
"Say One Thing",
meeting_duration_minutes,
)
transport.mic_enabled = True
tts = ElevenLabsTTSService(voice_id="ErXwobaYiN019PkySvjV")
# Register an event handler so we can play the audio when the participant joins.
@transport.event_handler("on_participant_joined")
async def on_participant_joined(transport, participant):
if participant["info"]["isLocal"]:
return
await tts.say(
"Hello there, " + participant["info"]["userName"] + "!",
transport.send_queue,
)
# wait for the output queue to be empty, then leave the meeting
await transport.stop_when_done()
await transport.run()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
parser.add_argument(
"-u", "--url", type=str, required=True, help="URL of the Daily room to join"
)
args, unknown = parser.parse_known_args()
asyncio.run(main(args.url))

View File

@@ -0,0 +1,55 @@
import asyncio
import time
from typing import AsyncGenerator
from dailyai.queue_frame import QueueFrame, FrameType
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.azure_ai_services import AzureTTSService
from dailyai.services.deepgram_ai_services import DeepgramTTSService
async def main(room_url):
# create a transport service object using environment variables for
# the transport service's API key, room url, and any other configuration.
# services can all define and document the environment variables they use.
# services all also take an optional config object that is used instead of
# environment variables.
#
# the abstract transport service APIs presumably can map pretty closely
# to the daily-python basic API
meeting_duration_minutes = 1
transport = DailyTransportService(
room_url,
None,
"Greeter",
meeting_duration_minutes,
)
transport.mic_enabled = True
# similarly, create a tts service
tts = DeepgramTTSService()
# Get the generator for the audio. This will start running in the background,
# and when we ask the generator for its items, we'll get what it's generated.
# Register an event handler so we can play the audio when the participant joins.
print("settting up handler")
@transport.event_handler("on_participant_joined")
async def on_participant_joined(transport, participant):
print(f"participant joined: {participant['info']['userName']}")
if participant["info"]["isLocal"]:
return
audio_generator: AsyncGenerator[bytes, None] = tts.run_tts(f"Hello there, {participant['info']['userName']}!")
async for audio in audio_generator:
transport.output_queue.put(QueueFrame(FrameType.AUDIO, audio))
print("setting up call state handler")
@transport.event_handler("on_call_state_updated")
async def on_call_joined(transport, state):
print(f"call state callback: {state}")
await transport.run()
if __name__ == "__main__":
asyncio.run(main("https://chad-hq.daily.co/howdy"))

View File

@@ -0,0 +1,48 @@
import argparse
import asyncio
from dailyai.queue_frame import LLMMessagesQueueFrame
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.azure_ai_services import AzureLLMService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
async def main(room_url):
meeting_duration_minutes = 1
transport = DailyTransportService(
room_url,
None,
"Say One Thing From an LLM",
meeting_duration_minutes,
)
transport.mic_enabled = True
tts = ElevenLabsTTSService(voice_id="29vD33N1CtxCmqQRPOHJ")
llm = AzureLLMService()
messages = [{
"role": "system",
"content": "You are an LLM in a WebRTC session, and this is a 'hello world' demo. Say hello to the world."
}]
tts_task = asyncio.create_task(
tts.run_to_queue(
transport.send_queue,
llm.run([LLMMessagesQueueFrame(messages)]),
)
)
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
await tts_task
await transport.stop_when_done()
await transport.run()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
parser.add_argument(
"-u", "--url", type=str, required=True, help="URL of the Daily room to join"
)
args, unknown = parser.parse_known_args()
asyncio.run(main(args.url))

View File

@@ -0,0 +1,45 @@
import argparse
import asyncio
from dailyai.queue_frame import TextQueueFrame
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.open_ai_services import OpenAIImageGenService
from dailyai.services.azure_ai_services import AzureImageGenServiceREST
local_joined = False
participant_joined = False
async def main(room_url):
meeting_duration_minutes = 1
transport = DailyTransportService(
room_url,
None,
"Show a still frame image",
meeting_duration_minutes,
)
transport.mic_enabled = False
transport.camera_enabled = True
transport.camera_width = 1024
transport.camera_height = 1024
imagegen = AzureImageGenServiceREST(image_size="1024x1024")
image_task = asyncio.create_task(
imagegen.run_to_queue(transport.send_queue, [TextQueueFrame("a cat in the style of picasso")])
)
@transport.event_handler("on_participant_joined")
async def on_participant_joined(transport, participant):
await image_task
await transport.run()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
parser.add_argument(
"-u", "--url", type=str, required=True, help="URL of the Daily room to join"
)
args, unknown = parser.parse_known_args()
asyncio.run(main(args.url))

View File

@@ -0,0 +1,73 @@
import argparse
import asyncio
import re
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
from dailyai.queue_frame import EndStreamQueueFrame, LLMMessagesQueueFrame
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
async def main(room_url:str):
global transport
global llm
global tts
transport = DailyTransportService(
room_url,
None,
"Say Two Things Bot",
1,
)
transport.mic_enabled = True
transport.mic_sample_rate = 16000
transport.camera_enabled = False
llm = AzureLLMService()
azure_tts = AzureTTSService()
elevenlabs_tts = ElevenLabsTTSService(voice_id="ErXwobaYiN019PkySvjV")
messages = [{"role": "system", "content": "tell the user a joke about llamas"}]
# Start a task to run the LLM to create a joke, and convert the LLM output to audio frames. This task
# will run in parallel with generating and speaking the audio for static text, so there's no delay to
# speak the LLM response.
buffer_queue = asyncio.Queue()
llm_response_task = asyncio.create_task(
elevenlabs_tts.run_to_queue(
buffer_queue,
llm.run([LLMMessagesQueueFrame(messages)]),
True,
)
)
@transport.event_handler("on_participant_joined")
async def on_joined(transport, participant):
if participant["id"] == transport.my_participant_id:
return
await azure_tts.say("My friend the LLM is now going to tell a joke about llamas.", transport.send_queue)
async def buffer_to_send_queue():
while True:
frame = await buffer_queue.get()
await transport.send_queue.put(frame)
buffer_queue.task_done()
if isinstance(frame, EndStreamQueueFrame):
break
await asyncio.gather(llm_response_task, buffer_to_send_queue())
await transport.stop_when_done()
await transport.run()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
parser.add_argument(
"-u", "--url", type=str, required=True, help="URL of the Daily room to join"
)
args, unknown = parser.parse_known_args()
asyncio.run(main(args.url))

View File

@@ -0,0 +1,109 @@
import argparse
import asyncio
from dailyai.queue_frame import AudioQueueFrame, ImageQueueFrame
from dailyai.services.azure_ai_services import AzureImageGenServiceREST, AzureLLMService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.fal_ai_services import FalImageGenService
async def main(room_url):
meeting_duration_minutes = 5
transport = DailyTransportService(
room_url,
None,
"Month Narration Bot",
meeting_duration_minutes,
)
transport.mic_enabled = True
transport.camera_enabled = True
transport.mic_sample_rate = 16000
transport.camera_width = 1024
transport.camera_height = 1024
llm = AzureLLMService()
#dalle = FalImageGenService(image_size="1024x1024")
tts = ElevenLabsTTSService(voice_id="ErXwobaYiN019PkySvjV")
dalle = AzureImageGenServiceREST(image_size="1024x1024")
# Get a complete audio chunk from the given text. Splitting this into its own
# coroutine lets us ensure proper ordering of the audio chunks on the send queue.
async def get_all_audio(text):
all_audio = bytearray()
async for audio in tts.run_tts(text):
all_audio.extend(audio)
return all_audio
async def get_month_data(month):
messages = [
{
"role": "system",
"content": f"Describe a nature photograph suitable for use in a calendar, for the month of {month}. Include only the image description with no preamble. Limit the description to one sentence, please.",
}
]
image_description = await llm.run_llm(messages)
if not image_description:
return
to_speak = f"{month}: {image_description}"
audio_task = asyncio.create_task(get_all_audio(to_speak))
image_task = asyncio.create_task(dalle.run_image_gen(image_description))
(audio, image_data) = await asyncio.gather(
audio_task, image_task
)
return {
"month": month,
"text": image_description,
"image_url": image_data[0],
"image": image_data[1],
"audio": audio,
}
months: list[str] = [
"January",
"February",
"March",
"April",
"May",
"June",
"July",
"August",
"September",
"October",
"November",
"December",
]
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
# This will play the months in the order they're completed. The benefit
# is we'll have as little delay as possible before the first month, and
# likely no delay between months, but the months won't display in order.
for month_data_task in asyncio.as_completed(month_tasks):
data = await month_data_task
await transport.send_queue.put(
[
ImageQueueFrame(data["image_url"], data["image"]),
AudioQueueFrame(data["audio"]),
]
)
# wait for the output queue to be empty, then leave the meeting
await transport.stop_when_done()
month_tasks = [asyncio.create_task(get_month_data(month)) for month in months]
await transport.run()
if __name__=="__main__":
parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
parser.add_argument(
"-u", "--url", type=str, required=True, help="URL of the Daily room to join"
)
args, unknown = parser.parse_known_args()
asyncio.run(main(args.url))

View File

@@ -0,0 +1,91 @@
import argparse
import asyncio
import requests
import time
import urllib.parse
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
from dailyai.queue_aggregators import LLMContextAggregator
async def main(room_url:str, token):
global transport
global llm
global tts
transport = DailyTransportService(
room_url,
token,
"Respond bot",
5,
)
transport.mic_enabled = True
transport.mic_sample_rate = 16000
transport.camera_enabled = False
llm = AzureLLMService()
tts = AzureTTSService()
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
await tts.say("Hi, I'm listening!", transport.send_queue)
async def handle_transcriptions():
messages = [
{"role": "system", "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way."},
]
tma_in = LLMContextAggregator(
messages, "user", transport.my_participant_id
)
tma_out = LLMContextAggregator(
messages, "assistant", transport.my_participant_id
)
await tts.run_to_queue(
transport.send_queue,
tma_out.run(
llm.run(
tma_in.run(
transport.get_receive_frames()
)
)
)
)
transport.transcription_settings["extra"]["punctuate"] = True
await asyncio.gather(transport.run(), handle_transcriptions())
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
parser.add_argument(
"-u", "--url", type=str, required=True, help="URL of the Daily room to join"
)
parser.add_argument(
"-k",
"--apikey",
type=str,
required=True,
help="Daily API Key (needed to create token)",
)
args, unknown = parser.parse_known_args()
# Create a meeting token for the given room with an expiration 1 hour in the future.
room_name: str = urllib.parse.urlparse(args.url).path[1:]
expiration: float = time.time() + 60 * 60
res: requests.Response = requests.post(
f"https://api.daily.co/v1/meeting-tokens",
headers={"Authorization": f"Bearer {args.apikey}"},
json={
"properties": {"room_name": room_name, "is_owner": True, "exp": expiration}
},
)
if res.status_code != 200:
raise Exception(f"Failed to create meeting token: {res.status_code} {res.text}")
token: str = res.json()["token"]
asyncio.run(main(args.url, token))

View File

@@ -0,0 +1,98 @@
import argparse
import asyncio
import requests
import time
import urllib.parse
from dailyai.conversation_wrappers import InterruptibleConversationWrapper
from dailyai.queue_frame import StartStreamQueueFrame, TextQueueFrame
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.azure_ai_services import AzureLLMService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
async def main(room_url:str, token):
global transport
global llm
global tts
transport = DailyTransportService(
room_url,
token,
"Respond bot",
5,
)
transport.mic_enabled = True
transport.mic_sample_rate = 16000
transport.camera_enabled = False
llm = AzureLLMService()
tts = ElevenLabsTTSService(voice_id="ErXwobaYiN019PkySvjV")
async def run_response(user_speech, tma_in, tma_out):
await tts.run_to_queue(
transport.send_queue,
tma_out.run(
llm.run(
tma_in.run(
[StartStreamQueueFrame(), TextQueueFrame(user_speech)]
)
)
),
)
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
await tts.say("Hi, I'm listening!", transport.send_queue)
async def run_conversation():
messages = [
{"role": "system", "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way."},
]
conversation_wrapper = InterruptibleConversationWrapper(
frame_generator=transport.get_receive_frames,
runner=run_response,
interrupt=transport.interrupt,
my_participant_id=transport.my_participant_id,
llm_messages=messages,
)
await conversation_wrapper.run_conversation()
transport.transcription_settings["extra"]["punctuate"] = False
await asyncio.gather(transport.run(), run_conversation())
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
parser.add_argument(
"-u", "--url", type=str, required=True, help="URL of the Daily room to join"
)
parser.add_argument(
"-k",
"--apikey",
type=str,
required=True,
help="Daily API Key (needed to create token)",
)
args, unknown = parser.parse_known_args()
# Create a meeting token for the given room with an expiration 1 hour in the future.
room_name: str = urllib.parse.urlparse(args.url).path[1:]
expiration: float = time.time() + 60 * 60
res: requests.Response = requests.post(
f"https://api.daily.co/v1/meeting-tokens",
headers={"Authorization": f"Bearer {args.apikey}"},
json={
"properties": {"room_name": room_name, "is_owner": True, "exp": expiration}
},
)
if res.status_code != 200:
raise Exception(f"Failed to create meeting token: {res.status_code} {res.text}")
token: str = res.json()["token"]
asyncio.run(main(args.url, token))

View File

@@ -0,0 +1,44 @@
import argparse
import asyncio
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.whisper_ai_services import WhisperSTTService
async def main(room_url: str):
global transport
global stt
transport = DailyTransportService(
room_url,
None,
"Transcription bot",
)
transport.mic_enabled = False
transport.camera_enabled = False
transport.speaker_enabled = True
stt = WhisperSTTService()
transcription_output_queue = asyncio.Queue()
async def handle_transcription():
print("`````````TRANSCRIPTION`````````")
while True:
item = await transcription_output_queue.get()
print(item.text)
async def handle_speaker():
await stt.run_to_queue(
transcription_output_queue,
transport.get_receive_frames()
)
await asyncio.gather(transport.run(), handle_speaker(), handle_transcription())
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
parser.add_argument(
"-u", "--url", type=str, required=True, help="URL of the Daily room to join"
)
args, unknown = parser.parse_known_args()
asyncio.run(main(args.url))

113
src/samples/image-gen.py Normal file
View File

@@ -0,0 +1,113 @@
import argparse
import asyncio
import requests
import time
import urllib.parse
import random
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
from dailyai.queue_frame import QueueFrame, FrameType
from dailyai.services.fal_ai_services import FalImageGenService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
async def main(room_url:str, token):
global transport
global llm
global tts
transport = DailyTransportService(
room_url,
token,
"Imagebot",
1,
)
transport.mic_enabled = True
transport.camera_enabled = True
transport.mic_sample_rate = 16000
transport.camera_width = 1024
transport.camera_height = 1024
llm = AzureLLMService()
tts = AzureTTSService()
img = FalImageGenService()
async def handle_transcriptions():
print("handle_transcriptions got called")
sentence = ""
async for message in transport.get_transcriptions():
print(f"transcription message: {message}")
if message["session_id"] == transport.my_participant_id:
continue
finder = message["text"].find("start over")
print(f"finder: {finder}")
if finder >= 0:
async for audio in tts.run_tts(f"Resetting."):
transport.output_queue.put(QueueFrame(FrameType.AUDIO_FRAME, audio))
sentence = ""
continue
# todo: we could differentiate between transcriptions from different participants
sentence += f" {message['text']}"
print(f"sentence is now: {sentence}")
# TODO: Cache this audio
phrase = random.choice(["OK.", "Got it.", "Sure.", "You bet.", "Sure thing."])
async for audio in tts.run_tts(phrase):
transport.output_queue.put(QueueFrame(FrameType.AUDIO_FRAME, audio))
img_result = img.run_image_gen(sentence, "1024x1024")
awaited_img = await asyncio.gather(img_result)
transport.output_queue.put(
[
QueueFrame(FrameType.IMAGE_FRAME, awaited_img[0][1]),
]
)
@transport.event_handler("on_participant_joined")
async def on_participant_joined(transport, participant):
print(f"participant joined: {participant['info']['userName']}")
if participant["info"]["isLocal"]:
return
async for audio in tts.run_tts("Describe an image, and I'll create it."):
audio_generator = tts.run_tts(f"Hello, {participant['info']['userName']}! Describe an image and I'll create it. To start over, just say 'start over'.")
async for audio in audio_generator:
transport.output_queue.put(QueueFrame(FrameType.AUDIO_FRAME, audio))
transport.transcription_settings["extra"]["punctuate"] = False
transport.transcription_settings["extra"]["endpointing"] = False
await asyncio.gather(transport.run(), handle_transcriptions())
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
parser.add_argument(
"-u", "--url", type=str, required=True, help="URL of the Daily room to join"
)
parser.add_argument(
"-k",
"--apikey",
type=str,
required=True,
help="Daily API Key (needed to create token)",
)
args, unknown = parser.parse_known_args()
# Create a meeting token for the given room with an expiration 1 hour in the future.
room_name: str = urllib.parse.urlparse(args.url).path[1:]
expiration: float = time.time() + 60 * 60
res: requests.Response = requests.post(
f"https://api.daily.co/v1/meeting-tokens",
headers={"Authorization": f"Bearer {args.apikey}"},
json={
"properties": {"room_name": room_name, "is_owner": True, "exp": expiration}
},
)
if res.status_code != 200:
raise Exception(f"Failed to create meeting token: {res.status_code} {res.text}")
token: str = res.json()["token"]
asyncio.run(main(args.url, token))

Binary file not shown.