Compare commits

..

16 Commits

Author SHA1 Message Date
Chad Bailey
5477baff7d Merge branch 'main' into cb/golden-kitty-aws 2024-01-26 17:19:11 +00:00
Chad Bailey
6834d484ca i think animation defeated me 2024-01-25 02:44:26 +00:00
Chad Bailey
0a9fa24b14 basic animation kind of works 2024-01-25 01:07:22 +00:00
Chad Bailey
88e9c1ff71 removed otel 2024-01-24 23:56:15 +00:00
Chad Bailey
849171a9c6 basic wake word 2024-01-24 23:19:39 +00:00
Chad Bailey
60ebdfb958 WOOF 2024-01-24 22:26:16 +00:00
Chad Bailey
ad5dcdd760 Dockerfile 2024-01-24 21:15:05 +00:00
Chad Bailey
9c154c3d49 requirements cleanup 2024-01-24 19:53:05 +00:00
Chad Bailey
96256e90cb gunicorn 2024-01-24 19:12:54 +00:00
Chad Bailey
6f75db4d54 flask_cors 2024-01-24 19:01:19 +00:00
Chad Bailey
127fddfb1e added dotenv 2024-01-24 18:49:40 +00:00
Chad Bailey
5231243795 trying requirements.txt 2024-01-24 18:45:18 +00:00
Chad Bailey
cba14c2002 added flask to module build 2024-01-24 18:25:27 +00:00
Chad Bailey
8ae61bf2ac added health check 2024-01-24 18:22:59 +00:00
Chad Bailey
bc6849b255 added web server 2024-01-24 18:05:41 +00:00
Chad Bailey
9bbd14d5e7 WIP: golden kitty 2024-01-23 21:46:02 +00:00
81 changed files with 1743 additions and 3292 deletions

24
LICENSE
View File

@@ -1,24 +0,0 @@
BSD 2-Clause License
Copyright (c) 2024, Daily
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@@ -1,19 +1,6 @@
# Daily AI SDK
# dailyai SDK
Build conversational, multi-modal AI apps with real-time voice and video, like this:
_Demo Video to come_
With built-in support for many of the best AI platforms (or [add your own](/docs)):
- Azure - DALL-E, ChatGPT, and Azure AI Text-to-Speech
- Deepgram - Speech-to-text, and Aura text-to-speech
- Eleven Labs text-to-speech
- Fal.ai image generation
- OpenAI DALL-E and ChatGPT
- Whisper local speech-to-text
## Step 1: Get Started
This SDK can help you build applications that participate in WebRTC meetings and use various AI services to interact with other participants.
## Build/Install
@@ -48,8 +35,25 @@ pip install path_to_this_repo
You can run the simple sample like so:
```
python src/examples/theoretical-to-real/01-say-one-thing.py -u <url of your Daily meeting> -k <your Daily API Key>
python src/samples/theoretical-to-real/01-say-one-thing.py -u <url of your Daily meeting> -k <your Daily API Key>
```
Note that the sample uses Azure's TTS and LLM services. You'll need to set the following environment variables for the sample to work:
```
AZURE_SPEECH_SERVICE_KEY
AZURE_SPEECH_SERVICE_REGION
AZURE_CHATGPT_KEY
AZURE_CHATGPT_ENDPOINT
AZURE_CHATGPT_DEPLOYMENT_ID
```
If you have those environment variables stored in an .env file, you can quickly load them into your terminal's environment by running this:
```bash
export $(grep -v '^#' .env | xargs)
```
## Overview
The Daily AI SDK allows you to build applications that can participate in WebRTC sessions and interact with AI Services. Some examples of what you can build with this:
@@ -157,3 +161,11 @@ As that text is being spoken, the asynchronous LLM task continues in the backgro
```
One thing to note here is the last parameter to `run_to_queue` in the first code clause above: this causes the `run_to_queue` method to send an `END_STREAM` frame when it's done rendering. This lets us know when to stop our `buffer_to_send_queue` task above.
## Test Server
To start the test server:
```bash
flask --app daily-bot-manager.py --debug run
```

View File

@@ -8,21 +8,14 @@ import os
load_dotenv()
def get_meeting_token(room_name, daily_api_key, token_expiry):
api_path = os.getenv('DAILY_API_PATH') or 'https://api.daily.co/v1'
if not token_expiry:
token_expiry = time.time() + 600
res = requests.post(
f'{api_path}/meeting-tokens',
headers={
'Authorization': f'Bearer {daily_api_key}'},
json={
'properties': {
'room_name': room_name,
'is_owner': True,
'exp': token_expiry}})
res = requests.post(f'{api_path}/meeting-tokens',
headers={'Authorization': f'Bearer {daily_api_key}'},
json={'properties': {'room_name': room_name, 'is_owner': True, 'exp': token_expiry}})
if res.status_code != 200:
return jsonify({'error': 'Unable to create meeting token', 'detail': res.text}), 500
meeting_token = res.json()['token']
@@ -30,4 +23,4 @@ def get_meeting_token(room_name, daily_api_key, token_expiry):
def get_room_name(room_url):
return urllib.parse.urlparse(room_url).path[1:]
return urllib.parse.urlparse(room_url).path[1:]

View File

@@ -5,7 +5,7 @@ import time
from flask import Flask, jsonify, request, redirect
from flask_cors import CORS
from examples.server.auth import get_meeting_token
from auth import get_meeting_token
from dotenv import load_dotenv
@@ -16,12 +16,11 @@ CORS(app)
print(f"I loaded an environment, and my FAL_KEY_ID is {os.getenv('FAL_KEY_ID')}")
def start_bot(bot_path, args=None):
daily_api_key = os.getenv("DAILY_API_KEY")
api_path = os.getenv("DAILY_API_PATH") or "https://api.daily.co/v1"
timeout = int(os.getenv("DAILY_ROOM_TIMEOUT") or os.getenv("DAILY_BOT_MAX_DURATION") or 300)
timeout = int(os.getenv("ROOM_TIMEOUT") or os.getenv("BOT_MAX_DURATION") or 300)
exp = time.time() + timeout
res = requests.post(
f"{api_path}/rooms",
@@ -78,23 +77,27 @@ def start_bot(bot_path, args=None):
if res.status_code == 200:
break
print(f"Took {attempts} attempts to join room {room_name}")
# Additional client config
config = {}
if os.getenv("CLIENT_VAD_TIMEOUT_SEC"):
config['vad_timeout_sec'] = float(os.getenv("DAILY_CLIENT_VAD_TIMEOUT_SEC"))
config['vad_timeout_sec'] = float(os.getenv("CLIENT_VAD_TIMEOUT_SEC"))
else:
config['vad_timeout_sec'] = 1.5
# return jsonify({"room_url": room_url, "token": meeting_token, "config": config}), 200
return redirect(room_url, code=301)
#return jsonify({"room_url": room_url, "token": meeting_token, "config": config}), 200
return redirect(room_url, code=302)
@app.route("/spin-up-kitty", methods=["GET", "POST"])
@app.route("/spin-up-kitty", methods=["POST"])
def spin_up_kitty():
return start_bot("./src/examples/foundational/10-wake-word.py")
return start_bot("./src/samples/foundational/06a-golden-kitty.py")
@app.route("/spin-up-kitty", methods=["GET"])
def quick_start_kitty():
return start_bot("./src/samples/foundational/06a-golden-kitty.py")
@app.route("/healthz")
def health_check():
return "ok", 200
return "ok", 200

View File

@@ -1,13 +0,0 @@
# Daily AI SDK Docs
## [Architecture Overview](architecture.md)
Learn about the thinking behind the SDK's design.
## [Example Code](examples/)
The repo includes several example apps in the `src/examples` directory. The docs explain how they work.
## [API Reference](api/)
Complete documentation of the available classes and methods in the SDK.

View File

@@ -1,2 +0,0 @@
# Daily AI SDK Architecture Guide

View File

@@ -1,119 +0,0 @@
# 01: Say One Thing
_video here - youtube?_
This example uses a text-to-speech (TTS) service to say one predefined sentence. But first, a quick overview of the general structure of these examples.
## Running the demos
All of the demos have something like this at the bottom of the file:
```python
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))
```
### `configure()`
The `configure()` function comes from `src/examples/foundational/support/runner.py`, and it allows you to configure the examples from the command line directly, or using environment variables:
```bash
python 01-say-one-thing.py -u https://YOUR_DOMAIN.daily.co/YOUR_ROOM -k YOUR_API_KEY
# or
DAILY_ROOM_URL=https://YOUR_DOMAIN.daily.co/YOUR_ROOM DAILY_API_KEY=YOUR_API_KEY python 01-say-one-thing.py
# or set DAILY_ROOM_URL and DAILY_API_KEY in a .env file
python 01-say-one-thing.py
```
You'll need a Daily account to run these demos. You can sign up for free at [daily.co](https://daily.co). Once you've signed up you can create a room from the [Dashboard](https://dashboard.daily.co/rooms), and grab [your API key](https://dashboard.daily.co/developers) while you're there.
Some functionality (such as transcription) requires the bot to have owner privileges in the room. `runner.py` uses the Daily REST API to create a meeting token with owner privileges. You can learn more about meeting tokens in the [Daily docs](https://docs.daily.co/reference/rest-api/meeting-tokens).
### `asyncio.run()`
The AI SDK makes heavy use of Python's `asyncio` module. [This is a reasonable intro to the topic](https://builtin.com/data-science/asyncio) if you haven't worked with `asyncio` and coroutines before.
You can learn a bit more about the specifics of how the Daily AI SDK uses coroutines in the [Architecture Guide](../architecture.md).
## The `main()` function
All of the examples have a `main()` function with a similar structure:
- Configure the transport
- Configure the AI service(s) used in the demo
- Configure any event listeners
- Define a processing pipeline
- Run the example's coroutine(s)
### Configuring the transport
The first section of the `main()` function configures the transport object:
```python
meeting_duration_minutes = 5
transport = DailyTransportService(
room_url,
None,
"Say One Thing",
meeting_duration_minutes,
)
transport.mic_enabled = True
```
The [Architecture Guide](../architecture.md) explains the transport object in more detail. In this case, we're configuring a Daily transport object and enabling the virtual microphone, so our bot can play audio.
### Configuring the services
As described in the [Architecture Guide](../architecture.md), a 'Service' is a class that processes 'Frames' as part of a 'Pipeline'. In this demo app, we'll only need one service: a text-to-speech generator. We can create an instance of the `ElevenLabsTTSService` class with this line of code:
```python
tts = ElevenLabsTTSService(aiohttp_session=session, api_key=os.getenv("ELEVENLABS_API_KEY"), voice_id=os.getenv("ELEVENLABS_VOICE_ID"))
```
You'll need to make sure to set those environment variables somewhere. The easiest way to do that is to copy the `example.env` file in the repo and rename it to `.env`, and then add your credentials to that file. `runner.py` loads the `python-dotenv` module and initializes it, making the values in that file available in the environment.
### Configuring event listeners
This part isn't strictly necessary for an app like this. You could include the contents of the `on_participant_joined` function directly in the body of the `main()` function, and it would run as soon as you started the script from the command line.
Instead, we can use an event handler to wait to run that code until someone else joins the meeting. We'll define a function called `greet_user()`, and use the `@transport.event_handler("on_participant_joined")` decorator to tell the SDK that we want to run that function whenever a user joins the room.
```python
@transport.event_handler("on_participant_joined")
async def greet_user(transport, participant):
if participant["info"]["isLocal"]:
return
await tts.say(
"Hello there, " + participant["info"]["userName"] + "!",
transport.send_queue,
)
# wait for the output queue to be empty, then leave the meeting
await transport.stop_when_done()
```
### Defining a processing pipeline
In this example, we don't actually have much of a processing pipeline! In fact, we're doing the whole thing inside the `greet_user()` function already.
Pipelines usually look like a bunch of nested calls to the `run()` or `run_to_queue()` function from different Services. In this example, we're using the `say()` function from the TTS service. This is effectively a convenience wrapper around the `run_to_queue()` function, which we'll discuss more later. It's important to `await` this function to ensure that the speech frames are queued for playback before the next line of code, because of the `stop_when_done()` function being called immediately afterward.
The output of the `say()` function goes to the transport's `send_queue`. This queue is the all-important connection between the world of the Services pipeline that's generating frames asynchronously and the ordered playback of audio and visual media in the WebRTC call.
### Running the coroutines
In this example, we don't actually have any separate processing pipelines—everything happens as a result of an event from the transport. So we only need to run the transport's coroutine, and await its completion:
```python
await transport.run()
```
In future examples, we'll run more processes in parallel. For now, this script can run until the transport exits—which will happen based on calling `stop_when_done()` in the `greet_user()` function.
## Next Steps
Next, we'll start connecting multiple AI services together by building a service pipeline.
## [02 - LLM Say One Thing »](02-llm-say-one-thing.md)

View File

@@ -1,5 +0,0 @@
# Daily AI SDK Examples
The docs in this folder pair with the example apps located in `src/examples/foundational`. They are designed to serve as quick references for building different kinds of AI apps. But the examples also build on one another, so it can be really helpful to walk through them in order.
To start, you can learn about the overall structure of the examples in [01 - Say One Thing](01-say-one-thing.md).

View File

@@ -7,22 +7,17 @@ name = "daily_ai"
version = "0.0.1"
description = "Orchestrator for AI bots with Daily"
dependencies = [
"aiohttp",
"azure-cognitiveservices-speech",
"daily-python",
"fal",
"faster_whisper",
"groq",
"google-cloud-texttospeech",
"numpy",
"openai",
"Pillow",
"typing-extensions",
"openai",
"google-cloud-texttospeech",
"azure-cognitiveservices-speech",
"pyht",
"python-dotenv",
"torch",
"torchaudio",
"pyaudio",
"typing-extensions"
"opentelemetry-sdk",
"aiohttp",
"fal",
"faster_whisper"
]
[tool.setuptools.packages.find]

View File

@@ -34,7 +34,6 @@ class LLMContextAggregator(AIService):
bot_participant_id=None,
complete_sentences=True,
pass_through=True):
super().__init__()
self.messages = messages
self.bot_participant_id = bot_participant_id
self.role = role
@@ -61,31 +60,20 @@ class LLMContextAggregator(AIService):
# TODO: split up transcription by participant
if self.complete_sentences:
# type: ignore -- the linter thinks this isn't a TextQueueFrame, even
# though we check it above
self.sentence += frame.text
self.sentence += frame.text # type: ignore -- the linter thinks this isn't a TextQueueFrame, even though we check it above
if self.sentence.endswith((".", "?", "!")):
self.messages.append({"role": self.role, "content": self.sentence})
self.sentence = ""
yield LLMMessagesQueueFrame(self.messages)
else:
# type: ignore -- the linter thinks this isn't a TextQueueFrame, even
# though we check it above
self.messages.append({"role": self.role, "content": frame.text})
self.messages.append({"role": self.role, "content": frame.text}) # type: ignore -- the linter thinks this isn't a TextQueueFrame, even though we check it above
yield LLMMessagesQueueFrame(self.messages)
async def finalize(self) -> AsyncGenerator[QueueFrame, None]:
# Send any dangling words that weren't finished with punctuation.
if self.complete_sentences and self.sentence:
self.messages.append({"role": self.role, "content": self.sentence})
yield LLMMessagesQueueFrame(self.messages)
class LLMUserContextAggregator(LLMContextAggregator):
def __init__(self,
messages: list[dict],
bot_participant_id=None,
complete_sentences=True):
messages: list[dict],
bot_participant_id=None,
complete_sentences=True):
super().__init__(messages, "user", bot_participant_id, complete_sentences, pass_through=False)
@@ -94,5 +82,5 @@ class LLMAssistantContextAggregator(LLMContextAggregator):
self, messages: list[dict], bot_participant_id=None, complete_sentences=True
):
super().__init__(
messages, "assistant", bot_participant_id, complete_sentences, pass_through=True
messages, "assistan", bot_participant_id, complete_sentences, pass_through=True
)

View File

@@ -19,18 +19,6 @@ class EndStreamQueueFrame(ControlQueueFrame):
pass
class LLMResponseEndQueueFrame(QueueFrame):
pass
class UserStartedSpeakingFrame(QueueFrame):
pass
class UserStoppedSpeakingFrame(QueueFrame):
pass
@dataclass()
class AudioQueueFrame(QueueFrame):
data: bytes
@@ -43,26 +31,14 @@ class ImageQueueFrame(QueueFrame):
@dataclass()
class SpriteQueueFrame(QueueFrame):
images: list[bytes]
class ImageListQueueFrame(QueueFrame):
images: list[bytes] | None
@dataclass()
class TextQueueFrame(QueueFrame):
text: str
@dataclass()
class TextQueueOutOfBandFrame(TextQueueFrame):
outOfBand: bool = True
@dataclass()
class TTSCompletedFrame(QueueFrame):
text: str
outOfBand: bool = False
@dataclass()
class TranscriptionQueueFrame(TextQueueFrame):
participantId: str

View File

@@ -1,23 +1,16 @@
import asyncio
import io
import logging
import time
import datetime
import wave
from dailyai.queue_frame import (
QueueFrame,
AudioQueueFrame,
ControlQueueFrame,
EndStreamQueueFrame,
ImageQueueFrame,
LLMMessagesQueueFrame,
LLMResponseEndQueueFrame,
QueueFrame,
TextQueueFrame,
TTSCompletedFrame,
TranscriptionQueueFrame,
UserStoppedSpeakingFrame
)
from abc import abstractmethod
@@ -84,11 +77,6 @@ class AIService:
class LLMService(AIService):
def __init__(self, context):
super().__init__()
self._context = context
@abstractmethod
async def run_llm_async(self, messages) -> AsyncGenerator[str, None]:
yield ""
@@ -98,23 +86,9 @@ class LLMService(AIService):
pass
async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
print(f"##### process frame got a frame, {type(frame)}")
if isinstance(frame, UserStoppedSpeakingFrame):
print(
f"### Got a user stopped speaking frame, context is {self._context}")
async for chunk in self.run_llm_async(self._context):
# if we get a string, wrap it in a frame
if isinstance(chunk, str):
yield TextQueueFrame(chunk)
# if we get a frame, pass it through
elif isinstance(chunk, QueueFrame):
print(f"### Got a frame chunk: {chunk}")
yield chunk
else:
print(f"### Got an unknown chunk: {chunk}")
yield LLMResponseEndQueueFrame()
else:
yield frame
if isinstance(frame, LLMMessagesQueueFrame):
async for text_chunk in self.run_llm_async(frame.messages):
yield TextQueueFrame(text_chunk)
class TTSService(AIService):
@@ -136,12 +110,6 @@ class TTSService(AIService):
async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
if not isinstance(frame, TextQueueFrame):
# We don't want transcription frames, which are a subclass
yield frame
return
# TODO-CB: Clean this up
if isinstance(frame, TranscriptionQueueFrame):
yield frame
return
@@ -156,11 +124,7 @@ class TTSService(AIService):
if text:
async for audio_chunk in self.run_tts(text):
size = 8000
for i in range(0, len(audio_chunk), size):
yield AudioQueueFrame(audio_chunk[i: i+size])
print("### ABOUT TO YIELD TTS COMPLETED FRAME", frame)
yield TTSCompletedFrame(text, hasattr(frame, 'outOfBand') and frame.outOfBand)
yield AudioQueueFrame(audio_chunk)
async def finalize(self):
if self.current_sentence:
@@ -220,19 +184,12 @@ class STTService(AIService):
ww.close()
content.seek(0)
text = await self.run_stt(content)
yield TranscriptionQueueFrame(text, '', str(time.time()))
yield TextQueueFrame(text)
class FrameLogger(AIService):
def __init__(self, prefix="Frame", **kwargs):
super().__init__(**kwargs)
self.prefix = prefix
async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
if isinstance(frame, (AudioQueueFrame, ImageQueueFrame)):
self.logger.info(
f"{datetime.datetime.utcnow().isoformat()} {self.prefix}: {type(frame)}")
else:
print(f"{datetime.datetime.utcnow().isoformat()} {self.prefix}: {frame}")
yield frame
@dataclass
class AIServiceConfig:
tts: TTSService
image: ImageGenService
llm: LLMService
stt: STTService

View File

@@ -17,10 +17,13 @@ from azure.cognitiveservices.speech import SpeechSynthesizer, SpeechConfig, Resu
class AzureTTSService(TTSService):
def __init__(self, *, api_key, region):
def __init__(self, speech_key=None, speech_region=None):
super().__init__()
self.speech_config = SpeechConfig(subscription=api_key, region=region)
speech_key = speech_key or os.getenv("AZURE_SPEECH_SERVICE_KEY")
speech_region = speech_region or os.getenv("AZURE_SPEECH_SERVICE_REGION")
self.speech_config = SpeechConfig(subscription=speech_key, region=speech_region)
self.speech_synthesizer = SpeechSynthesizer(
speech_config=self.speech_config, audio_config=None)
@@ -42,21 +45,31 @@ class AzureTTSService(TTSService):
yield result.audio_data[44:]
elif result.reason == ResultReason.Canceled:
cancellation_details = result.cancellation_details
self.logger.info("Speech synthesis canceled: {}".format(
cancellation_details.reason))
self.logger.info("Speech synthesis canceled: {}".format(cancellation_details.reason))
if cancellation_details.reason == CancellationReason.Error:
self.logger.info("Error details: {}".format(
cancellation_details.error_details))
self.logger.info("Error details: {}".format(cancellation_details.error_details))
class AzureLLMService(LLMService):
def __init__(self, *, api_key, endpoint, api_version="2023-12-01-preview", model, context):
super().__init__(context)
self._model: str = model
def __init__(self, api_key=None, azure_endpoint=None, api_version=None, model=None):
super().__init__()
api_key = api_key or os.getenv("AZURE_CHATGPT_KEY")
self._client = AsyncAzureOpenAI(
azure_endpoint = azure_endpoint or os.getenv("AZURE_CHATGPT_ENDPOINT")
if not azure_endpoint:
raise Exception(
"No azure endpoint specified for Azure LLM, please set AZURE_CHATGPT_ENDPOINT in the environment or pass it to the AzureLLMService constructor")
model: str | None = model or os.getenv("AZURE_CHATGPT_DEPLOYMENT_ID")
if not model:
raise Exception(
"No model specified for Azure LLM, please set AZURE_CHATGPT_DEPLOYMENT_ID in the environment or pass it to the AzureLLMService constructor")
self.model: str = model
api_version = api_version or "2023-12-01-preview"
self.client = AsyncAzureOpenAI(
api_key=api_key,
azure_endpoint=endpoint,
azure_endpoint=azure_endpoint,
api_version=api_version,
)
@@ -64,7 +77,7 @@ class AzureLLMService(LLMService):
messages_for_log = json.dumps(messages)
self.logger.debug(f"Generating chat via azure: {messages_for_log}")
chunks = await self._client.chat.completions.create(model=self._model, stream=True, messages=messages)
chunks = await self.client.chat.completions.create(model=self.model, stream=True, messages=messages)
async for chunk in chunks:
if len(chunk.choices) == 0:
continue
@@ -76,7 +89,7 @@ class AzureLLMService(LLMService):
messages_for_log = json.dumps(messages)
self.logger.debug(f"Generating chat via azure: {messages_for_log}")
response = await self._client.chat.completions.create(model=self._model, stream=False, messages=messages)
response = await self.client.chat.completions.create(model=self.model, stream=False, messages=messages)
if response and len(response.choices) > 0:
return response.choices[0].message.content
else:
@@ -87,60 +100,85 @@ class AzureImageGenServiceREST(ImageGenService):
def __init__(
self,
*,
api_version="2023-06-01-preview",
image_size: str,
aiohttp_session: aiohttp.ClientSession,
api_key,
endpoint,
model):
api_key=None,
azure_endpoint=None,
api_version=None,
model=None):
super().__init__(image_size=image_size)
self._api_key = api_key
self._azure_endpoint = endpoint
self._api_version = api_version
self._model = model
self._aiohttp_session = aiohttp_session
self.api_key = api_key or os.getenv("AZURE_DALLE_KEY")
self.azure_endpoint = azure_endpoint or os.getenv("AZURE_DALLE_ENDPOINT")
self.api_version = api_version or "2023-06-01-preview"
self.model = model or os.getenv("AZURE_DALLE_DEPLOYMENT_ID")
async def run_image_gen(self, sentence) -> tuple[str, bytes]:
url = f"{self._azure_endpoint}openai/images/generations:submit?api-version={self._api_version}"
headers = {"api-key": self._api_key,
"Content-Type": "application/json"}
body = {
# Enter your prompt text here
"prompt": sentence,
"size": self.image_size,
"n": 1,
}
async with self._aiohttp_session.post(
url, headers=headers, json=body
) as submission:
print(f"submission: {submission}")
# We never get past this line, because this header isn't
# defined on a 429 response, but something is eating our exceptions!
operation_location = submission.headers['operation-location']
print(f"submission status: {submission.status}")
status = ""
attempts_left = 120
json_response = None
while status != "succeeded":
attempts_left -= 1
if attempts_left == 0:
raise Exception("Image generation timed out")
# TODO hoist the session to app-level
async with aiohttp.ClientSession() as session:
url = f"{self.azure_endpoint}openai/images/generations:submit?api-version={self.api_version}"
headers = {"api-key": self.api_key, "Content-Type": "application/json"}
body = {
# Enter your prompt text here
"prompt": sentence,
"size": self.image_size,
"n": 1,
}
async with session.post(url, headers=headers, json=body) as submission:
operation_location = submission.headers['operation-location']
await asyncio.sleep(1)
response = await self._aiohttp_session.get(
operation_location, headers=headers
)
json_response = await response.json()
status = json_response["status"]
status = ""
attempts_left = 120
json_response = None
while status != "succeeded":
attempts_left -= 1
if attempts_left == 0:
raise Exception("Image generation timed out")
image_url = json_response["result"]["data"][0]["url"] if json_response else None
if not image_url:
raise Exception("Image generation failed")
# Load the image from the url
async with self._aiohttp_session.get(image_url) as response:
image_stream = io.BytesIO(await response.content.read())
image = Image.open(image_stream)
print("i got an image file!")
return (image_url, image.tobytes())
await asyncio.sleep(1)
response = await session.get(operation_location, headers=headers)
json_response = await response.json()
status = json_response["status"]
image_url = json_response["result"]["data"][0]["url"] if json_response else None
if not image_url:
raise Exception("Image generation failed")
# Load the image from the url
async with session.get(image_url) as response:
image_stream = io.BytesIO(await response.content.read())
image = Image.open(image_stream)
return (image_url, image.tobytes())
class AzureImageGenService(ImageGenService):
    """Image generation through the Azure OpenAI SDK client.

    Credentials and deployment fall back to the ``AZURE_DALLE_KEY``,
    ``AZURE_DALLE_ENDPOINT`` and ``AZURE_DALLE_DEPLOYMENT_ID`` environment
    variables when the corresponding argument is not supplied.
    """

    def __init__(self, api_key=None, azure_endpoint=None, api_version=None, model=None):
        # NOTE(review): AzureImageGenServiceREST passes ``image_size=`` to the
        # base class and ``run_image_gen`` below reads ``self.image_size``;
        # confirm ImageGenService.__init__ provides a default for it.
        super().__init__()

        api_key = api_key or os.getenv("AZURE_DALLE_KEY")
        azure_endpoint = azure_endpoint or os.getenv("AZURE_DALLE_ENDPOINT")
        api_version = api_version or "2023-06-01-preview"
        self.model = model or os.getenv("AZURE_DALLE_DEPLOYMENT_ID")

        self.client = AzureOpenAI(
            api_key=api_key,
            azure_endpoint=azure_endpoint,
            api_version=api_version,
        )

    async def run_image_gen(self, sentence) -> tuple[str, bytes]:
        """Generate one image for ``sentence``.

        Returns:
            ``(url, raw_bytes)`` where ``url`` is the generated image's URL and
            ``raw_bytes`` is the decoded image's pixel data (``Image.tobytes()``),
            matching what AzureImageGenServiceREST returns.
        """
        # Use a format placeholder: passing ``sentence`` as a bare extra
        # argument makes the logging module report a formatting error.
        self.logger.info("Generating azure image: %s", sentence)

        # The SDK client and ``requests`` are synchronous; run them in a worker
        # thread so the event loop is not blocked while waiting on the network.
        image = await asyncio.to_thread(
            self.client.images.generate,
            model=self.model,
            prompt=sentence,
            n=1,
            size=self.image_size,
        )

        # The v1 SDK returns a response object, not a dict.
        url = image.data[0].url
        response = await asyncio.to_thread(requests.get, url)

        # Image.open takes the file-like object itself; BytesIO has no
        # ``tobytes`` method.
        dalle_stream = io.BytesIO(response.content)
        dalle_im = Image.open(dalle_stream)
        return (url, dalle_im.tobytes())

View File

@@ -1,456 +0,0 @@
from abc import abstractmethod
import asyncio
import copy
import functools
import itertools
import logging
import queue
import threading
import time
from typing import AsyncGenerator
import numpy as np
import pyaudio
import torch
import torchaudio
from enum import Enum
import datetime
import traceback
from typing import AsyncGenerator, AsyncIterable, BinaryIO, Iterable
from dailyai.queue_aggregators import LLMAssistantContextAggregator, LLMUserContextAggregator
from dailyai.queue_frame import (
AudioQueueFrame,
EndStreamQueueFrame,
ImageQueueFrame,
QueueFrame,
SpriteQueueFrame,
StartStreamQueueFrame,
TranscriptionQueueFrame,
TTSCompletedFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame
)
# Restrict torch to a single thread.
torch.set_num_threads(1)
# Load the Silero VAD model and its helper utilities through torch.hub.
# This runs at import time; force_reload=False is passed to torch.hub.load.
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
model='silero_vad',
force_reload=False)
# Unpack the five helpers returned alongside the model.
(get_speech_timestamps,
save_audio,
read_audio,
VADIterator,
collect_chunks) = utils
# Taken from utils_vad.py
def validate(model,
             inputs: torch.Tensor):
    """Run ``model`` on ``inputs`` with gradient tracking disabled.

    Args:
        model: Any callable accepting a tensor (here, the loaded VAD model).
        inputs: Tensor passed to ``model`` unchanged.

    Returns:
        Whatever ``model(inputs)`` returns.
    """
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        return model(inputs)
# Provided by Alexander Veysov
def int2float(sound):
    """Convert integer PCM samples to a squeezed float32 array.

    The samples are cast to float32 and scaled by 1/32768, so int16 input
    maps into the range [-1.0, 1.0).

    Args:
        sound: NumPy array of samples (assumed int16 PCM - confirm with callers).

    Returns:
        A new float32 array with singleton dimensions removed. Empty input
        yields an empty array instead of raising.
    """
    # astype() copies, so the caller's buffer is never modified.
    sound = sound.astype('float32')
    # Scaling unconditionally is safe: all-zero input stays zero, and it avoids
    # calling max() on a zero-size array, which raises ValueError.
    sound *= 1/32768
    return sound.squeeze()  # depends on the use case
# Audio capture parameters.
# PyAudio's 16-bit integer sample format constant.
FORMAT = pyaudio.paInt16
CHANNELS = 1
SAMPLE_RATE = 16000
# One tenth of SAMPLE_RATE, i.e. 1600 samples per chunk.
CHUNK = int(SAMPLE_RATE / 10)
# PyAudio instance created at import time.
audio = pyaudio.PyAudio()
class VADState(Enum):
    """States tracked for voice-activity detection.

    BaseTransportService starts in ``QUIET``.
    """

    QUIET = 1
    STARTING = 2
    SPEAKING = 3
    STOPPING = 4
class BaseTransportService():
def __init__(
    self,
    **kwargs,
) -> None:
    """Initialise transport settings, VAD bookkeeping, queues and thread flags.

    Every option is read from ``kwargs``. Because values are combined with
    ``or``, a missing *or falsy* value (``None``, ``0``, ``False``, ``[]``)
    falls back to the default shown here:

        mic_enabled (False), mic_sample_rate (16000), camera_enabled (False),
        camera_width (1024), camera_height (768), speaker_enabled (False),
        speaker_sample_rate (16000), fps (8), vad_start_s (0.2),
        vad_stop_s (0.5), context ([]), duration_minutes (10)
    """
    self._mic_enabled = kwargs.get("mic_enabled") or False
    self._mic_sample_rate = kwargs.get("mic_sample_rate") or 16000
    self._camera_enabled = kwargs.get("camera_enabled") or False
    self._camera_width = kwargs.get("camera_width") or 1024
    self._camera_height = kwargs.get("camera_height") or 768
    self._speaker_enabled = kwargs.get("speaker_enabled") or False
    self._speaker_sample_rate = kwargs.get("speaker_sample_rate") or 16000
    self._fps = kwargs.get("fps") or 8
    self._vad_start_s = kwargs.get("vad_start_s") or 0.2
    self._vad_stop_s = kwargs.get("vad_stop_s") or 0.5
    # NOTE(review): an empty list passed as ``context`` is replaced by a new
    # list, so the caller's list object is not shared in that case.
    self._context = kwargs.get("context") or []

    # Convert the start/stop durations from seconds into a number of VAD
    # frames, where one frame is ``_vad_samples`` samples at SAMPLE_RATE.
    self._vad_samples = 1536
    vad_frame_s = self._vad_samples / SAMPLE_RATE
    self._vad_start_frames = round(self._vad_start_s / vad_frame_s)
    self._vad_stop_frames = round(self._vad_stop_s / vad_frame_s)
    self._vad_starting_count = 0
    self._vad_stopping_count = 0
    self._vad_state = VADState.QUIET

    # Wall-clock time after which run() stops looping.
    duration_minutes = kwargs.get("duration_minutes") or 10
    self._expiration = time.time() + duration_minutes * 60

    self.send_queue = asyncio.Queue()
    self.receive_queue = asyncio.Queue()
    self._threadsafe_send_queue = queue.Queue()

    self._images = None
    self._user_is_speaking = False
    self._current_phrase = ""

    # Remember the running event loop if constructed inside one; otherwise
    # there is no loop yet and None is stored.
    try:
        self._loop: asyncio.AbstractEventLoop | None = asyncio.get_running_loop()
    except RuntimeError:
        self._loop = None

    self._stop_threads = threading.Event()
    self._is_interrupted = threading.Event()

    # Root logger (getLogger() is called without a name).
    self._logger: logging.Logger = logging.getLogger()
# Done-callback for a response task: run_conversation registers it with
# functools.partial(self.update_messages, self._context), so `new_context` is
# bound up front and `task` is supplied when the task finishes.
# When the task exists and was not cancelled, the current phrase is cleared.
# NOTE(review): indentation is lost in this view, so it is unclear whether the
# `self._context = new_context` assignment sits inside the
# `not task.cancelled()` guard - confirm against the repository.
def update_messages(self, new_context: list[dict[str, str]], task: asyncio.Task | None):
if task:
if not task.cancelled():
self._current_phrase = ""
self._context = new_context
def append_to_context(self, role, chunk_or_text):
    """Add a chunk of conversation to ``self._context``.

    Args:
        role: Role to record for plain-text chunks (e.g. "user").
        chunk_or_text: Either a string, or a mapping-like object that is
            appended as-is unless its ``outOfBand`` entry equals ``True``.

    Plain text is merged into the last context item (separated by a space)
    when that item has the same role and string content; otherwise a new
    ``{"role": ..., "content": ...}`` item is appended. This also covers an
    empty context and a last item lacking ``role`` or ``content``.
    """
    # Non-string chunks are stored without further checking, except for
    # out-of-band ones, which are dropped.
    if not isinstance(chunk_or_text, str):
        if chunk_or_text.get("outOfBand") != True:  # noqa: E712 - keeps the original equality semantics
            self._context.append(chunk_or_text)
        return

    text = chunk_or_text
    # Guard against an empty context instead of indexing [-1] blindly.
    last_context_item = self._context[-1] if self._context else None

    can_merge = (
        last_context_item is not None
        and last_context_item.get("role") == role
        and isinstance(last_context_item.get("content"), str)
    )
    if can_merge:
        last_context_item["content"] += f" {text}"
    else:
        self._context.append({"role": role, "content": text})
async def run_pipeline(self, frame):
    """Announce *frame* on stdout, then await ``self._runner(frame)``."""
    announcement = f"starting to speak_after_delay, {frame}"
    print(announcement)
    # TODO-CB: This exception for missing class gets eaten!
    pending = self._runner(frame)
    await pending
# Drive the conversation: consume frames from get_receive_frames() and, for every
# UserStoppedSpeakingFrame, start a fresh run_pipeline task (cancelling the previous one).
# NOTE(review): `runner` is annotated as an iterable/queue, but run_pipeline calls
# self._runner(frame), i.e. it is used as an awaitable-returning callable -- confirm the type.
# NOTE(review): the return annotation says AsyncGenerator, yet this body contains no
# `yield`, so it is a plain coroutine.
async def run_conversation(self, runner: Iterable[QueueFrame]
| AsyncIterable[QueueFrame]
| asyncio.Queue[QueueFrame],
) -> AsyncGenerator[QueueFrame, None]:
# Handle of the most recently scheduled response task, if any.
current_response_task = None
# Stored on the instance so run_pipeline can reach it.
self._runner = runner
async for frame in self.get_receive_frames():
print(f"got frame of type: {type(frame)}, {frame}")
# End-of-stream terminates the conversation loop.
if isinstance(frame, EndStreamQueueFrame):
break
# elif not isinstance(frame, TranscriptionQueueFrame):
# continue
# TODO-CB: Verify this is an accurate replacement
# if hasattr(frame, 'participantId') and frame.participantId == self._my_participant_id:
# Every frame type other than UserStoppedSpeakingFrame is ignored here.
if not isinstance(frame, UserStoppedSpeakingFrame):
continue
if current_response_task:
# TODO-CB: Maybe not always interrupt? Are there frame types we can pass through?
current_response_task.cancel()
self.interrupt()
# self._current_phrase += " " + frame.text
# current_llm_context = copy.deepcopy(self._context)
# Schedule the pipeline for this frame without awaiting it, so the loop keeps
# receiving frames while the response is produced.
current_response_task = asyncio.create_task(
self.run_pipeline(
frame)
)
# update_messages receives the context object bound here plus the finished task.
current_response_task.add_done_callback(
functools.partial(self.update_messages, self._context)
)
async def run(self):
    """Run the transport until it expires or ``stop()`` is called, then shut down.

    Calls ``_prerun()``, starts the frame-marshalling task plus the camera and
    frame-consumer threads (and the VAD thread when the speaker is enabled),
    then polls once per second until ``self._expiration`` passes or
    ``self._stop_threads`` is set. ``_post_run()`` always runs afterwards.

    Raises:
        Exception: anything raised while waiting is logged and re-raised.
    """
    self._prerun()

    async_output_queue_marshal_task = asyncio.create_task(self._marshal_frames())

    self._camera_thread = threading.Thread(target=self._run_camera, daemon=True)
    self._camera_thread.start()

    self._frame_consumer_thread = threading.Thread(target=self._frame_consumer, daemon=True)
    self._frame_consumer_thread.start()

    if self._speaker_enabled:
        # TODO-CB: This is interesting
        # The raw receive-audio thread is currently disabled; only VAD reads the speaker.
        # self._receive_audio_thread = threading.Thread(
        #     target=self._receive_audio, daemon=True)
        # self._receive_audio_thread.start()
        self._vad_thread = threading.Thread(target=self._vad, daemon=True)
        self._vad_thread.start()

    try:
        while time.time() < self._expiration and not self._stop_threads.is_set():
            await asyncio.sleep(1)
    except Exception as e:
        self._logger.error(f"Exception {e}")
        raise e
    finally:
        # Do anything that must be done to clean up
        self._post_run()

    self._stop_threads.set()

    # The end-of-stream frame travels through the marshal task to the
    # frame-consumer thread and makes both of them exit.
    await self.send_queue.put(EndStreamQueueFrame())
    await async_output_queue_marshal_task
    await self.send_queue.join()
    self._frame_consumer_thread.join()

    # The receive-audio thread is only created by the disabled code above, so
    # the attribute may not exist; joining it unconditionally raised
    # AttributeError whenever the speaker was enabled.
    receive_audio_thread = getattr(self, "_receive_audio_thread", None)
    if receive_audio_thread is not None:
        receive_audio_thread.join()
def _post_run(self):
# Note that this function must be idempotent! It can be called multiple times
# if, for example, a keyboard interrupt occurs.
pass
def stop(self):
    """Set the ``_stop_threads`` event that the worker loops and ``run`` poll."""
    stop_flag = self._stop_threads
    stop_flag.set()
async def stop_when_done(self):
    """Wait for the send queues to drain, then call ``stop()``."""
    drained = self._wait_for_send_queue_to_empty()
    await drained
    self.stop()
async def _wait_for_send_queue_to_empty(self):
await self.send_queue.join()
self._threadsafe_send_queue.join()
@abstractmethod
def write_frame_to_camera(self, frame: bytes):
    """Abstract hook: output one camera frame.

    ``_run_camera`` calls this with the next image from ``self._images``.
    This base body does nothing.
    """
    return None
@abstractmethod
def write_frame_to_mic(self, frame: bytes):
    """Abstract hook: output a buffer of audio bytes.

    ``_frame_consumer`` calls this with the audio it has accumulated.
    This base body does nothing.
    """
    return None
@abstractmethod
def read_audio_frames(self, desired_frame_count):
    """Abstract hook: read up to *desired_frame_count* audio frames.

    This base body returns an empty ``bytes`` object.
    """
    return b""
@abstractmethod
def _prerun(self):
pass
def _vad(self):
# CB: Starting silero VAD stuff
# TODO-CB: Probably need to force virtual speaker creation if we're
# going to build this in?
# TODO-CB: pyaudio installation
while not self._stop_threads.is_set():
audio_chunk = self.read_audio_frames(self._vad_samples)
audio_int16 = np.frombuffer(audio_chunk, np.int16)
audio_float32 = int2float(audio_int16)
new_confidence = model(
torch.from_numpy(audio_float32), 16000).item()
speaking = new_confidence > 0.5
if speaking:
match self._vad_state:
case VADState.QUIET:
self._vad_state = VADState.STARTING
self._vad_starting_count = 1
case VADState.STARTING:
self._vad_starting_count += 1
case VADState.STOPPING:
self._vad_state = VADState.SPEAKING
self._vad_stopping_count = 0
else:
match self._vad_state:
case VADState.STARTING:
self._vad_state = VADState.QUIET
self._vad_starting_count = 0
case VADState.SPEAKING:
self._vad_state = VADState.STOPPING
self._vad_stopping_count = 1
case VADState.STOPPING:
self._vad_stopping_count += 1
if self._vad_state == VADState.STARTING and self._vad_starting_count >= self._vad_start_frames:
print(
f'!!! {datetime.datetime.utcnow().isoformat()} queueing start frame')
asyncio.run_coroutine_threadsafe(
self.receive_queue.put(
UserStartedSpeakingFrame()), self._loop
)
print(f"!!! VAD started, calling interrupt")
self.interrupt()
self._vad_state = VADState.SPEAKING
self._vad_starting_count = 0
if self._vad_state == VADState.STOPPING and self._vad_stopping_count >= self._vad_stop_frames:
print(
f'!!! {datetime.datetime.utcnow().isoformat()} queueing stop frame')
asyncio.run_coroutine_threadsafe(
self.receive_queue.put(
UserStoppedSpeakingFrame()), self._loop
)
self._vad_state = VADState.QUIET
self._vad_stopping_count = 0
async def _marshal_frames(self):
    """Move frames from the async ``send_queue`` onto the thread-safe queue.

    Each item is forwarded unchanged and marked done on the async queue. The
    loop ends after an ``EndStreamQueueFrame`` has been forwarded.
    """
    end_of_stream_seen = False
    while not end_of_stream_seen:
        item = await self.send_queue.get()
        self._threadsafe_send_queue.put(item)
        self.send_queue.task_done()
        end_of_stream_seen = isinstance(item, EndStreamQueueFrame)
def interrupt(self):
    """Print a marker and set the ``_is_interrupted`` event.

    While that event is set, ``_frame_consumer`` discards queued frames.
    """
    print("!!! setting interrupt")
    interrupted_flag = self._is_interrupted
    interrupted_flag.set()
async def get_receive_frames(self) -> AsyncGenerator[QueueFrame, None]:
    """Yield frames from ``receive_queue`` until an end-of-stream frame.

    The ``EndStreamQueueFrame`` itself is yielded before the generator finishes.
    """
    reached_end = False
    while not reached_end:
        received = await self.receive_queue.get()
        reached_end = isinstance(received, EndStreamQueueFrame)
        yield received
def _receive_audio(self):
    """Thread body: read speaker audio and queue it as ``AudioQueueFrame`` objects.

    Logs an error and returns immediately when no event loop was captured.
    Otherwise it reads until ``_stop_threads`` is set, skipping empty reads,
    and finally queues an ``EndStreamQueueFrame``.
    """
    if not self._loop:
        self._logger.error("No loop available for audio thread")
        return

    # Request one second's worth of frames per read.
    frames_per_read = self._speaker_sample_rate * 1

    while True:
        if self._stop_threads.is_set():
            break

        captured = self.read_audio_frames(frames_per_read)
        if len(captured) == 0:
            continue

        audio_frame = AudioQueueFrame(captured)
        asyncio.run_coroutine_threadsafe(self.receive_queue.put(audio_frame), self._loop)

    asyncio.run_coroutine_threadsafe(
        self.receive_queue.put(EndStreamQueueFrame()), self._loop
    )
def _set_image(self, image: bytes):
self._images = itertools.cycle([image])
def _set_images(self, images: list[bytes], start_frame=0):
self._images = itertools.cycle(images)
def _run_camera(self):
try:
while not self._stop_threads.is_set():
if self._images:
this_frame = next(self._images)
self.write_frame_to_camera(this_frame)
time.sleep(1.0 / self._fps)
except Exception as e:
self._logger.error(f"Exception {e} in camera thread.")
raise e
# Thread body: pull frames (or lists of frames) off the thread-safe send queue and
# dispatch them -- audio to write_frame_to_mic, images to the camera state, completed
# TTS text to the context -- until an EndStreamQueueFrame arrives.
def _frame_consumer(self):
self._logger.info("🎬 Starting frame consumer thread")
# Audio bytes received but not yet written to the mic.
b = bytearray()
# Audio is written in multiples of this many bytes; the remainder stays in `b`.
smallest_write_size = 3200
# NOTE(review): every audio chunk is appended here and nothing in this function
# reads it back, so it grows for the lifetime of the thread.
all_audio_frames = bytearray()
while True:
try:
# NOTE(review): get() is called without a timeout, so it blocks until an item
# arrives; the `except queue.Empty` handler below looks unreachable -- confirm.
frames_or_frame: QueueFrame | list[QueueFrame] = (
self._threadsafe_send_queue.get()
)
# Normalise the queue item to a list of frames.
if isinstance(frames_or_frame, QueueFrame):
frames: list[QueueFrame] = [frames_or_frame]
elif isinstance(frames_or_frame, list):
frames: list[QueueFrame] = frames_or_frame
else:
raise Exception("Unknown type in output queue")
for frame in frames:
if isinstance(frame, EndStreamQueueFrame):
self._logger.info("Stopping frame consumer thread")
# Mark the item done before leaving so queue.join() callers are released.
self._threadsafe_send_queue.task_done()
return
# if interrupted, we just pull frames off the queue and discard them
if not self._is_interrupted.is_set():
if frame:
if isinstance(frame, AudioQueueFrame):
chunk = frame.data
all_audio_frames.extend(chunk)
b.extend(chunk)
# Largest prefix of `b` that is a whole multiple of smallest_write_size.
truncated_length: int = len(b) - (
len(b) % smallest_write_size
)
if truncated_length:
self.write_frame_to_mic(
bytes(b[:truncated_length]))
b = b[truncated_length:]
elif isinstance(frame, ImageQueueFrame):
self._set_image(frame.image)
elif isinstance(frame, SpriteQueueFrame):
self._set_images(frame.images)
elif isinstance(frame, TTSCompletedFrame) and not frame.outOfBand:
self.append_to_context(
"assistant", frame.text)
# Flush whatever audio is still buffered.
elif len(b):
self.write_frame_to_mic(bytes(b))
b = bytearray()
else:
# if there are leftover audio bytes, write them now; failing to do so
# can cause static in the audio stream.
print(f"!!! interrupted, flushing audio")
if len(b):
# Only a multiple of 160 bytes is written; the remainder is dropped
# when `b` is reset below.
truncated_length = len(b) - (len(b) % 160)
self.write_frame_to_mic(
bytes(b[:truncated_length]))
b = bytearray()
# A StartStreamQueueFrame ends the interrupted state.
if isinstance(frame, StartStreamQueueFrame):
self._is_interrupted.clear()
self._threadsafe_send_queue.task_done()
except queue.Empty:
if len(b):
self.write_frame_to_mic(bytes(b))
b = bytearray()
except Exception as e:
self._logger.error(
f"Exception in frame_consumer: {e}, {len(b)}")
raise e

View File

@@ -1,4 +1,28 @@
from dailyai.services.base_transport_service import BaseTransportService
import asyncio
import inspect
import logging
import sys
import threading
import time
import types
from functools import partial
from queue import Queue, Empty
from typing import AsyncGenerator
from dailyai.queue_frame import (
AudioQueueFrame,
EndStreamQueueFrame,
ImageQueueFrame,
ImageListQueueFrame,
QueueFrame,
StartStreamQueueFrame,
TextQueueFrame,
TranscriptionQueueFrame,
)
from threading import Thread, Event
from daily import (
EventHandler,
CallClient,
@@ -7,97 +31,60 @@ from daily import (
VirtualMicrophoneDevice,
VirtualSpeakerDevice,
)
from threading import Event
from dailyai.queue_frame import (
TranscriptionQueueFrame, UserStartedSpeakingFrame, UserStoppedSpeakingFrame
)
from functools import partial
import types
import pyaudio
import torchaudio
import asyncio
import inspect
import io
import logging
import numpy as np
import signal
import threading
import torch
# Module-level setup for the Silero VAD model. All of this runs at import time.
# Limit torch to a single thread.
torch.set_num_threads(1)
# Load the 'silero_vad' entry point from the snakers4/silero-vad hub repository;
# force_reload=False does not force a fresh download of the repository.
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
model='silero_vad',
force_reload=False)
# Unpack the helper callables returned alongside the model.
(get_speech_timestamps,
save_audio,
read_audio,
VADIterator,
collect_chunks) = utils
# Taken from utils_vad.py
def validate(model, inputs: torch.Tensor):
    """Call *model* on *inputs* with gradient tracking disabled and return the result."""
    with torch.no_grad():
        return model(inputs)
# Provided by Alexander Veysov
def int2float(sound):
    """Convert integer PCM samples to float32 scaled by 1/32768.

    Args:
        sound: NumPy array of samples (``_vad`` passes int16 data).

    Returns:
        A float32 array with every sample multiplied by 1/32768 and
        length-1 dimensions squeezed out.

    The scaling is applied unconditionally. The previous ``abs().max() > 0``
    check raised ``ValueError`` on an empty array, and skipped the scaling
    when int16 overflow made ``abs(-32768)`` negative. Scaling an all-zero
    buffer is a no-op, so that check was never needed.
    """
    scaled = sound.astype('float32')
    scaled *= 1 / 32768
    return scaled.squeeze()  # depends on the use case
# Audio capture constants.
# NOTE(review): no use of FORMAT, CHANNELS, CHUNK or `audio` is visible in this part
# of the file -- confirm they are still needed.
FORMAT = pyaudio.paInt16
CHANNELS = 1
SAMPLE_RATE = 16000
# One tenth of SAMPLE_RATE.
CHUNK = int(SAMPLE_RATE / 10)
# Created at import time.
audio = pyaudio.PyAudio()
class DailyTransportService(BaseTransportService, EventHandler):
class DailyTransportService(EventHandler):
_daily_initialized = False
_lock = threading.Lock()
_speaker_enabled: bool
_speaker_sample_rate: int
# This is necessary to override EventHandler's __new__ method.
def __new__(cls, *args, **kwargs):
return super().__new__(cls)
speaker_enabled: bool
speaker_sample_rate: int
def __init__(
self,
room_url: str,
token: str | None,
bot_name: str,
duration: float = 10,
min_others_count: int = 1,
start_transcription: bool = False,
**kwargs,
start_transcription: bool = True,
speaker_enabled: bool = False,
speaker_sample_rate: int = 16000,
):
# This will call BaseTransportService.__init__ method, not EventHandler
super().__init__(**kwargs)
super().__init__()
self.bot_name: str = bot_name
self.room_url: str = room_url
self.token: str | None = token
self.duration: float = duration
self.expiration = time.time() + duration * 60
self.min_others_count = min_others_count
self.start_transcription = start_transcription
self._room_url: str = room_url
self._bot_name: str = bot_name
self._token: str | None = token
self._min_others_count = min_others_count
self._start_transcription = start_transcription
# This queue is used to marshal frames from the async send queue to the thread that emits audio & video.
# We need this to maintain the asynchronous behavior of asyncio queues -- to give async functions
# a chance to run while waiting for queue items -- but also to maintain thread safety and have a threaded
# handler to send frames, to ensure that sending isn't subject to pauses in the async thread.
self.threadsafe_send_queue = Queue()
self._is_interrupted = Event()
self._stop_threads = Event()
self.is_interrupted = Event()
self.stop_threads = Event()
self.story_started = False
self.mic_enabled = False
self.mic_sample_rate = 16000
self.camera_width = 960
self.camera_height = 960
self.camera_enabled = False
self.speaker_enabled = speaker_enabled
self.speaker_sample_rate = speaker_sample_rate
self._other_participant_has_joined = False
self._my_participant_id = None
self.send_queue = asyncio.Queue()
self.receive_queue = asyncio.Queue()
self.other_participant_has_joined = False
self.my_participant_id = None
self.camera_thread = None
self.frame_consumer_thread = None
self.transcription_settings = {
"language": "en",
@@ -111,44 +98,46 @@ class DailyTransportService(BaseTransportService, EventHandler):
},
}
self._logger: logging.Logger = logging.getLogger("dailyai")
self.logger: logging.Logger = logging.getLogger("dailyai")
self._event_handlers = {}
self.event_handlers = {}
def _patch_method(self, event_name, *args, **kwargs):
try:
for handler in self._event_handlers[event_name]:
self.loop = asyncio.get_running_loop()
except RuntimeError:
self.loop = None
def patch_method(self, event_name, *args, **kwargs):
try:
for handler in self.event_handlers[event_name]:
if inspect.iscoroutinefunction(handler):
if self._loop:
asyncio.run_coroutine_threadsafe(
handler(*args, **kwargs), self._loop)
if self.loop:
asyncio.run_coroutine_threadsafe(handler(*args, **kwargs), self.loop)
else:
raise Exception(
"No event loop to run coroutine. In order to use async event handlers, you must run the DailyTransportService in an asyncio event loop.")
else:
handler(*args, **kwargs)
except Exception as e:
self._logger.error(f"Exception in event handler {event_name}: {e}")
self.logger.error(f"Exception in event handler {event_name}: {e}")
raise e
def add_event_handler(self, event_name: str, handler):
if not event_name.startswith("on_"):
raise Exception(
f"Event handler {event_name} must start with 'on_'")
raise Exception(f"Event handler {event_name} must start with 'on_'")
methods = inspect.getmembers(self, predicate=inspect.ismethod)
if event_name not in [method[0] for method in methods]:
raise Exception(f"Event handler {event_name} not found")
if event_name not in self._event_handlers:
self._event_handlers[event_name] = [
if event_name not in self.event_handlers:
self.event_handlers[event_name] = [
getattr(
self, event_name), types.MethodType(
handler, self)]
setattr(self, event_name, partial(self._patch_method, event_name))
setattr(self, event_name, partial(self.patch_method, event_name))
else:
self._event_handlers[event_name].append(
types.MethodType(handler, self))
self.event_handlers[event_name].append(types.MethodType(handler, self))
def event_handler(self, event_name: str):
def decorator(handler):
@@ -157,17 +146,7 @@ class DailyTransportService(BaseTransportService, EventHandler):
return decorator
def write_frame_to_camera(self, frame: bytes):
self.camera.write_frame(frame)
def write_frame_to_mic(self, frame: bytes):
self.mic.write_frames(frame)
def read_audio_frames(self, desired_frame_count):
bytes = self._speaker.read_frames(desired_frame_count)
return bytes
def _prerun(self):
def configure_daily(self):
# Only initialize Daily once
if not DailyTransportService._daily_initialized:
with DailyTransportService._lock:
@@ -175,26 +154,34 @@ class DailyTransportService(BaseTransportService, EventHandler):
DailyTransportService._daily_initialized = True
self.client = CallClient(event_handler=self)
if self._mic_enabled:
if self.mic_enabled:
self.mic: VirtualMicrophoneDevice = Daily.create_microphone_device(
"mic", sample_rate=self._mic_sample_rate, channels=1
"mic", sample_rate=self.mic_sample_rate, channels=1
)
if self._camera_enabled:
if self.camera_enabled:
self.camera: VirtualCameraDevice = Daily.create_camera_device(
"camera", width=self._camera_width, height=self._camera_height, color_format="RGB"
"camera", width=self.camera_width, height=self.camera_height, color_format="RGB"
)
if self._speaker_enabled:
self._speaker: VirtualSpeakerDevice = Daily.create_speaker_device(
"speaker", sample_rate=self._speaker_sample_rate, channels=1
if self.speaker_enabled:
self.speaker: VirtualSpeakerDevice = Daily.create_speaker_device(
"speaker", sample_rate=self.speaker_sample_rate, channels=1
)
Daily.select_speaker_device("speaker")
self.client.set_user_name(self._bot_name)
self.client.join(self._room_url, self._token,
completion=self.call_joined)
self._my_participant_id = self.client.participants()["local"]["id"]
self.image: bytes | None = None
self.images: list[bytes] | None = None
self.camera_thread = Thread(target=self.run_camera, daemon=True)
self.camera_thread.start()
self.logger.info("Starting frame consumer thread")
self.frame_consumer_thread = Thread(target=self.frame_consumer, daemon=True)
self.frame_consumer_thread.start()
self.client.set_user_name(self.bot_name)
self.client.join(self.room_url, self.token, completion=self.call_joined)
self.my_participant_id = self.client.participants()["local"]["id"]
self.client.update_inputs(
{
@@ -235,82 +222,118 @@ class DailyTransportService(BaseTransportService, EventHandler):
}
)
if self._token and self._start_transcription:
if self.token and self.start_transcription:
self.client.start_transcription(self.transcription_settings)
self.original_sigint_handler = signal.getsignal(signal.SIGINT)
signal.signal(signal.SIGINT, self.process_interrupt_handler)
def _receive_audio(self):
"""Receive audio from the Daily call and put it on the receive queue"""
seconds = 1
desired_frame_count = self.speaker_sample_rate * seconds
while True:
buffer = self.speaker.read_frames(desired_frame_count)
if len(buffer) > 0:
frame = AudioQueueFrame(buffer)
if self.loop:
asyncio.run_coroutine_threadsafe(self.receive_queue.put(frame), self.loop)
def process_interrupt_handler(self, signum, frame):
self._post_run()
if callable(self.original_sigint_handler):
self.original_sigint_handler(signum, frame)
def interrupt(self):
self.is_interrupted.set()
def _post_run(self):
self.client.leave()
async def get_receive_frames(self) -> AsyncGenerator[QueueFrame, None]:
while True:
frame = await self.receive_queue.get()
yield frame
if isinstance(frame, EndStreamQueueFrame):
break
def get_async_send_queue(self):
return self.send_queue
async def marshal_frames(self):
while True:
frame: QueueFrame | list = await self.send_queue.get()
self.threadsafe_send_queue.put(frame)
self.send_queue.task_done()
if isinstance(frame, EndStreamQueueFrame):
break
async def wait_for_send_queue_to_empty(self):
await self.send_queue.join()
self.threadsafe_send_queue.join()
async def stop_when_done(self):
await self.wait_for_send_queue_to_empty()
self.stop()
async def run(self) -> None:
self.configure_daily()
self.do_shutdown = False
async_output_queue_marshal_task = asyncio.create_task(self.marshal_frames())
try:
participant_count: int = len(self.client.participants())
self.logger.info(f"{participant_count} participants in room")
while time.time() < self.expiration and not self.do_shutdown and not self.stop_threads.is_set():
await asyncio.sleep(1)
except Exception as e:
self.logger.error(f"Exception {e}")
raise e
finally:
self.client.leave()
self.stop_threads.set()
await self.receive_queue.put(EndStreamQueueFrame())
await self.send_queue.put(EndStreamQueueFrame())
await async_output_queue_marshal_task
if self.camera_thread and self.camera_thread.is_alive():
self.camera_thread.join()
if self.frame_consumer_thread and self.frame_consumer_thread.is_alive():
self.frame_consumer_thread.join()
def stop(self):
self.stop_threads.set()
def on_first_other_participant_joined(self):
pass
def call_joined(self, join_data, client_error):
self._logger.info(f"Call_joined: {join_data}, {client_error}")
def dialout(self, number):
self.client.start_dialout({"phoneNumber": number})
def start_recording(self):
self.client.start_recording()
self.logger.info(f"Call_joined: {join_data}, {client_error}")
if self.speaker_enabled:
t = Thread(target=self._receive_audio, daemon=True)
t.start()
def on_error(self, error):
self._logger.error(f"on_error: {error}")
self.logger.error(f"on_error: {error}")
def on_call_state_updated(self, state):
pass
def on_participant_joined(self, participant):
if not self._other_participant_has_joined and participant["id"] != self._my_participant_id:
self._other_participant_has_joined = True
if not self.other_participant_has_joined and participant["id"] != self.my_participant_id:
self.other_participant_has_joined = True
self.on_first_other_participant_joined()
def on_participant_left(self, participant, reason):
if len(self.client.participants()) < self._min_others_count + 1:
self._stop_threads.set()
async def insert_speech(self, text, sender, date):
await self.receive_queue.put(UserStartedSpeakingFrame())
await asyncio.sleep(0.3)
# frame = TranscriptionQueueFrame(text, sender, date)
# await self.receive_queue.put(frame)
self.on_transcription_message({
"text": text,
"participantId": "cb65b845-aac0-4fc8-987d-2e7ce3c7d8f0",
"timestamp": date
})
await asyncio.sleep(0.3)
await self.receive_queue.put(UserStoppedSpeakingFrame())
if len(self.client.participants()) < self.min_others_count + 1:
self.do_shutdown = True
pass
def on_app_message(self, message, sender):
if self._loop:
print("APP MESSAGE", message)
asyncio.run_coroutine_threadsafe(
self.insert_speech(message["message"], sender, message["date"]), self._loop)
pass
def on_transcription_message(self, message: dict):
if self._loop:
print(f"transcription: {message}")
if self.loop:
participantId = ""
if "participantId" in message:
participantId = message["participantId"]
elif "session_id" in message:
participantId = message["session_id"]
frame = TranscriptionQueueFrame(
message["text"], participantId, message["timestamp"])
if self._my_participant_id and participantId != self._my_participant_id:
self.append_to_context("user", message["text"])
asyncio.run_coroutine_threadsafe(
self.receive_queue.put(frame), self._loop)
frame = TranscriptionQueueFrame(message["text"], participantId, message["timestamp"])
asyncio.run_coroutine_threadsafe(self.receive_queue.put(frame), self.loop)
def on_transcription_stopped(self, stopped_by, stopped_by_error):
pass
@@ -320,3 +343,90 @@ class DailyTransportService(BaseTransportService, EventHandler):
def on_transcription_started(self, status):
pass
def set_image(self, image: bytes):
    """Show a single still *image* and clear any image list."""
    # The tuple assignment stores `image` first and `images` second.
    self.image, self.images = image, None
def set_images(self, images: list[bytes], start_frame=0):
    """Show *images* as a repeating sequence, starting at index *start_frame*.

    ``current_frame`` is stored before ``images``. ``run_camera`` reads
    ``self.current_frame`` as soon as ``self.images`` is truthy, so setting
    ``images`` first let the camera thread hit a ``current_frame`` attribute
    that did not exist yet on the first call.
    """
    self.current_frame = start_frame
    self.images: list[bytes] | None = images
    self.image = None
def run_camera(self):
    """Thread body: write the current image(s) to the camera eight times a second.

    Each iteration writes the still ``self.image`` when set, and the next
    frame of ``self.images`` when set, advancing ``self.current_frame``. It
    runs until ``stop_threads`` is set. Any exception is logged and re-raised.
    """
    try:
        while True:
            if self.stop_threads.is_set():
                break

            if self.image:
                self.camera.write_frame(self.image)

            if self.images:
                position = self.current_frame % len(self.images)
                self.camera.write_frame(self.images[position])
                self.current_frame = position + 1

            time.sleep(1.0 / 8)  # 8 fps
    except Exception as e:
        self.logger.error(f"Exception {e} in camera thread.")
        raise e
# Thread body: pull frames (or lists of frames) off the thread-safe send queue and
# dispatch them -- audio to the virtual mic, images to the camera state -- until an
# EndStreamQueueFrame arrives.
def frame_consumer(self):
self.logger.info("🎬 Starting frame consumer thread")
# Audio bytes received but not yet written to the mic.
b = bytearray()
# Audio is written in multiples of this many bytes; the remainder stays in `b`.
smallest_write_size = 3200
# NOTE(review): every audio chunk is appended here and nothing in this function
# reads it back, so it grows for the lifetime of the thread.
all_audio_frames = bytearray()
while True:
try:
# NOTE(review): get() is called without a timeout, so it blocks until an item
# arrives; the `except Empty` handler below looks unreachable -- confirm.
frames_or_frame: QueueFrame | list[QueueFrame] = self.threadsafe_send_queue.get()
# Normalise the queue item to a list of frames.
if isinstance(frames_or_frame, QueueFrame):
frames: list[QueueFrame] = [frames_or_frame]
elif isinstance(frames_or_frame, list):
frames: list[QueueFrame] = frames_or_frame
else:
raise Exception("Unknown type in output queue")
for frame in frames:
if isinstance(frame, EndStreamQueueFrame):
self.logger.info("Stopping frame consumer thread")
# Mark the item done before leaving so queue.join() callers are released.
self.threadsafe_send_queue.task_done()
return
# if interrupted, we just pull frames off the queue and discard them
if not self.is_interrupted.is_set():
if frame:
if isinstance(frame, AudioQueueFrame):
chunk = frame.data
all_audio_frames.extend(chunk)
b.extend(chunk)
# Largest prefix of `b` that is a whole multiple of smallest_write_size.
l = len(b) - (len(b) % smallest_write_size)
if l:
self.mic.write_frames(bytes(b[:l]))
b = b[l:]
elif isinstance(frame, ImageQueueFrame):
self.set_image(frame.image)
elif isinstance(frame, ImageListQueueFrame):
self.set_images(frame.images)
# Flush whatever audio is still buffered.
elif len(b):
self.mic.write_frames(bytes(b))
b = bytearray()
else:
# if there are leftover audio bytes, write them now; failing to do so
# can cause static in the audio stream.
if len(b):
self.mic.write_frames(bytes(b))
b = bytearray()
# A StartStreamQueueFrame ends the interrupted state.
if isinstance(frame, StartStreamQueueFrame):
self.is_interrupted.clear()
self.threadsafe_send_queue.task_done()
except Empty:
try:
if len(b):
self.mic.write_frames(bytes(b))
except Exception as e:
self.logger.error(f"Exception in frame_consumer: {e}, {len(b)}")
raise e
b = bytearray()

View File

@@ -1,36 +0,0 @@
import os
import aiohttp
import requests
from dailyai.services.ai_services import TTSService
class DeepgramAIService(TTSService):
    """Text-to-speech service that streams audio from Deepgram's beta speak endpoint."""

    def __init__(
        self,
        *,
        aiohttp_session: aiohttp.ClientSession,
        api_key,
        voice,
        sample_rate=16000
    ):
        """Store the session and request settings.

        Args:
            aiohttp_session: Session used for the HTTP request. This class
                does not close it.
            api_key: Sent in the ``authorization`` header as ``token <key>``.
            voice: Sent as the ``model`` query parameter.
            sample_rate: Sent as the ``sample_rate`` query parameter.
        """
        super().__init__()

        self._api_key = api_key
        self._voice = voice
        self._sample_rate = sample_rate
        self._aiohttp_session = aiohttp_session

    async def run_tts(self, sentence):
        """Yield audio chunks for *sentence* as the response body arrives.

        The request asks for ``linear16`` encoding with ``container=none``.
        On an HTTP error status the response body is logged and nothing is
        yielded, so an error payload is never passed on as audio.
        """
        self.logger.info(f"Running deepgram tts for {sentence}")
        base_url = "https://api.beta.deepgram.com/v1/speak"
        request_url = f"{base_url}?model={self._voice}&encoding=linear16&container=none&sample_rate={self._sample_rate}"
        headers = {"authorization": f"token {self._api_key}", "Content-Type": "application/json"}
        data = {"text": sentence}

        async with self._aiohttp_session.post(
            request_url, headers=headers, json=data
        ) as r:
            if not r.ok:
                error_body = await r.text()
                self.logger.error(
                    f"deepgram tts status code: {r.status}, error: {error_body}"
                )
                return

            async for chunk in r.content:
                if chunk:
                    yield chunk

View File

@@ -9,12 +9,11 @@ from dailyai.services.ai_services import TTSService
class DeepgramTTSService(TTSService):
def __init__(self, *, aiohttp_session, api_key, voice="alpha-asteria-en-v2"):
def __init__(self, speech_key=None, voice=None):
super().__init__()
self._voice = voice
self._api_key = api_key
self._aiohttp_session = aiohttp_session
self.voice = voice or os.getenv("DEEPGRAM_VOICE") or "alpha-asteria-en-v2"
self.speech_key = speech_key or os.getenv("DEEPGRAM_API_KEY")
def get_mic_sample_rate(self):
return 24000
@@ -22,9 +21,10 @@ class DeepgramTTSService(TTSService):
async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
self.logger.info(f"Running deepgram tts for {sentence}")
base_url = "https://api.beta.deepgram.com/v1/speak"
request_url = f"{base_url}?model={self._voice}&encoding=linear16&container=none&sample_rate=16000"
headers = {"authorization": f"token {self._api_key}"}
request_url = f"{base_url}?model={self.voice}&encoding=linear16&container=none&sample_rate=16000"
headers = {"authorization": f"token {self.speech_key}"}
body = {"text": sentence}
async with self._aiohttp_session.post(request_url, headers=headers, json=body) as r:
async for data in r.content:
yield data
async with aiohttp.ClientSession() as session:
async with session.post(request_url, headers=headers, json=body) as r:
async for data in r.content:
yield data

View File

@@ -9,37 +9,28 @@ from dailyai.services.ai_services import TTSService
class ElevenLabsTTSService(TTSService):
def __init__(
self,
*,
aiohttp_session: aiohttp.ClientSession,
api_key,
voice_id,
):
def __init__(self, api_key=None, voice_id=None):
super().__init__()
self._api_key = api_key
self._voice_id = voice_id
self._aiohttp_session = aiohttp_session
self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
self.voice_id = voice_id or os.getenv("ELEVENLABS_VOICE_ID")
async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._voice_id}/stream"
payload = {"text": sentence, "model_id": "eleven_turbo_v2"}
querystring = {"output_format": "pcm_16000", "optimize_streaming_latency": 2}
headers = {
"xi-api-key": self._api_key,
"Content-Type": "application/json",
}
async with self._aiohttp_session.post(
url, json=payload, headers=headers, params=querystring
) as r:
if r.status != 200:
self.logger.error(
f"audio fetch status code: {r.status}, error: {r.text}"
)
return
async with aiohttp.ClientSession() as session:
url = f"https://api.elevenlabs.io/v1/text-to-speech/{self.voice_id}/stream"
payload = {"text": sentence, "model_id": "eleven_turbo_v2"}
querystring = {"output_format": "pcm_16000", "optimize_streaming_latency": 2}
headers = {
"xi-api-key": self.api_key,
"Content-Type": "application/json",
}
async with session.post(url, json=payload, headers=headers, params=querystring) as r:
if r.status != 200:
self.logger.error(
f"audio fetch status code: {r.status}, error: {r.text}"
)
return
async for chunk in r.content:
if chunk:
yield chunk
async for chunk in r.content:
if chunk:
yield chunk

View File

@@ -2,43 +2,32 @@ import fal
import aiohttp
import asyncio
import io
import os
import json
from PIL import Image
from dailyai.services.ai_services import ImageGenService
from dailyai.services.ai_services import ImageGenService
from dailyai.services.ai_services import LLMService, TTSService, ImageGenService
# Fal expects FAL_KEY_ID and FAL_KEY_SECRET to be set in the env
class FalImageGenService(ImageGenService):
def __init__(
self,
*,
image_size,
aiohttp_session: aiohttp.ClientSession,
key_id=None,
key_secret=None):
def __init__(self, image_size):
super().__init__(image_size)
self._aiohttp_session = aiohttp_session
if key_id:
os.environ["FAL_KEY_ID"] = key_id
if key_secret:
os.environ["FAL_KEY_SECRET"] = key_secret
async def run_image_gen(self, sentence) -> tuple[str, bytes]:
def get_image_url(sentence, size):
print("starting fal submit...")
handler = fal.apps.submit(
"110602490-fast-sdxl",
arguments={
"prompt": sentence,
"seed": 23
"prompt": sentence
},
)
print("past fal handler init, about to wait for iter_events...")
for event in handler.iter_events():
if isinstance(event, fal.apps.InProgress):
pass
print('Request in progress')
print(event.logs)
result = handler.get()
@@ -47,9 +36,16 @@ class FalImageGenService(ImageGenService):
raise Exception("Image generation failed")
return image_url
print(f"fetching image url...")
image_url = await asyncio.to_thread(get_image_url, sentence, self.image_size)
print(f"got image url, downloading image...")
# Load the image from the url
async with self._aiohttp_session.get(image_url) as response:
image_stream = io.BytesIO(await response.content.read())
image = Image.open(image_stream)
return (image_url, image.tobytes())
async with aiohttp.ClientSession() as session:
async with session.get(image_url) as response:
print("got image response")
image_stream = io.BytesIO(await response.content.read())
print("read image stream")
image = Image.open(image_stream)
return (image_url, image.tobytes())
# return (image_url, dalle_im.tobytes())

View File

@@ -1,122 +0,0 @@
import aiohttp
from PIL import Image
import io
from openai import AsyncOpenAI
import asyncio
import json
from collections.abc import AsyncGenerator
from dailyai.services.ai_services import LLMService, ImageGenService
from dailyai.queue_frame import (TextQueueFrame, TextQueueOutOfBandFrame)
class FireworksLLMService(LLMService):
def __init__(self, *, api_key, model="", tools=None, context, change_appearance, transport=""):
    """Create an OpenAI-compatible client pointed at the Fireworks inference API.

    Args:
        api_key: Fireworks API key handed to ``AsyncOpenAI``.
        model: Model name sent with each completion request.
        tools: Tool definitions sent with each completion request. Defaults
            to a fresh empty list per instance, so instances never share one
            mutable default.
        context: Passed to the base-class initialiser.
        change_appearance: Callable stored for use when the model issues a
            tool call.
        transport: Object stored as ``self._transport``.
    """
    super().__init__(context)
    self._model = model
    self._tools = [] if tools is None else tools
    self._change_appearance = change_appearance
    self._transport = transport
    self._client = AsyncOpenAI(
        api_key=api_key,
        base_url="https://api.fireworks.ai/inference/v1"
    )
async def get_response(self, messages, stream):
    """Request a chat completion for *messages* and return the client's result.

    Uses the configured model and tools with a fixed temperature of 0.1.
    *stream* is forwarded unchanged.
    """
    print("GET RESPONSE ... WHEN DO WE EXPECT THIS TO BE CALLED?")
    create_completion = self._client.chat.completions.create
    request = {
        "stream": stream,
        "messages": messages,
        "model": self._model,
        "temperature": 0.1,
        "tools": self._tools,
    }
    return await create_completion(**request)
async def run_llm_async(self, messages) -> AsyncGenerator[str, None]:
print("IN ASYNC")
messages_for_log = json.dumps(messages)
self.logger.debug(f"Generating chat via openai: {messages_for_log}")
chunks = await self._client.chat.completions.create(
model=self._model,
stream=True, # BLARGH
messages=messages,
temperature=0.1,
tools=self._tools
)
tool_call = {}
async for chunk in chunks:
print(f"CHUNK: {chunk}")
if len(chunk.choices) == 0:
continue
if chunk.choices[0].delta.content:
yield chunk.choices[0].delta.content
if chunk.choices[0].delta.tool_calls:
print(f"TOOL CALLS: {chunk.choices[0].delta.tool_calls[0]}")
if chunk.choices[0].delta.tool_calls[0].function.name:
tool_call["id"] = chunk.choices[0].delta.tool_calls[0].id
tool_call["name"] = chunk.choices[0].delta.tool_calls[0].function.name
tool_call["arguments"] = ''
if chunk.choices[0].delta.tool_calls[0].function.arguments:
tool_call["arguments"] += chunk.choices[0].delta.tool_calls[0].function.arguments
if chunk.choices[0].finish_reason:
print(f"TOOL CALLS ACCUM -- {tool_call}")
if tool_call.get("name"):
# hard coding tool call action for now. we should assemble the tool call
# from the streaming response, then yield it to the pipeline.
# this approach works for the first few change appearance requests but
# then the model starts refusing. need to read more about function
# calling, try this with the OpenAI APIs, and talk to the Fireworks people.
self._transport.append_to_context("assistant", {
# pipeline will append the content to this context after it goes
# through tts. we need to manually append the tool call, though
"content": "",
"role": "assistant",
"tool_calls": [
{
"id": tool_call["id"],
"type": "function",
"index": 0,
"function": {
"name": tool_call["name"],
"arguments": tool_call["arguments"]
},
}
],
})
self._transport.append_to_context("tool", {
"content": "image generated by prompt arguments: " + tool_call["arguments"],
"role": "tool",
"tool_call_id": tool_call["id"]
})
self._transport.append_to_context("assistant", {
"content": f"call to {tool_call['name']} function succeeded",
"role": "assistant",
})
print("APPENDED TO CONTEXT")
image_prompt = json.loads(
tool_call["arguments"]).get("appearance")
print("IMAGE PROMPT", image_prompt)
asyncio.create_task(
self._change_appearance(image_prompt))
yield TextQueueOutOfBandFrame("Sure, let me work on that for you!")
# yield {"content": "Sure, let me work on that for you!"}
# yield "Sure, let me work on that for you!"
async def run_llm(self, messages) -> str | None:
print("--> IN SYNC ... WHEN DO WE EXPECT THIS TO BE CALLED?")
messages_for_log = json.dumps(messages)
self.logger.debug(f"Generating chat via openai: {messages_for_log}")
response = await self._client.chat.completions.create(model=self._model, stream=False, messages=messages)
if response and len(response.choices) > 0:
return response.choices[0].message.content
else:
return None

View File

@@ -1,33 +0,0 @@
import os
import groq
from groq import AsyncGroq
from dailyai.services.ai_services import LLMService
from collections.abc import AsyncGenerator
class GroqLLMService(LLMService):
    """LLM service that sends a single (non-streaming) chat request to Groq."""

    def __init__(self, *, api_key, model="mixtral-8x7b-32768", context):
        """Create the service.

        Args:
            api_key: Groq API key, passed to the client.
            model: Model identifier sent with each request.
            context: Passed through to ``LLMService.__init__``.
        """
        super().__init__(context)
        self._model = model
        # Pass the key explicitly: it was previously accepted but never used,
        # so the client silently depended on its own environment lookup.
        self._client = AsyncGroq(api_key=api_key)

    async def run_llm_async(self, messages) -> AsyncGenerator[str, None]:
        """Yield the completion text for ``messages``, if any.

        API errors are logged and result in nothing being yielded (best
        effort, matching the earlier behavior of not propagating them).
        """
        self.logger.debug(f"messages are {messages}")
        try:
            resp = await self._client.chat.completions.create(messages=messages, model=self._model)
        except groq.APIConnectionError as e:
            # __cause__ is the underlying exception, likely raised within httpx.
            self.logger.error(f"The server could not be reached: {e.__cause__!r}")
            return
        except groq.RateLimitError:
            # Kept ahead of the general status handler so the 429 case gets
            # its own message.
            self.logger.error("A 429 status code was received; we should back off a bit.")
            return
        except groq.APIStatusError as e:
            self.logger.error(
                f"Another non-200-range status code was received: {e.status_code} {e.response}"
            )
            return

        self.logger.debug(f"got chunks from groq: {resp}")
        # Guard against an empty choices list instead of raising IndexError.
        if resp.choices and resp.choices[0].message.content:
            yield resp.choices[0].message.content

View File

@@ -1,10 +1,9 @@
import array
import io
import math
import time
from typing import AsyncGenerator
import wave
from dailyai.queue_frame import AudioQueueFrame, QueueFrame, TranscriptionQueueFrame
from dailyai.queue_frame import AudioQueueFrame, QueueFrame, TextQueueFrame
from dailyai.services.ai_services import STTService
@@ -60,7 +59,7 @@ class LocalSTTService(STTService):
self._content.seek(0)
text = await self.run_stt(self._content)
self._new_wave()
yield TranscriptionQueueFrame(text, '', str(time.time()))
yield TextQueueFrame(text)
# If we get this far, this is a frame of silence
self._current_silence_frames += 1

View File

@@ -1,76 +0,0 @@
import asyncio
import time
import numpy as np
import tkinter as tk
import pyaudio
from dailyai.services.base_transport_service import BaseTransportService
class LocalTransportService(BaseTransportService):
    """Transport that plays/records audio with PyAudio and shows frames in tkinter."""

    def __init__(self, **kwargs):
        """Create the transport.

        Recognized keyword arguments (all others go to the base class only):
            sample_width: Bytes per audio sample; defaults to 2.
            n_channels: Number of audio channels; defaults to 1.
            tk_root: tkinter root widget; required when the camera is enabled.

        Raises:
            ValueError: If the camera is enabled and no ``tk_root`` was given.
        """
        super().__init__(**kwargs)
        self._sample_width = kwargs.get("sample_width") or 2
        self._n_channels = kwargs.get("n_channels") or 1
        self._tk_root = kwargs.get("tk_root") or None

        if self._camera_enabled and not self._tk_root:
            raise ValueError("If camera is enabled, a tkinter root must be provided")

        if self._speaker_enabled:
            self._speaker_buffer_pending = bytearray()

    async def _write_frame_to_tkinter(self, frame: bytes):
        """Display raw pixel bytes in the image label by wrapping them as a binary PPM."""
        data = f"P6 {self._camera_width} {self._camera_height} 255 ".encode() + frame
        photo = tk.PhotoImage(
            width=self._camera_width,
            height=self._camera_height,
            data=data,
            format="PPM")
        self._image_label.config(image=photo)
        # This holds a reference to the photo, preventing it from being garbage collected.
        self._image_label.image = photo  # type: ignore

    def write_frame_to_camera(self, frame: bytes):
        """Schedule ``frame`` for display on the transport's event loop."""
        if self._camera_enabled and self._loop:
            asyncio.run_coroutine_threadsafe(
                self._write_frame_to_tkinter(frame), self._loop
            )

    def write_frame_to_mic(self, frame: bytes):
        """Write audio bytes to the PyAudio output stream."""
        self._audio_stream.write(frame)

    def read_frames(self, desired_frame_count):
        """Read ``desired_frame_count`` frames from the PyAudio input stream."""
        # Named so the builtin ``bytes`` is not shadowed.
        frames = self._speaker_stream.read(
            desired_frame_count,
            exception_on_overflow=False,
        )
        return frames

    def _prerun(self):
        """Open the audio streams and create the tkinter image label as enabled."""
        # Both streams need the PyAudio instance; creating it only for the mic
        # made a speaker-only configuration fail with AttributeError.
        if self._mic_enabled or self._speaker_enabled:
            self._pyaudio = pyaudio.PyAudio()

        if self._mic_enabled:
            self._audio_stream = self._pyaudio.open(
                format=self._pyaudio.get_format_from_width(self._sample_width),
                channels=self._n_channels,
                rate=self._speaker_sample_rate,
                output=True,
            )

        if self._camera_enabled:
            # Start with a neutral gray background. The buffer has three
            # channels, so it is labelled P6 (colour) like the per-frame
            # writer, and sized from the configured camera dimensions.
            width, height = self._camera_width, self._camera_height
            pixels = np.full((height, width, 3), 128, dtype=np.uint8)
            data = f"P6 {width} {height} 255 ".encode() + pixels.tobytes()
            photo = tk.PhotoImage(width=width, height=height, data=data, format="PPM")
            self._image_label = tk.Label(self._tk_root, image=photo)
            # Keep a reference; tkinter does not, and a collected PhotoImage
            # renders as an empty label.
            self._image_label.image = photo  # type: ignore
            self._image_label.pack()

        if self._speaker_enabled:
            self._speaker_stream = self._pyaudio.open(
                format=self._pyaudio.get_format_from_width(self._sample_width),
                channels=self._n_channels,
                rate=self._speaker_sample_rate,
                frames_per_buffer=self._speaker_sample_rate,
                input=True
            )

View File

@@ -1,42 +0,0 @@
from openai import AsyncOpenAI
import json
from collections.abc import AsyncGenerator
from dailyai.services.ai_services import LLMService
class OLLamaLLMService(LLMService):
    """LLM service that drives the OpenAI async client against ``base_url``."""

    def __init__(self, model="llama2", base_url='http://localhost:11434/v1'):
        """Store the model name and build the client for ``base_url``."""
        super().__init__()
        self._model = model
        self._client = AsyncOpenAI(base_url=base_url, api_key="ollama")

    async def get_response(self, messages, stream):
        """Issue a chat completion request and return whatever the client returns."""
        completion = await self._client.chat.completions.create(
            model=self._model,
            messages=messages,
            stream=stream,
        )
        return completion

    async def run_llm_async(self, messages) -> AsyncGenerator[str, None]:
        """Stream a completion, yielding each non-empty content fragment."""
        self.logger.debug(f"Generating chat via openai: {json.dumps(messages)}")
        stream = await self._client.chat.completions.create(
            messages=messages,
            stream=True,
            model=self._model,
        )
        async for part in stream:
            # Chunks without any choice carry no text.
            if len(part.choices) > 0:
                fragment = part.choices[0].delta.content
                if fragment:
                    yield fragment

    async def run_llm(self, messages) -> str | None:
        """Run a non-streaming completion and return its text, or None."""
        self.logger.debug(f"Generating chat via openai: {json.dumps(messages)}")
        result = await self._client.chat.completions.create(
            messages=messages,
            stream=False,
            model=self._model,
        )
        if not result or len(result.choices) == 0:
            return None
        return result.choices[0].message.content

View File

@@ -1,33 +1,38 @@
import requests
import aiohttp
import asyncio
from PIL import Image
import io
from openai import AsyncOpenAI
import os
import json
from collections.abc import AsyncGenerator
from dailyai.services.ai_services import LLMService, ImageGenService
from dailyai.services.ai_services import AIService, TTSService, LLMService, ImageGenService
class OpenAILLMService(LLMService):
def __init__(self, *, api_key, model="gpt-4-turbo-preview", context):
super().__init__(context)
self._model = model
self._client = AsyncOpenAI(api_key=api_key)
def __init__(self, api_key=None, model=None):
super().__init__()
api_key = api_key or os.getenv("OPEN_AI_KEY")
self.model = model or os.getenv("OPEN_AI_LLM_MODEL") or "gpt-4"
self.client = AsyncOpenAI(api_key=api_key)
async def get_response(self, messages, stream):
return await self._client.chat.completions.create(
return await self.client.chat.completions.create(
stream=stream,
messages=messages,
model=self._model
model=self.model
)
async def run_llm_async(self, messages) -> AsyncGenerator[str, None]:
messages_for_log = json.dumps(messages)
self.logger.debug(f"Generating chat via openai: {messages_for_log}")
chunks = await self._client.chat.completions.create(model=self._model, stream=True, messages=messages)
async for chunk in chunks:
response = await self.get_response(messages, stream=True)
for chunk in response:
if len(chunk.choices) == 0:
continue
@@ -38,7 +43,7 @@ class OpenAILLMService(LLMService):
messages_for_log = json.dumps(messages)
self.logger.debug(f"Generating chat via openai: {messages_for_log}")
response = await self._client.chat.completions.create(model=self._model, stream=False, messages=messages)
response = await self.get_response(messages, stream=False)
if response and len(response.choices) > 0:
return response.choices[0].message.content
else:
@@ -46,27 +51,18 @@ class OpenAILLMService(LLMService):
class OpenAIImageGenService(ImageGenService):
def __init__(
self,
*,
image_size: str,
aiohttp_session: aiohttp.ClientSession,
api_key,
model="dall-e-3",
):
def __init__(self, image_size: str, api_key=None, model=None):
super().__init__(image_size=image_size)
self._model = model
print(f"api key: {api_key}")
self._client = AsyncOpenAI(api_key=api_key)
self._aiohttp_session = aiohttp_session
api_key = api_key or os.getenv("OPEN_AI_KEY")
self.model = model or os.getenv("OPEN_AI_IMAGE_MODEL") or "dall-e-3"
self.client = AsyncOpenAI(api_key=api_key)
async def run_image_gen(self, sentence) -> tuple[str, bytes]:
self.logger.info("Generating OpenAI image", sentence)
image = await self._client.images.generate(
image = await self.client.images.generate(
prompt=sentence,
model=self._model,
model=self.model,
n=1,
size=self.image_size
)
@@ -75,7 +71,10 @@ class OpenAIImageGenService(ImageGenService):
raise Exception("No image provided in response", image)
# Load the image from the url
async with self._aiohttp_session.get(image_url) as response:
image_stream = io.BytesIO(await response.content.read())
image = Image.open(image_stream)
return (image_url, image.tobytes())
async with aiohttp.ClientSession() as session:
async with session.get(image_url) as response:
image_stream = io.BytesIO(await response.content.read())
image = Image.open(image_stream)
return (image_url, image.tobytes())
return (image_url, dalle_im.tobytes())

View File

@@ -0,0 +1,29 @@
import os
import requests
from services.ai_service import AIService
from PIL import Image
class DeepgramAIService(AIService):
    """Text-to-speech service that posts sentences to the Deepgram speak endpoint."""

    # Upper bound (seconds) for the HTTP request; without a timeout a stalled
    # connection would block the caller indefinitely.
    _REQUEST_TIMEOUT_SECONDS = 30

    def __init__(self, **kwargs):
        """Initialize the base service and read ``DEEPGRAM_API_KEY`` from the environment."""
        super().__init__(**kwargs)
        self.api_key = os.getenv("DEEPGRAM_API_KEY")

    def get_mic_sample_rate(self):
        """Return the sample rate used for this service's audio (24000)."""
        return 24000

    def run_tts(self, sentence):
        """Synthesize ``sentence`` and yield the response body as one chunk.

        The voice comes from ``DEEPGRAM_VOICE`` (default
        ``alpha-apollo-en-v1``). Nothing is yielded when the request does not
        succeed, so an error payload is never handed on as audio.
        """
        self.logger.info(f"Running deepgram tts for {sentence}")
        base_url = "https://api.beta.deepgram.com/v1/speak"
        voice = os.getenv("DEEPGRAM_VOICE") or "alpha-apollo-en-v1"

        # Let requests build and escape the query string.
        params = {"model": voice, "encoding": "linear16", "container": "none"}
        headers = {"authorization": f"token {self.api_key}"}
        r = requests.post(
            base_url,
            params=params,
            headers=headers,
            # Encode explicitly so non-ASCII text is sent as UTF-8 rather than
            # relying on the HTTP layer's default handling of str bodies.
            data=sentence.encode("utf-8"),
            timeout=self._REQUEST_TIMEOUT_SECONDS,
        )
        self.logger.info(
            f"audio fetch status code: {r.status_code}, content length: {len(r.content)}"
        )
        if not r.ok:
            self.logger.error(f"deepgram tts request failed: {r.status_code} {r.text}")
            return

        yield r.content

View File

@@ -1,40 +1,36 @@
import io
import os
import struct
from pyht import Client
from dotenv import load_dotenv
from pyht.client import TTSOptions
from pyht.protos.api_pb2 import Format
from dailyai.services.ai_services import TTSService
from services.ai_service import AIService
class PlayHTAIService(TTSService):
class PlayHTAIService(AIService):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def __init__(
self,
*,
api_key,
user_id,
voice_url
):
super().__init__()
self.speech_key = api_key
self.user_id = user_id
self.speech_key = os.getenv("PLAY_HT_KEY") or ''
self.user_id = os.getenv("PLAY_HT_USER_ID") or ''
self.client = Client(
user_id=self.user_id,
api_key=self.speech_key,
)
self.options = TTSOptions(
voice=voice_url,
voice="s3://voice-cloning-zero-shot/820da3d2-3a3b-42e7-844d-e68db835a206/sarah/manifest.json",
sample_rate=16000,
quality="higher",
format=Format.FORMAT_WAV)
def __del__(self):
def close(self):
super().close()
self.client.close()
async def run_tts(self, sentence):
def run_tts(self, sentence):
b = bytearray()
in_header = True
for chunk in self.client.tts(sentence, self.options):

View File

@@ -46,7 +46,7 @@ class WhisperSTTService(LocalSTTService):
compute_type=self._compute_type)
self._model = model
async def run_stt(self, audio: BinaryIO) -> str:
async def run_stt(self, audio: BinaryIO = None) -> str:
"""Transcribes given audio using Whisper"""
segments, _ = await asyncio.to_thread(self._model.transcribe, audio)
res: str = ""

View File

@@ -1,3 +1,4 @@
from re import A
import unittest
from typing import AsyncGenerator, Generator

View File

@@ -1,81 +0,0 @@
import asyncio
import unittest
from unittest.mock import MagicMock, patch
from dailyai.queue_frame import AudioQueueFrame, ImageQueueFrame
class TestDailyTransport(unittest.IsolatedAsyncioTestCase):
    """Tests for event-handler dispatch and the run loop of DailyTransportService."""

    async def test_event_handler(self):
        """A synchronous handler runs when the event method is called."""
        from dailyai.services.daily_transport_service import DailyTransportService

        transport = DailyTransportService("mock.daily.co/mock", "token", "bot")
        invocations = []

        @transport.event_handler("on_first_other_participant_joined")
        def test_event_handler(transport):
            invocations.append(True)

        transport.on_first_other_participant_joined()

        # A non-empty list means the handler fired.
        self.assertTrue(invocations)

    async def test_event_handler_async(self):
        """A coroutine handler is run to completion after the event method is called."""
        from dailyai.services.daily_transport_service import DailyTransportService

        transport = DailyTransportService("mock.daily.co/mock", "token", "bot")
        handler_done = asyncio.Event()

        @transport.event_handler("on_first_other_participant_joined")
        async def test_event_handler(transport):
            await asyncio.sleep(0.1)
            handler_done.set()

        transport.on_first_other_participant_joined()

        await asyncio.wait_for(handler_done.wait(), timeout=1)
        self.assertTrue(handler_done.is_set())

    @patch("dailyai.services.daily_transport_service.CallClient")
    @patch("dailyai.services.daily_transport_service.Daily")
    async def test_run_with_camera_and_mic(self, daily_mock, callclient_mock):
        """Queued audio and image frames reach the mocked mic and camera devices."""
        from dailyai.services.daily_transport_service import DailyTransportService

        room = "https://mock.daily.co/mock"
        transport = DailyTransportService(
            room,
            "token",
            "bot",
            mic_enabled=True,
            camera_enabled=True,
            duration_minutes=0.01,
        )

        mic, camera = MagicMock(), MagicMock()
        daily_mock.create_microphone_device.return_value = mic
        daily_mock.create_camera_device.return_value = camera

        async def send_audio_frame():
            silence = bytes(3300)
            await transport.send_queue.put(AudioQueueFrame(silence))

        async def send_video_frame():
            await transport.send_queue.put(ImageQueueFrame(None, b"test"))

        await asyncio.gather(transport.run(), send_audio_frame(), send_video_frame())

        daily_mock.init.assert_called_once_with()
        daily_mock.create_microphone_device.assert_called_once()
        daily_mock.create_camera_device.assert_called_once()

        client = callclient_mock.return_value
        client.set_user_name.assert_called_once_with("bot")
        client.join.assert_called_once_with(
            room, "token", completion=transport.call_joined
        )

        camera.write_frame.assert_called_with(b"test")
        mic.write_frames.assert_called()

View File

@@ -1,64 +0,0 @@
import asyncio
import aiohttp
import os
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.playht_ai_service import PlayHTAIService
from examples.foundational.support.runner import configure
async def main(room_url):
    """Join ``room_url`` and greet each non-local participant with PlayHT speech."""
    async with aiohttp.ClientSession() as session:
        # The transport is configured from explicit arguments here; services
        # can also define and document environment variables they read, and
        # accept an optional config object in place of those variables.
        #
        # The abstract transport service APIs presumably map pretty closely
        # to the daily-python basic API.
        meeting_duration_minutes = 5
        transport = DailyTransportService(
            room_url, None, "Say One Thing", meeting_duration_minutes, mic_enabled=True
        )

        # ElevenLabs alternative, kept for reference:
        # tts = ElevenLabsTTSService(
        #     aiohttp_session=session,
        #     api_key=os.getenv("ELEVENLABS_API_KEY"),
        #     voice_id=os.getenv("ELEVENLABS_VOICE_ID"))
        tts = PlayHTAIService(
            api_key=os.getenv("PLAY_HT_API_KEY"),
            user_id=os.getenv("PLAY_HT_USER_ID"),
            voice_url=os.getenv("PLAY_HT_VOICE_URL"),
        )

        # Play the greeting once a participant other than the bot joins.
        @transport.event_handler("on_participant_joined")
        async def on_participant_joined(transport, participant):
            info = participant["info"]
            if info["isLocal"]:
                return

            greeting = "Hello there, " + info["userName"] + "!"
            await tts.say(greeting, transport.send_queue)

            # Leave the meeting once the output queue has drained.
            await transport.stop_when_done()

        await transport.run()
        del tts
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url))

View File

@@ -1,34 +0,0 @@
import asyncio
import aiohttp
import os
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.local_transport_service import LocalTransportService
async def main():
    """Speak one ElevenLabs-synthesized sentence through the local transport."""
    async with aiohttp.ClientSession() as session:
        meeting_duration_minutes = 1
        transport = LocalTransportService(
            mic_enabled=True,
            duration_minutes=meeting_duration_minutes,
        )
        tts = ElevenLabsTTSService(
            voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            aiohttp_session=session,
        )

        async def say_something():
            # Brief pause before speaking, then stop once the queue drains.
            await asyncio.sleep(1)
            await tts.say("Hello there.", transport.send_queue)
            await transport.stop_when_done()

        # Run the transport and the speaker side by side.
        await asyncio.gather(transport.run(), say_something())
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,60 +0,0 @@
import asyncio
import os
import aiohttp
from dailyai.queue_frame import LLMMessagesQueueFrame
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.deepgram_ai_services import DeepgramTTSService
from dailyai.services.open_ai_services import OpenAILLMService
from examples.foundational.support.runner import configure
async def main(room_url):
    """Join ``room_url`` and speak one LLM-generated greeting via ElevenLabs."""
    async with aiohttp.ClientSession() as session:
        transport = DailyTransportService(
            room_url,
            None,
            "Say One Thing From an LLM",
            duration_minutes=1,
            mic_enabled=True,
            speaker_enabled=True
        )

        tts = ElevenLabsTTSService(
            aiohttp_session=session,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            voice_id=os.getenv("ELEVENLABS_VOICE_ID"))
        # tts = AzureTTSService(api_key=os.getenv("AZURE_SPEECH_API_KEY"), region=os.getenv("AZURE_SPEECH_REGION"))
        # tts = DeepgramTTSService(aiohttp_session=session, api_key=os.getenv("DEEPGRAM_API_KEY"), voice=os.getenv("DEEPGRAM_VOICE"))

        llm = AzureLLMService(
            api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
            endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
            model=os.getenv("AZURE_CHATGPT_MODEL"))
        # llm = OpenAILLMService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))

        system_prompt = {
            "role": "system",
            "content": "You are an LLM in a WebRTC session, and this is a 'hello world' demo. Say hello to the world."
        }
        messages = [system_prompt]

        # Start generating right away; the handler below only waits for it.
        llm_frames = llm.run([LLMMessagesQueueFrame(messages)])
        tts_task = asyncio.create_task(
            tts.run_to_queue(transport.send_queue, llm_frames)
        )

        @transport.event_handler("on_first_other_participant_joined")
        async def on_first_other_participant_joined(transport):
            await tts_task
            await transport.stop_when_done()

        await transport.run()
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url))

View File

@@ -1,53 +0,0 @@
import asyncio
import aiohttp
import os
from dailyai.queue_frame import TextQueueFrame
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.fal_ai_services import FalImageGenService
from dailyai.services.open_ai_services import OpenAIImageGenService
from dailyai.services.azure_ai_services import AzureImageGenServiceREST
from examples.foundational.support.runner import configure
local_joined = False
participant_joined = False
async def main(room_url):
    """Join ``room_url`` with the camera on and queue one generated still image."""
    async with aiohttp.ClientSession() as session:
        transport = DailyTransportService(
            room_url,
            None,
            "Show a still frame image",
            duration_minutes=1,
            mic_enabled=False,
            camera_enabled=True,
            camera_width=1024,
            camera_height=1024
        )

        imagegen = FalImageGenService(
            image_size="1024x1024",
            aiohttp_session=session,
            key_id=os.getenv("FAL_KEY_ID"),
            key_secret=os.getenv("FAL_KEY_SECRET"))
        # imagegen = OpenAIImageGenService(aiohttp_session=session, api_key=os.getenv("OPENAI_DALLE_API_KEY"), image_size="1024x1024")
        # imagegen = AzureImageGenServiceREST(image_size="1024x1024", aiohttp_session=session, api_key=os.getenv("AZURE_DALLE_API_KEY"), endpoint=os.getenv("AZURE_DALLE_ENDPOINT"), model=os.getenv("AZURE_DALLE_MODEL"))

        # Generation starts immediately; the handler just awaits the result.
        prompt_frames = [TextQueueFrame("a cat in the style of picasso")]
        image_task = asyncio.create_task(
            imagegen.run_to_queue(transport.send_queue, prompt_frames)
        )

        @transport.event_handler("on_first_other_participant_joined")
        async def on_first_other_participant_joined(transport):
            await image_task

        await transport.run()
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url))

View File

@@ -1,50 +0,0 @@
import asyncio
import aiohttp
import os
import tkinter as tk
from dailyai.queue_frame import TextQueueFrame
from dailyai.services.fal_ai_services import FalImageGenService
from dailyai.services.local_transport_service import LocalTransportService
local_joined = False
participant_joined = False
async def main():
    """Show one generated still image in a tkinter window via the local transport."""
    async with aiohttp.ClientSession() as session:
        meeting_duration_minutes = 2

        tk_root = tk.Tk()
        tk_root.title("Calendar")

        transport = LocalTransportService(
            tk_root=tk_root,
            mic_enabled=True,
            camera_enabled=True,
            camera_width=1024,
            camera_height=1024,
            duration_minutes=meeting_duration_minutes,
        )

        imagegen = FalImageGenService(
            image_size="1024x1024",
            aiohttp_session=session,
            key_id=os.getenv("FAL_KEY_ID"),
            key_secret=os.getenv("FAL_KEY_SECRET"),
        )

        prompt_frames = [TextQueueFrame("a cat in the style of picasso")]
        image_task = asyncio.create_task(
            imagegen.run_to_queue(transport.send_queue, prompt_frames)
        )

        async def run_tk():
            # Pump tkinter by hand from the asyncio loop until the transport
            # signals its threads to stop.
            while True:
                if transport._stop_threads.is_set():
                    break
                tk_root.update()
                tk_root.update_idletasks()
                await asyncio.sleep(0.1)

        await asyncio.gather(transport.run(), image_task, run_tk())
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,73 +0,0 @@
import asyncio
import os
import aiohttp
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
from dailyai.queue_frame import EndStreamQueueFrame, LLMMessagesQueueFrame
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from examples.foundational.support.runner import configure
async def main(room_url: str):
    """Speak a fixed Azure TTS intro, then an LLM-generated joke via ElevenLabs."""
    async with aiohttp.ClientSession() as session:
        transport = DailyTransportService(
            room_url,
            None,
            "Static And Dynamic Speech",
            duration_minutes=1,
            mic_enabled=True,
            mic_sample_rate=16000,
            camera_enabled=False
        )

        llm = AzureLLMService(
            api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
            endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
            model=os.getenv("AZURE_CHATGPT_MODEL"))
        azure_tts = AzureTTSService(
            api_key=os.getenv("AZURE_SPEECH_API_KEY"),
            region=os.getenv("AZURE_SPEECH_REGION"))
        elevenlabs_tts = ElevenLabsTTSService(
            aiohttp_session=session,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            voice_id=os.getenv("ELEVENLABS_VOICE_ID"))

        messages = [{"role": "system", "content": "tell the user a joke about llamas"}]

        # The joke is generated and converted to audio into a side buffer while
        # the static intro is being spoken, so there is no delay before the
        # LLM response is played.
        buffer_queue = asyncio.Queue()
        joke_frames = llm.run([LLMMessagesQueueFrame(messages)])
        llm_response_task = asyncio.create_task(
            elevenlabs_tts.run_to_queue(buffer_queue, joke_frames, True)
        )

        @transport.event_handler("on_first_other_participant_joined")
        async def on_first_other_participant_joined(transport):
            await azure_tts.say("My friend the LLM is now going to tell a joke about llamas.", transport.send_queue)

            async def buffer_to_send_queue():
                # Forward buffered frames to the transport, ending after the
                # end-of-stream frame has been forwarded.
                finished = False
                while not finished:
                    frame = await buffer_queue.get()
                    await transport.send_queue.put(frame)
                    buffer_queue.task_done()
                    finished = isinstance(frame, EndStreamQueueFrame)

            await asyncio.gather(llm_response_task, buffer_to_send_queue())
            await transport.stop_when_done()

        await transport.run()
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url))

View File

@@ -1,134 +0,0 @@
import asyncio
import aiohttp
import os
from dailyai.queue_frame import AudioQueueFrame, ImageQueueFrame
from dailyai.services.azure_ai_services import AzureLLMService, AzureImageGenServiceREST, AzureTTSService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.fal_ai_services import FalImageGenService
from dailyai.services.open_ai_services import OpenAIImageGenService
from examples.foundational.support.runner import configure
async def main(room_url):
    """Join ``room_url`` and narrate a calendar, one image plus description per month.

    For every month an LLM writes a one-sentence photo description, which is
    then turned into speech and an image concurrently. Once another
    participant joins, finished months are queued to the transport in
    completion order.
    """
    async with aiohttp.ClientSession() as session:
        meeting_duration_minutes = 5
        transport = DailyTransportService(
            room_url,
            None,
            "Month Narration Bot",
            duration_minutes=meeting_duration_minutes,
            mic_enabled=True,
            camera_enabled=True,
            mic_sample_rate=16000,
            camera_width=1024,
            camera_height=1024
        )

        llm = AzureLLMService(
            api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
            endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
            model=os.getenv("AZURE_CHATGPT_MODEL"))
        tts = ElevenLabsTTSService(
            aiohttp_session=session,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            voice_id="ErXwobaYiN019PkySvjV")
        # tts = AzureTTSService(api_key=os.getenv("AZURE_SPEECH_API_KEY"), region=os.getenv("AZURE_SPEECH_REGION"))

        dalle = FalImageGenService(
            image_size="1024x1024",
            aiohttp_session=session,
            key_id=os.getenv("FAL_KEY_ID"),
            key_secret=os.getenv("FAL_KEY_SECRET"))
        # dalle = OpenAIImageGenService(aiohttp_session=session, api_key=os.getenv("OPENAI_DALLE_API_KEY"), image_size="1024x1024")
        # dalle = AzureImageGenServiceREST(image_size="1024x1024", aiohttp_session=session, api_key=os.getenv("AZURE_DALLE_API_KEY"), endpoint=os.getenv("AZURE_DALLE_ENDPOINT"), model=os.getenv("AZURE_DALLE_MODEL"))

        # Get a complete audio chunk from the given text. Splitting this into its own
        # coroutine lets us ensure proper ordering of the audio chunks on the send queue.
        async def get_all_audio(text):
            all_audio = bytearray()
            async for audio in tts.run_tts(text):
                all_audio.extend(audio)

            return all_audio

        async def get_month_data(month):
            """Return a dict of month/text/image/audio, or None without a description."""
            messages = [
                {
                    "role": "system",
                    "content": f"Describe a nature photograph suitable for use in a calendar, for the month of {month}. Include only the image description with no preamble. Limit the description to one sentence, please.",
                }
            ]

            image_description = await llm.run_llm(messages)
            if not image_description:
                return

            to_speak = f"{month}: {image_description}"
            audio_task = asyncio.create_task(get_all_audio(to_speak))
            image_task = asyncio.create_task(dalle.run_image_gen(image_description))
            print(f"about to gather tasks for {month}")
            (audio, image_data) = await asyncio.gather(
                audio_task, image_task
            )
            print(f"about to return from get_month_data for {month}")
            return {
                "month": month,
                "text": image_description,
                "image_url": image_data[0],
                "image": image_data[1],
                "audio": audio,
            }

        # Only the first half of the year is enabled; July through December
        # are currently left out.
        months: list[str] = [
            "January",
            "February",
            "March",
            "April",
            "May",
            "June"
        ]

        @transport.event_handler("on_first_other_participant_joined")
        async def on_first_other_participant_joined(transport):
            # This will play the months in the order they're completed. The benefit
            # is we'll have as little delay as possible before the first month, and
            # likely no delay between months, but the months won't display in order.
            for month_data_task in asyncio.as_completed(month_tasks):
                print(f"month_data_task: {month_data_task}")
                try:
                    data = await month_data_task
                except Exception as exc:
                    # Skip the failed month. Falling through would either hit
                    # an unbound ``data`` or re-send the previous month.
                    print(f"OMG EXCEPTION!!!! {exc!r}")
                    continue

                if data:
                    await transport.send_queue.put(
                        [
                            ImageQueueFrame(data["image_url"], data["image"]),
                            AudioQueueFrame(data["audio"]),
                        ]
                    )

            # wait for the output queue to be empty, then leave the meeting
            await transport.stop_when_done()

        # Kick off every month now so work proceeds before anyone joins.
        month_tasks = [asyncio.create_task(get_month_data(month)) for month in months]

        await transport.run()
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url))

View File

@@ -1,134 +0,0 @@
import aiohttp
import argparse
import asyncio
import tkinter as tk
import os
from dailyai.queue_frame import AudioQueueFrame, ImageQueueFrame
from dailyai.services.azure_ai_services import AzureLLMService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.fal_ai_services import FalImageGenService
from dailyai.services.local_transport_service import LocalTransportService
async def main(room_url):
    """Narrate a twelve-month calendar in a tkinter window via the local transport.

    ``room_url`` is accepted but not used by this function.
    """
    async with aiohttp.ClientSession() as session:
        meeting_duration_minutes = 5

        tk_root = tk.Tk()
        tk_root.title("Calendar")

        transport = LocalTransportService(
            mic_enabled=True,
            camera_enabled=True,
            camera_width=1024,
            camera_height=1024,
            duration_minutes=meeting_duration_minutes,
            tk_root=tk_root,
        )

        llm = AzureLLMService(
            api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
            endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
            model=os.getenv("AZURE_CHATGPT_MODEL"),
        )
        tts = ElevenLabsTTSService(
            aiohttp_session=session,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            voice_id="ErXwobaYiN019PkySvjV",
        )
        dalle = FalImageGenService(
            image_size="1024x1024",
            aiohttp_session=session,
            key_id=os.getenv("FAL_KEY_ID"),
            key_secret=os.getenv("FAL_KEY_SECRET"),
        )

        # Collect the whole utterance into one buffer. Doing this in its own
        # coroutine keeps the audio chunks ordered on the send queue.
        async def get_all_audio(text):
            collected = bytearray()
            async for piece in tts.run_tts(text):
                collected.extend(piece)
            return collected

        async def get_month_data(month):
            prompt = {
                "role": "system",
                "content": f"Describe a nature photograph suitable for use in a calendar, for the month of {month}. Include only the image description with no preamble. Limit the description to one sentence, please.",
            }
            image_description = await llm.run_llm([prompt])
            if not image_description:
                return

            to_speak = f"{month}: {image_description}"
            audio_task = asyncio.create_task(get_all_audio(to_speak))
            image_task = asyncio.create_task(dalle.run_image_gen(image_description))
            audio, image_data = await asyncio.gather(audio_task, image_task)

            image_url, image_bytes = image_data[0], image_data[1]
            return {
                "month": month,
                "text": image_description,
                "image_url": image_url,
                "image": image_bytes,
                "audio": audio,
            }

        months: list[str] = [
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
            "July",
            "August",
            "September",
            "October",
            "November",
            "December",
        ]

        async def show_images():
            # Months are queued as they finish, not in calendar order: minimal
            # delay before the first one and likely none between them.
            for finished in asyncio.as_completed(month_tasks):
                data = await finished
                if not data:
                    continue
                frames = [
                    ImageQueueFrame(data["image_url"], data["image"]),
                    AudioQueueFrame(data["audio"]),
                ]
                await transport.send_queue.put(frames)

            await asyncio.sleep(25)

            # Leave once the output queue has drained.
            await transport.stop_when_done()

        async def run_tk():
            # Pump tkinter by hand until the transport signals a stop.
            while True:
                if transport._stop_threads.is_set():
                    break
                tk_root.update()
                tk_root.update_idletasks()
                await asyncio.sleep(0.1)

        month_tasks = [asyncio.create_task(get_month_data(name)) for name in months]

        await asyncio.gather(transport.run(), show_images(), run_tk())
if __name__ == "__main__":
    # Only -u/--url is required; any other arguments are ignored.
    cli = argparse.ArgumentParser(description="Simple Daily Bot Sample")
    cli.add_argument(
        "-u",
        "--url",
        type=str,
        required=True,
        help="URL of the Daily room to join",
    )
    known, _extra = cli.parse_known_args()

    asyncio.run(main(known.url))

View File

@@ -1,70 +0,0 @@
import asyncio
import os
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
from dailyai.queue_aggregators import LLMAssistantContextAggregator, LLMContextAggregator, LLMUserContextAggregator
from examples.foundational.support.runner import configure
from dailyai.services.ai_services import FrameLogger
async def main(room_url: str, token):
    """Run a voice bot that answers received frames through an LLM and TTS.

    Frames from the transport pass through a frame logger, the user context
    aggregator, the LLM and the assistant context aggregator; the TTS service
    writes the result to the transport's send queue.

    Args:
        room_url: First positional argument for ``DailyTransportService``.
        token: Second positional argument for ``DailyTransportService``.
    """
    context = [
        {
            "role": "system",
            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
        },
    ]

    transport = DailyTransportService(
        room_url,
        token,
        "Respond bot",
        duration_minutes=5,
        start_transcription=True,
        mic_enabled=True,
        mic_sample_rate=16000,
        camera_enabled=False,
        speaker_enabled=True,
        context=context,
    )

    llm = AzureLLMService(
        api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
        endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
        model=os.getenv("AZURE_CHATGPT_MODEL"),
    )
    tts = AzureTTSService(
        api_key=os.getenv("AZURE_SPEECH_API_KEY"),
        region=os.getenv("AZURE_SPEECH_REGION"),
    )
    fl = FrameLogger("transport")

    @transport.event_handler("on_first_other_participant_joined")
    async def on_first_other_participant_joined(transport):
        """Speak a greeting when the first other participant joins."""
        await tts.say("Hi, I'm listening!", transport.send_queue)

    async def handle_transcriptions():
        """Wire received frames -> logger -> user ctx -> LLM -> assistant ctx -> TTS."""
        bot_id = transport._my_participant_id
        tma_in = LLMUserContextAggregator(context, bot_id)
        tma_out = LLMAssistantContextAggregator(context, bot_id)

        # Each stage wraps the previous one; listed innermost first.
        logged = fl.run(transport.get_receive_frames())
        user_side = tma_in.run(logged)
        llm_side = llm.run(user_side)
        assistant_side = tma_out.run(llm_side)
        await tts.run_to_queue(transport.send_queue, assistant_side)

    extra = transport.transcription_settings["extra"]
    extra["punctuate"] = True
    extra["endpointing"] = True

    await asyncio.gather(transport.run(), handle_transcriptions())
if __name__ == "__main__":
    # configure() supplies the room URL and a meeting token for it.
    room, meeting_token = configure()
    asyncio.run(main(room, meeting_token))

View File

@@ -1,115 +0,0 @@
import argparse
import asyncio
import os
from typing import AsyncGenerator
import aiohttp
import requests
import time
import urllib.parse
from PIL import Image
from dailyai.queue_frame import ImageQueueFrame, QueueFrame
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
from dailyai.services.ai_services import AIService
from dailyai.queue_aggregators import LLMAssistantContextAggregator, LLMUserContextAggregator
from dailyai.services.fal_ai_services import FalImageGenService
from examples.foundational.support.runner import configure
class ImageSyncAggregator(AIService):
    """Brackets every frame with a "speaking" image before and a "waiting" image after."""

    def __init__(self, speaking_path: str, waiting_path: str):
        """Open both images and cache their raw bytes.

        Args:
            speaking_path: Path of the image emitted before each frame.
            waiting_path: Path of the image emitted after each frame.
        """
        speaking = Image.open(speaking_path)
        self._speaking_image = speaking
        self._speaking_image_bytes = speaking.tobytes()

        waiting = Image.open(waiting_path)
        self._waiting_image = waiting
        self._waiting_image_bytes = waiting.tobytes()

    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
        """Yield the speaking image, then *frame*, then the waiting image."""
        speaking_frame = ImageQueueFrame(None, self._speaking_image_bytes)
        yield speaking_frame
        yield frame
        waiting_frame = ImageQueueFrame(None, self._waiting_image_bytes)
        yield waiting_frame
async def main(room_url: str, token):
    """Run a voice bot whose camera shows a speaking/waiting image around each reply.

    Args:
        room_url: First positional argument for ``DailyTransportService``.
        token: Second positional argument for ``DailyTransportService``.
    """
    async with aiohttp.ClientSession() as http:
        transport = DailyTransportService(
            room_url,
            token,
            "Respond bot",
            5,
        )
        # Camera and microphone settings are written onto the transport's
        # private attributes after construction.
        transport._camera_enabled = True
        transport._camera_width = 1024
        transport._camera_height = 1024
        transport._mic_enabled = True
        transport._mic_sample_rate = 16000

        llm = AzureLLMService(
            api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
            endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
            model=os.getenv("AZURE_CHATGPT_MODEL"),
        )
        tts = AzureTTSService(
            api_key=os.getenv("AZURE_SPEECH_API_KEY"),
            region=os.getenv("AZURE_SPEECH_REGION"),
        )
        img = FalImageGenService(
            image_size="1024x1024",
            aiohttp_session=http,
            key_id=os.getenv("FAL_KEY_ID"),
            key_secret=os.getenv("FAL_KEY_SECRET"),
        )

        async def get_images():
            """Generate the speaking and waiting cat images concurrently.

            Not called anywhere in this function; the pipeline below loads
            its images from the ``assets`` directory instead.
            """
            speaking_job = asyncio.create_task(
                img.run_image_gen("An image of a cat speaking")
            )
            waiting_job = asyncio.create_task(
                img.run_image_gen("An image of a cat waiting")
            )
            speaking_data, waiting_data = await asyncio.gather(speaking_job, waiting_job)
            return speaking_data, waiting_data

        @transport.event_handler("on_first_other_participant_joined")
        async def on_first_other_participant_joined(transport):
            """Speak a greeting when the first other participant joins."""
            await tts.say("Hi, I'm listening!", transport.send_queue)

        async def handle_transcriptions():
            """Wire received frames -> user ctx -> LLM -> assistant ctx -> image sync -> TTS."""
            messages = [
                {"role": "system", "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way."},
            ]
            bot_id = transport._my_participant_id
            tma_in = LLMUserContextAggregator(messages, bot_id)
            tma_out = LLMAssistantContextAggregator(messages, bot_id)

            assets_dir = os.path.join(os.path.dirname(__file__), "assets")
            image_sync_aggregator = ImageSyncAggregator(
                os.path.join(assets_dir, "speaking.png"),
                os.path.join(assets_dir, "waiting.png"),
            )

            # Each stage wraps the previous one; listed innermost first.
            user_side = tma_in.run(transport.get_receive_frames())
            llm_side = llm.run(user_side)
            assistant_side = tma_out.run(llm_side)
            with_images = image_sync_aggregator.run(assistant_side)
            await tts.run_to_queue(transport.send_queue, with_images)

        transport.transcription_settings["extra"]["punctuate"] = True

        await asyncio.gather(transport.run(), handle_transcriptions())
if __name__ == "__main__":
    # configure() supplies the room URL and a meeting token for it.
    room, meeting_token = configure()
    asyncio.run(main(room, meeting_token))

View File

@@ -1,83 +0,0 @@
import asyncio
import aiohttp
import os
from dailyai.conversation_wrappers import InterruptibleConversationWrapper
from dailyai.queue_frame import StartStreamQueueFrame, TextQueueFrame
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.open_ai_services import OpenAILLMService
from dailyai.services.deepgram_ai_services import DeepgramTTSService
from dailyai.services.ai_services import FrameLogger
from dailyai.services.groq_ai_services import GroqLLMService
from examples.foundational.support.runner import configure
async def main(room_url: str, token):
    """Run a voice bot driven by ``transport.run_conversation``.

    For each frame handed to ``run_response``, the frame is logged, sent to
    the LLM, and the LLM output is spoken onto the transport's send queue.

    Args:
        room_url: First positional argument for ``DailyTransportService``.
        token: Second positional argument for ``DailyTransportService``.
    """
    async with aiohttp.ClientSession() as http:
        context = [
            {
                "role": "system",
                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
            },
        ]

        transport = DailyTransportService(
            room_url,
            token,
            "Respond bot",
            duration_minutes=5,
            start_transcription=True,
            mic_enabled=True,
            mic_sample_rate=16000,
            camera_enabled=False,
            # TODO-CB: Should this be VAD enabled or something?
            speaker_enabled=True,
            context=context,
        )

        # Alternative LLM backends, left disabled:
        # llm = AzureLLMService(
        #     api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
        #     endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
        #     model=os.getenv("AZURE_CHATGPT_MODEL"),
        #     context=context)
        # llm = GroqLLMService(api_key=os.getenv("GROQ_API_KEY"), context=context)
        llm = OpenAILLMService(
            context=context,
            api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
        )

        # Alternative TTS backends, left disabled:
        # tts = AzureTTSService(
        #     api_key=os.getenv("AZURE_SPEECH_API_KEY"),
        #     region=os.getenv("AZURE_SPEECH_REGION"))
        # tts = DeepgramTTSService(aiohttp_session=session, api_key=os.getenv("DEEPGRAM_API_KEY"), voice=os.getenv("DEEPGRAM_VOICE"))
        tts = ElevenLabsTTSService(
            aiohttp_session=http,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
        )

        fl = FrameLogger("just outside the innermost layer")

        async def run_response(in_frame):
            """Log *in_frame*, run it through the LLM, and speak the result."""
            # The context aggregators (tma_in / tma_out) are deliberately not
            # part of this chain.
            incoming = [StartStreamQueueFrame(), in_frame]
            llm_side = llm.run(fl.run(incoming))
            await tts.run_to_queue(transport.send_queue, llm_side)

        @transport.event_handler("on_first_other_participant_joined")
        async def on_first_other_participant_joined(transport):
            """Speak a greeting when the first other participant joins."""
            await tts.say("Hi, I'm listening!", transport.send_queue)

        extra = transport.transcription_settings["extra"]
        extra["endpointing"] = True
        extra["punctuate"] = True

        await asyncio.gather(
            transport.run(),
            transport.run_conversation(run_response),
        )
if __name__ == "__main__":
    # configure() supplies the room URL and a meeting token for it.
    room, meeting_token = configure()
    asyncio.run(main(room, meeting_token))

View File

@@ -1,71 +0,0 @@
import asyncio
import aiohttp
import os
from dailyai.conversation_wrappers import InterruptibleConversationWrapper
from dailyai.queue_frame import StartStreamQueueFrame, TextQueueFrame
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from examples.foundational.support.runner import configure
async def main(room_url: str, token):
    """Run a voice bot through ``InterruptibleConversationWrapper``.

    The wrapper is given the transport's frame generator, its ``interrupt``
    callable, and ``run_response`` as the runner that turns user speech into
    spoken LLM output.

    Args:
        room_url: First positional argument for ``DailyTransportService``.
        token: Second positional argument for ``DailyTransportService``.
    """
    async with aiohttp.ClientSession() as session:
        transport = DailyTransportService(
            room_url,
            token,
            "Respond bot",
            duration_minutes=5,
            start_transcription=True,
            mic_enabled=True,
            mic_sample_rate=16000,
            camera_enabled=False,
        )

        llm = AzureLLMService(
            api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
            endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
            model=os.getenv("AZURE_CHATGPT_MODEL"),
        )
        tts = AzureTTSService(
            api_key=os.getenv("AZURE_SPEECH_API_KEY"),
            region=os.getenv("AZURE_SPEECH_REGION"),
        )

        async def run_response(user_speech, tma_in, tma_out):
            """Speak the LLM's answer to *user_speech* using the given aggregators."""
            opening = [StartStreamQueueFrame(), TextQueueFrame(user_speech)]
            user_side = tma_in.run(opening)
            assistant_side = tma_out.run(llm.run(user_side))
            await tts.run_to_queue(transport.send_queue, assistant_side)

        @transport.event_handler("on_first_other_participant_joined")
        async def on_first_other_participant_joined(transport):
            """Speak a greeting when the first other participant joins."""
            await tts.say("Hi, I'm listening!", transport.send_queue)

        async def run_conversation():
            """Create the conversation wrapper and run it."""
            messages = [
                {"role": "system", "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way."},
            ]
            wrapper = InterruptibleConversationWrapper(
                frame_generator=transport.get_receive_frames,
                runner=run_response,
                interrupt=transport.interrupt,
                my_participant_id=transport._my_participant_id,
                llm_messages=messages,
            )
            await wrapper.run_conversation()

        # This sample explicitly turns punctuation off.
        transport.transcription_settings["extra"]["punctuate"] = False

        await asyncio.gather(transport.run(), run_conversation())
if __name__ == "__main__":
    # configure() supplies the room URL and a meeting token for it.
    room, meeting_token = configure()
    asyncio.run(main(room, meeting_token))

View File

@@ -1,115 +0,0 @@
import aiohttp
import asyncio
import os
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.fal_ai_services import FalImageGenService
from dailyai.queue_frame import AudioQueueFrame, ImageQueueFrame
from examples.foundational.support.runner import configure
async def main(room_url: str):
    """Stage a two-bot debate: each turn shows an image and plays the bot's speech.

    Both bots share one LLM service but keep separate message histories; each
    reply is appended to its own history as "assistant" and to the other
    bot's history as "user".

    Args:
        room_url: First positional argument for ``DailyTransportService``
            (the token argument is passed as None).
    """
    async with aiohttp.ClientSession() as http:
        transport = DailyTransportService(
            room_url,
            None,
            "Respond bot",
            duration_minutes=10,
            mic_enabled=True,
            mic_sample_rate=16000,
            camera_enabled=True,
            camera_width=1024,
            camera_height=1024,
        )

        llm = AzureLLMService(
            api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
            endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
            model=os.getenv("AZURE_CHATGPT_MODEL"),
        )
        tts1 = AzureTTSService(
            api_key=os.getenv("AZURE_SPEECH_API_KEY"),
            region=os.getenv("AZURE_SPEECH_REGION"),
        )
        tts2 = ElevenLabsTTSService(
            aiohttp_session=http,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            voice_id="jBpfuIE2acCO8z3wKNLl",
        )
        dalle = FalImageGenService(
            image_size="1024x1024",
            aiohttp_session=http,
            key_id=os.getenv("FAL_KEY_ID"),
            key_secret=os.getenv("FAL_KEY_SECRET"),
        )

        bot1_messages = [
            {"role": "system", "content": "You are a stern librarian. You strongly believe that a hot dog is a sandwich. Start by stating this fact in a few sentences, then be prepared to debate this with the user. You shouldn't ever compromise on the fundamental truth that a hot dog is a sandwich. Your responses should only be a few sentences long."},
        ]
        bot2_messages = [
            {
                "role": "system",
                "content": "You are a silly cat, and you strongly believe that a hot dog is not a sandwich. Debate this with the user, only responding with a few sentences. Don't ever accept that a hot dog is a sandwich."},
        ]

        bot1_description = "A woman conservatively dressed as a librarian in a library surrounded by books, cartoon, serious, highly detailed"
        bot2_description = "A cat dressed in a hot dog costume, cartoon, bright colors, funny, highly detailed"

        async def next_statement(label, voice, own_history, other_history):
            """Get one reply from the LLM, record it in both histories, and voice it.

            Returns the complete audio as a bytearray, or None when the LLM
            produced no reply.
            """
            # The LLM is awaited one reply at a time so the exchange stays
            # strictly back-and-forth.
            reply = await llm.run_llm(own_history)
            print(f"{label}: {reply}")
            if not reply:
                return None

            own_history.append({"role": "assistant", "content": reply})
            other_history.append({"role": "user", "content": reply})

            spoken = bytearray()
            async for piece in voice.run_tts(reply):
                spoken.extend(piece)
            return spoken

        async def get_bot1_statement():
            """Produce the librarian's next statement as audio."""
            return await next_statement("bot1_msg", tts1, bot1_messages, bot2_messages)

        async def get_bot2_statement():
            """Produce the cat's next statement as audio."""
            return await next_statement("bot2_msg", tts2, bot2_messages, bot1_messages)

        async def take_turn(statement, portrait_prompt):
            """Await a statement and its portrait together, then queue both frames."""
            audio, image_data = await asyncio.gather(
                statement, dalle.run_image_gen(portrait_prompt)
            )
            await transport.send_queue.put(
                [
                    ImageQueueFrame(None, image_data[1]),
                    AudioQueueFrame(audio),
                ]
            )

        async def argue():
            """Alternate turns between the two bots for 100 rounds."""
            for round_no in range(100):
                print(f"In iteration {round_no}")
                await take_turn(get_bot1_statement(), bot1_description)
                await take_turn(get_bot2_statement(), bot2_description)

        await asyncio.gather(transport.run(), argue())
if __name__ == "__main__":
    # Only the room URL is forwarded; the token from configure() is not used.
    room, _meeting_token = configure()
    asyncio.run(main(room))

View File

@@ -1,179 +0,0 @@
import aiohttp
import asyncio
import os
import random
from typing import AsyncGenerator
from PIL import Image
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.azure_ai_services import AzureLLMService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.queue_aggregators import LLMUserContextAggregator, LLMAssistantContextAggregator
from dailyai.queue_frame import (
QueueFrame,
TextQueueFrame,
ImageQueueFrame,
SpriteQueueFrame,
TranscriptionQueueFrame,
)
from dailyai.services.ai_services import AIService
from examples.foundational.support.runner import configure
# Raw image bytes for every sprite, keyed by file name INCLUDING the ".png"
# extension (the lookups below rely on that).
sprites = {}
image_files = [
    'sc-default.png',
    'sc-talk.png',
    'sc-listen-1.png',
    'sc-think-1.png',
    'sc-think-2.png',
    'sc-think-3.png',
    'sc-think-4.png',
]
script_dir = os.path.dirname(__file__)
for file in image_files:
    # Sprites live in the "assets" directory next to this script.
    full_path = os.path.join(script_dir, "assets", file)
    # Read the bytes inside the context manager so the file is closed afterwards.
    with Image.open(full_path) as img:
        sprites[file] = img.tobytes()

# When the bot isn't talking, show a static image of the cat listening.
quiet_frame = ImageQueueFrame("", sprites["sc-listen-1.png"])

# When the bot is talking, build a 30-image animation by picking randomly
# between the two mouth sprites.
talking_list = [sprites['sc-default.png'], sprites['sc-talk.png']]
talking = [random.choice(talking_list) for _ in range(30)]
talking_frame = SpriteQueueFrame(images=talking)

# TODO: Support "thinking" as soon as we get a valid transcript, while LLM is processing
thinking_list = [
    sprites['sc-think-1.png'],
    sprites['sc-think-2.png'],
    sprites['sc-think-3.png'],
    sprites['sc-think-4.png'],
]
thinking_frame = SpriteQueueFrame(images=thinking_list)
class TranscriptFilter(AIService):
    """Passes on only transcription frames that did not come from the bot itself."""

    def __init__(self, bot_participant_id=None):
        """Remember the participant id whose transcriptions are dropped."""
        self.bot_participant_id = bot_participant_id

    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
        """Yield *frame* only if it is a transcription from another participant.

        Frames of any other type are not forwarded.
        """
        from_other_speaker = (
            isinstance(frame, TranscriptionQueueFrame)
            and frame.participantId != self.bot_participant_id
        )
        if from_other_speaker:
            yield frame
class NameCheckFilter(AIService):
    """Buffers text until a sentence ends, then forwards it only if it mentions a name."""

    def __init__(self, names: list[str]):
        """Store the trigger names and start with an empty sentence buffer."""
        self.names = names
        self.sentence = ""

    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
        """Accumulate text frames and emit finished sentences that contain a name.

        A sentence counts as finished once the buffer ends with ".", "?" or
        "!". The buffer is cleared at that point whether or not a name
        matched. Name matching is a case-sensitive substring test.
        """
        # TODO: split up transcription by participant
        if isinstance(frame, TextQueueFrame):
            self.sentence += frame.text

        if not self.sentence.endswith((".", "?", "!")):
            return

        # Take the finished sentence and reset the buffer in one step.
        finished, self.sentence = self.sentence, ""
        if any(name in finished for name in self.names):
            yield TextQueueFrame(finished)
class ImageSyncAggregator(AIService):
    """Brackets every frame with the talking animation before and the quiet image after."""

    def __init__(self):
        """Intentionally empty: no per-instance state is set up."""

    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
        """Yield the module-level ``talking_frame``, then *frame*, then ``quiet_frame``."""
        for outgoing in (talking_frame, frame, quiet_frame):
            yield outgoing
async def main(room_url: str, token):
    """Run the "Santa Cat" bot: it answers only sentences that mention its name.

    Received frames are filtered to other participants' transcriptions, then
    to sentences containing "Santa Cat" or "Santa", and only those reach the
    LLM. Replies are wrapped with sprite frames and spoken via TTS.

    Args:
        room_url: First positional argument for ``DailyTransportService``.
        token: Second positional argument for ``DailyTransportService``.
    """
    async with aiohttp.ClientSession() as http:
        transport = DailyTransportService(
            room_url,
            token,
            "Santa Cat",
            duration_minutes=3,
            start_transcription=True,
            mic_enabled=True,
            mic_sample_rate=16000,
            camera_enabled=True,
            camera_width=720,
            camera_height=1280,
        )
        # The same settings are also written onto the private attributes.
        transport._mic_enabled = True
        transport._mic_sample_rate = 16000
        transport._camera_enabled = True
        transport._camera_width = 720
        transport._camera_height = 1280

        llm = AzureLLMService(
            api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
            endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
            model=os.getenv("AZURE_CHATGPT_MODEL"),
        )
        tts = ElevenLabsTTSService(
            aiohttp_session=http,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            voice_id="jBpfuIE2acCO8z3wKNLl",
        )
        isa = ImageSyncAggregator()

        @transport.event_handler("on_first_other_participant_joined")
        async def on_first_other_participant_joined(transport):
            """Explain the wake phrase when the first other participant joins."""
            await tts.say("Hi! If you want to talk to me, just say 'hey Santa Cat'.", transport.send_queue)

        async def handle_transcriptions():
            """Wire frames -> filters -> user ctx -> LLM -> assistant ctx -> sprites -> TTS."""
            messages = [
                {"role": "system", "content": "You are Santa Cat, a cat that lives in Santa's workshop at the North Pole. You should be clever, and a bit sarcastic. You should also tell jokes every once in a while. Your responses should only be a few sentences long."},
            ]
            bot_id = transport._my_participant_id
            tma_in = LLMUserContextAggregator(messages, bot_id)
            tma_out = LLMAssistantContextAggregator(messages, bot_id)
            tf = TranscriptFilter(bot_id)
            ncf = NameCheckFilter(["Santa Cat", "Santa"])

            # Each stage wraps the previous one; listed innermost first.
            others_only = tf.run(transport.get_receive_frames())
            addressed = ncf.run(others_only)
            user_side = tma_in.run(addressed)
            llm_side = llm.run(user_side)
            assistant_side = tma_out.run(llm_side)
            animated = isa.run(assistant_side)
            await tts.run_to_queue(transport.send_queue, animated)

        async def starting_image():
            """Queue the listening image so the camera has something to show at start."""
            await transport.send_queue.put(quiet_frame)

        transport.transcription_settings["extra"]["punctuate"] = True

        await asyncio.gather(
            transport.run(),
            handle_transcriptions(),
            starting_image(),
        )
if __name__ == "__main__":
    # configure() supplies the room URL and a meeting token for it.
    room, meeting_token = configure()
    asyncio.run(main(room, meeting_token))

View File

@@ -1,131 +0,0 @@
import aiohttp
import asyncio
import logging
import os
import wave
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.queue_aggregators import LLMContextAggregator, LLMUserContextAggregator, LLMAssistantContextAggregator
from dailyai.services.ai_services import AIService, FrameLogger
from dailyai.queue_frame import QueueFrame, AudioQueueFrame, LLMResponseEndQueueFrame, LLMMessagesQueueFrame
from typing import AsyncGenerator
from examples.foundational.support.runner import configure
# Log line layout: numeric level, timestamp, message.
logging.basicConfig(format="%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)

# Raw audio frames for every sound effect, keyed by file name INCLUDING the
# ".wav" extension (lookups elsewhere in this file rely on that).
sounds = {}
sound_files = [
    'ding1.wav',
    'ding2.wav',
]
script_dir = os.path.dirname(__file__)
for file in sound_files:
    # Sound effects live in the "assets" directory next to this script.
    full_path = os.path.join(script_dir, "assets", file)
    # Read every frame of the WAV file; the context manager closes it.
    with wave.open(full_path) as audio_file:
        sounds[file] = audio_file.readframes(-1)
class OutboundSoundEffectWrapper(AIService):
    """Inserts the "ding1.wav" sound ahead of every LLMResponseEndQueueFrame."""

    def __init__(self):
        """Intentionally empty: no per-instance state is set up."""

    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
        """Yield *frame*, preceded by the ding when it is an LLMResponseEndQueueFrame."""
        if isinstance(frame, LLMResponseEndQueueFrame):
            yield AudioQueueFrame(sounds["ding1.wav"])
        # The incoming frame is always forwarded, in case anything else up
        # the stack needs it.
        yield frame
class InboundSoundEffectWrapper(AIService):
    """Inserts the "ding2.wav" sound ahead of every LLMMessagesQueueFrame."""

    def __init__(self):
        """Intentionally empty: no per-instance state is set up."""

    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
        """Yield *frame*, preceded by the ding when it is an LLMMessagesQueueFrame."""
        if isinstance(frame, LLMMessagesQueueFrame):
            yield AudioQueueFrame(sounds["ding2.wav"])
        # The incoming frame is always forwarded, in case anything else up
        # the stack needs it.
        yield frame
async def main(room_url: str, token):
    """Run a voice bot that plays a ding before the LLM runs and after it answers.

    Args:
        room_url: First positional argument for ``DailyTransportService``.
        token: Second positional argument for ``DailyTransportService``.
    """
    async with aiohttp.ClientSession() as http:
        transport = DailyTransportService(
            room_url,
            token,
            "Respond bot",
            duration_minutes=5,
            mic_enabled=True,
            mic_sample_rate=16000,
            camera_enabled=False,
        )

        llm = AzureLLMService(
            api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
            endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
            model=os.getenv("AZURE_CHATGPT_MODEL"),
        )
        tts = ElevenLabsTTSService(
            aiohttp_session=http,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            voice_id="ErXwobaYiN019PkySvjV",
        )

        @transport.event_handler("on_first_other_participant_joined")
        async def on_first_other_participant_joined(transport):
            """Greet the first other participant, then queue the first ding."""
            await tts.say("Hi, I'm listening!", transport.send_queue)
            ding = AudioQueueFrame(sounds["ding1.wav"])
            await transport.send_queue.put(ding)

        async def handle_transcriptions():
            """Wire frames -> user ctx -> ding -> log -> LLM -> assistant ctx -> log -> TTS -> ding."""
            messages = [
                {"role": "system", "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way."},
            ]
            bot_id = transport._my_participant_id
            tma_in = LLMUserContextAggregator(messages, bot_id)
            tma_out = LLMAssistantContextAggregator(messages, bot_id)
            out_sound = OutboundSoundEffectWrapper()
            in_sound = InboundSoundEffectWrapper()
            fl = FrameLogger("LLM Out")
            fl2 = FrameLogger("Transcription In")

            # Each stage wraps the previous one; listed innermost first.
            user_side = tma_in.run(transport.get_receive_frames())
            with_in_ding = in_sound.run(user_side)
            logged_in = fl2.run(with_in_ding)
            llm_side = llm.run(logged_in)
            assistant_side = tma_out.run(llm_side)
            logged_out = fl.run(assistant_side)
            spoken = tts.run(logged_out)
            await out_sound.run_to_queue(transport.send_queue, spoken)

        transport.transcription_settings["extra"]["punctuate"] = True

        await asyncio.gather(transport.run(), handle_transcriptions())
if __name__ == "__main__":
    # configure() supplies the room URL and a meeting token for it.
    room, meeting_token = configure()
    asyncio.run(main(room, meeting_token))

View File

@@ -1,59 +0,0 @@
import argparse
import asyncio
import wave
from dailyai.queue_frame import EndStreamQueueFrame, TranscriptionQueueFrame
from dailyai.services.local_transport_service import LocalTransportService
from dailyai.services.whisper_ai_services import WhisperSTTService
async def main(room_url: str):
    """Transcribe frames from a local transport with Whisper and print the text.

    Three coroutines run together: the transport itself, a producer that
    feeds the STT output into a queue, and a consumer that prints each
    transcription until an end-of-stream frame arrives.

    Args:
        room_url: Accepted for command-line compatibility; this function does
            not read it.
    """
    # Both services are stored as module-level globals.
    global transport
    global stt

    duration = 1
    transport = LocalTransportService(
        mic_enabled=True,
        camera_enabled=False,
        speaker_enabled=True,
        duration_minutes=duration,
        start_transcription=True,
    )
    stt = WhisperSTTService()

    transcription_output_queue = asyncio.Queue()
    transport_done = asyncio.Event()

    async def handle_transcription():
        """Print queued transcriptions until the end-of-stream frame or transport stop."""
        print("`````````TRANSCRIPTION`````````")
        while not transport_done.is_set():
            queued = await transcription_output_queue.get()
            print("got item from queue", queued)
            if isinstance(queued, TranscriptionQueueFrame):
                print(queued.text)
                continue
            if isinstance(queued, EndStreamQueueFrame):
                break
        print("handle_transcription done")

    async def handle_speaker():
        """Feed STT results into the queue, then mark the end of the stream."""
        received = transport.get_receive_frames()
        await stt.run_to_queue(transcription_output_queue, received)
        await transcription_output_queue.put(EndStreamQueueFrame())
        print("handle speaker done.")

    async def run_until_done():
        """Run the transport and set the done event once it returns."""
        await transport.run()
        transport_done.set()
        print("run_until_done done")

    await asyncio.gather(
        run_until_done(),
        handle_speaker(),
        handle_transcription(),
    )
if __name__ == "__main__":
    # Only -u/--url is required; any other arguments are ignored.
    cli = argparse.ArgumentParser(description="Simple Daily Bot Sample")
    cli.add_argument(
        "-u",
        "--url",
        type=str,
        required=True,
        help="URL of the Daily room to join",
    )
    known, _extra = cli.parse_known_args()

    asyncio.run(main(known.url))

Binary file not shown.

Before

Width:  |  Height:  |  Size: 868 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 871 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 872 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 868 KiB

View File

@@ -1,53 +0,0 @@
import argparse
import os
import time
import urllib
import requests
from dotenv import load_dotenv
load_dotenv()
def configure():
    """Resolve the Daily room URL and create an owner meeting token for it.

    The room URL and API key come from the command line (``-u/--url`` and
    ``-k/--apikey``) and fall back to the ``DAILY_SAMPLE_ROOM_URL`` and
    ``DAILY_API_KEY`` environment variables. Unrecognized command-line
    arguments are ignored.

    Returns:
        A ``(url, token)`` tuple: the room URL and the meeting token string
        returned by the Daily API.

    Raises:
        Exception: If no room URL or API key is available, or if the Daily
            API answers with a status code other than 200.
        requests.exceptions.RequestException: If the HTTP request itself
            fails, including when it times out.
    """
    # ``import urllib`` alone does not guarantee that the ``parse`` submodule
    # is loaded, so import it explicitly where it is used.
    import urllib.parse

    parser = argparse.ArgumentParser(description="Daily AI SDK Bot Sample")
    parser.add_argument(
        "-u", "--url", type=str, required=False, help="URL of the Daily room to join"
    )
    parser.add_argument(
        "-k",
        "--apikey",
        type=str,
        required=False,
        help="Daily API Key (needed to create an owner token for the room)",
    )

    args, _unknown = parser.parse_known_args()

    url = args.url or os.getenv("DAILY_SAMPLE_ROOM_URL")
    key = args.apikey or os.getenv("DAILY_API_KEY")

    if not url:
        raise Exception(
            "No Daily room specified. use the -u/--url option from the command line, or set DAILY_SAMPLE_ROOM_URL in your environment to specify a Daily room URL.")

    if not key:
        raise Exception("No Daily API key specified. use the -k/--apikey option from the command line, or set DAILY_API_KEY in your environment to specify a Daily API key, available from https://dashboard.daily.co/developers.")

    # Create a meeting token for the given room with an expiration 1 hour in the future.
    # The room name is the URL path without its leading slash.
    room_name: str = urllib.parse.urlparse(url).path[1:]
    expiration: float = time.time() + 60 * 60

    res: requests.Response = requests.post(
        "https://api.daily.co/v1/meeting-tokens",
        headers={"Authorization": f"Bearer {key}"},
        json={
            "properties": {"room_name": room_name, "is_owner": True, "exp": expiration}
        },
        # Without a timeout an unresponsive API would hang start-up forever.
        timeout=30,
    )

    if res.status_code != 200:
        raise Exception(f"Failed to create meeting token: {res.status_code} {res.text}")

    token: str = res.json()["token"]

    return (url, token)

View File

@@ -1,134 +0,0 @@
import aiohttp
import asyncio
import os
import wave
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
from dailyai.queue_aggregators import LLMContextAggregator
from dailyai.services.ai_services import AIService, FrameLogger
from dailyai.queue_frame import QueueFrame, AudioQueueFrame, LLMResponseEndQueueFrame, LLMMessagesQueueFrame
from typing import AsyncGenerator
from examples.foundational.support.runner import configure
# Raw audio frames for every sound effect, keyed by file name INCLUDING the
# ".wav" extension (lookups elsewhere in this file rely on that).
sounds = {}
sound_files = [
    'ding1.wav',
    'ding2.wav',
]
script_dir = os.path.dirname(__file__)
for file in sound_files:
    # Sound effects live in the "assets" directory next to this script.
    full_path = os.path.join(script_dir, "assets", file)
    # Read every frame of the WAV file; the context manager closes it.
    with wave.open(full_path) as audio_file:
        sounds[file] = audio_file.readframes(-1)
class OutboundSoundEffectWrapper(AIService):
    """Inserts the "ding1.wav" sound ahead of every LLMResponseEndQueueFrame."""

    def __init__(self):
        """Intentionally empty: no per-instance state is set up."""

    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
        """Yield *frame*, preceded by the ding when it is an LLMResponseEndQueueFrame."""
        if isinstance(frame, LLMResponseEndQueueFrame):
            yield AudioQueueFrame(sounds["ding1.wav"])
        # The incoming frame is always forwarded, in case anything else up
        # the stack needs it.
        yield frame
class InboundSoundEffectWrapper(AIService):
    """Inserts the "ding2.wav" sound ahead of every LLMMessagesQueueFrame."""

    def __init__(self):
        """Intentionally empty: no per-instance state is set up."""

    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
        """Yield *frame*, preceded by the ding when it is an LLMMessagesQueueFrame."""
        if isinstance(frame, LLMMessagesQueueFrame):
            yield AudioQueueFrame(sounds["ding2.wav"])
        # The incoming frame is always forwarded, in case anything else up
        # the stack needs it.
        yield frame
async def main(room_url: str, token, phone=None):
    """Run the sound-effects voice bot, optionally dialing out to a phone number.

    Args:
        room_url: First positional argument for ``DailyTransportService``.
        token: Second positional argument for ``DailyTransportService``.
        phone: Optional value passed to ``transport.dialout``. When truthy,
            recording is started and the dial-out is placed once the call
            state becomes "joined". Defaults to None (no dial-out) so the
            script entry point, which passes only the URL and token, no
            longer fails with a TypeError.
    """
    async with aiohttp.ClientSession() as session:
        # The services are stored as module-level globals.
        global transport
        global llm
        global tts

        transport = DailyTransportService(
            room_url,
            token,
            "Respond bot",
            300,
        )
        transport._mic_enabled = True
        transport._mic_sample_rate = 16000
        transport._camera_enabled = False

        llm = AzureLLMService()
        tts = AzureTTSService()

        @transport.event_handler("on_first_other_participant_joined")
        async def on_first_other_participant_joined(transport):
            """Greet the first other participant, then queue the first ding."""
            await tts.say("Hi, I'm listening!", transport.send_queue)
            await transport.send_queue.put(AudioQueueFrame(sounds["ding1.wav"]))

        async def handle_transcriptions():
            """Wire frames -> user ctx -> ding -> log -> LLM -> assistant ctx -> TTS -> ding."""
            messages = [
                {"role": "system", "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way."},
            ]

            tma_in = LLMContextAggregator(
                messages, "user", transport._my_participant_id
            )
            tma_out = LLMContextAggregator(
                messages, "assistant", transport._my_participant_id
            )
            out_sound = OutboundSoundEffectWrapper()
            in_sound = InboundSoundEffectWrapper()
            # Only the inbound side is logged in this pipeline.
            fl2 = FrameLogger("Transcription In")

            await out_sound.run_to_queue(
                transport.send_queue,
                tts.run(
                    tma_out.run(
                        llm.run(
                            fl2.run(
                                in_sound.run(
                                    tma_in.run(
                                        transport.get_receive_frames()
                                    )
                                )
                            )
                        )
                    )
                )
            )

        @transport.event_handler("on_participant_joined")
        async def pax_joined(transport, pax):
            """Print every participant that joins."""
            print(f"PARTICIPANT JOINED: {pax}")

        @transport.event_handler("on_call_state_updated")
        async def on_call_state_updated(transport, state):
            """Start recording and dial out once joined, if a phone value was given."""
            if state == "joined" and phone:
                transport.start_recording()
                transport.dialout(phone)

        transport.transcription_settings["extra"]["punctuate"] = True

        await asyncio.gather(transport.run(), handle_transcriptions())
if __name__ == "__main__":
    # configure() supplies the room URL and a meeting token for it.
    room, meeting_token = configure()
    asyncio.run(main(room, meeting_token))

View File

@@ -1,13 +0,0 @@
# Server Example
This is an example server based on [Santa Cat](https://santacat.ai). You can run the server with this command:
```
flask --app daily-bot-manager.py --debug run
```
Once the server is started, you can load `http://127.0.0.1:5000/spin-up-kitty` in a browser, and the server will do the following:
- Create a new, randomly-named Daily room with `DAILY_API_KEY` from your .env file or environment
- Start the `10-wake-word.py` example and connect it to that room
- 301 redirect your browser to the room

View File

@@ -1,160 +0,0 @@
from datetime import datetime
import asyncio
import aiohttp
import os
import sys
from dailyai.conversation_wrappers import InterruptibleConversationWrapper
from dailyai.queue_frame import StartStreamQueueFrame, TranscriptionQueueFrame, TextQueueFrame, UserStartedSpeakingFrame, UserStoppedSpeakingFrame
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.fireworks_ai_services import FireworksLLMService
from dailyai.services.deepgram_ai_services import DeepgramTTSService
from dailyai.services.ai_services import FrameLogger
from dailyai.services.fal_ai_services import FalImageGenService
from examples.foundational.support.runner import configure
# Every command-line argument after the script name, joined with single spaces.
# main() references this as the "text" of a hand-built transcription message.
command_line_prompt = ' '.join(sys.argv[1:])
# System message placed first in the LLM context built in main(). It tells the
# model about the `change_appearance` function and how to write its prompt.
system_prompt = """
You are a friendly robot character with a cartoon body with head, torso, arms, feet,
and legs.
You can change your appearance using the `change_appearance` function call.
You can add or remove items from your body, change
your color, and more. You can use function calling to change your appearance.
When changing your appearance, please create a prompt as an argument to the function.
The prompt will help the image generation model
create a new appearance for you. Include as much detail as possible. Include the
keywords "robot", "friendly", "cartoon", "smiling", "happy", "animated".
The initial image prompt you are adding to or changing is
"A friendly cartoon robot, smiling and happy, animated."
Do not include the image model prompt in your response. The prompt must be passed to the function
as a parameter.
"""
# JSON-schema style description of the single callable function: one string
# parameter, "appearance", holding an image-generation prompt.
change_appearance_function = {
"name": "change_appearance",
"description": "Call this function when the users want you to change your appearance.",
"parameters": {
"type": "object",
"properties": {
"appearance": {
"type": "string",
"description": "The new appearance for the robot, in the form of a prompt for an generative AI diffusion model."
}
}
}
}
# Tool list handed to the LLM service via its `tools=` argument in main().
tools = [
{
"type": "function",
"function": change_appearance_function
}
]
async def main(room_url: str, token):
    """Run the function-calling robot demo in a Daily room.

    Creates the transport, image-generation, LLM and TTS services (the image
    and TTS services share one aiohttp session), registers a handler for the
    first remote participant, and then runs the transport and the
    conversation loop concurrently until both finish.

    Args:
        room_url: URL of the Daily room to join.
        token: Meeting token passed straight to DailyTransportService.
    """
    async with aiohttp.ClientSession() as session:
        # The same list object is given to both the transport and the LLM
        # service below.
        context = [
            {
                "role": "system",
                "content": system_prompt,
            },
        ]
        transport = DailyTransportService(
            room_url,
            token,
            "Respond bot",
            duration_minutes=30,
            start_transcription=True,
            mic_enabled=True,
            mic_sample_rate=16000,
            camera_enabled=True,
            camera_width=1024,
            camera_height=1024,
            # TODO-CB: Should this be VAD enabled or something?
            speaker_enabled=True,
            context=context
        )
        imagegen = FalImageGenService(
            image_size="512x512",
            aiohttp_session=session,
            key_id=os.getenv("FAL_KEY_ID"),
            key_secret=os.getenv("FAL_KEY_SECRET"))

        async def change_appearance(appearance):
            """Send *appearance* through the image generator to the send queue."""
            # NOTE(review): the task is awaited immediately, so wrapping the
            # coroutine in create_task() looks redundant - confirm before
            # simplifying.
            await asyncio.create_task(
                imagegen.run_to_queue(
                    transport.send_queue, [
                        TextQueueFrame(appearance)]))

        llm = FireworksLLMService(
            context=context,
            api_key=os.getenv("FIREWORKS_API_KEY"),
            model="accounts/fireworks/models/firefunction-v1",
            # TODO - how can we modify tools list on the fly?
            tools=tools,
            change_appearance=change_appearance,
            transport=transport
        )
        tts = DeepgramTTSService(aiohttp_session=session, api_key=os.getenv(
            "DEEPGRAM_API_KEY"), voice=os.getenv("DEEPGRAM_VOICE"))
        fl = FrameLogger("just outside the innermost layer")

        async def run_response(in_frame):
            """Pipe one input frame through logger -> LLM -> TTS -> send queue."""
            await tts.run_to_queue(
                transport.send_queue,
                # tma_out.run(
                llm.run(
                    # tma_in.run(
                    fl.run(
                        [StartStreamQueueFrame(), in_frame]
                    )
                    # )
                )
                # ),
            )

        @transport.event_handler("on_first_other_participant_joined")
        async def on_first_other_participant_joined(transport):
            """Show the initial robot image when the first remote user joins."""
            await change_appearance("A friendly cartoon robot, smiling and happy, animated.")
            # NOTE(review): this unconditional return makes everything below
            # in this handler unreachable (spoken greeting and the simulated
            # transcription built from command_line_prompt). It looks like a
            # parked test harness - confirm whether it should be re-enabled
            # or removed.
            return
            await tts.say("Hi, I'm listening!", transport.send_queue)
            await asyncio.sleep(1)
            await transport.receive_queue.put(UserStartedSpeakingFrame())
            await asyncio.sleep(0.1)
            transport.on_transcription_message({
                "text": command_line_prompt,
                "participantId": "cb65b845-aac0-4fc8-987d-2e7ce3c7d8f0",
                "timestamp": datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'
            })
            # putting the frame into the queue directly doesn't seem to work
            # await transport.receive_queue.put(
            # TranscriptionQueueFrame(
            # "tell me a joke.",
            # "cb65b845-aac0-4fc8-987d-2e7ce3c7d8f0",
            # datetime.utcnow().strftime(
            # '%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'
            # ))
            await asyncio.sleep(0.1)
            await transport.receive_queue.put(UserStoppedSpeakingFrame())

        # Extra transcription options set on the transport before it runs.
        transport.transcription_settings["extra"]["endpointing"] = True
        transport.transcription_settings["extra"]["punctuate"] = True
        await asyncio.gather(transport.run(), transport.run_conversation(run_response))
if __name__ == "__main__":
    # configure() returns a (room URL, token) pair; hand both to main().
    url, token = configure()
    asyncio.run(main(url, token))

View File

@@ -0,0 +1 @@
These samples need to be updated! Don't rely on them.

View File

@@ -0,0 +1,93 @@
import argparse
from email.mime import image
from re import A
import requests
import time
import urllib.parse
from dailyai.async_processor.async_processor import (
LLMResponse,
ConversationProcessorCollection,
)
from dailyai.orchestrator import OrchestratorConfig, Orchestrator
from dailyai.message_handler.message_handler import MessageHandler
from dailyai.services.ai_services import AIServiceConfig
from dailyai.services.azure_ai_services import AzureImageGenService, AzureTTSService, AzureLLMService
from dailyai.services.deepgram_ai_services import DeepgramTTSService
def add_bot_to_room(room_url, token, expiration) -> None:
    """Run the simple Azure-backed sample bot in a Daily room.

    Builds the message handler, the TTS/LLM services and the orchestrator,
    starts the orchestrator, and always releases every resource afterwards,
    including when start() raises or the run is interrupted.

    Args:
        room_url: URL of the Daily room to join.
        token: Meeting token for that room.
        expiration: Expiration value forwarded to OrchestratorConfig.
    """
    # A simple prompt for a simple sample.
    message_handler = MessageHandler(
        """
You are a sample bot in a WebRTC session. You'll receive input as transcriptions of user's
speech, and your responses will be converted to audio via a TTS service.
Answer user's questions and be friendly, and if you can, give some ideas about how someone
could use a bot like you in a more in-depth way. Because your responses will be spoken,
try to keep them short.
"""
    )
    # Use Azure services for the TTS, image generation, and LLM.
    # Note that you'll need to set the following environment variables:
    # - AZURE_SPEECH_SERVICE_KEY
    # - AZURE_SPEECH_SERVICE_REGION
    # - AZURE_CHATGPT_KEY
    # - AZURE_CHATGPT_ENDPOINT
    # - AZURE_CHATGPT_DEPLOYMENT_ID
    services = AIServiceConfig(
        tts=AzureTTSService(), image=None, llm=AzureLLMService()
    )
    orchestrator_config = OrchestratorConfig(
        room_url=room_url,
        token=token,
        bot_name="Simple Bot",
        expiration=expiration,
    )
    orchestrator = Orchestrator(
        orchestrator_config,
        services,
        message_handler,
    )
    try:
        orchestrator.start()
    finally:
        # When the orchestrator's done (or failed), shut it down along with
        # the various services and handlers we've created.
        orchestrator.stop()
        message_handler.shutdown()
        services.tts.close()
        services.llm.close()
if __name__ == "__main__":
    # Parse the room URL and Daily API key, mint a one-hour owner token for
    # the room, then start the bot.
    parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
    parser.add_argument("-u", "--url", type=str, required=True, help="URL of the Daily room")
    parser.add_argument(
        "-k", "--apikey", type=str, required=True, help="Daily API Key (needed to create token)"
    )
    args: argparse.Namespace = parser.parse_args()
    # Create a meeting token for the given room with an expiration 1 hour in the future.
    # The room name is the URL path without its leading slash.
    room_name: str = urllib.parse.urlparse(args.url).path[1:]
    expiration: float = time.time() + 60 * 60
    res: requests.Response = requests.post(
        "https://api.daily.co/v1/meeting-tokens",
        headers={"Authorization": f"Bearer {args.apikey}"},
        json={
            "properties": {"room_name": room_name, "is_owner": True, "exp": expiration}
        },
        # Without a timeout, requests waits forever on an unresponsive server.
        timeout=30,
    )
    if res.status_code != 200:
        raise Exception(f'Failed to create meeting token: {res.status_code} {res.text}')
    token: str = res.json()['token']
    add_bot_to_room(args.url, token, expiration)

View File

@@ -0,0 +1,174 @@
import argparse
from email.mime import image
import logging
import os
import random
import requests
import time
import urllib.parse
from PIL import Image
from dailyai.async_processor.async_processor import (
ConversationProcessorCollection,
LLMResponse,
OrchestratorResponse
)
from dailyai.orchestrator import OrchestratorConfig, Orchestrator
from dailyai.queue_frame import QueueFrame, FrameType
from dailyai.message_handler.message_handler import MessageHandler
from dailyai.services.ai_services import AIServiceConfig
from dailyai.services.azure_ai_services import AzureImageGenService, AzureTTSService, AzureLLMService
class StaticSpriteResponse(OrchestratorResponse):
    """Response that shows one still image loaded from the ``sprites/`` folder.

    Subclasses choose the image by setting ``filename``. The original code
    initialised ``filenames`` (plural) here but read ``filename`` (singular)
    in start_preparation, so the base class never defined the attribute it
    used; ``filename`` is now declared with a ``None`` default.
    """

    # Name of the sprite file, relative to the "sprites/" directory next to
    # this module. Subclasses override this.
    filename: str | None = None

    def __init__(
        self,
        services,
        message_handler,
        output_queue
    ) -> None:
        super().__init__(services, message_handler, output_queue)
        self.image_bytes: bytes | None = None
        # Legacy attribute kept for backward compatibility; nothing in this
        # class reads it.
        self.filenames = None

    def start_preparation(self) -> None:
        """Load the sprite's raw pixel bytes into ``image_bytes``.

        Raises:
            ValueError: if no ``filename`` has been set.
        """
        if self.filename is None:
            raise ValueError(
                f"{type(self).__name__} has no sprite filename; "
                "set `filename` in a subclass"
            )
        full_path = os.path.join(os.path.dirname(__file__), "sprites/", self.filename)
        print(full_path)
        with Image.open(full_path) as img:
            self.image_bytes = img.tobytes()

    def do_play(self) -> None:
        """Put the prepared image on the output queue as an IMAGE frame."""
        self.output_queue.put(QueueFrame(FrameType.IMAGE, self.image_bytes))


class IntroSpriteResponse(StaticSpriteResponse):
    """Static sprite that displays ``intro.png``."""

    def __init__(self, services, message_handler, output_queue) -> None:
        super().__init__(services, message_handler, output_queue)
        self.filename = "intro.png"


class WaitingSpriteResponse(StaticSpriteResponse):
    """Static sprite that displays ``waiting.png``."""

    def __init__(self, services, message_handler, output_queue) -> None:
        super().__init__(services, message_handler, output_queue)
        self.filename = "waiting.png"
class AnimatedSpriteLLMResponse(LLMResponse):
    """LLM response that pairs each audio chunk with a random talking sprite."""

    def __init__(self, services, message_handler, output_queue) -> None:
        super().__init__(services, message_handler, output_queue)
        # Sprite files to load, and the raw bytes loaded from them so far.
        self.filenames = ["talk-1.png", "talk-2.png"]
        self.image_bytes = []

    def start_preparation(self) -> None:
        """Run the parent's preparation, then load every talking sprite."""
        super().start_preparation()
        sprite_dir = os.path.join(os.path.dirname(__file__), "sprites/")
        for sprite_name in self.filenames:
            sprite_path = os.path.join(sprite_dir, sprite_name)
            print(sprite_path)
            with Image.open(sprite_path) as sprite:
                raw_pixels = sprite.tobytes()
            self.image_bytes.append(raw_pixels)

    def get_frames_from_tts_response(self, audio_frame) -> list[QueueFrame]:
        """Return the audio frame followed by one randomly chosen sprite frame."""
        audio = QueueFrame(FrameType.AUDIO, audio_frame)
        picture = QueueFrame(FrameType.IMAGE, random.choice(self.image_bytes))
        return [audio, picture]
def add_bot_to_room(room_url, token, expiration) -> None:
    """Run the sprite-animated sample bot in a Daily room.

    Builds the message handler, the Azure services, the sprite conversation
    processors and the orchestrator, starts the orchestrator, and always
    releases every resource afterwards, including when start() raises or
    the run is interrupted.

    Args:
        room_url: URL of the Daily room to join.
        token: Meeting token for that room.
        expiration: Expiration value forwarded to OrchestratorConfig.
    """
    # A simple prompt for a simple sample.
    message_handler = MessageHandler(
        """
You are a sample bot in a WebRTC session. You'll receive input as transcriptions of user's
speech, and your responses will be converted to audio via a TTS service.
Answer user's questions and be friendly, and if you can, give some ideas about how someone
could use a bot like you in a more in-depth way. Because your responses will be spoken,
try to keep them short.
"""
    )
    # Use Azure services for the TTS, image generation, and LLM.
    # Note that you'll need to set the following environment variables:
    # - AZURE_SPEECH_SERVICE_KEY
    # - AZURE_SPEECH_SERVICE_REGION
    # - AZURE_CHATGPT_KEY
    # - AZURE_CHATGPT_ENDPOINT
    # - AZURE_CHATGPT_DEPLOYMENT_ID
    #
    # This demo doesn't use image generation, but if you extend it to do so,
    # you'll also need to set:
    # - AZURE_DALLE_KEY
    # - AZURE_DALLE_ENDPOINT
    # - AZURE_DALLE_DEPLOYMENT_ID
    services = AIServiceConfig(
        tts=AzureTTSService(), image=AzureImageGenService(), llm=AzureLLMService()
    )
    sprite_conversation_processors = ConversationProcessorCollection(
        introduction=IntroSpriteResponse,
        waiting=WaitingSpriteResponse,
        response=AnimatedSpriteLLMResponse,
    )
    orchestrator_config = OrchestratorConfig(
        room_url=room_url,
        token=token,
        bot_name="Simple Bot",
        expiration=expiration,
    )
    # The format string has no placeholders of its own, so no f-prefix.
    logging.basicConfig(format="%(levelno)s %(asctime)s %(message)s")
    logger: logging.Logger = logging.getLogger("dailyai")
    logger.setLevel(logging.DEBUG)
    orchestrator = Orchestrator(
        orchestrator_config,
        services,
        message_handler,
        sprite_conversation_processors
    )
    try:
        orchestrator.start()
    finally:
        # When the orchestrator's done (or failed), shut it down along with
        # the various services and handlers we've created.
        orchestrator.stop()
        message_handler.shutdown()
        services.tts.close()
        services.image.close()
        services.llm.close()
if __name__ == "__main__":
    # Parse the room URL and Daily API key, mint a one-hour owner token for
    # the room, then start the bot.
    parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
    parser.add_argument("-u", "--url", type=str, required=True, help="URL of the Daily room")
    parser.add_argument(
        "-k", "--apikey", type=str, required=True, help="Daily API Key (needed to create token)"
    )
    args: argparse.Namespace = parser.parse_args()
    # Create a meeting token for the given room with an expiration 1 hour in the future.
    # The room name is the URL path without its leading slash.
    room_name: str = urllib.parse.urlparse(args.url).path[1:]
    expiration: float = time.time() + 60 * 60
    res: requests.Response = requests.post(
        "https://api.daily.co/v1/meeting-tokens",
        headers={"Authorization": f"Bearer {args.apikey}"},
        json={
            "properties": {"room_name": room_name, "is_owner": True, "exp": expiration}
        },
        # Without a timeout, requests waits forever on an unresponsive server.
        timeout=30,
    )
    if res.status_code != 200:
        raise Exception(f'Failed to create meeting token: {res.status_code} {res.text}')
    token: str = res.json()['token']
    add_bot_to_room(args.url, token, expiration)

View File

Before

Width:  |  Height:  |  Size: 871 KiB

After

Width:  |  Height:  |  Size: 871 KiB

View File

Before

Width:  |  Height:  |  Size: 870 KiB

After

Width:  |  Height:  |  Size: 870 KiB

View File

Before

Width:  |  Height:  |  Size: 871 KiB

After

Width:  |  Height:  |  Size: 871 KiB

View File

Before

Width:  |  Height:  |  Size: 868 KiB

After

Width:  |  Height:  |  Size: 868 KiB

View File

@@ -0,0 +1,52 @@
import argparse
import asyncio
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
async def main(room_url):
    """Join *room_url*, greet each remote participant by name, then leave."""
    # Design notes carried over from the sample: services are meant to be
    # configurable through documented environment variables or an optional
    # config object, and the abstract transport API is expected to map
    # closely onto the daily-python basic API.
    transport = DailyTransportService(
        room_url,
        None,
        "Say One Thing",
        1,  # meeting duration, in minutes
    )
    transport.mic_enabled = True
    tts = ElevenLabsTTSService(voice_id="ErXwobaYiN019PkySvjV")

    # Register an event handler so we can play the audio when the participant joins.
    @transport.event_handler("on_participant_joined")
    async def on_participant_joined(transport, participant):
        info = participant["info"]
        if info["isLocal"]:
            # Our own join event: nothing to say.
            return
        greeting = "Hello there, " + info["userName"] + "!"
        await tts.say(greeting, transport.send_queue)
        # Once the output queue has drained, leave the meeting.
        await transport.stop_when_done()

    await transport.run()
if __name__ == "__main__":
    # Only --url is required; unrecognised arguments are tolerated and ignored.
    cli = argparse.ArgumentParser(description="Simple Daily Bot Sample")
    cli.add_argument(
        "-u",
        "--url",
        type=str,
        required=True,
        help="URL of the Daily room to join",
    )
    known_args, _ignored = cli.parse_known_args()
    asyncio.run(main(known_args.url))

View File

@@ -0,0 +1,59 @@
import asyncio
import time
from typing import AsyncGenerator
from dailyai.queue_frame import QueueFrame, FrameType
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.azure_ai_services import AzureTTSService
from dailyai.services.deepgram_ai_services import DeepgramTTSService
async def main(room_url):
    """Join *room_url* and speak a greeting to each remote participant."""
    # Design notes carried over from the sample: services are meant to be
    # configurable through documented environment variables or an optional
    # config object, and the abstract transport API is expected to map
    # closely onto the daily-python basic API.
    transport = DailyTransportService(
        room_url,
        None,
        "Greeter",
        1,  # meeting duration, in minutes
    )
    transport.mic_enabled = True
    # similarly, create a tts service
    tts = DeepgramTTSService()

    # Register an event handler so we can play the audio when the participant joins.
    print("settting up handler")

    @transport.event_handler("on_participant_joined")
    async def on_participant_joined(transport, participant):
        print(f"participant joined: {participant['info']['userName']}")
        if participant["info"]["isLocal"]:
            return
        # Stream the synthesized greeting chunk by chunk onto the output queue.
        async for chunk in tts.run_tts(
                f"Hello there, {participant['info']['userName']}!"):
            transport.output_queue.put(QueueFrame(FrameType.AUDIO, chunk))

    print("setting up call state handler")

    @transport.event_handler("on_call_state_updated")
    async def on_call_joined(transport, state):
        print(f"call state callback: {state}")

    await transport.run()
if __name__ == "__main__":
    import sys

    # The room used to be hard-coded. An optional first command-line argument
    # now overrides it; with no argument the original demo room is used, so
    # existing invocations behave as before.
    room_url = sys.argv[1] if len(sys.argv) > 1 else "https://chad-hq.daily.co/howdy"
    asyncio.run(main(room_url))

View File

@@ -0,0 +1,49 @@
import argparse
import asyncio
from dailyai.queue_frame import LLMMessagesQueueFrame
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.azure_ai_services import AzureLLMService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
async def main(room_url):
    """Join *room_url* and speak one LLM-generated greeting, then leave."""
    transport = DailyTransportService(
        room_url,
        None,
        "Say One Thing From an LLM",
        1,  # meeting duration, in minutes
    )
    transport.mic_enabled = True
    tts = ElevenLabsTTSService(voice_id="29vD33N1CtxCmqQRPOHJ")
    llm = AzureLLMService()

    prompt = [{
        "role": "system",
        "content": "You are an LLM in a WebRTC session, and this is a 'hello world' demo. Say hello to the world."
    }]
    # Start generating right away: LLM output feeds the TTS service, which
    # writes to the transport's send queue.
    llm_frames = llm.run([LLMMessagesQueueFrame(prompt)])
    tts_task = asyncio.create_task(
        tts.run_to_queue(transport.send_queue, llm_frames)
    )

    @transport.event_handler("on_first_other_participant_joined")
    async def on_first_other_participant_joined(transport):
        # Wait for the speech pipeline to finish, then leave once the queue drains.
        await tts_task
        await transport.stop_when_done()

    await transport.run()
if __name__ == "__main__":
    # Only --url is required; unrecognised arguments are tolerated and ignored.
    cli = argparse.ArgumentParser(description="Simple Daily Bot Sample")
    cli.add_argument(
        "-u",
        "--url",
        type=str,
        required=True,
        help="URL of the Daily room to join",
    )
    known_args, _ignored = cli.parse_known_args()
    asyncio.run(main(known_args.url))

View File

@@ -0,0 +1,46 @@
import argparse
import asyncio
from dailyai.queue_frame import TextQueueFrame
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.open_ai_services import OpenAIImageGenService
# Module-level flags. NOTE(review): nothing in this sample, as shown, reads or
# updates them - confirm whether they can be removed.
local_joined = False
participant_joined = False
async def main(room_url):
    """Join *room_url* with the camera on and show one generated still image."""
    transport = DailyTransportService(
        room_url,
        None,
        "Show a still frame image",
        1,  # meeting duration, in minutes
    )
    # Video only: microphone off, 1024x1024 camera.
    transport.mic_enabled = False
    transport.camera_enabled = True
    transport.camera_width = 1024
    transport.camera_height = 1024

    imagegen = OpenAIImageGenService(image_size="1024x1024")
    # Start image generation immediately; the result goes to the send queue.
    picture_prompt = [TextQueueFrame("a cat in the style of picasso")]
    image_task = asyncio.create_task(
        imagegen.run_to_queue(transport.send_queue, picture_prompt)
    )

    @transport.event_handler("on_participant_joined")
    async def on_participant_joined(transport, participant):
        await image_task

    await transport.run()
if __name__ == "__main__":
    # Only --url is required; unrecognised arguments are tolerated and ignored.
    cli = argparse.ArgumentParser(description="Simple Daily Bot Sample")
    cli.add_argument(
        "-u",
        "--url",
        type=str,
        required=True,
        help="URL of the Daily room to join",
    )
    known_args, _ignored = cli.parse_known_args()
    asyncio.run(main(known_args.url))

View File

@@ -0,0 +1,74 @@
import argparse
import asyncio
import re
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
from dailyai.queue_frame import EndStreamQueueFrame, LLMMessagesQueueFrame
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
async def main(room_url: str):
    """Join *room_url*, speak a fixed intro, then an LLM-generated llama joke."""
    global transport, llm, tts

    transport = DailyTransportService(
        room_url,
        None,
        "Say Two Things Bot",
        1,
    )
    transport.mic_enabled = True
    transport.mic_sample_rate = 16000
    transport.camera_enabled = False

    llm = AzureLLMService()
    azure_tts = AzureTTSService()
    elevenlabs_tts = ElevenLabsTTSService(voice_id="ErXwobaYiN019PkySvjV")

    joke_prompt = [{"role": "system", "content": "tell the user a joke about llamas"}]

    # Kick off joke generation now, buffering its audio frames in a private
    # queue. It runs in parallel with speaking the static intro, so there is
    # no delay before the LLM response is spoken.
    buffer_queue = asyncio.Queue()
    joke_frames = llm.run([LLMMessagesQueueFrame(joke_prompt)])
    llm_response_task = asyncio.create_task(
        elevenlabs_tts.run_to_queue(buffer_queue, joke_frames, True)
    )

    @transport.event_handler("on_participant_joined")
    async def on_joined(transport, participant):
        if participant["id"] == transport.my_participant_id:
            # Ignore our own join event.
            return

        await azure_tts.say(
            "My friend the LLM is now going to tell a joke about llamas.",
            transport.send_queue,
        )

        async def buffer_to_send_queue():
            # Forward buffered frames to the transport, stopping after the
            # end-of-stream marker has been forwarded.
            frame = None
            while not isinstance(frame, EndStreamQueueFrame):
                frame = await buffer_queue.get()
                await transport.send_queue.put(frame)
                buffer_queue.task_done()

        await asyncio.gather(llm_response_task, buffer_to_send_queue())
        await transport.stop_when_done()

    await transport.run()
if __name__ == "__main__":
    # Only --url is required; unrecognised arguments are tolerated and ignored.
    cli = argparse.ArgumentParser(description="Simple Daily Bot Sample")
    cli.add_argument(
        "-u",
        "--url",
        type=str,
        required=True,
        help="URL of the Daily room to join",
    )
    known_args, _ignored = cli.parse_known_args()
    asyncio.run(main(known_args.url))

View File

@@ -0,0 +1,110 @@
import argparse
import asyncio
from dailyai.queue_frame import AudioQueueFrame, ImageQueueFrame
from dailyai.services.azure_ai_services import AzureLLMService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.fal_ai_services import FalImageGenService
async def main(room_url):
    """Join *room_url* and narrate a generated calendar image for each month.

    For every month an LLM writes a one-sentence image description; that
    description is then turned into speech and an image in parallel. When the
    first remote participant joins, the months are played in completion order
    and the bot leaves once its output queue has drained.

    Args:
        room_url: URL of the Daily room to join.
    """
    meeting_duration_minutes = 5
    transport = DailyTransportService(
        room_url,
        None,
        "Month Narration Bot",
        meeting_duration_minutes,
    )
    transport.mic_enabled = True
    transport.camera_enabled = True
    transport.mic_sample_rate = 16000
    transport.camera_width = 1024
    transport.camera_height = 1024
    llm = AzureLLMService()
    dalle = FalImageGenService(image_size="1024x1024")
    tts = ElevenLabsTTSService(voice_id="ErXwobaYiN019PkySvjV")
    # dalle = OpenAIImageGenService(image_size="1024x1024")

    # Get a complete audio chunk from the given text. Splitting this into its own
    # coroutine lets us ensure proper ordering of the audio chunks on the send queue.
    async def get_all_audio(text):
        all_audio = bytearray()
        async for audio in tts.run_tts(text):
            all_audio.extend(audio)
        return all_audio

    async def get_month_data(month):
        """Return the narration data dict for *month*, or None if the LLM gave no text."""
        messages = [
            {
                "role": "system",
                "content": f"Describe a nature photograph suitable for use in a calendar, for the month of {month}. Include only the image description with no preamble. Limit the description to one sentence, please.",
            }
        ]
        image_description = await llm.run_llm(messages)
        if not image_description:
            return None
        to_speak = f"{month}: {image_description}"
        # Speech and image generation are independent, so run them together.
        audio_task = asyncio.create_task(get_all_audio(to_speak))
        image_task = asyncio.create_task(dalle.run_image_gen(image_description))
        (audio, image_data) = await asyncio.gather(
            audio_task, image_task
        )
        return {
            "month": month,
            "text": image_description,
            "image_url": image_data[0],
            "image": image_data[1],
            "audio": audio,
        }

    months: list[str] = [
        "January",
        "February",
        "March",
        "April",
        "May",
        "June",
        "July",
        "August",
        "September",
        "October",
        "November",
        "December",
    ]

    @transport.event_handler("on_first_other_participant_joined")
    async def on_first_other_participant_joined(transport):
        # This will play the months in the order they're completed. The benefit
        # is we'll have as little delay as possible before the first month, and
        # likely no delay between months, but the months won't display in order.
        for month_data_task in asyncio.as_completed(month_tasks):
            data = await month_data_task
            if data is None:
                # No description came back for this month. Skip it rather
                # than subscripting None, which would abort the loop before
                # stop_when_done() runs.
                continue
            await transport.send_queue.put(
                [
                    ImageQueueFrame(data["image_url"], data["image"]),
                    AudioQueueFrame(data["audio"]),
                ]
            )
        # wait for the output queue to be empty, then leave the meeting
        await transport.stop_when_done()

    # Start all twelve months immediately so work overlaps with waiting for a
    # participant to join.
    month_tasks = [asyncio.create_task(get_month_data(month)) for month in months]
    await transport.run()
if __name__ == "__main__":
    # Only --url is required; unrecognised arguments are tolerated and ignored.
    cli = argparse.ArgumentParser(description="Simple Daily Bot Sample")
    cli.add_argument(
        "-u",
        "--url",
        type=str,
        required=True,
        help="URL of the Daily room to join",
    )
    known_args, _ignored = cli.parse_known_args()
    asyncio.run(main(known_args.url))

View File

@@ -0,0 +1,92 @@
import argparse
import asyncio
import requests
import time
import urllib.parse
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
from dailyai.queue_aggregators import LLMContextAggregator
async def main(room_url: str, token):
    """Join *room_url* and answer transcribed user speech with spoken LLM replies."""
    global transport, llm, tts

    transport = DailyTransportService(
        room_url,
        token,
        "Respond bot",
        5,
    )
    transport.mic_enabled = True
    transport.mic_sample_rate = 16000
    transport.camera_enabled = False

    llm = AzureLLMService()
    tts = AzureTTSService()

    @transport.event_handler("on_first_other_participant_joined")
    async def on_first_other_participant_joined(transport):
        await tts.say("Hi, I'm listening!", transport.send_queue)

    async def handle_transcriptions():
        # Conversation history shared by both aggregators below.
        history = [
            {"role": "system", "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way."},
        ]
        user_aggregator = LLMContextAggregator(
            history, "user", transport.my_participant_id
        )
        assistant_aggregator = LLMContextAggregator(
            history, "assistant", transport.my_participant_id
        )

        # Pipeline, innermost stage first:
        # received frames -> user context -> LLM -> assistant context -> TTS.
        received = transport.get_receive_frames()
        user_turns = user_aggregator.run(received)
        llm_output = llm.run(user_turns)
        assistant_turns = assistant_aggregator.run(llm_output)
        await tts.run_to_queue(transport.send_queue, assistant_turns)

    transport.transcription_settings["extra"]["punctuate"] = True
    await asyncio.gather(transport.run(), handle_transcriptions())
if __name__ == "__main__":
    # Parse the room URL and Daily API key, mint a one-hour owner token for
    # the room, then run the bot.
    parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
    parser.add_argument(
        "-u", "--url", type=str, required=True, help="URL of the Daily room to join"
    )
    parser.add_argument(
        "-k",
        "--apikey",
        type=str,
        required=True,
        help="Daily API Key (needed to create token)",
    )
    args, unknown = parser.parse_known_args()
    # Create a meeting token for the given room with an expiration 1 hour in the future.
    # The room name is the URL path without its leading slash.
    room_name: str = urllib.parse.urlparse(args.url).path[1:]
    expiration: float = time.time() + 60 * 60
    res: requests.Response = requests.post(
        "https://api.daily.co/v1/meeting-tokens",
        headers={"Authorization": f"Bearer {args.apikey}"},
        json={
            "properties": {"room_name": room_name, "is_owner": True, "exp": expiration}
        },
        # Without a timeout, requests waits forever on an unresponsive server.
        timeout=30,
    )
    if res.status_code != 200:
        raise Exception(f"Failed to create meeting token: {res.status_code} {res.text}")
    token: str = res.json()["token"]
    asyncio.run(main(args.url, token))

View File

@@ -0,0 +1,174 @@
import argparse
import asyncio
import os
import random
import requests
import time
import urllib.parse
from dotenv import load_dotenv
from PIL import Image
load_dotenv()
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.fal_ai_services import FalImageGenService
from dailyai.services.open_ai_services import OpenAIImageGenService
from dailyai.queue_aggregators import LLMContextAggregator
from dailyai.queue_frame import LLMMessagesQueueFrame, QueueFrame, TextQueueFrame, ImageQueueFrame, ImageListQueueFrame
from dailyai.services.ai_services import AIService
from typing import AsyncGenerator, List
# Raw pixel bytes for each sprite, keyed by the file name INCLUDING its
# extension (e.g. "cat1.png"); quiet_frame below relies on that key format.
sprites = {}
image_files = [
    'cat1.png',
    'cat2.png',
    'cat3.png'
]
script_dir = os.path.dirname(__file__)
for file in image_files:
    # Build the full path to the image file
    full_path = os.path.join(script_dir, "images", file)
    # Open the image and convert it to bytes
    with Image.open(full_path) as img:
        sprites[file] = img.tobytes()

# Frame shown while the bot is not talking.
quiet_frame = ImageQueueFrame("", sprites["cat1.png"])
# Thirty randomly chosen sprites, bundled into a single image-list frame.
sprite_list = list(sprites.values())
talking = [random.choice(sprite_list) for _ in range(30)]
talking_frame = ImageListQueueFrame(images=talking)
class TranscriptFilter(AIService):
    """Drop frames that originate from the bot's own participant id.

    Frames that carry no ``participantId`` attribute are passed through
    unchanged; previously they raised AttributeError.
    """

    # Sentinel distinguishing "attribute missing" from a real value of None.
    _NO_PARTICIPANT = object()

    def __init__(self, bot_participant_id=None):
        # NOTE(review): AIService.__init__ is not called here, as in the
        # original - confirm the base class needs no initialisation.
        self.bot_participant_id = bot_participant_id

    async def process_frame(self, frame:QueueFrame) -> AsyncGenerator[QueueFrame, None]:
        """Yield *frame* unless its participantId equals the bot's id."""
        participant_id = getattr(frame, "participantId", self._NO_PARTICIPANT)
        if (
            participant_id is self._NO_PARTICIPANT
            or participant_id != self.bot_participant_id
        ):
            yield frame
class NameCheckFilter(AIService):
    """Forward only complete sentences that mention one of the given names.

    Text from incoming TextQueueFrames is accumulated until it ends with
    ".", "?" or "!". The finished sentence is emitted as a single
    TextQueueFrame if it contains any configured name, and discarded
    otherwise. Frames that are not TextQueueFrames contribute no text and
    are not forwarded.
    """

    def __init__(self, names=None):
        # With the default of None the original crashed when iterating
        # self.names; an omitted list now simply matches nothing.
        # NOTE(review): AIService.__init__ is not called here, as in the
        # original - confirm the base class needs no initialisation.
        self.names = names if names is not None else []
        self.sentence = ""

    async def process_frame(self, frame:QueueFrame) -> AsyncGenerator[QueueFrame, None]:
        """Accumulate text and yield the sentence once complete and matching."""
        content: str = ""
        # TODO: split up transcription by participant
        if isinstance(frame, TextQueueFrame):
            content = frame.text
        self.sentence += content
        if not self.sentence.endswith((".", "?", "!")):
            # Sentence still in progress; keep buffering.
            return
        # Sentence complete: take it and reset the buffer in either outcome.
        out = self.sentence
        self.sentence = ""
        if any(name in out for name in self.names):
            print(f"I got one: {frame.text}")
            yield TextQueueFrame(out)
        else:
            print(f"ignoring: {out}")
async def main(room_url:str, token):
    """Join *room_url* as the Golden Kitty bot and reply when addressed by name."""
    global transport, llm, tts

    transport = DailyTransportService(
        room_url,
        token,
        "Derrick",
        180,
    )
    transport.mic_enabled = True
    transport.mic_sample_rate = 16000
    transport.camera_enabled = True
    transport.camera_width = 960
    transport.camera_height = 960

    llm = AzureLLMService()
    tts = ElevenLabsTTSService()

    @transport.event_handler("on_first_other_participant_joined")
    async def on_first_other_participant_joined(transport):
        await tts.say("Hi, I'm listening!", transport.send_queue)

    async def handle_transcriptions():
        # Conversation history shared by both aggregators below.
        history = [
            {"role": "system", "content": "You are Derek, the Golden Kitty, the mascot for Product Hunt's annual awards. You are a cat who knows everything about all the cool new tech startups. You should be clever, and a bit sarcastic. You should also tell jokes every once in a while. Your responses should only be a few sentences long."},
        ]
        user_aggregator = LLMContextAggregator(
            history, "user", transport.my_participant_id
        )
        assistant_aggregator = LLMContextAggregator(
            history, "assistant", transport.my_participant_id
        )
        own_speech_filter = TranscriptFilter(transport.my_participant_id)
        name_filter = NameCheckFilter(["Derek", "Derrick"])

        # Pipeline, innermost stage first: received frames -> drop the bot's
        # own transcripts -> keep sentences naming the bot -> user context
        # -> LLM -> assistant context -> TTS.
        received = transport.get_receive_frames()
        from_others = own_speech_filter.run(received)
        addressed = name_filter.run(from_others)
        user_turns = user_aggregator.run(addressed)
        llm_output = llm.run(user_turns)
        assistant_turns = assistant_aggregator.run(llm_output)
        await tts.run_to_queue(transport.send_queue, assistant_turns)

    async def make_cats():
        # Show the idle sprite.
        await transport.send_queue.put(quiet_frame)

    transport.transcription_settings["extra"]["punctuate"] = True
    await asyncio.gather(transport.run(), handle_transcriptions(), make_cats())
if __name__ == "__main__":
    # Parse the room URL and Daily API key, mint a 24-hour owner token for
    # the room, then run the bot.
    parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
    parser.add_argument(
        "-u", "--url", type=str, required=True, help="URL of the Daily room to join"
    )
    parser.add_argument(
        "-k",
        "--apikey",
        type=str,
        required=True,
        help="Daily API Key (needed to create token)",
    )
    args, unknown = parser.parse_known_args()
    # Create a meeting token for the given room with an expiration 24 hours in the future.
    # The room name is the URL path without its leading slash.
    room_name: str = urllib.parse.urlparse(args.url).path[1:]
    expiration: float = time.time() + 60 * 60 * 24
    res: requests.Response = requests.post(
        "https://api.daily.co/v1/meeting-tokens",
        headers={"Authorization": f"Bearer {args.apikey}"},
        json={
            "properties": {"room_name": room_name, "is_owner": True, "exp": expiration}
        },
        # Without a timeout, requests waits forever on an unresponsive server.
        timeout=30,
    )
    if res.status_code != 200:
        raise Exception(f"Failed to create meeting token: {res.status_code} {res.text}")
    token: str = res.json()["token"]
    asyncio.run(main(args.url, token))

View File

@@ -0,0 +1,134 @@
import argparse
import asyncio
from typing import AsyncGenerator
import requests
import time
import urllib.parse
from PIL import Image
from dailyai.queue_frame import ImageQueueFrame, QueueFrame
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
from dailyai.services.ai_services import AIService
from dailyai.queue_aggregators import LLMAssistantContextAggregator, LLMUserContextAggregator
from dailyai.services.fal_ai_services import FalImageGenService
class ImageSyncAggregator(AIService):
    """Bracket every frame with a "speaking" image before and a "waiting" image after."""

    def __init__(self, speaking_path:str, waiting_path:str):
        # Open each image and keep both the image object and its raw bytes.
        speaking = Image.open(speaking_path)
        self._speaking_image = speaking
        self._speaking_image_bytes = speaking.tobytes()

        waiting = Image.open(waiting_path)
        self._waiting_image = waiting
        self._waiting_image_bytes = waiting.tobytes()

    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
        """Yield the speaking image, then *frame*, then the waiting image."""
        speaking_bytes = self._speaking_image_bytes
        yield ImageQueueFrame(None, speaking_bytes)
        yield frame
        waiting_bytes = self._waiting_image_bytes
        yield ImageQueueFrame(None, waiting_bytes)
async def main(room_url: str, token):
    """Join a Daily room and answer speech with LLM + TTS while swapping images.

    Frames received from the transport are chained through the user-context
    aggregator, the Azure LLM service, the assistant-context aggregator and
    ``ImageSyncAggregator``; the result is handed to the Azure TTS service,
    which writes into the transport's send queue.

    Args:
        room_url: URL of the Daily room to join.
        token: Meeting token used when joining the room.
    """
    # Function-scope import: only the image path resolution below needs it.
    from pathlib import Path

    # The services are published as module-level globals.
    global transport
    global llm
    global tts

    transport = DailyTransportService(
        room_url,
        token,
        "Respond bot",
        5,  # NOTE(review): presumably the bot's lifetime in minutes — confirm
    )
    transport.camera_enabled = True
    transport.camera_width = 1024
    transport.camera_height = 1024
    transport.mic_enabled = True
    transport.mic_sample_rate = 16000

    llm = AzureLLMService()
    tts = AzureTTSService()

    # Resolve the images relative to this script instead of a path inside one
    # developer's home directory, so the sample runs from any checkout.
    # NOTE(review): assumes speaking.png / waiting.png sit next to this file,
    # as the previously hard-coded ".../samples/foundational/" paths suggest.
    sample_dir = Path(__file__).resolve().parent

    @transport.event_handler("on_first_other_participant_joined")
    async def on_first_other_participant_joined(transport):
        await tts.say("Hi, I'm listening!", transport.send_queue)

    async def handle_transcriptions():
        # Shared message list: both aggregators receive the same object.
        messages = [
            {"role": "system", "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way."},
        ]

        tma_in = LLMUserContextAggregator(
            messages, transport.my_participant_id
        )
        tma_out = LLMAssistantContextAggregator(
            messages, transport.my_participant_id
        )
        image_sync_aggregator = ImageSyncAggregator(
            str(sample_dir / "speaking.png"),
            str(sample_dir / "waiting.png"),
        )

        # Innermost call is the source; each .run() wraps the previous stage.
        await tts.run_to_queue(
            transport.send_queue,
            image_sync_aggregator.run(
                tma_out.run(
                    llm.run(
                        tma_in.run(
                            transport.get_receive_frames()
                        )
                    )
                )
            )
        )

    transport.transcription_settings["extra"]["punctuate"] = True

    # Run the transport and the processing pipeline concurrently.
    await asyncio.gather(transport.run(), handle_transcriptions())
if __name__ == "__main__":
    # Script entry point: parse the room URL and API key, mint a meeting
    # token through the Daily REST API, then run the bot.
    parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
    parser.add_argument(
        "-u", "--url", type=str, required=True, help="URL of the Daily room to join"
    )
    parser.add_argument(
        "-k",
        "--apikey",
        type=str,
        required=True,
        help="Daily API Key (needed to create token)",
    )
    # parse_known_args() tolerates unrecognised flags; the leftovers are unused.
    args, _ = parser.parse_known_args()

    # Create a meeting token for the given room with an expiration 1 hour in the future.
    # The room name is the URL path; strip("/") also copes with a trailing slash.
    room_name: str = urllib.parse.urlparse(args.url).path.strip("/")
    expiration: float = time.time() + 60 * 60

    res: requests.Response = requests.post(
        "https://api.daily.co/v1/meeting-tokens",
        headers={"Authorization": f"Bearer {args.apikey}"},
        json={
            "properties": {"room_name": room_name, "is_owner": True, "exp": expiration}
        },
        # requests never times out by default; fail instead of hanging forever.
        timeout=10,
    )

    if res.status_code != 200:
        raise Exception(f"Failed to create meeting token: {res.status_code} {res.text}")
    token: str = res.json()["token"]

    asyncio.run(main(args.url, token))

View File

@@ -0,0 +1,99 @@
import argparse
import asyncio
import requests
import time
import urllib.parse
from dailyai.conversation_wrappers import InterruptibleConversationWrapper
from dailyai.queue_frame import StartStreamQueueFrame, TextQueueFrame
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.azure_ai_services import AzureLLMService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
async def main(room_url: str, token):
    """Join a Daily room and hold a spoken conversation driven by an LLM.

    Turn handling is delegated to ``InterruptibleConversationWrapper``, which
    is given the transport's frame source, its ``interrupt`` callable and a
    runner that produces one spoken reply per user utterance.

    Args:
        room_url: URL of the Daily room to join.
        token: Meeting token used when joining the room.
    """
    # The services are published as module-level globals.
    global transport, llm, tts

    transport = DailyTransportService(room_url, token, "Respond bot", 5)
    transport.mic_enabled = True
    transport.mic_sample_rate = 16000
    transport.camera_enabled = False
    transport.start_transcription = True

    llm = AzureLLMService()
    tts = ElevenLabsTTSService(voice_id="ErXwobaYiN019PkySvjV")

    async def run_response(user_speech, tma_in, tma_out):
        # One reply: user text -> user aggregator -> LLM -> assistant
        # aggregator -> TTS, written to the transport's send queue.
        outbound = transport.send_queue
        prompt_frames = [StartStreamQueueFrame(), TextQueueFrame(user_speech)]
        user_stage = tma_in.run(prompt_frames)
        llm_stage = llm.run(user_stage)
        assistant_stage = tma_out.run(llm_stage)
        await tts.run_to_queue(outbound, assistant_stage)

    @transport.event_handler("on_first_other_participant_joined")
    async def on_first_other_participant_joined(transport):
        await tts.say("Hi, I'm listening!", transport.send_queue)

    async def run_conversation():
        system_prompt = {"role": "system", "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way."}
        wrapper = InterruptibleConversationWrapper(
            frame_generator=transport.get_receive_frames,
            runner=run_response,
            interrupt=transport.interrupt,
            my_participant_id=transport.my_participant_id,
            llm_messages=[system_prompt],
        )
        await wrapper.run_conversation()

    extra_settings = transport.transcription_settings["extra"]
    extra_settings["punctuate"] = False

    # Run the transport and the conversation loop concurrently.
    await asyncio.gather(transport.run(), run_conversation())
if __name__ == "__main__":
    # Script entry point: parse the room URL and API key, mint a meeting
    # token through the Daily REST API, then run the bot.
    parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
    parser.add_argument(
        "-u", "--url", type=str, required=True, help="URL of the Daily room to join"
    )
    parser.add_argument(
        "-k",
        "--apikey",
        type=str,
        required=True,
        help="Daily API Key (needed to create token)",
    )
    # parse_known_args() tolerates unrecognised flags; the leftovers are unused.
    args, _ = parser.parse_known_args()

    # Create a meeting token for the given room with an expiration 1 hour in the future.
    # The room name is the URL path; strip("/") also copes with a trailing slash.
    room_name: str = urllib.parse.urlparse(args.url).path.strip("/")
    expiration: float = time.time() + 60 * 60

    res: requests.Response = requests.post(
        "https://api.daily.co/v1/meeting-tokens",
        headers={"Authorization": f"Bearer {args.apikey}"},
        json={
            "properties": {"room_name": room_name, "is_owner": True, "exp": expiration}
        },
        # requests never times out by default; fail instead of hanging forever.
        timeout=10,
    )

    if res.status_code != 200:
        raise Exception(f"Failed to create meeting token: {res.status_code} {res.text}")
    token: str = res.json()["token"]

    asyncio.run(main(args.url, token))

View File

@@ -1,22 +1,22 @@
import argparse
import asyncio
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.whisper_ai_services import WhisperSTTService
from examples.foundational.support.runner import configure
async def main(room_url: str):
global transport
global stt
transport = DailyTransportService(
room_url,
None,
"Transcription bot",
start_transcription=True,
mic_enabled=False,
camera_enabled=False,
speaker_enabled=True
)
transport.mic_enabled = False
transport.camera_enabled = False
transport.speaker_enabled = True
stt = WhisperSTTService()
transcription_output_queue = asyncio.Queue()
@@ -35,5 +35,10 @@ async def main(room_url: str):
# Script entry point for the transcription sample.
# NOTE(review): this span is a rendered diff hunk, so both sides of the change
# appear together. Going by the hunk header (5 old lines vs. 10 new), the two
# configure()-based lines look like the removed side and the argparse lines
# like the added side — confirm before treating both as live code.
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url))
# Only the room URL is read from the command line; this sample's main() takes
# no token argument.
parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
parser.add_argument(
"-u", "--url", type=str, required=True, help="URL of the Daily room to join"
)
# parse_known_args() returns unrecognised arguments separately; they are unused.
args, unknown = parser.parse_known_args()
asyncio.run(main(args.url))

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.5 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.6 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.6 MiB

View File

Before

Width:  |  Height:  |  Size: 33 KiB

After

Width:  |  Height:  |  Size: 33 KiB

View File

Before

Width:  |  Height:  |  Size: 30 KiB

After

Width:  |  Height:  |  Size: 30 KiB

View File

@@ -23,11 +23,11 @@ async def main(room_url: str, token):
"Imagebot",
1,
)
transport._mic_enabled = True
transport._camera_enabled = True
transport._mic_sample_rate = 16000
transport._camera_width = 1024
transport._camera_height = 1024
transport.mic_enabled = True
transport.camera_enabled = True
transport.mic_sample_rate = 16000
transport.camera_width = 1024
transport.camera_height = 1024
llm = AzureLLMService()
tts = AzureTTSService()
@@ -39,7 +39,7 @@ async def main(room_url: str, token):
sentence = ""
async for message in transport.get_transcriptions():
print(f"transcription message: {message}")
if message["session_id"] == transport._my_participant_id:
if message["session_id"] == transport.my_participant_id:
continue
finder = message["text"].find("start over")
print(f"finder: {finder}")