Compare commits
16 Commits
khk-functi
...
cb/golden-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5477baff7d | ||
|
|
6834d484ca | ||
|
|
0a9fa24b14 | ||
|
|
88e9c1ff71 | ||
|
|
849171a9c6 | ||
|
|
60ebdfb958 | ||
|
|
ad5dcdd760 | ||
|
|
9c154c3d49 | ||
|
|
96256e90cb | ||
|
|
6f75db4d54 | ||
|
|
127fddfb1e | ||
|
|
5231243795 | ||
|
|
cba14c2002 | ||
|
|
8ae61bf2ac | ||
|
|
bc6849b255 | ||
|
|
9bbd14d5e7 |
24
LICENSE
@@ -1,24 +0,0 @@
|
||||
BSD 2-Clause License
|
||||
|
||||
Copyright (c) 2024, Daily
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
44
README.md
@@ -1,19 +1,6 @@
|
||||
# Daily AI SDK
|
||||
# dailyai SDK
|
||||
|
||||
Build conversational, multi-modal AI apps with real-time voice and video, like this:
|
||||
|
||||
_Demo Video to come_
|
||||
|
||||
With built-in support for many of the best AI platforms (or [add your own](/docs)):
|
||||
|
||||
- Azure - DALL-E, ChatGPT, and Azure AI Text-to-Speech
|
||||
- Deepgram - Speech-to-text, and Aura text-to-speech
|
||||
- Eleven Labs text-to-speech
|
||||
- Fal.ai image generation
|
||||
- OpenAI DALL-E and ChatGPT
|
||||
- Whisper local speech-to-text
|
||||
|
||||
## Step 1: Get Started
|
||||
This SDK can help you build applications that participate in WebRTC meetings and use various AI services to interact with other participants.
|
||||
|
||||
## Build/Install
|
||||
|
||||
@@ -48,8 +35,25 @@ pip install path_to_this_repo
|
||||
You can run the simple sample like so:
|
||||
|
||||
```
|
||||
python src/examples/theoretical-to-real/01-say-one-thing.py -u <url of your Daily meeting> -k <your Daily API Key>
|
||||
python src/samples/theoretical-to-real/01-say-one-thing.py -u <url of your Daily meeting> -k <your Daily API Key>
|
||||
```
|
||||
|
||||
Note that the sample uses Azure's TTS and LLM services. You'll need to set the following environment variables for the sample to work:
|
||||
|
||||
```
|
||||
AZURE_SPEECH_SERVICE_KEY
|
||||
AZURE_SPEECH_SERVICE_REGION
|
||||
AZURE_CHATGPT_KEY
|
||||
AZURE_CHATGPT_ENDPOINT
|
||||
AZURE_CHATGPT_DEPLOYMENT_ID
|
||||
```
|
||||
|
||||
If you have those environment variables stored in an .env file, you can quickly load them into your terminal's environment by running this:
|
||||
|
||||
```bash
|
||||
export $(grep -v '^#' .env | xargs)
|
||||
```
|
||||
|
||||
## Overview
|
||||
|
||||
The Daily AI SDK allows you to build applications that can participate in WebRTC sessions and interact with AI Services. Some examples of what you can build with this:
|
||||
@@ -157,3 +161,11 @@ As that text is being spoken, the asynchronous LLM task continues in the backgro
|
||||
```
|
||||
|
||||
One thing to note here is the last parameter to `run_to_queue` in the first code clause above: this causes the `run_to_queue` method to send an `END_STREAM` frame when it’s done rendering. This lets us know when to stop our `buffer_to_send_queue` task above.
|
||||
|
||||
## Test Server
|
||||
|
||||
To start the test server:
|
||||
|
||||
```bash
|
||||
flask --app daily-bot-manager.py --debug run
|
||||
```
|
||||
|
||||
@@ -8,21 +8,14 @@ import os
|
||||
|
||||
load_dotenv()
|
||||
|
||||
|
||||
def get_meeting_token(room_name, daily_api_key, token_expiry):
|
||||
api_path = os.getenv('DAILY_API_PATH') or 'https://api.daily.co/v1'
|
||||
|
||||
if not token_expiry:
|
||||
token_expiry = time.time() + 600
|
||||
res = requests.post(
|
||||
f'{api_path}/meeting-tokens',
|
||||
headers={
|
||||
'Authorization': f'Bearer {daily_api_key}'},
|
||||
json={
|
||||
'properties': {
|
||||
'room_name': room_name,
|
||||
'is_owner': True,
|
||||
'exp': token_expiry}})
|
||||
res = requests.post(f'{api_path}/meeting-tokens',
|
||||
headers={'Authorization': f'Bearer {daily_api_key}'},
|
||||
json={'properties': {'room_name': room_name, 'is_owner': True, 'exp': token_expiry}})
|
||||
if res.status_code != 200:
|
||||
return jsonify({'error': 'Unable to create meeting token', 'detail': res.text}), 500
|
||||
meeting_token = res.json()['token']
|
||||
@@ -30,4 +23,4 @@ def get_meeting_token(room_name, daily_api_key, token_expiry):
|
||||
|
||||
|
||||
def get_room_name(room_url):
|
||||
return urllib.parse.urlparse(room_url).path[1:]
|
||||
return urllib.parse.urlparse(room_url).path[1:]
|
||||
@@ -5,7 +5,7 @@ import time
|
||||
|
||||
from flask import Flask, jsonify, request, redirect
|
||||
from flask_cors import CORS
|
||||
from examples.server.auth import get_meeting_token
|
||||
from auth import get_meeting_token
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
@@ -16,12 +16,11 @@ CORS(app)
|
||||
|
||||
print(f"I loaded an environment, and my FAL_KEY_ID is {os.getenv('FAL_KEY_ID')}")
|
||||
|
||||
|
||||
def start_bot(bot_path, args=None):
|
||||
daily_api_key = os.getenv("DAILY_API_KEY")
|
||||
api_path = os.getenv("DAILY_API_PATH") or "https://api.daily.co/v1"
|
||||
|
||||
timeout = int(os.getenv("DAILY_ROOM_TIMEOUT") or os.getenv("DAILY_BOT_MAX_DURATION") or 300)
|
||||
timeout = int(os.getenv("ROOM_TIMEOUT") or os.getenv("BOT_MAX_DURATION") or 300)
|
||||
exp = time.time() + timeout
|
||||
res = requests.post(
|
||||
f"{api_path}/rooms",
|
||||
@@ -78,23 +77,27 @@ def start_bot(bot_path, args=None):
|
||||
if res.status_code == 200:
|
||||
break
|
||||
print(f"Took {attempts} attempts to join room {room_name}")
|
||||
|
||||
|
||||
# Additional client config
|
||||
config = {}
|
||||
if os.getenv("CLIENT_VAD_TIMEOUT_SEC"):
|
||||
config['vad_timeout_sec'] = float(os.getenv("DAILY_CLIENT_VAD_TIMEOUT_SEC"))
|
||||
config['vad_timeout_sec'] = float(os.getenv("CLIENT_VAD_TIMEOUT_SEC"))
|
||||
else:
|
||||
config['vad_timeout_sec'] = 1.5
|
||||
|
||||
# return jsonify({"room_url": room_url, "token": meeting_token, "config": config}), 200
|
||||
return redirect(room_url, code=301)
|
||||
#return jsonify({"room_url": room_url, "token": meeting_token, "config": config}), 200
|
||||
return redirect(room_url, code=302)
|
||||
|
||||
|
||||
@app.route("/spin-up-kitty", methods=["GET", "POST"])
|
||||
@app.route("/spin-up-kitty", methods=["POST"])
|
||||
def spin_up_kitty():
|
||||
return start_bot("./src/examples/foundational/10-wake-word.py")
|
||||
return start_bot("./src/samples/foundational/06a-golden-kitty.py")
|
||||
|
||||
@app.route("/spin-up-kitty", methods=["GET"])
|
||||
def quick_start_kitty():
|
||||
return start_bot("./src/samples/foundational/06a-golden-kitty.py")
|
||||
|
||||
|
||||
@app.route("/healthz")
|
||||
def health_check():
|
||||
return "ok", 200
|
||||
return "ok", 200
|
||||
@@ -1,13 +0,0 @@
|
||||
# Daily AI SDK Docs
|
||||
|
||||
## [Architecture Overview](architecture.md)
|
||||
|
||||
Learn about the thinking behind the SDK's design.
|
||||
|
||||
## [Example Code](examples/)
|
||||
|
||||
The repo includes several example apps in the `src/examples` directory. The docs explain how they work.
|
||||
|
||||
## [API Reference](api/)
|
||||
|
||||
Complete documentation of the available classes and methods in the SDK.
|
||||
@@ -1,2 +0,0 @@
|
||||
# Daily AI SDK Architecture Guide
|
||||
|
||||
@@ -1,119 +0,0 @@
|
||||
# 01: Say One Thing
|
||||
|
||||
_video here - youtube?_
|
||||
|
||||
This example uses a text-to-speech (TTS) service to say one predefined sentence. But first, a quick overview of the general structure of these examples.
|
||||
|
||||
## Running the demos
|
||||
|
||||
All of the demos have something like this at the bottom of the file:
|
||||
|
||||
```python
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
```
|
||||
|
||||
### `configure()`
|
||||
|
||||
The `configure()` function comes from `src/examples/foundational/support/runner.py`, and it allows you to configure the examples from the command line directly, or using environment variables:
|
||||
|
||||
```bash
|
||||
python 01-say-one-thing.py -u https://YOUR_DOMAIN.daily.co/YOUR_ROOM -k YOUR_API_KEY
|
||||
# or
|
||||
DAILY_ROOM_URL=https://YOUR_DOMAIN.daily.co/YOUR_ROOM DAILY_API_KEY=YOUR_API_KEY python 01-say-one-thing.py
|
||||
# or set DAILY_ROOM_URL and DAILY_API_KEY in a .env file
|
||||
python 01-say-one-thing.py
|
||||
```
|
||||
|
||||
You'll need a Daily account to run these demos. You can sign up for free at [daily.co](https://daily.co). Once you've signed up you can create a room from the [Dashboard](https://dashboard.daily.co/rooms), and grab [your API key](https://dashboard.daily.co/developers) while you're there.
|
||||
|
||||
Some functionality (such as transcription) requires the bot to have owner privileges in the room. `runner.py` uses the Daily REST API to create a meeting token with owner privileges. You can learn more about meeting tokens in the [Daily docs](https://docs.daily.co/reference/rest-api/meeting-tokens).
|
||||
|
||||
### `asyncio.run()`
|
||||
|
||||
The AI SDK makes heavy use of Python's `asyncio` module. [This is a reasonable intro to the topic](https://builtin.com/data-science/asyncio) if you haven't worked with `asyncio` and coroutines before.
|
||||
|
||||
You can learn a bit more about the specifics of how the Daily AI SDK uses coroutines in the [Architecture Guide](../architecture.md).
|
||||
|
||||
## The `main()` function
|
||||
|
||||
All of the examples have a `main()` function with a similar structure:
|
||||
|
||||
- Configure the transport
|
||||
- Configure the AI service(s) used in the demo
|
||||
- Configure any event listeners
|
||||
- Define a processing pipeline
|
||||
- Run the example's coroutine(s)
|
||||
|
||||
### Configuring the transport
|
||||
|
||||
The first section of the `main()` function configures the transport object:
|
||||
|
||||
```python
|
||||
meeting_duration_minutes = 5
|
||||
transport = DailyTransportService(
|
||||
room_url,
|
||||
None,
|
||||
"Say One Thing",
|
||||
meeting_duration_minutes,
|
||||
)
|
||||
transport.mic_enabled = True
|
||||
```
|
||||
|
||||
The [Architecture Guide](../architecture.md) explains the transport object in more detail. In this case, we're configuring a Daily transport object and enabling the virtual microphone, so our bot can play audio.
|
||||
|
||||
### Configuring the services
|
||||
|
||||
As described in the [Architecture Guide](../architecture.md), a 'Service' is a class that processes 'Frames' as part of a 'Pipeline'. In this demo app, we'll only need one service: a text-to-speech generator. We can create an instance of the `ElevenLabsTTSService` class with this line of code:
|
||||
|
||||
```python
|
||||
tts = ElevenLabsTTSService(aiohttp_session=session, api_key=os.getenv("ELEVENLABS_API_KEY"), voice_id=os.getenv("ELEVENLABS_VOICE_ID"))
|
||||
```
|
||||
|
||||
You'll need to make sure to set those environment variables somewhere. The easiest way to do that is to copy the `example.env` file in the repo and rename it to `.env`, and then add your credentials to that file. `runner.py` loads the `python-dotenv` module and initializes it, making the values in that file available in the environment.
|
||||
|
||||
### Configuring event listeners
|
||||
|
||||
This part isn't strictly necessary for an app like this. You could include the contents of the `on_participant_joined` function directly in the body of the `main()` function, and it would run as soon as you started the script from the command line.
|
||||
|
||||
Instead, we can use an event handler to wait to run that code until someone else joins the meeting. We'll define a function called `greet_user()`, and use the `@transport.event_handler("on_participant_joined")` decorator to tell the SDK that we want to run that function whenever a user joins the room.
|
||||
|
||||
```python
|
||||
@transport.event_handler("on_participant_joined")
|
||||
async def greet_user(transport, participant):
|
||||
if participant["info"]["isLocal"]:
|
||||
return
|
||||
|
||||
await tts.say(
|
||||
"Hello there, " + participant["info"]["userName"] + "!",
|
||||
transport.send_queue,
|
||||
)
|
||||
|
||||
# wait for the output queue to be empty, then leave the meeting
|
||||
await transport.stop_when_done()
|
||||
```
|
||||
|
||||
### Defining a processing pipeline
|
||||
|
||||
In this example, we don't actually have much of a processing pipeline! In fact, we're doing the whole thing inside the `greet_user()` function already.
|
||||
|
||||
Pipelines usually look like a bunch of nested calls to the `run()` or `run_to_queue()` function from different Services. In this example, we're using the `say()` function from the TTS service. This is effectively a convenience wrapper around the `run_to_queue()` function, which we'll discuss more later. It's important to `await` this function to ensure that the speech frames are queued for playback before the next line of code, because of the `stop_when_done()` function being called immediately afterward.
|
||||
|
||||
The output of the `say()` function goes to the transport's `send_queue`. This queue is the all-important connection between the world of the Services pipeline that's generating frames asynchronously and the ordered playback of audio and visual media in the WebRTC call.
|
||||
|
||||
### Running the coroutines
|
||||
|
||||
In this example, we don't actually have any separate processing pipelines—everything happens as a result of an event from the transport. So we only need to run the transport's coroutine, and await its completion:
|
||||
|
||||
```python
|
||||
await transport.run()
|
||||
```
|
||||
|
||||
In future examples, we'll run more processes in parallel. For now, this script can run until the transport exits—which will happen based on calling `stop_when_done()` in the `greet_user()` function.
|
||||
|
||||
## Next Steps
|
||||
|
||||
Next, we'll start connecting multiple AI services together by building a service pipeline.
|
||||
|
||||
## [02 - LLM Say One Thing »](02-llm-say-one-thing.md)
|
||||
@@ -1,5 +0,0 @@
|
||||
# Daily AI SDK Examples
|
||||
|
||||
The docs in this folder pair with the example apps located in `src/examples/foundational`. They are designed to serve as quick references for building different kinds of AI apps. But the examples also build on one another, so it can be really helpful to walk through them in order.
|
||||
|
||||
To start, you can learn about the overall structure of the examples in [01 - Say One Thing](01-say-one-thing.md).
|
||||
@@ -7,22 +7,17 @@ name = "daily_ai"
|
||||
version = "0.0.1"
|
||||
description = "Orchestrator for AI bots with Daily"
|
||||
dependencies = [
|
||||
"aiohttp",
|
||||
"azure-cognitiveservices-speech",
|
||||
"daily-python",
|
||||
"fal",
|
||||
"faster_whisper",
|
||||
"groq",
|
||||
"google-cloud-texttospeech",
|
||||
"numpy",
|
||||
"openai",
|
||||
"Pillow",
|
||||
"typing-extensions",
|
||||
"openai",
|
||||
"google-cloud-texttospeech",
|
||||
"azure-cognitiveservices-speech",
|
||||
"pyht",
|
||||
"python-dotenv",
|
||||
"torch",
|
||||
"torchaudio",
|
||||
"pyaudio",
|
||||
"typing-extensions"
|
||||
"opentelemetry-sdk",
|
||||
"aiohttp",
|
||||
"fal",
|
||||
"faster_whisper"
|
||||
]
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
|
||||
@@ -34,7 +34,6 @@ class LLMContextAggregator(AIService):
|
||||
bot_participant_id=None,
|
||||
complete_sentences=True,
|
||||
pass_through=True):
|
||||
super().__init__()
|
||||
self.messages = messages
|
||||
self.bot_participant_id = bot_participant_id
|
||||
self.role = role
|
||||
@@ -61,31 +60,20 @@ class LLMContextAggregator(AIService):
|
||||
|
||||
# TODO: split up transcription by participant
|
||||
if self.complete_sentences:
|
||||
# type: ignore -- the linter thinks this isn't a TextQueueFrame, even
|
||||
# though we check it above
|
||||
self.sentence += frame.text
|
||||
self.sentence += frame.text # type: ignore -- the linter thinks this isn't a TextQueueFrame, even though we check it above
|
||||
if self.sentence.endswith((".", "?", "!")):
|
||||
self.messages.append({"role": self.role, "content": self.sentence})
|
||||
self.sentence = ""
|
||||
yield LLMMessagesQueueFrame(self.messages)
|
||||
else:
|
||||
# type: ignore -- the linter thinks this isn't a TextQueueFrame, even
|
||||
# though we check it above
|
||||
self.messages.append({"role": self.role, "content": frame.text})
|
||||
self.messages.append({"role": self.role, "content": frame.text}) # type: ignore -- the linter thinks this isn't a TextQueueFrame, even though we check it above
|
||||
yield LLMMessagesQueueFrame(self.messages)
|
||||
|
||||
async def finalize(self) -> AsyncGenerator[QueueFrame, None]:
|
||||
# Send any dangling words that weren't finished with punctuation.
|
||||
if self.complete_sentences and self.sentence:
|
||||
self.messages.append({"role": self.role, "content": self.sentence})
|
||||
yield LLMMessagesQueueFrame(self.messages)
|
||||
|
||||
|
||||
class LLMUserContextAggregator(LLMContextAggregator):
|
||||
def __init__(self,
|
||||
messages: list[dict],
|
||||
bot_participant_id=None,
|
||||
complete_sentences=True):
|
||||
messages: list[dict],
|
||||
bot_participant_id=None,
|
||||
complete_sentences=True):
|
||||
super().__init__(messages, "user", bot_participant_id, complete_sentences, pass_through=False)
|
||||
|
||||
|
||||
@@ -94,5 +82,5 @@ class LLMAssistantContextAggregator(LLMContextAggregator):
|
||||
self, messages: list[dict], bot_participant_id=None, complete_sentences=True
|
||||
):
|
||||
super().__init__(
|
||||
messages, "assistant", bot_participant_id, complete_sentences, pass_through=True
|
||||
messages, "assistan", bot_participant_id, complete_sentences, pass_through=True
|
||||
)
|
||||
|
||||
@@ -19,18 +19,6 @@ class EndStreamQueueFrame(ControlQueueFrame):
|
||||
pass
|
||||
|
||||
|
||||
class LLMResponseEndQueueFrame(QueueFrame):
|
||||
pass
|
||||
|
||||
|
||||
class UserStartedSpeakingFrame(QueueFrame):
|
||||
pass
|
||||
|
||||
|
||||
class UserStoppedSpeakingFrame(QueueFrame):
|
||||
pass
|
||||
|
||||
|
||||
@dataclass()
|
||||
class AudioQueueFrame(QueueFrame):
|
||||
data: bytes
|
||||
@@ -43,26 +31,14 @@ class ImageQueueFrame(QueueFrame):
|
||||
|
||||
|
||||
@dataclass()
|
||||
class SpriteQueueFrame(QueueFrame):
|
||||
images: list[bytes]
|
||||
|
||||
class ImageListQueueFrame(QueueFrame):
|
||||
images: list[bytes] | None
|
||||
|
||||
@dataclass()
|
||||
class TextQueueFrame(QueueFrame):
|
||||
text: str
|
||||
|
||||
|
||||
@dataclass()
|
||||
class TextQueueOutOfBandFrame(TextQueueFrame):
|
||||
outOfBand: bool = True
|
||||
|
||||
|
||||
@dataclass()
|
||||
class TTSCompletedFrame(QueueFrame):
|
||||
text: str
|
||||
outOfBand: bool = False
|
||||
|
||||
|
||||
@dataclass()
|
||||
class TranscriptionQueueFrame(TextQueueFrame):
|
||||
participantId: str
|
||||
|
||||
@@ -1,23 +1,16 @@
|
||||
import asyncio
|
||||
import io
|
||||
import logging
|
||||
import time
|
||||
import datetime
|
||||
import wave
|
||||
|
||||
from dailyai.queue_frame import (
|
||||
QueueFrame,
|
||||
AudioQueueFrame,
|
||||
ControlQueueFrame,
|
||||
EndStreamQueueFrame,
|
||||
ImageQueueFrame,
|
||||
LLMMessagesQueueFrame,
|
||||
LLMResponseEndQueueFrame,
|
||||
QueueFrame,
|
||||
TextQueueFrame,
|
||||
TTSCompletedFrame,
|
||||
TranscriptionQueueFrame,
|
||||
UserStoppedSpeakingFrame
|
||||
)
|
||||
|
||||
from abc import abstractmethod
|
||||
@@ -84,11 +77,6 @@ class AIService:
|
||||
|
||||
|
||||
class LLMService(AIService):
|
||||
|
||||
def __init__(self, context):
|
||||
super().__init__()
|
||||
self._context = context
|
||||
|
||||
@abstractmethod
|
||||
async def run_llm_async(self, messages) -> AsyncGenerator[str, None]:
|
||||
yield ""
|
||||
@@ -98,23 +86,9 @@ class LLMService(AIService):
|
||||
pass
|
||||
|
||||
async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
|
||||
print(f"##### process frame got a frame, {type(frame)}")
|
||||
if isinstance(frame, UserStoppedSpeakingFrame):
|
||||
print(
|
||||
f"### Got a user stopped speaking frame, context is {self._context}")
|
||||
async for chunk in self.run_llm_async(self._context):
|
||||
# if we get a string, wrap it in a frame
|
||||
if isinstance(chunk, str):
|
||||
yield TextQueueFrame(chunk)
|
||||
# if we get a frame, pass it through
|
||||
elif isinstance(chunk, QueueFrame):
|
||||
print(f"### Got a frame chunk: {chunk}")
|
||||
yield chunk
|
||||
else:
|
||||
print(f"### Got an unknown chunk: {chunk}")
|
||||
yield LLMResponseEndQueueFrame()
|
||||
else:
|
||||
yield frame
|
||||
if isinstance(frame, LLMMessagesQueueFrame):
|
||||
async for text_chunk in self.run_llm_async(frame.messages):
|
||||
yield TextQueueFrame(text_chunk)
|
||||
|
||||
|
||||
class TTSService(AIService):
|
||||
@@ -136,12 +110,6 @@ class TTSService(AIService):
|
||||
|
||||
async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
|
||||
if not isinstance(frame, TextQueueFrame):
|
||||
# We don't want transcription frames, which are a subclass
|
||||
yield frame
|
||||
return
|
||||
|
||||
# TODO-CB: Clean this up
|
||||
if isinstance(frame, TranscriptionQueueFrame):
|
||||
yield frame
|
||||
return
|
||||
|
||||
@@ -156,11 +124,7 @@ class TTSService(AIService):
|
||||
|
||||
if text:
|
||||
async for audio_chunk in self.run_tts(text):
|
||||
size = 8000
|
||||
for i in range(0, len(audio_chunk), size):
|
||||
yield AudioQueueFrame(audio_chunk[i: i+size])
|
||||
print("### ABOUT TO YIELD TTS COMPLETED FRAME", frame)
|
||||
yield TTSCompletedFrame(text, hasattr(frame, 'outOfBand') and frame.outOfBand)
|
||||
yield AudioQueueFrame(audio_chunk)
|
||||
|
||||
async def finalize(self):
|
||||
if self.current_sentence:
|
||||
@@ -220,19 +184,12 @@ class STTService(AIService):
|
||||
ww.close()
|
||||
content.seek(0)
|
||||
text = await self.run_stt(content)
|
||||
yield TranscriptionQueueFrame(text, '', str(time.time()))
|
||||
yield TextQueueFrame(text)
|
||||
|
||||
|
||||
class FrameLogger(AIService):
|
||||
def __init__(self, prefix="Frame", **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.prefix = prefix
|
||||
|
||||
async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
|
||||
if isinstance(frame, (AudioQueueFrame, ImageQueueFrame)):
|
||||
self.logger.info(
|
||||
f"{datetime.datetime.utcnow().isoformat()} {self.prefix}: {type(frame)}")
|
||||
else:
|
||||
print(f"{datetime.datetime.utcnow().isoformat()} {self.prefix}: {frame}")
|
||||
|
||||
yield frame
|
||||
@dataclass
|
||||
class AIServiceConfig:
|
||||
tts: TTSService
|
||||
image: ImageGenService
|
||||
llm: LLMService
|
||||
stt: STTService
|
||||
|
||||
@@ -17,10 +17,13 @@ from azure.cognitiveservices.speech import SpeechSynthesizer, SpeechConfig, Resu
|
||||
|
||||
|
||||
class AzureTTSService(TTSService):
|
||||
def __init__(self, *, api_key, region):
|
||||
def __init__(self, speech_key=None, speech_region=None):
|
||||
super().__init__()
|
||||
|
||||
self.speech_config = SpeechConfig(subscription=api_key, region=region)
|
||||
speech_key = speech_key or os.getenv("AZURE_SPEECH_SERVICE_KEY")
|
||||
speech_region = speech_region or os.getenv("AZURE_SPEECH_SERVICE_REGION")
|
||||
|
||||
self.speech_config = SpeechConfig(subscription=speech_key, region=speech_region)
|
||||
self.speech_synthesizer = SpeechSynthesizer(
|
||||
speech_config=self.speech_config, audio_config=None)
|
||||
|
||||
@@ -42,21 +45,31 @@ class AzureTTSService(TTSService):
|
||||
yield result.audio_data[44:]
|
||||
elif result.reason == ResultReason.Canceled:
|
||||
cancellation_details = result.cancellation_details
|
||||
self.logger.info("Speech synthesis canceled: {}".format(
|
||||
cancellation_details.reason))
|
||||
self.logger.info("Speech synthesis canceled: {}".format(cancellation_details.reason))
|
||||
if cancellation_details.reason == CancellationReason.Error:
|
||||
self.logger.info("Error details: {}".format(
|
||||
cancellation_details.error_details))
|
||||
self.logger.info("Error details: {}".format(cancellation_details.error_details))
|
||||
|
||||
|
||||
class AzureLLMService(LLMService):
|
||||
def __init__(self, *, api_key, endpoint, api_version="2023-12-01-preview", model, context):
|
||||
super().__init__(context)
|
||||
self._model: str = model
|
||||
def __init__(self, api_key=None, azure_endpoint=None, api_version=None, model=None):
|
||||
super().__init__()
|
||||
api_key = api_key or os.getenv("AZURE_CHATGPT_KEY")
|
||||
|
||||
self._client = AsyncAzureOpenAI(
|
||||
azure_endpoint = azure_endpoint or os.getenv("AZURE_CHATGPT_ENDPOINT")
|
||||
if not azure_endpoint:
|
||||
raise Exception(
|
||||
"No azure endpoint specified for Azure LLM, please set AZURE_CHATGPT_ENDPOINT in the environment or pass it to the AzureLLMService constructor")
|
||||
|
||||
model: str | None = model or os.getenv("AZURE_CHATGPT_DEPLOYMENT_ID")
|
||||
if not model:
|
||||
raise Exception(
|
||||
"No model specified for Azure LLM, please set AZURE_CHATGPT_DEPLOYMENT_ID in the environment or pass it to the AzureLLMService constructor")
|
||||
self.model: str = model
|
||||
|
||||
api_version = api_version or "2023-12-01-preview"
|
||||
self.client = AsyncAzureOpenAI(
|
||||
api_key=api_key,
|
||||
azure_endpoint=endpoint,
|
||||
azure_endpoint=azure_endpoint,
|
||||
api_version=api_version,
|
||||
)
|
||||
|
||||
@@ -64,7 +77,7 @@ class AzureLLMService(LLMService):
|
||||
messages_for_log = json.dumps(messages)
|
||||
self.logger.debug(f"Generating chat via azure: {messages_for_log}")
|
||||
|
||||
chunks = await self._client.chat.completions.create(model=self._model, stream=True, messages=messages)
|
||||
chunks = await self.client.chat.completions.create(model=self.model, stream=True, messages=messages)
|
||||
async for chunk in chunks:
|
||||
if len(chunk.choices) == 0:
|
||||
continue
|
||||
@@ -76,7 +89,7 @@ class AzureLLMService(LLMService):
|
||||
messages_for_log = json.dumps(messages)
|
||||
self.logger.debug(f"Generating chat via azure: {messages_for_log}")
|
||||
|
||||
response = await self._client.chat.completions.create(model=self._model, stream=False, messages=messages)
|
||||
response = await self.client.chat.completions.create(model=self.model, stream=False, messages=messages)
|
||||
if response and len(response.choices) > 0:
|
||||
return response.choices[0].message.content
|
||||
else:
|
||||
@@ -87,60 +100,85 @@ class AzureImageGenServiceREST(ImageGenService):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
api_version="2023-06-01-preview",
|
||||
image_size: str,
|
||||
aiohttp_session: aiohttp.ClientSession,
|
||||
api_key,
|
||||
endpoint,
|
||||
model):
|
||||
api_key=None,
|
||||
azure_endpoint=None,
|
||||
api_version=None,
|
||||
model=None):
|
||||
super().__init__(image_size=image_size)
|
||||
|
||||
self._api_key = api_key
|
||||
self._azure_endpoint = endpoint
|
||||
self._api_version = api_version
|
||||
self._model = model
|
||||
self._aiohttp_session = aiohttp_session
|
||||
self.api_key = api_key or os.getenv("AZURE_DALLE_KEY")
|
||||
self.azure_endpoint = azure_endpoint or os.getenv("AZURE_DALLE_ENDPOINT")
|
||||
self.api_version = api_version or "2023-06-01-preview"
|
||||
self.model = model or os.getenv("AZURE_DALLE_DEPLOYMENT_ID")
|
||||
|
||||
async def run_image_gen(self, sentence) -> tuple[str, bytes]:
|
||||
url = f"{self._azure_endpoint}openai/images/generations:submit?api-version={self._api_version}"
|
||||
headers = {"api-key": self._api_key,
|
||||
"Content-Type": "application/json"}
|
||||
body = {
|
||||
# Enter your prompt text here
|
||||
"prompt": sentence,
|
||||
"size": self.image_size,
|
||||
"n": 1,
|
||||
}
|
||||
async with self._aiohttp_session.post(
|
||||
url, headers=headers, json=body
|
||||
) as submission:
|
||||
print(f"submission: {submission}")
|
||||
# We never get past this line, because this header isn't
|
||||
# defined on a 429 response, but something is eating our exceptions!
|
||||
operation_location = submission.headers['operation-location']
|
||||
print(f"submission status: {submission.status}")
|
||||
status = ""
|
||||
attempts_left = 120
|
||||
json_response = None
|
||||
while status != "succeeded":
|
||||
attempts_left -= 1
|
||||
if attempts_left == 0:
|
||||
raise Exception("Image generation timed out")
|
||||
# TODO hoist the session to app-level
|
||||
async with aiohttp.ClientSession() as session:
|
||||
url = f"{self.azure_endpoint}openai/images/generations:submit?api-version={self.api_version}"
|
||||
headers = {"api-key": self.api_key, "Content-Type": "application/json"}
|
||||
body = {
|
||||
# Enter your prompt text here
|
||||
"prompt": sentence,
|
||||
"size": self.image_size,
|
||||
"n": 1,
|
||||
}
|
||||
async with session.post(url, headers=headers, json=body) as submission:
|
||||
operation_location = submission.headers['operation-location']
|
||||
|
||||
await asyncio.sleep(1)
|
||||
response = await self._aiohttp_session.get(
|
||||
operation_location, headers=headers
|
||||
)
|
||||
json_response = await response.json()
|
||||
status = json_response["status"]
|
||||
status = ""
|
||||
attempts_left = 120
|
||||
json_response = None
|
||||
while status != "succeeded":
|
||||
attempts_left -= 1
|
||||
if attempts_left == 0:
|
||||
raise Exception("Image generation timed out")
|
||||
|
||||
image_url = json_response["result"]["data"][0]["url"] if json_response else None
|
||||
if not image_url:
|
||||
raise Exception("Image generation failed")
|
||||
# Load the image from the url
|
||||
async with self._aiohttp_session.get(image_url) as response:
|
||||
image_stream = io.BytesIO(await response.content.read())
|
||||
image = Image.open(image_stream)
|
||||
print("i got an image file!")
|
||||
return (image_url, image.tobytes())
|
||||
await asyncio.sleep(1)
|
||||
response = await session.get(operation_location, headers=headers)
|
||||
json_response = await response.json()
|
||||
status = json_response["status"]
|
||||
|
||||
image_url = json_response["result"]["data"][0]["url"] if json_response else None
|
||||
if not image_url:
|
||||
raise Exception("Image generation failed")
|
||||
|
||||
# Load the image from the url
|
||||
async with session.get(image_url) as response:
|
||||
image_stream = io.BytesIO(await response.content.read())
|
||||
image = Image.open(image_stream)
|
||||
return (image_url, image.tobytes())
|
||||
|
||||
|
||||
class AzureImageGenService(ImageGenService):
|
||||
|
||||
def __init__(self, api_key=None, azure_endpoint=None, api_version=None, model=None):
|
||||
super().__init__()
|
||||
|
||||
api_key = api_key or os.getenv("AZURE_DALLE_KEY")
|
||||
azure_endpoint = azure_endpoint or os.getenv("AZURE_DALLE_ENDPOINT")
|
||||
api_version = api_version or "2023-06-01-preview"
|
||||
self.model = model or os.getenv("AZURE_DALLE_DEPLOYMENT_ID")
|
||||
|
||||
self.client = AzureOpenAI(
|
||||
api_key=api_key,
|
||||
azure_endpoint=azure_endpoint,
|
||||
api_version=api_version,
|
||||
)
|
||||
|
||||
async def run_image_gen(self, sentence) -> tuple[str, bytes]:
|
||||
self.logger.info("Generating azure image", sentence)
|
||||
|
||||
image = self.client.images.generate(
|
||||
model=self.model,
|
||||
prompt=sentence,
|
||||
n=1,
|
||||
size=self.image_size,
|
||||
)
|
||||
|
||||
url = image["data"][0]["url"]
|
||||
response = requests.get(url)
|
||||
|
||||
dalle_stream = io.BytesIO(response.content)
|
||||
dalle_im = Image.open(dalle_stream.tobytes())
|
||||
|
||||
return (url, dalle_im)
|
||||
|
||||
@@ -1,456 +0,0 @@
|
||||
from abc import abstractmethod
|
||||
import asyncio
|
||||
import copy
|
||||
import functools
|
||||
import itertools
|
||||
import logging
|
||||
import queue
|
||||
import threading
|
||||
import time
|
||||
from typing import AsyncGenerator
|
||||
import numpy as np
|
||||
import pyaudio
|
||||
import torch
|
||||
import torchaudio
|
||||
from enum import Enum
|
||||
import datetime
|
||||
import traceback
|
||||
|
||||
from typing import AsyncGenerator, AsyncIterable, BinaryIO, Iterable
|
||||
from dailyai.queue_aggregators import LLMAssistantContextAggregator, LLMUserContextAggregator
|
||||
|
||||
from dailyai.queue_frame import (
|
||||
AudioQueueFrame,
|
||||
EndStreamQueueFrame,
|
||||
ImageQueueFrame,
|
||||
QueueFrame,
|
||||
SpriteQueueFrame,
|
||||
StartStreamQueueFrame,
|
||||
TranscriptionQueueFrame,
|
||||
TTSCompletedFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
UserStoppedSpeakingFrame
|
||||
)
|
||||
|
||||
torch.set_num_threads(1)
|
||||
|
||||
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
|
||||
model='silero_vad',
|
||||
force_reload=False)
|
||||
|
||||
(get_speech_timestamps,
|
||||
save_audio,
|
||||
read_audio,
|
||||
VADIterator,
|
||||
collect_chunks) = utils
|
||||
|
||||
# Taken from utils_vad.py
|
||||
|
||||
|
||||
def validate(model,
|
||||
inputs: torch.Tensor):
|
||||
with torch.no_grad():
|
||||
outs = model(inputs)
|
||||
return outs
|
||||
|
||||
# Provided by Alexander Veysov
|
||||
|
||||
|
||||
def int2float(sound):
|
||||
abs_max = np.abs(sound).max()
|
||||
sound = sound.astype('float32')
|
||||
if abs_max > 0:
|
||||
sound *= 1/32768
|
||||
sound = sound.squeeze() # depends on the use case
|
||||
return sound
|
||||
|
||||
|
||||
FORMAT = pyaudio.paInt16
|
||||
CHANNELS = 1
|
||||
SAMPLE_RATE = 16000
|
||||
CHUNK = int(SAMPLE_RATE / 10)
|
||||
|
||||
audio = pyaudio.PyAudio()
|
||||
|
||||
|
||||
class VADState(Enum):
|
||||
QUIET = 1
|
||||
STARTING = 2
|
||||
SPEAKING = 3
|
||||
STOPPING = 4
|
||||
|
||||
|
||||
class BaseTransportService():
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
self._mic_enabled = kwargs.get("mic_enabled") or False
|
||||
self._mic_sample_rate = kwargs.get("mic_sample_rate") or 16000
|
||||
self._camera_enabled = kwargs.get("camera_enabled") or False
|
||||
self._camera_width = kwargs.get("camera_width") or 1024
|
||||
self._camera_height = kwargs.get("camera_height") or 768
|
||||
self._speaker_enabled = kwargs.get("speaker_enabled") or False
|
||||
self._speaker_sample_rate = kwargs.get("speaker_sample_rate") or 16000
|
||||
self._fps = kwargs.get("fps") or 8
|
||||
self._vad_start_s = kwargs.get("vad_start_s") or 0.2
|
||||
self._vad_stop_s = kwargs.get("vad_stop_s") or 0.5
|
||||
self._context = kwargs.get("context") or []
|
||||
|
||||
self._vad_samples = 1536
|
||||
vad_frame_s = self._vad_samples / SAMPLE_RATE
|
||||
self._vad_start_frames = round(self._vad_start_s / vad_frame_s)
|
||||
self._vad_stop_frames = round(self._vad_stop_s / vad_frame_s)
|
||||
self._vad_starting_count = 0
|
||||
self._vad_stopping_count = 0
|
||||
self._vad_state = VADState.QUIET
|
||||
|
||||
duration_minutes = kwargs.get("duration_minutes") or 10
|
||||
self._expiration = time.time() + duration_minutes * 60
|
||||
|
||||
self.send_queue = asyncio.Queue()
|
||||
self.receive_queue = asyncio.Queue()
|
||||
|
||||
self._threadsafe_send_queue = queue.Queue()
|
||||
|
||||
self._images = None
|
||||
self._user_is_speaking = False
|
||||
self._current_phrase = ""
|
||||
|
||||
try:
|
||||
self._loop: asyncio.AbstractEventLoop | None = asyncio.get_running_loop()
|
||||
except RuntimeError:
|
||||
self._loop = None
|
||||
|
||||
self._stop_threads = threading.Event()
|
||||
self._is_interrupted = threading.Event()
|
||||
|
||||
self._logger: logging.Logger = logging.getLogger()
|
||||
|
||||
def update_messages(self, new_context: list[dict[str, str]], task: asyncio.Task | None):
|
||||
if task:
|
||||
if not task.cancelled():
|
||||
self._current_phrase = ""
|
||||
self._context = new_context
|
||||
|
||||
def append_to_context(self, role, chunk_or_text):
|
||||
print("IN APPEND", chunk_or_text)
|
||||
# if we get a non-string, append it to the context without further error checking
|
||||
# unless the outOfBand property is True
|
||||
if not isinstance(chunk_or_text, str):
|
||||
|
||||
if not chunk_or_text.get("outOfBand") == True:
|
||||
self._context.append(chunk_or_text)
|
||||
return
|
||||
|
||||
text = chunk_or_text
|
||||
last_context_item = self._context[-1]
|
||||
|
||||
print("TEXT", text)
|
||||
print("LAST CONTEXT ITEM", last_context_item)
|
||||
traceback.print_stack()
|
||||
|
||||
if last_context_item and last_context_item['role'] == role:
|
||||
last_context_item['content'] += f" {text}"
|
||||
else:
|
||||
self._context.append({"role": role, "content": text})
|
||||
|
||||
async def run_pipeline(self, frame):
|
||||
print(f"starting to speak_after_delay, {frame}")
|
||||
# TODO-CB: This exception for missing class gets eaten!
|
||||
await self._runner(frame)
|
||||
|
||||
async def run_conversation(self, runner: Iterable[QueueFrame]
|
||||
| AsyncIterable[QueueFrame]
|
||||
| asyncio.Queue[QueueFrame],
|
||||
) -> AsyncGenerator[QueueFrame, None]:
|
||||
current_response_task = None
|
||||
self._runner = runner
|
||||
|
||||
async for frame in self.get_receive_frames():
|
||||
print(f"got frame of type: {type(frame)}, {frame}")
|
||||
if isinstance(frame, EndStreamQueueFrame):
|
||||
break
|
||||
# elif not isinstance(frame, TranscriptionQueueFrame):
|
||||
# continue
|
||||
# TODO-CB: Verify this is an accurate replacement
|
||||
# if hasattr(frame, 'participantId') and frame.participantId == self._my_participant_id:
|
||||
if not isinstance(frame, UserStoppedSpeakingFrame):
|
||||
continue
|
||||
|
||||
if current_response_task:
|
||||
# TODO-CB: Maybe not always interrupt? Are there frame types we can pass through?
|
||||
current_response_task.cancel()
|
||||
self.interrupt()
|
||||
|
||||
# self._current_phrase += " " + frame.text
|
||||
# current_llm_context = copy.deepcopy(self._context)
|
||||
current_response_task = asyncio.create_task(
|
||||
self.run_pipeline(
|
||||
frame)
|
||||
)
|
||||
current_response_task.add_done_callback(
|
||||
functools.partial(self.update_messages, self._context)
|
||||
)
|
||||
|
||||
async def run(self):
|
||||
self._prerun()
|
||||
|
||||
async_output_queue_marshal_task = asyncio.create_task(
|
||||
self._marshal_frames())
|
||||
|
||||
self._camera_thread = threading.Thread(
|
||||
target=self._run_camera, daemon=True)
|
||||
self._camera_thread.start()
|
||||
|
||||
self._frame_consumer_thread = threading.Thread(
|
||||
target=self._frame_consumer, daemon=True)
|
||||
self._frame_consumer_thread.start()
|
||||
|
||||
if self._speaker_enabled:
|
||||
# TODO-CB: This is interesting
|
||||
# self._receive_audio_thread = threading.Thread(
|
||||
# target=self._receive_audio, daemon=True)
|
||||
# self._receive_audio_thread.start()
|
||||
|
||||
self._vad_thread = threading.Thread(target=self._vad, daemon=True)
|
||||
self._vad_thread.start()
|
||||
|
||||
try:
|
||||
while (
|
||||
time.time() < self._expiration
|
||||
and not self._stop_threads.is_set()
|
||||
):
|
||||
await asyncio.sleep(1)
|
||||
except Exception as e:
|
||||
self._logger.error(f"Exception {e}")
|
||||
raise e
|
||||
finally:
|
||||
# Do anything that must be done to clean up
|
||||
self._post_run()
|
||||
|
||||
self._stop_threads.set()
|
||||
|
||||
await self.send_queue.put(EndStreamQueueFrame())
|
||||
await async_output_queue_marshal_task
|
||||
await self.send_queue.join()
|
||||
self._frame_consumer_thread.join()
|
||||
|
||||
if self._speaker_enabled:
|
||||
self._receive_audio_thread.join()
|
||||
|
||||
def _post_run(self):
|
||||
# Note that this function must be idempotent! It can be called multiple times
|
||||
# if, for example, a keyboard interrupt occurs.
|
||||
pass
|
||||
|
||||
def stop(self):
|
||||
self._stop_threads.set()
|
||||
|
||||
async def stop_when_done(self):
|
||||
await self._wait_for_send_queue_to_empty()
|
||||
self.stop()
|
||||
|
||||
async def _wait_for_send_queue_to_empty(self):
|
||||
await self.send_queue.join()
|
||||
self._threadsafe_send_queue.join()
|
||||
|
||||
@abstractmethod
|
||||
def write_frame_to_camera(self, frame: bytes):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def write_frame_to_mic(self, frame: bytes):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def read_audio_frames(self, desired_frame_count):
|
||||
return bytes()
|
||||
|
||||
@abstractmethod
|
||||
def _prerun(self):
|
||||
pass
|
||||
|
||||
def _vad(self):
|
||||
# CB: Starting silero VAD stuff
|
||||
# TODO-CB: Probably need to force virtual speaker creation if we're
|
||||
# going to build this in?
|
||||
# TODO-CB: pyaudio installation
|
||||
while not self._stop_threads.is_set():
|
||||
audio_chunk = self.read_audio_frames(self._vad_samples)
|
||||
audio_int16 = np.frombuffer(audio_chunk, np.int16)
|
||||
audio_float32 = int2float(audio_int16)
|
||||
new_confidence = model(
|
||||
torch.from_numpy(audio_float32), 16000).item()
|
||||
speaking = new_confidence > 0.5
|
||||
|
||||
if speaking:
|
||||
match self._vad_state:
|
||||
case VADState.QUIET:
|
||||
self._vad_state = VADState.STARTING
|
||||
self._vad_starting_count = 1
|
||||
case VADState.STARTING:
|
||||
self._vad_starting_count += 1
|
||||
case VADState.STOPPING:
|
||||
self._vad_state = VADState.SPEAKING
|
||||
self._vad_stopping_count = 0
|
||||
else:
|
||||
match self._vad_state:
|
||||
case VADState.STARTING:
|
||||
self._vad_state = VADState.QUIET
|
||||
self._vad_starting_count = 0
|
||||
case VADState.SPEAKING:
|
||||
self._vad_state = VADState.STOPPING
|
||||
self._vad_stopping_count = 1
|
||||
case VADState.STOPPING:
|
||||
self._vad_stopping_count += 1
|
||||
|
||||
if self._vad_state == VADState.STARTING and self._vad_starting_count >= self._vad_start_frames:
|
||||
print(
|
||||
f'!!! {datetime.datetime.utcnow().isoformat()} queueing start frame')
|
||||
asyncio.run_coroutine_threadsafe(
|
||||
self.receive_queue.put(
|
||||
UserStartedSpeakingFrame()), self._loop
|
||||
)
|
||||
print(f"!!! VAD started, calling interrupt")
|
||||
self.interrupt()
|
||||
self._vad_state = VADState.SPEAKING
|
||||
self._vad_starting_count = 0
|
||||
if self._vad_state == VADState.STOPPING and self._vad_stopping_count >= self._vad_stop_frames:
|
||||
print(
|
||||
f'!!! {datetime.datetime.utcnow().isoformat()} queueing stop frame')
|
||||
asyncio.run_coroutine_threadsafe(
|
||||
self.receive_queue.put(
|
||||
UserStoppedSpeakingFrame()), self._loop
|
||||
)
|
||||
self._vad_state = VADState.QUIET
|
||||
self._vad_stopping_count = 0
|
||||
|
||||
async def _marshal_frames(self):
|
||||
while True:
|
||||
frame: QueueFrame | list = await self.send_queue.get()
|
||||
self._threadsafe_send_queue.put(frame)
|
||||
self.send_queue.task_done()
|
||||
if isinstance(frame, EndStreamQueueFrame):
|
||||
break
|
||||
|
||||
def interrupt(self):
|
||||
print(f"!!! setting interrupt")
|
||||
self._is_interrupted.set()
|
||||
|
||||
async def get_receive_frames(self) -> AsyncGenerator[QueueFrame, None]:
|
||||
while True:
|
||||
frame = await self.receive_queue.get()
|
||||
yield frame
|
||||
if isinstance(frame, EndStreamQueueFrame):
|
||||
break
|
||||
|
||||
def _receive_audio(self):
|
||||
if not self._loop:
|
||||
self._logger.error("No loop available for audio thread")
|
||||
return
|
||||
|
||||
seconds = 1
|
||||
desired_frame_count = self._speaker_sample_rate * seconds
|
||||
while not self._stop_threads.is_set():
|
||||
buffer = self.read_audio_frames(desired_frame_count)
|
||||
if len(buffer) > 0:
|
||||
frame = AudioQueueFrame(buffer)
|
||||
asyncio.run_coroutine_threadsafe(
|
||||
self.receive_queue.put(frame), self._loop
|
||||
)
|
||||
|
||||
asyncio.run_coroutine_threadsafe(
|
||||
self.receive_queue.put(EndStreamQueueFrame()), self._loop
|
||||
)
|
||||
|
||||
def _set_image(self, image: bytes):
|
||||
self._images = itertools.cycle([image])
|
||||
|
||||
def _set_images(self, images: list[bytes], start_frame=0):
|
||||
self._images = itertools.cycle(images)
|
||||
|
||||
def _run_camera(self):
|
||||
try:
|
||||
while not self._stop_threads.is_set():
|
||||
if self._images:
|
||||
this_frame = next(self._images)
|
||||
self.write_frame_to_camera(this_frame)
|
||||
|
||||
time.sleep(1.0 / self._fps)
|
||||
except Exception as e:
|
||||
self._logger.error(f"Exception {e} in camera thread.")
|
||||
raise e
|
||||
|
||||
def _frame_consumer(self):
|
||||
self._logger.info("🎬 Starting frame consumer thread")
|
||||
b = bytearray()
|
||||
smallest_write_size = 3200
|
||||
all_audio_frames = bytearray()
|
||||
while True:
|
||||
try:
|
||||
frames_or_frame: QueueFrame | list[QueueFrame] = (
|
||||
self._threadsafe_send_queue.get()
|
||||
)
|
||||
if isinstance(frames_or_frame, QueueFrame):
|
||||
frames: list[QueueFrame] = [frames_or_frame]
|
||||
elif isinstance(frames_or_frame, list):
|
||||
frames: list[QueueFrame] = frames_or_frame
|
||||
else:
|
||||
raise Exception("Unknown type in output queue")
|
||||
|
||||
for frame in frames:
|
||||
if isinstance(frame, EndStreamQueueFrame):
|
||||
self._logger.info("Stopping frame consumer thread")
|
||||
self._threadsafe_send_queue.task_done()
|
||||
return
|
||||
|
||||
# if interrupted, we just pull frames off the queue and discard them
|
||||
if not self._is_interrupted.is_set():
|
||||
if frame:
|
||||
if isinstance(frame, AudioQueueFrame):
|
||||
chunk = frame.data
|
||||
all_audio_frames.extend(chunk)
|
||||
|
||||
b.extend(chunk)
|
||||
truncated_length: int = len(b) - (
|
||||
len(b) % smallest_write_size
|
||||
)
|
||||
if truncated_length:
|
||||
self.write_frame_to_mic(
|
||||
bytes(b[:truncated_length]))
|
||||
b = b[truncated_length:]
|
||||
elif isinstance(frame, ImageQueueFrame):
|
||||
self._set_image(frame.image)
|
||||
elif isinstance(frame, SpriteQueueFrame):
|
||||
self._set_images(frame.images)
|
||||
elif isinstance(frame, TTSCompletedFrame) and not frame.outOfBand:
|
||||
self.append_to_context(
|
||||
"assistant", frame.text)
|
||||
elif len(b):
|
||||
self.write_frame_to_mic(bytes(b))
|
||||
b = bytearray()
|
||||
else:
|
||||
# if there are leftover audio bytes, write them now; failing to do so
|
||||
# can cause static in the audio stream.
|
||||
print(f"!!! interrupted, flushing audio")
|
||||
if len(b):
|
||||
truncated_length = len(b) - (len(b) % 160)
|
||||
self.write_frame_to_mic(
|
||||
bytes(b[:truncated_length]))
|
||||
b = bytearray()
|
||||
|
||||
if isinstance(frame, StartStreamQueueFrame):
|
||||
self._is_interrupted.clear()
|
||||
|
||||
self._threadsafe_send_queue.task_done()
|
||||
except queue.Empty:
|
||||
if len(b):
|
||||
self.write_frame_to_mic(bytes(b))
|
||||
|
||||
b = bytearray()
|
||||
except Exception as e:
|
||||
self._logger.error(
|
||||
f"Exception in frame_consumer: {e}, {len(b)}")
|
||||
raise e
|
||||
@@ -1,4 +1,28 @@
|
||||
from dailyai.services.base_transport_service import BaseTransportService
|
||||
import asyncio
|
||||
import inspect
|
||||
import logging
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import types
|
||||
|
||||
from functools import partial
|
||||
from queue import Queue, Empty
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from dailyai.queue_frame import (
|
||||
AudioQueueFrame,
|
||||
EndStreamQueueFrame,
|
||||
ImageQueueFrame,
|
||||
ImageListQueueFrame,
|
||||
QueueFrame,
|
||||
StartStreamQueueFrame,
|
||||
TextQueueFrame,
|
||||
TranscriptionQueueFrame,
|
||||
)
|
||||
|
||||
from threading import Thread, Event
|
||||
|
||||
from daily import (
|
||||
EventHandler,
|
||||
CallClient,
|
||||
@@ -7,97 +31,60 @@ from daily import (
|
||||
VirtualMicrophoneDevice,
|
||||
VirtualSpeakerDevice,
|
||||
)
|
||||
from threading import Event
|
||||
from dailyai.queue_frame import (
|
||||
TranscriptionQueueFrame, UserStartedSpeakingFrame, UserStoppedSpeakingFrame
|
||||
)
|
||||
from functools import partial
|
||||
import types
|
||||
import pyaudio
|
||||
import torchaudio
|
||||
import asyncio
|
||||
import inspect
|
||||
import io
|
||||
import logging
|
||||
import numpy as np
|
||||
import signal
|
||||
import threading
|
||||
import torch
|
||||
torch.set_num_threads(1)
|
||||
|
||||
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
|
||||
model='silero_vad',
|
||||
force_reload=False)
|
||||
|
||||
(get_speech_timestamps,
|
||||
save_audio,
|
||||
read_audio,
|
||||
VADIterator,
|
||||
collect_chunks) = utils
|
||||
|
||||
# Taken from utils_vad.py
|
||||
|
||||
|
||||
def validate(model,
|
||||
inputs: torch.Tensor):
|
||||
with torch.no_grad():
|
||||
outs = model(inputs)
|
||||
return outs
|
||||
|
||||
# Provided by Alexander Veysov
|
||||
|
||||
|
||||
def int2float(sound):
|
||||
abs_max = np.abs(sound).max()
|
||||
sound = sound.astype('float32')
|
||||
if abs_max > 0:
|
||||
sound *= 1/32768
|
||||
sound = sound.squeeze() # depends on the use case
|
||||
return sound
|
||||
|
||||
|
||||
FORMAT = pyaudio.paInt16
|
||||
CHANNELS = 1
|
||||
SAMPLE_RATE = 16000
|
||||
CHUNK = int(SAMPLE_RATE / 10)
|
||||
|
||||
audio = pyaudio.PyAudio()
|
||||
|
||||
|
||||
class DailyTransportService(BaseTransportService, EventHandler):
|
||||
class DailyTransportService(EventHandler):
|
||||
_daily_initialized = False
|
||||
_lock = threading.Lock()
|
||||
|
||||
_speaker_enabled: bool
|
||||
_speaker_sample_rate: int
|
||||
|
||||
# This is necessary to override EventHandler's __new__ method.
|
||||
def __new__(cls, *args, **kwargs):
|
||||
return super().__new__(cls)
|
||||
speaker_enabled: bool
|
||||
speaker_sample_rate: int
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
room_url: str,
|
||||
token: str | None,
|
||||
bot_name: str,
|
||||
duration: float = 10,
|
||||
min_others_count: int = 1,
|
||||
start_transcription: bool = False,
|
||||
**kwargs,
|
||||
start_transcription: bool = True,
|
||||
speaker_enabled: bool = False,
|
||||
speaker_sample_rate: int = 16000,
|
||||
):
|
||||
# This will call BaseTransportService.__init__ method, not EventHandler
|
||||
super().__init__(**kwargs)
|
||||
super().__init__()
|
||||
self.bot_name: str = bot_name
|
||||
self.room_url: str = room_url
|
||||
self.token: str | None = token
|
||||
self.duration: float = duration
|
||||
self.expiration = time.time() + duration * 60
|
||||
self.min_others_count = min_others_count
|
||||
self.start_transcription = start_transcription
|
||||
|
||||
self._room_url: str = room_url
|
||||
self._bot_name: str = bot_name
|
||||
self._token: str | None = token
|
||||
self._min_others_count = min_others_count
|
||||
self._start_transcription = start_transcription
|
||||
# This queue is used to marshal frames from the async send queue to the thread that emits audio & video.
|
||||
# We need this to maintain the asynchronous behavior of asyncio queues -- to give async functions
|
||||
# a chance to run while waiting for queue items -- but also to maintain thread safety and have a threaded
|
||||
# handler to send frames, to ensure that sending isn't subject to pauses in the async thread.
|
||||
self.threadsafe_send_queue = Queue()
|
||||
|
||||
self._is_interrupted = Event()
|
||||
self._stop_threads = Event()
|
||||
self.is_interrupted = Event()
|
||||
self.stop_threads = Event()
|
||||
self.story_started = False
|
||||
self.mic_enabled = False
|
||||
self.mic_sample_rate = 16000
|
||||
self.camera_width = 960
|
||||
self.camera_height = 960
|
||||
self.camera_enabled = False
|
||||
self.speaker_enabled = speaker_enabled
|
||||
self.speaker_sample_rate = speaker_sample_rate
|
||||
|
||||
self._other_participant_has_joined = False
|
||||
self._my_participant_id = None
|
||||
self.send_queue = asyncio.Queue()
|
||||
self.receive_queue = asyncio.Queue()
|
||||
|
||||
self.other_participant_has_joined = False
|
||||
self.my_participant_id = None
|
||||
|
||||
self.camera_thread = None
|
||||
self.frame_consumer_thread = None
|
||||
|
||||
self.transcription_settings = {
|
||||
"language": "en",
|
||||
@@ -111,44 +98,46 @@ class DailyTransportService(BaseTransportService, EventHandler):
|
||||
},
|
||||
}
|
||||
|
||||
self._logger: logging.Logger = logging.getLogger("dailyai")
|
||||
self.logger: logging.Logger = logging.getLogger("dailyai")
|
||||
|
||||
self._event_handlers = {}
|
||||
self.event_handlers = {}
|
||||
|
||||
def _patch_method(self, event_name, *args, **kwargs):
|
||||
try:
|
||||
for handler in self._event_handlers[event_name]:
|
||||
self.loop = asyncio.get_running_loop()
|
||||
except RuntimeError:
|
||||
self.loop = None
|
||||
|
||||
def patch_method(self, event_name, *args, **kwargs):
|
||||
try:
|
||||
for handler in self.event_handlers[event_name]:
|
||||
if inspect.iscoroutinefunction(handler):
|
||||
if self._loop:
|
||||
asyncio.run_coroutine_threadsafe(
|
||||
handler(*args, **kwargs), self._loop)
|
||||
if self.loop:
|
||||
asyncio.run_coroutine_threadsafe(handler(*args, **kwargs), self.loop)
|
||||
else:
|
||||
raise Exception(
|
||||
"No event loop to run coroutine. In order to use async event handlers, you must run the DailyTransportService in an asyncio event loop.")
|
||||
else:
|
||||
handler(*args, **kwargs)
|
||||
except Exception as e:
|
||||
self._logger.error(f"Exception in event handler {event_name}: {e}")
|
||||
self.logger.error(f"Exception in event handler {event_name}: {e}")
|
||||
raise e
|
||||
|
||||
def add_event_handler(self, event_name: str, handler):
|
||||
if not event_name.startswith("on_"):
|
||||
raise Exception(
|
||||
f"Event handler {event_name} must start with 'on_'")
|
||||
raise Exception(f"Event handler {event_name} must start with 'on_'")
|
||||
|
||||
methods = inspect.getmembers(self, predicate=inspect.ismethod)
|
||||
if event_name not in [method[0] for method in methods]:
|
||||
raise Exception(f"Event handler {event_name} not found")
|
||||
|
||||
if event_name not in self._event_handlers:
|
||||
self._event_handlers[event_name] = [
|
||||
if event_name not in self.event_handlers:
|
||||
self.event_handlers[event_name] = [
|
||||
getattr(
|
||||
self, event_name), types.MethodType(
|
||||
handler, self)]
|
||||
setattr(self, event_name, partial(self._patch_method, event_name))
|
||||
setattr(self, event_name, partial(self.patch_method, event_name))
|
||||
else:
|
||||
self._event_handlers[event_name].append(
|
||||
types.MethodType(handler, self))
|
||||
self.event_handlers[event_name].append(types.MethodType(handler, self))
|
||||
|
||||
def event_handler(self, event_name: str):
|
||||
def decorator(handler):
|
||||
@@ -157,17 +146,7 @@ class DailyTransportService(BaseTransportService, EventHandler):
|
||||
|
||||
return decorator
|
||||
|
||||
def write_frame_to_camera(self, frame: bytes):
|
||||
self.camera.write_frame(frame)
|
||||
|
||||
def write_frame_to_mic(self, frame: bytes):
|
||||
self.mic.write_frames(frame)
|
||||
|
||||
def read_audio_frames(self, desired_frame_count):
|
||||
bytes = self._speaker.read_frames(desired_frame_count)
|
||||
return bytes
|
||||
|
||||
def _prerun(self):
|
||||
def configure_daily(self):
|
||||
# Only initialize Daily once
|
||||
if not DailyTransportService._daily_initialized:
|
||||
with DailyTransportService._lock:
|
||||
@@ -175,26 +154,34 @@ class DailyTransportService(BaseTransportService, EventHandler):
|
||||
DailyTransportService._daily_initialized = True
|
||||
self.client = CallClient(event_handler=self)
|
||||
|
||||
if self._mic_enabled:
|
||||
if self.mic_enabled:
|
||||
self.mic: VirtualMicrophoneDevice = Daily.create_microphone_device(
|
||||
"mic", sample_rate=self._mic_sample_rate, channels=1
|
||||
"mic", sample_rate=self.mic_sample_rate, channels=1
|
||||
)
|
||||
|
||||
if self._camera_enabled:
|
||||
if self.camera_enabled:
|
||||
self.camera: VirtualCameraDevice = Daily.create_camera_device(
|
||||
"camera", width=self._camera_width, height=self._camera_height, color_format="RGB"
|
||||
"camera", width=self.camera_width, height=self.camera_height, color_format="RGB"
|
||||
)
|
||||
|
||||
if self._speaker_enabled:
|
||||
self._speaker: VirtualSpeakerDevice = Daily.create_speaker_device(
|
||||
"speaker", sample_rate=self._speaker_sample_rate, channels=1
|
||||
if self.speaker_enabled:
|
||||
self.speaker: VirtualSpeakerDevice = Daily.create_speaker_device(
|
||||
"speaker", sample_rate=self.speaker_sample_rate, channels=1
|
||||
)
|
||||
Daily.select_speaker_device("speaker")
|
||||
|
||||
self.client.set_user_name(self._bot_name)
|
||||
self.client.join(self._room_url, self._token,
|
||||
completion=self.call_joined)
|
||||
self._my_participant_id = self.client.participants()["local"]["id"]
|
||||
self.image: bytes | None = None
|
||||
self.images: list[bytes] | None = None
|
||||
self.camera_thread = Thread(target=self.run_camera, daemon=True)
|
||||
self.camera_thread.start()
|
||||
|
||||
self.logger.info("Starting frame consumer thread")
|
||||
self.frame_consumer_thread = Thread(target=self.frame_consumer, daemon=True)
|
||||
self.frame_consumer_thread.start()
|
||||
|
||||
self.client.set_user_name(self.bot_name)
|
||||
self.client.join(self.room_url, self.token, completion=self.call_joined)
|
||||
self.my_participant_id = self.client.participants()["local"]["id"]
|
||||
|
||||
self.client.update_inputs(
|
||||
{
|
||||
@@ -235,82 +222,118 @@ class DailyTransportService(BaseTransportService, EventHandler):
|
||||
}
|
||||
)
|
||||
|
||||
if self._token and self._start_transcription:
|
||||
if self.token and self.start_transcription:
|
||||
self.client.start_transcription(self.transcription_settings)
|
||||
|
||||
self.original_sigint_handler = signal.getsignal(signal.SIGINT)
|
||||
signal.signal(signal.SIGINT, self.process_interrupt_handler)
|
||||
def _receive_audio(self):
|
||||
"""Receive audio from the Daily call and put it on the receive queue"""
|
||||
seconds = 1
|
||||
desired_frame_count = self.speaker_sample_rate * seconds
|
||||
while True:
|
||||
buffer = self.speaker.read_frames(desired_frame_count)
|
||||
if len(buffer) > 0:
|
||||
frame = AudioQueueFrame(buffer)
|
||||
if self.loop:
|
||||
asyncio.run_coroutine_threadsafe(self.receive_queue.put(frame), self.loop)
|
||||
|
||||
def process_interrupt_handler(self, signum, frame):
|
||||
self._post_run()
|
||||
if callable(self.original_sigint_handler):
|
||||
self.original_sigint_handler(signum, frame)
|
||||
def interrupt(self):
|
||||
self.is_interrupted.set()
|
||||
|
||||
def _post_run(self):
|
||||
self.client.leave()
|
||||
async def get_receive_frames(self) -> AsyncGenerator[QueueFrame, None]:
|
||||
while True:
|
||||
frame = await self.receive_queue.get()
|
||||
yield frame
|
||||
if isinstance(frame, EndStreamQueueFrame):
|
||||
break
|
||||
|
||||
def get_async_send_queue(self):
|
||||
return self.send_queue
|
||||
|
||||
async def marshal_frames(self):
|
||||
while True:
|
||||
frame: QueueFrame | list = await self.send_queue.get()
|
||||
self.threadsafe_send_queue.put(frame)
|
||||
self.send_queue.task_done()
|
||||
if isinstance(frame, EndStreamQueueFrame):
|
||||
break
|
||||
|
||||
async def wait_for_send_queue_to_empty(self):
|
||||
await self.send_queue.join()
|
||||
self.threadsafe_send_queue.join()
|
||||
|
||||
async def stop_when_done(self):
|
||||
await self.wait_for_send_queue_to_empty()
|
||||
self.stop()
|
||||
|
||||
async def run(self) -> None:
|
||||
self.configure_daily()
|
||||
|
||||
self.do_shutdown = False
|
||||
|
||||
async_output_queue_marshal_task = asyncio.create_task(self.marshal_frames())
|
||||
|
||||
try:
|
||||
participant_count: int = len(self.client.participants())
|
||||
self.logger.info(f"{participant_count} participants in room")
|
||||
while time.time() < self.expiration and not self.do_shutdown and not self.stop_threads.is_set():
|
||||
await asyncio.sleep(1)
|
||||
except Exception as e:
|
||||
self.logger.error(f"Exception {e}")
|
||||
raise e
|
||||
finally:
|
||||
self.client.leave()
|
||||
|
||||
self.stop_threads.set()
|
||||
|
||||
await self.receive_queue.put(EndStreamQueueFrame())
|
||||
await self.send_queue.put(EndStreamQueueFrame())
|
||||
await async_output_queue_marshal_task
|
||||
|
||||
if self.camera_thread and self.camera_thread.is_alive():
|
||||
self.camera_thread.join()
|
||||
if self.frame_consumer_thread and self.frame_consumer_thread.is_alive():
|
||||
self.frame_consumer_thread.join()
|
||||
|
||||
def stop(self):
|
||||
self.stop_threads.set()
|
||||
|
||||
def on_first_other_participant_joined(self):
|
||||
pass
|
||||
|
||||
def call_joined(self, join_data, client_error):
|
||||
self._logger.info(f"Call_joined: {join_data}, {client_error}")
|
||||
|
||||
def dialout(self, number):
|
||||
self.client.start_dialout({"phoneNumber": number})
|
||||
|
||||
def start_recording(self):
|
||||
self.client.start_recording()
|
||||
self.logger.info(f"Call_joined: {join_data}, {client_error}")
|
||||
if self.speaker_enabled:
|
||||
t = Thread(target=self._receive_audio, daemon=True)
|
||||
t.start()
|
||||
|
||||
def on_error(self, error):
|
||||
self._logger.error(f"on_error: {error}")
|
||||
self.logger.error(f"on_error: {error}")
|
||||
|
||||
def on_call_state_updated(self, state):
|
||||
pass
|
||||
|
||||
def on_participant_joined(self, participant):
|
||||
if not self._other_participant_has_joined and participant["id"] != self._my_participant_id:
|
||||
self._other_participant_has_joined = True
|
||||
if not self.other_participant_has_joined and participant["id"] != self.my_participant_id:
|
||||
self.other_participant_has_joined = True
|
||||
self.on_first_other_participant_joined()
|
||||
|
||||
def on_participant_left(self, participant, reason):
|
||||
if len(self.client.participants()) < self._min_others_count + 1:
|
||||
self._stop_threads.set()
|
||||
|
||||
async def insert_speech(self, text, sender, date):
|
||||
await self.receive_queue.put(UserStartedSpeakingFrame())
|
||||
await asyncio.sleep(0.3)
|
||||
|
||||
# frame = TranscriptionQueueFrame(text, sender, date)
|
||||
# await self.receive_queue.put(frame)
|
||||
self.on_transcription_message({
|
||||
"text": text,
|
||||
"participantId": "cb65b845-aac0-4fc8-987d-2e7ce3c7d8f0",
|
||||
"timestamp": date
|
||||
})
|
||||
|
||||
await asyncio.sleep(0.3)
|
||||
await self.receive_queue.put(UserStoppedSpeakingFrame())
|
||||
if len(self.client.participants()) < self.min_others_count + 1:
|
||||
self.do_shutdown = True
|
||||
pass
|
||||
|
||||
def on_app_message(self, message, sender):
|
||||
if self._loop:
|
||||
print("APP MESSAGE", message)
|
||||
asyncio.run_coroutine_threadsafe(
|
||||
self.insert_speech(message["message"], sender, message["date"]), self._loop)
|
||||
pass
|
||||
|
||||
def on_transcription_message(self, message: dict):
|
||||
if self._loop:
|
||||
print(f"transcription: {message}")
|
||||
if self.loop:
|
||||
participantId = ""
|
||||
if "participantId" in message:
|
||||
participantId = message["participantId"]
|
||||
elif "session_id" in message:
|
||||
participantId = message["session_id"]
|
||||
frame = TranscriptionQueueFrame(
|
||||
message["text"], participantId, message["timestamp"])
|
||||
if self._my_participant_id and participantId != self._my_participant_id:
|
||||
self.append_to_context("user", message["text"])
|
||||
asyncio.run_coroutine_threadsafe(
|
||||
self.receive_queue.put(frame), self._loop)
|
||||
frame = TranscriptionQueueFrame(message["text"], participantId, message["timestamp"])
|
||||
asyncio.run_coroutine_threadsafe(self.receive_queue.put(frame), self.loop)
|
||||
|
||||
def on_transcription_stopped(self, stopped_by, stopped_by_error):
    """Transcription-stopped callback; currently a no-op.

    Args:
        stopped_by: Ignored.
        stopped_by_error: Ignored.
    """
    return None
|
||||
@@ -320,3 +343,90 @@ class DailyTransportService(BaseTransportService, EventHandler):
|
||||
|
||||
def on_transcription_started(self, status):
    """Transcription-started callback; currently a no-op.

    Args:
        status: Ignored.
    """
    return None
|
||||
|
||||
def set_image(self, image: bytes):
    """Select a single still image as the camera content.

    Stores `image` on the instance and clears any image sequence, so
    `run_camera` only sees the still image.

    Args:
        image: Frame bytes later handed to the camera device.
    """
    # Drop the sequence first; only one of image / images is kept non-None.
    self.images: list[bytes] | None = None
    self.image: bytes | None = image
|
||||
|
||||
def set_images(self, images: list[bytes], start_frame=0):
    """Select a sequence of images as the camera content.

    Stores the sequence, clears the single still image and resets the
    playback position used by `run_camera`.

    Args:
        images: Frames to cycle through.
        start_frame: Value assigned to `current_frame` (default 0).
    """
    self.current_frame = start_frame
    self.image = None
    self.images: list[bytes] | None = images
|
||||
|
||||
def run_camera(self):
    """Write frames to the camera until `stop_threads` is set.

    Each pass writes the still image (if any) and/or the next frame of the
    image sequence (if any), then sleeps for 1/8 s. The sequence position
    wraps around via modulo. Any exception is logged and re-raised.
    """
    frame_interval = 1.0 / 8  # 8 fps
    try:
        while True:
            if self.stop_threads.is_set():
                break

            if self.image:
                self.camera.write_frame(self.image)

            if self.images:
                position = self.current_frame % len(self.images)
                self.camera.write_frame(self.images[position])
                self.current_frame = position + 1

            time.sleep(frame_interval)
    except Exception as e:
        self.logger.error(f"Exception {e} in camera thread.")
        raise e
|
||||
|
||||
def frame_consumer(self):
    """Drain `threadsafe_send_queue` and route frames to the mic and camera.

    Runs until an EndStreamQueueFrame is dequeued. Queue items may be a
    single QueueFrame or a list of them; any other type raises Exception.

    Audio is buffered and written to the mic only in multiples of
    `smallest_write_size` bytes; the remainder stays buffered until more
    audio arrives or a flush is triggered. While `is_interrupted` is set,
    frames are discarded until a StartStreamQueueFrame clears the flag.

    Unlike the previous version, this no longer keeps a copy of every audio
    chunk in a never-read buffer, which grew without bound for the lifetime
    of the thread.
    """
    self.logger.info("🎬 Starting frame consumer thread")
    b = bytearray()
    smallest_write_size = 3200
    while True:
        try:
            frames_or_frame: QueueFrame | list[QueueFrame] = self.threadsafe_send_queue.get()
            if isinstance(frames_or_frame, QueueFrame):
                frames: list[QueueFrame] = [frames_or_frame]
            elif isinstance(frames_or_frame, list):
                frames = frames_or_frame
            else:
                raise Exception("Unknown type in output queue")

            for frame in frames:
                if isinstance(frame, EndStreamQueueFrame):
                    self.logger.info("Stopping frame consumer thread")
                    self.threadsafe_send_queue.task_done()
                    return

                # if interrupted, we just pull frames off the queue and discard them
                if not self.is_interrupted.is_set():
                    if frame:
                        if isinstance(frame, AudioQueueFrame):
                            chunk = frame.data

                            b.extend(chunk)
                            # Only whole multiples of smallest_write_size are
                            # written; the tail waits for the next chunk.
                            writable = len(b) - (len(b) % smallest_write_size)
                            if writable:
                                self.mic.write_frames(bytes(b[:writable]))
                                b = b[writable:]
                        elif isinstance(frame, ImageQueueFrame):
                            self.set_image(frame.image)
                        elif isinstance(frame, ImageListQueueFrame):
                            self.set_images(frame.images)
                        elif len(b):
                            # Any other frame type flushes buffered audio.
                            self.mic.write_frames(bytes(b))
                            b = bytearray()
                else:
                    # if there are leftover audio bytes, write them now; failing to do so
                    # can cause static in the audio stream.
                    if len(b):
                        self.mic.write_frames(bytes(b))
                        b = bytearray()

                    if isinstance(frame, StartStreamQueueFrame):
                        self.is_interrupted.clear()

            self.threadsafe_send_queue.task_done()
        except Empty:
            # NOTE(review): get() above is called without a timeout, so this
            # handler presumably never runs -- confirm whether a timeout was
            # intended.
            try:
                if len(b):
                    self.mic.write_frames(bytes(b))
            except Exception as e:
                self.logger.error(f"Exception in frame_consumer: {e}, {len(b)}")
                raise e

            b = bytearray()
|
||||
|
||||
@@ -1,36 +0,0 @@
|
||||
import os
|
||||
import aiohttp
|
||||
import requests
|
||||
|
||||
from dailyai.services.ai_services import TTSService
|
||||
|
||||
|
||||
class DeepgramAIService(TTSService):
    """Text-to-speech service that streams audio from Deepgram's beta speak API."""

    def __init__(
        self,
        *,
        aiohttp_session: aiohttp.ClientSession,
        api_key,
        voice,
        sample_rate=16000
    ):
        """Store the connection settings.

        Args:
            aiohttp_session: Session used for the HTTP POST in `run_tts`.
            api_key: Sent in the `authorization` header as `token <key>`.
            voice: Value of the `model` query parameter.
            sample_rate: Value of the `sample_rate` query parameter.
        """
        super().__init__()

        self._api_key = api_key
        self._voice = voice
        self._sample_rate = sample_rate
        self._aiohttp_session = aiohttp_session

    async def run_tts(self, sentence):
        """Yield audio chunks for `sentence` (requested as linear16, no container).

        A non-200 response is logged and yields nothing, so an error body is
        never passed downstream as audio.
        """
        self.logger.info(f"Running deepgram tts for {sentence}")
        base_url = "https://api.beta.deepgram.com/v1/speak"
        request_url = f"{base_url}?model={self._voice}&encoding=linear16&container=none&sample_rate={self._sample_rate}"
        headers = {"authorization": f"token {self._api_key}", "Content-Type": "application/json"}
        data = {"text": sentence}

        async with self._aiohttp_session.post(
            request_url, headers=headers, json=data
        ) as r:
            if r.status != 200:
                self.logger.error(
                    f"audio fetch status code: {r.status}, error: {await r.text()}"
                )
                return

            async for chunk in r.content:
                if chunk:
                    yield chunk
|
||||
@@ -9,12 +9,11 @@ from dailyai.services.ai_services import TTSService
|
||||
|
||||
|
||||
class DeepgramTTSService(TTSService):
|
||||
def __init__(self, *, aiohttp_session, api_key, voice="alpha-asteria-en-v2"):
|
||||
def __init__(self, speech_key=None, voice=None):
|
||||
super().__init__()
|
||||
|
||||
self._voice = voice
|
||||
self._api_key = api_key
|
||||
self._aiohttp_session = aiohttp_session
|
||||
self.voice = voice or os.getenv("DEEPGRAM_VOICE") or "alpha-asteria-en-v2"
|
||||
self.speech_key = speech_key or os.getenv("DEEPGRAM_API_KEY")
|
||||
|
||||
def get_mic_sample_rate(self):
|
||||
return 24000
|
||||
@@ -22,9 +21,10 @@ class DeepgramTTSService(TTSService):
|
||||
async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
|
||||
self.logger.info(f"Running deepgram tts for {sentence}")
|
||||
base_url = "https://api.beta.deepgram.com/v1/speak"
|
||||
request_url = f"{base_url}?model={self._voice}&encoding=linear16&container=none&sample_rate=16000"
|
||||
headers = {"authorization": f"token {self._api_key}"}
|
||||
request_url = f"{base_url}?model={self.voice}&encoding=linear16&container=none&sample_rate=16000"
|
||||
headers = {"authorization": f"token {self.speech_key}"}
|
||||
body = {"text": sentence}
|
||||
async with self._aiohttp_session.post(request_url, headers=headers, json=body) as r:
|
||||
async for data in r.content:
|
||||
yield data
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(request_url, headers=headers, json=body) as r:
|
||||
async for data in r.content:
|
||||
yield data
|
||||
|
||||
@@ -9,37 +9,28 @@ from dailyai.services.ai_services import TTSService
|
||||
|
||||
|
||||
class ElevenLabsTTSService(TTSService):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
aiohttp_session: aiohttp.ClientSession,
|
||||
api_key,
|
||||
voice_id,
|
||||
):
|
||||
def __init__(self, api_key=None, voice_id=None):
|
||||
super().__init__()
|
||||
|
||||
self._api_key = api_key
|
||||
self._voice_id = voice_id
|
||||
self._aiohttp_session = aiohttp_session
|
||||
self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
|
||||
self.voice_id = voice_id or os.getenv("ELEVENLABS_VOICE_ID")
|
||||
|
||||
async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
|
||||
url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._voice_id}/stream"
|
||||
payload = {"text": sentence, "model_id": "eleven_turbo_v2"}
|
||||
querystring = {"output_format": "pcm_16000", "optimize_streaming_latency": 2}
|
||||
headers = {
|
||||
"xi-api-key": self._api_key,
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
async with self._aiohttp_session.post(
|
||||
url, json=payload, headers=headers, params=querystring
|
||||
) as r:
|
||||
if r.status != 200:
|
||||
self.logger.error(
|
||||
f"audio fetch status code: {r.status}, error: {r.text}"
|
||||
)
|
||||
return
|
||||
async with aiohttp.ClientSession() as session:
|
||||
url = f"https://api.elevenlabs.io/v1/text-to-speech/{self.voice_id}/stream"
|
||||
payload = {"text": sentence, "model_id": "eleven_turbo_v2"}
|
||||
querystring = {"output_format": "pcm_16000", "optimize_streaming_latency": 2}
|
||||
headers = {
|
||||
"xi-api-key": self.api_key,
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
async with session.post(url, json=payload, headers=headers, params=querystring) as r:
|
||||
if r.status != 200:
|
||||
self.logger.error(
|
||||
f"audio fetch status code: {r.status}, error: {r.text}"
|
||||
)
|
||||
return
|
||||
|
||||
async for chunk in r.content:
|
||||
if chunk:
|
||||
yield chunk
|
||||
async for chunk in r.content:
|
||||
if chunk:
|
||||
yield chunk
|
||||
|
||||
@@ -2,43 +2,32 @@ import fal
|
||||
import aiohttp
|
||||
import asyncio
|
||||
import io
|
||||
import os
|
||||
import json
|
||||
from PIL import Image
|
||||
|
||||
from dailyai.services.ai_services import ImageGenService
|
||||
|
||||
|
||||
from dailyai.services.ai_services import ImageGenService
|
||||
from dailyai.services.ai_services import LLMService, TTSService, ImageGenService
|
||||
# Fal expects FAL_KEY_ID and FAL_KEY_SECRET to be set in the env
|
||||
|
||||
|
||||
class FalImageGenService(ImageGenService):
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
image_size,
|
||||
aiohttp_session: aiohttp.ClientSession,
|
||||
key_id=None,
|
||||
key_secret=None):
|
||||
def __init__(self, image_size):
|
||||
super().__init__(image_size)
|
||||
self._aiohttp_session = aiohttp_session
|
||||
if key_id:
|
||||
os.environ["FAL_KEY_ID"] = key_id
|
||||
if key_secret:
|
||||
os.environ["FAL_KEY_SECRET"] = key_secret
|
||||
|
||||
async def run_image_gen(self, sentence) -> tuple[str, bytes]:
|
||||
def get_image_url(sentence, size):
|
||||
print("starting fal submit...")
|
||||
handler = fal.apps.submit(
|
||||
"110602490-fast-sdxl",
|
||||
arguments={
|
||||
"prompt": sentence,
|
||||
"seed": 23
|
||||
"prompt": sentence
|
||||
},
|
||||
)
|
||||
print("past fal handler init, about to wait for iter_events...")
|
||||
for event in handler.iter_events():
|
||||
if isinstance(event, fal.apps.InProgress):
|
||||
pass
|
||||
print('Request in progress')
|
||||
print(event.logs)
|
||||
|
||||
result = handler.get()
|
||||
|
||||
@@ -47,9 +36,16 @@ class FalImageGenService(ImageGenService):
|
||||
raise Exception("Image generation failed")
|
||||
|
||||
return image_url
|
||||
print(f"fetching image url...")
|
||||
image_url = await asyncio.to_thread(get_image_url, sentence, self.image_size)
|
||||
print(f"got image url, downloading image...")
|
||||
# Load the image from the url
|
||||
async with self._aiohttp_session.get(image_url) as response:
|
||||
image_stream = io.BytesIO(await response.content.read())
|
||||
image = Image.open(image_stream)
|
||||
return (image_url, image.tobytes())
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(image_url) as response:
|
||||
print("got image response")
|
||||
image_stream = io.BytesIO(await response.content.read())
|
||||
print("read image stream")
|
||||
image = Image.open(image_stream)
|
||||
return (image_url, image.tobytes())
|
||||
|
||||
# return (image_url, dalle_im.tobytes())
|
||||
|
||||
@@ -1,122 +0,0 @@
|
||||
import aiohttp
|
||||
from PIL import Image
|
||||
import io
|
||||
from openai import AsyncOpenAI
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from collections.abc import AsyncGenerator
|
||||
|
||||
from dailyai.services.ai_services import LLMService, ImageGenService
|
||||
|
||||
from dailyai.queue_frame import (TextQueueFrame, TextQueueOutOfBandFrame)
|
||||
|
||||
|
||||
class FireworksLLMService(LLMService):
    """LLM service that calls Fireworks through an AsyncOpenAI client.

    Streaming responses are scanned for a tool call. When one completes, the
    call is recorded in the transport's context and `change_appearance` is
    scheduled with the "appearance" argument of that call.
    """

    def __init__(self, *, api_key, model="", tools=None, context, change_appearance, transport=""):
        """Create the client.

        Args:
            api_key: Fireworks API key passed to AsyncOpenAI.
            model: Model name sent with every completion request.
            tools: Tool definitions sent with completion requests; defaults
                to an empty list (a fresh one per instance).
            context: Forwarded to the base class constructor.
            change_appearance: Coroutine function called with the image
                prompt when a tool call finishes.
            transport: Object whose `append_to_context` records the tool call.
        """
        super().__init__(context)
        self._model = model
        # Avoid sharing one default list object between instances.
        self._tools = [] if tools is None else tools
        self._change_appearance = change_appearance
        self._transport = transport
        # Strong references to in-flight tasks; the event loop itself only
        # keeps weak references to tasks.
        self._pending_tasks = set()
        self._client = AsyncOpenAI(
            api_key=api_key,
            base_url="https://api.fireworks.ai/inference/v1"
        )

    async def get_response(self, messages, stream):
        """Return the raw chat-completion result for `messages`."""
        print("GET RESPONSE ... WHEN DO WE EXPECT THIS TO BE CALLED?")
        return await self._client.chat.completions.create(
            stream=stream,
            messages=messages,
            model=self._model,
            temperature=0.1,
            tools=self._tools
        )

    async def run_llm_async(self, messages) -> AsyncGenerator[str, None]:
        """Stream a completion, yielding text deltas as they arrive.

        If the stream contains a tool call, a TextQueueOutOfBandFrame is
        yielded once the stream reports a finish reason.
        """
        print("IN ASYNC")
        messages_for_log = json.dumps(messages)
        self.logger.debug(f"Generating chat via openai: {messages_for_log}")

        chunks = await self._client.chat.completions.create(
            model=self._model,
            stream=True,  # BLARGH
            messages=messages,
            temperature=0.1,
            tools=self._tools
        )

        # Accumulates the id, name and argument fragments of the first tool call.
        tool_call = {}

        async for chunk in chunks:
            print(f"CHUNK: {chunk}")
            if len(chunk.choices) == 0:
                continue

            choice = chunk.choices[0]
            delta = choice.delta

            if delta.content:
                yield delta.content

            if delta.tool_calls:
                call_delta = delta.tool_calls[0]
                print(f"TOOL CALLS: {call_delta}")
                if call_delta.function.name:
                    tool_call["id"] = call_delta.id
                    tool_call["name"] = call_delta.function.name
                    tool_call["arguments"] = ''
                if call_delta.function.arguments:
                    # .get() tolerates an arguments fragment that arrives
                    # before the function name.
                    tool_call["arguments"] = (
                        tool_call.get("arguments", '') + call_delta.function.arguments
                    )

            if choice.finish_reason:
                print(f"TOOL CALLS ACCUM -- {tool_call}")
                if tool_call.get("name"):
                    # hard coding tool call action for now. we should assemble the tool call
                    # from the streaming response, then yield it to the pipeline.
                    # this approach works for the first few change appearance requests but
                    # then the model starts refusing. need to read more about function
                    # calling, try this with the OpenAI APIs, and talk to the Fireworks people.
                    self._transport.append_to_context("assistant", {
                        # pipeline will append the content to this context after it goes
                        # through tts. we need to manually append the tool call, though
                        "content": "",
                        "role": "assistant",
                        "tool_calls": [
                            {
                                "id": tool_call["id"],
                                "type": "function",
                                "index": 0,
                                "function": {
                                    "name": tool_call["name"],
                                    "arguments": tool_call["arguments"]
                                },
                            }
                        ],
                    })
                    self._transport.append_to_context("tool", {
                        "content": "image generated by prompt arguments: " + tool_call["arguments"],
                        "role": "tool",
                        "tool_call_id": tool_call["id"]
                    })
                    self._transport.append_to_context("assistant", {
                        "content": f"call to {tool_call['name']} function succeeded",
                        "role": "assistant",
                    })
                    print("APPENDED TO CONTEXT")
                    image_prompt = json.loads(
                        tool_call["arguments"]).get("appearance")
                    print("IMAGE PROMPT", image_prompt)
                    task = asyncio.create_task(
                        self._change_appearance(image_prompt))
                    # Hold the task until it finishes so it cannot be
                    # garbage collected mid-run.
                    self._pending_tasks.add(task)
                    task.add_done_callback(self._pending_tasks.discard)
                    yield TextQueueOutOfBandFrame("Sure, let me work on that for you!")

    async def run_llm(self, messages) -> str | None:
        """Return the full completion text for `messages`, or None if there are no choices."""
        print("--> IN SYNC ... WHEN DO WE EXPECT THIS TO BE CALLED?")
        messages_for_log = json.dumps(messages)
        self.logger.debug(f"Generating chat via openai: {messages_for_log}")

        response = await self._client.chat.completions.create(model=self._model, stream=False, messages=messages)
        if response and len(response.choices) > 0:
            return response.choices[0].message.content
        else:
            return None
|
||||
@@ -1,33 +0,0 @@
|
||||
import os
|
||||
import groq
|
||||
from groq import AsyncGroq
|
||||
from dailyai.services.ai_services import LLMService
|
||||
from collections.abc import AsyncGenerator
|
||||
|
||||
|
||||
class GroqLLMService(LLMService):
    """LLM service backed by Groq's async chat-completions client."""

    def __init__(self, *, api_key, model="mixtral-8x7b-32768", context):
        """Create the Groq client.

        Args:
            api_key: Groq API key. It was previously accepted but ignored;
                it is now passed to the client.
            model: Model name sent with each completion request.
            context: Forwarded to the base class constructor.
        """
        super().__init__(context)
        self._model = model
        self._client = AsyncGroq(api_key=api_key)

    async def run_llm_async(self, messages) -> AsyncGenerator[str, None]:
        """Request one (non-streamed) completion and yield its text, if any.

        Groq API errors are printed rather than raised, so a failed request
        yields nothing.
        """
        print(f"messages are {messages}")
        try:
            resp = await self._client.chat.completions.create(messages=messages, model=self._model)
            print(f"got chunks from groq: {resp}")

            if resp.choices[0].message.content:
                yield resp.choices[0].message.content
        except groq.APIConnectionError as e:
            print("The server could not be reached")
            print(e.__cause__)  # an underlying Exception, likely raised within httpx.
        except groq.RateLimitError:
            print("A 429 status code was received; we should back off a bit.")
        except groq.APIStatusError as e:
            print("Another non-200-range status code was received")
            print(e.status_code)
            print(e.response)
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
import array
|
||||
import io
|
||||
import math
|
||||
import time
|
||||
from typing import AsyncGenerator
|
||||
import wave
|
||||
from dailyai.queue_frame import AudioQueueFrame, QueueFrame, TranscriptionQueueFrame
|
||||
from dailyai.queue_frame import AudioQueueFrame, QueueFrame, TextQueueFrame
|
||||
from dailyai.services.ai_services import STTService
|
||||
|
||||
|
||||
@@ -60,7 +59,7 @@ class LocalSTTService(STTService):
|
||||
self._content.seek(0)
|
||||
text = await self.run_stt(self._content)
|
||||
self._new_wave()
|
||||
yield TranscriptionQueueFrame(text, '', str(time.time()))
|
||||
yield TextQueueFrame(text)
|
||||
# If we get this far, this is a frame of silence
|
||||
self._current_silence_frames += 1
|
||||
|
||||
|
||||
@@ -1,76 +0,0 @@
|
||||
import asyncio
|
||||
import time
|
||||
import numpy as np
|
||||
import tkinter as tk
|
||||
import pyaudio
|
||||
|
||||
from dailyai.services.base_transport_service import BaseTransportService
|
||||
|
||||
|
||||
class LocalTransportService(BaseTransportService):
    """Transport that uses local PyAudio streams and a tkinter window.

    Audio sent to the "mic" is written to a PyAudio output stream, "speaker"
    audio is read from a PyAudio input stream, and camera frames are shown
    in a tkinter label.
    """

    def __init__(self, **kwargs):
        """Read local-device settings from `kwargs`.

        Keyword Args:
            sample_width: Bytes per audio sample (default 2).
            n_channels: Number of audio channels (default 1).
            tk_root: tkinter root window; required when the camera is enabled.

        Raises:
            ValueError: If the camera is enabled and no `tk_root` is given.
        """
        super().__init__(**kwargs)
        self._sample_width = kwargs.get("sample_width") or 2
        self._n_channels = kwargs.get("n_channels") or 1
        self._tk_root = kwargs.get("tk_root") or None

        if self._camera_enabled and not self._tk_root:
            raise ValueError("If camera is enabled, a tkinter root must be provided")

        if self._speaker_enabled:
            self._speaker_buffer_pending = bytearray()

    async def _write_frame_to_tkinter(self, frame: bytes):
        """Show `frame` in the image label, wrapped in a P6 (PPM) header."""
        data = f"P6 {self._camera_width} {self._camera_height} 255 ".encode() + frame
        photo = tk.PhotoImage(
            width=self._camera_width,
            height=self._camera_height,
            data=data,
            format="PPM")
        self._image_label.config(image=photo)

        # This holds a reference to the photo, preventing it from being garbage collected.
        self._image_label.image = photo  # type: ignore

    def write_frame_to_camera(self, frame: bytes):
        """Schedule `frame` for display on the event loop; no-op if camera or loop is missing."""
        if self._camera_enabled and self._loop:
            asyncio.run_coroutine_threadsafe(
                self._write_frame_to_tkinter(frame), self._loop
            )

    def write_frame_to_mic(self, frame: bytes):
        """Write `frame` to the PyAudio output stream."""
        self._audio_stream.write(frame)

    def read_frames(self, desired_frame_count):
        """Read and return `desired_frame_count` frames from the input stream."""
        captured = self._speaker_stream.read(
            desired_frame_count,
            exception_on_overflow=False,
        )
        return captured

    def _prerun(self):
        """Open the audio streams and create the camera label, as enabled."""
        # Both the mic and speaker streams need the PyAudio instance, so
        # create it when either is enabled (previously only for the mic).
        if self._mic_enabled or self._speaker_enabled:
            self._pyaudio = pyaudio.PyAudio()

        if self._mic_enabled:
            self._audio_stream = self._pyaudio.open(
                format=self._pyaudio.get_format_from_width(self._sample_width),
                channels=self._n_channels,
                rate=self._speaker_sample_rate,
                output=True,
            )

        if self._camera_enabled:
            # Start with a neutral gray background.
            array = np.ones((1024, 1024, 3)) * 128
            # P6 (RGB) matches the three-channel data; P5 is the grayscale format.
            data = f"P6 {1024} {1024} 255 ".encode() + array.astype(np.uint8).tobytes()
            photo = tk.PhotoImage(width=1024, height=1024, data=data, format="PPM")
            self._image_label = tk.Label(self._tk_root, image=photo)
            # Keep a reference so the placeholder is not garbage collected.
            self._image_label.image = photo  # type: ignore
            self._image_label.pack()

        if self._speaker_enabled:
            self._speaker_stream = self._pyaudio.open(
                format=self._pyaudio.get_format_from_width(self._sample_width),
                channels=self._n_channels,
                rate=self._speaker_sample_rate,
                frames_per_buffer=self._speaker_sample_rate,
                input=True
            )
|
||||
@@ -1,42 +0,0 @@
|
||||
from openai import AsyncOpenAI
|
||||
|
||||
import json
|
||||
from collections.abc import AsyncGenerator
|
||||
|
||||
from dailyai.services.ai_services import LLMService
|
||||
|
||||
|
||||
class OLLamaLLMService(LLMService):
    """LLM service that talks to an Ollama server via the AsyncOpenAI client."""

    def __init__(self, model="llama2", base_url='http://localhost:11434/v1'):
        """Create a client for the server at `base_url` using `model`."""
        super().__init__()
        self._model = model
        self._client = AsyncOpenAI(api_key="ollama", base_url=base_url)

    async def get_response(self, messages, stream):
        """Return the raw chat-completion result for `messages`."""
        completions = self._client.chat.completions
        return await completions.create(
            stream=stream,
            messages=messages,
            model=self._model
        )

    async def run_llm_async(self, messages) -> AsyncGenerator[str, None]:
        """Stream a completion, yielding each non-empty text delta."""
        serialized = json.dumps(messages)
        self.logger.debug(f"Generating chat via openai: {serialized}")

        stream = await self._client.chat.completions.create(model=self._model, stream=True, messages=messages)
        async for part in stream:
            # Chunks without choices carry no text.
            if not part.choices:
                continue

            text = part.choices[0].delta.content
            if text:
                yield text

    async def run_llm(self, messages) -> str | None:
        """Return the full completion text, or None when there are no choices."""
        serialized = json.dumps(messages)
        self.logger.debug(f"Generating chat via openai: {serialized}")

        response = await self._client.chat.completions.create(model=self._model, stream=False, messages=messages)
        if not response or not response.choices:
            return None
        return response.choices[0].message.content
|
||||
@@ -1,33 +1,38 @@
|
||||
import requests
|
||||
import aiohttp
|
||||
import asyncio
|
||||
from PIL import Image
|
||||
import io
|
||||
from openai import AsyncOpenAI
|
||||
|
||||
import os
|
||||
import json
|
||||
from collections.abc import AsyncGenerator
|
||||
|
||||
from dailyai.services.ai_services import LLMService, ImageGenService
|
||||
from dailyai.services.ai_services import AIService, TTSService, LLMService, ImageGenService
|
||||
|
||||
|
||||
class OpenAILLMService(LLMService):
|
||||
def __init__(self, *, api_key, model="gpt-4-turbo-preview", context):
|
||||
super().__init__(context)
|
||||
self._model = model
|
||||
self._client = AsyncOpenAI(api_key=api_key)
|
||||
def __init__(self, api_key=None, model=None):
|
||||
super().__init__()
|
||||
api_key = api_key or os.getenv("OPEN_AI_KEY")
|
||||
self.model = model or os.getenv("OPEN_AI_LLM_MODEL") or "gpt-4"
|
||||
self.client = AsyncOpenAI(api_key=api_key)
|
||||
|
||||
async def get_response(self, messages, stream):
|
||||
return await self._client.chat.completions.create(
|
||||
return await self.client.chat.completions.create(
|
||||
stream=stream,
|
||||
messages=messages,
|
||||
model=self._model
|
||||
model=self.model
|
||||
)
|
||||
|
||||
async def run_llm_async(self, messages) -> AsyncGenerator[str, None]:
|
||||
messages_for_log = json.dumps(messages)
|
||||
self.logger.debug(f"Generating chat via openai: {messages_for_log}")
|
||||
|
||||
chunks = await self._client.chat.completions.create(model=self._model, stream=True, messages=messages)
|
||||
async for chunk in chunks:
|
||||
response = await self.get_response(messages, stream=True)
|
||||
|
||||
for chunk in response:
|
||||
if len(chunk.choices) == 0:
|
||||
continue
|
||||
|
||||
@@ -38,7 +43,7 @@ class OpenAILLMService(LLMService):
|
||||
messages_for_log = json.dumps(messages)
|
||||
self.logger.debug(f"Generating chat via openai: {messages_for_log}")
|
||||
|
||||
response = await self._client.chat.completions.create(model=self._model, stream=False, messages=messages)
|
||||
response = await self.get_response(messages, stream=False)
|
||||
if response and len(response.choices) > 0:
|
||||
return response.choices[0].message.content
|
||||
else:
|
||||
@@ -46,27 +51,18 @@ class OpenAILLMService(LLMService):
|
||||
|
||||
|
||||
class OpenAIImageGenService(ImageGenService):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
image_size: str,
|
||||
aiohttp_session: aiohttp.ClientSession,
|
||||
api_key,
|
||||
model="dall-e-3",
|
||||
):
|
||||
def __init__(self, image_size: str, api_key=None, model=None):
|
||||
super().__init__(image_size=image_size)
|
||||
self._model = model
|
||||
print(f"api key: {api_key}")
|
||||
self._client = AsyncOpenAI(api_key=api_key)
|
||||
self._aiohttp_session = aiohttp_session
|
||||
api_key = api_key or os.getenv("OPEN_AI_KEY")
|
||||
self.model = model or os.getenv("OPEN_AI_IMAGE_MODEL") or "dall-e-3"
|
||||
self.client = AsyncOpenAI(api_key=api_key)
|
||||
|
||||
async def run_image_gen(self, sentence) -> tuple[str, bytes]:
|
||||
self.logger.info("Generating OpenAI image", sentence)
|
||||
|
||||
image = await self._client.images.generate(
|
||||
image = await self.client.images.generate(
|
||||
prompt=sentence,
|
||||
model=self._model,
|
||||
model=self.model,
|
||||
n=1,
|
||||
size=self.image_size
|
||||
)
|
||||
@@ -75,7 +71,10 @@ class OpenAIImageGenService(ImageGenService):
|
||||
raise Exception("No image provided in response", image)
|
||||
|
||||
# Load the image from the url
|
||||
async with self._aiohttp_session.get(image_url) as response:
|
||||
image_stream = io.BytesIO(await response.content.read())
|
||||
image = Image.open(image_stream)
|
||||
return (image_url, image.tobytes())
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(image_url) as response:
|
||||
image_stream = io.BytesIO(await response.content.read())
|
||||
image = Image.open(image_stream)
|
||||
return (image_url, image.tobytes())
|
||||
|
||||
return (image_url, dalle_im.tobytes())
|
||||
|
||||
29
src/dailyai/services/to_be_updated/deepgram_ai_service.py
Normal file
@@ -0,0 +1,29 @@
|
||||
import os
|
||||
import requests
|
||||
|
||||
from services.ai_service import AIService
|
||||
from PIL import Image
|
||||
|
||||
|
||||
class DeepgramAIService(AIService):
    """Blocking Deepgram text-to-speech service configured from the environment."""

    def __init__(self, **kwargs):
        """Forward `kwargs` to the base class and read DEEPGRAM_API_KEY."""
        super().__init__(**kwargs)

        self.api_key = os.getenv("DEEPGRAM_API_KEY")

    def get_mic_sample_rate(self):
        """Return the sample rate reported for this service (24000)."""
        return 24000

    def run_tts(self, sentence):
        """Yield the audio for `sentence` as one bytes object.

        The voice comes from DEEPGRAM_VOICE, falling back to
        "alpha-apollo-en-v1". The response status is logged, not checked.
        """
        self.logger.info(f"Running deepgram tts for {sentence}")
        voice = os.getenv("DEEPGRAM_VOICE") or "alpha-apollo-en-v1"
        endpoint = "https://api.beta.deepgram.com/v1/speak"
        query = f"model={voice}&encoding=linear16&container=none"
        auth_headers = {"authorization": f"token {self.api_key}"}

        r = requests.post(f"{endpoint}?{query}", headers=auth_headers, data=sentence)
        self.logger.info(
            f"audio fetch status code: {r.status_code}, content length: {len(r.content)}"
        )
        yield r.content
|
||||
@@ -1,40 +1,36 @@
|
||||
import io
|
||||
import os
|
||||
import struct
|
||||
from pyht import Client
|
||||
from dotenv import load_dotenv
|
||||
from pyht.client import TTSOptions
|
||||
from pyht.protos.api_pb2 import Format
|
||||
|
||||
from dailyai.services.ai_services import TTSService
|
||||
from services.ai_service import AIService
|
||||
|
||||
|
||||
class PlayHTAIService(TTSService):
|
||||
class PlayHTAIService(AIService):
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
api_key,
|
||||
user_id,
|
||||
voice_url
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.speech_key = api_key
|
||||
self.user_id = user_id
|
||||
self.speech_key = os.getenv("PLAY_HT_KEY") or ''
|
||||
self.user_id = os.getenv("PLAY_HT_USER_ID") or ''
|
||||
|
||||
self.client = Client(
|
||||
user_id=self.user_id,
|
||||
api_key=self.speech_key,
|
||||
)
|
||||
self.options = TTSOptions(
|
||||
voice=voice_url,
|
||||
voice="s3://voice-cloning-zero-shot/820da3d2-3a3b-42e7-844d-e68db835a206/sarah/manifest.json",
|
||||
sample_rate=16000,
|
||||
quality="higher",
|
||||
format=Format.FORMAT_WAV)
|
||||
|
||||
def __del__(self):
|
||||
def close(self):
|
||||
super().close()
|
||||
self.client.close()
|
||||
|
||||
async def run_tts(self, sentence):
|
||||
def run_tts(self, sentence):
|
||||
b = bytearray()
|
||||
in_header = True
|
||||
for chunk in self.client.tts(sentence, self.options):
|
||||
@@ -46,7 +46,7 @@ class WhisperSTTService(LocalSTTService):
|
||||
compute_type=self._compute_type)
|
||||
self._model = model
|
||||
|
||||
async def run_stt(self, audio: BinaryIO) -> str:
|
||||
async def run_stt(self, audio: BinaryIO = None) -> str:
|
||||
"""Transcribes given audio using Whisper"""
|
||||
segments, _ = await asyncio.to_thread(self._model.transcribe, audio)
|
||||
res: str = ""
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
from re import A
|
||||
import unittest
|
||||
|
||||
from typing import AsyncGenerator, Generator
|
||||
|
||||
@@ -1,81 +0,0 @@
|
||||
import asyncio
|
||||
import unittest
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from dailyai.queue_frame import AudioQueueFrame, ImageQueueFrame
|
||||
|
||||
|
||||
class TestDailyTransport(unittest.IsolatedAsyncioTestCase):
    """Tests for DailyTransportService event handlers and its run loop."""

    async def test_event_handler(self):
        """A synchronous handler registered via the decorator gets invoked."""
        from dailyai.services.daily_transport_service import DailyTransportService

        transport = DailyTransportService("mock.daily.co/mock", "token", "bot")

        was_called = False

        @transport.event_handler("on_first_other_participant_joined")
        def test_event_handler(transport):
            nonlocal was_called
            was_called = True

        transport.on_first_other_participant_joined()

        self.assertTrue(was_called)

    async def test_event_handler_async(self):
        """A coroutine handler registered via the decorator runs to completion."""
        from dailyai.services.daily_transport_service import DailyTransportService

        transport = DailyTransportService("mock.daily.co/mock", "token", "bot")

        event = asyncio.Event()

        @transport.event_handler("on_first_other_participant_joined")
        async def test_event_handler(transport):
            await asyncio.sleep(0.1)
            event.set()

        transport.on_first_other_participant_joined()

        # Give the handler up to one second to set the event.
        await asyncio.wait_for(event.wait(), timeout=1)
        self.assertTrue(event.is_set())

    @patch("dailyai.services.daily_transport_service.CallClient")
    @patch("dailyai.services.daily_transport_service.Daily")
    async def test_run_with_camera_and_mic(self, daily_mock, callclient_mock):
        """Running with mic and camera enabled joins the call and writes to both devices."""
        from dailyai.services.daily_transport_service import DailyTransportService

        room_url = "https://mock.daily.co/mock"
        transport = DailyTransportService(
            room_url,
            "token",
            "bot",
            mic_enabled=True,
            camera_enabled=True,
            duration_minutes=0.01,
        )

        mic = MagicMock()
        camera = MagicMock()
        daily_mock.create_microphone_device.return_value = mic
        daily_mock.create_camera_device.return_value = camera

        async def send_audio_frame():
            await transport.send_queue.put(AudioQueueFrame(bytes([0] * 3300)))

        async def send_video_frame():
            await transport.send_queue.put(ImageQueueFrame(None, b"test"))

        await asyncio.gather(transport.run(), send_audio_frame(), send_video_frame())

        daily_mock.init.assert_called_once_with()
        daily_mock.create_microphone_device.assert_called_once()
        daily_mock.create_camera_device.assert_called_once()

        client = callclient_mock.return_value
        client.set_user_name.assert_called_once_with("bot")
        client.join.assert_called_once_with(
            room_url, "token", completion=transport.call_joined
        )

        camera.write_frame.assert_called_with(b"test")
        mic.write_frames.assert_called()
|
||||
@@ -1,64 +0,0 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.services.playht_ai_service import PlayHTAIService
|
||||
|
||||
from examples.foundational.support.runner import configure
|
||||
|
||||
|
||||
async def main(room_url):
    """Join the Daily room at ``room_url`` and greet participants by name.

    Each time a non-local participant joins, a greeting is synthesised with
    the TTS service and placed on the transport's send queue; the transport is
    then asked to stop once it is done.
    """
    async with aiohttp.ClientSession() as session:
        # create a transport service object using environment variables for
        # the transport service's API key, room url, and any other configuration.
        # services can all define and document the environment variables they use.
        # services all also take an optional config object that is used instead of
        # environment variables.
        #
        # the abstract transport service APIs presumably can map pretty closely
        # to the daily-python basic API
        duration = 5
        transport = DailyTransportService(
            room_url,
            None,
            "Say One Thing",
            duration,
            mic_enabled=True,
        )

        # Disabled alternative (ElevenLabs instead of PlayHT):
        #
        # tts = ElevenLabsTTSService(
        #     aiohttp_session=session,
        #     api_key=os.getenv("ELEVENLABS_API_KEY"),
        #     voice_id=os.getenv("ELEVENLABS_VOICE_ID"))
        tts = PlayHTAIService(
            api_key=os.environ.get("PLAY_HT_API_KEY"),
            user_id=os.environ.get("PLAY_HT_USER_ID"),
            voice_url=os.environ.get("PLAY_HT_VOICE_URL"),
        )

        # Register an event handler so we can play the audio when the participant joins.
        @transport.event_handler("on_participant_joined")
        async def on_participant_joined(transport, participant):
            # The bot's own join also fires this event; ignore it.
            if participant["info"]["isLocal"]:
                return

            greeting = "Hello there, " + participant["info"]["userName"] + "!"
            await tts.say(greeting, transport.send_queue)

            # wait for the output queue to be empty, then leave the meeting
            await transport.stop_when_done()

        await transport.run()
        del tts


if __name__ == "__main__":
    url, token = configure()
    asyncio.run(main(url))
|
||||
@@ -1,34 +0,0 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.services.local_transport_service import LocalTransportService
|
||||
|
||||
|
||||
async def main():
    """Speak one sentence through the local transport, then shut down.

    The transport and the speaking coroutine run concurrently; the speaker
    waits one second, queues the synthesised audio and then asks the transport
    to stop when it is done.
    """
    async with aiohttp.ClientSession() as http_session:
        transport = LocalTransportService(
            duration_minutes=1,
            mic_enabled=True,
        )
        tts = ElevenLabsTTSService(
            aiohttp_session=http_session,
            api_key=os.environ.get("ELEVENLABS_API_KEY"),
            voice_id=os.environ.get("ELEVENLABS_VOICE_ID"),
        )

        async def say_something():
            # Pause briefly before queueing the speech.
            await asyncio.sleep(1)
            await tts.say("Hello there.", transport.send_queue)
            await transport.stop_when_done()

        await asyncio.gather(transport.run(), say_something())


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -1,60 +0,0 @@
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
import aiohttp
|
||||
|
||||
from dailyai.queue_frame import LLMMessagesQueueFrame
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.services.deepgram_ai_services import DeepgramTTSService
|
||||
from dailyai.services.open_ai_services import OpenAILLMService
|
||||
from examples.foundational.support.runner import configure
|
||||
|
||||
|
||||
async def main(room_url):
    """Have an LLM produce a greeting and speak it into the Daily room.

    The LLM-to-TTS pipeline is wrapped in a task before the transport runs.
    When the first other participant joins, the handler awaits that task and
    then asks the transport to stop when it is done.
    """
    async with aiohttp.ClientSession() as http_session:
        transport = DailyTransportService(
            room_url,
            None,
            "Say One Thing From an LLM",
            duration_minutes=1,
            mic_enabled=True,
            speaker_enabled=True,
        )

        tts = ElevenLabsTTSService(
            aiohttp_session=http_session,
            api_key=os.environ.get("ELEVENLABS_API_KEY"),
            voice_id=os.environ.get("ELEVENLABS_VOICE_ID"),
        )
        # Alternative TTS services:
        # tts = AzureTTSService(api_key=os.getenv("AZURE_SPEECH_API_KEY"), region=os.getenv("AZURE_SPEECH_REGION"))
        # tts = DeepgramTTSService(aiohttp_session=session, api_key=os.getenv("DEEPGRAM_API_KEY"), voice=os.getenv("DEEPGRAM_VOICE"))

        llm = AzureLLMService(
            api_key=os.environ.get("AZURE_CHATGPT_API_KEY"),
            endpoint=os.environ.get("AZURE_CHATGPT_ENDPOINT"),
            model=os.environ.get("AZURE_CHATGPT_MODEL"),
        )
        # Alternative LLM service:
        # llm = OpenAILLMService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))

        prompt = [
            {
                "role": "system",
                "content": "You are an LLM in a WebRTC session, and this is a 'hello world' demo. Say hello to the world.",
            }
        ]

        # Feed the LLM output through TTS and onto the transport's send queue.
        llm_frames = llm.run([LLMMessagesQueueFrame(prompt)])
        tts_task = asyncio.create_task(
            tts.run_to_queue(transport.send_queue, llm_frames)
        )

        @transport.event_handler("on_first_other_participant_joined")
        async def on_first_other_participant_joined(transport):
            await tts_task
            await transport.stop_when_done()

        await transport.run()


if __name__ == "__main__":
    url, token = configure()
    asyncio.run(main(url))
|
||||
@@ -1,53 +0,0 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
|
||||
from dailyai.queue_frame import TextQueueFrame
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.fal_ai_services import FalImageGenService
|
||||
from dailyai.services.open_ai_services import OpenAIImageGenService
|
||||
from dailyai.services.azure_ai_services import AzureImageGenServiceREST
|
||||
|
||||
from examples.foundational.support.runner import configure
|
||||
|
||||
# Module-level flags; nothing in this script reads or updates them.
local_joined = False
participant_joined = False


async def main(room_url):
    """Generate one image and show it as the bot's camera frame in a Daily room.

    Image generation is wrapped in a task before the transport runs; the
    handler for the first other participant awaits that task.
    """
    async with aiohttp.ClientSession() as http_session:
        transport = DailyTransportService(
            room_url,
            None,
            "Show a still frame image",
            duration_minutes=1,
            mic_enabled=False,
            camera_enabled=True,
            camera_width=1024,
            camera_height=1024,
        )

        imagegen = FalImageGenService(
            image_size="1024x1024",
            aiohttp_session=http_session,
            key_id=os.environ.get("FAL_KEY_ID"),
            key_secret=os.environ.get("FAL_KEY_SECRET"),
        )
        # Alternative image generation services:
        # imagegen = OpenAIImageGenService(aiohttp_session=session, api_key=os.getenv("OPENAI_DALLE_API_KEY"), image_size="1024x1024")
        # imagegen = AzureImageGenServiceREST(image_size="1024x1024", aiohttp_session=session, api_key=os.getenv("AZURE_DALLE_API_KEY"), endpoint=os.getenv("AZURE_DALLE_ENDPOINT"), model=os.getenv("AZURE_DALLE_MODEL"))

        # The generated image goes onto the transport's send queue.
        prompt_frames = [TextQueueFrame("a cat in the style of picasso")]
        image_task = asyncio.create_task(
            imagegen.run_to_queue(transport.send_queue, prompt_frames)
        )

        @transport.event_handler("on_first_other_participant_joined")
        async def on_first_other_participant_joined(transport):
            await image_task

        await transport.run()


if __name__ == "__main__":
    url, token = configure()
    asyncio.run(main(url))
|
||||
@@ -1,50 +0,0 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
|
||||
import tkinter as tk
|
||||
|
||||
from dailyai.queue_frame import TextQueueFrame
|
||||
from dailyai.services.fal_ai_services import FalImageGenService
|
||||
from dailyai.services.local_transport_service import LocalTransportService
|
||||
|
||||
# Module-level flags; nothing in this script reads or updates them.
local_joined = False
participant_joined = False


async def main():
    """Generate one image and display it through the local Tk-backed transport.

    The transport, the image-generation task and a Tk pump loop run
    concurrently until the transport's stop flag is set.
    """
    async with aiohttp.ClientSession() as http_session:
        window = tk.Tk()
        window.title("Calendar")

        transport = LocalTransportService(
            tk_root=window,
            mic_enabled=True,
            camera_enabled=True,
            camera_width=1024,
            camera_height=1024,
            duration_minutes=2,
        )

        imagegen = FalImageGenService(
            image_size="1024x1024",
            aiohttp_session=http_session,
            key_id=os.environ.get("FAL_KEY_ID"),
            key_secret=os.environ.get("FAL_KEY_SECRET"),
        )

        prompt_frames = [TextQueueFrame("a cat in the style of picasso")]
        image_task = asyncio.create_task(
            imagegen.run_to_queue(transport.send_queue, prompt_frames)
        )

        async def run_tk():
            # Pump Tk from the asyncio loop instead of calling mainloop(),
            # yielding between updates so the other coroutines can run.
            stop_flag = transport._stop_threads
            while not stop_flag.is_set():
                window.update()
                window.update_idletasks()
                await asyncio.sleep(0.1)

        await asyncio.gather(transport.run(), image_task, run_tk())


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -1,73 +0,0 @@
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
import aiohttp
|
||||
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
|
||||
from dailyai.queue_frame import EndStreamQueueFrame, LLMMessagesQueueFrame
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
|
||||
from examples.foundational.support.runner import configure
|
||||
|
||||
|
||||
async def main(room_url: str):
    """Speak a fixed introduction followed by an LLM-generated joke.

    The joke is generated and converted to audio into a private buffer queue
    before anyone joins. When the first other participant joins, the fixed
    introduction is spoken, then the buffered frames are forwarded to the
    transport's send queue up to and including the end-of-stream frame.
    """
    async with aiohttp.ClientSession() as http_session:
        transport = DailyTransportService(
            room_url,
            None,
            "Static And Dynamic Speech",
            duration_minutes=1,
            mic_enabled=True,
            mic_sample_rate=16000,
            camera_enabled=False,
        )

        llm = AzureLLMService(
            api_key=os.environ.get("AZURE_CHATGPT_API_KEY"),
            endpoint=os.environ.get("AZURE_CHATGPT_ENDPOINT"),
            model=os.environ.get("AZURE_CHATGPT_MODEL"),
        )
        azure_tts = AzureTTSService(
            api_key=os.environ.get("AZURE_SPEECH_API_KEY"),
            region=os.environ.get("AZURE_SPEECH_REGION"),
        )
        elevenlabs_tts = ElevenLabsTTSService(
            aiohttp_session=http_session,
            api_key=os.environ.get("ELEVENLABS_API_KEY"),
            voice_id=os.environ.get("ELEVENLABS_VOICE_ID"),
        )

        prompt = [{"role": "system", "content": "tell the user a joke about llamas"}]

        # Start a task to run the LLM to create a joke, and convert the LLM output to audio frames. This task
        # will run in parallel with generating and speaking the audio for static text, so there's no delay to
        # speak the LLM response.
        buffer_queue = asyncio.Queue()
        joke_frames = llm.run([LLMMessagesQueueFrame(prompt)])
        llm_response_task = asyncio.create_task(
            elevenlabs_tts.run_to_queue(buffer_queue, joke_frames, True)
        )

        @transport.event_handler("on_first_other_participant_joined")
        async def on_first_other_participant_joined(transport):
            await azure_tts.say(
                "My friend the LLM is now going to tell a joke about llamas.",
                transport.send_queue,
            )

            async def buffer_to_send_queue():
                # Forward buffered frames until the end-of-stream frame has
                # been passed along.
                finished = False
                while not finished:
                    frame = await buffer_queue.get()
                    await transport.send_queue.put(frame)
                    buffer_queue.task_done()
                    finished = isinstance(frame, EndStreamQueueFrame)

            await asyncio.gather(llm_response_task, buffer_to_send_queue())

            await transport.stop_when_done()

        await transport.run()


if __name__ == "__main__":
    url, token = configure()
    asyncio.run(main(url))
|
||||
@@ -1,134 +0,0 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
|
||||
from dailyai.queue_frame import AudioQueueFrame, ImageQueueFrame
|
||||
from dailyai.services.azure_ai_services import AzureLLMService, AzureImageGenServiceREST, AzureTTSService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.fal_ai_services import FalImageGenService
|
||||
from dailyai.services.open_ai_services import OpenAIImageGenService
|
||||
|
||||
from examples.foundational.support.runner import configure
|
||||
|
||||
|
||||
async def main(room_url):
    """Narrate a calendar: one generated image plus spoken description per month.

    A task per month asks the LLM for an image description, then produces the
    image and the speech audio for it concurrently. When the first other
    participant joins, completed months are queued on the transport (image and
    audio together) in completion order, after which the transport is asked to
    stop when it is done.
    """
    async with aiohttp.ClientSession() as session:
        meeting_duration_minutes = 5
        transport = DailyTransportService(
            room_url,
            None,
            "Month Narration Bot",
            duration_minutes=meeting_duration_minutes,
            mic_enabled=True,
            camera_enabled=True,
            mic_sample_rate=16000,
            camera_width=1024,
            camera_height=1024,
        )

        llm = AzureLLMService(
            api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
            endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
            model=os.getenv("AZURE_CHATGPT_MODEL"),
        )
        tts = ElevenLabsTTSService(
            aiohttp_session=session,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            voice_id="ErXwobaYiN019PkySvjV",
        )
        # tts = AzureTTSService(api_key=os.getenv("AZURE_SPEECH_API_KEY"), region=os.getenv("AZURE_SPEECH_REGION"))

        dalle = FalImageGenService(
            image_size="1024x1024",
            aiohttp_session=session,
            key_id=os.getenv("FAL_KEY_ID"),
            key_secret=os.getenv("FAL_KEY_SECRET"),
        )
        # dalle = OpenAIImageGenService(aiohttp_session=session, api_key=os.getenv("OPENAI_DALLE_API_KEY"), image_size="1024x1024")
        # dalle = AzureImageGenServiceREST(image_size="1024x1024", aiohttp_session=session, api_key=os.getenv("AZURE_DALLE_API_KEY"), endpoint=os.getenv("AZURE_DALLE_ENDPOINT"), model=os.getenv("AZURE_DALLE_MODEL"))

        # Get a complete audio chunk from the given text. Splitting this into its own
        # coroutine lets us ensure proper ordering of the audio chunks on the send queue.
        async def get_all_audio(text):
            all_audio = bytearray()
            async for audio in tts.run_tts(text):
                all_audio.extend(audio)

            return all_audio

        async def get_month_data(month):
            """Return the description, image and audio for ``month``.

            Returns None when the LLM produces no description.
            """
            messages = [
                {
                    "role": "system",
                    "content": f"Describe a nature photograph suitable for use in a calendar, for the month of {month}. Include only the image description with no preamble. Limit the description to one sentence, please.",
                }
            ]

            image_description = await llm.run_llm(messages)
            if not image_description:
                return None

            to_speak = f"{month}: {image_description}"
            audio_task = asyncio.create_task(get_all_audio(to_speak))
            image_task = asyncio.create_task(dalle.run_image_gen(image_description))
            print(f"about to gather tasks for {month}")
            (audio, image_data) = await asyncio.gather(audio_task, image_task)
            print(f"about to return from get_month_data for {month}")
            return {
                "month": month,
                "text": image_description,
                "image_url": image_data[0],
                "image": image_data[1],
                "audio": audio,
            }

        # Only the first half of the year is generated; July through December
        # are currently left out.
        months: list[str] = [
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
        ]

        @transport.event_handler("on_first_other_participant_joined")
        async def on_first_other_participant_joined(transport):
            # This will play the months in the order they're completed. The benefit
            # is we'll have as little delay as possible before the first month, and
            # likely no delay between months, but the months won't display in order.
            for month_data_task in asyncio.as_completed(month_tasks):
                print(f"month_data_task: {month_data_task}")
                try:
                    data = await month_data_task
                except Exception:
                    print("OMG EXCEPTION!!!!")
                    # Skip the failed month. Falling through would read `data`
                    # while it is unbound (first iteration) or still holding
                    # the previous month, which would then be queued twice.
                    continue

                if data:
                    await transport.send_queue.put(
                        [
                            ImageQueueFrame(data["image_url"], data["image"]),
                            AudioQueueFrame(data["audio"]),
                        ]
                    )

            # wait for the output queue to be empty, then leave the meeting
            await transport.stop_when_done()

        # Created after the handler is defined; the handler only looks the
        # name up when it runs, which happens during transport.run().
        month_tasks = [asyncio.create_task(get_month_data(month)) for month in months]

        await transport.run()


if __name__ == "__main__":
    (url, token) = configure()
    asyncio.run(main(url))
|
||||
@@ -1,134 +0,0 @@
|
||||
import aiohttp
|
||||
import argparse
|
||||
import asyncio
|
||||
import tkinter as tk
|
||||
import os
|
||||
|
||||
from dailyai.queue_frame import AudioQueueFrame, ImageQueueFrame
|
||||
from dailyai.services.azure_ai_services import AzureLLMService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.services.fal_ai_services import FalImageGenService
|
||||
from dailyai.services.local_transport_service import LocalTransportService
|
||||
|
||||
|
||||
async def main(room_url):
    """Narrate a twelve-month calendar through the local Tk-backed transport.

    A task per month obtains an image description from the LLM and then the
    matching image and speech audio. Completed months are queued on the
    transport in completion order while a Tk pump loop keeps the window alive.
    ``room_url`` is accepted but not used by this local variant.
    """
    async with aiohttp.ClientSession() as http_session:
        window = tk.Tk()
        window.title("Calendar")

        transport = LocalTransportService(
            mic_enabled=True,
            camera_enabled=True,
            camera_width=1024,
            camera_height=1024,
            duration_minutes=5,
            tk_root=window,
        )

        llm = AzureLLMService(
            api_key=os.environ.get("AZURE_CHATGPT_API_KEY"),
            endpoint=os.environ.get("AZURE_CHATGPT_ENDPOINT"),
            model=os.environ.get("AZURE_CHATGPT_MODEL"),
        )
        tts = ElevenLabsTTSService(
            aiohttp_session=http_session,
            api_key=os.environ.get("ELEVENLABS_API_KEY"),
            voice_id="ErXwobaYiN019PkySvjV",
        )
        dalle = FalImageGenService(
            image_size="1024x1024",
            aiohttp_session=http_session,
            key_id=os.environ.get("FAL_KEY_ID"),
            key_secret=os.environ.get("FAL_KEY_SECRET"),
        )

        # Get a complete audio chunk from the given text. Splitting this into its own
        # coroutine lets us ensure proper ordering of the audio chunks on the send queue.
        async def get_all_audio(text):
            collected = bytearray()
            async for chunk in tts.run_tts(text):
                collected.extend(chunk)
            return collected

        async def get_month_data(month):
            # Ask the LLM for a one-sentence image description for this month.
            prompt = [
                {
                    "role": "system",
                    "content": f"Describe a nature photograph suitable for use in a calendar, for the month of {month}. Include only the image description with no preamble. Limit the description to one sentence, please.",
                }
            ]
            image_description = await llm.run_llm(prompt)
            if not image_description:
                return

            # Produce the narration audio and the image concurrently.
            narration = f"{month}: {image_description}"
            audio_task = asyncio.create_task(get_all_audio(narration))
            image_task = asyncio.create_task(dalle.run_image_gen(image_description))
            audio, image_data = await asyncio.gather(audio_task, image_task)

            return {
                "month": month,
                "text": image_description,
                "image_url": image_data[0],
                "image": image_data[1],
                "audio": audio,
            }

        months: list[str] = [
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
            "July",
            "August",
            "September",
            "October",
            "November",
            "December",
        ]

        async def show_images():
            # This will play the months in the order they're completed. The benefit
            # is we'll have as little delay as possible before the first month, and
            # likely no delay between months, but the months won't display in order.
            for finished in asyncio.as_completed(month_tasks):
                data = await finished
                if not data:
                    continue
                frames = [
                    ImageQueueFrame(data["image_url"], data["image"]),
                    AudioQueueFrame(data["audio"]),
                ]
                await transport.send_queue.put(frames)

            # NOTE(review): the source's indentation was lost; this pause is
            # assumed to follow the loop rather than run once per month.
            await asyncio.sleep(25)

            # wait for the output queue to be empty, then leave the meeting
            await transport.stop_when_done()

        async def run_tk():
            # Pump Tk from the asyncio loop, yielding between updates.
            stop_flag = transport._stop_threads
            while not stop_flag.is_set():
                window.update()
                window.update_idletasks()
                await asyncio.sleep(0.1)

        month_tasks = [asyncio.create_task(get_month_data(name)) for name in months]

        await asyncio.gather(transport.run(), show_images(), run_tk())


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
    parser.add_argument(
        "-u",
        "--url",
        type=str,
        required=True,
        help="URL of the Daily room to join",
    )

    args, unknown = parser.parse_known_args()

    asyncio.run(main(args.url))
|
||||
@@ -1,70 +0,0 @@
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
|
||||
from dailyai.queue_aggregators import LLMAssistantContextAggregator, LLMContextAggregator, LLMUserContextAggregator
|
||||
from examples.foundational.support.runner import configure
|
||||
from dailyai.services.ai_services import FrameLogger
|
||||
|
||||
|
||||
async def main(room_url: str, token):
    """Run a voice bot that answers what it hears in the Daily room.

    Received frames are logged, folded into the shared LLM context as user
    input, sent through the LLM, recorded as assistant output, and finally
    converted to speech on the transport's send queue.
    """
    system_prompt = {
        "role": "system",
        "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
    }
    # The same list object is handed to the transport and to both aggregators.
    context = [system_prompt]

    transport = DailyTransportService(
        room_url,
        token,
        "Respond bot",
        duration_minutes=5,
        start_transcription=True,
        mic_enabled=True,
        mic_sample_rate=16000,
        camera_enabled=False,
        speaker_enabled=True,
        context=context,
    )

    llm = AzureLLMService(
        api_key=os.environ.get("AZURE_CHATGPT_API_KEY"),
        endpoint=os.environ.get("AZURE_CHATGPT_ENDPOINT"),
        model=os.environ.get("AZURE_CHATGPT_MODEL"),
    )
    tts = AzureTTSService(
        api_key=os.environ.get("AZURE_SPEECH_API_KEY"),
        region=os.environ.get("AZURE_SPEECH_REGION"),
    )
    fl = FrameLogger("transport")

    @transport.event_handler("on_first_other_participant_joined")
    async def on_first_other_participant_joined(transport):
        await tts.say("Hi, I'm listening!", transport.send_queue)

    async def handle_transcriptions():
        bot_id = transport._my_participant_id
        tma_in = LLMUserContextAggregator(context, bot_id)
        tma_out = LLMAssistantContextAggregator(context, bot_id)

        # Build the pipeline stage by stage, innermost first.
        logged = fl.run(transport.get_receive_frames())
        user_turns = tma_in.run(logged)
        llm_output = llm.run(user_turns)
        assistant_turns = tma_out.run(llm_output)
        await tts.run_to_queue(transport.send_queue, assistant_turns)

    extra_settings = transport.transcription_settings["extra"]
    extra_settings["punctuate"] = True
    extra_settings["endpointing"] = True
    await asyncio.gather(transport.run(), handle_transcriptions())


if __name__ == "__main__":
    url, token = configure()
    asyncio.run(main(url, token))
|
||||
@@ -1,115 +0,0 @@
|
||||
import argparse
|
||||
import asyncio
|
||||
import os
|
||||
from typing import AsyncGenerator
|
||||
import aiohttp
|
||||
import requests
|
||||
import time
|
||||
import urllib.parse
|
||||
|
||||
from PIL import Image
|
||||
from dailyai.queue_frame import ImageQueueFrame, QueueFrame
|
||||
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
|
||||
from dailyai.services.ai_services import AIService
|
||||
from dailyai.queue_aggregators import LLMAssistantContextAggregator, LLMUserContextAggregator
|
||||
from dailyai.services.fal_ai_services import FalImageGenService
|
||||
|
||||
from examples.foundational.support.runner import configure
|
||||
|
||||
|
||||
class ImageSyncAggregator(AIService):
    """Bracket every frame with a "speaking" image before and a "waiting" image after."""

    def __init__(self, speaking_path: str, waiting_path: str):
        """Load both images and keep their raw bytes for reuse.

        Args:
            speaking_path: Path of the image emitted before each frame.
            waiting_path: Path of the image emitted after each frame.
        """
        speaking = Image.open(speaking_path)
        self._speaking_image = speaking
        self._speaking_image_bytes = speaking.tobytes()

        waiting = Image.open(waiting_path)
        self._waiting_image = waiting
        self._waiting_image_bytes = waiting.tobytes()

    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
        """Yield the speaking image, then ``frame`` itself, then the waiting image."""
        before = ImageQueueFrame(None, self._speaking_image_bytes)
        yield before
        yield frame
        after = ImageQueueFrame(None, self._waiting_image_bytes)
        yield after
|
||||
|
||||
|
||||
async def main(room_url: str, token):
    """Run a voice bot whose camera image switches between speaking and waiting.

    Received frames go through the user-context aggregator, the LLM, the
    assistant-context aggregator and the image-sync aggregator before being
    converted to speech on the transport's send queue.
    """
    async with aiohttp.ClientSession() as http_session:
        transport = DailyTransportService(
            room_url,
            token,
            "Respond bot",
            5,
        )
        # Camera and microphone are configured by setting the transport's
        # attributes directly after construction.
        transport._camera_enabled = True
        transport._camera_width = 1024
        transport._camera_height = 1024
        transport._mic_enabled = True
        transport._mic_sample_rate = 16000

        llm = AzureLLMService(
            api_key=os.environ.get("AZURE_CHATGPT_API_KEY"),
            endpoint=os.environ.get("AZURE_CHATGPT_ENDPOINT"),
            model=os.environ.get("AZURE_CHATGPT_MODEL"),
        )
        tts = AzureTTSService(
            api_key=os.environ.get("AZURE_SPEECH_API_KEY"),
            region=os.environ.get("AZURE_SPEECH_REGION"),
        )
        img = FalImageGenService(
            image_size="1024x1024",
            aiohttp_session=http_session,
            key_id=os.environ.get("FAL_KEY_ID"),
            key_secret=os.environ.get("FAL_KEY_SECRET"),
        )

        # Not called anywhere in this function; the pipeline below loads its
        # images from the assets directory instead.
        async def get_images():
            speaking_task = asyncio.create_task(
                img.run_image_gen("An image of a cat speaking")
            )
            waiting_task = asyncio.create_task(
                img.run_image_gen("An image of a cat waiting")
            )
            speaking_data, waiting_data = await asyncio.gather(
                speaking_task, waiting_task
            )
            return speaking_data, waiting_data

        @transport.event_handler("on_first_other_participant_joined")
        async def on_first_other_participant_joined(transport):
            await tts.say("Hi, I'm listening!", transport.send_queue)

        async def handle_transcriptions():
            messages = [
                {
                    "role": "system",
                    "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
                },
            ]

            bot_id = transport._my_participant_id
            tma_in = LLMUserContextAggregator(messages, bot_id)
            tma_out = LLMAssistantContextAggregator(messages, bot_id)

            assets_dir = os.path.join(os.path.dirname(__file__), "assets")
            image_sync_aggregator = ImageSyncAggregator(
                os.path.join(assets_dir, "speaking.png"),
                os.path.join(assets_dir, "waiting.png"),
            )

            # Build the pipeline stage by stage, innermost first.
            user_turns = tma_in.run(transport.get_receive_frames())
            llm_output = llm.run(user_turns)
            assistant_turns = tma_out.run(llm_output)
            with_images = image_sync_aggregator.run(assistant_turns)
            await tts.run_to_queue(transport.send_queue, with_images)

        transport.transcription_settings["extra"]["punctuate"] = True
        await asyncio.gather(transport.run(), handle_transcriptions())


if __name__ == "__main__":
    url, token = configure()
    asyncio.run(main(url, token))
|
||||
@@ -1,83 +0,0 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
from dailyai.conversation_wrappers import InterruptibleConversationWrapper
|
||||
|
||||
from dailyai.queue_frame import StartStreamQueueFrame, TextQueueFrame
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.services.open_ai_services import OpenAILLMService
|
||||
from dailyai.services.deepgram_ai_services import DeepgramTTSService
|
||||
from dailyai.services.ai_services import FrameLogger
|
||||
from dailyai.services.groq_ai_services import GroqLLMService
|
||||
|
||||
from examples.foundational.support.runner import configure
|
||||
|
||||
|
||||
async def main(room_url: str, token):
    """Run a voice bot driven by the transport's ``run_conversation`` loop.

    ``run_response`` is handed to ``transport.run_conversation``; for each
    incoming frame it logs the frame, sends it to the LLM and queues the
    spoken reply on the transport.
    """
    async with aiohttp.ClientSession() as http_session:
        context = [
            {
                "role": "system",
                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
            },
        ]
        transport = DailyTransportService(
            room_url,
            token,
            "Respond bot",
            duration_minutes=5,
            start_transcription=True,
            mic_enabled=True,
            mic_sample_rate=16000,
            camera_enabled=False,
            # TODO-CB: Should this be VAD enabled or something?
            speaker_enabled=True,
            context=context,
        )

        # Alternative LLM services:
        # llm = AzureLLMService(
        #     api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
        #     endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
        #     model=os.getenv("AZURE_CHATGPT_MODEL"),
        #     context=context)
        # llm = GroqLLMService(api_key=os.getenv("GROQ_API_KEY"), context=context)
        llm = OpenAILLMService(
            context=context,
            api_key=os.environ.get("OPENAI_CHATGPT_API_KEY"),
        )

        # Alternative TTS services:
        # tts = AzureTTSService(
        #     api_key=os.getenv("AZURE_SPEECH_API_KEY"),
        #     region=os.getenv("AZURE_SPEECH_REGION"))
        # tts = DeepgramTTSService(aiohttp_session=session, api_key=os.getenv("DEEPGRAM_API_KEY"), voice=os.getenv("DEEPGRAM_VOICE"))
        tts = ElevenLabsTTSService(
            aiohttp_session=http_session,
            api_key=os.environ.get("ELEVENLABS_API_KEY"),
            voice_id=os.environ.get("ELEVENLABS_VOICE_ID"),
        )
        fl = FrameLogger("just outside the innermost layer")

        async def run_response(in_frame):
            # The tma_in / tma_out aggregator stages are disabled here, so the
            # logged frames go straight to the LLM and its output straight to TTS.
            source_frames = [StartStreamQueueFrame(), in_frame]
            llm_output = llm.run(fl.run(source_frames))
            await tts.run_to_queue(transport.send_queue, llm_output)

        @transport.event_handler("on_first_other_participant_joined")
        async def on_first_other_participant_joined(transport):
            await tts.say("Hi, I'm listening!", transport.send_queue)

        extra_settings = transport.transcription_settings["extra"]
        extra_settings["endpointing"] = True
        extra_settings["punctuate"] = True
        await asyncio.gather(
            transport.run(),
            transport.run_conversation(run_response),
        )


if __name__ == "__main__":
    url, token = configure()
    asyncio.run(main(url, token))
|
||||
@@ -1,71 +0,0 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
from dailyai.conversation_wrappers import InterruptibleConversationWrapper
|
||||
|
||||
from dailyai.queue_frame import StartStreamQueueFrame, TextQueueFrame
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
|
||||
from examples.foundational.support.runner import configure
|
||||
|
||||
|
||||
async def main(room_url: str, token):
    """Run an interruptible LLM voice bot in a Daily room.

    An InterruptibleConversationWrapper is handed the transport's receive
    frames and calls ``run_response`` for each piece of user speech; the
    response is produced by the LLM and spoken through Azure TTS.

    Args:
        room_url: URL of the Daily room, passed to DailyTransportService.
        token: Meeting token, passed to DailyTransportService.
    """
    async with aiohttp.ClientSession() as session:
        transport = DailyTransportService(
            room_url,
            token,
            "Respond bot",
            duration_minutes=5,
            start_transcription=True,
            mic_enabled=True,
            mic_sample_rate=16000,
            camera_enabled=False,
        )

        llm = AzureLLMService(
            api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
            endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
            model=os.getenv("AZURE_CHATGPT_MODEL"),
        )
        tts = AzureTTSService(
            api_key=os.getenv("AZURE_SPEECH_API_KEY"),
            region=os.getenv("AZURE_SPEECH_REGION"),
        )

        async def run_response(user_speech, tma_in, tma_out):
            # Build the chain stage by stage: user aggregator -> LLM ->
            # assistant aggregator -> TTS into the transport's send queue.
            opening_frames = [StartStreamQueueFrame(), TextQueueFrame(user_speech)]
            user_stage = tma_in.run(opening_frames)
            llm_stage = llm.run(user_stage)
            assistant_stage = tma_out.run(llm_stage)
            await tts.run_to_queue(transport.send_queue, assistant_stage)

        @transport.event_handler("on_first_other_participant_joined")
        async def on_first_other_participant_joined(transport):
            await tts.say("Hi, I'm listening!", transport.send_queue)

        async def run_conversation():
            system_message = {"role": "system", "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way."}
            conversation = InterruptibleConversationWrapper(
                frame_generator=transport.get_receive_frames,
                runner=run_response,
                interrupt=transport.interrupt,
                my_participant_id=transport._my_participant_id,
                llm_messages=[system_message],
            )
            await conversation.run_conversation()

        transport.transcription_settings["extra"]["punctuate"] = False
        await asyncio.gather(transport.run(), run_conversation())
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Unpack the (url, token) pair returned by configure() and run main()
    # to completion on a fresh event loop.
    url, token = configure()
    asyncio.run(main(url, token))
|
||||
@@ -1,115 +0,0 @@
|
||||
import aiohttp
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.services.fal_ai_services import FalImageGenService
|
||||
from dailyai.queue_frame import AudioQueueFrame, ImageQueueFrame
|
||||
|
||||
from examples.foundational.support.runner import configure
|
||||
|
||||
|
||||
async def main(room_url: str):
    """Stage a debate between two LLM personas in a Daily room.

    For up to 100 rounds each bot takes a turn: its statement is generated
    by the LLM and rendered to audio while an image is generated
    concurrently; the image frame and audio frame are then queued together
    on the transport.

    Args:
        room_url: URL of the Daily room; the transport is created with no
            token (``None``).
    """
    async with aiohttp.ClientSession() as session:
        transport = DailyTransportService(
            room_url,
            None,
            "Respond bot",
            duration_minutes=10,
            mic_enabled=True,
            mic_sample_rate=16000,
            camera_enabled=True,
            camera_width=1024,
            camera_height=1024,
        )

        llm = AzureLLMService(
            api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
            endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
            model=os.getenv("AZURE_CHATGPT_MODEL"),
        )
        tts1 = AzureTTSService(
            api_key=os.getenv("AZURE_SPEECH_API_KEY"),
            region=os.getenv("AZURE_SPEECH_REGION"),
        )
        tts2 = ElevenLabsTTSService(
            aiohttp_session=session,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            voice_id="jBpfuIE2acCO8z3wKNLl",
        )
        dalle = FalImageGenService(
            image_size="1024x1024",
            aiohttp_session=session,
            key_id=os.getenv("FAL_KEY_ID"),
            key_secret=os.getenv("FAL_KEY_SECRET"),
        )

        bot1_messages = [
            {"role": "system", "content": "You are a stern librarian. You strongly believe that a hot dog is a sandwich. Start by stating this fact in a few sentences, then be prepared to debate this with the user. You shouldn't ever compromise on the fundamental truth that a hot dog is a sandwich. Your responses should only be a few sentences long."},
        ]
        bot2_messages = [
            {"role": "system", "content": "You are a silly cat, and you strongly believe that a hot dog is not a sandwich. Debate this with the user, only responding with a few sentences. Don't ever accept that a hot dog is a sandwich."},
        ]

        async def take_turn(label, own_history, rival_history, voice):
            # The LLM call is awaited to completion so the back-and-forth
            # stays strictly ordered.
            statement = await llm.run_llm(own_history)
            print(f"{label}_msg: {statement}")
            if not statement:
                # No statement produced: nothing is recorded and the caller
                # receives None instead of audio.
                return None

            # The speaker records its line as "assistant"; the opponent
            # sees the same line as "user" input.
            own_history.append({"role": "assistant", "content": statement})
            rival_history.append({"role": "user", "content": statement})

            spoken = bytearray()
            async for chunk in voice.run_tts(statement):
                spoken.extend(chunk)
            return spoken

        async def argue():
            # (label, own history, opponent history, TTS voice, image prompt)
            turns = (
                ("bot1", bot1_messages, bot2_messages, tts1,
                 "A woman conservatively dressed as a librarian in a library surrounded by books, cartoon, serious, highly detailed"),
                ("bot2", bot2_messages, bot1_messages, tts2,
                 "A cat dressed in a hot dog costume, cartoon, bright colors, funny, highly detailed"),
            )
            for i in range(100):
                print(f"In iteration {i}")

                for label, own_history, rival_history, voice, description in turns:
                    # Generate the speech and the portrait concurrently.
                    audio, image_data = await asyncio.gather(
                        take_turn(label, own_history, rival_history, voice),
                        dalle.run_image_gen(description),
                    )
                    # Element [1] of the image-gen result is what gets shown.
                    await transport.send_queue.put(
                        [
                            ImageQueueFrame(None, image_data[1]),
                            AudioQueueFrame(audio),
                        ]
                    )

        await asyncio.gather(transport.run(), argue())
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # configure() returns a (url, token) pair; only the URL is handed to
    # main() here, the token is left unused.
    url, token = configure()
    asyncio.run(main(url))
|
||||
@@ -1,179 +0,0 @@
|
||||
import aiohttp
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.azure_ai_services import AzureLLMService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.queue_aggregators import LLMUserContextAggregator, LLMAssistantContextAggregator
|
||||
from dailyai.queue_frame import (
|
||||
QueueFrame,
|
||||
TextQueueFrame,
|
||||
ImageQueueFrame,
|
||||
SpriteQueueFrame,
|
||||
TranscriptionQueueFrame,
|
||||
)
|
||||
from dailyai.services.ai_services import AIService
|
||||
|
||||
from examples.foundational.support.runner import configure
|
||||
|
||||
|
||||
sprites = {}
image_files = [
    'sc-default.png',
    'sc-talk.png',
    'sc-listen-1.png',
    'sc-think-1.png',
    'sc-think-2.png',
    'sc-think-3.png',
    'sc-think-4.png',
]

script_dir = os.path.dirname(__file__)

# Load every sprite from the "assets" directory next to this script.
# The dictionary key is the file name *including* its extension; the
# lookups below rely on that (e.g. sprites["sc-listen-1.png"]).
for file in image_files:
    full_path = os.path.join(script_dir, "assets", file)
    # Open the image and keep its raw bytes
    with Image.open(full_path) as img:
        sprites[file] = img.tobytes()

# When the bot isn't talking, show a static image of the cat listening
quiet_frame = ImageQueueFrame("", sprites["sc-listen-1.png"])
# When the bot is talking, build an animation from two sprites:
# 30 frames, each picked at random from the two talking poses.
talking_list = [sprites['sc-default.png'], sprites['sc-talk.png']]
talking = [random.choice(talking_list) for _ in range(30)]
talking_frame = SpriteQueueFrame(images=talking)

# TODO: Support "thinking" as soon as we get a valid transcript, while LLM is processing
thinking_list = [
    sprites['sc-think-1.png'],
    sprites['sc-think-2.png'],
    sprites['sc-think-3.png'],
    sprites['sc-think-4.png'],
]
thinking_frame = SpriteQueueFrame(images=thinking_list)
|
||||
|
||||
|
||||
class TranscriptFilter(AIService):
    """Forward only transcription frames that do not come from the bot.

    Frames that are not TranscriptionQueueFrame instances, and
    transcriptions whose ``participantId`` equals ``bot_participant_id``,
    are dropped.
    """

    def __init__(self, bot_participant_id=None):
        self.bot_participant_id = bot_participant_id

    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
        if not isinstance(frame, TranscriptionQueueFrame):
            return
        if frame.participantId == self.bot_participant_id:
            return
        yield frame
|
||||
|
||||
|
||||
class NameCheckFilter(AIService):
    """Emit a sentence only when it mentions one of the given names.

    Text from incoming TextQueueFrame instances is accumulated until the
    buffer ends with ".", "?" or "!". The buffer is then cleared, and the
    completed sentence is emitted as a TextQueueFrame only if it contains
    at least one of ``names``; otherwise it is discarded.
    """

    def __init__(self, names: list[str]):
        self.names = names
        self.sentence = ""

    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
        # TODO: split up transcription by participant
        if isinstance(frame, TextQueueFrame):
            self.sentence += frame.text

        # Keep buffering until the sentence is terminated.
        if not self.sentence.endswith((".", "?", "!")):
            return

        # The buffer is reset whether or not a name was mentioned.
        finished, self.sentence = self.sentence, ""
        if any(name in finished for name in self.names):
            yield TextQueueFrame(finished)
|
||||
|
||||
|
||||
class ImageSyncAggregator(AIService):
    """Bracket every frame with the talking animation and the quiet image."""

    def __init__(self):
        pass

    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
        # Order matters: talking sprite, then the frame itself, then back
        # to the quiet (listening) image.
        for outgoing in (talking_frame, frame, quiet_frame):
            yield outgoing
|
||||
|
||||
|
||||
async def main(room_url: str, token):
    """Run the "Santa Cat" wake-word bot in a Daily room.

    Incoming frames are filtered to non-bot transcriptions, then to
    sentences that mention the bot's name, and only those reach the LLM.
    The spoken reply is wrapped with talking/quiet sprite frames.

    Args:
        room_url: URL of the Daily room, passed to DailyTransportService.
        token: Meeting token, passed to DailyTransportService.
    """
    async with aiohttp.ClientSession() as session:
        transport = DailyTransportService(
            room_url,
            token,
            "Santa Cat",
            duration_minutes=3,
            start_transcription=True,
            mic_enabled=True,
            mic_sample_rate=16000,
            camera_enabled=True,
            camera_width=720,
            camera_height=1280,
        )
        # The same settings are also written directly onto the transport's
        # private attributes.
        transport._mic_enabled = True
        transport._mic_sample_rate = 16000
        transport._camera_enabled = True
        transport._camera_width = 720
        transport._camera_height = 1280

        llm = AzureLLMService(
            api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
            endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
            model=os.getenv("AZURE_CHATGPT_MODEL"),
        )
        tts = ElevenLabsTTSService(
            aiohttp_session=session,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            voice_id="jBpfuIE2acCO8z3wKNLl",
        )
        isa = ImageSyncAggregator()

        @transport.event_handler("on_first_other_participant_joined")
        async def on_first_other_participant_joined(transport):
            await tts.say("Hi! If you want to talk to me, just say 'hey Santa Cat'.", transport.send_queue)

        async def handle_transcriptions():
            messages = [
                {"role": "system", "content": "You are Santa Cat, a cat that lives in Santa's workshop at the North Pole. You should be clever, and a bit sarcastic. You should also tell jokes every once in a while. Your responses should only be a few sentences long."},
            ]

            bot_id = transport._my_participant_id
            tma_in = LLMUserContextAggregator(messages, bot_id)
            tma_out = LLMAssistantContextAggregator(messages, bot_id)
            tf = TranscriptFilter(bot_id)
            ncf = NameCheckFilter(["Santa Cat", "Santa"])

            # Chain the stages innermost-first; each stage's run() consumes
            # the output of the stage before it.
            stream = transport.get_receive_frames()
            for stage in (tf, ncf, tma_in, llm, tma_out, isa):
                stream = stage.run(stream)
            await tts.run_to_queue(transport.send_queue, stream)

        async def starting_image():
            await transport.send_queue.put(quiet_frame)

        transport.transcription_settings["extra"]["punctuate"] = True
        await asyncio.gather(transport.run(), handle_transcriptions(), starting_image())
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Unpack the (url, token) pair returned by configure() and run main()
    # to completion on a fresh event loop.
    url, token = configure()
    asyncio.run(main(url, token))
|
||||
@@ -1,131 +0,0 @@
|
||||
import aiohttp
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import wave
|
||||
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.queue_aggregators import LLMContextAggregator, LLMUserContextAggregator, LLMAssistantContextAggregator
|
||||
from dailyai.services.ai_services import AIService, FrameLogger
|
||||
from dailyai.queue_frame import QueueFrame, AudioQueueFrame, LLMResponseEndQueueFrame, LLMMessagesQueueFrame
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from examples.foundational.support.runner import configure
|
||||
|
||||
# Root logging format: numeric level, timestamp, message. The "dailyai"
# logger is switched to DEBUG.
logging.basicConfig(format="%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)

sounds = {}
sound_files = [
    'ding1.wav',
    'ding2.wav',
]

script_dir = os.path.dirname(__file__)

# Load every sound from the "assets" directory next to this script.
# The dictionary key is the file name *including* its extension; the
# lookups elsewhere in this file use e.g. sounds["ding1.wav"].
for file in sound_files:
    full_path = os.path.join(script_dir, "assets", file)
    # Read all audio frames of the WAV file as raw bytes
    with wave.open(full_path) as audio_file:
        sounds[file] = audio_file.readframes(-1)
|
||||
|
||||
|
||||
class OutboundSoundEffectWrapper(AIService):
    """Insert the "ding1.wav" sound ahead of each LLMResponseEndQueueFrame."""

    def __init__(self):
        pass

    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
        if isinstance(frame, LLMResponseEndQueueFrame):
            yield AudioQueueFrame(sounds["ding1.wav"])
        # The frame itself is always forwarded, in case anything else up
        # the stack needs it.
        yield frame
|
||||
|
||||
|
||||
class InboundSoundEffectWrapper(AIService):
    """Insert the "ding2.wav" sound ahead of each LLMMessagesQueueFrame."""

    def __init__(self):
        pass

    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
        if isinstance(frame, LLMMessagesQueueFrame):
            yield AudioQueueFrame(sounds["ding2.wav"])
        # The frame itself is always forwarded, in case anything else up
        # the stack needs it.
        yield frame
|
||||
|
||||
|
||||
async def main(room_url: str, token):
    """Run a voice bot that plays sound effects around each LLM exchange.

    Received frames flow through the user-context aggregator, the inbound
    sound wrapper, a frame logger, the LLM, the assistant-context
    aggregator, a second frame logger and TTS; the outbound sound wrapper
    writes the result to the transport's send queue.

    Args:
        room_url: URL of the Daily room, passed to DailyTransportService.
        token: Meeting token, passed to DailyTransportService.
    """
    async with aiohttp.ClientSession() as session:
        transport = DailyTransportService(
            room_url,
            token,
            "Respond bot",
            duration_minutes=5,
            mic_enabled=True,
            mic_sample_rate=16000,
            camera_enabled=False,
        )

        llm = AzureLLMService(
            api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
            endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
            model=os.getenv("AZURE_CHATGPT_MODEL"),
        )
        tts = ElevenLabsTTSService(
            aiohttp_session=session,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            voice_id="ErXwobaYiN019PkySvjV",
        )

        @transport.event_handler("on_first_other_participant_joined")
        async def on_first_other_participant_joined(transport):
            await tts.say("Hi, I'm listening!", transport.send_queue)
            await transport.send_queue.put(AudioQueueFrame(sounds["ding1.wav"]))

        async def handle_transcriptions():
            messages = [
                {"role": "system", "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way."},
            ]

            bot_id = transport._my_participant_id
            tma_in = LLMUserContextAggregator(messages, bot_id)
            tma_out = LLMAssistantContextAggregator(messages, bot_id)
            out_sound = OutboundSoundEffectWrapper()
            in_sound = InboundSoundEffectWrapper()
            fl = FrameLogger("LLM Out")
            fl2 = FrameLogger("Transcription In")

            # Chain the stages innermost-first; each stage's run() consumes
            # the output of the stage before it.
            stream = transport.get_receive_frames()
            for stage in (tma_in, in_sound, fl2, llm, tma_out, fl, tts):
                stream = stage.run(stream)
            await out_sound.run_to_queue(transport.send_queue, stream)

        transport.transcription_settings["extra"]["punctuate"] = True
        await asyncio.gather(transport.run(), handle_transcriptions())
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Unpack the (url, token) pair returned by configure() and run main()
    # to completion on a fresh event loop.
    url, token = configure()
    asyncio.run(main(url, token))
|
||||
@@ -1,59 +0,0 @@
|
||||
import argparse
|
||||
import asyncio
|
||||
import wave
|
||||
from dailyai.queue_frame import EndStreamQueueFrame, TranscriptionQueueFrame
|
||||
|
||||
from dailyai.services.local_transport_service import LocalTransportService
|
||||
from dailyai.services.whisper_ai_services import WhisperSTTService
|
||||
|
||||
|
||||
async def main(room_url: str):
    """Transcribe frames from the local transport with Whisper and print them.

    Three coroutines run concurrently: the transport itself, a speech-to-text
    worker that feeds a queue, and a printer that drains that queue.

    Args:
        room_url: Accepted for signature compatibility; not used in the body.
    """
    global transport, stt

    meeting_duration_minutes = 1
    transport = LocalTransportService(
        mic_enabled=True,
        camera_enabled=False,
        speaker_enabled=True,
        duration_minutes=meeting_duration_minutes,
        start_transcription=True,
    )
    stt = WhisperSTTService()
    transcription_output_queue = asyncio.Queue()
    transport_done = asyncio.Event()

    async def handle_transcription():
        print("`````````TRANSCRIPTION`````````")
        while not transport_done.is_set():
            entry = await transcription_output_queue.get()
            print("got item from queue", entry)
            if isinstance(entry, TranscriptionQueueFrame):
                print(entry.text)
            elif isinstance(entry, EndStreamQueueFrame):
                # Sentinel queued by handle_speaker once STT has finished.
                break
        print("handle_transcription done")

    async def handle_speaker():
        received = transport.get_receive_frames()
        await stt.run_to_queue(transcription_output_queue, received)
        # Tell the printer that no more transcriptions will arrive.
        await transcription_output_queue.put(EndStreamQueueFrame())
        print("handle speaker done.")

    async def run_until_done():
        await transport.run()
        transport_done.set()
        print("run_until_done done")

    await asyncio.gather(run_until_done(), handle_speaker(), handle_transcription())
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command line: a single required -u/--url option. Unrecognised
    # arguments are tolerated (parse_known_args) and ignored.
    parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
    parser.add_argument(
        "-u",
        "--url",
        type=str,
        required=True,
        help="URL of the Daily room to join",
    )

    args, unknown = parser.parse_known_args()
    asyncio.run(main(args.url))
|
||||
|
Before Width: | Height: | Size: 868 KiB |
|
Before Width: | Height: | Size: 871 KiB |
|
Before Width: | Height: | Size: 872 KiB |
|
Before Width: | Height: | Size: 868 KiB |
@@ -1,53 +0,0 @@
|
||||
import argparse
|
||||
import os
|
||||
import time
|
||||
import urllib
|
||||
import requests
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
|
||||
|
||||
def configure():
    """Resolve the Daily room URL and create an owner meeting token for it.

    The room URL comes from ``-u/--url`` or the ``DAILY_SAMPLE_ROOM_URL``
    environment variable; the API key comes from ``-k/--apikey`` or
    ``DAILY_API_KEY``. Unrecognised command-line arguments are ignored.
    A token that expires one hour from now is requested from the Daily
    REST API.

    Returns:
        A ``(url, token)`` tuple.

    Raises:
        Exception: If no room URL or API key is available, or if the token
            request does not return HTTP 200.
    """
    # Imported explicitly: a bare `import urllib` does not guarantee that
    # the `urllib.parse` submodule has been loaded.
    from urllib.parse import urlparse

    parser = argparse.ArgumentParser(description="Daily AI SDK Bot Sample")
    parser.add_argument(
        "-u", "--url", type=str, required=False, help="URL of the Daily room to join"
    )
    parser.add_argument(
        "-k",
        "--apikey",
        type=str,
        required=False,
        help="Daily API Key (needed to create an owner token for the room)",
    )

    args, unknown = parser.parse_known_args()

    url = args.url or os.getenv("DAILY_SAMPLE_ROOM_URL")
    key = args.apikey or os.getenv("DAILY_API_KEY")

    if not url:
        raise Exception(
            "No Daily room specified. use the -u/--url option from the command line, or set DAILY_SAMPLE_ROOM_URL in your environment to specify a Daily room URL.")

    if not key:
        raise Exception("No Daily API key specified. use the -k/--apikey option from the command line, or set DAILY_API_KEY in your environment to specify a Daily API key, available from https://dashboard.daily.co/developers.")

    # Create a meeting token for the given room with an expiration 1 hour in the future.
    # The room name is the URL path without its leading "/".
    room_name = urlparse(url).path[1:]
    expiration = time.time() + 60 * 60

    res = requests.post(
        "https://api.daily.co/v1/meeting-tokens",
        headers={"Authorization": f"Bearer {key}"},
        json={
            "properties": {"room_name": room_name, "is_owner": True, "exp": expiration}
        },
        # Without a timeout an unresponsive server would hang the script.
        timeout=30,
    )

    if res.status_code != 200:
        raise Exception(f"Failed to create meeting token: {res.status_code} {res.text}")

    token = res.json()["token"]

    return (url, token)
|
||||
@@ -1,134 +0,0 @@
|
||||
import aiohttp
|
||||
import asyncio
|
||||
import os
|
||||
import wave
|
||||
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
|
||||
from dailyai.queue_aggregators import LLMContextAggregator
|
||||
from dailyai.services.ai_services import AIService, FrameLogger
|
||||
from dailyai.queue_frame import QueueFrame, AudioQueueFrame, LLMResponseEndQueueFrame, LLMMessagesQueueFrame
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from examples.foundational.support.runner import configure
|
||||
|
||||
sounds = {}
sound_files = [
    'ding1.wav',
    'ding2.wav',
]

script_dir = os.path.dirname(__file__)

# Load every sound from the "assets" directory next to this script.
# The dictionary key is the file name *including* its extension; the
# lookups elsewhere in this file use e.g. sounds["ding1.wav"].
for file in sound_files:
    full_path = os.path.join(script_dir, "assets", file)
    # Read all audio frames of the WAV file as raw bytes
    with wave.open(full_path) as audio_file:
        sounds[file] = audio_file.readframes(-1)
|
||||
|
||||
|
||||
class OutboundSoundEffectWrapper(AIService):
    """Insert the "ding1.wav" sound ahead of each LLMResponseEndQueueFrame."""

    def __init__(self):
        pass

    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
        if isinstance(frame, LLMResponseEndQueueFrame):
            yield AudioQueueFrame(sounds["ding1.wav"])
        # The frame itself is always forwarded, in case anything else up
        # the stack needs it.
        yield frame
|
||||
|
||||
|
||||
class InboundSoundEffectWrapper(AIService):
    """Insert the "ding2.wav" sound ahead of each LLMMessagesQueueFrame."""

    def __init__(self):
        pass

    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
        if isinstance(frame, LLMMessagesQueueFrame):
            yield AudioQueueFrame(sounds["ding2.wav"])
        # The frame itself is always forwarded, in case anything else up
        # the stack needs it.
        yield frame
|
||||
|
||||
|
||||
async def main(room_url: str, token, phone=None):
    """Run a voice bot with sound effects that can optionally dial out.

    Received frames flow through the user-context aggregator, the inbound
    sound wrapper, a frame logger, the LLM, the assistant-context
    aggregator and TTS; the outbound sound wrapper writes the result to
    the transport's send queue.

    Args:
        room_url: URL of the Daily room, passed to DailyTransportService.
        token: Meeting token, passed to DailyTransportService.
        phone: Value handed to ``transport.dialout`` once the call state
            becomes "joined". Defaults to ``None`` (no recording, no
            dial-out) so the script's own entry point, which calls
            ``main(url, token)``, no longer fails with a TypeError.
    """
    async with aiohttp.ClientSession() as session:

        # These are deliberately published as module globals.
        global transport
        global llm
        global tts

        transport = DailyTransportService(
            room_url,
            token,
            "Respond bot",
            300,
        )
        transport._mic_enabled = True
        transport._mic_sample_rate = 16000
        transport._camera_enabled = False

        llm = AzureLLMService()
        tts = AzureTTSService()

        @transport.event_handler("on_first_other_participant_joined")
        async def on_first_other_participant_joined(transport):
            await tts.say("Hi, I'm listening!", transport.send_queue)
            await transport.send_queue.put(AudioQueueFrame(sounds["ding1.wav"]))

        async def handle_transcriptions():
            messages = [
                {"role": "system", "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way."},
            ]

            tma_in = LLMContextAggregator(
                messages, "user", transport._my_participant_id
            )
            tma_out = LLMContextAggregator(
                messages, "assistant", transport._my_participant_id
            )
            out_sound = OutboundSoundEffectWrapper()
            in_sound = InboundSoundEffectWrapper()
            fl2 = FrameLogger("Transcription In")

            # Chain the stages innermost-first; each stage's run() consumes
            # the output of the stage before it.
            stream = transport.get_receive_frames()
            for stage in (tma_in, in_sound, fl2, llm, tma_out, tts):
                stream = stage.run(stream)
            await out_sound.run_to_queue(transport.send_queue, stream)

        @transport.event_handler("on_participant_joined")
        async def pax_joined(transport, pax):
            print(f"PARTICIPANT JOINED: {pax}")

        @transport.event_handler("on_call_state_updated")
        async def on_call_state_updated(transport, state):
            # Recording and dial-out only start once the bot has joined,
            # and only when a phone value was supplied.
            if state == "joined" and phone:
                transport.start_recording()
                transport.dialout(phone)

        transport.transcription_settings["extra"]["punctuate"] = True

        await asyncio.gather(transport.run(), handle_transcriptions())
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Unpack the (url, token) pair returned by configure() and run main()
    # with those two arguments only.
    url, token = configure()
    asyncio.run(main(url, token))
|
||||
@@ -1,13 +0,0 @@
|
||||
# Server Example
|
||||
|
||||
This is an example server based on [Santa Cat](https://santacat.ai). You can run the server with this command:
|
||||
|
||||
```
|
||||
flask --app daily-bot-manager.py --debug run
|
||||
```
|
||||
|
||||
Once the server is started, you can load `http://127.0.0.1:5000/spin-up-kitty` in a browser, and the server will do the following:
|
||||
|
||||
- Create a new, randomly-named Daily room with `DAILY_API_KEY` from your .env file or environment
|
||||
- Start the `10-wake-word.py` example and connect it to that room
|
||||
- 301 redirect your browser to the room
|
||||
@@ -1,160 +0,0 @@
|
||||
from datetime import datetime
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import sys
|
||||
from dailyai.conversation_wrappers import InterruptibleConversationWrapper
|
||||
|
||||
from dailyai.queue_frame import StartStreamQueueFrame, TranscriptionQueueFrame, TextQueueFrame, UserStartedSpeakingFrame, UserStoppedSpeakingFrame
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.fireworks_ai_services import FireworksLLMService
|
||||
from dailyai.services.deepgram_ai_services import DeepgramTTSService
|
||||
from dailyai.services.ai_services import FrameLogger
|
||||
|
||||
from dailyai.services.fal_ai_services import FalImageGenService
|
||||
|
||||
from examples.foundational.support.runner import configure
|
||||
|
||||
|
||||
# Everything after the script name, joined into one prompt string.
command_line_prompt = ' '.join(sys.argv[1:])

# System message used to seed the LLM context in main().
system_prompt = """
You are a friendly robot character with a cartoon body with head, torso, arms, feet,
and legs.

You can change your appearance using the `change_appearance` function call.
You can add or remove items from your body, change
your color, and more. You can use function calling to change your appearance.

When changing your appearance, please create a prompt as an argument to the function.
The prompt will help the image generation model
create a new appearance for you. Include as much detail as possible. Include the
keywords "robot", "friendly", "cartoon", "smiling", "happy", "animated".
The initial image prompt you are adding to or changing is
"A friendly cartoon robot, smiling and happy, animated."

Do not include the image model prompt in your response. The prompt must be passed to the function
as a parameter.
"""

# Schema of the single function offered to the LLM: one string argument,
# "appearance", carrying an image-generation prompt.
change_appearance_function = {
    "name": "change_appearance",
    "description": "Call this function when the users want you to change your appearance.",
    "parameters": {
        "type": "object",
        "properties": {
            "appearance": {
                "type": "string",
                "description": "The new appearance for the robot, in the form of a prompt for an generative AI diffusion model.",
            },
        },
    },
}

# Tool list passed to the LLM service in main().
tools = [
    {"type": "function", "function": change_appearance_function},
]
|
||||
|
||||
|
||||
async def main(room_url: str, token):
    """Run a function-calling voice bot that can redraw its own avatar.

    The LLM service is given the ``tools`` list and a ``change_appearance``
    callback; that callback sends the requested appearance prompt through
    the image generator into the transport's send queue.

    Args:
        room_url: URL of the Daily room, passed to DailyTransportService.
        token: Meeting token, passed to DailyTransportService.
    """
    async with aiohttp.ClientSession() as session:
        # The same list object is shared by the transport and the LLM.
        context = [
            {"role": "system", "content": system_prompt},
        ]
        transport = DailyTransportService(
            room_url,
            token,
            "Respond bot",
            duration_minutes=30,
            start_transcription=True,
            mic_enabled=True,
            mic_sample_rate=16000,
            camera_enabled=True,
            camera_width=1024,
            camera_height=1024,
            # TODO-CB: Should this be VAD enabled or something?
            speaker_enabled=True,
            context=context,
        )

        imagegen = FalImageGenService(
            image_size="512x512",
            aiohttp_session=session,
            key_id=os.getenv("FAL_KEY_ID"),
            key_secret=os.getenv("FAL_KEY_SECRET"),
        )

        async def change_appearance(appearance):
            prompt_frames = [TextQueueFrame(appearance)]
            render = imagegen.run_to_queue(transport.send_queue, prompt_frames)
            await asyncio.create_task(render)

        llm = FireworksLLMService(
            context=context,
            api_key=os.getenv("FIREWORKS_API_KEY"),
            model="accounts/fireworks/models/firefunction-v1",
            # TODO - how can we modify tools list on the fly?
            tools=tools,
            change_appearance=change_appearance,
            transport=transport,
        )
        tts = DeepgramTTSService(
            aiohttp_session=session,
            api_key=os.getenv("DEEPGRAM_API_KEY"),
            voice=os.getenv("DEEPGRAM_VOICE"),
        )
        fl = FrameLogger("just outside the innermost layer")

        async def run_response(in_frame):
            # Frame logger -> LLM -> TTS into the send queue. The context
            # aggregator stages that other examples use are not part of
            # this chain.
            logged = fl.run([StartStreamQueueFrame(), in_frame])
            await tts.run_to_queue(transport.send_queue, llm.run(logged))

        @transport.event_handler("on_first_other_participant_joined")
        async def on_first_other_participant_joined(transport):
            await change_appearance("A friendly cartoon robot, smiling and happy, animated.")
            return

            # NOTE(review): everything below is unreachable because of the
            # `return` above. It injects the command-line prompt as a fake
            # transcription; confirm whether it should be re-enabled or
            # removed.
            await tts.say("Hi, I'm listening!", transport.send_queue)
            await asyncio.sleep(1)

            await transport.receive_queue.put(UserStartedSpeakingFrame())
            await asyncio.sleep(0.1)

            transport.on_transcription_message({
                "text": command_line_prompt,
                "participantId": "cb65b845-aac0-4fc8-987d-2e7ce3c7d8f0",
                "timestamp": datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z',
            })
            # putting the frame into the queue directly doesn't seem to work
            await asyncio.sleep(0.1)
            await transport.receive_queue.put(UserStoppedSpeakingFrame())

        extra_settings = transport.transcription_settings["extra"]
        extra_settings["endpointing"] = True
        extra_settings["punctuate"] = True

        await asyncio.gather(transport.run(), transport.run_conversation(run_response))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Unpack the (url, token) pair returned by configure() and run main()
    # to completion on a fresh event loop.
    url, token = configure()
    asyncio.run(main(url, token))
|
||||
1
src/samples/deprecated/README.md
Normal file
@@ -0,0 +1 @@
|
||||
These samples need to be updated! Don't rely on them.
|
||||
93
src/samples/deprecated/simple-sample/simple-sample.py
Normal file
@@ -0,0 +1,93 @@
|
||||
import argparse
|
||||
from email.mime import image
|
||||
from re import A
|
||||
import requests
|
||||
import time
|
||||
import urllib.parse
|
||||
|
||||
from dailyai.async_processor.async_processor import (
|
||||
LLMResponse,
|
||||
ConversationProcessorCollection,
|
||||
)
|
||||
from dailyai.orchestrator import OrchestratorConfig, Orchestrator
|
||||
from dailyai.message_handler.message_handler import MessageHandler
|
||||
from dailyai.services.ai_services import AIServiceConfig
|
||||
from dailyai.services.azure_ai_services import AzureImageGenService, AzureTTSService, AzureLLMService
|
||||
from dailyai.services.deepgram_ai_services import DeepgramTTSService
|
||||
|
||||
|
||||
def add_bot_to_room(room_url, token, expiration) -> None:
    """Run the simple sample bot in a room and release its resources afterwards.

    Builds a MessageHandler with the sample system prompt, an AIServiceConfig
    with Azure TTS and LLM services (no image service), and an Orchestrator
    for the room. The orchestrator is started and, once ``start()`` returns,
    stopped; the message handler and the services are then shut down.

    Args:
        room_url: Forwarded to ``OrchestratorConfig`` as ``room_url``.
        token: Forwarded to ``OrchestratorConfig`` as ``token``.
        expiration: Forwarded to ``OrchestratorConfig`` as ``expiration``.
    """
    # A simple prompt for a simple sample.
    system_prompt = """
You are a sample bot in a WebRTC session. You'll receive input as transcriptions of user's
speech, and your responses will be converted to audio via a TTS service.
Answer user's questions and be friendly, and if you can, give some ideas about how someone
could use a bot like you in a more in-depth way. Because your responses will be spoken,
try to keep them short.
"""
    message_handler = MessageHandler(system_prompt)

    # Azure provides the TTS and the LLM. The original sample notes that these
    # environment variables must be set:
    #   - AZURE_SPEECH_SERVICE_KEY
    #   - AZURE_SPEECH_SERVICE_REGION
    #   - AZURE_CHATGPT_KEY
    #   - AZURE_CHATGPT_ENDPOINT
    #   - AZURE_CHATGPT_DEPLOYMENT_ID
    services = AIServiceConfig(
        tts=AzureTTSService(),
        image=None,
        llm=AzureLLMService(),
    )

    bot_config = OrchestratorConfig(
        room_url=room_url,
        token=token,
        bot_name="Simple Bot",
        expiration=expiration,
    )

    bot = Orchestrator(bot_config, services, message_handler)
    bot.start()

    # When the orchestrator's done, shut it down along with the handler and
    # the services created above.
    bot.stop()
    message_handler.shutdown()

    services.tts.close()
    services.llm.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: create a meeting token for the room via the
    # Daily REST API, then run the bot with it.
    parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
    parser.add_argument("-u", "--url", type=str, required=True, help="URL of the Daily room")
    parser.add_argument(
        "-k", "--apikey", type=str, required=True, help="Daily API Key (needed to create token)"
    )

    args: argparse.Namespace = parser.parse_args()

    # Create a meeting token for the given room with an expiration 1 hour in the future.
    # The room name is the URL path; strip slashes on both ends so a room URL
    # with a trailing slash does not yield a name such as "room/".
    room_name: str = urllib.parse.urlparse(args.url).path.strip("/")
    expiration: float = time.time() + 60 * 60

    res: requests.Response = requests.post(
        "https://api.daily.co/v1/meeting-tokens",
        headers={"Authorization": f"Bearer {args.apikey}"},
        json={
            "properties": {"room_name": room_name, "is_owner": True, "exp": expiration}
        },
        # Without a timeout, requests waits indefinitely on an unresponsive server.
        timeout=30,
    )

    if res.status_code != 200:
        raise Exception(f'Failed to create meeting token: {res.status_code} {res.text}')

    token: str = res.json()['token']

    add_bot_to_room(args.url, token, expiration)
|
||||
174
src/samples/deprecated/static-sprite/sprite-sample.py
Normal file
@@ -0,0 +1,174 @@
|
||||
import argparse
|
||||
from email.mime import image
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import requests
|
||||
import time
|
||||
import urllib.parse
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from dailyai.async_processor.async_processor import (
|
||||
ConversationProcessorCollection,
|
||||
LLMResponse,
|
||||
OrchestratorResponse
|
||||
)
|
||||
from dailyai.orchestrator import OrchestratorConfig, Orchestrator
|
||||
from dailyai.queue_frame import QueueFrame, FrameType
|
||||
from dailyai.message_handler.message_handler import MessageHandler
|
||||
from dailyai.services.ai_services import AIServiceConfig
|
||||
from dailyai.services.azure_ai_services import AzureImageGenService, AzureTTSService, AzureLLMService
|
||||
|
||||
|
||||
class StaticSpriteResponse(OrchestratorResponse):
    """Response that shows a single still image loaded from the ``sprites/`` folder.

    Subclasses choose the image by setting ``self.filename`` in ``__init__``.
    """

    def __init__(
        self,
        services,
        message_handler,
        output_queue
    ) -> None:
        super().__init__(services, message_handler, output_queue)
        self.image_bytes: bytes | None = None
        # Name of the sprite file to load; override this in subclasses.
        # (This was previously initialised as ``filenames``, which did not
        # match the attribute start_preparation() reads or the one the
        # subclasses set.)
        self.filename: str | None = None

    def start_preparation(self) -> None:
        """Load the sprite named by ``self.filename`` into ``self.image_bytes``.

        Raises:
            ValueError: If no subclass has set ``self.filename``.
        """
        if self.filename is None:
            raise ValueError(
                f"{type(self).__name__} must set self.filename before start_preparation()"
            )

        full_path = os.path.join(os.path.dirname(__file__), "sprites/", self.filename)
        print(full_path)

        with Image.open(full_path) as img:
            self.image_bytes = img.tobytes()

    def do_play(self) -> None:
        """Put the loaded image on the output queue as an IMAGE frame."""
        self.output_queue.put(QueueFrame(FrameType.IMAGE, self.image_bytes))


class IntroSpriteResponse(StaticSpriteResponse):
    """Static sprite response that uses ``intro.png``."""

    def __init__(self, services, message_handler, output_queue) -> None:
        super().__init__(services, message_handler, output_queue)
        self.filename = "intro.png"


class WaitingSpriteResponse(StaticSpriteResponse):
    """Static sprite response that uses ``waiting.png``."""

    def __init__(self, services, message_handler, output_queue) -> None:
        super().__init__(services, message_handler, output_queue)
        self.filename = "waiting.png"
|
||||
|
||||
|
||||
class AnimatedSpriteLLMResponse(LLMResponse):
    """LLM response that pairs every audio frame with a randomly chosen sprite."""

    def __init__(self, services, message_handler, output_queue) -> None:
        super().__init__(services, message_handler, output_queue)
        # Sprite files to load, and the list their raw bytes are appended to.
        self.filenames = ["talk-1.png", "talk-2.png"]
        self.image_bytes = []

    def start_preparation(self) -> None:
        """Run the parent preparation, then load each sprite into ``self.image_bytes``."""
        super().start_preparation()

        sprite_dir = os.path.join(os.path.dirname(__file__), "sprites/")
        for sprite_name in self.filenames:
            sprite_path = os.path.join(sprite_dir, sprite_name)
            print(sprite_path)

            with Image.open(sprite_path) as sprite:
                self.image_bytes.append(sprite.tobytes())

    def get_frames_from_tts_response(self, audio_frame) -> list[QueueFrame]:
        """Return an AUDIO frame for ``audio_frame`` followed by a random IMAGE frame."""
        audio = QueueFrame(FrameType.AUDIO, audio_frame)
        sprite = QueueFrame(FrameType.IMAGE, random.choice(self.image_bytes))
        return [audio, sprite]
|
||||
|
||||
|
||||
def add_bot_to_room(room_url, token, expiration) -> None:
    """Run the sprite sample bot in a room and release its resources afterwards.

    Builds a MessageHandler with the sample system prompt, an AIServiceConfig
    with Azure TTS, image generation and LLM services, and an Orchestrator
    that uses the sprite response classes for its introduction, waiting and
    response processors. The orchestrator is started and, once ``start()``
    returns, stopped; the handler and all three services are then shut down.

    Args:
        room_url: Forwarded to ``OrchestratorConfig`` as ``room_url``.
        token: Forwarded to ``OrchestratorConfig`` as ``token``.
        expiration: Forwarded to ``OrchestratorConfig`` as ``expiration``.
    """
    # A simple prompt for a simple sample.
    system_prompt = """
You are a sample bot in a WebRTC session. You'll receive input as transcriptions of user's
speech, and your responses will be converted to audio via a TTS service.
Answer user's questions and be friendly, and if you can, give some ideas about how someone
could use a bot like you in a more in-depth way. Because your responses will be spoken,
try to keep them short.
"""
    message_handler = MessageHandler(system_prompt)

    # Azure provides the TTS, image generation and LLM. The original sample
    # notes that these environment variables must be set:
    #   - AZURE_SPEECH_SERVICE_KEY
    #   - AZURE_SPEECH_SERVICE_REGION
    #   - AZURE_CHATGPT_KEY
    #   - AZURE_CHATGPT_ENDPOINT
    #   - AZURE_CHATGPT_DEPLOYMENT_ID
    #
    # This demo doesn't use image generation, but if you extend it to do so,
    # you'll also need to set:
    #   - AZURE_DALLE_KEY
    #   - AZURE_DALLE_ENDPOINT
    #   - AZURE_DALLE_DEPLOYMENT_ID
    services = AIServiceConfig(
        tts=AzureTTSService(),
        image=AzureImageGenService(),
        llm=AzureLLMService(),
    )

    # The classes themselves (not instances) are handed to the collection.
    sprite_conversation_processors = ConversationProcessorCollection(
        introduction=IntroSpriteResponse,
        waiting=WaitingSpriteResponse,
        response=AnimatedSpriteLLMResponse,
    )

    bot_config = OrchestratorConfig(
        room_url=room_url,
        token=token,
        bot_name="Simple Bot",
        expiration=expiration,
    )

    # Turn on debug-level output for the "dailyai" logger.
    logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
    logger: logging.Logger = logging.getLogger("dailyai")
    logger.setLevel(logging.DEBUG)

    bot = Orchestrator(
        bot_config,
        services,
        message_handler,
        sprite_conversation_processors,
    )
    bot.start()

    # When the orchestrator's done, shut it down along with the handler and
    # the services created above.
    bot.stop()
    message_handler.shutdown()

    services.tts.close()
    services.image.close()
    services.llm.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: create a meeting token for the room via the
    # Daily REST API, then run the bot with it.
    parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
    parser.add_argument("-u", "--url", type=str, required=True, help="URL of the Daily room")
    parser.add_argument(
        "-k", "--apikey", type=str, required=True, help="Daily API Key (needed to create token)"
    )

    args: argparse.Namespace = parser.parse_args()

    # Create a meeting token for the given room with an expiration 1 hour in the future.
    # The room name is the URL path; strip slashes on both ends so a room URL
    # with a trailing slash does not yield a name such as "room/".
    room_name: str = urllib.parse.urlparse(args.url).path.strip("/")
    expiration: float = time.time() + 60 * 60

    res: requests.Response = requests.post(
        "https://api.daily.co/v1/meeting-tokens",
        headers={"Authorization": f"Bearer {args.apikey}"},
        json={
            "properties": {"room_name": room_name, "is_owner": True, "exp": expiration}
        },
        # Without a timeout, requests waits indefinitely on an unresponsive server.
        timeout=30,
    )

    if res.status_code != 200:
        raise Exception(f'Failed to create meeting token: {res.status_code} {res.text}')

    token: str = res.json()['token']

    add_bot_to_room(args.url, token, expiration)
|
||||
|
Before Width: | Height: | Size: 871 KiB After Width: | Height: | Size: 871 KiB |
|
Before Width: | Height: | Size: 870 KiB After Width: | Height: | Size: 870 KiB |
|
Before Width: | Height: | Size: 871 KiB After Width: | Height: | Size: 871 KiB |
|
Before Width: | Height: | Size: 868 KiB After Width: | Height: | Size: 868 KiB |
52
src/samples/foundational/01-say-one-thing.py
Normal file
@@ -0,0 +1,52 @@
|
||||
import argparse
|
||||
import asyncio
|
||||
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
|
||||
|
||||
async def main(room_url):
    """Join ``room_url``, greet a joining remote participant by name, then leave.

    Args:
        room_url: URL of the room, passed as the first argument to
            ``DailyTransportService``.
    """
    # The original sample notes that services may read their configuration
    # (API keys and so on) from environment variables, and that the abstract
    # transport API is expected to map closely onto the daily-python API.
    duration_minutes = 1
    transport = DailyTransportService(room_url, None, "Say One Thing", duration_minutes)
    transport.mic_enabled = True
    tts = ElevenLabsTTSService(voice_id="ErXwobaYiN019PkySvjV")

    # Register an event handler so we can play the audio when the participant joins.
    @transport.event_handler("on_participant_joined")
    async def on_participant_joined(transport, participant):
        info = participant["info"]
        # Ignore the join event for the bot itself.
        if info["isLocal"]:
            return

        greeting = "Hello there, " + info["userName"] + "!"
        await tts.say(greeting, transport.send_queue)

        # wait for the output queue to be empty, then leave the meeting
        await transport.stop_when_done()

    await transport.run()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: the room URL is required; unrecognised arguments are
    # collected by parse_known_args() instead of causing an error.
    cli = argparse.ArgumentParser(description="Simple Daily Bot Sample")
    cli.add_argument("-u", "--url", type=str, required=True, help="URL of the Daily room to join")

    args, unknown = cli.parse_known_args()

    asyncio.run(main(args.url))
|
||||
59
src/samples/foundational/01a-greet-user.py
Normal file
@@ -0,0 +1,59 @@
|
||||
import asyncio
|
||||
import time
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from dailyai.queue_frame import QueueFrame, FrameType
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.azure_ai_services import AzureTTSService
|
||||
from dailyai.services.deepgram_ai_services import DeepgramTTSService
|
||||
|
||||
|
||||
async def main(room_url):
    """Join ``room_url`` and speak a greeting to each remote participant that joins.

    Args:
        room_url: URL of the room, passed as the first argument to
            ``DailyTransportService``.
    """
    # The original sample notes that services may read their configuration
    # (API keys and so on) from environment variables, and that the abstract
    # transport API is expected to map closely onto the daily-python API.
    duration_minutes = 1
    transport = DailyTransportService(room_url, None, "Greeter", duration_minutes)
    transport.mic_enabled = True

    # similarly, create a tts service
    tts = DeepgramTTSService()

    # Register an event handler so we can play the audio when the participant joins.
    # NOTE(review): "settting" is misspelled in the log line below; left as is
    # because it is runtime output.
    print("settting up handler")

    @transport.event_handler("on_participant_joined")
    async def on_participant_joined(transport, participant):
        info = participant["info"]
        print(f"participant joined: {info['userName']}")
        if info["isLocal"]:
            return

        greeting = f"Hello there, {info['userName']}!"
        audio_generator: AsyncGenerator[bytes, None] = tts.run_tts(greeting)

        # Forward each generated audio chunk to the transport as an AUDIO frame.
        # NOTE(review): put() is not awaited here, unlike the send_queue.put()
        # calls in the other samples - confirm output_queue is a synchronous queue.
        async for audio in audio_generator:
            transport.output_queue.put(QueueFrame(FrameType.AUDIO, audio))

    print("setting up call state handler")

    @transport.event_handler("on_call_state_updated")
    async def on_call_joined(transport, state):
        print(f"call state callback: {state}")

    await transport.run()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Imported here because this sample's top-of-file imports do not include argparse.
    import argparse

    # Accept the room URL on the command line like the other samples do. The
    # previously hard-coded room stays as the default, so running the script
    # with no arguments behaves exactly as before.
    parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
    parser.add_argument(
        "-u",
        "--url",
        type=str,
        default="https://chad-hq.daily.co/howdy",
        help="URL of the Daily room to join",
    )

    args, unknown = parser.parse_known_args()

    asyncio.run(main(args.url))
|
||||
49
src/samples/foundational/02-llm-say-one-thing.py
Normal file
@@ -0,0 +1,49 @@
|
||||
import argparse
|
||||
import asyncio
|
||||
|
||||
from dailyai.queue_frame import LLMMessagesQueueFrame
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.azure_ai_services import AzureLLMService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
|
||||
|
||||
async def main(room_url):
    """Join ``room_url`` and speak one LLM-generated greeting when someone joins.

    The LLM-to-TTS task is created immediately, before anyone joins; the join
    handler awaits that task and then asks the transport to stop when done.

    Args:
        room_url: URL of the room, passed as the first argument to
            ``DailyTransportService``.
    """
    duration_minutes = 1
    transport = DailyTransportService(
        room_url, None, "Say One Thing From an LLM", duration_minutes
    )
    transport.mic_enabled = True

    tts = ElevenLabsTTSService(voice_id="29vD33N1CtxCmqQRPOHJ")
    llm = AzureLLMService()

    system_message = {
        "role": "system",
        "content": "You are an LLM in a WebRTC session, and this is a 'hello world' demo. Say hello to the world."
    }
    messages = [system_message]

    # Feed the LLM output through TTS into the transport's send queue.
    send_queue = transport.send_queue
    llm_frames = llm.run([LLMMessagesQueueFrame(messages)])
    tts_task = asyncio.create_task(tts.run_to_queue(send_queue, llm_frames))

    @transport.event_handler("on_first_other_participant_joined")
    async def on_first_other_participant_joined(transport):
        await tts_task
        await transport.stop_when_done()

    await transport.run()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: the room URL is required; unrecognised arguments are
    # collected by parse_known_args() instead of causing an error.
    cli = argparse.ArgumentParser(description="Simple Daily Bot Sample")
    cli.add_argument("-u", "--url", type=str, required=True, help="URL of the Daily room to join")

    args, unknown = cli.parse_known_args()
    asyncio.run(main(args.url))
|
||||
46
src/samples/foundational/03-still-frame.py
Normal file
@@ -0,0 +1,46 @@
|
||||
import argparse
|
||||
import asyncio
|
||||
|
||||
from dailyai.queue_frame import TextQueueFrame
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.open_ai_services import OpenAIImageGenService
|
||||
|
||||
# Module-level flags; nothing in the visible part of this sample reads or updates them.
local_joined = False
participant_joined = False


async def main(room_url):
    """Join ``room_url`` with the camera on and send one generated still image.

    The image-generation task is created immediately; the join handler
    awaits it.

    Args:
        room_url: URL of the room, passed as the first argument to
            ``DailyTransportService``.
    """
    duration_minutes = 1
    transport = DailyTransportService(
        room_url, None, "Show a still frame image", duration_minutes
    )
    # Video only: microphone off, camera on at 1024x1024.
    transport.mic_enabled = False
    transport.camera_enabled = True
    transport.camera_width = 1024
    transport.camera_height = 1024

    imagegen = OpenAIImageGenService(image_size="1024x1024")

    send_queue = transport.send_queue
    prompt_frames = [TextQueueFrame("a cat in the style of picasso")]
    image_task = asyncio.create_task(imagegen.run_to_queue(send_queue, prompt_frames))

    @transport.event_handler("on_participant_joined")
    async def on_participant_joined(transport, participant):
        await image_task

    await transport.run()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: the room URL is required; unrecognised arguments are
    # collected by parse_known_args() instead of causing an error.
    cli = argparse.ArgumentParser(description="Simple Daily Bot Sample")
    cli.add_argument("-u", "--url", type=str, required=True, help="URL of the Daily room to join")

    args, unknown = cli.parse_known_args()

    asyncio.run(main(args.url))
|
||||
74
src/samples/foundational/04-utterance-and-speech.py
Normal file
@@ -0,0 +1,74 @@
|
||||
import argparse
|
||||
import asyncio
|
||||
import re
|
||||
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
|
||||
from dailyai.queue_frame import EndStreamQueueFrame, LLMMessagesQueueFrame
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
|
||||
|
||||
async def main(room_url: str):
    """Join ``room_url``, speak a fixed introduction, then an LLM-generated joke.

    The joke is generated and converted to audio into a private buffer queue
    as soon as the function starts. When a remote participant joins, the
    fixed introduction is spoken and the buffered frames are then forwarded
    to the transport.

    Args:
        room_url: URL of the room, passed as the first argument to
            ``DailyTransportService``.
    """
    global transport, llm, tts

    transport = DailyTransportService(room_url, None, "Say Two Things Bot", 1)
    transport.mic_enabled = True
    transport.mic_sample_rate = 16000
    transport.camera_enabled = False

    llm = AzureLLMService()
    azure_tts = AzureTTSService()
    elevenlabs_tts = ElevenLabsTTSService(voice_id="ErXwobaYiN019PkySvjV")

    messages = [{"role": "system", "content": "tell the user a joke about llamas"}]

    # Start a task to run the LLM to create a joke, and convert the LLM output
    # to audio frames. This task runs in parallel with generating and speaking
    # the audio for static text, so there's no delay to speak the LLM response.
    buffer_queue = asyncio.Queue()
    joke_frames = llm.run([LLMMessagesQueueFrame(messages)])
    llm_response_task = asyncio.create_task(
        elevenlabs_tts.run_to_queue(buffer_queue, joke_frames, True)
    )

    @transport.event_handler("on_participant_joined")
    async def on_joined(transport, participant):
        # Ignore the join event for the bot itself.
        if participant["id"] == transport.my_participant_id:
            return

        await azure_tts.say(
            "My friend the LLM is now going to tell a joke about llamas.",
            transport.send_queue,
        )

        async def buffer_to_send_queue():
            # Move frames from the buffer to the send queue, finishing after
            # the EndStreamQueueFrame has been forwarded.
            while True:
                frame = await buffer_queue.get()
                await transport.send_queue.put(frame)
                buffer_queue.task_done()
                if isinstance(frame, EndStreamQueueFrame):
                    return

        await asyncio.gather(llm_response_task, buffer_to_send_queue())

        await transport.stop_when_done()

    await transport.run()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: the room URL is required; unrecognised arguments are
    # collected by parse_known_args() instead of causing an error.
    cli = argparse.ArgumentParser(description="Simple Daily Bot Sample")
    cli.add_argument("-u", "--url", type=str, required=True, help="URL of the Daily room to join")

    args, unknown = cli.parse_known_args()

    asyncio.run(main(args.url))
|
||||
110
src/samples/foundational/05-sync-speech-and-text.py
Normal file
@@ -0,0 +1,110 @@
|
||||
import argparse
|
||||
import asyncio
|
||||
|
||||
from dailyai.queue_frame import AudioQueueFrame, ImageQueueFrame
|
||||
from dailyai.services.azure_ai_services import AzureLLMService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.fal_ai_services import FalImageGenService
|
||||
|
||||
|
||||
async def main(room_url):
    """Join ``room_url`` and narrate one generated image per calendar month.

    A task per month is created up front. Each task asks the LLM for an image
    description, then generates the narration audio and the image
    concurrently. When the first other participant joins, finished months are
    queued in completion order and the transport is then asked to stop once
    done.

    Args:
        room_url: URL of the room, passed as the first argument to
            ``DailyTransportService``.
    """
    meeting_duration_minutes = 5
    transport = DailyTransportService(
        room_url,
        None,
        "Month Narration Bot",
        meeting_duration_minutes,
    )
    transport.mic_enabled = True
    transport.camera_enabled = True
    transport.mic_sample_rate = 16000
    transport.camera_width = 1024
    transport.camera_height = 1024

    llm = AzureLLMService()
    dalle = FalImageGenService(image_size="1024x1024")
    tts = ElevenLabsTTSService(voice_id="ErXwobaYiN019PkySvjV")
    # dalle = OpenAIImageGenService(image_size="1024x1024")

    # Get a complete audio chunk from the given text. Splitting this into its own
    # coroutine lets us ensure proper ordering of the audio chunks on the send queue.
    async def get_all_audio(text):
        all_audio = bytearray()
        async for audio in tts.run_tts(text):
            all_audio.extend(audio)

        return all_audio

    async def get_month_data(month):
        """Return the month's text/image/audio dict, or None if the LLM gave no description."""
        messages = [
            {
                "role": "system",
                "content": f"Describe a nature photograph suitable for use in a calendar, for the month of {month}. Include only the image description with no preamble. Limit the description to one sentence, please.",
            }
        ]

        image_description = await llm.run_llm(messages)
        if not image_description:
            return None

        # Generate the narration audio and the image concurrently.
        to_speak = f"{month}: {image_description}"
        audio_task = asyncio.create_task(get_all_audio(to_speak))
        image_task = asyncio.create_task(dalle.run_image_gen(image_description))
        (audio, image_data) = await asyncio.gather(audio_task, image_task)

        return {
            "month": month,
            "text": image_description,
            "image_url": image_data[0],
            "image": image_data[1],
            "audio": audio,
        }

    months: list[str] = [
        "January",
        "February",
        "March",
        "April",
        "May",
        "June",
        "July",
        "August",
        "September",
        "October",
        "November",
        "December",
    ]

    @transport.event_handler("on_first_other_participant_joined")
    async def on_first_other_participant_joined(transport):
        # This will play the months in the order they're completed. The benefit
        # is we'll have as little delay as possible before the first month, and
        # likely no delay between months, but the months won't display in order.
        for month_data_task in asyncio.as_completed(month_tasks):
            data = await month_data_task
            if data is None:
                # get_month_data() returns None when the LLM produced no
                # description. Skip that month: subscripting None would raise
                # TypeError, abort the remaining months and never reach
                # stop_when_done().
                continue

            await transport.send_queue.put(
                [
                    ImageQueueFrame(data["image_url"], data["image"]),
                    AudioQueueFrame(data["audio"]),
                ]
            )

        # wait for the output queue to be empty, then leave the meeting
        await transport.stop_when_done()

    # Start all month tasks now so generation overlaps with waiting for a participant.
    month_tasks = [asyncio.create_task(get_month_data(month)) for month in months]

    await transport.run()
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: the room URL is required; unrecognised arguments are
    # collected by parse_known_args() instead of causing an error.
    cli = argparse.ArgumentParser(description="Simple Daily Bot Sample")
    cli.add_argument("-u", "--url", type=str, required=True, help="URL of the Daily room to join")

    args, unknown = cli.parse_known_args()

    asyncio.run(main(args.url))
|
||||
92
src/samples/foundational/06-listen-and-respond.py
Normal file
@@ -0,0 +1,92 @@
|
||||
import argparse
|
||||
import asyncio
|
||||
import requests
|
||||
import time
|
||||
import urllib.parse
|
||||
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
|
||||
from dailyai.queue_aggregators import LLMContextAggregator
|
||||
|
||||
|
||||
async def main(room_url: str, token):
    """Join ``room_url`` and answer what participants say with an LLM voiced by TTS.

    Frames received from the transport pass through a "user" context
    aggregator, the LLM, an "assistant" context aggregator and finally TTS
    into the transport's send queue. The transport and this pipeline run
    concurrently.

    Args:
        room_url: URL of the room, passed as the first argument to
            ``DailyTransportService``.
        token: Passed as the second argument to ``DailyTransportService``.
    """
    global transport, llm, tts

    transport = DailyTransportService(room_url, token, "Respond bot", 5)
    transport.mic_enabled = True
    transport.mic_sample_rate = 16000
    transport.camera_enabled = False

    llm = AzureLLMService()
    tts = AzureTTSService()

    @transport.event_handler("on_first_other_participant_joined")
    async def on_first_other_participant_joined(transport):
        await tts.say("Hi, I'm listening!", transport.send_queue)

    async def handle_transcriptions():
        # Both aggregators are given the same message list.
        messages = [
            {"role": "system", "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way."},
        ]
        bot_id = transport.my_participant_id
        tma_in = LLMContextAggregator(messages, "user", bot_id)
        tma_out = LLMContextAggregator(messages, "assistant", transport.my_participant_id)

        # Build the pipeline stage by stage: received frames -> user context
        # -> LLM -> assistant context -> TTS -> send queue.
        send_queue = transport.send_queue
        received = transport.get_receive_frames()
        user_context = tma_in.run(received)
        llm_output = llm.run(user_context)
        assistant_context = tma_out.run(llm_output)
        await tts.run_to_queue(send_queue, assistant_context)

    transport.transcription_settings["extra"]["punctuate"] = True
    await asyncio.gather(transport.run(), handle_transcriptions())
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: create a meeting token for the room via the
    # Daily REST API, then run the bot with it.
    parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
    parser.add_argument(
        "-u", "--url", type=str, required=True, help="URL of the Daily room to join"
    )
    parser.add_argument(
        "-k",
        "--apikey",
        type=str,
        required=True,
        help="Daily API Key (needed to create token)",
    )

    args, unknown = parser.parse_known_args()

    # Create a meeting token for the given room with an expiration 1 hour in the future.
    # The room name is the URL path; strip slashes on both ends so a room URL
    # with a trailing slash does not yield a name such as "room/".
    room_name: str = urllib.parse.urlparse(args.url).path.strip("/")
    expiration: float = time.time() + 60 * 60

    res: requests.Response = requests.post(
        "https://api.daily.co/v1/meeting-tokens",
        headers={"Authorization": f"Bearer {args.apikey}"},
        json={
            "properties": {"room_name": room_name, "is_owner": True, "exp": expiration}
        },
        # Without a timeout, requests waits indefinitely on an unresponsive server.
        timeout=30,
    )

    if res.status_code != 200:
        raise Exception(f"Failed to create meeting token: {res.status_code} {res.text}")

    token: str = res.json()["token"]

    asyncio.run(main(args.url, token))
|
||||
174
src/samples/foundational/06a-golden-kitty.py
Normal file
@@ -0,0 +1,174 @@
|
||||
import argparse
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
import requests
|
||||
import time
|
||||
import urllib.parse
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from PIL import Image
|
||||
|
||||
load_dotenv()
|
||||
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.services.fal_ai_services import FalImageGenService
|
||||
from dailyai.services.open_ai_services import OpenAIImageGenService
|
||||
from dailyai.queue_aggregators import LLMContextAggregator
|
||||
from dailyai.queue_frame import LLMMessagesQueueFrame, QueueFrame, TextQueueFrame, ImageQueueFrame, ImageListQueueFrame
|
||||
from dailyai.services.ai_services import AIService
|
||||
|
||||
from typing import AsyncGenerator, List
|
||||
|
||||
# Raw image bytes for each sprite, keyed by file name including the extension
# (for example "cat1.png").
sprites = {}
image_files = [
    'cat1.png',
    'cat2.png',
    'cat3.png',
]

script_dir = os.path.dirname(__file__)

for file in image_files:
    # Build the full path to the image file
    full_path = os.path.join(script_dir, "images", file)
    # Open the image and convert it to bytes. The dictionary key is the file
    # name with its extension; the extension-less name computed here before
    # was never used and has been removed.
    with Image.open(full_path) as img:
        sprites[file] = img.tobytes()

# Frame showing the first sprite.
quiet_frame = ImageQueueFrame("", sprites["cat1.png"])

# A list of 30 randomly chosen sprites, wrapped in a single image-list frame.
sprite_list = list(sprites.values())
talking = [random.choice(sprite_list) for _ in range(30)]
talking_frame = ImageListQueueFrame(images=talking)
|
||||
class TranscriptFilter(AIService):
    """Pass frames through unless they are attributed to the bot itself."""

    def __init__(self, bot_participant_id=None):
        # Frames whose participantId equals this value are dropped.
        self.bot_participant_id = bot_participant_id

    async def process_frame(self, frame:QueueFrame) -> AsyncGenerator[QueueFrame, None]:
        """Yield ``frame`` unless its ``participantId`` matches the bot's id.

        Frames that carry no ``participantId`` attribute are passed through
        instead of raising ``AttributeError``; this sample itself creates
        frames such as ``TextQueueFrame(out)`` without a participant id.
        """
        if not hasattr(frame, "participantId"):
            yield frame
        elif frame.participantId != self.bot_participant_id:
            yield frame
|
||||
|
||||
class NameCheckFilter(AIService):
    """Buffer text into sentences and forward only those mentioning a watched name."""

    def __init__(self, names=None):
        # Names to look for. None is treated as "no names", so the filter
        # forwards nothing instead of raising TypeError when it iterates.
        self.names = names if names is not None else []
        # Text accumulated since the last completed sentence.
        self.sentence = ""

    async def process_frame(self, frame:QueueFrame) -> AsyncGenerator[QueueFrame, None]:
        """Accumulate text frames; yield the sentence once complete if it names the bot.

        Non-text frames add nothing to the buffer and produce no output. A
        sentence counts as complete when the buffer ends with ".", "?" or "!".
        A completed sentence is yielded as a ``TextQueueFrame`` if it contains
        any of ``self.names``; otherwise it is dropped. The buffer is cleared
        in both cases.
        """
        # TODO: split up transcription by participant
        if not isinstance(frame, TextQueueFrame):
            return

        self.sentence += frame.text
        if not self.sentence.endswith((".", "?", "!")):
            return

        # The sentence is complete: take it out of the buffer before deciding.
        out = self.sentence
        self.sentence = ""

        if any(name in out for name in self.names):
            print(f"I got one: {frame.text}")
            yield TextQueueFrame(out)
        else:
            print(f"ignoring: {out}")
|
||||
|
||||
async def main(room_url:str, token):
    """Join ``room_url`` as "Derrick" and answer sentences that mention the bot's name.

    Frames received from the transport pass through a TranscriptFilter, a
    NameCheckFilter, a "user" context aggregator, the LLM, an "assistant"
    context aggregator and finally TTS into the transport's send queue. The
    quiet sprite frame is queued once. The transport, the pipeline and the
    sprite task run concurrently.

    Args:
        room_url: URL of the room, passed as the first argument to
            ``DailyTransportService``.
        token: Passed as the second argument to ``DailyTransportService``.
    """
    global transport, llm, tts

    transport = DailyTransportService(room_url, token, "Derrick", 180)
    transport.mic_enabled = True
    transport.mic_sample_rate = 16000
    transport.camera_enabled = True
    transport.camera_width = 960
    transport.camera_height = 960

    llm = AzureLLMService()
    tts = ElevenLabsTTSService()

    @transport.event_handler("on_first_other_participant_joined")
    async def on_first_other_participant_joined(transport):
        await tts.say("Hi, I'm listening!", transport.send_queue)

    async def handle_transcriptions():
        # Both aggregators are given the same message list.
        messages = [
            {"role": "system", "content": "You are Derek, the Golden Kitty, the mascot for Product Hunt's annual awards. You are a cat who knows everything about all the cool new tech startups. You should be clever, and a bit sarcastic. You should also tell jokes every once in a while. Your responses should only be a few sentences long."},
        ]

        tma_in = LLMContextAggregator(messages, "user", transport.my_participant_id)
        tma_out = LLMContextAggregator(messages, "assistant", transport.my_participant_id)
        tf = TranscriptFilter(transport.my_participant_id)
        # Both spellings are listed as names to match.
        ncf = NameCheckFilter(["Derek", "Derrick"])

        # Build the pipeline stage by stage: received frames -> transcript
        # filter -> name check -> user context -> LLM -> assistant context
        # -> TTS -> send queue.
        send_queue = transport.send_queue
        received = transport.get_receive_frames()
        not_from_bot = tf.run(received)
        addressed_to_bot = ncf.run(not_from_bot)
        user_context = tma_in.run(addressed_to_bot)
        llm_output = llm.run(user_context)
        assistant_context = tma_out.run(llm_output)
        await tts.run_to_queue(send_queue, assistant_context)

    async def make_cats():
        # Queue the quiet sprite frame once.
        await transport.send_queue.put(quiet_frame)

    transport.transcription_settings["extra"]["punctuate"] = True
    await asyncio.gather(transport.run(), handle_transcriptions(), make_cats())
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: parse the room URL and API key, mint a meeting
    # token through the Daily REST API, then run the bot.
    parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
    parser.add_argument(
        "-u", "--url", type=str, required=True, help="URL of the Daily room to join"
    )
    parser.add_argument(
        "-k",
        "--apikey",
        type=str,
        required=True,
        help="Daily API Key (needed to create token)",
    )

    # parse_known_args tolerates unrecognised flags; `unknown` is not used.
    args, unknown = parser.parse_known_args()

    # Create a meeting token for the given room with an expiration 24 hours in the future.
    # The room name is the URL path without its leading slash.
    room_name: str = urllib.parse.urlparse(args.url).path[1:]
    expiration: float = time.time() + 60 * 60 * 24

    res: requests.Response = requests.post(
        "https://api.daily.co/v1/meeting-tokens",
        headers={"Authorization": f"Bearer {args.apikey}"},
        json={
            "properties": {"room_name": room_name, "is_owner": True, "exp": expiration}
        },
        # Without a timeout, requests waits indefinitely on an unresponsive server.
        timeout=30,
    )

    if res.status_code != 200:
        raise Exception(f"Failed to create meeting token: {res.status_code} {res.text}")

    token: str = res.json()["token"]

    asyncio.run(main(args.url, token))
|
||||
134
src/samples/foundational/06a-image-sync.py
Normal file
@@ -0,0 +1,134 @@
|
||||
import argparse
|
||||
import asyncio
|
||||
from typing import AsyncGenerator
|
||||
import requests
|
||||
import time
|
||||
import urllib.parse
|
||||
|
||||
from PIL import Image
|
||||
from dailyai.queue_frame import ImageQueueFrame, QueueFrame
|
||||
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
|
||||
from dailyai.services.ai_services import AIService
|
||||
from dailyai.queue_aggregators import LLMAssistantContextAggregator, LLMUserContextAggregator
|
||||
from dailyai.services.fal_ai_services import FalImageGenService
|
||||
|
||||
|
||||
class ImageSyncAggregator(AIService):
    """Wrap every frame between a "speaking" image frame and a "waiting" image frame."""

    def __init__(self, speaking_path: str, waiting_path: str):
        # Each image is opened and converted to raw bytes once, up front.
        self._speaking_image, self._speaking_image_bytes = self._load(speaking_path)
        self._waiting_image, self._waiting_image_bytes = self._load(waiting_path)

    @staticmethod
    def _load(path: str):
        """Open the image at `path` and return it together with its raw bytes."""
        image = Image.open(path)
        return image, image.tobytes()

    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
        """Yield the speaking image, then `frame` itself, then the waiting image."""
        speaking_bytes = self._speaking_image_bytes
        yield ImageQueueFrame(None, speaking_bytes)

        yield frame

        waiting_bytes = self._waiting_image_bytes
        yield ImageQueueFrame(None, waiting_bytes)
|
||||
|
||||
async def main(room_url: str, token):
    """Set up the transport, LLM and TTS services and run the image-synced bot.

    Each frame produced by the LLM pipeline is wrapped by ImageSyncAggregator
    between a "speaking" image and a "waiting" image before it reaches TTS.
    """
    # Local import: only needed here, to locate the images next to this script.
    import os

    global transport
    global llm
    global tts

    transport = DailyTransportService(
        room_url,
        token,
        "Respond bot",
        5,
    )
    transport.camera_enabled = True
    transport.camera_width = 1024
    transport.camera_height = 1024
    transport.mic_enabled = True
    transport.mic_sample_rate = 16000

    llm = AzureLLMService()
    tts = AzureTTSService()
    img = FalImageGenService(image_size="1024x1024")

    # NOTE(review): get_images is defined but never called in this function;
    # the images actually used are loaded from disk below.
    async def get_images():
        """Generate the speaking and waiting images concurrently."""
        get_speaking_task = asyncio.create_task(
            img.run_image_gen("An image of a cat speaking")
        )
        get_waiting_task = asyncio.create_task(
            img.run_image_gen("An image of a cat waiting")
        )

        (speaking_data, waiting_data) = await asyncio.gather(
            get_speaking_task, get_waiting_task
        )

        return speaking_data, waiting_data

    @transport.event_handler("on_first_other_participant_joined")
    async def on_first_other_participant_joined(transport):
        await tts.say("Hi, I'm listening!", transport.send_queue)

    async def handle_transcriptions():
        """Build the frame pipeline and run it to completion."""
        messages = [
            {"role": "system", "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way."},
        ]

        tma_in = LLMUserContextAggregator(
            messages, transport.my_participant_id
        )
        tma_out = LLMAssistantContextAggregator(
            messages, transport.my_participant_id
        )

        # Resolve the images relative to this file instead of a path that
        # only exists on one developer's machine.
        sample_dir = os.path.dirname(os.path.abspath(__file__))
        image_sync_aggregator = ImageSyncAggregator(
            os.path.join(sample_dir, "speaking.png"),
            os.path.join(sample_dir, "waiting.png"),
        )

        # receive -> user ctx -> LLM -> assistant ctx -> image sync -> TTS
        await tts.run_to_queue(
            transport.send_queue,
            image_sync_aggregator.run(
                tma_out.run(
                    llm.run(
                        tma_in.run(
                            transport.get_receive_frames()
                        )
                    )
                )
            )
        )

    transport.transcription_settings["extra"]["punctuate"] = True
    await asyncio.gather(transport.run(), handle_transcriptions())
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: parse the room URL and API key, mint a meeting
    # token through the Daily REST API, then run the bot.
    parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
    parser.add_argument(
        "-u", "--url", type=str, required=True, help="URL of the Daily room to join"
    )
    parser.add_argument(
        "-k",
        "--apikey",
        type=str,
        required=True,
        help="Daily API Key (needed to create token)",
    )

    # parse_known_args tolerates unrecognised flags; `unknown` is not used.
    args, unknown = parser.parse_known_args()

    # Create a meeting token for the given room with an expiration 1 hour in the future.
    # The room name is the URL path without its leading slash.
    room_name: str = urllib.parse.urlparse(args.url).path[1:]
    expiration: float = time.time() + 60 * 60

    res: requests.Response = requests.post(
        "https://api.daily.co/v1/meeting-tokens",
        headers={"Authorization": f"Bearer {args.apikey}"},
        json={
            "properties": {"room_name": room_name, "is_owner": True, "exp": expiration}
        },
        # Without a timeout, requests waits indefinitely on an unresponsive server.
        timeout=30,
    )

    if res.status_code != 200:
        raise Exception(f"Failed to create meeting token: {res.status_code} {res.text}")

    token: str = res.json()["token"]

    asyncio.run(main(args.url, token))
|
||||
99
src/samples/foundational/07-interruptible.py
Normal file
@@ -0,0 +1,99 @@
|
||||
import argparse
|
||||
import asyncio
|
||||
import requests
|
||||
import time
|
||||
import urllib.parse
|
||||
from dailyai.conversation_wrappers import InterruptibleConversationWrapper
|
||||
|
||||
from dailyai.queue_frame import StartStreamQueueFrame, TextQueueFrame
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.azure_ai_services import AzureLLMService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
|
||||
|
||||
async def main(room_url: str, token):
    """Set up the transport, LLM and TTS services and run an interruptible conversation.

    Conversation handling is delegated to InterruptibleConversationWrapper,
    which is given `run_response` as the runner for each piece of user speech.
    """
    global transport
    global llm
    global tts

    transport = DailyTransportService(room_url, token, "Respond bot", 5)
    transport.mic_enabled = True
    transport.mic_sample_rate = 16000
    transport.camera_enabled = False
    transport.start_transcription = True

    llm = AzureLLMService()
    tts = ElevenLabsTTSService(voice_id="ErXwobaYiN019PkySvjV")

    async def run_response(user_speech, tma_in, tma_out):
        """Run one user utterance through the LLM and speak the reply."""
        sink = transport.send_queue
        # The pipeline input is a start-of-stream marker followed by the user's text.
        seed_frames = [StartStreamQueueFrame(), TextQueueFrame(user_speech)]
        frames = tma_in.run(seed_frames)
        frames = llm.run(frames)
        frames = tma_out.run(frames)
        await tts.run_to_queue(sink, frames)

    @transport.event_handler("on_first_other_participant_joined")
    async def on_first_other_participant_joined(transport):
        # Greet as soon as the first other participant joins.
        await tts.say("Hi, I'm listening!", transport.send_queue)

    async def run_conversation():
        """Create the conversation wrapper and run it to completion."""
        messages = [
            {"role": "system", "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way."},
        ]

        # frame_generator and interrupt are passed as callables, not called here.
        wrapper_options = dict(
            frame_generator=transport.get_receive_frames,
            runner=run_response,
            interrupt=transport.interrupt,
            my_participant_id=transport.my_participant_id,
            llm_messages=messages,
        )
        conversation_wrapper = InterruptibleConversationWrapper(**wrapper_options)
        await conversation_wrapper.run_conversation()

    transport.transcription_settings["extra"]["punctuate"] = False
    await asyncio.gather(transport.run(), run_conversation())
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: parse the room URL and API key, mint a meeting
    # token through the Daily REST API, then run the bot.
    parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
    parser.add_argument(
        "-u", "--url", type=str, required=True, help="URL of the Daily room to join"
    )
    parser.add_argument(
        "-k",
        "--apikey",
        type=str,
        required=True,
        help="Daily API Key (needed to create token)",
    )

    # parse_known_args tolerates unrecognised flags; `unknown` is not used.
    args, unknown = parser.parse_known_args()

    # Create a meeting token for the given room with an expiration 1 hour in the future.
    # The room name is the URL path without its leading slash.
    room_name: str = urllib.parse.urlparse(args.url).path[1:]
    expiration: float = time.time() + 60 * 60

    res: requests.Response = requests.post(
        "https://api.daily.co/v1/meeting-tokens",
        headers={"Authorization": f"Bearer {args.apikey}"},
        json={
            "properties": {"room_name": room_name, "is_owner": True, "exp": expiration}
        },
        # Without a timeout, requests waits indefinitely on an unresponsive server.
        timeout=30,
    )

    if res.status_code != 200:
        raise Exception(f"Failed to create meeting token: {res.status_code} {res.text}")

    token: str = res.json()["token"]

    asyncio.run(main(args.url, token))
|
||||
@@ -1,22 +1,22 @@
|
||||
import argparse
|
||||
import asyncio
|
||||
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.whisper_ai_services import WhisperSTTService
|
||||
|
||||
from examples.foundational.support.runner import configure
|
||||
|
||||
|
||||
async def main(room_url: str):
|
||||
global transport
|
||||
global stt
|
||||
|
||||
transport = DailyTransportService(
|
||||
room_url,
|
||||
None,
|
||||
"Transcription bot",
|
||||
start_transcription=True,
|
||||
mic_enabled=False,
|
||||
camera_enabled=False,
|
||||
speaker_enabled=True
|
||||
)
|
||||
|
||||
transport.mic_enabled = False
|
||||
transport.camera_enabled = False
|
||||
transport.speaker_enabled = True
|
||||
stt = WhisperSTTService()
|
||||
transcription_output_queue = asyncio.Queue()
|
||||
|
||||
@@ -35,5 +35,10 @@ async def main(room_url: str):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url))
|
||||
parser = argparse.ArgumentParser(description="Simple Daily Bot Sample")
|
||||
parser.add_argument(
|
||||
"-u", "--url", type=str, required=True, help="URL of the Daily room to join"
|
||||
)
|
||||
|
||||
args, unknown = parser.parse_known_args()
|
||||
asyncio.run(main(args.url))
|
||||
BIN
src/samples/foundational/images/cat1.png
Normal file
|
After Width: | Height: | Size: 1.5 MiB |
BIN
src/samples/foundational/images/cat2.png
Normal file
|
After Width: | Height: | Size: 1.6 MiB |
BIN
src/samples/foundational/images/cat3.png
Normal file
|
After Width: | Height: | Size: 1.6 MiB |
|
Before Width: | Height: | Size: 33 KiB After Width: | Height: | Size: 33 KiB |
|
Before Width: | Height: | Size: 30 KiB After Width: | Height: | Size: 30 KiB |
@@ -23,11 +23,11 @@ async def main(room_url: str, token):
|
||||
"Imagebot",
|
||||
1,
|
||||
)
|
||||
transport._mic_enabled = True
|
||||
transport._camera_enabled = True
|
||||
transport._mic_sample_rate = 16000
|
||||
transport._camera_width = 1024
|
||||
transport._camera_height = 1024
|
||||
transport.mic_enabled = True
|
||||
transport.camera_enabled = True
|
||||
transport.mic_sample_rate = 16000
|
||||
transport.camera_width = 1024
|
||||
transport.camera_height = 1024
|
||||
|
||||
llm = AzureLLMService()
|
||||
tts = AzureTTSService()
|
||||
@@ -39,7 +39,7 @@ async def main(room_url: str, token):
|
||||
sentence = ""
|
||||
async for message in transport.get_transcriptions():
|
||||
print(f"transcription message: {message}")
|
||||
if message["session_id"] == transport._my_participant_id:
|
||||
if message["session_id"] == transport.my_participant_id:
|
||||
continue
|
||||
finder = message["text"].find("start over")
|
||||
print(f"finder: {finder}")
|
||||