Compare commits
10 Commits
v0.0.4
...
khk-functi
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5d6d674ff6 | ||
|
|
1e552958aa | ||
|
|
17edfe98bd | ||
|
|
5100a7599b | ||
|
|
18c2b37358 | ||
|
|
0244f358d2 | ||
|
|
85fe6c0580 | ||
|
|
ae7482ed18 | ||
|
|
90d928be99 | ||
|
|
0703b926a3 |
@@ -1,30 +0,0 @@
|
||||
# flyctl launch added from .gitignore
|
||||
**/.vscode
|
||||
**/env
|
||||
**/__pycache__
|
||||
**/*~
|
||||
**/venv
|
||||
#*#
|
||||
|
||||
# Distribution / packaging
|
||||
**/.Python
|
||||
**/build
|
||||
**/develop-eggs
|
||||
**/dist
|
||||
**/downloads
|
||||
**/eggs
|
||||
**/.eggs
|
||||
**/lib
|
||||
**/lib64
|
||||
**/parts
|
||||
**/sdist
|
||||
**/var
|
||||
**/wheels
|
||||
**/share/python-wheels
|
||||
**/*.egg-info
|
||||
**/.installed.cfg
|
||||
**/*.egg
|
||||
**/MANIFEST
|
||||
**/.DS_Store
|
||||
**/.env
|
||||
fly.toml
|
||||
44
.github/workflows/build.yaml
vendored
@@ -1,44 +0,0 @@
|
||||
name: build
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
pull_request:
|
||||
branches:
|
||||
- "**"
|
||||
paths-ignore:
|
||||
- "docs/**"
|
||||
|
||||
concurrency:
|
||||
group: build-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
build:
|
||||
name: "Build and Install"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Set up Python
|
||||
id: setup_python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.10'
|
||||
- name: Setup virtual environment
|
||||
run: |
|
||||
python -m venv .venv
|
||||
- name: Install basic Python dependencies
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
python -m pip install --upgrade pip
|
||||
pip install -r dev-requirements.txt
|
||||
- name: Build project
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
python -m build
|
||||
- name: Install project and other Python dependencies
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
pip install --editable .
|
||||
44
.github/workflows/lint.yaml
vendored
@@ -1,44 +0,0 @@
|
||||
name: lint
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
pull_request:
|
||||
branches:
|
||||
- "**"
|
||||
paths-ignore:
|
||||
- "docs/**"
|
||||
|
||||
concurrency:
|
||||
group: build-lint-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
autopep8:
|
||||
name: "Formatting lints"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repo
|
||||
uses: actions/checkout@v4
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.10'
|
||||
- name: Setup virtual environment
|
||||
run: |
|
||||
python -m venv .venv
|
||||
- name: Install development Python dependencies
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
python -m pip install --upgrade pip
|
||||
pip install -r dev-requirements.txt
|
||||
- name: autopep8
|
||||
id: autopep8
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
autopep8 --max-line-length 100 --exit-code -r -d --exclude "*_pb2.py" -a -a src/
|
||||
- name: Fail if autopep8 requires changes
|
||||
if: steps.autopep8.outputs.exit-code == 2
|
||||
run: exit 1
|
||||
62
.github/workflows/publish.yaml
vendored
@@ -1,62 +0,0 @@
|
||||
name: publish
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
gitref:
|
||||
type: string
|
||||
description: "what git ref to build"
|
||||
required: true
|
||||
|
||||
jobs:
|
||||
build:
|
||||
name: "Build and upload wheels"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repo
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event.inputs.gitref }}
|
||||
- name: Set up Python
|
||||
id: setup_python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.10'
|
||||
- name: Setup virtual environment
|
||||
run: |
|
||||
python -m venv .venv
|
||||
- name: Install basic Python dependencies
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
python -m pip install --upgrade pip
|
||||
pip install -r dev-requirements.txt
|
||||
- name: Build project
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
python -m build
|
||||
- name: Upload wheels
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: wheels
|
||||
path: ./dist
|
||||
|
||||
publish-to-pypi:
|
||||
name: "Publish to PyPI"
|
||||
runs-on: ubuntu-latest
|
||||
needs: [ build ]
|
||||
environment:
|
||||
name: pypi
|
||||
url: https://pypi.org/p/dailyai
|
||||
permissions:
|
||||
id-token: write
|
||||
steps:
|
||||
- name: Download wheels
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: wheels
|
||||
path: ./dist
|
||||
- name: Publish to PyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
verbose: true
|
||||
print-hash: true
|
||||
59
.github/workflows/publish_test.yaml
vendored
@@ -1,59 +0,0 @@
|
||||
name: publish-test
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
jobs:
|
||||
build:
|
||||
name: "Build and upload wheels"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repo
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event.inputs.gitref }}
|
||||
- name: Set up Python
|
||||
id: setup_python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.10'
|
||||
- name: Setup virtual environment
|
||||
run: |
|
||||
python -m venv .venv
|
||||
- name: Install basic Python dependencies
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
python -m pip install --upgrade pip
|
||||
pip install -r dev-requirements.txt
|
||||
- name: Build project
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
python -m build
|
||||
- name: Upload wheels
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: ./dist
|
||||
|
||||
publish-to-pypi:
|
||||
name: "Test publish to PyPI"
|
||||
runs-on: ubuntu-latest
|
||||
needs: [ build ]
|
||||
environment:
|
||||
name: pypi
|
||||
url: https://pypi.org/p/dailyai
|
||||
permissions:
|
||||
id-token: write
|
||||
steps:
|
||||
- name: Download wheels
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
path: ./dist
|
||||
- name: Publish to PyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
verbose: true
|
||||
print-hash: true
|
||||
repository-url: https://test.pypi.org/legacy/
|
||||
49
.github/workflows/tests.yaml
vendored
@@ -1,49 +0,0 @@
|
||||
name: test
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
pull_request:
|
||||
branches:
|
||||
- "**"
|
||||
paths-ignore:
|
||||
- "docs/**"
|
||||
|
||||
concurrency:
|
||||
group: build-test-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
test:
|
||||
name: "Unit and Integration Tests"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Set up Python
|
||||
id: setup_python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.10'
|
||||
- name: Cache virtual environment
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
# We are hashing requirements-dev.txt and requirements-extra.txt which
|
||||
# contain all dependencies needed to run the tests and examples.
|
||||
key: venv-${{ runner.os }}-${{ steps.setup_python.outputs.python-version}}-${{ hashFiles('linux-py3.10-requirements.txt') }}-${{ hashFiles('dev-requirements.txt') }}
|
||||
path: .venv
|
||||
- name: Install system packages
|
||||
run: sudo apt-get install -y portaudio19-dev
|
||||
- name: Setup virtual environment
|
||||
run: |
|
||||
python -m venv .venv
|
||||
- name: Install basic Python dependencies
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
python -m pip install --upgrade pip
|
||||
pip install -r linux-py3.10-requirements.txt -r dev-requirements.txt
|
||||
- name: Test with pytest
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
pytest --doctest-modules --ignore-glob="*to_be_updated*" src tests
|
||||
1
.gitignore
vendored
@@ -26,4 +26,3 @@ share/python-wheels/
|
||||
MANIFEST
|
||||
.DS_Store
|
||||
.env
|
||||
fly.toml
|
||||
235
README.md
@@ -1,103 +1,33 @@
|
||||
# dailyai — an open source framework for real-time, multi-modal, conversational AI applications
|
||||
# Daily AI SDK
|
||||
|
||||
Build things like this:
|
||||
Build conversational, multi-modal AI apps with real-time voice and video, like this:
|
||||
|
||||
[](https://www.youtube.com/watch?v=lDevgsp9vn0)
|
||||
_Demo Video to come_
|
||||
|
||||
**`dailyai` started as a toolkit for implementing generative AI voice bots.** Things like personal coaches, meeting assistants, story-telling toys for kids, customer support bots, and snarky social companions.
|
||||
With built-in support for many of the best AI platforms (or [add your own](/docs)):
|
||||
|
||||
In 2023 a *lot* of us got excited about the possibility of having open-ended conversations with LLMs. It became clear pretty quickly that we were all solving the same [low-level problems](https://www.daily.co/blog/how-to-talk-to-an-llm-with-your-voice/):
|
||||
- low-latency, reliable audio transport
|
||||
- echo cancellation
|
||||
- phrase endpointing (knowing when the bot should respond to human speech)
|
||||
- interruptibility
|
||||
- writing clean code to stream data through "pipelines" of speech-to-text, LLM inference, and text-to-speech models
|
||||
- Azure - DALL-E, ChatGPT, and Azure AI Text-to-Speech
|
||||
- Deepgram - Speech-to-text, and Aura text-to-speech
|
||||
- Eleven Labs text-to-speech
|
||||
- Fal.ai image generation
|
||||
- OpenAI DALL-E and ChatGPT
|
||||
- Whisper local speech-to-text
|
||||
|
||||
As our applications expanded to include additional things like image generation, function calling, and vision models, we started to think about what a complete framework for these kinds of apps could look like.
|
||||
## Step 1: Get Started
|
||||
|
||||
Today, `dailyai` is:
|
||||
|
||||
1. a set of code building blocks for interacting with generative AI services and creating low-latency, interruptible data pipelines that use multiple services
|
||||
2. transport services that moves audio, video, and events across the Internet
|
||||
3. implementations of specific generative AI services
|
||||
|
||||
Currently implemented services:
|
||||
- Speech-to-text
|
||||
- Deepgram
|
||||
- Whisper
|
||||
- LLMs
|
||||
- Azure
|
||||
- OpenAI
|
||||
- Image generation
|
||||
- Azure
|
||||
- Fal
|
||||
- OpenAI
|
||||
- Text-to-speech
|
||||
- Azure
|
||||
- Deepgram
|
||||
- ElevenLabs
|
||||
- Transport
|
||||
- Daily
|
||||
- Local (in progress, intended as a quick start example service)
|
||||
|
||||
If you'd like to [implement a service]((https://github.com/daily-co/daily-ai-sdk/tree/main/src/dailyai/services)), we welcome PRs! Our goal is to support lots of services in all of the above categories, plus new categories (like real-time video) as they emerge.
|
||||
|
||||
## Getting started
|
||||
|
||||
Today, the easiest way to get started with `dailyai` is to use [Daily](https://www.daily.co/) as your transport service. This toolkit started life as an internal SDK at Daily and millions of minutes of AI conversation have been served using it and its earlier prototype incarnations. (The [transport base class](https://github.com/daily-co/daily-ai-sdk/blob/main/src/dailyai/transports/abstract_transport.py) is easy to extend, though, so feel free to submit PRs if you'd like to implement another transport service.)
|
||||
|
||||
```
|
||||
# install the module
|
||||
pip install dailyai
|
||||
|
||||
# set up an .env file with API keys
|
||||
cp dot-env.template .env
|
||||
```
|
||||
|
||||
By default, in order to minimize dependencies, only the basic framework functionality is available. Some third-party AI services require additional
|
||||
dependencies that you can install with:
|
||||
|
||||
```
|
||||
pip install dailyai[option,...]
|
||||
```
|
||||
|
||||
Your project may or may not need these, so they're made available as optional requirements. Here is a list:
|
||||
|
||||
- **AI services**: `anthropic`, `azure`, `fal`, `openai`, `playht`, `silero`, `whisper`
|
||||
- **Transports**: `daily`, `local`, `websocket`
|
||||
|
||||
## Code examples
|
||||
|
||||
There are two directories of examples:
|
||||
|
||||
- [foundational](https://github.com/daily-co/daily-ai-sdk/tree/main/examples/foundational) — demos that build on each other, introducing one or two concepts at a time
|
||||
- [starter apps](https://github.com/daily-co/daily-ai-sdk/tree/main/examples/starter-apps) — complete applications that you can use as starting points for development
|
||||
|
||||
Before running the examples you need to install the dependencies (which will install all the dependencies to run all of the examples):
|
||||
|
||||
```
|
||||
pip install -r {env}-requirements.txt
|
||||
```
|
||||
|
||||
To run the example below you need to sign up for a [free Daily account](https://dashboard.daily.co/u/signup) and create a Daily room (so you can hear the LLM talking). After that, join the room's URL directly from a browser tab and run:
|
||||
|
||||
```
|
||||
python examples/foundational/02-llm-say-one-thing.py
|
||||
```
|
||||
|
||||
## Hacking on the framework itself
|
||||
## Build/Install
|
||||
|
||||
_Note that you may need to set up a virtual environment before following the instructions below. For instance, you might need to run the following from the root of the repo:_
|
||||
|
||||
```
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
python3 -m venv env
|
||||
source env/bin/activate
|
||||
```
|
||||
|
||||
From the root of this repo, run the following:
|
||||
|
||||
```
|
||||
pip install -r {env}-requirements.txt -r dev-requirements.txt
|
||||
pip install -r requirements.txt
|
||||
python -m build
|
||||
```
|
||||
|
||||
@@ -113,54 +43,117 @@ If you want to use this package from another directory, you can run:
|
||||
pip install path_to_this_repo
|
||||
```
|
||||
|
||||
### Running tests
|
||||
## Running the samples
|
||||
|
||||
From the root directory, run:
|
||||
Tou can run the simple sample like so:
|
||||
|
||||
```
|
||||
pytest --doctest-modules --ignore-glob="*to_be_updated*" src tests
|
||||
python src/examples/theoretical-to-real/01-say-one-thing.py -u <url of your Daily meeting> -k <your Daily API Key>
|
||||
```
|
||||
## Overview
|
||||
|
||||
The Daily AI SDK allows you to build applications that can participate in WebRTC sessions and interact with AI Services. Some examples of what you can build with this:
|
||||
|
||||
- conversational bots that interact 1:1 with a user, using voice recognition and text-to-speech
|
||||
- assistant bots that aggregate transcriptions from multiple participants in a meeting and provide realtime summaries or other AI-generated output.
|
||||
- image-recognition bots
|
||||
- etc
|
||||
|
||||
## Concepts
|
||||
|
||||
### Transport Service
|
||||
|
||||
The SDK provides one “transport service”, which is a wrapper around Daily’s `daily-python` client (tk add link). You can use this service to listen for events related to a WebRTC session, such as “a participant joined the meeting”.
|
||||
The transport service also exposes a send queue, and a receive queue. You can use the send queue to send audio and video to the WebRTC session, and you can listen to the receive queue to see audio, video and transcription data from the WebRTC session.
|
||||
|
||||
### AI Services
|
||||
|
||||
The AI Service classes provide wrappers around various AI providers, and allow you to query LLMs, convert text to speech and make images from text. The audio and images can then be placed on the transport service’s send queue, where they’ll be sent to the WebRTC session.
|
||||
|
||||
### Queue Frames
|
||||
|
||||
Communication between the transport service and AI services, and between various AI services, takes place in Queue Frames. These frames contain an indication of the type of data as well as the data itself.
|
||||
|
||||
## Using Transports, AI Services and Frames
|
||||
|
||||
AI Services all define a `.run` method. This method consumes and generates `QueueFrame` frames. The kind of frames that can be consumed and generated depend on the kind of service. For instance, an LLM AI Service consumes `LLM_MESSAGE` frames (which define a history of interaction with an LLM) and emit `TEXT` frames (the response from the LLM).
|
||||
|
||||
The `.run` method is an `AsyncIterable`, and it takes an `iterable`, `AsyncIterable` or `asyncio.Queue` that produces QueueFrames as a parameter. This makes it easy to chain AI Services, and consume input from the Transport’s `receive_queue` .
|
||||
|
||||
AI Services also have a `.run_to_queue` method. This method is not an AsyncIterable, but instead sends processed QueueFrames to a queue. This makes it easy to send the output of an AI Service to the Transport’s `send_queue`.
|
||||
|
||||
AI Services also define convenience functions that let you bypass creating QueueFrames for some simple cases (eg. using the TTS service to convert a string to audio output and send that audio to the transport’s `send_queue`). See below for examples.
|
||||
|
||||
## Examples
|
||||
|
||||
### Say Something
|
||||
|
||||
The base TTS AI service exposes a `.say` method. After creating a transport and TTS service, you can use this method like so:
|
||||
|
||||
```
|
||||
transport = DailyTransportService(...)
|
||||
tts = AzureTTSService()
|
||||
await tts.say("hello world", transport.send_queue)
|
||||
```
|
||||
|
||||
## Setting up your editor
|
||||
This will call the TTS service to render the text to audio frames, then put the audio frames on the transport’s send queue. The transport will then send those frames along to the WebRTC session.
|
||||
|
||||
This project uses strict [PEP 8](https://peps.python.org/pep-0008/) formatting.
|
||||
### Speak an LLM response
|
||||
|
||||
### Emacs
|
||||
Given a system prompt contained in a `messages` array, you can emit the LLM’s response as audio with a chain like this:
|
||||
|
||||
You can use [use-package](https://github.com/jwiegley/use-package) to install [py-autopep8](https://codeberg.org/ideasman42/emacs-py-autopep8) package and configure `autopep8` arguments:
|
||||
```
|
||||
transport = DailyTransportService(...) # setup parameters omitted
|
||||
tts = AzureTTSService()
|
||||
llm = AzureLLMService()
|
||||
messages = [...] # system prompt omitted for brevity
|
||||
|
||||
```elisp
|
||||
(use-package py-autopep8
|
||||
:ensure t
|
||||
:defer t
|
||||
:hook ((python-mode . py-autopep8-mode))
|
||||
:config
|
||||
(setq py-autopep8-options '("-a" "-a", "--max-line-length=100")))
|
||||
await tts.run_to_queue(
|
||||
transport.send_queue,
|
||||
llm.run([QueueFrame.LLM_MESSAGES, messages])
|
||||
)
|
||||
```
|
||||
|
||||
`autopep8` was installed in the `venv` environment described before, so you should be able to use [pyvenv-auto](https://github.com/ryotaro612/pyvenv-auto) to automatically load that environment inside Emacs.
|
||||
In this code, the LLM service object sends the messages to Azure’s OpenAI implementation, which streams chunks back asynchronously. Those chunks are aggregated by the TTS Service to ensure the best audio response (TTS works best when it gets complete sentence, so it can inflect correctly), then sent to Azure’s TTS service, converted to audio frames, and sent to the WebRTC session via the Daily transport.
|
||||
|
||||
```elisp
|
||||
(use-package pyvenv-auto
|
||||
:ensure t
|
||||
:defer t
|
||||
:hook ((python-mode . pyvenv-auto-run)))
|
||||
### Pre-cache an LLM response
|
||||
|
||||
Sometimes LLMs can be slower than we’d like for natural-feeling communication. Here’s an example where we take advantage of the time it takes to speak some pre-defined text to get a head start on the LLM response:
|
||||
|
||||
(TK link to 04- sample)
|
||||
|
||||
In this sample, we set up a buffer queue to receive the audio frames from the LLM response before while we are joining the call and start an asynchronous task to start filling this buffer:
|
||||
|
||||
```
|
||||
buffer_queue = asyncio.Queue()
|
||||
llm_response_task = asyncio.create_task(
|
||||
elevenlabs_tts.run_to_queue(
|
||||
buffer_queue,
|
||||
llm.run([QueueFrame(FrameType.LLM_MESSAGE, messages)]),
|
||||
True,
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
Then, when we’ve joined the call, we speak the static text:
|
||||
|
||||
```
|
||||
await azure_tts.say("My friend...", transport.send_queue)
|
||||
```
|
||||
|
||||
As that text is being spoken, the asynchronous LLM task continues in the background. When the text is done, we pull the frames off the buffer queue and put them in the transport’s `send_queue`:
|
||||
|
||||
```
|
||||
async def buffer_to_send_queue():
|
||||
while True:
|
||||
frame = await buffer_queue.get()
|
||||
await transport.send_queue.put(frame)
|
||||
buffer_queue.task_done()
|
||||
if frame.frame_type == FrameType.END_STREAM:
|
||||
break
|
||||
|
||||
await asyncio.gather(llm_response_task, buffer_to_send_queue())
|
||||
|
||||
```
|
||||
|
||||
### Visual Studio Code
|
||||
|
||||
Install the
|
||||
[autopep8](https://marketplace.visualstudio.com/items?itemName=ms-python.autopep8) extension. Then edit the user settings (_Ctrl-Shift-P_ `Open User Settings (JSON)`) and set it as the default Python formatter, enable formatting on save and configure `autopep8` arguments:
|
||||
|
||||
```json
|
||||
"[python]": {
|
||||
"editor.defaultFormatter": "ms-python.autopep8",
|
||||
"editor.formatOnSave": true
|
||||
},
|
||||
"autopep8.args": [
|
||||
"-a",
|
||||
"-a",
|
||||
"--max-line-length=100"
|
||||
],
|
||||
```
|
||||
One thing to note here is the last parameter to `run_to_queue` in the first code clause above: this causes the `run_to_queue` method to send an `END_STREAM` frame when it’s done rendering. This lets us know when to stop our `buffer_to_send_queue` task above.
|
||||
|
||||
@@ -1,6 +0,0 @@
|
||||
autopep8==2.0.4
|
||||
build==1.0.3
|
||||
pip-tools==7.4.1
|
||||
pytest==8.1.1
|
||||
setuptools==69.2.0
|
||||
setuptools_scm==8.0.4
|
||||
@@ -4,13 +4,9 @@
|
||||
|
||||
Learn about the thinking behind the SDK's design.
|
||||
|
||||
## [A Frame's Progress](frame-progress.md)
|
||||
|
||||
See how a Frame is processed through a Transport, a Pipeline, and a series of Frame Processors.
|
||||
|
||||
## [Example Code](examples/)
|
||||
|
||||
The repo includes several example apps in the `examples` directory. The docs explain how they work.
|
||||
The repo includes several example apps in the `src/examples` directory. The docs explain how they work.
|
||||
|
||||
## [API Reference](api/)
|
||||
|
||||
|
||||
@@ -1,17 +1,2 @@
|
||||
# Daily AI SDK Architecture Guide
|
||||
|
||||
## Frames
|
||||
|
||||
Frames can represent discrete chunks of data, for instance a chunk of text, a chunk of audio, or an image. They can also be used to as control flow, for instance a frame that indicates that there is no more data available, or that a user started or stopped talking. They can also represent more complex data structures, such as a message array used for an LLM completion.
|
||||
|
||||
## FrameProcessors
|
||||
|
||||
Frame processors operate on frames. Every frame processor implements a `process_frame` method that consumes one frame and produces zero or more frames. Frame processors can do simple transforms, such as concatenating text fragments into sentences, or they can treat frames as input for an AI Service, and emit chat completions based on message arrays or transform text into audio or images.
|
||||
|
||||
## Pipelines
|
||||
|
||||
Pipelines are lists of frame processors that read from a source queue and send the processed frames to a sink queue. A very simple pipeline might chain an LLM frame processor to a text-to-speech frame processor, with a transport's send queue as its sync. Placing LLM message frames on the pipeline's source queue will cause the LLM's response to be spoken. See example #2 for an implementation of this.
|
||||
|
||||
## Transports
|
||||
|
||||
Transports provide a receive queue, which is input from "the outside world", and a sink queue, which is data that will be sent "to the outside world". The `LocalTransportService` does this with the local camera, mic, display and speaker. The `DailyTransportService` does this with a WebRTC session joined to a Daily.co room.
|
||||
|
||||
@@ -16,7 +16,7 @@ if __name__ == "__main__":
|
||||
|
||||
### `configure()`
|
||||
|
||||
The `configure()` function comes from `examples/foundational/support/runner.py`, and it allows you to configure the examples from the command line directly, or using environment variables:
|
||||
The `configure()` function comes from `src/examples/foundational/support/runner.py`, and it allows you to configure the examples from the command line directly, or using environment variables:
|
||||
|
||||
```bash
|
||||
python 01-say-one-thing.py -u https://YOUR_DOMAIN.daily.co/YOUR_ROOM -k YOUR_API_KEY
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# Daily AI SDK Examples
|
||||
|
||||
The docs in this folder pair with the example apps located in `examples/foundational`. They are designed to serve as a quick references for building different kinds of AI apps. But the examples also build on one another, so it can be really helpful to walk through them in order.
|
||||
The docs in this folder pair with the example apps located in `src/examples/foundational`. They are designed to serve as a quick references for building different kinds of AI apps. But the examples also build on one another, so it can be really helpful to walk through them in order.
|
||||
|
||||
To start, you can learn about the overall structure of the examples in [01 - Say One Thing](01-say-one-thing.md).
|
||||
|
||||
@@ -1,46 +0,0 @@
|
||||
# A Frame's Progress
|
||||
|
||||
1. A user says “Hello, LLM” and the cloud transcription service delivers a transcription to the Transport.
|
||||

|
||||
|
||||
2. The Transport places a Transcription frame in the Pipeline’s source queue.
|
||||

|
||||
|
||||
3. The Pipeline passes the Transcription frame to the first Frame Processor in its list, the LLM User Message Aggregator.
|
||||

|
||||
|
||||
4. The LLM User Message Aggregator updates the LLM Context with a `{“user”: “Hello LLM”}` message.
|
||||

|
||||
|
||||
5. The LLM User Message Aggregator yields an LLM Message Frame, containing the updated LLM Context. The Pipeline passes this frame to the LLM Frame Processor.
|
||||

|
||||
|
||||
6. The LLM Frame Processor creates a streaming chat completion based on the LLM context and yields the first chunk of a response, Text Frame with the value “Hi, “. The Pipeline passes this frame to the TTS Frame Processor. The TTS Frame Processor aggregates this response but doesn’t yield anything, yet, because it’s waiting for a full sentence.
|
||||

|
||||
|
||||
7. The LLM Frame Processor yields another Text Frame with the value “there.”. The Pipeline passes this frame to the TTS Frame Processor.
|
||||

|
||||
|
||||
8. The TTS Frame Processor now has a full sentence, so it starts streaming audio based on “Hi, there.” It yields the first chunk of streaming audio as an Audio frame, which the Pipeline passes to the LLM Assistant Message Aggregator.
|
||||

|
||||
|
||||
9. The LLM Assistant Message Aggregator doesn’t do anything with Audio frames, so it immediately yields the frame, unchanged. This is the convention for all Frame Processors: frames that the processor doesn’t process should be immediately yielded.
|
||||

|
||||
|
||||
10. The Pipeline places the first Audio frame in its sink queue, which is being watched by the Transport. Since the frame is now in a queue, the Pipeline can continue processing other frames. Note that the source and sink queues form a sort of “boundary of concurrent processing” between a Pipeline and the outside world. In a Pipeline, Frames are processed sequentially; once a Frame is on a queue it can be processed in parallel with the frames being processed by the Pipeline. TODO: link to a more in-depth section about this.
|
||||

|
||||
|
||||
11. The TTS Frame Processor yields another Audio frame as the Transport transmits the first Audio frame.
|
||||

|
||||
|
||||
12. As before, the LLM Assistant Message Aggregator immediately yields the Audio frame and the Pipeline places the Audio frame in the sink queue.
|
||||

|
||||
|
||||
13. The TTS Frame Processor has no more frames to yield. The LLM Frame Processor emits an LLM Response End Frame, which the Pipeline passes to the TTS Frame Processor.
|
||||

|
||||
|
||||
14. The TTS Frame Processor immediately yields the LLM Response End Frame, so the Pipeline passes it along to the LLM Assistant Message Aggregator. The LLM Assistant Message Aggregator updates the LLM Context with the full response from the LLM. TODO TODO: I realized I forgot that the TSS Frame Processor also yields the Text frames that the LLM emitted so that the LLM Assistant Message Aggregator could accumulate them, arrggh.
|
||||

|
||||
|
||||
15. The system is quiet, and waiting for the next message from the Transport.
|
||||

|
||||
|
Before Width: | Height: | Size: 98 KiB |
|
Before Width: | Height: | Size: 91 KiB |
|
Before Width: | Height: | Size: 92 KiB |
|
Before Width: | Height: | Size: 92 KiB |
|
Before Width: | Height: | Size: 98 KiB |
|
Before Width: | Height: | Size: 94 KiB |
|
Before Width: | Height: | Size: 94 KiB |
|
Before Width: | Height: | Size: 95 KiB |
|
Before Width: | Height: | Size: 94 KiB |
|
Before Width: | Height: | Size: 96 KiB |
|
Before Width: | Height: | Size: 110 KiB |
|
Before Width: | Height: | Size: 102 KiB |
|
Before Width: | Height: | Size: 111 KiB |
|
Before Width: | Height: | Size: 117 KiB |
|
Before Width: | Height: | Size: 98 KiB |
@@ -1,25 +0,0 @@
|
||||
# Anthropic
|
||||
ANTHROPIC_API_KEY=...
|
||||
|
||||
# Azure
|
||||
SPEECH_KEY=...
|
||||
SPEECH_REGION=...
|
||||
|
||||
# Daily
|
||||
DAILY_API_KEY=...
|
||||
DAILY_SAMPLE_ROOM_URL=https://...
|
||||
|
||||
# ElevenLabs
|
||||
ELEVENLABS_API_KEY=...
|
||||
ELEVENLABS_VOICE_ID=...
|
||||
|
||||
# Fal
|
||||
FAL_KEY_ID=...
|
||||
FAL_KEY_SECRET=...
|
||||
|
||||
# PlayHT
|
||||
PLAY_HT_USER_ID=...
|
||||
PLAY_HT_API_KEY=...
|
||||
|
||||
# OpenAI
|
||||
OPENAI_API_KEY=...
|
||||
@@ -1,54 +0,0 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import logging
|
||||
import os
|
||||
from dailyai.pipeline.frames import EndFrame, TextFrame
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
|
||||
from dailyai.transports.daily_transport import DailyTransport
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
|
||||
from runner import configure
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
async def main(room_url):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
None,
|
||||
"Say One Thing",
|
||||
mic_enabled=True,
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
)
|
||||
|
||||
pipeline = Pipeline([tts])
|
||||
|
||||
# Register an event handler so we can play the audio when the
|
||||
# participant joins.
|
||||
@transport.event_handler("on_participant_joined")
|
||||
async def on_participant_joined(transport, participant):
|
||||
if participant["info"]["isLocal"]:
|
||||
return
|
||||
|
||||
participant_name = participant["info"]["userName"] or ''
|
||||
await pipeline.queue_frames([TextFrame("Hello there, " + participant_name + "!"), EndFrame()])
|
||||
|
||||
await transport.run(pipeline)
|
||||
del tts
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url))
|
||||
@@ -1,59 +0,0 @@
|
||||
import asyncio
|
||||
import os
|
||||
import logging
|
||||
|
||||
import aiohttp
|
||||
|
||||
from dailyai.pipeline.frames import EndFrame, LLMMessagesFrame
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
from dailyai.transports.daily_transport import DailyTransport
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.services.open_ai_services import OpenAILLMService
|
||||
|
||||
from runner import configure
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
async def main(room_url):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
None,
|
||||
"Say One Thing From an LLM",
|
||||
mic_enabled=True,
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4-turbo-preview")
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are an LLM in a WebRTC session, and this is a 'hello world' demo. Say hello to the world.",
|
||||
}]
|
||||
|
||||
pipeline = Pipeline([llm, tts])
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
await pipeline.queue_frames([LLMMessagesFrame(messages), EndFrame()])
|
||||
|
||||
await transport.run(pipeline)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url))
|
||||
@@ -1,57 +0,0 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import logging
|
||||
import os
|
||||
|
||||
from dailyai.pipeline.frames import TextFrame
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
from dailyai.transports.daily_transport import DailyTransport
|
||||
from dailyai.services.fal_ai_services import FalImageGenService
|
||||
|
||||
from runner import configure
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
async def main(room_url):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
None,
|
||||
"Show a still frame image",
|
||||
camera_enabled=True,
|
||||
camera_width=1024,
|
||||
camera_height=1024,
|
||||
duration_minutes=1
|
||||
)
|
||||
|
||||
imagegen = FalImageGenService(
|
||||
image_size="square_hd",
|
||||
aiohttp_session=session,
|
||||
key_id=os.getenv("FAL_KEY_ID"),
|
||||
key_secret=os.getenv("FAL_KEY_SECRET"),
|
||||
)
|
||||
|
||||
pipeline = Pipeline([imagegen])
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
# Note that we do not put an EndFrame() item in the pipeline for this demo.
|
||||
# This means that the bot will stay in the channel until it times out.
|
||||
# An EndFrame() in the pipeline would cause the transport to shut
|
||||
# down.
|
||||
await pipeline.queue_frames(
|
||||
[TextFrame("a cat in the style of picasso")]
|
||||
)
|
||||
|
||||
await transport.run(pipeline)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url))
|
||||
@@ -1,85 +0,0 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
|
||||
import aiohttp
|
||||
from dailyai.pipeline.merge_pipeline import SequentialMergePipeline
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
|
||||
from dailyai.transports.daily_transport import DailyTransport
|
||||
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
|
||||
from dailyai.services.deepgram_ai_services import DeepgramTTSService
|
||||
from dailyai.pipeline.frames import EndPipeFrame, LLMMessagesFrame, TextFrame
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
|
||||
from runner import configure
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
async def main(room_url: str):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
None,
|
||||
"Static And Dynamic Speech",
|
||||
duration_minutes=1,
|
||||
mic_enabled=True,
|
||||
mic_sample_rate=16000,
|
||||
)
|
||||
|
||||
llm = AzureLLMService(
|
||||
api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
|
||||
endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
|
||||
model=os.getenv("AZURE_CHATGPT_MODEL"),
|
||||
)
|
||||
azure_tts = AzureTTSService(
|
||||
api_key=os.getenv("AZURE_SPEECH_API_KEY"),
|
||||
region=os.getenv("AZURE_SPEECH_REGION"),
|
||||
)
|
||||
|
||||
deepgram_tts = DeepgramTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("DEEPGRAM_API_KEY"),
|
||||
)
|
||||
elevenlabs_tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
)
|
||||
|
||||
messages = [{"role": "system",
|
||||
"content": "tell the user a joke about llamas"}]
|
||||
|
||||
# Start a task to run the LLM to create a joke, and convert the LLM output to audio frames. This task
|
||||
# will run in parallel with generating and speaking the audio for static text, so there's no delay to
|
||||
# speak the LLM response.
|
||||
llm_pipeline = Pipeline([llm, elevenlabs_tts])
|
||||
await llm_pipeline.queue_frames([LLMMessagesFrame(messages), EndPipeFrame()])
|
||||
|
||||
simple_tts_pipeline = Pipeline([azure_tts])
|
||||
await simple_tts_pipeline.queue_frames(
|
||||
[
|
||||
TextFrame("My friend the LLM is going to tell a joke about llamas."),
|
||||
EndPipeFrame(),
|
||||
]
|
||||
)
|
||||
|
||||
merge_pipeline = SequentialMergePipeline(
|
||||
[simple_tts_pipeline, llm_pipeline])
|
||||
|
||||
await asyncio.gather(
|
||||
transport.run(merge_pipeline),
|
||||
simple_tts_pipeline.run_pipeline(),
|
||||
llm_pipeline.run_pipeline(),
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url))
|
||||
@@ -1,146 +0,0 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import logging
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from dailyai.pipeline.aggregators import (
|
||||
GatedAggregator,
|
||||
LLMFullResponseAggregator,
|
||||
ParallelPipeline,
|
||||
SentenceAggregator,
|
||||
)
|
||||
from dailyai.pipeline.frames import (
|
||||
Frame,
|
||||
TextFrame,
|
||||
EndFrame,
|
||||
ImageFrame,
|
||||
LLMMessagesFrame,
|
||||
LLMResponseStartFrame,
|
||||
)
|
||||
from dailyai.pipeline.frame_processor import FrameProcessor
|
||||
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
from dailyai.transports.daily_transport import DailyTransport
|
||||
from dailyai.services.open_ai_services import OpenAILLMService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.services.fal_ai_services import FalImageGenService
|
||||
|
||||
from runner import configure
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
@dataclass
|
||||
class MonthFrame(Frame):
|
||||
month: str
|
||||
|
||||
|
||||
class MonthPrepender(FrameProcessor):
|
||||
def __init__(self):
|
||||
self.most_recent_month = "Placeholder, month frame not yet received"
|
||||
self.prepend_to_next_text_frame = False
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, MonthFrame):
|
||||
self.most_recent_month = frame.month
|
||||
elif self.prepend_to_next_text_frame and isinstance(frame, TextFrame):
|
||||
yield TextFrame(f"{self.most_recent_month}: {frame.text}")
|
||||
self.prepend_to_next_text_frame = False
|
||||
elif isinstance(frame, LLMResponseStartFrame):
|
||||
self.prepend_to_next_text_frame = True
|
||||
yield frame
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
async def main(room_url):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
None,
|
||||
"Month Narration Bot",
|
||||
mic_enabled=True,
|
||||
camera_enabled=True,
|
||||
mic_sample_rate=16000,
|
||||
camera_width=1024,
|
||||
camera_height=1024,
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4-turbo-preview")
|
||||
|
||||
imagegen = FalImageGenService(
|
||||
image_size="square_hd",
|
||||
aiohttp_session=session,
|
||||
key_id=os.getenv("FAL_KEY_ID"),
|
||||
key_secret=os.getenv("FAL_KEY_SECRET"),
|
||||
)
|
||||
|
||||
gated_aggregator = GatedAggregator(
|
||||
gate_open_fn=lambda frame: isinstance(
|
||||
frame, ImageFrame), gate_close_fn=lambda frame: isinstance(
|
||||
frame, LLMResponseStartFrame), start_open=False, )
|
||||
|
||||
sentence_aggregator = SentenceAggregator()
|
||||
month_prepender = MonthPrepender()
|
||||
llm_full_response_aggregator = LLMFullResponseAggregator()
|
||||
|
||||
pipeline = Pipeline(
|
||||
processors=[
|
||||
llm,
|
||||
sentence_aggregator,
|
||||
ParallelPipeline(
|
||||
[[month_prepender, tts], [llm_full_response_aggregator, imagegen]]
|
||||
),
|
||||
gated_aggregator,
|
||||
],
|
||||
)
|
||||
|
||||
frames = []
|
||||
for month in [
|
||||
"January",
|
||||
"February",
|
||||
"March",
|
||||
"April",
|
||||
"May",
|
||||
"June",
|
||||
"July",
|
||||
"August",
|
||||
"September",
|
||||
"October",
|
||||
"November",
|
||||
"December",
|
||||
]:
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": f"Describe a nature photograph suitable for use in a calendar, for the month of {month}. Include only the image description with no preamble. Limit the description to one sentence, please.",
|
||||
}
|
||||
]
|
||||
frames.append(MonthFrame(month))
|
||||
frames.append(LLMMessagesFrame(messages))
|
||||
|
||||
frames.append(EndFrame())
|
||||
await pipeline.queue_frames(frames)
|
||||
|
||||
await transport.run(pipeline, override_pipeline_source_queue=False)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url))
|
||||
@@ -1,88 +0,0 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import logging
|
||||
import os
|
||||
from dailyai.pipeline.frames import LLMMessagesFrame
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
|
||||
from dailyai.transports.daily_transport import DailyTransport
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.services.open_ai_services import OpenAILLMService
|
||||
from dailyai.services.ai_services import FrameLogger
|
||||
from dailyai.pipeline.aggregators import (
|
||||
LLMAssistantContextAggregator,
|
||||
LLMUserContextAggregator,
|
||||
)
|
||||
from runner import configure
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Respond bot",
|
||||
duration_minutes=5,
|
||||
start_transcription=True,
|
||||
mic_enabled=True,
|
||||
mic_sample_rate=16000,
|
||||
camera_enabled=False,
|
||||
vad_enabled=True,
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4-turbo-preview")
|
||||
fl = FrameLogger("Inner")
|
||||
fl2 = FrameLogger("Outer")
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
tma_in = LLMUserContextAggregator(
|
||||
messages, transport._my_participant_id)
|
||||
tma_out = LLMAssistantContextAggregator(
|
||||
messages, transport._my_participant_id
|
||||
)
|
||||
pipeline = Pipeline(
|
||||
processors=[
|
||||
fl,
|
||||
tma_in,
|
||||
llm,
|
||||
fl2,
|
||||
tts,
|
||||
tma_out,
|
||||
],
|
||||
)
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await pipeline.queue_frames([LLMMessagesFrame(messages)])
|
||||
|
||||
transport.transcription_settings["extra"]["endpointing"] = True
|
||||
transport.transcription_settings["extra"]["punctuate"] = True
|
||||
await transport.run(pipeline)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
@@ -1,76 +0,0 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import logging
|
||||
import os
|
||||
from dailyai.pipeline.aggregators import (
|
||||
LLMResponseAggregator,
|
||||
UserResponseAggregator,
|
||||
)
|
||||
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
from dailyai.services.ai_services import FrameLogger
|
||||
from dailyai.transports.daily_transport import DailyTransport
|
||||
from dailyai.services.open_ai_services import OpenAILLMService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
|
||||
from runner import configure
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Respond bot",
|
||||
duration_minutes=5,
|
||||
start_transcription=True,
|
||||
mic_enabled=True,
|
||||
mic_sample_rate=16000,
|
||||
camera_enabled=False,
|
||||
vad_enabled=True,
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4-turbo-preview")
|
||||
|
||||
pipeline = Pipeline([FrameLogger(), llm, FrameLogger(), tts])
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
await transport.say("Hi, I'm listening!", tts)
|
||||
|
||||
async def run_conversation():
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
await transport.run_interruptible_pipeline(
|
||||
pipeline,
|
||||
post_processor=LLMResponseAggregator(messages),
|
||||
pre_processor=UserResponseAggregator(messages),
|
||||
)
|
||||
|
||||
transport.transcription_settings["extra"]["punctuate"] = False
|
||||
await asyncio.gather(transport.run(), run_conversation())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
@@ -1,147 +0,0 @@
|
||||
from typing import Tuple
|
||||
import aiohttp
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
from dailyai.pipeline.aggregators import SentenceAggregator
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
|
||||
from dailyai.transports.daily_transport import DailyTransport
|
||||
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.services.fal_ai_services import FalImageGenService
|
||||
from dailyai.pipeline.frames import AudioFrame, EndFrame, ImageFrame, LLMMessagesFrame, TextFrame
|
||||
|
||||
from runner import configure
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
async def main(room_url: str):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
None,
|
||||
"Respond bot",
|
||||
duration_minutes=10,
|
||||
mic_enabled=True,
|
||||
mic_sample_rate=16000,
|
||||
camera_enabled=True,
|
||||
camera_width=1024,
|
||||
camera_height=1024,
|
||||
)
|
||||
|
||||
llm = AzureLLMService(
|
||||
api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
|
||||
endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
|
||||
model=os.getenv("AZURE_CHATGPT_MODEL"),
|
||||
)
|
||||
tts1 = AzureTTSService(
|
||||
api_key=os.getenv("AZURE_SPEECH_API_KEY"),
|
||||
region=os.getenv("AZURE_SPEECH_REGION"),
|
||||
)
|
||||
tts2 = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id="jBpfuIE2acCO8z3wKNLl",
|
||||
)
|
||||
dalle = FalImageGenService(
|
||||
image_size="1024x1024",
|
||||
aiohttp_session=session,
|
||||
key_id=os.getenv("FAL_KEY_ID"),
|
||||
key_secret=os.getenv("FAL_KEY_SECRET"),
|
||||
)
|
||||
|
||||
bot1_messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a stern librarian. You strongly believe that a hot dog is a sandwich. Start by stating this fact in a few sentences, then be prepared to debate this with the user. You shouldn't ever compromise on the fundamental truth that a hot dog is a sandwich. Your responses should only be a few sentences long.",
|
||||
},
|
||||
]
|
||||
bot2_messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a silly cat, and you strongly believe that a hot dog is not a sandwich. Debate this with the user, only responding with a few sentences. Don't ever accept that a hot dog is a sandwich.",
|
||||
},
|
||||
]
|
||||
|
||||
async def get_text_and_audio(messages) -> Tuple[str, bytearray]:
|
||||
"""This function streams text from the LLM and uses the TTS service to convert
|
||||
that text to speech as it's received. """
|
||||
source_queue = asyncio.Queue()
|
||||
sink_queue = asyncio.Queue()
|
||||
sentence_aggregator = SentenceAggregator()
|
||||
pipeline = Pipeline(
|
||||
[llm, sentence_aggregator, tts1], source_queue, sink_queue
|
||||
)
|
||||
|
||||
await source_queue.put(LLMMessagesFrame(messages))
|
||||
await source_queue.put(EndFrame())
|
||||
await pipeline.run_pipeline()
|
||||
|
||||
message = ""
|
||||
all_audio = bytearray()
|
||||
while sink_queue.qsize():
|
||||
frame = sink_queue.get_nowait()
|
||||
if isinstance(frame, TextFrame):
|
||||
message += frame.text
|
||||
elif isinstance(frame, AudioFrame):
|
||||
all_audio.extend(frame.data)
|
||||
|
||||
return (message, all_audio)
|
||||
|
||||
async def get_bot1_statement():
|
||||
message, audio = await get_text_and_audio(bot1_messages)
|
||||
|
||||
bot1_messages.append({"role": "assistant", "content": message})
|
||||
bot2_messages.append({"role": "user", "content": message})
|
||||
|
||||
return audio
|
||||
|
||||
async def get_bot2_statement():
|
||||
message, audio = await get_text_and_audio(bot2_messages)
|
||||
|
||||
bot2_messages.append({"role": "assistant", "content": message})
|
||||
bot1_messages.append({"role": "user", "content": message})
|
||||
|
||||
return audio
|
||||
|
||||
async def argue():
|
||||
for i in range(100):
|
||||
print(f"In iteration {i}")
|
||||
|
||||
bot1_description = "A woman conservatively dressed as a librarian in a library surrounded by books, cartoon, serious, highly detailed"
|
||||
|
||||
(audio1, image_data1) = await asyncio.gather(
|
||||
get_bot1_statement(), dalle.run_image_gen(bot1_description)
|
||||
)
|
||||
await transport.send_queue.put(
|
||||
[
|
||||
ImageFrame(None, image_data1[1]),
|
||||
AudioFrame(audio1),
|
||||
]
|
||||
)
|
||||
|
||||
bot2_description = "A cat dressed in a hot dog costume, cartoon, bright colors, funny, highly detailed"
|
||||
|
||||
(audio2, image_data2) = await asyncio.gather(
|
||||
get_bot2_statement(), dalle.run_image_gen(bot2_description)
|
||||
)
|
||||
await transport.send_queue.put(
|
||||
[
|
||||
ImageFrame(None, image_data2[1]),
|
||||
AudioFrame(audio2),
|
||||
]
|
||||
)
|
||||
|
||||
await asyncio.gather(transport.run(), argue())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url))
|
||||
@@ -1,53 +0,0 @@
|
||||
import asyncio
|
||||
import logging
|
||||
|
||||
from dailyai.pipeline.frames import EndFrame, TranscriptionFrame
|
||||
from dailyai.transports.local_transport import LocalTransport
|
||||
from dailyai.services.whisper_ai_services import WhisperSTTService
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
async def main():
|
||||
meeting_duration_minutes = 1
|
||||
|
||||
transport = LocalTransport(
|
||||
mic_enabled=False,
|
||||
camera_enabled=False,
|
||||
speaker_enabled=True,
|
||||
duration_minutes=meeting_duration_minutes,
|
||||
start_transcription=False,
|
||||
)
|
||||
|
||||
stt = WhisperSTTService()
|
||||
|
||||
transcription_output_queue = asyncio.Queue()
|
||||
transport_done = asyncio.Event()
|
||||
|
||||
pipeline = Pipeline([stt])
|
||||
pipeline.set_sink(transcription_output_queue)
|
||||
|
||||
async def handle_transcription():
|
||||
print("`````````TRANSCRIPTION`````````")
|
||||
while not transport_done.is_set():
|
||||
item = await transcription_output_queue.get()
|
||||
print("got item from queue", item)
|
||||
if isinstance(item, TranscriptionFrame):
|
||||
print(item.text)
|
||||
elif isinstance(item, EndFrame):
|
||||
break
|
||||
print("handle_transcription done")
|
||||
|
||||
async def run_until_done():
|
||||
await transport.run(pipeline)
|
||||
transport_done.set()
|
||||
print("run_until_done done")
|
||||
|
||||
await asyncio.gather(run_until_done(), handle_transcription())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -1,25 +0,0 @@
|
||||
syntax = "proto3";
|
||||
|
||||
package dailyai_proto;
|
||||
|
||||
message TextFrame {
|
||||
string text = 1;
|
||||
}
|
||||
|
||||
message AudioFrame {
|
||||
bytes audio = 1;
|
||||
}
|
||||
|
||||
message TranscriptionFrame {
|
||||
string text = 1;
|
||||
string participant_id = 2;
|
||||
string timestamp = 3;
|
||||
}
|
||||
|
||||
message Frame {
|
||||
oneof frame {
|
||||
TextFrame text = 1;
|
||||
AudioFrame audio = 2;
|
||||
TranscriptionFrame transcription = 3;
|
||||
}
|
||||
}
|
||||
@@ -1,134 +0,0 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<script src="//cdn.jsdelivr.net/npm/protobufjs@7.X.X/dist/protobuf.min.js"></script>
|
||||
<title>WebSocket Audio Stream</title>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<h1>WebSocket Audio Stream</h1>
|
||||
<button id="startAudioBtn">Start Audio</button>
|
||||
<button id="stopAudioBtn">Stop Audio</button>
|
||||
<script>
|
||||
const SAMPLE_RATE = 16000;
|
||||
const BUFFER_SIZE = 8192;
|
||||
const MIN_AUDIO_SIZE = 6400;
|
||||
|
||||
let audioContext;
|
||||
let microphoneStream;
|
||||
let scriptProcessor;
|
||||
let source;
|
||||
let frame;
|
||||
let audioChunks = [];
|
||||
let isPlaying = false;
|
||||
let ws;
|
||||
|
||||
const proto = protobuf.load("frames.proto", (err, root) => {
|
||||
if (err) throw err;
|
||||
frame = root.lookupType("dailyai_proto.Frame");
|
||||
});
|
||||
|
||||
function initWebSocket() {
|
||||
ws = new WebSocket('ws://localhost:8765');
|
||||
|
||||
ws.addEventListener('open', () => console.log('WebSocket connection established.'));
|
||||
ws.addEventListener('message', handleWebSocketMessage);
|
||||
ws.addEventListener('close', (event) => console.log("WebSocket connection closed.", event.code, event.reason));
|
||||
ws.addEventListener('error', (event) => console.error('WebSocket error:', event));
|
||||
}
|
||||
|
||||
async function handleWebSocketMessage(event) {
|
||||
const arrayBuffer = await event.data.arrayBuffer();
|
||||
enqueueAudioFromProto(arrayBuffer);
|
||||
}
|
||||
|
||||
function enqueueAudioFromProto(arrayBuffer) {
|
||||
const parsedFrame = frame.decode(new Uint8Array(arrayBuffer));
|
||||
if (!parsedFrame?.audio) return false;
|
||||
|
||||
const frameCount = parsedFrame.audio.data.length / 2;
|
||||
const audioOutBuffer = audioContext.createBuffer(1, frameCount, SAMPLE_RATE);
|
||||
const nowBuffering = audioOutBuffer.getChannelData(0);
|
||||
const view = new Int16Array(parsedFrame.audio.data.buffer);
|
||||
|
||||
for (let i = 0; i < frameCount; i++) {
|
||||
const word = view[i];
|
||||
nowBuffering[i] = ((word + 32768) % 65536 - 32768) / 32768.0;
|
||||
}
|
||||
|
||||
audioChunks.push(audioOutBuffer);
|
||||
if (!isPlaying) playNextChunk();
|
||||
}
|
||||
|
||||
function playNextChunk() {
|
||||
if (audioChunks.length === 0) {
|
||||
isPlaying = false;
|
||||
return;
|
||||
}
|
||||
|
||||
isPlaying = true;
|
||||
const audioOutBuffer = audioChunks.shift();
|
||||
const source = audioContext.createBufferSource();
|
||||
source.buffer = audioOutBuffer;
|
||||
source.connect(audioContext.destination);
|
||||
source.onended = playNextChunk;
|
||||
source.start();
|
||||
}
|
||||
|
||||
function startAudio() {
|
||||
if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
|
||||
alert('getUserMedia is not supported in your browser.');
|
||||
return;
|
||||
}
|
||||
|
||||
navigator.mediaDevices.getUserMedia({ audio: true })
|
||||
.then((stream) => {
|
||||
microphoneStream = stream;
|
||||
audioContext = new (window.AudioContext || window.webkitAudioContext)();
|
||||
scriptProcessor = audioContext.createScriptProcessor(BUFFER_SIZE, 1, 1);
|
||||
source = audioContext.createMediaStreamSource(stream);
|
||||
source.connect(scriptProcessor);
|
||||
scriptProcessor.connect(audioContext.destination);
|
||||
|
||||
const audioBuffer = [];
|
||||
const skipRatio = Math.floor(audioContext.sampleRate / (SAMPLE_RATE * 2));
|
||||
|
||||
scriptProcessor.onaudioprocess = (event) => {
|
||||
const rawLeftChannelData = event.inputBuffer.getChannelData(0);
|
||||
for (let i = 0; i < rawLeftChannelData.length; i += skipRatio) {
|
||||
const normalized = ((rawLeftChannelData[i] * 32768.0) + 32768) % 65536 - 32768;
|
||||
const swappedBytes = ((normalized & 0xff) << 8) | ((normalized >> 8) & 0xff);
|
||||
audioBuffer.push(swappedBytes);
|
||||
}
|
||||
|
||||
if (audioBuffer.length >= MIN_AUDIO_SIZE) {
|
||||
const audioFrame = frame.create({ audio: { audio: audioBuffer.slice(0, MIN_AUDIO_SIZE) } });
|
||||
const encodedFrame = new Uint8Array(frame.encode(audioFrame).finish());
|
||||
ws.send(encodedFrame);
|
||||
audioBuffer.splice(0, MIN_AUDIO_SIZE);
|
||||
}
|
||||
};
|
||||
|
||||
initWebSocket();
|
||||
})
|
||||
.catch((error) => console.error('Error accessing microphone:', error));
|
||||
}
|
||||
|
||||
function stopAudio() {
|
||||
if (ws) {
|
||||
ws.close();
|
||||
scriptProcessor.disconnect();
|
||||
source.disconnect();
|
||||
ws = undefined;
|
||||
}
|
||||
}
|
||||
|
||||
document.getElementById('startAudioBtn').addEventListener('click', startAudio);
|
||||
document.getElementById('stopAudioBtn').addEventListener('click', stopAudio);
|
||||
</script>
|
||||
</body>
|
||||
|
||||
</html>
|
||||
@@ -1,50 +0,0 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import logging
|
||||
import os
|
||||
from dailyai.pipeline.frame_processor import FrameProcessor
|
||||
from dailyai.pipeline.frames import TextFrame, TranscriptionFrame
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.transports.websocket_transport import WebsocketTransport
|
||||
from dailyai.services.whisper_ai_services import WhisperSTTService
|
||||
|
||||
logging.basicConfig(format="%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
class WhisperTranscriber(FrameProcessor):
|
||||
async def process_frame(self, frame):
|
||||
if isinstance(frame, TranscriptionFrame):
|
||||
print(f"Transcribed: {frame.text}")
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
async def main():
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = WebsocketTransport(
|
||||
mic_enabled=True,
|
||||
speaker_enabled=True,
|
||||
)
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
)
|
||||
|
||||
pipeline = Pipeline([
|
||||
WhisperSTTService(),
|
||||
WhisperTranscriber(),
|
||||
tts,
|
||||
])
|
||||
|
||||
@transport.on_connection
|
||||
async def queue_frame():
|
||||
await pipeline.queue_frames([TextFrame("Hello there!")])
|
||||
|
||||
await transport.run(pipeline)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -1,34 +0,0 @@
|
||||
# Server Example
|
||||
|
||||
Use this server app to quickly host a bot on the web:
|
||||
|
||||
```
|
||||
flask --app daily-bot-manager.py --debug run
|
||||
```
|
||||
|
||||
It's currently configured to serve example apps defined in the APPS constant in the server file:
|
||||
|
||||
```
|
||||
chatbot
|
||||
patient-intake
|
||||
storybot
|
||||
translator
|
||||
```
|
||||
|
||||
Once the server is started, you can create a bot instance by opening `http://127.0.0.1:5000/start/chatbot` in a browser, and the server will do the following:
|
||||
|
||||
- Create a new, randomly-named Daily room with `DAILY_API_KEY` from your .env file or environment
|
||||
- Start an instance of `chatbot.py` and connect it to that room
|
||||
- 301 redirect your browser to the room
|
||||
|
||||
### Options
|
||||
|
||||
The server supports several options, which can be set in the body of a POST request, or as params in the URL of a GET request.
|
||||
|
||||
- `room_url` (default: none): A room URL to join. If empty, the server will create a Daily room and return the URL in the response.
|
||||
room_properties (none): A JSON object (URL encoded if included as a GET parameter) for overriding default room creation properties, as described here: https://docs.daily.co/reference/rest-api/rooms/create-room This will be ignored if a room_url is provided.
|
||||
- `token_properties` (none): A JSON object (URL encoded if included as a GET parameter) for overriding default token properties. By default, the server creates an owner token with an expiration time of one hour.
|
||||
- `duration` (7200 seconds, or two hours): Use this property to set a time limit for the bot, as well as an expiration time for the room (if the server is creating one). This will not add an expiration time to an existing room. Expiration times in `token_properties` or `room_properties` will also take precedence over this value. You can set this property to `0` to disable timeouts, but this isn't recommended.
|
||||
- `bot_args` (none): A string containing any additional command-line args to pass to the bot.
|
||||
- `wait_for_bot` (true): Whether to wait for the bot to successfully join the room before returning a response from the server. If true, the server will start the bot script, then poll the room for up to 5 seconds to confirm the bot has joined the room. If it doesn't, the server will stop the bot and return a 500 response. If set to `false`, the server will start the bot, but immediately return a 200 response. This can be useful if the server is creating rooms for you, and you need the room URL to join the user to the room.
|
||||
- `redirect` (true): Instead of returning a 200 for GET requests, the server will return a 301 redirect to the ROOM_URL. This is handy for testing by creating a bot with a GET request directly in the browser. POST requests will never return redirects. Set to `false` to get 200 responses with info in a JSON object even for GET requests.
|
||||
@@ -1,165 +0,0 @@
|
||||
import os
|
||||
import requests
|
||||
import urllib
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
from flask import Flask, jsonify, redirect, request
|
||||
from flask_cors import CORS
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
app = Flask(__name__)
|
||||
CORS(app)
|
||||
|
||||
APPS = {
|
||||
"chatbot": "../starter-apps/chatbot.py",
|
||||
"patient-intake": "../starter-apps/patient-intake.py",
|
||||
"storybot": "../starter-apps/storybot.py",
|
||||
"translator": "../starter-apps/translator.py"
|
||||
}
|
||||
|
||||
daily_api_key = os.getenv("DAILY_API_KEY")
|
||||
api_path = os.getenv("DAILY_API_PATH") or "https://api.daily.co/v1"
|
||||
|
||||
|
||||
def get_room_name(room_url):
|
||||
return urllib.parse.urlparse(room_url).path[1:]
|
||||
|
||||
|
||||
def create_room(room_properties, exp):
|
||||
room_props = {
|
||||
"exp": exp,
|
||||
"enable_chat": True,
|
||||
"enable_emoji_reactions": True,
|
||||
"eject_at_room_exp": True,
|
||||
"enable_prejoin_ui": False,
|
||||
"enable_recording": "cloud"
|
||||
}
|
||||
if room_properties:
|
||||
room_props |= room_properties
|
||||
|
||||
res = requests.post(
|
||||
f"{api_path}/rooms",
|
||||
headers={"Authorization": f"Bearer {daily_api_key}"},
|
||||
json={
|
||||
"properties": room_props
|
||||
},
|
||||
)
|
||||
if res.status_code != 200:
|
||||
raise Exception(f"Unable to create room: {res.text}")
|
||||
|
||||
room_url = res.json()["url"]
|
||||
room_name = res.json()["name"]
|
||||
return (room_url, room_name)
|
||||
|
||||
|
||||
def create_token(room_name, token_properties, exp):
|
||||
token_props = {"exp": exp, "is_owner": True}
|
||||
if token_properties:
|
||||
token_props |= token_properties
|
||||
# Force the token to be limited to the room
|
||||
token_props |= {"room_name": room_name}
|
||||
res = requests.post(
|
||||
f'{api_path}/meeting-tokens',
|
||||
headers={
|
||||
'Authorization': f'Bearer {daily_api_key}'},
|
||||
json={
|
||||
'properties': token_props})
|
||||
if res.status_code != 200:
|
||||
if res.status_code != 200:
|
||||
raise Exception(f"Unable to create meeting token: {res.text}")
|
||||
|
||||
meeting_token = res.json()['token']
|
||||
return meeting_token
|
||||
|
||||
|
||||
def start_bot(*, bot_path, room_url, token, bot_args, wait_for_bot):
|
||||
|
||||
room_name = get_room_name(room_url)
|
||||
proc = subprocess.Popen(
|
||||
[f"python {bot_path} -u {room_url} -t {token} -k {daily_api_key} {bot_args}"],
|
||||
shell=True,
|
||||
bufsize=1,
|
||||
)
|
||||
|
||||
if wait_for_bot:
|
||||
# Don't return until the bot has joined the room, but wait for at most 5
|
||||
# seconds.
|
||||
attempts = 0
|
||||
while attempts < 50:
|
||||
time.sleep(0.1)
|
||||
attempts += 1
|
||||
res = requests.get(
|
||||
f"{api_path}/rooms/{room_name}/get-session-data",
|
||||
headers={"Authorization": f"Bearer {daily_api_key}"},
|
||||
)
|
||||
if res.status_code == 200:
|
||||
print(f"Took {attempts} attempts to join room {room_name}")
|
||||
return True
|
||||
|
||||
# If we don't break from the loop, that means we never found the bot in the room
|
||||
raise Exception("The bot was unable to join the room. Please try again.")
|
||||
|
||||
return True
|
||||
|
||||
|
||||
@app.route("/start/<string:botname>", methods=["GET", "POST"])
|
||||
def start(botname):
|
||||
try:
|
||||
if botname not in APPS:
|
||||
raise Exception(f"Bot '{botname}' is not in the allowlist.")
|
||||
|
||||
bot_path = APPS[botname]
|
||||
props = {
|
||||
"room_url": None,
|
||||
"room_properties": None,
|
||||
"token_properties": None,
|
||||
"bot_args": None,
|
||||
"wait_for_bot": True,
|
||||
"duration": None,
|
||||
"redirect": True
|
||||
}
|
||||
props |= request.values.to_dict() # gets URL params as well as plaintext POST body
|
||||
try:
|
||||
props |= request.json
|
||||
except BaseException:
|
||||
pass
|
||||
if props['redirect'] == "false":
|
||||
props['redirect'] = False
|
||||
if props['wait_for_bot'] == "false":
|
||||
props['wait_for_bot'] = False
|
||||
|
||||
duration = int(os.getenv("DAILY_BOT_DURATION") or 7200)
|
||||
if props['duration']:
|
||||
duration = props['duration']
|
||||
exp = time.time() + duration
|
||||
if (props['room_url']):
|
||||
room_url = props['room_url']
|
||||
try:
|
||||
room_name = get_room_name(room_url)
|
||||
except ValueError:
|
||||
raise Exception(
|
||||
"There was a problem detecting the room name. Please double-check the value of room_url.")
|
||||
else:
|
||||
room_url, room_name = create_room(props['room_properties'], exp)
|
||||
token = create_token(room_name, props['token_properties'], exp)
|
||||
bot = start_bot(
|
||||
room_url=room_url,
|
||||
bot_path=bot_path,
|
||||
token=token,
|
||||
bot_args=props['bot_args'],
|
||||
wait_for_bot=props['wait_for_bot'])
|
||||
|
||||
if props['redirect'] and request.method == "GET":
|
||||
return redirect(room_url, 302)
|
||||
else:
|
||||
return jsonify({"room_url": room_url, "token": token})
|
||||
except BaseException as e:
|
||||
return f"There was a problem starting the bot: {e}", 500
|
||||
|
||||
|
||||
@app.route("/healthz")
|
||||
def health_check():
|
||||
return "ok", 200
|
||||
|
Before Width: | Height: | Size: 1.1 MiB |
|
Before Width: | Height: | Size: 1.1 MiB |
|
Before Width: | Height: | Size: 759 KiB |
|
Before Width: | Height: | Size: 884 KiB |
|
Before Width: | Height: | Size: 876 KiB |
|
Before Width: | Height: | Size: 881 KiB |
|
Before Width: | Height: | Size: 866 KiB |
|
Before Width: | Height: | Size: 874 KiB |
|
Before Width: | Height: | Size: 882 KiB |
|
Before Width: | Height: | Size: 885 KiB |
|
Before Width: | Height: | Size: 888 KiB |
|
Before Width: | Height: | Size: 890 KiB |
|
Before Width: | Height: | Size: 898 KiB |
|
Before Width: | Height: | Size: 836 KiB |
|
Before Width: | Height: | Size: 903 KiB |
|
Before Width: | Height: | Size: 908 KiB |
|
Before Width: | Height: | Size: 908 KiB |
|
Before Width: | Height: | Size: 905 KiB |
|
Before Width: | Height: | Size: 903 KiB |
|
Before Width: | Height: | Size: 866 KiB |
|
Before Width: | Height: | Size: 849 KiB |
|
Before Width: | Height: | Size: 866 KiB |
|
Before Width: | Height: | Size: 866 KiB |
|
Before Width: | Height: | Size: 864 KiB |
|
Before Width: | Height: | Size: 858 KiB |
|
Before Width: | Height: | Size: 875 KiB |
|
Before Width: | Height: | Size: 881 KiB |
@@ -1,149 +0,0 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import logging
|
||||
import os
|
||||
from PIL import Image
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from dailyai.pipeline.aggregators import (
|
||||
LLMResponseAggregator,
|
||||
UserResponseAggregator,
|
||||
)
|
||||
from dailyai.pipeline.frames import (
|
||||
ImageFrame,
|
||||
SpriteFrame,
|
||||
Frame,
|
||||
LLMResponseEndFrame,
|
||||
LLMMessagesFrame,
|
||||
AudioFrame,
|
||||
PipelineStartedFrame,
|
||||
)
|
||||
from dailyai.services.ai_services import AIService
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
from dailyai.transports.daily_transport import DailyTransport
|
||||
from dailyai.services.open_ai_services import OpenAILLMService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
|
||||
from runner import configure
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
sprites = []
|
||||
|
||||
script_dir = os.path.dirname(__file__)
|
||||
|
||||
for i in range(1, 26):
|
||||
# Build the full path to the image file
|
||||
full_path = os.path.join(script_dir, f"assets/robot0{i}.png")
|
||||
# Get the filename without the extension to use as the dictionary key
|
||||
# Open the image and convert it to bytes
|
||||
with Image.open(full_path) as img:
|
||||
sprites.append(img.tobytes())
|
||||
|
||||
flipped = sprites[::-1]
|
||||
sprites.extend(flipped)
|
||||
# When the bot isn't talking, show a static image of the cat listening
|
||||
quiet_frame = ImageFrame("", sprites[0])
|
||||
talking_frame = SpriteFrame(images=sprites)
|
||||
|
||||
|
||||
class TalkingAnimation(AIService):
|
||||
"""
|
||||
This class starts a talking animation when it receives an first AudioFrame,
|
||||
and then returns to a "quiet" sprite when it sees a LLMResponseEndFrame.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self._is_talking = False
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, AudioFrame):
|
||||
if not self._is_talking:
|
||||
yield talking_frame
|
||||
yield frame
|
||||
self._is_talking = True
|
||||
else:
|
||||
yield frame
|
||||
elif isinstance(frame, LLMResponseEndFrame):
|
||||
yield quiet_frame
|
||||
yield frame
|
||||
self._is_talking = False
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
class AnimationInitializer(AIService):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, PipelineStartedFrame):
|
||||
yield quiet_frame
|
||||
yield frame
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Chatbot",
|
||||
duration_minutes=5,
|
||||
start_transcription=True,
|
||||
mic_enabled=True,
|
||||
mic_sample_rate=16000,
|
||||
camera_enabled=True,
|
||||
camera_width=1024,
|
||||
camera_height=576,
|
||||
vad_enabled=True,
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id="pNInz6obpgDQGcFmaJgB",
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4-turbo-preview")
|
||||
|
||||
ta = TalkingAnimation()
|
||||
ai = AnimationInitializer()
|
||||
pipeline = Pipeline([ai, llm, tts, ta])
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are Chatbot, a friendly, helpful robot. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way, but keep your responses brief. Start by introducing yourself.",
|
||||
},
|
||||
]
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
print(f"!!! in here, pipeline.source is {pipeline.source}")
|
||||
await pipeline.queue_frames([LLMMessagesFrame(messages)])
|
||||
|
||||
async def run_conversation():
|
||||
|
||||
await transport.run_interruptible_pipeline(
|
||||
pipeline,
|
||||
post_processor=LLMResponseAggregator(messages),
|
||||
pre_processor=UserResponseAggregator(messages),
|
||||
)
|
||||
|
||||
transport.transcription_settings["extra"]["endpointing"] = True
|
||||
transport.transcription_settings["extra"]["punctuate"] = True
|
||||
await asyncio.gather(transport.run(), run_conversation())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
@@ -1,354 +0,0 @@
|
||||
import copy
|
||||
import aiohttp
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import wave
|
||||
from typing import AsyncGenerator, List
|
||||
from dailyai.pipeline.opeanai_llm_aggregator import (
|
||||
OpenAIAssistantContextAggregator,
|
||||
OpenAIUserContextAggregator,
|
||||
)
|
||||
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
from dailyai.transports.daily_transport import DailyTransport
|
||||
from dailyai.services.openai_llm_context import OpenAILLMContext
|
||||
from dailyai.services.open_ai_services import OpenAILLMService
|
||||
# from dailyai.services.deepgram_ai_services import DeepgramTTSService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.pipeline.frames import (
|
||||
Frame,
|
||||
LLMFunctionCallFrame,
|
||||
LLMFunctionStartFrame,
|
||||
AudioFrame,
|
||||
)
|
||||
from dailyai.pipeline.openai_frames import OpenAILLMContextFrame
|
||||
from dailyai.services.ai_services import FrameLogger, AIService
|
||||
from openai._types import NotGiven, NOT_GIVEN
|
||||
|
||||
from openai.types.chat import (
|
||||
ChatCompletionToolParam,
|
||||
)
|
||||
|
||||
from runner import configure
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logging.basicConfig(format="%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
sounds = {}
|
||||
sound_files = [
|
||||
"clack-short.wav",
|
||||
"clack.wav",
|
||||
"clack-short-quiet.wav",
|
||||
"ding.wav",
|
||||
"ding2.wav",
|
||||
]
|
||||
|
||||
script_dir = os.path.dirname(__file__)
|
||||
|
||||
for file in sound_files:
|
||||
# Build the full path to the sound file
|
||||
full_path = os.path.join(script_dir, "assets", file)
|
||||
# Get the filename without the extension to use as the dictionary key
|
||||
filename = os.path.splitext(os.path.basename(full_path))[0]
|
||||
# Open the sound and convert it to bytes
|
||||
with wave.open(full_path) as audio_file:
|
||||
sounds[file] = audio_file.readframes(-1)
|
||||
|
||||
|
||||
steps = [{"prompt": "Start by introducing yourself. Then, ask the user to confirm their identity by telling you their birthday, including the year. When they answer with their birthday, call the verify_birthday function.",
|
||||
"run_async": False,
|
||||
"failed": "The user provided an incorrect birthday. Ask them for their birthday again. When they answer, call the verify_birthday function.",
|
||||
"tools": [{"type": "function",
|
||||
"function": {"name": "verify_birthday",
|
||||
"description": "Use this function to verify the user has provided their correct birthday.",
|
||||
"parameters": {"type": "object",
|
||||
"properties": {"birthday": {"type": "string",
|
||||
"description": "The user's birthdate, including the year. The user can provide it in any format, but convert it to YYYY-MM-DD format to call this function.",
|
||||
}},
|
||||
},
|
||||
},
|
||||
}],
|
||||
},
|
||||
{"prompt": "Next, thank the user for confirming their identity, then ask the user to list their current prescriptions. Each prescription needs to have a medication name and a dosage. Do not call the list_prescriptions function with any unknown dosages.",
|
||||
"run_async": True,
|
||||
"tools": [{"type": "function",
|
||||
"function": {"name": "list_prescriptions",
|
||||
"description": "Once the user has provided a list of their prescription medications, call this function.",
|
||||
"parameters": {"type": "object",
|
||||
"properties": {"prescriptions": {"type": "array",
|
||||
"items": {"type": "object",
|
||||
"properties": {"medication": {"type": "string",
|
||||
"description": "The medication's name",
|
||||
},
|
||||
"dosage": {"type": "string",
|
||||
"description": "The prescription's dosage",
|
||||
},
|
||||
},
|
||||
},
|
||||
}},
|
||||
},
|
||||
},
|
||||
}],
|
||||
},
|
||||
{"prompt": "Next, ask the user if they have any allergies. Once they have listed their allergies or confirmed they don't have any, call the list_allergies function.",
|
||||
"run_async": True,
|
||||
"tools": [{"type": "function",
|
||||
"function": {"name": "list_allergies",
|
||||
"description": "Once the user has provided a list of their allergies, call this function.",
|
||||
"parameters": {"type": "object",
|
||||
"properties": {"allergies": {"type": "array",
|
||||
"items": {"type": "object",
|
||||
"properties": {"name": {"type": "string",
|
||||
"description": "What the user is allergic to",
|
||||
}},
|
||||
},
|
||||
}},
|
||||
},
|
||||
},
|
||||
}],
|
||||
},
|
||||
{"prompt": "Now ask the user if they have any medical conditions the doctor should know about. Once they've answered the question, call the list_conditions function.",
|
||||
"run_async": True,
|
||||
"tools": [{"type": "function",
|
||||
"function": {"name": "list_conditions",
|
||||
"description": "Once the user has provided a list of their medical conditions, call this function.",
|
||||
"parameters": {"type": "object",
|
||||
"properties": {"conditions": {"type": "array",
|
||||
"items": {"type": "object",
|
||||
"properties": {"name": {"type": "string",
|
||||
"description": "The user's medical condition",
|
||||
}},
|
||||
},
|
||||
}},
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
{"prompt": "Finally, ask the user the reason for their doctor visit today. Once they answer, call the list_visit_reasons function.",
|
||||
"run_async": True,
|
||||
"tools": [{"type": "function",
|
||||
"function": {"name": "list_visit_reasons",
|
||||
"description": "Once the user has provided a list of the reasons they are visiting a doctor today, call this function.",
|
||||
"parameters": {"type": "object",
|
||||
"properties": {"visit_reasons": {"type": "array",
|
||||
"items": {"type": "object",
|
||||
"properties": {"name": {"type": "string",
|
||||
"description": "The user's reason for visiting the doctor",
|
||||
}},
|
||||
},
|
||||
}},
|
||||
},
|
||||
},
|
||||
}],
|
||||
},
|
||||
{"prompt": "Now, thank the user and end the conversation.",
|
||||
"run_async": True,
|
||||
"tools": [],
|
||||
},
|
||||
{"prompt": "",
|
||||
"run_async": True,
|
||||
"tools": []},
|
||||
]
|
||||
current_step = 0
|
||||
|
||||
|
||||
class ChecklistProcessor(AIService):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
context: OpenAILLMContext,
|
||||
llm: AIService,
|
||||
tools: List[ChatCompletionToolParam] | NotGiven = NOT_GIVEN,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(*args, **kwargs)
|
||||
self._context: OpenAILLMContext = context
|
||||
self._llm = llm
|
||||
self._id = "You are Jessica, an agent for a company called Tri-County Health Services. Your job is to collect important information from the user before their doctor visit. You're talking to Chad Bailey. You should address the user by their first name and be polite and professional. You're not a medical professional, so you shouldn't provide any advice. Keep your responses short. Your job is to collect information to give to a doctor. Don't make assumptions about what values to plug into functions. Ask for clarification if a user response is ambiguous."
|
||||
self._acks = ["One sec.", "Let me confirm that.", "Thanks.", "OK."]
|
||||
|
||||
# Create an allowlist of functions that the LLM can call
|
||||
self._functions = [
|
||||
"verify_birthday",
|
||||
"list_prescriptions",
|
||||
"list_allergies",
|
||||
"list_conditions",
|
||||
"list_visit_reasons",
|
||||
]
|
||||
|
||||
self._context.add_message(
|
||||
{"role": "system", "content": f"{self._id} {steps[0]['prompt']}"}
|
||||
)
|
||||
|
||||
if tools:
|
||||
self._context.set_tools(tools)
|
||||
|
||||
def verify_birthday(self, args):
|
||||
return args["birthday"] == "1983-01-01"
|
||||
|
||||
def list_prescriptions(self, args):
|
||||
# print(f"--- Prescriptions: {args['prescriptions']}\n")
|
||||
pass
|
||||
|
||||
def list_allergies(self, args):
|
||||
# print(f"--- Allergies: {args['allergies']}\n")
|
||||
pass
|
||||
|
||||
def list_conditions(self, args):
|
||||
# print(f"--- Medical Conditions: {args['conditions']}")
|
||||
pass
|
||||
|
||||
def list_visit_reasons(self, args):
|
||||
# print(f"Visit Reasons: {args['visit_reasons']}")
|
||||
pass
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
global current_step
|
||||
this_step = steps[current_step]
|
||||
self._context.set_tools(this_step["tools"])
|
||||
if isinstance(frame, LLMFunctionStartFrame):
|
||||
print(f"... Preparing function call: {frame.function_name}")
|
||||
self._function_name = frame.function_name
|
||||
if this_step["run_async"]:
|
||||
# Get the LLM talking about the next step before getting the rest
|
||||
# of the function call completion
|
||||
current_step += 1
|
||||
self._context.add_message(
|
||||
{"role": "system", "content": steps[current_step]["prompt"]}
|
||||
)
|
||||
yield OpenAILLMContextFrame(self._context)
|
||||
|
||||
local_context = copy.deepcopy(self._context)
|
||||
local_context.set_tool_choice("none")
|
||||
async for frame in llm.process_frame(
|
||||
OpenAILLMContextFrame(local_context)
|
||||
):
|
||||
yield frame
|
||||
else:
|
||||
# Insert a quick response while we run the function
|
||||
yield AudioFrame(sounds["ding2.wav"])
|
||||
pass
|
||||
elif isinstance(frame, LLMFunctionCallFrame):
|
||||
|
||||
if frame.function_name and frame.arguments:
|
||||
print(
|
||||
f"--> Calling function: {frame.function_name} with arguments:")
|
||||
pretty_json = re.sub(
|
||||
"\n", "\n ", json.dumps(
|
||||
json.loads(
|
||||
frame.arguments), indent=2))
|
||||
print(f"--> {pretty_json}\n")
|
||||
if frame.function_name not in self._functions:
|
||||
raise Exception(
|
||||
f"The LLM tried to call a function named {frame.function_name}, which isn't in the list of known functions. Please check your prompt and/or self._functions."
|
||||
)
|
||||
fn = getattr(self, frame.function_name)
|
||||
result = fn(json.loads(frame.arguments))
|
||||
|
||||
if not this_step["run_async"]:
|
||||
if result:
|
||||
current_step += 1
|
||||
self._context.add_message(
|
||||
{"role": "system", "content": steps[current_step]["prompt"]}
|
||||
)
|
||||
yield OpenAILLMContextFrame(self._context)
|
||||
|
||||
local_context = copy.deepcopy(self._context)
|
||||
local_context.set_tool_choice("none")
|
||||
async for frame in llm.process_frame(
|
||||
OpenAILLMContextFrame(local_context)
|
||||
):
|
||||
yield frame
|
||||
else:
|
||||
self._context.add_message(
|
||||
{"role": "system", "content": this_step["failed"]}
|
||||
)
|
||||
yield OpenAILLMContextFrame(self._context)
|
||||
|
||||
local_context = copy.deepcopy(self._context)
|
||||
local_context.set_tool_choice("none")
|
||||
async for frame in llm.process_frame(
|
||||
OpenAILLMContextFrame(local_context)
|
||||
):
|
||||
yield frame
|
||||
print(f"<-- Verify result: {result}\n")
|
||||
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
global transport
|
||||
global llm
|
||||
global tts
|
||||
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Intake Bot",
|
||||
5,
|
||||
mic_enabled=True,
|
||||
mic_sample_rate=16000,
|
||||
camera_enabled=False,
|
||||
start_transcription=True,
|
||||
vad_enabled=True,
|
||||
)
|
||||
|
||||
messages = []
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4-1106-preview",
|
||||
)
|
||||
# tts = DeepgramTTSService(
|
||||
# aiohttp_session=session,
|
||||
# api_key=os.getenv("DEEPGRAM_API_KEY"),
|
||||
# voice="aura-asteria-en",
|
||||
# )
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id="XrExE9yKIg1WjnnlVkGX",
|
||||
)
|
||||
context = OpenAILLMContext(
|
||||
messages=messages,
|
||||
)
|
||||
|
||||
checklist = ChecklistProcessor(context, llm)
|
||||
fl = FrameLogger("FRAME LOGGER 1:")
|
||||
fl2 = FrameLogger("FRAME LOGGER 2:")
|
||||
pipeline = Pipeline(processors=[fl, llm, fl2, checklist, tts])
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
await pipeline.queue_frames([OpenAILLMContextFrame(context)])
|
||||
|
||||
async def handle_intake():
|
||||
await transport.run_interruptible_pipeline(
|
||||
pipeline,
|
||||
post_processor=OpenAIAssistantContextAggregator(context),
|
||||
pre_processor=OpenAIUserContextAggregator(context),
|
||||
)
|
||||
|
||||
transport.transcription_settings["extra"]["endpointing"] = True
|
||||
transport.transcription_settings["extra"]["punctuate"] = True
|
||||
try:
|
||||
await asyncio.gather(transport.run(), handle_intake())
|
||||
except (asyncio.CancelledError, KeyboardInterrupt):
|
||||
print("whoops")
|
||||
transport.stop()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
@@ -1,58 +0,0 @@
|
||||
import argparse
|
||||
import os
|
||||
import time
|
||||
import urllib
|
||||
import requests
|
||||
|
||||
|
||||
def configure():
|
||||
parser = argparse.ArgumentParser(description="Daily AI SDK Bot Sample")
|
||||
parser.add_argument(
|
||||
"-u",
|
||||
"--url",
|
||||
type=str,
|
||||
required=False,
|
||||
help="URL of the Daily room to join")
|
||||
parser.add_argument(
|
||||
"-k",
|
||||
"--apikey",
|
||||
type=str,
|
||||
required=False,
|
||||
help="Daily API Key (needed to create an owner token for the room)",
|
||||
)
|
||||
|
||||
args, unknown = parser.parse_known_args()
|
||||
|
||||
url = args.url or os.getenv("DAILY_SAMPLE_ROOM_URL")
|
||||
key = args.apikey or os.getenv("DAILY_API_KEY")
|
||||
|
||||
if not url:
|
||||
raise Exception(
|
||||
"No Daily room specified. use the -u/--url option from the command line, or set DAILY_SAMPLE_ROOM_URL in your environment to specify a Daily room URL.")
|
||||
|
||||
if not key:
|
||||
raise Exception("No Daily API key specified. use the -k/--apikey option from the command line, or set DAILY_API_KEY in your environment to specify a Daily API key, available from https://dashboard.daily.co/developers.")
|
||||
|
||||
# Create a meeting token for the given room with an expiration 1 hour in
|
||||
# the future.
|
||||
room_name: str = urllib.parse.urlparse(url).path[1:]
|
||||
expiration: float = time.time() + 60 * 60
|
||||
|
||||
res: requests.Response = requests.post(
|
||||
f"https://api.daily.co/v1/meeting-tokens",
|
||||
headers={
|
||||
"Authorization": f"Bearer {key}"},
|
||||
json={
|
||||
"properties": {
|
||||
"room_name": room_name,
|
||||
"is_owner": True,
|
||||
"exp": expiration}},
|
||||
)
|
||||
|
||||
if res.status_code != 200:
|
||||
raise Exception(
|
||||
f"Failed to create meeting token: {res.status_code} {res.text}")
|
||||
|
||||
token: str = res.json()["token"]
|
||||
|
||||
return (url, token)
|
||||
@@ -1,290 +0,0 @@
|
||||
import aiohttp
|
||||
import asyncio
|
||||
import json
|
||||
import random
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import wave
|
||||
from typing import AsyncGenerator
|
||||
from PIL import Image
|
||||
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
from dailyai.pipeline.frame_processor import FrameProcessor
|
||||
from dailyai.transports.daily_transport import DailyTransport
|
||||
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
|
||||
from dailyai.services.fal_ai_services import FalImageGenService
|
||||
from dailyai.services.open_ai_services import OpenAILLMService
|
||||
from dailyai.services.deepgram_ai_services import DeepgramTTSService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.pipeline.aggregators import (
|
||||
LLMAssistantContextAggregator,
|
||||
UserResponseAggregator,
|
||||
LLMResponseAggregator,
|
||||
)
|
||||
from dailyai.pipeline.frames import (
|
||||
EndPipeFrame,
|
||||
LLMMessagesFrame,
|
||||
Frame,
|
||||
TextFrame,
|
||||
LLMResponseEndFrame,
|
||||
AudioFrame,
|
||||
ImageFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
)
|
||||
from dailyai.services.ai_services import FrameLogger, AIService
|
||||
|
||||
from runner import configure
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
sounds = {}
|
||||
images = {}
|
||||
sound_files = ["talking.wav", "listening.wav", "ding3.wav"]
|
||||
image_files = ["grandma-writing.png", "grandma-listening.png"]
|
||||
script_dir = os.path.dirname(__file__)
|
||||
|
||||
for file in sound_files:
|
||||
# Build the full path to the sound file
|
||||
full_path = os.path.join(script_dir, "assets", file)
|
||||
# Get the filename without the extension to use as the dictionary key
|
||||
filename = os.path.splitext(os.path.basename(full_path))[0]
|
||||
# Open the sound and convert it to bytes
|
||||
with wave.open(full_path) as audio_file:
|
||||
sounds[file] = audio_file.readframes(-1)
|
||||
|
||||
for file in image_files:
|
||||
# Build the full path to the image file
|
||||
full_path = os.path.join(script_dir, "assets", file)
|
||||
# Get the filename without the extension to use as the dictionary key
|
||||
filename = os.path.splitext(os.path.basename(full_path))[0]
|
||||
# Open the image and convert it to bytes
|
||||
with Image.open(full_path) as img:
|
||||
images[file] = img.tobytes()
|
||||
|
||||
|
||||
class StoryStartFrame(TextFrame):
|
||||
pass
|
||||
|
||||
|
||||
class StoryPageFrame(TextFrame):
|
||||
pass
|
||||
|
||||
|
||||
class StoryPromptFrame(TextFrame):
|
||||
pass
|
||||
|
||||
|
||||
class StoryProcessor(FrameProcessor):
|
||||
def __init__(self, messages, story):
|
||||
self._messages = messages
|
||||
self._text = ""
|
||||
self._story = story
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
"""
|
||||
The response from the LLM service looks like:
|
||||
A comment about the user's choice
|
||||
[start] (when the cat starts telling parts of the story)
|
||||
A sentence of the story
|
||||
[break] (between each sentence/'page' of the story)
|
||||
[prompt] (when the cat asks the user to make a decision)
|
||||
Question about the next part of the story
|
||||
|
||||
1. Catch the frames that are generated by the LLM service
|
||||
"""
|
||||
if isinstance(frame, UserStoppedSpeakingFrame):
|
||||
yield ImageFrame(None, images["grandma-writing.png"])
|
||||
yield AudioFrame(sounds["talking.wav"])
|
||||
|
||||
elif isinstance(frame, TextFrame):
|
||||
self._text += frame.text
|
||||
|
||||
if re.findall(r".*\[[sS]tart\].*", self._text):
|
||||
# Then we have the intro. Send it to speech ASAP
|
||||
self._text = self._text.replace("[Start]", "")
|
||||
self._text = self._text.replace("[start]", "")
|
||||
|
||||
self._text = self._text.replace("\n", " ")
|
||||
if len(self._text) > 2:
|
||||
yield ImageFrame(None, images["grandma-writing.png"])
|
||||
yield StoryStartFrame(self._text)
|
||||
yield AudioFrame(sounds["ding3.wav"])
|
||||
self._text = ""
|
||||
|
||||
elif re.findall(r".*\[[bB]reak\].*", self._text):
|
||||
# Then it's a page of the story. Get an image too
|
||||
self._text = self._text.replace("[Break]", "")
|
||||
self._text = self._text.replace("[break]", "")
|
||||
self._text = self._text.replace("\n", " ")
|
||||
if len(self._text) > 2:
|
||||
self._story.append(self._text)
|
||||
yield StoryPageFrame(self._text)
|
||||
yield AudioFrame(sounds["ding3.wav"])
|
||||
|
||||
self._text = ""
|
||||
elif re.findall(r".*\[[pP]rompt\].*", self._text):
|
||||
# Then it's question time. Flush any
|
||||
# text here as a story page, then set
|
||||
# the var to get to prompt mode
|
||||
# cb: trying scene now
|
||||
# self.handle_chunk(self._text)
|
||||
self._text = self._text.replace("[Prompt]", "")
|
||||
self._text = self._text.replace("[prompt]", "")
|
||||
|
||||
self._text = self._text.replace("\n", " ")
|
||||
if len(self._text) > 2:
|
||||
self._story.append(self._text)
|
||||
yield StoryPageFrame(self._text)
|
||||
else:
|
||||
# After the prompt thing, we'll catch an LLM end to get the
|
||||
# last bit
|
||||
pass
|
||||
elif isinstance(frame, LLMResponseEndFrame):
|
||||
yield ImageFrame(None, images["grandma-writing.png"])
|
||||
yield StoryPromptFrame(self._text)
|
||||
self._text = ""
|
||||
yield frame
|
||||
yield ImageFrame(None, images["grandma-listening.png"])
|
||||
yield AudioFrame(sounds["listening.wav"])
|
||||
|
||||
else:
|
||||
# pass through everything that's not a TextFrame
|
||||
yield frame
|
||||
|
||||
|
||||
class StoryImageGenerator(FrameProcessor):
|
||||
def __init__(self, story, llm, img):
|
||||
self._story = story
|
||||
self._llm = llm
|
||||
self._img = img
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, StoryPageFrame):
|
||||
if len(self._story) == 1:
|
||||
prompt = f'You are an illustrator for a children\'s story book. Generate a prompt for DALL-E to create an illustration for the first page of the book, which reads: "{self._story[0]}"\n\n Your response should start with the phrase "Children\'s book illustration of".'
|
||||
else:
|
||||
prompt = f"You are an illustrator for a children's story book. Here is the story so far:\n\n\"{' '.join(self._story[:-1])}\"\n\nGenerate a prompt for DALL-E to create an illustration for the next page. Here's the sentence for the next page:\n\n\"{self._story[-1:][0]}\"\n\n Your response should start with the phrase \"Children's book illustration of\"."
|
||||
msgs = [{"role": "system", "content": prompt}]
|
||||
image_prompt = ""
|
||||
async for f in self._llm.process_frame(LLMMessagesFrame(msgs)):
|
||||
if isinstance(f, TextFrame):
|
||||
image_prompt += f.text
|
||||
async for f in self._img.process_frame(TextFrame(image_prompt)):
|
||||
yield f
|
||||
# Yield the original StoryPageFrame for basic image/audio sync
|
||||
yield frame
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a storytelling grandma who loves to make up fantastic, fun, and educational stories for children between the ages of 5 and 10 years old. Your stories are full of friendly, magical creatures. Your stories are never scary. Each sentence of your story will become a page in a storybook. Stop after 3-4 sentences and give the child a choice to make that will influence the next part of the story. Once the child responds, start by saying something nice about the choice they made, then include [start] in your response. Include [break] after each sentence of the story. Include [prompt] between the story and the prompt.",
|
||||
}
|
||||
]
|
||||
|
||||
story = []
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4-1106-preview",
|
||||
) # gpt-4-1106-preview
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id="Xb7hH8MSUJpSbSDYk0k2",
|
||||
) # matilda
|
||||
img = FalImageGenService(
|
||||
image_size="1024x1024",
|
||||
aiohttp_session=session,
|
||||
key_id=os.getenv("FAL_KEY_ID"),
|
||||
key_secret=os.getenv("FAL_KEY_SECRET"),
|
||||
)
|
||||
lra = LLMResponseAggregator(messages)
|
||||
ura = UserResponseAggregator(messages)
|
||||
sp = StoryProcessor(messages, story)
|
||||
sig = StoryImageGenerator(story, llm, img)
|
||||
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Storybot",
|
||||
5,
|
||||
mic_enabled=True,
|
||||
mic_sample_rate=16000,
|
||||
camera_enabled=True,
|
||||
camera_width=1024,
|
||||
camera_height=1024,
|
||||
start_transcription=True,
|
||||
vad_enabled=True,
|
||||
vad_stop_s=1.5,
|
||||
)
|
||||
|
||||
start_story_event = asyncio.Event()
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
start_story_event.set()
|
||||
|
||||
async def storytime():
|
||||
await start_story_event.wait()
|
||||
|
||||
# We're being a bit tricky here by using a special system prompt to
|
||||
# ask the user for a story topic. After their intial response, we'll
|
||||
# use a different system prompt to create story pages.
|
||||
intro_messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a storytelling grandma who loves to make up fantastic, fun, and educational stories for children between the ages of 5 and 10 years old. Your stories are full of friendly, magical creatures. Your stories are never scary. Begin by asking what a child wants you to tell a story about. Keep your reponse to only a few sentences.",
|
||||
}
|
||||
]
|
||||
lca = LLMAssistantContextAggregator(messages)
|
||||
local_pipeline = Pipeline(
|
||||
[llm, lca, tts], sink=transport.send_queue)
|
||||
await local_pipeline.queue_frames(
|
||||
[
|
||||
ImageFrame(None, images["grandma-listening.png"]),
|
||||
LLMMessagesFrame(intro_messages),
|
||||
AudioFrame(sounds["listening.wav"]),
|
||||
EndPipeFrame(),
|
||||
]
|
||||
)
|
||||
await local_pipeline.run_pipeline()
|
||||
|
||||
fl = FrameLogger("### After Image Generation")
|
||||
pipeline = Pipeline(
|
||||
processors=[
|
||||
ura,
|
||||
llm,
|
||||
sp,
|
||||
sig,
|
||||
fl,
|
||||
tts,
|
||||
lra,
|
||||
]
|
||||
)
|
||||
await transport.run_pipeline(
|
||||
pipeline,
|
||||
)
|
||||
|
||||
transport.transcription_settings["extra"]["endpointing"] = True
|
||||
transport.transcription_settings["extra"]["punctuate"] = True
|
||||
try:
|
||||
await asyncio.gather(transport.run(), storytime())
|
||||
except (asyncio.CancelledError, KeyboardInterrupt):
|
||||
print("whoops")
|
||||
transport.stop()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
@@ -1,109 +0,0 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import logging
|
||||
import os
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from dailyai.pipeline.aggregators import (
|
||||
SentenceAggregator,
|
||||
)
|
||||
from dailyai.pipeline.frames import (
|
||||
Frame,
|
||||
LLMMessagesFrame,
|
||||
TextFrame,
|
||||
SendAppMessageFrame,
|
||||
)
|
||||
from dailyai.pipeline.frame_processor import FrameProcessor
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
from dailyai.transports.daily_transport import DailyTransport
|
||||
from dailyai.services.azure_ai_services import AzureTTSService
|
||||
from dailyai.services.open_ai_services import OpenAILLMService
|
||||
from dailyai.pipeline.aggregators import LLMFullResponseAggregator
|
||||
|
||||
from runner import configure
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
"""
|
||||
This example looks a bit different than the chatbot example, because it isn't waiting on the user to stop talking to start translating.
|
||||
It also isn't saving what the user or bot says into the context object for use in subsequent interactions.
|
||||
"""
|
||||
|
||||
|
||||
# We need to use a custom service here to yield LLM frames without saving
|
||||
# any context
|
||||
class TranslationProcessor(FrameProcessor):
|
||||
def __init__(self, language):
|
||||
self._language = language
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, TextFrame):
|
||||
context = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": f"You will be provided with a sentence in English, and your task is to translate it into {self._language}.",
|
||||
},
|
||||
{"role": "user", "content": frame.text},
|
||||
]
|
||||
yield LLMMessagesFrame(context)
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
class TranslationSubtitles(FrameProcessor):
|
||||
def __init__(self, language):
|
||||
self._language = language
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, TextFrame):
|
||||
app_message = {
|
||||
"language": self._language,
|
||||
"text": frame.text
|
||||
}
|
||||
yield SendAppMessageFrame(app_message, None)
|
||||
yield frame
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Translator",
|
||||
duration_minutes=5,
|
||||
start_transcription=True,
|
||||
mic_enabled=True,
|
||||
mic_sample_rate=16000,
|
||||
camera_enabled=False,
|
||||
)
|
||||
tts = AzureTTSService(
|
||||
api_key=os.getenv("AZURE_SPEECH_API_KEY"),
|
||||
region=os.getenv("AZURE_SPEECH_REGION"),
|
||||
voice="es-ES-AlvaroNeural",
|
||||
)
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4-turbo-preview"
|
||||
)
|
||||
sa = SentenceAggregator()
|
||||
tp = TranslationProcessor("Spanish")
|
||||
lfra = LLMFullResponseAggregator()
|
||||
ts = TranslationSubtitles("spanish")
|
||||
pipeline = Pipeline([sa, tp, llm, lfra, ts, tts])
|
||||
|
||||
transport.transcription_settings["extra"]["endpointing"] = True
|
||||
transport.transcription_settings["extra"]["punctuate"] = True
|
||||
await transport.run(pipeline)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
@@ -1,319 +0,0 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.10
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile --all-extras pyproject.toml
|
||||
#
|
||||
aiohttp==3.9.3
|
||||
# via dailyai (pyproject.toml)
|
||||
aiosignal==1.3.1
|
||||
# via aiohttp
|
||||
anthropic==0.20.0
|
||||
# via dailyai (pyproject.toml)
|
||||
anyio==4.3.0
|
||||
# via
|
||||
# anthropic
|
||||
# httpx
|
||||
# openai
|
||||
# starlette
|
||||
async-timeout==4.0.3
|
||||
# via aiohttp
|
||||
attrs==23.2.0
|
||||
# via
|
||||
# aiohttp
|
||||
# fal
|
||||
av==11.0.0
|
||||
# via faster-whisper
|
||||
azure-cognitiveservices-speech==1.36.0
|
||||
# via dailyai (pyproject.toml)
|
||||
blinker==1.7.0
|
||||
# via flask
|
||||
certifi==2024.2.2
|
||||
# via
|
||||
# httpcore
|
||||
# httpx
|
||||
# requests
|
||||
charset-normalizer==3.3.2
|
||||
# via requests
|
||||
click==8.1.7
|
||||
# via
|
||||
# fal
|
||||
# flask
|
||||
colorama==0.4.6
|
||||
# via fal
|
||||
coloredlogs==15.0.1
|
||||
# via onnxruntime
|
||||
ctranslate2==4.1.0
|
||||
# via faster-whisper
|
||||
daily-python==0.7.2
|
||||
# via dailyai (pyproject.toml)
|
||||
deprecated==1.2.14
|
||||
# via opentelemetry-api
|
||||
dill==0.3.7
|
||||
# via fal
|
||||
distlib==0.3.8
|
||||
# via virtualenv
|
||||
distro==1.9.0
|
||||
# via
|
||||
# anthropic
|
||||
# openai
|
||||
exceptiongroup==1.2.0
|
||||
# via anyio
|
||||
fal==0.12.3
|
||||
# via dailyai (pyproject.toml)
|
||||
fastapi==0.99.1
|
||||
# via fal
|
||||
faster-whisper==1.0.1
|
||||
# via dailyai (pyproject.toml)
|
||||
filelock==3.13.3
|
||||
# via
|
||||
# huggingface-hub
|
||||
# pyht
|
||||
# torch
|
||||
# triton
|
||||
# virtualenv
|
||||
flask==3.0.2
|
||||
# via
|
||||
# dailyai (pyproject.toml)
|
||||
# flask-cors
|
||||
flask-cors==4.0.0
|
||||
# via dailyai (pyproject.toml)
|
||||
flatbuffers==24.3.25
|
||||
# via onnxruntime
|
||||
frozenlist==1.4.1
|
||||
# via
|
||||
# aiohttp
|
||||
# aiosignal
|
||||
fsspec==2024.3.1
|
||||
# via
|
||||
# huggingface-hub
|
||||
# torch
|
||||
grpc-interceptor==0.15.4
|
||||
# via fal
|
||||
grpcio==1.62.1
|
||||
# via
|
||||
# fal
|
||||
# grpc-interceptor
|
||||
# isolate
|
||||
# isolate-proto
|
||||
# pyht
|
||||
h11==0.14.0
|
||||
# via httpcore
|
||||
httpcore==1.0.5
|
||||
# via httpx
|
||||
httpx==0.27.0
|
||||
# via
|
||||
# anthropic
|
||||
# fal
|
||||
# openai
|
||||
huggingface-hub==0.22.2
|
||||
# via
|
||||
# faster-whisper
|
||||
# tokenizers
|
||||
humanfriendly==10.0
|
||||
# via coloredlogs
|
||||
idna==3.6
|
||||
# via
|
||||
# anyio
|
||||
# httpx
|
||||
# requests
|
||||
# yarl
|
||||
importlib-metadata==7.0.0
|
||||
# via opentelemetry-api
|
||||
isolate[build]==0.12.7
|
||||
# via
|
||||
# fal
|
||||
# isolate-proto
|
||||
isolate-proto==0.3.3
|
||||
# via fal
|
||||
itsdangerous==2.1.2
|
||||
# via flask
|
||||
jinja2==3.1.3
|
||||
# via
|
||||
# flask
|
||||
# torch
|
||||
markdown-it-py==3.0.0
|
||||
# via rich
|
||||
markupsafe==2.1.5
|
||||
# via
|
||||
# jinja2
|
||||
# werkzeug
|
||||
mdurl==0.1.2
|
||||
# via markdown-it-py
|
||||
mpmath==1.3.0
|
||||
# via sympy
|
||||
msgpack==1.0.8
|
||||
# via fal
|
||||
multidict==6.0.5
|
||||
# via
|
||||
# aiohttp
|
||||
# yarl
|
||||
networkx==3.2.1
|
||||
# via torch
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# ctranslate2
|
||||
# dailyai (pyproject.toml)
|
||||
# onnxruntime
|
||||
nvidia-cublas-cu12==12.1.3.1
|
||||
# via
|
||||
# nvidia-cudnn-cu12
|
||||
# nvidia-cusolver-cu12
|
||||
# torch
|
||||
nvidia-cuda-cupti-cu12==12.1.105
|
||||
# via torch
|
||||
nvidia-cuda-nvrtc-cu12==12.1.105
|
||||
# via torch
|
||||
nvidia-cuda-runtime-cu12==12.1.105
|
||||
# via torch
|
||||
nvidia-cudnn-cu12==8.9.2.26
|
||||
# via torch
|
||||
nvidia-cufft-cu12==11.0.2.54
|
||||
# via torch
|
||||
nvidia-curand-cu12==10.3.2.106
|
||||
# via torch
|
||||
nvidia-cusolver-cu12==11.4.5.107
|
||||
# via torch
|
||||
nvidia-cusparse-cu12==12.1.0.106
|
||||
# via
|
||||
# nvidia-cusolver-cu12
|
||||
# torch
|
||||
nvidia-nccl-cu12==2.19.3
|
||||
# via torch
|
||||
nvidia-nvjitlink-cu12==12.4.127
|
||||
# via
|
||||
# nvidia-cusolver-cu12
|
||||
# nvidia-cusparse-cu12
|
||||
nvidia-nvtx-cu12==12.1.105
|
||||
# via torch
|
||||
onnxruntime==1.17.1
|
||||
# via faster-whisper
|
||||
openai==1.14.2
|
||||
# via dailyai (pyproject.toml)
|
||||
opentelemetry-api==1.24.0
|
||||
# via
|
||||
# fal
|
||||
# opentelemetry-sdk
|
||||
opentelemetry-sdk==1.24.0
|
||||
# via fal
|
||||
opentelemetry-semantic-conventions==0.45b0
|
||||
# via opentelemetry-sdk
|
||||
packaging==24.0
|
||||
# via
|
||||
# fal
|
||||
# huggingface-hub
|
||||
# onnxruntime
|
||||
pathspec==0.11.2
|
||||
# via fal
|
||||
pillow==10.2.0
|
||||
# via
|
||||
# dailyai (pyproject.toml)
|
||||
# fal
|
||||
platformdirs==4.2.0
|
||||
# via
|
||||
# isolate
|
||||
# virtualenv
|
||||
portalocker==2.8.2
|
||||
# via fal
|
||||
protobuf==4.25.3
|
||||
# via
|
||||
# isolate
|
||||
# isolate-proto
|
||||
# onnxruntime
|
||||
# pyht
|
||||
pyaudio==0.2.14
|
||||
# via dailyai (pyproject.toml)
|
||||
pydantic==1.10.15
|
||||
# via
|
||||
# anthropic
|
||||
# fal
|
||||
# fastapi
|
||||
# openai
|
||||
pygments==2.17.2
|
||||
# via rich
|
||||
pyht==0.0.26
|
||||
# via dailyai (pyproject.toml)
|
||||
pyjwt==2.8.0
|
||||
# via fal
|
||||
python-dateutil==2.9.0.post0
|
||||
# via fal
|
||||
python-dotenv==1.0.1
|
||||
# via dailyai (pyproject.toml)
|
||||
pyyaml==6.0.1
|
||||
# via
|
||||
# ctranslate2
|
||||
# huggingface-hub
|
||||
# isolate
|
||||
requests==2.31.0
|
||||
# via
|
||||
# huggingface-hub
|
||||
# pyht
|
||||
rich==13.7.1
|
||||
# via fal
|
||||
six==1.16.0
|
||||
# via python-dateutil
|
||||
sniffio==1.3.1
|
||||
# via
|
||||
# anthropic
|
||||
# anyio
|
||||
# httpx
|
||||
# openai
|
||||
starlette==0.27.0
|
||||
# via fastapi
|
||||
structlog==22.3.0
|
||||
# via fal
|
||||
sympy==1.12
|
||||
# via
|
||||
# onnxruntime
|
||||
# torch
|
||||
tblib==3.0.0
|
||||
# via isolate
|
||||
tokenizers==0.15.2
|
||||
# via
|
||||
# anthropic
|
||||
# faster-whisper
|
||||
torch==2.2.1
|
||||
# via
|
||||
# dailyai (pyproject.toml)
|
||||
# torchaudio
|
||||
torchaudio==2.2.1
|
||||
# via dailyai (pyproject.toml)
|
||||
tqdm==4.66.2
|
||||
# via
|
||||
# huggingface-hub
|
||||
# openai
|
||||
triton==2.2.0
|
||||
# via torch
|
||||
types-python-dateutil==2.9.0.20240316
|
||||
# via fal
|
||||
typing-extensions==4.10.0
|
||||
# via
|
||||
# anthropic
|
||||
# anyio
|
||||
# dailyai (pyproject.toml)
|
||||
# fal
|
||||
# fastapi
|
||||
# huggingface-hub
|
||||
# openai
|
||||
# opentelemetry-sdk
|
||||
# pydantic
|
||||
# torch
|
||||
urllib3==2.2.1
|
||||
# via requests
|
||||
virtualenv==20.25.1
|
||||
# via isolate
|
||||
websockets==12.0
|
||||
# via
|
||||
# dailyai (pyproject.toml)
|
||||
# fal
|
||||
werkzeug==3.0.2
|
||||
# via flask
|
||||
wrapt==1.16.0
|
||||
# via deprecated
|
||||
yarl==1.9.4
|
||||
# via aiohttp
|
||||
zipp==3.18.1
|
||||
# via importlib-metadata
|
||||
|
||||
# The following packages are considered to be unsafe in a requirements file:
|
||||
# setuptools
|
||||
@@ -1,285 +0,0 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.10
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile --all-extras pyproject.toml
|
||||
#
|
||||
aiohttp==3.9.3
|
||||
# via dailyai (pyproject.toml)
|
||||
aiosignal==1.3.1
|
||||
# via aiohttp
|
||||
anthropic==0.20.0
|
||||
# via dailyai (pyproject.toml)
|
||||
anyio==4.3.0
|
||||
# via
|
||||
# anthropic
|
||||
# httpx
|
||||
# openai
|
||||
# starlette
|
||||
async-timeout==4.0.3
|
||||
# via aiohttp
|
||||
attrs==23.2.0
|
||||
# via
|
||||
# aiohttp
|
||||
# fal
|
||||
av==11.0.0
|
||||
# via faster-whisper
|
||||
azure-cognitiveservices-speech==1.36.0
|
||||
# via dailyai (pyproject.toml)
|
||||
blinker==1.7.0
|
||||
# via flask
|
||||
certifi==2024.2.2
|
||||
# via
|
||||
# httpcore
|
||||
# httpx
|
||||
# requests
|
||||
charset-normalizer==3.3.2
|
||||
# via requests
|
||||
click==8.1.7
|
||||
# via
|
||||
# fal
|
||||
# flask
|
||||
colorama==0.4.6
|
||||
# via fal
|
||||
coloredlogs==15.0.1
|
||||
# via onnxruntime
|
||||
ctranslate2==4.1.0
|
||||
# via faster-whisper
|
||||
daily-python==0.7.2
|
||||
# via dailyai (pyproject.toml)
|
||||
deprecated==1.2.14
|
||||
# via opentelemetry-api
|
||||
dill==0.3.7
|
||||
# via fal
|
||||
distlib==0.3.8
|
||||
# via virtualenv
|
||||
distro==1.9.0
|
||||
# via
|
||||
# anthropic
|
||||
# openai
|
||||
exceptiongroup==1.2.0
|
||||
# via anyio
|
||||
fal==0.12.3
|
||||
# via dailyai (pyproject.toml)
|
||||
fastapi==0.99.1
|
||||
# via fal
|
||||
faster-whisper==1.0.1
|
||||
# via dailyai (pyproject.toml)
|
||||
filelock==3.13.3
|
||||
# via
|
||||
# huggingface-hub
|
||||
# pyht
|
||||
# torch
|
||||
# virtualenv
|
||||
flask==3.0.2
|
||||
# via
|
||||
# dailyai (pyproject.toml)
|
||||
# flask-cors
|
||||
flask-cors==4.0.0
|
||||
# via dailyai (pyproject.toml)
|
||||
flatbuffers==24.3.25
|
||||
# via onnxruntime
|
||||
frozenlist==1.4.1
|
||||
# via
|
||||
# aiohttp
|
||||
# aiosignal
|
||||
fsspec==2024.3.1
|
||||
# via
|
||||
# huggingface-hub
|
||||
# torch
|
||||
grpc-interceptor==0.15.4
|
||||
# via fal
|
||||
grpcio==1.62.1
|
||||
# via
|
||||
# fal
|
||||
# grpc-interceptor
|
||||
# isolate
|
||||
# isolate-proto
|
||||
# pyht
|
||||
h11==0.14.0
|
||||
# via httpcore
|
||||
httpcore==1.0.5
|
||||
# via httpx
|
||||
httpx==0.27.0
|
||||
# via
|
||||
# anthropic
|
||||
# fal
|
||||
# openai
|
||||
huggingface-hub==0.22.2
|
||||
# via
|
||||
# faster-whisper
|
||||
# tokenizers
|
||||
humanfriendly==10.0
|
||||
# via coloredlogs
|
||||
idna==3.6
|
||||
# via
|
||||
# anyio
|
||||
# httpx
|
||||
# requests
|
||||
# yarl
|
||||
importlib-metadata==7.0.0
|
||||
# via opentelemetry-api
|
||||
isolate[build]==0.12.7
|
||||
# via
|
||||
# fal
|
||||
# isolate-proto
|
||||
isolate-proto==0.3.3
|
||||
# via fal
|
||||
itsdangerous==2.1.2
|
||||
# via flask
|
||||
jinja2==3.1.3
|
||||
# via
|
||||
# flask
|
||||
# torch
|
||||
markdown-it-py==3.0.0
|
||||
# via rich
|
||||
markupsafe==2.1.5
|
||||
# via
|
||||
# jinja2
|
||||
# werkzeug
|
||||
mdurl==0.1.2
|
||||
# via markdown-it-py
|
||||
mpmath==1.3.0
|
||||
# via sympy
|
||||
msgpack==1.0.8
|
||||
# via fal
|
||||
multidict==6.0.5
|
||||
# via
|
||||
# aiohttp
|
||||
# yarl
|
||||
networkx==3.2.1
|
||||
# via torch
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# ctranslate2
|
||||
# dailyai (pyproject.toml)
|
||||
# onnxruntime
|
||||
onnxruntime==1.17.1
|
||||
# via faster-whisper
|
||||
openai==1.14.2
|
||||
# via dailyai (pyproject.toml)
|
||||
opentelemetry-api==1.24.0
|
||||
# via
|
||||
# fal
|
||||
# opentelemetry-sdk
|
||||
opentelemetry-sdk==1.24.0
|
||||
# via fal
|
||||
opentelemetry-semantic-conventions==0.45b0
|
||||
# via opentelemetry-sdk
|
||||
packaging==24.0
|
||||
# via
|
||||
# fal
|
||||
# huggingface-hub
|
||||
# onnxruntime
|
||||
pathspec==0.11.2
|
||||
# via fal
|
||||
pillow==10.2.0
|
||||
# via
|
||||
# dailyai (pyproject.toml)
|
||||
# fal
|
||||
platformdirs==4.2.0
|
||||
# via
|
||||
# isolate
|
||||
# virtualenv
|
||||
portalocker==2.8.2
|
||||
# via fal
|
||||
protobuf==4.25.3
|
||||
# via
|
||||
# isolate
|
||||
# isolate-proto
|
||||
# onnxruntime
|
||||
# pyht
|
||||
pyaudio==0.2.14
|
||||
# via dailyai (pyproject.toml)
|
||||
pydantic==1.10.15
|
||||
# via
|
||||
# anthropic
|
||||
# fal
|
||||
# fastapi
|
||||
# openai
|
||||
pygments==2.17.2
|
||||
# via rich
|
||||
pyht==0.0.26
|
||||
# via dailyai (pyproject.toml)
|
||||
pyjwt==2.8.0
|
||||
# via fal
|
||||
python-dateutil==2.9.0.post0
|
||||
# via fal
|
||||
python-dotenv==1.0.1
|
||||
# via dailyai (pyproject.toml)
|
||||
pyyaml==6.0.1
|
||||
# via
|
||||
# ctranslate2
|
||||
# huggingface-hub
|
||||
# isolate
|
||||
requests==2.31.0
|
||||
# via
|
||||
# huggingface-hub
|
||||
# pyht
|
||||
rich==13.7.1
|
||||
# via fal
|
||||
six==1.16.0
|
||||
# via python-dateutil
|
||||
sniffio==1.3.1
|
||||
# via
|
||||
# anthropic
|
||||
# anyio
|
||||
# httpx
|
||||
# openai
|
||||
starlette==0.27.0
|
||||
# via fastapi
|
||||
structlog==22.3.0
|
||||
# via fal
|
||||
sympy==1.12
|
||||
# via
|
||||
# onnxruntime
|
||||
# torch
|
||||
tblib==3.0.0
|
||||
# via isolate
|
||||
tokenizers==0.15.2
|
||||
# via
|
||||
# anthropic
|
||||
# faster-whisper
|
||||
torch==2.2.1
|
||||
# via
|
||||
# dailyai (pyproject.toml)
|
||||
# torchaudio
|
||||
torchaudio==2.2.1
|
||||
# via dailyai (pyproject.toml)
|
||||
tqdm==4.66.2
|
||||
# via
|
||||
# huggingface-hub
|
||||
# openai
|
||||
types-python-dateutil==2.9.0.20240316
|
||||
# via fal
|
||||
typing-extensions==4.10.0
|
||||
# via
|
||||
# anthropic
|
||||
# anyio
|
||||
# dailyai (pyproject.toml)
|
||||
# fal
|
||||
# fastapi
|
||||
# huggingface-hub
|
||||
# openai
|
||||
# opentelemetry-sdk
|
||||
# pydantic
|
||||
# torch
|
||||
urllib3==2.2.1
|
||||
# via requests
|
||||
virtualenv==20.25.1
|
||||
# via isolate
|
||||
websockets==12.0
|
||||
# via
|
||||
# dailyai (pyproject.toml)
|
||||
# fal
|
||||
werkzeug==3.0.2
|
||||
# via flask
|
||||
wrapt==1.16.0
|
||||
# via deprecated
|
||||
yarl==1.9.4
|
||||
# via aiohttp
|
||||
zipp==3.18.1
|
||||
# via importlib-metadata
|
||||
|
||||
# The following packages are considered to be unsafe in a requirements file:
|
||||
# setuptools
|
||||
@@ -1,54 +1,30 @@
|
||||
[build-system]
|
||||
requires = ["setuptools>=64", "setuptools_scm>=8"]
|
||||
requires = ["setuptools"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "dailyai"
|
||||
dynamic = ["version"]
|
||||
description = "An open source framework for real-time, multi-modal, conversational AI applications"
|
||||
license = { text = "BSD 2-Clause License" }
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.7"
|
||||
keywords = ["webrtc", "audio", "video", "ai"]
|
||||
classifiers = [
|
||||
"Development Status :: 5 - Production/Stable",
|
||||
"Intended Audience :: Developers",
|
||||
"License :: OSI Approved :: BSD License",
|
||||
"Topic :: Communications :: Conferencing",
|
||||
"Topic :: Multimedia :: Sound/Audio",
|
||||
"Topic :: Multimedia :: Video",
|
||||
"Topic :: Scientific/Engineering :: Artificial Intelligence"
|
||||
]
|
||||
name = "daily_ai"
|
||||
version = "0.0.1"
|
||||
description = "Orchestrator for AI bots with Daily"
|
||||
dependencies = [
|
||||
"aiohttp==3.9.3",
|
||||
"numpy==1.26.4",
|
||||
"Pillow==10.2.0",
|
||||
"typing-extensions==4.10.0",
|
||||
"aiohttp",
|
||||
"azure-cognitiveservices-speech",
|
||||
"daily-python",
|
||||
"fal",
|
||||
"faster_whisper",
|
||||
"groq",
|
||||
"google-cloud-texttospeech",
|
||||
"numpy",
|
||||
"openai",
|
||||
"Pillow",
|
||||
"pyht",
|
||||
"python-dotenv",
|
||||
"torch",
|
||||
"torchaudio",
|
||||
"pyaudio",
|
||||
"typing-extensions"
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
Source = "https://github.com/daily-co/dailyai"
|
||||
Website = "https://daily.co"
|
||||
|
||||
[project.optional-dependencies]
|
||||
anthropic = [ "anthropic==0.20.0" ]
|
||||
azure = [ "azure-cognitiveservices-speech==1.36.0" ]
|
||||
daily = [ "daily-python==0.7.2" ]
|
||||
examples = [ "python-dotenv==1.0.1", "flask==3.0.2", "flask_cors==4.0.0" ]
|
||||
fal = [ "fal==0.12.3" ]
|
||||
local = [ "pyaudio==0.2.14" ]
|
||||
openai = [ "openai==1.14.2" ]
|
||||
playht = [ "pyht==0.0.26" ]
|
||||
silero = [ "torch==2.2.1", "torchaudio==2.2.1" ]
|
||||
websocket = [ "websockets==12.0" ]
|
||||
whisper = [ "faster_whisper==1.0.1" ]
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
# All the following settings are optional:
|
||||
where = ["src"]
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
pythonpath = ["src"]
|
||||
|
||||
[tool.setuptools_scm]
|
||||
# Empty
|
||||
|
||||
4
requirements.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
autopep8==2.0.4
|
||||
build==1.0.3
|
||||
packaging==23.2
|
||||
pyproject_hooks==1.0.0
|
||||
77
src/dailyai/conversation_wrappers.py
Normal file
@@ -0,0 +1,77 @@
|
||||
import asyncio
|
||||
import copy
|
||||
import functools
|
||||
from typing import AsyncGenerator, Awaitable, Callable
|
||||
from dailyai.queue_aggregators import LLMAssistantContextAggregator, LLMContextAggregator, LLMUserContextAggregator
|
||||
from dailyai.queue_frame import EndStreamQueueFrame, QueueFrame, TranscriptionQueueFrame
|
||||
|
||||
|
||||
class InterruptibleConversationWrapper:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
frame_generator: Callable[[], AsyncGenerator[QueueFrame, None]],
|
||||
runner: Callable[
|
||||
[str, LLMContextAggregator, LLMContextAggregator], Awaitable[None]
|
||||
],
|
||||
interrupt: Callable[[], None],
|
||||
my_participant_id: str | None,
|
||||
llm_messages: list[dict[str, str]],
|
||||
llm_context_aggregator_in=LLMUserContextAggregator,
|
||||
llm_context_aggregator_out=LLMAssistantContextAggregator,
|
||||
delay_before_speech_seconds: float = 1.0,
|
||||
):
|
||||
self._frame_generator: Callable[[], AsyncGenerator[QueueFrame, None]] = frame_generator
|
||||
self._runner: Callable[
|
||||
[str, LLMContextAggregator, LLMContextAggregator], Awaitable[None]
|
||||
] = runner
|
||||
self._interrupt: Callable[[], None] = interrupt
|
||||
self._my_participant_id = my_participant_id
|
||||
self._messages: list[dict[str, str]] = llm_messages
|
||||
self._delay_before_speech_seconds = delay_before_speech_seconds
|
||||
self._llm_context_aggregator_in = llm_context_aggregator_in
|
||||
self._llm_context_aggregator_out = llm_context_aggregator_out
|
||||
|
||||
self._current_phrase = ""
|
||||
|
||||
def update_messages(self, new_messages: list[dict[str, str]], task: asyncio.Task | None):
|
||||
if task:
|
||||
if not task.cancelled():
|
||||
self._current_phrase = ""
|
||||
self._messages = new_messages
|
||||
|
||||
async def speak_after_delay(self, user_speech, messages):
|
||||
await asyncio.sleep(self._delay_before_speech_seconds)
|
||||
tma_in = self._llm_context_aggregator_in(
|
||||
messages, self._my_participant_id, complete_sentences=False
|
||||
)
|
||||
tma_out = self._llm_context_aggregator_out(
|
||||
messages, self._my_participant_id
|
||||
)
|
||||
|
||||
await self._runner(user_speech, tma_in, tma_out)
|
||||
|
||||
async def run_conversation(self):
|
||||
current_response_task = None
|
||||
|
||||
async for frame in self._frame_generator():
|
||||
if isinstance(frame, EndStreamQueueFrame):
|
||||
break
|
||||
elif not isinstance(frame, TranscriptionQueueFrame):
|
||||
continue
|
||||
|
||||
if frame.participantId == self._my_participant_id:
|
||||
continue
|
||||
|
||||
if current_response_task:
|
||||
current_response_task.cancel()
|
||||
self._interrupt()
|
||||
|
||||
self._current_phrase += " " + frame.text
|
||||
current_llm_messages = copy.deepcopy(self._messages)
|
||||
current_response_task = asyncio.create_task(
|
||||
self.speak_after_delay(self._current_phrase, current_llm_messages)
|
||||
)
|
||||
current_response_task.add_done_callback(
|
||||
functools.partial(self.update_messages, current_llm_messages)
|
||||
)
|
||||
@@ -1,392 +0,0 @@
|
||||
import asyncio
|
||||
import re
|
||||
|
||||
from dailyai.pipeline.frame_processor import FrameProcessor
|
||||
|
||||
from dailyai.pipeline.frames import (
|
||||
EndFrame,
|
||||
EndPipeFrame,
|
||||
Frame,
|
||||
LLMMessagesFrame,
|
||||
LLMResponseEndFrame,
|
||||
LLMResponseStartFrame,
|
||||
TextFrame,
|
||||
TranscriptionFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
)
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
from dailyai.services.ai_services import AIService
|
||||
|
||||
from typing import AsyncGenerator, Coroutine, List
|
||||
|
||||
|
||||
class ResponseAggregator(FrameProcessor):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
messages: list[dict] | None,
|
||||
role: str,
|
||||
start_frame,
|
||||
end_frame,
|
||||
accumulator_frame,
|
||||
pass_through=True,
|
||||
):
|
||||
self.aggregation = ""
|
||||
self.aggregating = False
|
||||
self.messages = messages
|
||||
self._role = role
|
||||
self._start_frame = start_frame
|
||||
self._end_frame = end_frame
|
||||
self._accumulator_frame = accumulator_frame
|
||||
self._pass_through = pass_through
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if not self.messages:
|
||||
return
|
||||
|
||||
if isinstance(frame, self._start_frame):
|
||||
self.aggregating = True
|
||||
elif isinstance(frame, self._end_frame):
|
||||
self.aggregating = False
|
||||
# Sometimes VAD triggers quickly on and off. If we don't get any transcription,
|
||||
# it creates empty LLM message queue frames
|
||||
if len(self.aggregation) > 0:
|
||||
self.messages.append(
|
||||
{"role": self._role, "content": self.aggregation})
|
||||
self.aggregation = ""
|
||||
yield self._end_frame()
|
||||
yield LLMMessagesFrame(self.messages)
|
||||
elif isinstance(frame, self._accumulator_frame) and self.aggregating:
|
||||
self.aggregation += f" {frame.text}"
|
||||
if self._pass_through:
|
||||
yield frame
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
class LLMResponseAggregator(ResponseAggregator):
|
||||
def __init__(self, messages: list[dict]):
|
||||
super().__init__(
|
||||
messages=messages,
|
||||
role="assistant",
|
||||
start_frame=LLMResponseStartFrame,
|
||||
end_frame=LLMResponseEndFrame,
|
||||
accumulator_frame=TextFrame,
|
||||
)
|
||||
|
||||
|
||||
class UserResponseAggregator(ResponseAggregator):
|
||||
def __init__(self, messages: list[dict]):
|
||||
super().__init__(
|
||||
messages=messages,
|
||||
role="user",
|
||||
start_frame=UserStartedSpeakingFrame,
|
||||
end_frame=UserStoppedSpeakingFrame,
|
||||
accumulator_frame=TranscriptionFrame,
|
||||
pass_through=False,
|
||||
)
|
||||
|
||||
|
||||
class LLMContextAggregator(AIService):
|
||||
def __init__(
|
||||
self,
|
||||
messages: list[dict],
|
||||
role: str,
|
||||
bot_participant_id=None,
|
||||
complete_sentences=True,
|
||||
pass_through=True,
|
||||
):
|
||||
super().__init__()
|
||||
self.messages = messages
|
||||
self.bot_participant_id = bot_participant_id
|
||||
self.role = role
|
||||
self.sentence = ""
|
||||
self.complete_sentences = complete_sentences
|
||||
self.pass_through = pass_through
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
# We don't do anything with non-text frames, pass it along to next in
|
||||
# the pipeline.
|
||||
if not isinstance(frame, TextFrame):
|
||||
yield frame
|
||||
return
|
||||
|
||||
# Ignore transcription frames from the bot
|
||||
if isinstance(frame, TranscriptionFrame):
|
||||
if frame.participantId == self.bot_participant_id:
|
||||
return
|
||||
|
||||
# The common case for "pass through" is receiving frames from the LLM that we'll
|
||||
# use to update the "assistant" LLM messages, but also passing the text frames
|
||||
# along to a TTS service to be spoken to the user.
|
||||
if self.pass_through:
|
||||
yield frame
|
||||
|
||||
# TODO: split up transcription by participant
|
||||
if self.complete_sentences:
|
||||
# type: ignore -- the linter thinks this isn't a TextFrame, even
|
||||
# though we check it above
|
||||
self.sentence += frame.text
|
||||
if self.sentence.endswith((".", "?", "!")):
|
||||
self.messages.append(
|
||||
{"role": self.role, "content": self.sentence})
|
||||
self.sentence = ""
|
||||
yield LLMMessagesFrame(self.messages)
|
||||
else:
|
||||
# type: ignore -- the linter thinks this isn't a TextFrame, even
|
||||
# though we check it above
|
||||
self.messages.append({"role": self.role, "content": frame.text})
|
||||
yield LLMMessagesFrame(self.messages)
|
||||
|
||||
|
||||
class LLMUserContextAggregator(LLMContextAggregator):
|
||||
def __init__(
|
||||
self,
|
||||
messages: list[dict],
|
||||
bot_participant_id=None,
|
||||
complete_sentences=True):
|
||||
super().__init__(
|
||||
messages,
|
||||
"user",
|
||||
bot_participant_id,
|
||||
complete_sentences,
|
||||
pass_through=False)
|
||||
|
||||
|
||||
class LLMAssistantContextAggregator(LLMContextAggregator):
|
||||
def __init__(
|
||||
self,
|
||||
messages: list[dict],
|
||||
bot_participant_id=None,
|
||||
complete_sentences=True):
|
||||
super().__init__(
|
||||
messages,
|
||||
"assistant",
|
||||
bot_participant_id,
|
||||
complete_sentences,
|
||||
pass_through=True,
|
||||
)
|
||||
|
||||
|
||||
class SentenceAggregator(FrameProcessor):
|
||||
"""This frame processor aggregates text frames into complete sentences.
|
||||
|
||||
Frame input/output:
|
||||
TextFrame("Hello,") -> None
|
||||
TextFrame(" world.") -> TextFrame("Hello world.")
|
||||
|
||||
Doctest:
|
||||
>>> async def print_frames(aggregator, frame):
|
||||
... async for frame in aggregator.process_frame(frame):
|
||||
... print(frame.text)
|
||||
|
||||
>>> aggregator = SentenceAggregator()
|
||||
>>> asyncio.run(print_frames(aggregator, TextFrame("Hello,")))
|
||||
>>> asyncio.run(print_frames(aggregator, TextFrame(" world.")))
|
||||
Hello, world.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.aggregation = ""
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, TextFrame):
|
||||
m = re.search("(.*[?.!])(.*)", frame.text)
|
||||
if m:
|
||||
yield TextFrame(self.aggregation + m.group(1))
|
||||
self.aggregation = m.group(2)
|
||||
else:
|
||||
self.aggregation += frame.text
|
||||
elif isinstance(frame, EndFrame):
|
||||
if self.aggregation:
|
||||
yield TextFrame(self.aggregation)
|
||||
yield frame
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
class LLMFullResponseAggregator(FrameProcessor):
|
||||
"""This class aggregates Text frames until it receives a
|
||||
LLMResponseEndFrame, then emits the concatenated text as
|
||||
a single text frame.
|
||||
|
||||
given the following frames:
|
||||
|
||||
TextFrame("Hello,")
|
||||
TextFrame(" world.")
|
||||
TextFrame(" I am")
|
||||
TextFrame(" an LLM.")
|
||||
LLMResponseEndFrame()]
|
||||
|
||||
this processor will yield nothing for the first 4 frames, then
|
||||
|
||||
TextFrame("Hello, world. I am an LLM.")
|
||||
LLMResponseEndFrame()
|
||||
|
||||
when passed the last frame.
|
||||
|
||||
>>> async def print_frames(aggregator, frame):
|
||||
... async for frame in aggregator.process_frame(frame):
|
||||
... if isinstance(frame, TextFrame):
|
||||
... print(frame.text)
|
||||
... else:
|
||||
... print(frame.__class__.__name__)
|
||||
|
||||
>>> aggregator = LLMFullResponseAggregator()
|
||||
>>> asyncio.run(print_frames(aggregator, TextFrame("Hello,")))
|
||||
>>> asyncio.run(print_frames(aggregator, TextFrame(" world.")))
|
||||
>>> asyncio.run(print_frames(aggregator, TextFrame(" I am")))
|
||||
>>> asyncio.run(print_frames(aggregator, TextFrame(" an LLM.")))
|
||||
>>> asyncio.run(print_frames(aggregator, LLMResponseEndFrame()))
|
||||
Hello, world. I am an LLM.
|
||||
LLMResponseEndFrame
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.aggregation = ""
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, TextFrame):
|
||||
self.aggregation += frame.text
|
||||
elif isinstance(frame, LLMResponseEndFrame):
|
||||
yield TextFrame(self.aggregation)
|
||||
yield frame
|
||||
self.aggregation = ""
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
class StatelessTextTransformer(FrameProcessor):
|
||||
"""This processor calls the given function on any text in a text frame.
|
||||
|
||||
>>> async def print_frames(aggregator, frame):
|
||||
... async for frame in aggregator.process_frame(frame):
|
||||
... print(frame.text)
|
||||
|
||||
>>> aggregator = StatelessTextTransformer(lambda x: x.upper())
|
||||
>>> asyncio.run(print_frames(aggregator, TextFrame("Hello")))
|
||||
HELLO
|
||||
"""
|
||||
|
||||
def __init__(self, transform_fn):
|
||||
self.transform_fn = transform_fn
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, TextFrame):
|
||||
result = self.transform_fn(frame.text)
|
||||
if isinstance(result, Coroutine):
|
||||
result = await result
|
||||
|
||||
yield TextFrame(result)
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
class ParallelPipeline(FrameProcessor):
|
||||
"""Run multiple pipelines in parallel.
|
||||
|
||||
This class takes frames from its source queue and sends them to each
|
||||
sub-pipeline. Each sub-pipeline emits its frames into this class's
|
||||
sink queue. No guarantees are made about the ordering of frames in
|
||||
the sink queue (that is, no sub-pipeline has higher priority than
|
||||
any other, frames are put on the sink in the order they're emitted
|
||||
by the sub-pipelines).
|
||||
|
||||
After each frame is taken from this class's source queue and placed
|
||||
in each sub-pipeline's source queue, an EndPipeFrame is put on each
|
||||
sub-pipeline's source queue. This indicates to the sub-pipe runner
|
||||
that it should exit.
|
||||
|
||||
Since frame handlers pass through unhandled frames by convention, this
|
||||
class de-dupes frames in its sink before yielding them.
|
||||
"""
|
||||
|
||||
def __init__(self, pipeline_definitions: List[List[FrameProcessor]]):
|
||||
self.sources = [asyncio.Queue() for _ in pipeline_definitions]
|
||||
self.sink: asyncio.Queue[Frame] = asyncio.Queue()
|
||||
self.pipelines: list[Pipeline] = [
|
||||
Pipeline(
|
||||
pipeline_definition,
|
||||
source,
|
||||
self.sink,
|
||||
)
|
||||
for source, pipeline_definition in zip(self.sources, pipeline_definitions)
|
||||
]
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
for source in self.sources:
|
||||
await source.put(frame)
|
||||
await source.put(EndPipeFrame())
|
||||
|
||||
await asyncio.gather(*[pipeline.run_pipeline() for pipeline in self.pipelines])
|
||||
|
||||
seen_ids = set()
|
||||
while not self.sink.empty():
|
||||
frame = await self.sink.get()
|
||||
|
||||
# de-dup frames. Because the convention is to yield a frame that isn't processed,
|
||||
# each pipeline will likely yield the same frame, so we will end up with _n_ copies
|
||||
# of unprocessed frames where _n_ is the number of parallel pipes that don't
|
||||
# process that frame.
|
||||
if id(frame) in seen_ids:
|
||||
continue
|
||||
seen_ids.add(id(frame))
|
||||
|
||||
# Skip passing along EndPipeFrame, because we use them
|
||||
# for our own flow control.
|
||||
if not isinstance(frame, EndPipeFrame):
|
||||
yield frame
|
||||
|
||||
|
||||
class GatedAggregator(FrameProcessor):
|
||||
"""Accumulate frames, with custom functions to start and stop accumulation.
|
||||
Yields gate-opening frame before any accumulated frames, then ensuing frames
|
||||
until and not including the gate-closed frame.
|
||||
|
||||
>>> from dailyai.pipeline.frames import ImageFrame
|
||||
|
||||
>>> async def print_frames(aggregator, frame):
|
||||
... async for frame in aggregator.process_frame(frame):
|
||||
... if isinstance(frame, TextFrame):
|
||||
... print(frame.text)
|
||||
... else:
|
||||
... print(frame.__class__.__name__)
|
||||
|
||||
>>> aggregator = GatedAggregator(
|
||||
... gate_close_fn=lambda x: isinstance(x, LLMResponseStartFrame),
|
||||
... gate_open_fn=lambda x: isinstance(x, ImageFrame),
|
||||
... start_open=False)
|
||||
>>> asyncio.run(print_frames(aggregator, TextFrame("Hello")))
|
||||
>>> asyncio.run(print_frames(aggregator, TextFrame("Hello again.")))
|
||||
>>> asyncio.run(print_frames(aggregator, ImageFrame(url='', image=bytes([]))))
|
||||
ImageFrame
|
||||
Hello
|
||||
Hello again.
|
||||
>>> asyncio.run(print_frames(aggregator, TextFrame("Goodbye.")))
|
||||
Goodbye.
|
||||
"""
|
||||
|
||||
def __init__(self, gate_open_fn, gate_close_fn, start_open):
|
||||
self.gate_open_fn = gate_open_fn
|
||||
self.gate_close_fn = gate_close_fn
|
||||
self.gate_open = start_open
|
||||
self.accumulator: List[Frame] = []
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if self.gate_open:
|
||||
if self.gate_close_fn(frame):
|
||||
self.gate_open = False
|
||||
else:
|
||||
if self.gate_open_fn(frame):
|
||||
self.gate_open = True
|
||||
|
||||
if self.gate_open:
|
||||
yield frame
|
||||
if self.accumulator:
|
||||
for frame in self.accumulator:
|
||||
yield frame
|
||||
self.accumulator = []
|
||||
else:
|
||||
self.accumulator.append(frame)
|
||||
@@ -1,34 +0,0 @@
|
||||
from abc import abstractmethod
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from dailyai.pipeline.frames import ControlFrame, Frame
|
||||
|
||||
|
||||
class FrameProcessor:
|
||||
"""This is the base class for all frame processors. Frame processors consume a frame
|
||||
and yield 0 or more frames. Generally frame processors are used as part of a pipeline
|
||||
where frames come from a source queue, are processed by a series of frame processors,
|
||||
then placed on a sink queue.
|
||||
|
||||
By convention, FrameProcessors should immediately yield any frames they don't process.
|
||||
|
||||
Stateful FrameProcessors should watch for the EndFrame and finalize their
|
||||
output, eg. yielding an unfinished sentence if they're aggregating LLM output to full
|
||||
sentences. EndFrame is also a chance to clean up any services that need to
|
||||
be closed, del'd, etc.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
async def process_frame(
|
||||
self, frame: Frame
|
||||
) -> AsyncGenerator[Frame, None]:
|
||||
"""Process a single frame and yield 0 or more frames."""
|
||||
yield frame
|
||||
|
||||
@abstractmethod
|
||||
async def interrupted(self) -> None:
|
||||
"""Handle any cleanup if the pipeline was interrupted."""
|
||||
pass
|
||||
|
||||
def __str__(self):
|
||||
return self.__class__.__name__
|
||||
@@ -1,25 +0,0 @@
|
||||
syntax = "proto3";
|
||||
|
||||
package dailyai_proto;
|
||||
|
||||
message TextFrame {
|
||||
string text = 1;
|
||||
}
|
||||
|
||||
message AudioFrame {
|
||||
bytes data = 1;
|
||||
}
|
||||
|
||||
message TranscriptionFrame {
|
||||
string text = 1;
|
||||
string participantId = 2;
|
||||
string timestamp = 3;
|
||||
}
|
||||
|
||||
message Frame {
|
||||
oneof frame {
|
||||
TextFrame text = 1;
|
||||
AudioFrame audio = 2;
|
||||
TranscriptionFrame transcription = 3;
|
||||
}
|
||||
}
|
||||
@@ -1,187 +0,0 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, List
|
||||
|
||||
|
||||
class Frame:
|
||||
def __str__(self):
|
||||
return f"{self.__class__.__name__}"
|
||||
|
||||
|
||||
class ControlFrame(Frame):
|
||||
# Control frames should contain no instance data, so
|
||||
# equality is based solely on the class.
|
||||
def __eq__(self, other):
|
||||
return isinstance(other, self.__class__)
|
||||
|
||||
|
||||
class StartFrame(ControlFrame):
|
||||
"""Used (but not required) to start a pipeline, and is also used to
|
||||
indicate that an interruption has ended and the transport should start
|
||||
processing frames again."""
|
||||
pass
|
||||
|
||||
|
||||
class EndFrame(ControlFrame):
|
||||
"""Indicates that a pipeline has ended and frame processors and pipelines
|
||||
should be shut down. If the transport receives this frame, it will stop
|
||||
sending frames to its output channel(s) and close all its threads."""
|
||||
pass
|
||||
|
||||
|
||||
class EndPipeFrame(ControlFrame):
|
||||
"""Indicates that a pipeline has ended but that the transport should
|
||||
continue processing. This frame is used in parallel pipelines and other
|
||||
sub-pipelines."""
|
||||
pass
|
||||
|
||||
|
||||
class PipelineStartedFrame(ControlFrame):
|
||||
"""
|
||||
Used by the transport to indicate that execution of a pipeline is starting
|
||||
(or restarting). It should be the first frame your app receives when it
|
||||
starts, or when an interruptible pipeline has been interrupted.
|
||||
"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class LLMResponseStartFrame(ControlFrame):
|
||||
"""Used to indicate the beginning of an LLM response. Following TextFrames
|
||||
are part of the LLM response until an LLMResponseEndFrame"""
|
||||
pass
|
||||
|
||||
|
||||
class LLMResponseEndFrame(ControlFrame):
|
||||
"""Indicates the end of an LLM response."""
|
||||
pass
|
||||
|
||||
|
||||
@dataclass()
|
||||
class AudioFrame(Frame):
|
||||
"""A chunk of audio. Will be played by the transport if the transport's mic
|
||||
has been enabled."""
|
||||
data: bytes
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.__class__.__name__}, size: {len(self.data)} B"
|
||||
|
||||
|
||||
@dataclass()
|
||||
class ImageFrame(Frame):
|
||||
"""An image. Will be shown by the transport if the transport's camera is
|
||||
enabled."""
|
||||
url: str | None
|
||||
image: bytes
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.__class__.__name__}, url: {self.url}, image size: {len(self.image)} B"
|
||||
|
||||
|
||||
@dataclass()
|
||||
class SpriteFrame(Frame):
|
||||
"""An animated sprite. Will be shown by the transport if the transport's
|
||||
camera is enabled. Will play at the framerate specified in the transport's
|
||||
`fps` constructor parameter."""
|
||||
images: list[bytes]
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.__class__.__name__}, list size: {len(self.images)}"
|
||||
|
||||
|
||||
@dataclass()
|
||||
class TextFrame(Frame):
|
||||
"""A chunk of text. Emitted by LLM services, consumed by TTS services, can
|
||||
be used to send text through pipelines."""
|
||||
text: str
|
||||
|
||||
def __str__(self):
|
||||
return f'{self.__class__.__name__}: "{self.text}"'
|
||||
|
||||
|
||||
@dataclass()
|
||||
class TranscriptionFrame(TextFrame):
|
||||
"""A text frame with transcription-specific data. Will be placed in the
|
||||
transport's receive queue when a participant speaks."""
|
||||
participantId: str
|
||||
timestamp: str
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.__class__.__name__}, text: '{self.text}' participantId: {self.participantId}, timestamp: {self.timestamp}"
|
||||
|
||||
|
||||
class TTSStartFrame(ControlFrame):
|
||||
"""Used to indicate the beginning of a TTS response. Following AudioFrames
|
||||
are part of the TTS response until an TTEndFrame. These frames can be used
|
||||
for aggregating audio frames in a transport to optimize the size of frames
|
||||
sent to the session, without needing to control this in the TTS service."""
|
||||
pass
|
||||
|
||||
|
||||
class TTSEndFrame(ControlFrame):
|
||||
"""Indicates the end of a TTS response."""
|
||||
pass
|
||||
|
||||
|
||||
@dataclass()
|
||||
class LLMMessagesFrame(Frame):
|
||||
"""A frame containing a list of LLM messages. Used to signal that an LLM
|
||||
service should run a chat completion and emit an LLMStartFrames, TextFrames
|
||||
and an LLMEndFrame.
|
||||
Note that the messages property on this class is mutable, and will be
|
||||
be updated by various ResponseAggregator frame processors."""
|
||||
messages: List[dict]
|
||||
|
||||
|
||||
@dataclass()
|
||||
class ReceivedAppMessageFrame(Frame):
|
||||
message: Any
|
||||
sender: str
|
||||
|
||||
def __str__(self):
|
||||
return f"ReceivedAppMessageFrame: sender: {self.sender}, message: {self.message}"
|
||||
|
||||
|
||||
@dataclass()
|
||||
class SendAppMessageFrame(Frame):
|
||||
message: Any
|
||||
participantId: str | None
|
||||
|
||||
def __str__(self):
|
||||
return f"SendAppMessageFrame: participantId: {self.participantId}, message: {self.message}"
|
||||
|
||||
|
||||
class UserStartedSpeakingFrame(Frame):
|
||||
"""Emitted by VAD to indicate that a participant has started speaking.
|
||||
This can be used for interruptions or other times when detecting that
|
||||
someone is speaking is more important than knowing what they're saying
|
||||
(as you will with a TranscriptionFrame)"""
|
||||
pass
|
||||
|
||||
|
||||
class UserStoppedSpeakingFrame(Frame):
|
||||
"""Emitted by the VAD to indicate that a user stopped speaking."""
|
||||
pass
|
||||
|
||||
|
||||
class BotStartedSpeakingFrame(Frame):
|
||||
pass
|
||||
|
||||
|
||||
class BotStoppedSpeakingFrame(Frame):
|
||||
pass
|
||||
|
||||
|
||||
@dataclass()
|
||||
class LLMFunctionStartFrame(Frame):
|
||||
"""Emitted when the LLM receives the beginning of a function call
|
||||
completion. A frame processor can use this frame to indicate that it should
|
||||
start preparing to make a function call, if it can do so in the absence of
|
||||
any arguments."""
|
||||
function_name: str
|
||||
|
||||
|
||||
@dataclass()
|
||||
class LLMFunctionCallFrame(Frame):
|
||||
"""Emitted when the LLM has received an entire function call completion."""
|
||||
function_name: str
|
||||
arguments: str
|
||||
@@ -1,24 +0,0 @@
|
||||
from typing import List
|
||||
from dailyai.pipeline.frames import EndFrame, EndPipeFrame
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
|
||||
|
||||
class SequentialMergePipeline(Pipeline):
|
||||
"""This class merges the sink queues from a list of pipelines. Frames from
|
||||
each pipeline's sink are merged in the order of pipelines in the list."""
|
||||
|
||||
def __init__(self, pipelines: List[Pipeline]):
|
||||
super().__init__([])
|
||||
self.pipelines = pipelines
|
||||
|
||||
async def run_pipeline(self):
|
||||
for idx, pipeline in enumerate(self.pipelines):
|
||||
while True:
|
||||
frame = await pipeline.sink.get()
|
||||
if isinstance(
|
||||
frame, EndFrame) or isinstance(
|
||||
frame, EndPipeFrame):
|
||||
break
|
||||
await self.sink.put(frame)
|
||||
|
||||
await self.sink.put(EndFrame())
|
||||
@@ -1,115 +0,0 @@
|
||||
from typing import AsyncGenerator, Callable
|
||||
from dailyai.pipeline.frame_processor import FrameProcessor
|
||||
from dailyai.pipeline.frames import (
|
||||
Frame,
|
||||
LLMResponseEndFrame,
|
||||
LLMResponseStartFrame,
|
||||
TextFrame,
|
||||
TranscriptionFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
)
|
||||
from dailyai.pipeline.openai_frames import OpenAILLMContextFrame
|
||||
from dailyai.services.openai_llm_context import OpenAILLMContext
|
||||
|
||||
try:
|
||||
from openai.types.chat import ChatCompletionRole
|
||||
except ModuleNotFoundError as e:
|
||||
print(f"Exception: {e}")
|
||||
print(
|
||||
"In order to use OpenAI, you need to `pip install dailyai[openai]`. Also, set `OPENAI_API_KEY` environment variable.")
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
class OpenAIContextAggregator(FrameProcessor):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
context: OpenAILLMContext,
|
||||
aggregator: Callable[[Frame, str | None], str | None],
|
||||
role: ChatCompletionRole,
|
||||
start_frame: type,
|
||||
end_frame: type,
|
||||
accumulator_frame: type,
|
||||
pass_through=True,
|
||||
):
|
||||
if not (
|
||||
issubclass(start_frame, Frame)
|
||||
and issubclass(end_frame, Frame)
|
||||
and issubclass(accumulator_frame, Frame)
|
||||
):
|
||||
raise TypeError(
|
||||
"start_frame, end_frame and accumulator_frame must be instances of Frame"
|
||||
)
|
||||
|
||||
self._context: OpenAILLMContext = context
|
||||
self._aggregator: Callable[[Frame, str | None], None] = aggregator
|
||||
self._role: ChatCompletionRole = role
|
||||
self._start_frame = start_frame
|
||||
self._end_frame = end_frame
|
||||
self._accumulator_frame = accumulator_frame
|
||||
self._pass_through = pass_through
|
||||
|
||||
self._aggregating = False
|
||||
self._aggregation = None
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, self._start_frame):
|
||||
self._aggregating = True
|
||||
elif isinstance(frame, self._end_frame):
|
||||
self._aggregating = False
|
||||
if self._aggregation:
|
||||
self._context.add_message(
|
||||
{
|
||||
"role": self._role,
|
||||
"content": self._aggregation,
|
||||
"name": self._role,
|
||||
} # type: ignore
|
||||
)
|
||||
self._aggregation = None
|
||||
yield OpenAILLMContextFrame(self._context)
|
||||
elif isinstance(frame, self._accumulator_frame) and self._aggregating:
|
||||
self._aggregation = self._aggregator(frame, self._aggregation)
|
||||
if self._pass_through:
|
||||
yield frame
|
||||
else:
|
||||
yield frame
|
||||
|
||||
def string_aggregator(
|
||||
self,
|
||||
frame: Frame,
|
||||
aggregation: str | None) -> str | None:
|
||||
if not isinstance(frame, TextFrame):
|
||||
raise TypeError(
|
||||
"Frame must be a TextFrame instance to be aggregated by a string aggregator."
|
||||
)
|
||||
if not aggregation:
|
||||
aggregation = ""
|
||||
return " ".join([aggregation, frame.text])
|
||||
|
||||
|
||||
class OpenAIUserContextAggregator(OpenAIContextAggregator):
|
||||
def __init__(self, context: OpenAILLMContext):
|
||||
super().__init__(
|
||||
context=context,
|
||||
aggregator=self.string_aggregator,
|
||||
role="user",
|
||||
start_frame=UserStartedSpeakingFrame,
|
||||
end_frame=UserStoppedSpeakingFrame,
|
||||
accumulator_frame=TranscriptionFrame,
|
||||
pass_through=False,
|
||||
)
|
||||
|
||||
|
||||
class OpenAIAssistantContextAggregator(OpenAIContextAggregator):
|
||||
|
||||
def __init__(self, context: OpenAILLMContext):
|
||||
super().__init__(
|
||||
context,
|
||||
aggregator=self.string_aggregator,
|
||||
role="assistant",
|
||||
start_frame=LLMResponseStartFrame,
|
||||
end_frame=LLMResponseEndFrame,
|
||||
accumulator_frame=TextFrame,
|
||||
pass_through=True,
|
||||
)
|
||||
@@ -1,12 +0,0 @@
|
||||
from dataclasses import dataclass
|
||||
|
||||
from dailyai.pipeline.frames import Frame
|
||||
from dailyai.services.openai_llm_context import OpenAILLMContext
|
||||
|
||||
|
||||
@dataclass()
|
||||
class OpenAILLMContextFrame(Frame):
|
||||
"""Like an LLMMessagesFrame, but with extra context specific to the
|
||||
OpenAI API. The context in this message is also mutable, and will be
|
||||
changed by the OpenAIContextAggregator frame processor."""
|
||||
context: OpenAILLMContext
|
||||
@@ -1,149 +0,0 @@
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import AsyncGenerator, AsyncIterable, Iterable, List
|
||||
from dailyai.pipeline.frame_processor import FrameProcessor
|
||||
|
||||
from dailyai.pipeline.frames import AudioFrame, EndPipeFrame, EndFrame, Frame
|
||||
|
||||
|
||||
class Pipeline:
|
||||
"""
|
||||
This class manages a pipe of FrameProcessors, and runs them in sequence. The "source"
|
||||
and "sink" queues are managed by the caller. You can use this class stand-alone to
|
||||
perform specialized processing, or you can use the Transport's run_pipeline method to
|
||||
instantiate and run a pipeline with the Transport's sink and source queues.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
processors: List[FrameProcessor],
|
||||
source: asyncio.Queue | None = None,
|
||||
sink: asyncio.Queue[Frame] | None = None,
|
||||
name: str | None = None,
|
||||
):
|
||||
"""Create a new pipeline. By default we create the sink and source queues
|
||||
if they're not provided, but these can be overridden to point to other
|
||||
queues. If this pipeline is run by a transport, its sink and source queues
|
||||
will be overridden.
|
||||
"""
|
||||
self._processors: List[FrameProcessor] = processors
|
||||
|
||||
self.source: asyncio.Queue[Frame] = source or asyncio.Queue()
|
||||
self.sink: asyncio.Queue[Frame] = sink or asyncio.Queue()
|
||||
|
||||
self._logger = logging.getLogger("dailyai.pipeline")
|
||||
self._last_log_line = ""
|
||||
self._shown_repeated_log = False
|
||||
self._name = name or str(id(self))
|
||||
|
||||
def set_source(self, source: asyncio.Queue[Frame]):
|
||||
"""Set the source queue for this pipeline. Frames from this queue
|
||||
will be processed by each frame_processor in the pipeline, or order
|
||||
from first to last."""
|
||||
self.source = source
|
||||
|
||||
def set_sink(self, sink: asyncio.Queue[Frame]):
|
||||
"""Set the sink queue for this pipeline. After the last frame_processor
|
||||
has processed a frame, its output will be placed on this queue."""
|
||||
self.sink = sink
|
||||
|
||||
def add_processor(self, processor: FrameProcessor):
|
||||
self._processors.append(processor)
|
||||
|
||||
async def get_next_source_frame(self) -> AsyncGenerator[Frame, None]:
|
||||
"""Convenience function to get the next frame from the source queue. This
|
||||
lets us consistently have an AsyncGenerator yield frames, from either the
|
||||
source queue or a frame_processor."""
|
||||
|
||||
yield await self.source.get()
|
||||
|
||||
async def queue_frames(
|
||||
self,
|
||||
frames: Iterable[Frame] | AsyncIterable[Frame],
|
||||
) -> None:
|
||||
"""Insert frames directly into a pipeline. This is typically used inside a transport
|
||||
participant_joined callback to prompt a bot to start a conversation, for example."""
|
||||
|
||||
if isinstance(frames, AsyncIterable):
|
||||
async for frame in frames:
|
||||
await self.source.put(frame)
|
||||
elif isinstance(frames, Iterable):
|
||||
for frame in frames:
|
||||
await self.source.put(frame)
|
||||
else:
|
||||
raise Exception("Frames must be an iterable or async iterable")
|
||||
|
||||
async def run_pipeline(self):
|
||||
"""Run the pipeline. Take each frame from the source queue, pass it to
|
||||
the first frame_processor, pass the output of that frame_processor to the
|
||||
next in the list, etc. until the last frame_processor has processed the
|
||||
resulting frames, then place those frames in the sink queue.
|
||||
|
||||
The source and sink queues must be set before calling this method.
|
||||
|
||||
This method will exit when an EndFrame is placed on the sink queue.
|
||||
No more frames will be placed on the sink queue after an EndFrame, even
|
||||
if it's not the last frame yielded by the last frame_processor in the pipeline..
|
||||
"""
|
||||
|
||||
try:
|
||||
while True:
|
||||
initial_frame = await self.source.get()
|
||||
async for frame in self._run_pipeline_recursively(
|
||||
initial_frame, self._processors
|
||||
):
|
||||
self._log_frame(frame, len(self._processors) + 1)
|
||||
await self.sink.put(frame)
|
||||
|
||||
if isinstance(initial_frame, EndFrame) or isinstance(
|
||||
initial_frame, EndPipeFrame
|
||||
):
|
||||
break
|
||||
except asyncio.CancelledError:
|
||||
# this means there's been an interruption, do any cleanup necessary
|
||||
# here.
|
||||
for processor in self._processors:
|
||||
await processor.interrupted()
|
||||
|
||||
async def _run_pipeline_recursively(
|
||||
self, initial_frame: Frame, processors: List[FrameProcessor], depth=1
|
||||
) -> AsyncGenerator[Frame, None]:
|
||||
"""Internal function to add frames to the pipeline as they're yielded
|
||||
by each processor."""
|
||||
if processors:
|
||||
self._log_frame(initial_frame, depth)
|
||||
async for frame in processors[0].process_frame(initial_frame):
|
||||
async for final_frame in self._run_pipeline_recursively(
|
||||
frame, processors[1:], depth + 1
|
||||
):
|
||||
yield final_frame
|
||||
else:
|
||||
yield initial_frame
|
||||
|
||||
def _log_frame(self, frame: Frame, depth: int):
|
||||
"""Log a frame as it moves through the pipeline. This is useful for debugging.
|
||||
Note that this function inherits the logging level from the "dailyai" logger.
|
||||
If you want debug output from dailyai in general but not this function (it is
|
||||
noisy) you can silence this function by doing something like this:
|
||||
|
||||
# enable debug logging for the dailyai package.
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
# silence the pipeline logging
|
||||
logger = logging.getLogger("dailyai.pipeline")
|
||||
logger.setLevel(logging.WARNING)
|
||||
"""
|
||||
source = str(self._processors[depth - 2]) if depth > 1 else "source"
|
||||
dest = str(self._processors[depth - 1]) if depth < (len(self._processors) + 1) else "sink"
|
||||
prefix = self._name + " " * depth
|
||||
logline = prefix + " -> ".join([source, frame.__class__.__name__, dest])
|
||||
if logline == self._last_log_line:
|
||||
if self._shown_repeated_log:
|
||||
return
|
||||
self._shown_repeated_log = True
|
||||
self._logger.debug(prefix + "... repeated")
|
||||
else:
|
||||
self._shown_repeated_log = False
|
||||
self._last_log_line = logline
|
||||
self._logger.debug(logline)
|
||||
@@ -1,32 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
||||
# source: frames.proto
|
||||
# Protobuf Python Version: 4.25.3
|
||||
"""Generated protocol buffer code."""
|
||||
from google.protobuf import descriptor as _descriptor
|
||||
from google.protobuf import descriptor_pool as _descriptor_pool
|
||||
from google.protobuf import symbol_database as _symbol_database
|
||||
from google.protobuf.internal import builder as _builder
|
||||
# @@protoc_insertion_point(imports)
|
||||
|
||||
_sym_db = _symbol_database.Default()
|
||||
|
||||
|
||||
|
||||
|
||||
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0c\x66rames.proto\x12\rdailyai_proto\"\x19\n\tTextFrame\x12\x0c\n\x04text\x18\x01 \x01(\t\"\x1a\n\nAudioFrame\x12\x0c\n\x04\x64\x61ta\x18\x01 \x01(\x0c\"L\n\x12TranscriptionFrame\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x15\n\rparticipantId\x18\x02 \x01(\t\x12\x11\n\ttimestamp\x18\x03 \x01(\t\"\xa2\x01\n\x05\x46rame\x12(\n\x04text\x18\x01 \x01(\x0b\x32\x18.dailyai_proto.TextFrameH\x00\x12*\n\x05\x61udio\x18\x02 \x01(\x0b\x32\x19.dailyai_proto.AudioFrameH\x00\x12:\n\rtranscription\x18\x03 \x01(\x0b\x32!.dailyai_proto.TranscriptionFrameH\x00\x42\x07\n\x05\x66rameb\x06proto3')
|
||||
|
||||
_globals = globals()
|
||||
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
|
||||
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'frames_pb2', _globals)
|
||||
if _descriptor._USE_C_DESCRIPTORS == False:
|
||||
DESCRIPTOR._options = None
|
||||
_globals['_TEXTFRAME']._serialized_start=31
|
||||
_globals['_TEXTFRAME']._serialized_end=56
|
||||
_globals['_AUDIOFRAME']._serialized_start=58
|
||||
_globals['_AUDIOFRAME']._serialized_end=84
|
||||
_globals['_TRANSCRIPTIONFRAME']._serialized_start=86
|
||||
_globals['_TRANSCRIPTIONFRAME']._serialized_end=162
|
||||
_globals['_FRAME']._serialized_start=165
|
||||
_globals['_FRAME']._serialized_end=327
|
||||
# @@protoc_insertion_point(module_scope)
|
||||
98
src/dailyai/queue_aggregators.py
Normal file
@@ -0,0 +1,98 @@
|
||||
import asyncio
|
||||
|
||||
from dailyai.queue_frame import LLMMessagesQueueFrame, QueueFrame, TextQueueFrame, TranscriptionQueueFrame
|
||||
from dailyai.services.ai_services import AIService
|
||||
|
||||
from typing import AsyncGenerator, List
|
||||
|
||||
|
||||
class QueueTee:
|
||||
async def run_to_queue_and_generate(
|
||||
self,
|
||||
output_queue: asyncio.Queue,
|
||||
generator: AsyncGenerator[QueueFrame, None]
|
||||
) -> AsyncGenerator[QueueFrame, None]:
|
||||
async for frame in generator:
|
||||
await output_queue.put(frame)
|
||||
yield frame
|
||||
|
||||
async def run_to_queues(
|
||||
self,
|
||||
output_queues: List[asyncio.Queue],
|
||||
generator: AsyncGenerator[QueueFrame, None]
|
||||
):
|
||||
async for frame in generator:
|
||||
for queue in output_queues:
|
||||
await queue.put(frame)
|
||||
|
||||
|
||||
class LLMContextAggregator(AIService):
|
||||
def __init__(
|
||||
self,
|
||||
messages: list[dict],
|
||||
role: str,
|
||||
bot_participant_id=None,
|
||||
complete_sentences=True,
|
||||
pass_through=True):
|
||||
super().__init__()
|
||||
self.messages = messages
|
||||
self.bot_participant_id = bot_participant_id
|
||||
self.role = role
|
||||
self.sentence = ""
|
||||
self.complete_sentences = complete_sentences
|
||||
self.pass_through = pass_through
|
||||
|
||||
async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
|
||||
# We don't do anything with non-text frames, pass it along to next in the pipeline.
|
||||
if not isinstance(frame, TextQueueFrame):
|
||||
yield frame
|
||||
return
|
||||
|
||||
# Ignore transcription frames from the bot
|
||||
if isinstance(frame, TranscriptionQueueFrame):
|
||||
if frame.participantId == self.bot_participant_id:
|
||||
return
|
||||
|
||||
# The common case for "pass through" is receiving frames from the LLM that we'll
|
||||
# use to update the "assistant" LLM messages, but also passing the text frames
|
||||
# along to a TTS service to be spoken to the user.
|
||||
if self.pass_through:
|
||||
yield frame
|
||||
|
||||
# TODO: split up transcription by participant
|
||||
if self.complete_sentences:
|
||||
# type: ignore -- the linter thinks this isn't a TextQueueFrame, even
|
||||
# though we check it above
|
||||
self.sentence += frame.text
|
||||
if self.sentence.endswith((".", "?", "!")):
|
||||
self.messages.append({"role": self.role, "content": self.sentence})
|
||||
self.sentence = ""
|
||||
yield LLMMessagesQueueFrame(self.messages)
|
||||
else:
|
||||
# type: ignore -- the linter thinks this isn't a TextQueueFrame, even
|
||||
# though we check it above
|
||||
self.messages.append({"role": self.role, "content": frame.text})
|
||||
yield LLMMessagesQueueFrame(self.messages)
|
||||
|
||||
async def finalize(self) -> AsyncGenerator[QueueFrame, None]:
|
||||
# Send any dangling words that weren't finished with punctuation.
|
||||
if self.complete_sentences and self.sentence:
|
||||
self.messages.append({"role": self.role, "content": self.sentence})
|
||||
yield LLMMessagesQueueFrame(self.messages)
|
||||
|
||||
|
||||
class LLMUserContextAggregator(LLMContextAggregator):
|
||||
def __init__(self,
|
||||
messages: list[dict],
|
||||
bot_participant_id=None,
|
||||
complete_sentences=True):
|
||||
super().__init__(messages, "user", bot_participant_id, complete_sentences, pass_through=False)
|
||||
|
||||
|
||||
class LLMAssistantContextAggregator(LLMContextAggregator):
|
||||
def __init__(
|
||||
self, messages: list[dict], bot_participant_id=None, complete_sentences=True
|
||||
):
|
||||
super().__init__(
|
||||
messages, "assistant", bot_participant_id, complete_sentences, pass_through=True
|
||||
)
|
||||