Compare commits

...

96 Commits

Author SHA1 Message Date
Aleix Conchillo Flaqué
b254525d3c go back to using @dataclass since they can be inspected 2024-05-12 22:35:43 -07:00
Aleix Conchillo Flaqué
6c06fb8169 README: update pypi badge 2024-05-12 19:28:00 -07:00
Aleix Conchillo Flaqué
721cd11d62 Merge pull request #133 from pipecat-ai/aleix/readme
rebased jpt/readme branch
2024-05-13 10:26:45 +08:00
Aleix Conchillo Flaqué
bfbcb9d531 fix autopep8 linting 2024-05-12 19:25:17 -07:00
Aleix Conchillo Flaqué
724e78c5be renamed image.png to pipecat.png 2024-05-12 17:44:10 -07:00
Jon Taylor
d3c3d78855 added discord badge 2024-05-12 17:41:36 -07:00
Jon Taylor
8fa9fdcd5a Reworked readme to have more pipes and cats 2024-05-12 17:41:30 -07:00
Aleix Conchillo Flaqué
7856d20a38 Merge pull request #132 from pipecat-ai/pypi-repo-change
change pypi repo to pipecat-ai
2024-05-13 03:14:40 +08:00
Aleix Conchillo Flaqué
6d10027f2d change pypi repo to pipecat-ai 2024-05-12 12:08:43 -07:00
Aleix Conchillo Flaqué
bea31215dc Merge pull request #129 from daily-co/wip-proposal
pipecat proposal
2024-05-13 01:13:18 +08:00
Aleix Conchillo Flaqué
083480ca1e update macos-py3.10-requirements.txt 2024-05-12 10:10:35 -07:00
Aleix Conchillo Flaqué
65846330cf update linux-py3.10-requirements.txt 2024-05-12 10:09:04 -07:00
Aleix Conchillo Flaqué
29f48266f7 README: install dev-requirements.txt first 2024-05-12 10:07:54 -07:00
Aleix Conchillo Flaqué
bfd583211c examples: use LocalAudioTransport 2024-05-12 10:07:54 -07:00
Aleix Conchillo Flaqué
b026915d19 initial commit for new pipecat architecture 2024-05-12 10:07:25 -07:00
Aleix Conchillo Flaqué
4a0836dc8f Merge pull request #130 from daily-co/dependabot-05-06-24
dependabot: update packages 05-06-24
2024-05-07 08:14:38 +08:00
Aleix Conchillo Flaqué
2729c6bf5b dependabot: update packages 05-06-24 2024-05-06 15:33:33 -07:00
Aleix Conchillo Flaqué
712a889121 Merge pull request #128 from daily-co/pillow-security-fixes
pyproject: pillow security fixes
2024-04-23 01:51:49 +08:00
Aleix Conchillo Flaqué
2f341e4fb0 pyproject: pillow security fixes 2024-04-22 10:28:42 -07:00
Kwindla Hultman Kramer
24198ecf45 Merge pull request #126 from daily-co/jptaylor-patch-3
Update README.md
2024-04-12 23:10:30 -07:00
Jon Taylor
7e4fefe958 Update README.md 2024-04-12 22:45:30 -07:00
Jon Taylor
e9af39b85f Merge pull request #125 from daily-co/jptaylor-patch-2
Update README.md
2024-04-12 22:44:14 -07:00
Jon Taylor
38aa3cebb4 Update README.md 2024-04-12 22:42:11 -07:00
Jon Taylor
72724365a0 Merge pull request #124 from daily-co/jptaylor-patch-1
Update README.md
2024-04-12 22:40:29 -07:00
Jon Taylor
5368462e41 Update README.md 2024-04-12 22:28:40 -07:00
Jon Taylor
1b2b29dd18 Merge pull request #123 from daily-co/jpt/pypi-badge
added pypi badge
2024-04-12 07:33:26 -07:00
Kwindla Hultman Kramer
d2b2b6f619 Merge pull request #122 from daily-co/kwindla-patch-1
Update README.md
2024-04-11 21:34:37 -07:00
Jon Taylor
54bcb52129 added pypi badge 2024-04-11 21:34:27 -07:00
Kwindla Hultman Kramer
3dc7438bc8 Update README.md 2024-04-11 21:05:27 -07:00
Aleix Conchillo Flaqué
523bb9f2a2 Merge pull request #120 from daily-co/small-fireworks-fixes
minor fireworks updates
2024-04-12 06:35:57 +08:00
Aleix Conchillo Flaqué
0c2b3f8b65 minor fireworks updates 2024-04-11 15:34:23 -07:00
chadbailey59
0b7578056d added fireworks adapter (#118) 2024-04-11 17:15:02 -05:00
Aleix Conchillo Flaqué
f1b6b9f8e5 Merge pull request #119 from daily-co/use-new-fal-client-library
services: FalImageGenService now uses fal-client library
2024-04-12 05:59:58 +08:00
Aleix Conchillo Flaqué
cbc51babbe services: use asyncio to_thread in moondreamservice 2024-04-11 14:22:44 -07:00
Aleix Conchillo Flaqué
b0faafc184 update macos-py3.10 requirements 2024-04-11 14:16:19 -07:00
Aleix Conchillo Flaqué
103092dbb2 update linux-py3.10 requirements 2024-04-11 14:13:59 -07:00
Aleix Conchillo Flaqué
7b49c9ade3 services: FalImageGenService now uses fal-client library 2024-04-11 14:09:01 -07:00
Aleix Conchillo Flaqué
1e83a405c0 Merge pull request #117 from daily-co/llm-use-aggregator-pass-through-fix
aggregators: fix LLMUserResponseAggregator passs-through
2024-04-12 04:24:56 +08:00
Aleix Conchillo Flaqué
7336866a1c examples: rely on new daily default transcription settings 2024-04-11 11:22:58 -07:00
Aleix Conchillo Flaqué
0f23282e30 transport: enable interim results in daily transport 2024-04-11 11:22:05 -07:00
Aleix Conchillo Flaqué
eb3bf117b1 use InterimTranscriptionFrame in LLMUserResponseAggregator 2024-04-11 11:21:42 -07:00
Aleix Conchillo Flaqué
e288aa047b examples: use LLMUserResponseAggregator with VAD 2024-04-11 08:10:56 -07:00
Aleix Conchillo Flaqué
9a9df35d7b aggregators: allow TranscriptionFrame after an end frame threshold 2024-04-10 23:35:31 -07:00
Aleix Conchillo Flaqué
af8663e95d aggregators: fix LLMUserResponseAggregator passs-through 2024-04-10 21:46:16 -07:00
Aleix Conchillo Flaqué
db05a9b29b Merge pull request #116 from daily-co/moondream-use-cpu
moondream: allow passing use_cpu
2024-04-11 09:08:11 +08:00
Aleix Conchillo Flaqué
130e418800 moondream: allow passing use_cpu 2024-04-10 17:43:44 -07:00
Aleix Conchillo Flaqué
1a0a66e503 Merge pull request #114 from daily-co/jpt/fal-updates
Updated Fal.ai service to take a params model and allow for model string param
2024-04-11 00:47:33 +08:00
Aleix Conchillo Flaqué
e22babbae2 examples: update with new FalImageGenService parameters 2024-04-10 09:45:08 -07:00
Aleix Conchillo Flaqué
bfe2e0f36e services: don't use image_size in ImageGenService 2024-04-10 09:44:42 -07:00
Aleix Conchillo Flaqué
26d401e5de Merge pull request #115 from daily-co/add-vision-and-moondream-service
add vision and moondream service
2024-04-11 00:22:26 +08:00
Aleix Conchillo Flaqué
3c20f9153d added VisionImageFrame and VisionImageFrameAggregator 2024-04-10 09:19:34 -07:00
Aleix Conchillo Flaqué
2f9899af5a update macos-py3.10 requirements 2024-04-09 22:39:04 -07:00
Aleix Conchillo Flaqué
5ef5cf30f4 update linux-py3.10 requirements 2024-04-09 22:36:35 -07:00
Aleix Conchillo Flaqué
34a6c5691b examples: added 12-describe-video 2024-04-09 22:36:35 -07:00
Aleix Conchillo Flaqué
18bf09c704 services: added MoondreamService 2024-04-09 22:36:35 -07:00
Aleix Conchillo Flaqué
84cfa7cc95 services: added VisionService 2024-04-09 22:36:35 -07:00
Aleix Conchillo Flaqué
a5eba0106b transport: allow requesting a user frame 2024-04-09 22:36:35 -07:00
Aleix Conchillo Flaqué
b117a185e3 frames: added UserImageRequestFrame 2024-04-09 22:14:54 -07:00
Aleix Conchillo Flaqué
0219230827 Merge pull request #113 from daily-co/aleix/only-subcribe-to-participant
only subcribe to participant
2024-04-10 10:47:29 +08:00
Aleix Conchillo Flaqué
9fcbb36997 examples: add 14a-local-render-remote-participant 2024-04-09 19:46:10 -07:00
Aleix Conchillo Flaqué
0bf15fd6eb daily: only subscribe to participant video source 2024-04-09 19:46:10 -07:00
Aleix Conchillo Flaqué
989252bb52 daily: always check camera/mic/speaker enabled 2024-04-09 19:46:10 -07:00
Jon Taylor
7b44a79a5b added params and model attribute to fal service 2024-04-09 17:43:27 -07:00
Aleix Conchillo Flaqué
4bd29b0080 Merge pull request #110 from daily-co/compatible-versions
pyproject: use compatible version
2024-04-10 00:41:22 +08:00
Aleix Conchillo Flaqué
ebb76fdae9 update macos-py3.10 requirements 2024-04-09 08:52:37 -07:00
Aleix Conchillo Flaqué
5d52def0fe update linux-py3.10 requirements 2024-04-09 08:49:41 -07:00
Aleix Conchillo Flaqué
9ada56d0b0 pyproject: use compatible version 2024-04-09 08:41:54 -07:00
Aleix Conchillo Flaqué
8d73cdb2ee Merge pull request #111 from daily-co/user-transcription-aggregator
pipeline: add UserTranscriptionAggregator
2024-04-09 23:34:52 +08:00
Aleix Conchillo Flaqué
4f04b10202 Merge pull request #112 from daily-co/user-image-frame
user image frames and other updates
2024-04-09 23:34:32 +08:00
Aleix Conchillo Flaqué
97b923e37e llm user and assistant aggregator renames 2024-04-09 08:31:48 -07:00
Aleix Conchillo Flaqué
57aabea0a3 examples: added 14-render-remote-participant 2024-04-09 08:01:14 -07:00
Aleix Conchillo Flaqué
319b8e7816 updated ImageFrame and added URLImageFrame and UserImageFrame 2024-04-08 23:23:33 -07:00
Aleix Conchillo Flaqué
96950ca6df daily: on_first_other_participant_joined now gets the participant 2024-04-08 23:23:33 -07:00
Aleix Conchillo Flaqué
d7b2e67c35 pipeline: add UserTranscriptionAggregator 2024-04-08 17:15:14 -07:00
Aleix Conchillo Flaqué
53930b47a5 github: just some rewording 2024-04-06 18:03:53 -07:00
Aleix Conchillo Flaqué
86c8ab02cc github: also publish stables releases to test pypi 2024-04-06 17:58:13 -07:00
Aleix Conchillo Flaqué
b678097f6d Merge pull request #109 from daily-co/only-use-fps
transport: only use fps to set maxFramerate
2024-04-07 07:02:44 +08:00
Aleix Conchillo Flaqué
eb455043c4 transport: use camera_bitrate and camera_framerate 2024-04-06 12:27:05 -07:00
Aleix Conchillo Flaqué
dd696be04c Merge pull request #108 from daily-co/add-camera-max-framerate
transport: add camera_max_framerate argument
2024-04-06 11:18:42 +08:00
Aleix Conchillo Flaqué
96b2337183 transport: add camera_max_framerate argument 2024-04-05 20:16:03 -07:00
Aleix Conchillo Flaqué
ea52e73f57 Merge pull request #107 from daily-co/increase-max-framerate
transport: increase daily maxFramerate to 30
2024-04-06 11:08:21 +08:00
Aleix Conchillo Flaqué
88404e4739 Merge pull request #106 from daily-co/updated-to-be-updated-examples
examples: updated to_be_updated examples
2024-04-06 11:06:30 +08:00
Aleix Conchillo Flaqué
0fd323714e transport: add camera_max_bitrate argument 2024-04-05 20:05:58 -07:00
Aleix Conchillo Flaqué
a362ca4d3d transport: increase daily maxFramerate to 30 2024-04-05 19:44:25 -07:00
Aleix Conchillo Flaqué
02b5c3dd5f update dot-env.template 2024-04-05 16:16:56 -07:00
Aleix Conchillo Flaqué
497a09cbc8 examples: updated to_be_updated examples 2024-04-05 16:01:23 -07:00
Aleix Conchillo Flaqué
172a14245d Merge pull request #104 from daily-co/threaded-transport-allow-sink-override
examples: fix whisper examples
2024-04-06 04:46:12 +08:00
Aleix Conchillo Flaqué
302246399b Merge pull request #105 from daily-co/local-tranport-read-audio-frames
transports: fix local transport read_audio_frames
2024-04-06 04:44:37 +08:00
Aleix Conchillo Flaqué
9590cc2fbc examples: fix whisper examples 2024-04-05 13:43:51 -07:00
Aleix Conchillo Flaqué
09e4044c72 transports: fix local transport read_audio_frames 2024-04-05 13:34:01 -07:00
Aleix Conchillo Flaqué
efdfb74dc3 github: increase fetch-depth to 100 for test publish 2024-04-05 08:32:29 -07:00
Aleix Conchillo Flaqué
158de6f20b github: fetch-tags and increase fetch-depth for test publish 2024-04-05 08:25:37 -07:00
Aleix Conchillo Flaqué
47f68b742d pyproject: user proper environment for test pypi 2024-04-05 08:02:45 -07:00
Aleix Conchillo Flaqué
2654ca1f62 pyproject: don't use local version for test pypi 2024-04-05 07:51:52 -07:00
Aleix Conchillo Flaqué
4263827ee8 README: use double-quotes with optional dependencies 2024-04-04 17:47:16 -07:00
Aleix Conchillo Flaqué
97fe529b0e github: update test publish workflow 2024-04-04 17:41:31 -07:00
133 changed files with 5567 additions and 3946 deletions

View File

@@ -46,7 +46,7 @@ jobs:
needs: [ build ]
environment:
name: pypi
url: https://pypi.org/p/dailyai
url: https://pypi.org/p/pipecat-ai
permissions:
id-token: write
steps:
@@ -60,3 +60,25 @@ jobs:
with:
verbose: true
print-hash: true
publish-to-test-pypi:
name: "Publish to Test PyPI"
runs-on: ubuntu-latest
needs: [ build ]
environment:
name: testpypi
url: https://pypi.org/p/pipecat-ai
permissions:
id-token: write
steps:
- name: Download wheels
uses: actions/download-artifact@v4
with:
name: wheels
path: ./dist
- name: Publish to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
verbose: true
print-hash: true
repository-url: https://test.pypi.org/legacy/

View File

@@ -15,6 +15,8 @@ jobs:
uses: actions/checkout@v4
with:
ref: ${{ github.event.inputs.gitref }}
fetch-tags: true
fetch-depth: 100
- name: Set up Python
id: setup_python
uses: actions/setup-python@v4
@@ -35,21 +37,23 @@ jobs:
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels
path: ./dist
publish-to-pypi:
name: "Test publish to PyPI"
name: "Publish to Test PyPI"
runs-on: ubuntu-latest
needs: [ build ]
environment:
name: pypi
url: https://pypi.org/p/dailyai
name: testpypi
url: https://pypi.org/p/pipecat-ai
permissions:
id-token: write
steps:
- name: Download wheels
uses: actions/download-artifact@v4
with:
name: wheels
path: ./dist
- name: Publish to PyPI
uses: pypa/gh-action-pypi-publish@release/v1

1
.gitignore vendored
View File

@@ -3,6 +3,7 @@ env/
__pycache__/
*~
venv
.venv
#*#
# Distribution / packaging

155
README.md
View File

@@ -1,77 +1,130 @@
# dailyai — an open source framework for real-time, multi-modal, conversational AI applications
<div align="center">
 <img alt="pipecat" width="300px" height="auto" src="pipecat.png">
</div>
# Pipecat
[![PyPI](https://img.shields.io/pypi/v/pipecat-ai)](https://pypi.org/project/pipecat-ai) [![Discord](https://img.shields.io/discord/1239284677165056021)](https://discord.gg/pipecat)
`pipecat` is a framework for building voice (and multimodal) conversational agents. Things like personal coaches, meeting assistants, story-telling toys for kids, customer support bots, and snarky social companions.
Build things like this:
[![AI-powered voice patient intake for healthcare](https://img.youtube.com/vi/lDevgsp9vn0/0.jpg)](https://www.youtube.com/watch?v=lDevgsp9vn0)
**`dailyai` started as a toolkit for implementing generative AI voice bots.** Things like personal coaches, meeting assistants, story-telling toys for kids, customer support bots, and snarky social companions.
## Getting started with voice agents
In 2023 a *lot* of us got excited about the possibility of having open-ended conversations with LLMs. It became clear pretty quickly that we were all solving the same [low-level problems](https://www.daily.co/blog/how-to-talk-to-an-llm-with-your-voice/):
- low-latency, reliable audio transport
- echo cancellation
- phrase endpointing (knowing when the bot should respond to human speech)
- interruptibility
- writing clean code to stream data through "pipelines" of speech-to-text, LLM inference, and text-to-speech models
You can get started with Pipecat running on your local machine, then move your agent processes to the cloud when you're ready. You can also add a telephone number, image output, video input, use different LLMs, and more.
As our applications expanded to include additional things like image generation, function calling, and vision models, we started to think about what a complete framework for these kinds of apps could look like.
Today, `dailyai` is:
1. a set of code building blocks for interacting with generative AI services and creating low-latency, interruptible data pipelines that use multiple services
2. transport services that moves audio, video, and events across the Internet
3. implementations of specific generative AI services
Currently implemented services:
- Speech-to-text
- Deepgram
- Whisper
- LLMs
- Azure
- OpenAI
- Image generation
- Azure
- Fal
- OpenAI
- Text-to-speech
- Azure
- Deepgram
- ElevenLabs
- Transport
- Daily
- Local (in progress, intended as a quick start example service)
If you'd like to [implement a service]((https://github.com/daily-co/daily-ai-sdk/tree/main/src/dailyai/services)), we welcome PRs! Our goal is to support lots of services in all of the above categories, plus new categories (like real-time video) as they emerge.
## Getting started
Today, the easiest way to get started with `dailyai` is to use [Daily](https://www.daily.co/) as your transport service. This toolkit started life as an internal SDK at Daily and millions of minutes of AI conversation have been served using it and its earlier prototype incarnations. (The [transport base class](https://github.com/daily-co/daily-ai-sdk/blob/main/src/dailyai/transports/abstract_transport.py) is easy to extend, though, so feel free to submit PRs if you'd like to implement another transport service.)
```
```shell
# install the module
pip install dailyai
pip install pipecat-ai
# set up an .env file with API keys
cp dot-env.template .env
```
By default, in order to minimize dependencies, only the basic framework functionality is available. Some third-party AI services require additional
dependencies that you can install with:
By default, in order to minimize dependencies, only the basic framework functionality is available. Some third-party AI services require additional dependencies that you can install with:
```
pip install dailyai[option,...]
```shell
pip install "pipecat-ai[option,...]"
```
Your project may or may not need these, so they're made available as optional requirements. Here is a list:
- **AI services**: `anthropic`, `azure`, `fal`, `openai`, `playht`, `silero`, `whisper`
- **AI services**: `anthropic`, `azure`, `fal`, `moondream`, `openai`, `playht`, `silero`, `whisper`
- **Transports**: `daily`, `local`, `websocket`
## A simple voice agent running locally
If you're doing AI-related stuff, you probably have an OpenAI API key.
To generate voice output, one service that's easy to get started with is ElevenLabs. If you don't already have an ElevenLabs developer account, you can sign up for one [here].
So let's run a really simple agent that's just a GPT-4 prompt, wired up to voice input and speaker output.
You can change the prompt in the code. The current prompt is “Tell me something interesting about the Roman Empire.”
`cd examples/getting-started` to run the following examples …
```shell
# Talk to a local pipecat process with your voice. Specify GPT-4 as the LLM.
export OPENAI_API_KEY=...
export ELEVENLABS_API_KEY=...
python ./local-mic.py | ./pipecat-pipes-gpt-4.py | ./local-speaker.py
```
## WebSockets instead of pipes
To run your agent in the cloud, you can switch the Pipecat transport layer to use a WebSocket instead of Unix pipes.
```shell
# Talk to a local pipecat process with your voice. Specify GPT-4 as the LLM.
export OPENAI_API_KEY=...
export ELEVENLABS_API_KEY=...
python ./local-mic-and-speaker-wss.py wss://localhost:8088
```
## WebRTC for production use
WebSockets are fine for server-to-server communication or for initial development. But for production use, you'll need client-server audio to use a protocol designed for real-time media transport. (For an explanation of the difference between WebSockets and WebRTC, see [this post.])
One way to get up and running quickly with WebRTC is to sign up for a Daily developer account. Daily gives you SDKs and global infrastructure for audio (and video) routing. Every account gets 10,000 audio/video/transcription minutes free each month.
Sign up [here](https://dashboard.daily.co/u/signup) and [create a room](https://docs.daily.co/reference/rest-api/rooms) in the developer Dashboard. Then run the examples, this time connecting via WebRTC instead of a WebSocket.
```shell
# 1. Run the pipecat process. Provide your Daily API key and a Daily room
export DAILY_API_KEY=...
export OPENAI_API_KEY=...
export ELEVENLABS_API_KEY=...
python pipecat-daily-gpt-4.py --daily-room https://example.daily.co/pipecat
# 2. Visit the Daily room link in any web browser to talk to the pipecat process.
# You'll want to use a Daily SDK to embed the client-side code into your own
# app. But visiting the room URL in a browser is a quick way to start building
# agents because you can focus on just the agent code at first.
open -a "Google Chrome" https://example.daily.co/pipecat
```
## Deploy your agent to the cloud
Now that you've decoupled client and server, and have a Pipecat process that can run anywhere you can run Python, you can deploy this example agent to the cloud.
`TBC`
## Taking it further
### Add a telephone number
Daily supports telephone connections in addition to WebRTC streams. You can add a telephone number to your Daily room with the following REST API call. Once you've done that, you can call your agent on the phone.
You'll need to add a credit card to your Daily account to enable telephone numbers.
`TBC`
### Add image output
Daily supports telephone connections in addition to WebRTC streams. You can add a telephone number to your Daily room with the following REST API call. Once you've done that, you can call your agent on the phone.
You'll need to add a credit card to your Daily account to enable telephone numbers.
`TBC`
### Add video output
`TBC`
## Code examples
There are two directories of examples:
- [foundational](https://github.com/daily-co/daily-ai-sdk/tree/main/examples/foundational) — demos that build on each other, introducing one or two concepts at a time
- [starter apps](https://github.com/daily-co/daily-ai-sdk/tree/main/examples/starter-apps) — complete applications that you can use as starting points for development
- [foundational](https://github.com/daily-co/pipecat/tree/main/examples/foundational) — examples that build on each other, introducing one or two concepts at a time
- [starter apps](https://github.com/daily-co/pipecat/tree/main/examples/starter-apps) — complete applications that you can use as starting points for development
Before running the examples you need to install the dependencies (which will install all the dependencies to run all of the examples):
@@ -97,7 +150,7 @@ source venv/bin/activate
From the root of this repo, run the following:
```
pip install -r {env}-requirements.txt -r dev-requirements.txt
pip install -r dev-requirements.txt -r {env}-requirements.txt
python -m build
```

View File

@@ -1,6 +1,6 @@
autopep8==2.0.4
build==1.0.3
pip-tools==7.4.1
pytest==8.1.1
setuptools==69.2.0
setuptools_scm==8.0.4
autopep8~=2.1.0
build~=1.2.1
pip-tools~=7.4.1
pytest~=8.2.0
setuptools~=69.5.1
setuptools_scm~=8.1.0

View File

@@ -1,8 +1,8 @@
# Daily AI SDK Docs
# Pipecat Docs
## [Architecture Overview](architecture.md)
Learn about the thinking behind the SDK's design.
Learn about the thinking behind the framework's design.
## [A Frame's Progress](frame-progress.md)
@@ -10,7 +10,7 @@ See how a Frame is processed through a Transport, a Pipeline, and a series of Fr
## [Example Code](examples/)
The repo includes several example apps in the `examples` directory. The docs explain how they work.
The repository includes several example apps in the `examples` directory. The docs explain how they work.
## [API Reference](api/)

View File

@@ -1,4 +1,4 @@
# Daily AI SDK Architecture Guide
# Pipecat architecture guide
## Frames
@@ -10,8 +10,8 @@ Frame processors operate on frames. Every frame processor implements a `process_
## Pipelines
Pipelines are lists of frame processors that read from a source queue and send the processed frames to a sink queue. A very simple pipeline might chain an LLM frame processor to a text-to-speech frame processor, with a transport's send queue as its sync. Placing LLM message frames on the pipeline's source queue will cause the LLM's response to be spoken. See example #2 for an implementation of this.
Pipelines are lists of frame processors linked together. Frame processors can push frames upstream or downstream to their peers. A very simple pipeline might chain an LLM frame processor to a text-to-speech frame processor, with a transport as an output.
## Transports
Transports provide a receive queue, which is input from "the outside world", and a sink queue, which is data that will be sent "to the outside world". The `LocalTransportService` does this with the local camera, mic, display and speaker. The `DailyTransportService` does this with a WebRTC session joined to a Daily.co room.
Transports provide input and output frame processors to receive or send frames respectively. For example, the `DailyTransport` does this with a WebRTC session joined to a Daily.co room.

View File

@@ -2,8 +2,16 @@
ANTHROPIC_API_KEY=...
# Azure
SPEECH_KEY=...
SPEECH_REGION=...
AZURE_SPEECH_REGION=...
AZURE_SPEECH_API_KEY=...
AZURE_CHATGPT_API_KEY=...
AZURE_CHATGPT_ENDPOINT=https://...
AZURE_CHATGPT_MODEL=...
AZURE_DALLE_API_KEY=...
AZURE_DALLE_ENDPOINT=https://...
AZURE_DALLE_MODEL=...
# Daily
DAILY_API_KEY=...
@@ -14,8 +22,10 @@ ELEVENLABS_API_KEY=...
ELEVENLABS_VOICE_ID=...
# Fal
FAL_KEY_ID=...
FAL_KEY_SECRET=...
FAL_KEY=...
# Fireworks
FIREWORKS_API_KEY=...
# PlayHT
PLAY_HT_USER_ID=...

View File

@@ -1,31 +1,36 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import aiohttp
import logging
import os
from dailyai.pipeline.frames import EndFrame, TextFrame
from dailyai.pipeline.pipeline import Pipeline
import sys
from dailyai.transports.daily_transport import DailyTransport
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from pipecat.frames.frames import EndFrame, TextFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.task import PipelineTask
from pipecat.pipeline.runner import PipelineRunner
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url):
async with aiohttp.ClientSession() as session:
transport = DailyTransport(
room_url,
None,
"Say One Thing",
mic_enabled=True,
)
room_url, None, "Say One Thing", DailyParams(audio_out_enabled=True))
tts = ElevenLabsTTSService(
aiohttp_session=session,
@@ -33,21 +38,18 @@ async def main(room_url):
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
)
pipeline = Pipeline([tts])
runner = PipelineRunner()
task = PipelineTask(Pipeline([tts, transport.output()]))
# Register an event handler so we can play the audio when the
# participant joins.
@transport.event_handler("on_participant_joined")
async def on_participant_joined(transport, participant):
if participant["info"]["isLocal"]:
return
async def on_new_participant_joined(transport, participant):
participant_name = participant["info"]["userName"] or ''
await pipeline.queue_frames([TextFrame("Hello there, " + participant_name + "!"), EndFrame()])
await transport.run(pipeline)
del tts
await task.queue_frames([TextFrame(f"Hello there, {participant_name}!"), EndFrame()])
await runner.run(task)
if __name__ == "__main__":
(url, token) = configure()

View File

@@ -0,0 +1,53 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import aiohttp
import os
import sys
from pipecat.frames.frames import EndFrame, TextFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.transports.base_transport import TransportParams
from pipecat.transports.local.audio import LocalAudioTransport
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main():
async with aiohttp.ClientSession() as session:
transport = LocalAudioTransport(TransportParams(audio_out_enabled=True))
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
)
pipeline = Pipeline([tts, transport.output()])
task = PipelineTask(pipeline)
async def say_something():
await asyncio.sleep(1)
await task.queue_frames([TextFrame("Hello there!"), EndFrame()])
runner = PipelineRunner()
await asyncio.gather(runner.run(task), say_something())
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,38 +0,0 @@
import asyncio
import aiohttp
import logging
import os
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.transports.local_transport import LocalTransport
from dotenv import load_dotenv
load_dotenv(override=True)
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
async def main():
async with aiohttp.ClientSession() as session:
meeting_duration_minutes = 1
transport = LocalTransport(
duration_minutes=meeting_duration_minutes, mic_enabled=True
)
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
)
async def say_something():
await asyncio.sleep(1)
await transport.say("Hello there.", tts)
await transport.stop_when_done()
await asyncio.gather(transport.run(), say_something())
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,23 +1,31 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import os
import logging
import aiohttp
import os
import sys
from dailyai.pipeline.frames import EndFrame, LLMMessagesFrame
from dailyai.pipeline.pipeline import Pipeline
from dailyai.transports.daily_transport import DailyTransport
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.open_ai_services import OpenAILLMService
from pipecat.frames.frames import EndFrame, LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url):
@@ -26,8 +34,7 @@ async def main(room_url):
room_url,
None,
"Say One Thing From an LLM",
mic_enabled=True,
)
DailyParams(audio_out_enabled=True))
tts = ElevenLabsTTSService(
aiohttp_session=session,
@@ -45,13 +52,15 @@ async def main(room_url):
"content": "You are an LLM in a WebRTC session, and this is a 'hello world' demo. Say hello to the world.",
}]
pipeline = Pipeline([llm, tts])
runner = PipelineRunner()
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
await pipeline.queue_frames([LLMMessagesFrame(messages), EndFrame()])
task = PipelineTask(Pipeline([llm, tts, transport.output()]))
await transport.run(pipeline)
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await task.queue_frames([LLMMessagesFrame(messages), EndFrame()])
await runner.run(task)
if __name__ == "__main__":

View File

@@ -1,21 +1,30 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import aiohttp
import logging
import os
import sys
from dailyai.pipeline.frames import TextFrame
from dailyai.pipeline.pipeline import Pipeline
from dailyai.transports.daily_transport import DailyTransport
from dailyai.services.fal_ai_services import FalImageGenService
from pipecat.frames.frames import TextFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.services.fal import FalImageGenService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url):
@@ -24,32 +33,34 @@ async def main(room_url):
room_url,
None,
"Show a still frame image",
camera_enabled=True,
camera_width=1024,
camera_height=1024,
duration_minutes=1
DailyParams(
camera_out_enabled=True,
camera_out_width=1024,
camera_out_height=1024
)
)
imagegen = FalImageGenService(
image_size="square_hd",
params=FalImageGenService.InputParams(
image_size="square_hd"
),
aiohttp_session=session,
key_id=os.getenv("FAL_KEY_ID"),
key_secret=os.getenv("FAL_KEY_SECRET"),
key=os.getenv("FAL_KEY"),
)
pipeline = Pipeline([imagegen])
runner = PipelineRunner()
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
task = PipelineTask(Pipeline([imagegen, transport.output()]))
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
# Note that we do not put an EndFrame() item in the pipeline for this demo.
# This means that the bot will stay in the channel until it times out.
# An EndFrame() in the pipeline would cause the transport to shut
# down.
await pipeline.queue_frames(
[TextFrame("a cat in the style of picasso")]
)
await task.queue_frames([TextFrame("a cat in the style of picasso")])
await transport.run(pipeline)
await runner.run(task)
if __name__ == "__main__":

View File

@@ -1,57 +0,0 @@
import asyncio
import aiohttp
import logging
import os
import tkinter as tk
from dailyai.pipeline.frames import TextFrame, EndFrame
from dailyai.pipeline.pipeline import Pipeline
from dailyai.services.fal_ai_services import FalImageGenService
from dailyai.transports.local_transport import LocalTransport
from dotenv import load_dotenv
load_dotenv(override=True)
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
async def main():
async with aiohttp.ClientSession() as session:
meeting_duration_minutes = 2
tk_root = tk.Tk()
tk_root.title("dailyai")
transport = LocalTransport(
tk_root=tk_root,
mic_enabled=False,
camera_enabled=True,
camera_width=1024,
camera_height=1024,
duration_minutes=meeting_duration_minutes,
)
imagegen = FalImageGenService(
image_size="square_hd",
aiohttp_session=session,
key_id=os.getenv("FAL_KEY_ID"),
key_secret=os.getenv("FAL_KEY_SECRET"),
)
pipeline = Pipeline([imagegen])
await pipeline.queue_frames([TextFrame("a cat in the style of picasso")])
async def run_tk():
while not transport._stop_threads.is_set():
tk_root.update()
tk_root.update_idletasks()
await asyncio.sleep(0.1)
await asyncio.gather(transport.run(pipeline, override_pipeline_source_queue=False), run_tk())
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,68 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import aiohttp
import os
import sys
import tkinter as tk
from pipecat.frames.frames import TextFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.services.fal import FalImageGenService
from pipecat.transports.base_transport import TransportParams
from pipecat.transports.local.tk import TkLocalTransport
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main():
    """Send one text prompt through the Fal image service to a local Tk window.

    The prompt is queued on the pipeline task before the runner starts. The
    runner and a Tk event pump are then awaited together.
    """
    async with aiohttp.ClientSession() as session:
        window = tk.Tk()
        window.title("Picasso Cat")

        # Camera output only, at 1024x1024.
        video_params = TransportParams(
            camera_out_enabled=True,
            camera_out_width=1024,
            camera_out_height=1024,
        )
        transport = TkLocalTransport(window, video_params)

        image_params = FalImageGenService.InputParams(image_size="square_hd")
        imagegen = FalImageGenService(
            params=image_params,
            aiohttp_session=session,
            key=os.getenv("FAL_KEY"),
        )

        # Image service feeds straight into the transport's output.
        task = PipelineTask(Pipeline([imagegen, transport.output()]))
        await task.queue_frames([TextFrame("a cat in the style of picasso")])

        runner = PipelineRunner()

        async def pump_tk_events():
            # Tk has no asyncio integration here, so refresh the window by
            # hand every 100 ms for as long as the runner reports active.
            while runner.is_active():
                window.update()
                window.update_idletasks()
                await asyncio.sleep(0.1)

        await asyncio.gather(runner.run(task), pump_tk_events())
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,37 +1,40 @@
import asyncio
import logging
import os
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import aiohttp
from dailyai.pipeline.merge_pipeline import SequentialMergePipeline
from dailyai.pipeline.pipeline import Pipeline
import asyncio
import os
import sys
from dailyai.transports.daily_transport import DailyTransport
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
from dailyai.services.deepgram_ai_services import DeepgramTTSService
from dailyai.pipeline.frames import EndPipeFrame, LLMMessagesFrame, TextFrame
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from pipecat.pipeline.merge_pipeline import SequentialMergePipeline
from pipecat.pipeline.pipeline import Pipeline
from pipecat.frames.frames import EndPipeFrame, LLMMessagesFrame, TextFrame
from pipecat.pipeline.task import PipelineTask
from pipecat.services.azure import AzureLLMService, AzureTTSService
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.transport_services import TransportServiceOutput
from pipecat.services.transports.daily_transport import DailyTransport
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url: str):
async with aiohttp.ClientSession() as session:
transport = DailyTransport(
room_url,
None,
"Static And Dynamic Speech",
duration_minutes=1,
mic_enabled=True,
mic_sample_rate=16000,
)
transport = DailyTransport(room_url, None, "Static And Dynamic Speech")
meeting = TransportServiceOutput(transport, mic_enabled=True)
llm = AzureLLMService(
api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
@@ -43,10 +46,6 @@ async def main(room_url: str):
region=os.getenv("AZURE_SPEECH_REGION"),
)
deepgram_tts = DeepgramTTSService(
aiohttp_session=session,
api_key=os.getenv("DEEPGRAM_API_KEY"),
)
elevenlabs_tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
@@ -56,11 +55,13 @@ async def main(room_url: str):
messages = [{"role": "system",
"content": "tell the user a joke about llamas"}]
# Start a task to run the LLM to create a joke, and convert the LLM output to audio frames. This task
# will run in parallel with generating and speaking the audio for static text, so there's no delay to
# speak the LLM response.
# Start a task to run the LLM to create a joke, and convert the LLM
# output to audio frames. This task will run in parallel with generating
# and speaking the audio for static text, so there's no delay to speak
# the LLM response.
llm_pipeline = Pipeline([llm, elevenlabs_tts])
await llm_pipeline.queue_frames([LLMMessagesFrame(messages), EndPipeFrame()])
llm_task = PipelineTask(llm_pipeline)
await llm_task.queue_frames([LLMMessagesFrame(messages), EndPipeFrame()])
simple_tts_pipeline = Pipeline([azure_tts])
await simple_tts_pipeline.queue_frames(

View File

@@ -1,64 +1,74 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import aiohttp
import os
import logging
import sys
from dataclasses import dataclass
from typing import AsyncGenerator
from dailyai.pipeline.aggregators import (
GatedAggregator,
LLMFullResponseAggregator,
ParallelPipeline,
SentenceAggregator,
)
from dailyai.pipeline.frames import (
from pipecat.frames.frames import (
AppFrame,
Frame,
ImageRawFrame,
TextFrame,
EndFrame,
ImageFrame,
LLMMessagesFrame,
LLMResponseStartFrame,
)
from dailyai.pipeline.frame_processor import FrameProcessor
from dailyai.pipeline.pipeline import Pipeline
from dailyai.transports.daily_transport import DailyTransport
from dailyai.services.open_ai_services import OpenAILLMService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.fal_ai_services import FalImageGenService
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.processors.aggregators.gated import GatedAggregator
from pipecat.processors.aggregators.llm_response import LLMFullResponseAggregator
from pipecat.processors.aggregators.sentence import SentenceAggregator
from pipecat.processors.aggregators.parallel_task import ParallelTask
from pipecat.services.openai import OpenAILLMService
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.fal import FalImageGenService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
@dataclass
class MonthFrame(Frame):
class MonthFrame(AppFrame):
month: str
def __str__(self):
return f"{self.name}(month: {self.month})"
class MonthPrepender(FrameProcessor):
def __init__(self):
super().__init__()
self.most_recent_month = "Placeholder, month frame not yet received"
self.prepend_to_next_text_frame = False
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
async def process_frame(self, frame: Frame, direction: FrameDirection):
if isinstance(frame, MonthFrame):
self.most_recent_month = frame.month
elif self.prepend_to_next_text_frame and isinstance(frame, TextFrame):
yield TextFrame(f"{self.most_recent_month}: {frame.text}")
await self.push_frame(TextFrame(f"{self.most_recent_month}: {frame.text}"))
self.prepend_to_next_text_frame = False
elif isinstance(frame, LLMResponseStartFrame):
self.prepend_to_next_text_frame = True
yield frame
await self.push_frame(frame)
else:
yield frame
await self.push_frame(frame, direction)
async def main(room_url):
@@ -67,11 +77,12 @@ async def main(room_url):
room_url,
None,
"Month Narration Bot",
mic_enabled=True,
camera_enabled=True,
mic_sample_rate=16000,
camera_width=1024,
camera_height=1024,
DailyParams(
audio_out_enabled=True,
camera_out_enabled=True,
camera_out_width=1024,
camera_out_height=1024
)
)
tts = ElevenLabsTTSService(
@@ -85,31 +96,33 @@ async def main(room_url):
model="gpt-4-turbo-preview")
imagegen = FalImageGenService(
image_size="square_hd",
params=FalImageGenService.InputParams(
image_size="square_hd"
),
aiohttp_session=session,
key_id=os.getenv("FAL_KEY_ID"),
key_secret=os.getenv("FAL_KEY_SECRET"),
key=os.getenv("FAL_KEY"),
)
gated_aggregator = GatedAggregator(
gate_open_fn=lambda frame: isinstance(
frame, ImageFrame), gate_close_fn=lambda frame: isinstance(
frame, LLMResponseStartFrame), start_open=False, )
gate_open_fn=lambda frame: isinstance(frame, ImageRawFrame),
gate_close_fn=lambda frame: isinstance(frame, LLMResponseStartFrame),
start_open=False
)
sentence_aggregator = SentenceAggregator()
month_prepender = MonthPrepender()
llm_full_response_aggregator = LLMFullResponseAggregator()
pipeline = Pipeline(
processors=[
llm,
sentence_aggregator,
ParallelPipeline(
[[month_prepender, tts], [llm_full_response_aggregator, imagegen]]
),
gated_aggregator,
],
)
pipeline = Pipeline([
llm,
sentence_aggregator,
ParallelTask(
[month_prepender, tts],
[llm_full_response_aggregator, imagegen]
),
gated_aggregator,
transport.output()
])
frames = []
for month in [
@@ -132,13 +145,18 @@ async def main(room_url):
"content": f"Describe a nature photograph suitable for use in a calendar, for the month of {month}. Include only the image description with no preamble. Limit the description to one sentence, please.",
}
]
frames.append(MonthFrame(month))
frames.append(MonthFrame(month=month))
frames.append(LLMMessagesFrame(messages))
frames.append(EndFrame())
await pipeline.queue_frames(frames)
await transport.run(pipeline, override_pipeline_source_queue=False)
runner = PipelineRunner()
task = PipelineTask(pipeline)
await task.queue_frames(frames)
await runner.run(task)
if __name__ == "__main__":

View File

@@ -0,0 +1,164 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import aiohttp
import asyncio
import os
import sys
import tkinter as tk
from pipecat.frames.frames import AudioRawFrame, Frame, URLImageRawFrame, LLMMessagesFrame, TextFrame
from pipecat.pipeline.parallel_pipeline import ParallelPipeline
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.aggregators.llm_response import LLMFullResponseAggregator
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.openai import OpenAILLMService
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.fal import FalImageGenService
from pipecat.transports.base_transport import TransportParams
from pipecat.transports.local.tk import TkLocalTransport
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main():
    """Build a narrated, illustrated calendar and play it in a local Tk window.

    One pipeline per month is run concurrently to produce a description, its
    spoken audio and an image. As each month finishes, its image and audio
    frames are queued on a second pipeline whose only stage is the Tk
    transport output.
    """
    async with aiohttp.ClientSession() as session:
        tk_root = tk.Tk()
        tk_root.title("Calendar")

        runner = PipelineRunner()

        async def get_month_data(month):
            """Run the LLM -> (TTS | image) pipeline for one month.

            Returns a dict with keys "month", "text", "image" and "audio".
            "image" and "audio" are None when the pipeline produced no frame
            of that kind.
            """
            messages = [{"role": "system", "content": f"Describe a nature photograph suitable for use in a calendar, for the month of {month}. Include only the image description with no preamble. Limit the description to one sentence, please.", }]

            class ImageDescription(FrameProcessor):
                """Remember the most recent text frame and pass every frame on."""

                def __init__(self):
                    super().__init__()
                    self.text = ""

                async def process_frame(self, frame: Frame, direction: FrameDirection):
                    if isinstance(frame, TextFrame):
                        self.text = frame.text
                    await self.push_frame(frame, direction)

            class AudioGrabber(FrameProcessor):
                """Accumulate all audio into a single frame; frames are not forwarded."""

                def __init__(self):
                    super().__init__()
                    self.audio = bytearray()
                    # Initialised so that reading `.frame` is safe even when
                    # no audio frame was ever received.
                    self.frame = None

                async def process_frame(self, frame: Frame, direction: FrameDirection):
                    if isinstance(frame, AudioRawFrame):
                        self.audio.extend(frame.audio)
                        self.frame = AudioRawFrame(
                            bytes(self.audio), frame.sample_rate, frame.num_channels)

            class ImageGrabber(FrameProcessor):
                """Keep the last URL image frame seen; frames are not forwarded."""

                def __init__(self):
                    super().__init__()
                    self.frame = None

                async def process_frame(self, frame: Frame, direction: FrameDirection):
                    if isinstance(frame, URLImageRawFrame):
                        self.frame = frame

            llm = OpenAILLMService(
                api_key=os.getenv("OPENAI_API_KEY"),
                model="gpt-4-turbo-preview")

            tts = ElevenLabsTTSService(
                aiohttp_session=session,
                api_key=os.getenv("ELEVENLABS_API_KEY"),
                voice_id=os.getenv("ELEVENLABS_VOICE_ID"))

            imagegen = FalImageGenService(
                params=FalImageGenService.InputParams(
                    image_size="square_hd"
                ),
                aiohttp_session=session,
                key=os.getenv("FAL_KEY"))

            aggregator = LLMFullResponseAggregator()

            description = ImageDescription()

            audio_grabber = AudioGrabber()

            image_grabber = ImageGrabber()

            pipeline = Pipeline([llm, aggregator, description,
                                 ParallelPipeline([tts, audio_grabber],
                                                  [imagegen, image_grabber])])

            task = PipelineTask(pipeline)
            await task.queue_frame(LLMMessagesFrame(messages))
            await task.stop_when_done()

            await runner.run(task)

            return {
                "month": month,
                "text": description.text,
                "image": image_grabber.frame,
                "audio": audio_grabber.frame,
            }

        transport = TkLocalTransport(
            tk_root,
            TransportParams(
                audio_out_enabled=True,
                camera_out_enabled=True,
                camera_out_width=1024,
                camera_out_height=1024))

        pipeline = Pipeline([transport.output()])

        task = PipelineTask(pipeline)

        # Only a few months are enabled: all month tasks are created at once
        # and we might get rate limited otherwise.
        months: list[str] = [
            "January",
            "February",
            # "March",
            # "April",
            # "May",
        ]

        # We create one task per month. These will be executed concurrently.
        month_tasks = [asyncio.create_task(get_month_data(month)) for month in months]

        # Now we wait for each month task in the order they're completed. The
        # benefit is we'll have as little delay as possible before the first
        # month, and likely no delay between months, but the months won't
        # display in order.
        async def show_images(month_tasks):
            for month_data_task in asyncio.as_completed(month_tasks):
                data = await month_data_task
                # Skip whatever a month failed to produce instead of queueing
                # None into the output pipeline.
                frames = [f for f in (data["image"], data["audio"]) if f is not None]
                if frames:
                    await task.queue_frames(frames)

            await runner.stop_when_done()

        async def run_tk():
            # NOTE(review): this loop has no exit condition of its own, so the
            # gather below only ends through cancellation or an exception.
            while True:
                tk_root.update()
                tk_root.update_idletasks()
                await asyncio.sleep(0.1)

        await asyncio.gather(runner.run(task), show_images(month_tasks), run_tk())
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,26 +1,37 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import aiohttp
import logging
import os
from dailyai.pipeline.frames import LLMMessagesFrame
from dailyai.pipeline.pipeline import Pipeline
import sys
from dailyai.transports.daily_transport import DailyTransport
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.open_ai_services import OpenAILLMService
from dailyai.services.ai_services import FrameLogger
from dailyai.pipeline.aggregators import (
LLMAssistantContextAggregator,
LLMUserContextAggregator,
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.aggregators.llm_response import (
LLMAssistantResponseAggregator,
LLMUserResponseAggregator,
)
from pipecat.processors.logger import FrameLogger
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVAD
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url: str, token):
@@ -29,14 +40,15 @@ async def main(room_url: str, token):
room_url,
token,
"Respond bot",
duration_minutes=5,
start_transcription=True,
mic_enabled=True,
mic_sample_rate=16000,
camera_enabled=False,
vad_enabled=True,
DailyParams(
audio_in_enabled=True, # This is so Silero VAD can get audio data
audio_out_enabled=True,
transcription_enabled=True
)
)
vad = SileroVAD()
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
@@ -46,41 +58,35 @@ async def main(room_url: str, token):
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4-turbo-preview")
fl = FrameLogger("Inner")
fl2 = FrameLogger("Outer")
fl_in = FrameLogger("Inner")
fl_out = FrameLogger("Outer")
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so it should not contain special characters. Respond to what the user said in a creative and helpful way.",
},
]
tma_in = LLMUserResponseAggregator(messages)
tma_out = LLMAssistantResponseAggregator(messages)
tma_in = LLMUserContextAggregator(
messages, transport._my_participant_id)
tma_out = LLMAssistantContextAggregator(
messages, transport._my_participant_id
)
pipeline = Pipeline(
processors=[
fl,
tma_in,
llm,
fl2,
tts,
tma_out,
],
)
pipeline = Pipeline([fl_in, transport.input(), vad, tma_in, llm,
fl_out, tts, tma_out, transport.output()])
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
task = PipelineTask(pipeline)
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
messages.append(
{"role": "system", "content": "Please introduce yourself to the user."})
await pipeline.queue_frames([LLMMessagesFrame(messages)])
await task.queue_frames([LLMMessagesFrame(messages)])
transport.transcription_settings["extra"]["endpointing"] = True
transport.transcription_settings["extra"]["punctuate"] = True
await transport.run(pipeline)
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":

View File

@@ -0,0 +1,116 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import aiohttp
import os
import sys
from PIL import Image
from pipecat.frames.frames import ImageRawFrame, Frame, SystemFrame, TextFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.aggregators.llm_context import (
LLMAssistantContextAggregator,
LLMUserContextAggregator,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.openai import OpenAILLMService
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.transports.services.daily import DailyTransport
from pipecat.transports.services.daily import DailyParams
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
class ImageSyncAggregator(FrameProcessor):
    """Surround every non-system frame with a "speaking" and a "waiting" image.

    For each non-system frame this pushes the speaking image, then the frame
    itself, then the waiting image. System frames are pushed on unchanged.

    Args:
        speaking_path: Path of the image pushed before the frame.
        waiting_path: Path of the image pushed after the frame.
    """

    def __init__(self, speaking_path: str, waiting_path: str):
        super().__init__()
        (self._speaking_image_bytes,
         self._speaking_image_size,
         self._speaking_image_format) = self._load_image(speaking_path)
        (self._waiting_image_bytes,
         self._waiting_image_size,
         self._waiting_image_format) = self._load_image(waiting_path)

    @staticmethod
    def _load_image(path: str):
        """Return (raw bytes, size, format) of the image at path, closing the file."""
        with Image.open(path) as img:
            return img.tobytes(), img.size, img.format

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        if not isinstance(frame, SystemFrame):
            # Use each image's real dimensions rather than assuming 1024x1024,
            # so the declared size always matches the raw bytes.
            await self.push_frame(ImageRawFrame(
                image=self._speaking_image_bytes,
                size=self._speaking_image_size,
                format=self._speaking_image_format))
            await self.push_frame(frame)
            await self.push_frame(ImageRawFrame(
                image=self._waiting_image_bytes,
                size=self._waiting_image_size,
                format=self._waiting_image_format))
        else:
            await self.push_frame(frame)
async def main(room_url: str, token):
    """Run a Daily bot that answers speech and shows speaking/waiting images.

    Args:
        room_url: URL of the Daily room to join.
        token: Token passed to the Daily transport.
    """
    async with aiohttp.ClientSession() as session:
        transport = DailyTransport(
            room_url,
            token,
            "Respond bot",
            DailyParams(
                audio_out_enabled=True,
                # The pipeline pushes image frames to the transport output, so
                # camera output is enabled explicitly, as the other
                # image-producing examples do alongside the width/height.
                camera_out_enabled=True,
                camera_out_width=1024,
                camera_out_height=1024,
                transcription_enabled=True
            )
        )

        tts = ElevenLabsTTSService(
            aiohttp_session=session,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
        )

        llm = OpenAILLMService(
            api_key=os.getenv("OPENAI_API_KEY"),
            model="gpt-4-turbo-preview")

        messages = [
            {
                "role": "system",
                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so it should not contain special characters. Respond to what the user said in a creative and helpful way.",
            },
        ]

        # Both aggregators are given the same `messages` list.
        tma_in = LLMUserContextAggregator(messages)
        tma_out = LLMAssistantContextAggregator(messages)

        image_sync_aggregator = ImageSyncAggregator(
            os.path.join(os.path.dirname(__file__), "assets", "speaking.png"),
            os.path.join(os.path.dirname(__file__), "assets", "waiting.png"),
        )

        pipeline = Pipeline([transport.input(), image_sync_aggregator,
                             tma_in, llm, tma_out, tts, transport.output()])

        task = PipelineTask(pipeline)

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            # Fall back to an empty name when the participant has none set.
            participant_name = participant["info"]["userName"] or ''
            transport.capture_participant_transcription(participant["id"])
            await task.queue_frames([TextFrame(f"Hi, this is {participant_name}.")])

        runner = PipelineRunner()

        await runner.run(task)
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))

View File

@@ -2,16 +2,16 @@ import asyncio
import aiohttp
import logging
import os
from dailyai.pipeline.aggregators import (
LLMResponseAggregator,
UserResponseAggregator,
from pipecat.pipeline.aggregators import (
LLMAssistantResponseAggregator,
LLMUserResponseAggregator,
)
from dailyai.pipeline.pipeline import Pipeline
from dailyai.services.ai_services import FrameLogger
from dailyai.transports.daily_transport import DailyTransport
from dailyai.services.open_ai_services import OpenAILLMService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from pipecat.pipeline.pipeline import Pipeline
from pipecat.services.ai_services import FrameLogger
from pipecat.transports.daily_transport import DailyTransport
from pipecat.services.open_ai_services import OpenAILLMService
from pipecat.services.elevenlabs_ai_services import ElevenLabsTTSService
from runner import configure
@@ -19,7 +19,7 @@ from dotenv import load_dotenv
load_dotenv(override=True)
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger = logging.getLogger("pipecat")
logger.setLevel(logging.DEBUG)
@@ -50,7 +50,7 @@ async def main(room_url: str, token):
pipeline = Pipeline([FrameLogger(), llm, FrameLogger(), tts])
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
async def on_first_other_participant_joined(transport, participant):
await transport.say("Hi, I'm listening!", tts)
async def run_conversation():
@@ -63,11 +63,10 @@ async def main(room_url: str, token):
await transport.run_interruptible_pipeline(
pipeline,
post_processor=LLMResponseAggregator(messages),
pre_processor=UserResponseAggregator(messages),
post_processor=LLMAssistantResponseAggregator(messages),
pre_processor=LLMUserResponseAggregator(messages),
)
transport.transcription_settings["extra"]["punctuate"] = False
await asyncio.gather(transport.run(), run_conversation())

View File

@@ -3,14 +3,14 @@ import aiohttp
import asyncio
import logging
import os
from dailyai.pipeline.aggregators import SentenceAggregator
from dailyai.pipeline.pipeline import Pipeline
from pipecat.pipeline.aggregators import SentenceAggregator
from pipecat.pipeline.pipeline import Pipeline
from dailyai.transports.daily_transport import DailyTransport
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.fal_ai_services import FalImageGenService
from dailyai.pipeline.frames import AudioFrame, EndFrame, ImageFrame, LLMMessagesFrame, TextFrame
from pipecat.transports.daily_transport import DailyTransport
from pipecat.services.azure_ai_services import AzureLLMService, AzureTTSService
from pipecat.services.elevenlabs_ai_services import ElevenLabsTTSService
from pipecat.services.fal_ai_services import FalImageGenService
from pipecat.pipeline.frames import AudioFrame, EndFrame, ImageFrame, LLMMessagesFrame, TextFrame
from runner import configure
@@ -18,7 +18,7 @@ from dotenv import load_dotenv
load_dotenv(override=True)
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger = logging.getLogger("pipecat")
logger.setLevel(logging.DEBUG)
@@ -51,10 +51,11 @@ async def main(room_url: str):
voice_id="jBpfuIE2acCO8z3wKNLl",
)
dalle = FalImageGenService(
image_size="1024x1024",
params=FalImageGenService.InputParams(
image_size="1024x1024"
),
aiohttp_session=session,
key_id=os.getenv("FAL_KEY_ID"),
key_secret=os.getenv("FAL_KEY_SECRET"),
key=os.getenv("FAL_KEY"),
)
bot1_messages = [
@@ -91,7 +92,7 @@ async def main(room_url: str):
if isinstance(frame, TextFrame):
message += frame.text
elif isinstance(frame, AudioFrame):
all_audio.extend(frame.data)
all_audio.extend(frame.audio)
return (message, all_audio)
@@ -122,7 +123,7 @@ async def main(room_url: str):
)
await transport.send_queue.put(
[
ImageFrame(None, image_data1[1]),
ImageFrame(image_data1[1], image_data1[2]),
AudioFrame(audio1),
]
)
@@ -134,7 +135,7 @@ async def main(room_url: str):
)
await transport.send_queue.put(
[
ImageFrame(None, image_data2[1]),
ImageFrame(image_data2[1], image_data2[2]),
AudioFrame(audio2),
]
)

View File

@@ -0,0 +1,62 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import sys
from pipecat.frames.frames import AudioRawFrame, ImageRawFrame
from pipecat.processors.filter import Filter
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.pipeline.parallel_pipeline import ParallelPipeline
from pipecat.transports.services.daily import DailyTransport, DailyParams
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url, token):
    """Join a Daily room and send the transport's input back to its output.

    Args:
        room_url: URL of the Daily room to join.
        token: Token passed to the Daily transport.
    """
    params = DailyParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        camera_out_enabled=True,
        camera_out_width=1280,
        camera_out_height=720
    )
    transport = DailyTransport(room_url, token, "Test", params)

    @transport.event_handler("on_first_participant_joined")
    async def on_first_participant_joined(transport, participant):
        # Start capturing video from the first participant that joins.
        transport.capture_participant_video(participant["id"])

    # The ParallelPipeline is not really necessary here but it shows how you
    # would process audio and video concurrently in parallel pipelines.
    source = transport.input()
    audio_and_video = ParallelPipeline(
        [Filter([AudioRawFrame])],
        [Filter([ImageRawFrame])],
    )
    sink = transport.output()
    pipeline = Pipeline([source, audio_and_video, sink])

    runner = PipelineRunner()
    mirror_task = PipelineTask(pipeline)

    await runner.run(mirror_task)
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))

View File

@@ -0,0 +1,65 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import sys
import tkinter as tk
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.transports.base_transport import TransportParams
from pipecat.transports.local.tk import TkLocalTransport
from pipecat.transports.services.daily import DailyParams, DailyTransport
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url, token):
    """Feed a Daily room's input into a local Tk window's output.

    Args:
        room_url: URL of the Daily room to join.
        token: Token passed to the Daily transport.
    """
    window = tk.Tk()
    window.title("Local Mirror")

    daily_transport = DailyTransport(
        room_url,
        token,
        "Test",
        DailyParams(audio_in_enabled=True),
    )

    local_params = TransportParams(
        audio_out_enabled=True,
        camera_out_enabled=True,
        camera_out_width=1280,
        camera_out_height=720,
    )
    tk_transport = TkLocalTransport(window, local_params)

    @daily_transport.event_handler("on_first_participant_joined")
    async def on_first_participant_joined(transport, participant):
        # Start capturing video from the first participant that joins.
        transport.capture_participant_video(participant["id"])

    # Daily input goes straight to the Tk output, with no processing between.
    pipeline = Pipeline([daily_transport.input(), tk_transport.output()])

    runner = PipelineRunner()

    async def pump_tk_events():
        # Refresh the Tk window by hand every 100 ms while the runner
        # reports that it is active.
        while runner.is_active():
            window.update()
            window.update_idletasks()
            await asyncio.sleep(0.1)

    mirror_task = PipelineTask(pipeline)

    await asyncio.gather(runner.run(mirror_task), pump_tk_events())
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))

View File

@@ -0,0 +1,181 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import aiohttp
import os
import random
import sys
from PIL import Image
from pipecat.frames.frames import (
Frame,
SystemFrame,
TextFrame,
ImageRawFrame,
SpriteFrame,
TranscriptionFrame,
)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.aggregators.llm_context import (
LLMUserContextAggregator,
LLMAssistantContextAggregator,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.openai import OpenAILLMService
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)

# Remove the handler registered under id 0 and log to stderr at DEBUG instead.
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")

# Sprite frames keyed by their full file name (extension included).
sprites = {}

image_files = [
    "sc-default.png",
    "sc-talk.png",
    "sc-listen-1.png",
    "sc-think-1.png",
    "sc-think-2.png",
    "sc-think-3.png",
    "sc-think-4.png",
]

script_dir = os.path.dirname(__file__)

for file in image_files:
    # Images live in the "assets" directory next to this script.
    full_path = os.path.join(script_dir, "assets", file)
    # Read the pixel data while the file is open; the context manager closes it.
    with Image.open(full_path) as img:
        sprites[file] = ImageRawFrame(image=img.tobytes(), size=img.size, format=img.format)

# When the bot isn't talking, show a static image of the cat listening
quiet_frame = sprites["sc-listen-1.png"]

# When the bot is talking, build an animation from two sprites: 30 frames,
# each picked at random from the "default" and "talk" poses.
talking_list = [sprites["sc-default.png"], sprites["sc-talk.png"]]
talking = [random.choice(talking_list) for _ in range(30)]
talking_frame = SpriteFrame(talking)

# TODO: Support "thinking" as soon as we get a valid transcript, while LLM
# is processing
thinking_list = [
    sprites["sc-think-1.png"],
    sprites["sc-think-2.png"],
    sprites["sc-think-3.png"],
    sprites["sc-think-4.png"],
]
thinking_frame = SpriteFrame(thinking_list)
class NameCheckFilter(FrameProcessor):
    """Gate transcriptions on a wake name.

    Transcription text is accumulated until it ends with sentence punctuation.
    The finished sentence is pushed on as a TextFrame only if it contains one
    of the configured names; otherwise it is discarded. Frames that are not
    transcriptions are forwarded untouched.
    """

    def __init__(self, names: list[str]):
        super().__init__()
        self._names = names
        # Text collected so far for the sentence currently being spoken.
        self._sentence = ""

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        # System frames and anything that is not a transcription pass through.
        if isinstance(frame, SystemFrame) or not isinstance(frame, TranscriptionFrame):
            await self.push_frame(frame, direction)
            return

        # TODO: split up transcription by participant
        self._sentence += frame.text

        # Keep buffering until the sentence is terminated.
        if not self._sentence.endswith((".", "?", "!")):
            return

        if any(name in self._sentence for name in self._names):
            await self.push_frame(TextFrame(self._sentence))

        # Matched or not, the buffer starts over for the next sentence.
        self._sentence = ""
class ImageSyncAggregator(FrameProcessor):
    """Bracket each downstream frame with the talking and quiet sprites.

    For an ordinary downstream frame this pushes ``talking_frame``, the frame
    itself, then ``quiet_frame``. System frames and frames travelling in any
    other direction are forwarded unchanged in their original direction, so
    they are neither redirected downstream nor wrapped in sprite frames.
    """

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        # Control traffic and upstream frames must keep their direction.
        if isinstance(frame, SystemFrame) or direction != FrameDirection.DOWNSTREAM:
            await self.push_frame(frame, direction)
            return

        await self.push_frame(talking_frame)
        await self.push_frame(frame)
        await self.push_frame(quiet_frame)
async def main(room_url: str, token):
    """Run the Santa Cat bot in a Daily room.

    Wires transport input through the sprite aggregator, the wake-name filter,
    the LLM context aggregators, the LLM and TTS, and back out to the
    transport. The pipeline runs concurrently with a one-off send of the
    initial "listening" image.

    Args:
        room_url: URL of the Daily room to join.
        token: Token passed to the Daily transport.
    """
    async with aiohttp.ClientSession() as session:
        daily_params = DailyParams(
            audio_out_enabled=True,
            camera_out_enabled=True,
            camera_out_width=720,
            camera_out_height=1280,
            camera_out_framerate=10,
            transcription_enabled=True
        )
        transport = DailyTransport(room_url, token, "Santa Cat", daily_params)

        llm = OpenAILLMService(
            api_key=os.getenv("OPENAI_API_KEY"),
            model="gpt-4-turbo-preview",
        )

        tts = ElevenLabsTTSService(
            aiohttp_session=session,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            voice_id="jBpfuIE2acCO8z3wKNLl",
        )

        image_sync = ImageSyncAggregator()

        # Both context aggregators receive this same list object.
        system_prompt = {
            "role": "system",
            "content": "You are Santa Cat, a cat that lives in Santa's workshop at the North Pole. You should be clever, and a bit sarcastic. You should also tell jokes every once in a while. Your responses should only be a few sentences long.",
        }
        messages = [system_prompt]

        user_context = LLMUserContextAggregator(messages)
        assistant_context = LLMAssistantContextAggregator(messages)
        name_filter = NameCheckFilter(["Santa Cat", "Santa"])

        pipeline = Pipeline([
            transport.input(),
            image_sync,
            name_filter,
            user_context,
            llm,
            assistant_context,
            tts,
            transport.output(),
        ])

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            # Send some greeting at the beginning.
            await tts.say("Hi! If you want to talk to me, just say 'hey Santa Cat'.")
            transport.capture_participant_transcription(participant["id"])

        async def starting_image():
            # Show the listening cat once, alongside pipeline start-up.
            await transport.send_image(quiet_frame)

        runner = PipelineRunner()
        task = PipelineTask(pipeline)
        await asyncio.gather(runner.run(task), starting_image())
if __name__ == "__main__":
    # configure() returns a (url, token) pair that is handed straight to main().
    url, token = configure()
    asyncio.run(main(url, token))

View File

@@ -0,0 +1,132 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import aiohttp
import asyncio
import os
import sys
import wave
from pipecat.frames.frames import (
Frame,
AudioRawFrame,
LLMResponseEndFrame,
LLMMessagesFrame,
)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.aggregators.llm_context import (
LLMUserContextAggregator,
LLMAssistantContextAggregator,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.processors.logger import FrameLogger
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)

# Remove the handler registered under id 0 and log to stderr at DEBUG instead.
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")

# Sound-effect frames keyed by their full file name (extension included).
sounds = {}
sound_files = ["ding1.wav", "ding2.wav"]

script_dir = os.path.dirname(__file__)

for file in sound_files:
    # Sounds live in the "assets" directory next to this script.
    full_path = os.path.join(script_dir, "assets", file)
    # Read every audio frame of the WAV file, together with its frame rate
    # and channel count, into a single AudioRawFrame.
    with wave.open(full_path) as audio_file:
        sounds[file] = AudioRawFrame(audio_file.readframes(-1),
                                     audio_file.getframerate(), audio_file.getnchannels())
class OutboundSoundEffectWrapper(FrameProcessor):
    """Insert the "ding1" sound just before each LLMResponseEndFrame."""

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        if isinstance(frame, LLMResponseEndFrame):
            await self.push_frame(sounds["ding1.wav"])
        # Every frame, including the end marker itself, continues on in case
        # anything else downstream needs it.
        await self.push_frame(frame, direction)
class InboundSoundEffectWrapper(FrameProcessor):
    """Insert the "ding2" sound just before each LLMMessagesFrame."""

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        if isinstance(frame, LLMMessagesFrame):
            await self.push_frame(sounds["ding2.wav"])
        # Every frame, including the messages frame itself, continues on in
        # case anything else downstream needs it.
        await self.push_frame(frame, direction)
async def main(room_url: str, token):
    """Run the sound-effects "Respond bot" in a Daily room.

    Transcriptions flow through the user context aggregator, an inbound sound
    effect, the LLM, the assistant context aggregator, TTS and an outbound
    sound effect before reaching the transport output. Two frame loggers tap
    the stream on either side of the LLM.

    Args:
        room_url: URL of the Daily room to join.
        token: Token passed to the Daily transport.
    """
    async with aiohttp.ClientSession() as session:
        daily_params = DailyParams(audio_out_enabled=True, transcription_enabled=True)
        transport = DailyTransport(room_url, token, "Respond bot", daily_params)

        llm = OpenAILLMService(
            api_key=os.getenv("OPENAI_API_KEY"),
            model="gpt-4-turbo-preview",
        )

        tts = ElevenLabsTTSService(
            aiohttp_session=session,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            voice_id="ErXwobaYiN019PkySvjV",
        )

        # Both context aggregators receive this same list object.
        system_prompt = {
            "role": "system",
            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
        }
        messages = [system_prompt]

        user_context = LLMUserContextAggregator(messages)
        assistant_context = LLMAssistantContextAggregator(messages)
        outbound_ding = OutboundSoundEffectWrapper()
        inbound_ding = InboundSoundEffectWrapper()
        llm_out_logger = FrameLogger("LLM Out")
        transcription_logger = FrameLogger("Transcription In")

        pipeline = Pipeline([
            transport.input(),
            user_context,
            inbound_ding,
            transcription_logger,
            llm,
            assistant_context,
            llm_out_logger,
            tts,
            outbound_ding,
            transport.output(),
        ])

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            # Begin transcribing the newcomer, greet them, then play a ding.
            transport.capture_participant_transcription(participant["id"])
            await tts.say("Hi, I'm listening!")
            await transport.send_audio(sounds["ding1.wav"])

        runner = PipelineRunner()
        task = PipelineTask(pipeline)
        await runner.run(task)
if __name__ == "__main__":
    # configure() returns a (url, token) pair that is handed straight to main().
    url, token = configure()
    asyncio.run(main(url, token))

View File

@@ -0,0 +1,104 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import aiohttp
import os
import sys
from pipecat.frames.frames import Frame, TextFrame, UserImageRequestFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.aggregators.user_response import UserResponseAggregator
from pipecat.processors.aggregators.vision_image_frame import VisionImageFrameAggregator
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.moondream import MoondreamService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVAD
from runner import configure
from loguru import logger
from dotenv import load_dotenv
# Load settings from a .env file before anything reads the environment.
# NOTE(review): override=True presumably lets .env values replace variables
# that are already set - confirm against python-dotenv.
load_dotenv(override=True)
# Remove the handler registered under id 0 and log to stderr at DEBUG instead.
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
class UserImageRequester(FrameProcessor):
    """Ask for a participant image whenever a text frame passes through.

    For every TextFrame seen while a participant id is set, a
    UserImageRequestFrame for that participant is pushed upstream. The
    original frame is always forwarded in its own direction afterwards.
    """

    def __init__(self, participant_id: str | None = None):
        super().__init__()
        self._participant_id = participant_id

    def set_participant_id(self, participant_id: str):
        """Set the participant whose image will be requested."""
        self._participant_id = participant_id

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        target = self._participant_id
        if target and isinstance(frame, TextFrame):
            request = UserImageRequestFrame(target)
            await self.push_frame(request, FrameDirection.UPSTREAM)
        await self.push_frame(frame, direction)
async def main(room_url: str, token):
    """Run the "describe participant video" bot in a Daily room.

    The pipeline is: transport input -> Silero VAD -> user response
    aggregator -> image requester -> vision frame aggregator -> Moondream ->
    TTS -> transport output.

    Args:
        room_url: URL of the Daily room to join.
        token: Token passed to the Daily transport.
    """
    async with aiohttp.ClientSession() as session:
        transport = DailyTransport(
            room_url,
            token,
            "Describe participant video",
            DailyParams(
                audio_in_enabled=True,  # This is so Silero VAD can get audio data
                audio_out_enabled=True,
                transcription_enabled=True
            )
        )

        vad = SileroVAD()

        # The TTS service is built exactly once; the earlier code built a
        # second identical instance and discarded the first.
        tts = ElevenLabsTTSService(
            aiohttp_session=session,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
        )

        user_response = UserResponseAggregator()

        image_requester = UserImageRequester()

        vision_aggregator = VisionImageFrameAggregator()

        # If you run into weird description, try with use_cpu=True
        moondream = MoondreamService()

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            await tts.say("Hi there! Feel free to ask me what I see.")
            transport.capture_participant_video(participant["id"], framerate=0)
            transport.capture_participant_transcription(participant["id"])
            # From here on, text frames trigger image requests for this participant.
            image_requester.set_participant_id(participant["id"])

        pipeline = Pipeline([transport.input(), vad, user_response, image_requester,
                             vision_aggregator, moondream, tts, transport.output()])

        task = PipelineTask(pipeline)

        runner = PipelineRunner()

        await runner.run(task)
if __name__ == "__main__":
    # configure() returns a (url, token) pair that is handed straight to main().
    url, token = configure()
    asyncio.run(main(url, token))

View File

@@ -1,42 +1,53 @@
import asyncio
import logging
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
from dailyai.transports.daily_transport import DailyTransport
from dailyai.services.whisper_ai_services import WhisperSTTService
from dailyai.pipeline.pipeline import Pipeline
import asyncio
import sys
from pipecat.frames.frames import Frame, TranscriptionFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.whisper import WhisperSTTService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from runner import configure
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
class TranscriptionLogger(FrameProcessor):
    """Print the text of each TranscriptionFrame; nothing is pushed onward."""

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        # Anything that is not a transcription is ignored.
        if not isinstance(frame, TranscriptionFrame):
            return
        print(f"Transcription: {frame.text}")
async def main(room_url: str):
transport = DailyTransport(
room_url,
None,
"Transcription bot",
start_transcription=False,
mic_enabled=False,
camera_enabled=False,
speaker_enabled=True,
)
transport = DailyTransport(room_url, None, "Transcription bot",
DailyParams(audio_in_enabled=True))
stt = WhisperSTTService()
transcription_output_queue = asyncio.Queue()
tl = TranscriptionLogger()
pipeline = Pipeline([stt])
pipeline.set_sink(transcription_output_queue)
pipeline = Pipeline([transport.input(), stt, tl])
async def handle_transcription():
print("`````````TRANSCRIPTION`````````")
while True:
item = await transcription_output_queue.get()
print(item.text)
task = PipelineTask(pipeline)
await asyncio.gather(transport.run(pipeline), handle_transcription())
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":

View File

@@ -1,53 +1,55 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import logging
import sys
from dailyai.pipeline.frames import EndFrame, TranscriptionFrame
from dailyai.transports.local_transport import LocalTransport
from dailyai.services.whisper_ai_services import WhisperSTTService
from dailyai.pipeline.pipeline import Pipeline
from pipecat.frames.frames import Frame, TranscriptionFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.whisper import WhisperSTTService
from pipecat.transports.base_transport import TransportParams
from pipecat.transports.local.audio import LocalAudioTransport
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main():
meeting_duration_minutes = 1
class TranscriptionLogger(FrameProcessor):
transport = LocalTransport(
mic_enabled=False,
camera_enabled=False,
speaker_enabled=True,
duration_minutes=meeting_duration_minutes,
start_transcription=False,
)
async def process_frame(self, frame: Frame, direction: FrameDirection):
if isinstance(frame, TranscriptionFrame):
print(f"Transcription: {frame.text}")
async def main(room_url: str):
transport = LocalAudioTransport(TransportParams(audio_in_enabled=True))
stt = WhisperSTTService()
transcription_output_queue = asyncio.Queue()
transport_done = asyncio.Event()
tl = TranscriptionLogger()
pipeline = Pipeline([stt])
pipeline.set_sink(transcription_output_queue)
pipeline = Pipeline([transport.input(), stt, tl])
async def handle_transcription():
print("`````````TRANSCRIPTION`````````")
while not transport_done.is_set():
item = await transcription_output_queue.get()
print("got item from queue", item)
if isinstance(item, TranscriptionFrame):
print(item.text)
elif isinstance(item, EndFrame):
break
print("handle_transcription done")
task = PipelineTask(pipeline)
async def run_until_done():
await transport.run(pipeline)
transport_done.set()
print("run_until_done done")
runner = PipelineRunner()
await asyncio.gather(run_until_done(), handle_transcription())
await runner.run(task)
if __name__ == "__main__":
asyncio.run(main())
(url, token) = configure()
asyncio.run(main(url))

View File

@@ -1,134 +0,0 @@
import aiohttp
import asyncio
import logging
import tkinter as tk
import os
from dailyai.pipeline.frames import AudioFrame, ImageFrame
from dailyai.services.open_ai_services import OpenAILLMService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.fal_ai_services import FalImageGenService
from dailyai.transports.local_transport import LocalTransport
from dotenv import load_dotenv
load_dotenv(override=True)
# Root logging format is "<level number> <timestamp> <message>". The literal
# has no replacement fields, so it is a plain string rather than an f-string.
logging.basicConfig(format="%(levelno)s %(asctime)s %(message)s")
# The "dailyai" logger is set to DEBUG.
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
async def main():
    """Generate and present a twelve-month illustrated calendar locally.

    For each month an LLM writes a one-sentence image description, which is
    then spoken via TTS and illustrated via the image service. Months are
    shown in the order their data completes, not calendar order.
    """
    async with aiohttp.ClientSession() as session:
        meeting_duration_minutes = 5
        tk_root = tk.Tk()
        tk_root.title("Calendar")

        transport = LocalTransport(
            mic_enabled=True,
            camera_enabled=True,
            camera_width=1024,
            camera_height=1024,
            duration_minutes=meeting_duration_minutes,
            tk_root=tk_root,
        )

        tts = ElevenLabsTTSService(
            aiohttp_session=session,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
        )

        llm = OpenAILLMService(
            api_key=os.getenv("OPENAI_API_KEY"),
            model="gpt-4-turbo-preview")

        dalle = FalImageGenService(
            image_size="1024x1024",
            aiohttp_session=session,
            key_id=os.getenv("FAL_KEY_ID"),
            key_secret=os.getenv("FAL_KEY_SECRET"),
        )

        # Get a complete audio chunk from the given text. Splitting this into its own
        # coroutine lets us ensure proper ordering of the audio chunks on the
        # send queue.
        async def get_all_audio(text):
            all_audio = bytearray()
            async for audio in tts.run_tts(text):
                all_audio.extend(audio)
            return all_audio

        async def get_month_data(month):
            # The prompt is assembled from adjacent literals so that no
            # replacement field spans a line break: a single-quoted f-string
            # broken inside "{...}" is a SyntaxError before Python 3.12.
            prompt = (
                f"Describe a nature photograph suitable for use in a calendar, for the month of {month}. "
                "Include only the image description with no preamble. "
                "Limit the description to one sentence, please."
            )
            messages = [{"role": "system", "content": prompt}]

            image_description = await llm.run_llm(messages)
            if not image_description:
                # Nothing to show for this month; show_images() skips falsy results.
                return None

            to_speak = f"{month}: {image_description}"
            # Produce the narration and the picture concurrently.
            audio_task = asyncio.create_task(get_all_audio(to_speak))
            image_task = asyncio.create_task(
                dalle.run_image_gen(image_description))
            (audio, image_data) = await asyncio.gather(audio_task, image_task)

            return {
                "month": month,
                "text": image_description,
                "image_url": image_data[0],
                "image": image_data[1],
                "audio": audio,
            }

        months: list[str] = [
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
            "July",
            "August",
            "September",
            "October",
            "November",
            "December",
        ]

        async def show_images():
            # This will play the months in the order they're completed. The benefit
            # is we'll have as little delay as possible before the first month, and
            # likely no delay between months, but the months won't display in
            # order.
            for month_data_task in asyncio.as_completed(month_tasks):
                data = await month_data_task
                if data:
                    await transport.send_queue.put(
                        [
                            ImageFrame(data["image_url"], data["image"]),
                            AudioFrame(data["audio"]),
                        ]
                    )
                    await asyncio.sleep(25)

            # wait for the output queue to be empty, then leave the meeting
            await transport.stop_when_done()

        async def run_tk():
            # Service the Tk event loop by hand until the transport signals stop.
            while not transport._stop_threads.is_set():
                tk_root.update()
                tk_root.update_idletasks()
                await asyncio.sleep(0.1)

        # Start all month jobs up front; show_images() consumes them as they finish.
        month_tasks = [
            asyncio.create_task(
                get_month_data(month)) for month in months]

        await asyncio.gather(transport.run(), show_images(), run_tk())
# Script entry point: run the async main() to completion when executed directly.
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,122 +0,0 @@
import asyncio
import os
import logging
from typing import AsyncGenerator
import aiohttp
from PIL import Image
from dailyai.pipeline.frames import ImageFrame, Frame
from dailyai.transports.daily_transport import DailyTransport
from dailyai.services.ai_services import AIService
from dailyai.pipeline.aggregators import (
LLMAssistantContextAggregator,
LLMUserContextAggregator,
)
from dailyai.services.open_ai_services import OpenAILLMService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.fal_ai_services import FalImageGenService
from runner import configure
from dotenv import load_dotenv
load_dotenv(override=True)
# Root logging format is "<level number> <timestamp> <message>". The literal
# has no replacement fields, so it is a plain string rather than an f-string.
logging.basicConfig(format="%(levelno)s %(asctime)s %(message)s")
# The "dailyai" logger is set to DEBUG.
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
class ImageSyncAggregator(AIService):
    """Surround every frame with a "speaking" image and a "waiting" image."""

    def __init__(self, speaking_path: str, waiting_path: str):
        # Each image is opened and immediately converted to raw bytes; both
        # the image object and its bytes are kept on the instance.
        speaking = Image.open(speaking_path)
        self._speaking_image = speaking
        self._speaking_image_bytes = speaking.tobytes()

        waiting = Image.open(waiting_path)
        self._waiting_image = waiting
        self._waiting_image_bytes = waiting.tobytes()

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        # Speaking image first, then the frame, then back to waiting.
        speaking_frame = ImageFrame(None, self._speaking_image_bytes)
        yield speaking_frame
        yield frame
        waiting_frame = ImageFrame(None, self._waiting_image_bytes)
        yield waiting_frame
# Run the "Respond bot": received frames flow through the user context
# aggregator, the LLM, the assistant context aggregator, the image-sync
# aggregator and TTS into the transport's send queue.
async def main(room_url: str, token):
async with aiohttp.ClientSession() as session:
transport = DailyTransport(
room_url,
token,
"Respond bot",
# NOTE(review): positional 5 is presumably the meeting duration in minutes - confirm.
5,
)
# Camera and microphone settings are applied by writing the transport's
# private attributes after construction.
transport._camera_enabled = True
transport._camera_width = 1024
transport._camera_height = 1024
transport._mic_enabled = True
transport._mic_sample_rate = 16000
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4-turbo-preview")
img = FalImageGenService(
image_size="1024x1024",
aiohttp_session=session,
key_id=os.getenv("FAL_KEY_ID"),
key_secret=os.getenv("FAL_KEY_SECRET"),
)
# Generates the "speaking" and "waiting" cat images concurrently.
# NOTE(review): nothing in this function calls get_images(); the aggregator
# below loads its images from the assets directory instead.
async def get_images():
get_speaking_task = asyncio.create_task(
img.run_image_gen("An image of a cat speaking")
)
get_waiting_task = asyncio.create_task(
img.run_image_gen("An image of a cat waiting")
)
(speaking_data, waiting_data) = await asyncio.gather(
get_speaking_task, get_waiting_task
)
return speaking_data, waiting_data
# Greet the first other participant through the transport's send queue.
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
await tts.say("Hi, I'm listening!", transport.send_queue)
async def handle_transcriptions():
# Both context aggregators receive this same list object.
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
},
]
tma_in = LLMUserContextAggregator(
messages, transport._my_participant_id)
tma_out = LLMAssistantContextAggregator(
messages, transport._my_participant_id
)
image_sync_aggregator = ImageSyncAggregator(
os.path.join(
os.path.dirname(__file__), "assets", "speaking.png"), os.path.join(
os.path.dirname(__file__), "assets", "waiting.png"), )
# Read the chain inside-out: received frames -> tma_in -> llm -> tma_out ->
# image_sync_aggregator; tts then runs the result into the send queue.
await tts.run_to_queue(
transport.send_queue,
image_sync_aggregator.run(
tma_out.run(llm.run(tma_in.run(transport.get_receive_frames())))
),
)
# Turn on the "punctuate" transcription option before the transport runs.
transport.transcription_settings["extra"]["punctuate"] = True
await asyncio.gather(transport.run(), handle_transcriptions())
if __name__ == "__main__":
    # configure() returns a (url, token) pair that is handed straight to main().
    url, token = configure()
    asyncio.run(main(url, token))

View File

@@ -1,191 +0,0 @@
import aiohttp
import asyncio
import logging
import os
import random
from typing import AsyncGenerator
from PIL import Image
from dailyai.transports.daily_transport import DailyTransport
from dailyai.services.open_ai_services import OpenAILLMService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.pipeline.aggregators import (
LLMUserContextAggregator,
LLMAssistantContextAggregator,
)
from dailyai.pipeline.frames import (
Frame,
TextFrame,
ImageFrame,
SpriteFrame,
TranscriptionFrame,
)
from dailyai.services.ai_services import AIService
from runner import configure
from dotenv import load_dotenv
load_dotenv(override=True)
# Root logging format is "<level number> <timestamp> <message>". The literal
# has no replacement fields, so it is a plain string rather than an f-string.
logging.basicConfig(format="%(levelno)s %(asctime)s %(message)s")
# The "dailyai" logger is set to DEBUG.
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
# Raw sprite bytes keyed by their full file name (extension included).
sprites = {}

image_files = [
    "sc-default.png",
    "sc-talk.png",
    "sc-listen-1.png",
    "sc-think-1.png",
    "sc-think-2.png",
    "sc-think-3.png",
    "sc-think-4.png",
]

script_dir = os.path.dirname(__file__)

for file in image_files:
    # Images live in the "assets" directory next to this script.
    full_path = os.path.join(script_dir, "assets", file)
    # Read the pixel data while the file is open; the context manager closes it.
    with Image.open(full_path) as img:
        sprites[file] = img.tobytes()

# When the bot isn't talking, show a static image of the cat listening
quiet_frame = ImageFrame("", sprites["sc-listen-1.png"])

# When the bot is talking, build an animation from two sprites: 30 frames,
# each picked at random from the "default" and "talk" poses.
talking_list = [sprites["sc-default.png"], sprites["sc-talk.png"]]
talking = [random.choice(talking_list) for _ in range(30)]
talking_frame = SpriteFrame(images=talking)

# TODO: Support "thinking" as soon as we get a valid transcript, while LLM
# is processing
thinking_list = [
    sprites["sc-think-1.png"],
    sprites["sc-think-2.png"],
    sprites["sc-think-3.png"],
    sprites["sc-think-4.png"],
]
thinking_frame = SpriteFrame(images=thinking_list)
class TranscriptFilter(AIService):
    """Pass on only transcriptions whose participant is not the bot itself."""

    def __init__(self, bot_participant_id=None):
        self.bot_participant_id = bot_participant_id

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        # Non-transcription frames and the bot's own transcriptions are dropped.
        is_transcription = isinstance(frame, TranscriptionFrame)
        if is_transcription and frame.participantId != self.bot_participant_id:
            yield frame
class NameCheckFilter(AIService):
    """Emit a finished sentence only when it mentions one of the given names.

    Text from TextFrames is accumulated until it ends with ".", "?" or "!".
    The finished sentence is yielded as one TextFrame if it contains any of
    the names; otherwise it is discarded. The buffer is cleared either way.
    """

    def __init__(self, names: list[str]):
        self.names = names
        self.sentence = ""

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        # TODO: split up transcription by participant
        if isinstance(frame, TextFrame):
            self.sentence += frame.text

        # Nothing to decide until the sentence is terminated.
        if not self.sentence.endswith((".", "?", "!")):
            return

        mentioned = any(name in self.sentence for name in self.names)
        finished = self.sentence
        self.sentence = ""
        if mentioned:
            yield TextFrame(finished)
class ImageSyncAggregator(AIService):
    """Surround every frame with the talking animation and the quiet image."""

    def __init__(self):
        """Hold no per-instance state; the base initializer is not invoked."""

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        # Talking animation, then the frame itself, then back to the quiet image.
        for outgoing in (talking_frame, frame, quiet_frame):
            yield outgoing
# Run the Santa Cat bot: received frames are filtered to other participants'
# transcriptions, gated on the wake name, sent through the LLM with context
# aggregation, wrapped in sprite frames, and spoken into the send queue.
async def main(room_url: str, token):
async with aiohttp.ClientSession() as session:
transport = DailyTransport(
room_url,
token,
"Santa Cat",
duration_minutes=3,
start_transcription=True,
mic_enabled=True,
mic_sample_rate=16000,
camera_enabled=True,
camera_width=720,
camera_height=1280,
)
# NOTE(review): these private-attribute writes repeat the values already
# passed as keyword arguments above - presumably redundant; confirm.
transport._mic_enabled = True
transport._mic_sample_rate = 16000
transport._camera_enabled = True
transport._camera_width = 720
transport._camera_height = 1280
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4-turbo-preview")
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id="jBpfuIE2acCO8z3wKNLl",
)
isa = ImageSyncAggregator()
# Greet the first other participant through the transport's send queue.
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
await tts.say(
"Hi! If you want to talk to me, just say 'hey Santa Cat'.",
transport.send_queue,
)
async def handle_transcriptions():
# Both context aggregators receive this same list object.
messages = [
{
"role": "system",
"content": "You are Santa Cat, a cat that lives in Santa's workshop at the North Pole. You should be clever, and a bit sarcastic. You should also tell jokes every once in a while. Your responses should only be a few sentences long.",
},
]
tma_in = LLMUserContextAggregator(
messages, transport._my_participant_id)
tma_out = LLMAssistantContextAggregator(
messages, transport._my_participant_id
)
tf = TranscriptFilter(transport._my_participant_id)
ncf = NameCheckFilter(["Santa Cat", "Santa"])
# Read the chain inside-out: received frames -> tf -> ncf -> tma_in -> llm ->
# tma_out -> isa; tts then runs the result into the send queue.
await tts.run_to_queue(
transport.send_queue,
isa.run(
tma_out.run(
llm.run(
tma_in.run(
ncf.run(tf.run(transport.get_receive_frames())))
)
)
),
)
# Queue the quiet (listening) image once, alongside the other coroutines.
async def starting_image():
await transport.send_queue.put(quiet_frame)
# Turn on the "punctuate" transcription option before the transport runs.
transport.transcription_settings["extra"]["punctuate"] = True
await asyncio.gather(transport.run(), handle_transcriptions(), starting_image())
if __name__ == "__main__":
    # configure() returns a (url, token) pair that is handed straight to main().
    url, token = configure()
    asyncio.run(main(url, token))

View File

@@ -1,140 +0,0 @@
import aiohttp
import asyncio
import logging
import os
import wave
from dailyai.transports.daily_transport import DailyTransport
from dailyai.services.open_ai_services import OpenAILLMService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.pipeline.aggregators import (
LLMUserContextAggregator,
LLMAssistantContextAggregator,
)
from dailyai.services.ai_services import AIService, FrameLogger
from dailyai.pipeline.frames import (
Frame,
AudioFrame,
LLMResponseEndFrame,
LLMMessagesFrame,
)
from typing import AsyncGenerator
from runner import configure
from dotenv import load_dotenv
load_dotenv(override=True)
# Root logging format is "<level number> <timestamp> <message>". The literal
# has no replacement fields, so it is a plain string rather than an f-string.
logging.basicConfig(format="%(levelno)s %(asctime)s %(message)s")
# The "dailyai" logger is set to DEBUG.
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
# Raw audio bytes keyed by their full file name (extension included).
sounds = {}
sound_files = ["ding1.wav", "ding2.wav"]

script_dir = os.path.dirname(__file__)

for file in sound_files:
    # Sounds live in the "assets" directory next to this script.
    full_path = os.path.join(script_dir, "assets", file)
    # Read every audio frame of the WAV file as raw bytes.
    with wave.open(full_path) as audio_file:
        sounds[file] = audio_file.readframes(-1)
class OutboundSoundEffectWrapper(AIService):
    """Emit the "ding1" sound just before each LLMResponseEndFrame."""

    def __init__(self):
        """Hold no per-instance state; the base initializer is not invoked."""

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        if isinstance(frame, LLMResponseEndFrame):
            yield AudioFrame(sounds["ding1.wav"])
        # Every frame, including the end marker, is passed on in case anything
        # else up the stack needs it.
        yield frame
class InboundSoundEffectWrapper(AIService):
    """Emit the "ding2" sound just before each LLMMessagesFrame."""

    def __init__(self):
        """Hold no per-instance state; the base initializer is not invoked."""

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        if isinstance(frame, LLMMessagesFrame):
            yield AudioFrame(sounds["ding2.wav"])
        # Every frame, including the messages frame, is passed on in case
        # anything else up the stack needs it.
        yield frame
# Run the sound-effects "Respond bot": received frames pass through context
# aggregation, an inbound ding, the LLM, TTS and an outbound ding on their way
# to the transport's send queue, with frame loggers on both sides of the LLM.
async def main(room_url: str, token):
async with aiohttp.ClientSession() as session:
transport = DailyTransport(
room_url,
token,
"Respond bot",
duration_minutes=5,
mic_enabled=True,
mic_sample_rate=16000,
camera_enabled=False,
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4-turbo-preview")
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id="ErXwobaYiN019PkySvjV",
)
# Greet the first other participant, then queue a ding after the greeting.
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
await tts.say("Hi, I'm listening!", transport.send_queue)
await transport.send_queue.put(AudioFrame(sounds["ding1.wav"]))
async def handle_transcriptions():
# Both context aggregators receive this same list object.
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
},
]
tma_in = LLMUserContextAggregator(
messages, transport._my_participant_id)
tma_out = LLMAssistantContextAggregator(
messages, transport._my_participant_id
)
out_sound = OutboundSoundEffectWrapper()
in_sound = InboundSoundEffectWrapper()
fl = FrameLogger("LLM Out")
fl2 = FrameLogger("Transcription In")
# Read the chain inside-out: received frames -> tma_in -> in_sound -> fl2 ->
# llm -> tma_out -> fl -> tts; out_sound then runs the result into the send queue.
await out_sound.run_to_queue(
transport.send_queue,
tts.run(
fl.run(
tma_out.run(
llm.run(
fl2.run(
in_sound.run(
tma_in.run(transport.get_receive_frames())
)
)
)
)
)
),
)
# Turn on the "punctuate" transcription option before the transport runs.
transport.transcription_settings["extra"]["punctuate"] = True
await asyncio.gather(transport.run(), handle_transcriptions())
if __name__ == "__main__":
    # configure() returns a (url, token) pair that is handed straight to main().
    url, token = configure()
    asyncio.run(main(url, token))

View File

@@ -1,6 +1,6 @@
syntax = "proto3";
package dailyai_proto;
package pipecat_proto;
message TextFrame {
string text = 1;

View File

@@ -28,7 +28,7 @@
const proto = protobuf.load("frames.proto", (err, root) => {
if (err) throw err;
frame = root.lookupType("dailyai_proto.Frame");
frame = root.lookupType("pipecat_proto.Frame");
});
function initWebSocket() {

View File

@@ -2,15 +2,15 @@ import asyncio
import aiohttp
import logging
import os
from dailyai.pipeline.frame_processor import FrameProcessor
from dailyai.pipeline.frames import TextFrame, TranscriptionFrame
from dailyai.pipeline.pipeline import Pipeline
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.transports.websocket_transport import WebsocketTransport
from dailyai.services.whisper_ai_services import WhisperSTTService
from pipecat.pipeline.frame_processor import FrameProcessor
from pipecat.pipeline.frames import TextFrame, TranscriptionFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.services.elevenlabs_ai_services import ElevenLabsTTSService
from pipecat.transports.websocket_transport import WebsocketTransport
from pipecat.services.whisper_ai_services import WhisperSTTService
logging.basicConfig(format="%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger = logging.getLogger("pipecat")
logger.setLevel(logging.DEBUG)

View File

@@ -5,11 +5,10 @@ import time
import urllib.parse
import random
from dailyai.transports.daily_transport import DailyTransport
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
from dailyai.pipeline.frames import Frame, FrameType
from dailyai.services.fal_ai_services import FalImageGenService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from pipecat.transports.daily_transport import DailyTransport
from pipecat.services.azure_ai_services import AzureLLMService, AzureTTSService
from pipecat.pipeline.frames import Frame
from pipecat.services.fal_ai_services import FalImageGenService
async def main(room_url: str, token):
@@ -77,8 +76,6 @@ async def main(room_url: str, token):
async for audio in audio_generator:
transport.output_queue.put(Frame(FrameType.AUDIO_FRAME, audio))
transport.transcription_settings["extra"]["punctuate"] = False
transport.transcription_settings["extra"]["endpointing"] = False
await asyncio.gather(transport.run(), handle_transcriptions())

View File

@@ -3,11 +3,11 @@ import asyncio
import os
import wave
from dailyai.transports.daily_transport import DailyTransport
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
from dailyai.pipeline.aggregators import LLMContextAggregator
from dailyai.services.ai_services import AIService, FrameLogger
from dailyai.pipeline.frames import Frame, AudioFrame, LLMResponseEndFrame, LLMMessagesFrame
from pipecat.transports.daily_transport import DailyTransport
from pipecat.services.azure_ai_services import AzureLLMService, AzureTTSService
from pipecat.pipeline.aggregators import LLMContextAggregator
from pipecat.services.ai_services import AIService, FrameLogger
from pipecat.pipeline.frames import Frame, AudioFrame, LLMResponseEndFrame, LLMMessagesFrame
from typing import AsyncGenerator
from runner import configure
@@ -80,7 +80,7 @@ async def main(room_url: str, token, phone):
tts = AzureTTSService()
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
async def on_first_other_participant_joined(transport, participant):
await tts.say("Hi, I'm listening!", transport.send_queue)
await transport.send_queue.put(AudioFrame(sounds["ding1.wav"]))
@@ -127,8 +127,6 @@ async def main(room_url: str, token, phone):
transport.start_recording()
transport.dialout(phone)
transport.transcription_settings["extra"]["punctuate"] = True
await asyncio.gather(transport.run(), handle_transcriptions())

View File

@@ -5,11 +5,11 @@ import os
from PIL import Image
from typing import AsyncGenerator
from dailyai.pipeline.aggregators import (
LLMResponseAggregator,
UserResponseAggregator,
from pipecat.pipeline.aggregators import (
LLMAssistantResponseAggregator,
LLMUserResponseAggregator,
)
from dailyai.pipeline.frames import (
from pipecat.pipeline.frames import (
ImageFrame,
SpriteFrame,
Frame,
@@ -18,11 +18,11 @@ from dailyai.pipeline.frames import (
AudioFrame,
PipelineStartedFrame,
)
from dailyai.services.ai_services import AIService
from dailyai.pipeline.pipeline import Pipeline
from dailyai.transports.daily_transport import DailyTransport
from dailyai.services.open_ai_services import OpenAILLMService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from pipecat.services.ai_services import AIService
from pipecat.pipeline.pipeline import Pipeline
from pipecat.transports.daily_transport import DailyTransport
from pipecat.services.open_ai_services import OpenAILLMService
from pipecat.services.elevenlabs_ai_services import ElevenLabsTTSService
from runner import configure
@@ -30,7 +30,7 @@ from dotenv import load_dotenv
load_dotenv(override=True)
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger = logging.getLogger("pipecat")
logger.setLevel(logging.DEBUG)
sprites = []
@@ -48,7 +48,7 @@ for i in range(1, 26):
flipped = sprites[::-1]
sprites.extend(flipped)
# When the bot isn't talking, show a static image of the cat listening
quiet_frame = ImageFrame("", sprites[0])
quiet_frame = ImageFrame(sprites[0], (1024, 576))
talking_frame = SpriteFrame(images=sprites)
@@ -79,8 +79,6 @@ class TalkingAnimation(AIService):
class AnimationInitializer(AIService):
def __init__(self):
super().__init__()
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, PipelineStartedFrame):
@@ -127,7 +125,7 @@ async def main(room_url: str, token):
]
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
async def on_first_other_participant_joined(transport, participant):
print(f"!!! in here, pipeline.source is {pipeline.source}")
await pipeline.queue_frames([LLMMessagesFrame(messages)])
@@ -135,12 +133,10 @@ async def main(room_url: str, token):
await transport.run_interruptible_pipeline(
pipeline,
post_processor=LLMResponseAggregator(messages),
pre_processor=UserResponseAggregator(messages),
post_processor=LLMAssistantResponseAggregator(messages),
pre_processor=LLMUserResponseAggregator(messages),
)
transport.transcription_settings["extra"]["endpointing"] = True
transport.transcription_settings["extra"]["punctuate"] = True
await asyncio.gather(transport.run(), run_conversation())

View File

@@ -7,25 +7,26 @@ import os
import re
import wave
from typing import AsyncGenerator, List
from dailyai.pipeline.opeanai_llm_aggregator import (
from pipecat.pipeline.opeanai_llm_aggregator import (
OpenAIAssistantContextAggregator,
OpenAIUserContextAggregator,
)
from dailyai.pipeline.pipeline import Pipeline
from dailyai.transports.daily_transport import DailyTransport
from dailyai.services.openai_llm_context import OpenAILLMContext
from dailyai.services.open_ai_services import OpenAILLMService
# from dailyai.services.deepgram_ai_services import DeepgramTTSService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.pipeline.frames import (
from pipecat.pipeline.pipeline import Pipeline
from pipecat.transports.daily_transport import DailyTransport
from pipecat.services.openai_llm_context import OpenAILLMContext
from pipecat.services.open_ai_services import OpenAILLMService
# from pipecat.services.deepgram_ai_services import DeepgramTTSService
from pipecat.services.elevenlabs_ai_services import ElevenLabsTTSService
from pipecat.services.fireworks_ai_services import FireworksLLMService
from pipecat.pipeline.frames import (
Frame,
LLMFunctionCallFrame,
LLMFunctionStartFrame,
AudioFrame,
)
from dailyai.pipeline.openai_frames import OpenAILLMContextFrame
from dailyai.services.ai_services import FrameLogger, AIService
from pipecat.pipeline.openai_frames import OpenAILLMContextFrame
from pipecat.services.ai_services import FrameLogger, AIService
from openai._types import NotGiven, NOT_GIVEN
from openai.types.chat import (
@@ -38,7 +39,7 @@ from dotenv import load_dotenv
load_dotenv(override=True)
logging.basicConfig(format="%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger = logging.getLogger("pipecat")
logger.setLevel(logging.DEBUG)
sounds = {}
@@ -249,8 +250,7 @@ class ChecklistProcessor(AIService):
print(f"--> {pretty_json}\n")
if frame.function_name not in self._functions:
raise Exception(
f"The LLM tried to call a function named {frame.function_name}, which isn't in the list of known functions. Please check your prompt and/or self._functions."
)
f"Unknown function.")
fn = getattr(self, frame.function_name)
result = fn(json.loads(frame.arguments))
@@ -306,9 +306,9 @@ async def main(room_url: str, token):
messages = []
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4-1106-preview",
llm = FireworksLLMService(
api_key=os.getenv("FIREWORKS_API_KEY"),
model="accounts/fireworks/models/firefunction-v1"
)
# tts = DeepgramTTSService(
# aiohttp_session=session,
@@ -330,7 +330,7 @@ async def main(room_url: str, token):
pipeline = Pipeline(processors=[fl, llm, fl2, checklist, tts])
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
async def on_first_other_participant_joined(transport, participant):
await pipeline.queue_frames([OpenAILLMContextFrame(context)])
async def handle_intake():
@@ -340,8 +340,6 @@ async def main(room_url: str, token):
pre_processor=OpenAIUserContextAggregator(context),
)
transport.transcription_settings["extra"]["endpointing"] = True
transport.transcription_settings["extra"]["punctuate"] = True
try:
await asyncio.gather(transport.run(), handle_intake())
except (asyncio.CancelledError, KeyboardInterrupt):

View File

@@ -9,20 +9,21 @@ import wave
from typing import AsyncGenerator
from PIL import Image
from dailyai.pipeline.pipeline import Pipeline
from dailyai.pipeline.frame_processor import FrameProcessor
from dailyai.transports.daily_transport import DailyTransport
from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
from dailyai.services.fal_ai_services import FalImageGenService
from dailyai.services.open_ai_services import OpenAILLMService
from dailyai.services.deepgram_ai_services import DeepgramTTSService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.pipeline.aggregators import (
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.frame_processor import FrameProcessor
from pipecat.services.live_stream import LiveStream
from pipecat.transports.daily_transport import DailyTransport
from pipecat.services.azure_ai_services import AzureLLMService, AzureTTSService
from pipecat.services.fal_ai_services import FalImageGenService
from pipecat.services.open_ai_services import OpenAILLMService
from pipecat.services.deepgram_ai_services import DeepgramTTSService
from pipecat.services.elevenlabs_ai_services import ElevenLabsTTSService
from pipecat.pipeline.aggregators import (
LLMAssistantContextAggregator,
UserResponseAggregator,
LLMResponseAggregator,
LLMAssistantResponseAggregator,
LLMUserResponseAggregator,
)
from dailyai.pipeline.frames import (
from pipecat.pipeline.frames import (
EndPipeFrame,
LLMMessagesFrame,
Frame,
@@ -32,7 +33,7 @@ from dailyai.pipeline.frames import (
ImageFrame,
UserStoppedSpeakingFrame,
)
from dailyai.services.ai_services import FrameLogger, AIService
from pipecat.services.ai_services import FrameLogger, AIService
from runner import configure
@@ -40,7 +41,7 @@ from dotenv import load_dotenv
load_dotenv(override=True)
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger = logging.getLogger("pipecat")
logger.setLevel(logging.DEBUG)
sounds = {}
@@ -99,7 +100,7 @@ class StoryProcessor(FrameProcessor):
1. Catch the frames that are generated by the LLM service
"""
if isinstance(frame, UserStoppedSpeakingFrame):
yield ImageFrame(None, images["grandma-writing.png"])
yield ImageFrame(images["grandma-writing.png"], (1024, 1024))
yield AudioFrame(sounds["talking.wav"])
elif isinstance(frame, TextFrame):
@@ -112,7 +113,7 @@ class StoryProcessor(FrameProcessor):
self._text = self._text.replace("\n", " ")
if len(self._text) > 2:
yield ImageFrame(None, images["grandma-writing.png"])
yield ImageFrame(images["grandma-writing.png"], (1024, 1024))
yield StoryStartFrame(self._text)
yield AudioFrame(sounds["ding3.wav"])
self._text = ""
@@ -146,11 +147,11 @@ class StoryProcessor(FrameProcessor):
# last bit
pass
elif isinstance(frame, LLMResponseEndFrame):
yield ImageFrame(None, images["grandma-writing.png"])
yield ImageFrame(images["grandma-writing.png"], (1024, 1024))
yield StoryPromptFrame(self._text)
self._text = ""
yield frame
yield ImageFrame(None, images["grandma-listening.png"])
yield ImageFrame(images["grandma-listening.png"], (1024, 1024))
yield AudioFrame(sounds["listening.wav"])
else:
@@ -204,13 +205,14 @@ async def main(room_url: str, token):
voice_id="Xb7hH8MSUJpSbSDYk0k2",
) # matilda
img = FalImageGenService(
image_size="1024x1024",
params={
image_size = "1024x1024",
},
aiohttp_session=session,
key_id=os.getenv("FAL_KEY_ID"),
key_secret=os.getenv("FAL_KEY_SECRET"),
key=os.getenv("FAL_KEY"),
)
lra = LLMResponseAggregator(messages)
ura = UserResponseAggregator(messages)
lra = LLMAssistantResponseAggregator(messages)
ura = LLMUserResponseAggregator(messages)
sp = StoryProcessor(messages, story)
sig = StoryImageGenerator(story, llm, img)
@@ -232,7 +234,7 @@ async def main(room_url: str, token):
start_story_event = asyncio.Event()
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
async def on_first_other_participant_joined(transport, participant):
start_story_event.set()
async def storytime():
@@ -252,7 +254,7 @@ async def main(room_url: str, token):
[llm, lca, tts], sink=transport.send_queue)
await local_pipeline.queue_frames(
[
ImageFrame(None, images["grandma-listening.png"]),
ImageFrame(images["grandma-listening.png"], (1024, 1024)),
LLMMessagesFrame(intro_messages),
AudioFrame(sounds["listening.wav"]),
EndPipeFrame(),
@@ -260,6 +262,10 @@ async def main(room_url: str, token):
)
await local_pipeline.run_pipeline()
pipeline = Pipeline([llm, lca, tts, ls_sink])
pipeline.queue_frames([...])
pipeline.run()
fl = FrameLogger("### After Image Generation")
pipeline = Pipeline(
processors=[
@@ -276,8 +282,6 @@ async def main(room_url: str, token):
pipeline,
)
transport.transcription_settings["extra"]["endpointing"] = True
transport.transcription_settings["extra"]["punctuate"] = True
try:
await asyncio.gather(transport.run(), storytime())
except (asyncio.CancelledError, KeyboardInterrupt):

View File

@@ -4,21 +4,21 @@ import logging
import os
from typing import AsyncGenerator
from dailyai.pipeline.aggregators import (
from pipecat.pipeline.aggregators import (
SentenceAggregator,
)
from dailyai.pipeline.frames import (
from pipecat.pipeline.frames import (
Frame,
LLMMessagesFrame,
TextFrame,
SendAppMessageFrame,
)
from dailyai.pipeline.frame_processor import FrameProcessor
from dailyai.pipeline.pipeline import Pipeline
from dailyai.transports.daily_transport import DailyTransport
from dailyai.services.azure_ai_services import AzureTTSService
from dailyai.services.open_ai_services import OpenAILLMService
from dailyai.pipeline.aggregators import LLMFullResponseAggregator
from pipecat.pipeline.frame_processor import FrameProcessor
from pipecat.pipeline.pipeline import Pipeline
from pipecat.transports.daily_transport import DailyTransport
from pipecat.services.azure_ai_services import AzureTTSService
from pipecat.services.open_ai_services import OpenAILLMService
from pipecat.pipeline.aggregators import LLMFullResponseAggregator
from runner import configure
@@ -28,7 +28,7 @@ from dotenv import load_dotenv
load_dotenv(override=True)
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger = logging.getLogger("pipecat")
logger.setLevel(logging.DEBUG)
"""
@@ -99,8 +99,6 @@ async def main(room_url: str, token):
ts = TranslationSubtitles("spanish")
pipeline = Pipeline([sa, tp, llm, lfra, ts, tts])
transport.transcription_settings["extra"]["endpointing"] = True
transport.transcription_settings["extra"]["punctuate"] = True
await transport.run(pipeline)

View File

@@ -4,29 +4,28 @@
#
# pip-compile --all-extras pyproject.toml
#
aiohttp==3.9.3
# via dailyai (pyproject.toml)
aiohttp==3.9.5
# via pipecat (pyproject.toml)
aiosignal==1.3.1
# via aiohttp
anthropic==0.20.0
# via dailyai (pyproject.toml)
annotated-types==0.6.0
# via pydantic
anthropic==0.25.8
# via pipecat (pyproject.toml)
anyio==4.3.0
# via
# anthropic
# httpx
# openai
# starlette
async-timeout==4.0.3
# via aiohttp
attrs==23.2.0
# via
# aiohttp
# fal
av==11.0.0
# via aiohttp
av==12.0.0
# via faster-whisper
azure-cognitiveservices-speech==1.36.0
# via dailyai (pyproject.toml)
blinker==1.7.0
azure-cognitiveservices-speech==1.37.0
# via pipecat (pyproject.toml)
blinker==1.8.2
# via flask
certifi==2024.2.2
# via
@@ -36,48 +35,38 @@ certifi==2024.2.2
charset-normalizer==3.3.2
# via requests
click==8.1.7
# via
# fal
# flask
colorama==0.4.6
# via fal
# via flask
coloredlogs==15.0.1
# via onnxruntime
ctranslate2==4.1.0
ctranslate2==4.2.1
# via faster-whisper
daily-python==0.7.2
# via dailyai (pyproject.toml)
deprecated==1.2.14
# via opentelemetry-api
dill==0.3.7
# via fal
distlib==0.3.8
# via virtualenv
daily-python==0.7.4
# via pipecat (pyproject.toml)
distro==1.9.0
# via
# anthropic
# openai
exceptiongroup==1.2.0
einops==0.8.0
# via pipecat (pyproject.toml)
exceptiongroup==1.2.1
# via anyio
fal==0.12.3
# via dailyai (pyproject.toml)
fastapi==0.99.1
# via fal
faster-whisper==1.0.1
# via dailyai (pyproject.toml)
filelock==3.13.3
fal-client==0.4.0
# via pipecat (pyproject.toml)
faster-whisper==1.0.2
# via pipecat (pyproject.toml)
filelock==3.14.0
# via
# huggingface-hub
# pyht
# torch
# transformers
# triton
# virtualenv
flask==3.0.2
flask==3.0.3
# via
# dailyai (pyproject.toml)
# flask-cors
flask-cors==4.0.0
# via dailyai (pyproject.toml)
# pipecat (pyproject.toml)
flask-cors==4.0.1
# via pipecat (pyproject.toml)
flatbuffers==24.3.25
# via onnxruntime
frozenlist==1.4.1
@@ -88,15 +77,8 @@ fsspec==2024.3.1
# via
# huggingface-hub
# torch
grpc-interceptor==0.15.4
# via fal
grpcio==1.62.1
# via
# fal
# grpc-interceptor
# isolate
# isolate-proto
# pyht
grpcio==1.63.0
# via pyht
h11==0.14.0
# via httpcore
httpcore==1.0.5
@@ -104,57 +86,51 @@ httpcore==1.0.5
httpx==0.27.0
# via
# anthropic
# fal
# fal-client
# openai
huggingface-hub==0.22.2
httpx-sse==0.4.0
# via fal-client
huggingface-hub==0.23.0
# via
# faster-whisper
# timm
# tokenizers
# transformers
humanfriendly==10.0
# via coloredlogs
idna==3.6
idna==3.7
# via
# anyio
# httpx
# requests
# yarl
importlib-metadata==7.0.0
# via opentelemetry-api
isolate[build]==0.12.7
# via
# fal
# isolate-proto
isolate-proto==0.3.3
# via fal
itsdangerous==2.1.2
itsdangerous==2.2.0
# via flask
jinja2==3.1.3
jinja2==3.1.4
# via
# flask
# torch
markdown-it-py==3.0.0
# via rich
loguru==0.7.2
# via pipecat (pyproject.toml)
markupsafe==2.1.5
# via
# jinja2
# werkzeug
mdurl==0.1.2
# via markdown-it-py
mpmath==1.3.0
# via sympy
msgpack==1.0.8
# via fal
multidict==6.0.5
# via
# aiohttp
# yarl
networkx==3.2.1
networkx==3.3
# via torch
numpy==1.26.4
# via
# ctranslate2
# dailyai (pyproject.toml)
# onnxruntime
# pipecat (pyproject.toml)
# torchvision
# transformers
nvidia-cublas-cu12==12.1.3.1
# via
# nvidia-cudnn-cu12
@@ -178,7 +154,7 @@ nvidia-cusparse-cu12==12.1.0.106
# via
# nvidia-cusolver-cu12
# torch
nvidia-nccl-cu12==2.19.3
nvidia-nccl-cu12==2.20.5
# via torch
nvidia-nvjitlink-cu12==12.4.127
# via
@@ -186,134 +162,106 @@ nvidia-nvjitlink-cu12==12.4.127
# nvidia-cusparse-cu12
nvidia-nvtx-cu12==12.1.105
# via torch
onnxruntime==1.17.1
onnxruntime==1.17.3
# via faster-whisper
openai==1.14.2
# via dailyai (pyproject.toml)
opentelemetry-api==1.24.0
# via
# fal
# opentelemetry-sdk
opentelemetry-sdk==1.24.0
# via fal
opentelemetry-semantic-conventions==0.45b0
# via opentelemetry-sdk
openai==1.26.0
# via pipecat (pyproject.toml)
packaging==24.0
# via
# fal
# huggingface-hub
# onnxruntime
pathspec==0.11.2
# via fal
pillow==10.2.0
# transformers
pillow==10.3.0
# via
# dailyai (pyproject.toml)
# fal
platformdirs==4.2.0
# via
# isolate
# virtualenv
portalocker==2.8.2
# via fal
# pipecat (pyproject.toml)
# torchvision
protobuf==4.25.3
# via
# isolate
# isolate-proto
# onnxruntime
# pyht
pyaudio==0.2.14
# via dailyai (pyproject.toml)
pydantic==1.10.15
# via pipecat (pyproject.toml)
pydantic==2.7.1
# via
# anthropic
# fal
# fastapi
# openai
pygments==2.17.2
# via rich
pyht==0.0.26
# via dailyai (pyproject.toml)
pyjwt==2.8.0
# via fal
python-dateutil==2.9.0.post0
# via fal
pydantic-core==2.18.2
# via pydantic
pyht==0.0.28
# via pipecat (pyproject.toml)
python-dotenv==1.0.1
# via dailyai (pyproject.toml)
# via pipecat (pyproject.toml)
pyyaml==6.0.1
# via
# ctranslate2
# huggingface-hub
# isolate
# timm
# transformers
regex==2024.5.10
# via transformers
requests==2.31.0
# via
# huggingface-hub
# pyht
rich==13.7.1
# via fal
six==1.16.0
# via python-dateutil
# transformers
safetensors==0.4.3
# via
# timm
# transformers
sniffio==1.3.1
# via
# anthropic
# anyio
# httpx
# openai
starlette==0.27.0
# via fastapi
structlog==22.3.0
# via fal
sympy==1.12
# via
# onnxruntime
# torch
tblib==3.0.0
# via isolate
tokenizers==0.15.2
timm==0.9.16
# via pipecat (pyproject.toml)
tokenizers==0.19.1
# via
# anthropic
# faster-whisper
torch==2.2.1
# transformers
torch==2.3.0
# via
# dailyai (pyproject.toml)
# pipecat (pyproject.toml)
# timm
# torchaudio
torchaudio==2.2.1
# via dailyai (pyproject.toml)
tqdm==4.66.2
# torchvision
torchaudio==2.3.0
# via pipecat (pyproject.toml)
torchvision==0.18.0
# via timm
tqdm==4.66.4
# via
# huggingface-hub
# openai
triton==2.2.0
# transformers
transformers==4.40.2
# via pipecat (pyproject.toml)
triton==2.3.0
# via torch
types-python-dateutil==2.9.0.20240316
# via fal
typing-extensions==4.10.0
typing-extensions==4.11.0
# via
# anthropic
# anyio
# dailyai (pyproject.toml)
# fal
# fastapi
# huggingface-hub
# openai
# opentelemetry-sdk
# pipecat (pyproject.toml)
# pydantic
# pydantic-core
# torch
urllib3==2.2.1
# via requests
virtualenv==20.25.1
# via isolate
websockets==12.0
# via
# dailyai (pyproject.toml)
# fal
werkzeug==3.0.2
# via pipecat (pyproject.toml)
werkzeug==3.0.3
# via flask
wrapt==1.16.0
# via deprecated
yarl==1.9.4
# via aiohttp
zipp==3.18.1
# via importlib-metadata
# The following packages are considered to be unsafe in a requirements file:
# setuptools

View File

@@ -4,29 +4,28 @@
#
# pip-compile --all-extras pyproject.toml
#
aiohttp==3.9.3
# via dailyai (pyproject.toml)
aiohttp==3.9.5
# via pipecat (pyproject.toml)
aiosignal==1.3.1
# via aiohttp
anthropic==0.20.0
# via dailyai (pyproject.toml)
annotated-types==0.6.0
# via pydantic
anthropic==0.25.8
# via pipecat (pyproject.toml)
anyio==4.3.0
# via
# anthropic
# httpx
# openai
# starlette
async-timeout==4.0.3
# via aiohttp
attrs==23.2.0
# via
# aiohttp
# fal
av==11.0.0
# via aiohttp
av==12.0.0
# via faster-whisper
azure-cognitiveservices-speech==1.36.0
# via dailyai (pyproject.toml)
blinker==1.7.0
azure-cognitiveservices-speech==1.37.0
# via pipecat (pyproject.toml)
blinker==1.8.2
# via flask
certifi==2024.2.2
# via
@@ -36,47 +35,37 @@ certifi==2024.2.2
charset-normalizer==3.3.2
# via requests
click==8.1.7
# via
# fal
# flask
colorama==0.4.6
# via fal
# via flask
coloredlogs==15.0.1
# via onnxruntime
ctranslate2==4.1.0
ctranslate2==4.2.1
# via faster-whisper
daily-python==0.7.2
# via dailyai (pyproject.toml)
deprecated==1.2.14
# via opentelemetry-api
dill==0.3.7
# via fal
distlib==0.3.8
# via virtualenv
daily-python==0.7.4
# via pipecat (pyproject.toml)
distro==1.9.0
# via
# anthropic
# openai
exceptiongroup==1.2.0
einops==0.8.0
# via pipecat (pyproject.toml)
exceptiongroup==1.2.1
# via anyio
fal==0.12.3
# via dailyai (pyproject.toml)
fastapi==0.99.1
# via fal
faster-whisper==1.0.1
# via dailyai (pyproject.toml)
filelock==3.13.3
fal-client==0.4.0
# via pipecat (pyproject.toml)
faster-whisper==1.0.2
# via pipecat (pyproject.toml)
filelock==3.14.0
# via
# huggingface-hub
# pyht
# torch
# virtualenv
flask==3.0.2
# transformers
flask==3.0.3
# via
# dailyai (pyproject.toml)
# flask-cors
flask-cors==4.0.0
# via dailyai (pyproject.toml)
# pipecat (pyproject.toml)
flask-cors==4.0.1
# via pipecat (pyproject.toml)
flatbuffers==24.3.25
# via onnxruntime
frozenlist==1.4.1
@@ -87,15 +76,8 @@ fsspec==2024.3.1
# via
# huggingface-hub
# torch
grpc-interceptor==0.15.4
# via fal
grpcio==1.62.1
# via
# fal
# grpc-interceptor
# isolate
# isolate-proto
# pyht
grpcio==1.63.0
# via pyht
h11==0.14.0
# via httpcore
httpcore==1.0.5
@@ -103,183 +85,149 @@ httpcore==1.0.5
httpx==0.27.0
# via
# anthropic
# fal
# fal-client
# openai
huggingface-hub==0.22.2
httpx-sse==0.4.0
# via fal-client
huggingface-hub==0.23.0
# via
# faster-whisper
# timm
# tokenizers
# transformers
humanfriendly==10.0
# via coloredlogs
idna==3.6
idna==3.7
# via
# anyio
# httpx
# requests
# yarl
importlib-metadata==7.0.0
# via opentelemetry-api
isolate[build]==0.12.7
# via
# fal
# isolate-proto
isolate-proto==0.3.3
# via fal
itsdangerous==2.1.2
itsdangerous==2.2.0
# via flask
jinja2==3.1.3
jinja2==3.1.4
# via
# flask
# torch
markdown-it-py==3.0.0
# via rich
loguru==0.7.2
# via pipecat (pyproject.toml)
markupsafe==2.1.5
# via
# jinja2
# werkzeug
mdurl==0.1.2
# via markdown-it-py
mpmath==1.3.0
# via sympy
msgpack==1.0.8
# via fal
multidict==6.0.5
# via
# aiohttp
# yarl
networkx==3.2.1
networkx==3.3
# via torch
numpy==1.26.4
# via
# ctranslate2
# dailyai (pyproject.toml)
# onnxruntime
onnxruntime==1.17.1
# pipecat (pyproject.toml)
# torchvision
# transformers
onnxruntime==1.17.3
# via faster-whisper
openai==1.14.2
# via dailyai (pyproject.toml)
opentelemetry-api==1.24.0
# via
# fal
# opentelemetry-sdk
opentelemetry-sdk==1.24.0
# via fal
opentelemetry-semantic-conventions==0.45b0
# via opentelemetry-sdk
openai==1.26.0
# via pipecat (pyproject.toml)
packaging==24.0
# via
# fal
# huggingface-hub
# onnxruntime
pathspec==0.11.2
# via fal
pillow==10.2.0
# transformers
pillow==10.3.0
# via
# dailyai (pyproject.toml)
# fal
platformdirs==4.2.0
# via
# isolate
# virtualenv
portalocker==2.8.2
# via fal
# pipecat (pyproject.toml)
# torchvision
protobuf==4.25.3
# via
# isolate
# isolate-proto
# onnxruntime
# pyht
pyaudio==0.2.14
# via dailyai (pyproject.toml)
pydantic==1.10.15
# via pipecat (pyproject.toml)
pydantic==2.7.1
# via
# anthropic
# fal
# fastapi
# openai
pygments==2.17.2
# via rich
pyht==0.0.26
# via dailyai (pyproject.toml)
pyjwt==2.8.0
# via fal
python-dateutil==2.9.0.post0
# via fal
pydantic-core==2.18.2
# via pydantic
pyht==0.0.28
# via pipecat (pyproject.toml)
python-dotenv==1.0.1
# via dailyai (pyproject.toml)
# via pipecat (pyproject.toml)
pyyaml==6.0.1
# via
# ctranslate2
# huggingface-hub
# isolate
# timm
# transformers
regex==2024.5.10
# via transformers
requests==2.31.0
# via
# huggingface-hub
# pyht
rich==13.7.1
# via fal
six==1.16.0
# via python-dateutil
# transformers
safetensors==0.4.3
# via
# timm
# transformers
sniffio==1.3.1
# via
# anthropic
# anyio
# httpx
# openai
starlette==0.27.0
# via fastapi
structlog==22.3.0
# via fal
sympy==1.12
# via
# onnxruntime
# torch
tblib==3.0.0
# via isolate
tokenizers==0.15.2
timm==0.9.16
# via pipecat (pyproject.toml)
tokenizers==0.19.1
# via
# anthropic
# faster-whisper
torch==2.2.1
# transformers
torch==2.3.0
# via
# dailyai (pyproject.toml)
# pipecat (pyproject.toml)
# timm
# torchaudio
torchaudio==2.2.1
# via dailyai (pyproject.toml)
tqdm==4.66.2
# torchvision
torchaudio==2.3.0
# via pipecat (pyproject.toml)
torchvision==0.18.0
# via timm
tqdm==4.66.4
# via
# huggingface-hub
# openai
types-python-dateutil==2.9.0.20240316
# via fal
typing-extensions==4.10.0
# transformers
transformers==4.40.2
# via pipecat (pyproject.toml)
typing-extensions==4.11.0
# via
# anthropic
# anyio
# dailyai (pyproject.toml)
# fal
# fastapi
# huggingface-hub
# openai
# opentelemetry-sdk
# pipecat (pyproject.toml)
# pydantic
# pydantic-core
# torch
urllib3==2.2.1
# via requests
virtualenv==20.25.1
# via isolate
websockets==12.0
# via
# dailyai (pyproject.toml)
# fal
werkzeug==3.0.2
# via pipecat (pyproject.toml)
werkzeug==3.0.3
# via flask
wrapt==1.16.0
# via deprecated
yarl==1.9.4
# via aiohttp
zipp==3.18.1
# via importlib-metadata
# The following packages are considered to be unsafe in a requirements file:
# setuptools

BIN
pipecat.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

View File

@@ -3,9 +3,9 @@ requires = ["setuptools>=64", "setuptools_scm>=8"]
build-backend = "setuptools.build_meta"
[project]
name = "dailyai"
name = "pipecat-ai"
dynamic = ["version"]
description = "An open source framework for real-time, multi-modal, conversational AI applications"
description = "An open source framework for voice (and multimodal) assistants"
license = { text = "BSD 2-Clause License" }
readme = "README.md"
requires-python = ">=3.7"
@@ -20,28 +20,31 @@ classifiers = [
"Topic :: Scientific/Engineering :: Artificial Intelligence"
]
dependencies = [
"aiohttp==3.9.3",
"numpy==1.26.4",
"Pillow==10.2.0",
"typing-extensions==4.10.0",
"aiohttp~=3.9.5",
"numpy~=1.26.4",
"loguru~=0.7.0",
"Pillow~=10.3.0",
"typing-extensions~=4.11.0",
]
[project.urls]
Source = "https://github.com/daily-co/dailyai"
Website = "https://daily.co"
Source = "https://github.com/pipecat-ai/pipecat"
Website = "https://pipecat.ai"
[project.optional-dependencies]
anthropic = [ "anthropic==0.20.0" ]
azure = [ "azure-cognitiveservices-speech==1.36.0" ]
daily = [ "daily-python==0.7.2" ]
examples = [ "python-dotenv==1.0.1", "flask==3.0.2", "flask_cors==4.0.0" ]
fal = [ "fal==0.12.3" ]
local = [ "pyaudio==0.2.14" ]
openai = [ "openai==1.14.2" ]
playht = [ "pyht==0.0.26" ]
silero = [ "torch==2.2.1", "torchaudio==2.2.1" ]
websocket = [ "websockets==12.0" ]
whisper = [ "faster_whisper==1.0.1" ]
anthropic = [ "anthropic~=0.25.7" ]
azure = [ "azure-cognitiveservices-speech~=1.37.0" ]
daily = [ "daily-python~=0.7.4" ]
examples = [ "python-dotenv~=1.0.0", "flask~=3.0.3", "flask_cors~=4.0.1" ]
fal = [ "fal-client~=0.4.0" ]
fireworks = [ "openai~=1.26.0" ]
local = [ "pyaudio~=0.2.0" ]
moondream = [ "einops~=0.8.0", "timm~=0.9.16", "transformers~=4.40.2" ]
openai = [ "openai~=1.26.0" ]
playht = [ "pyht~=0.0.28" ]
silero = [ "torch~=2.3.0", "torchaudio~=2.3.0" ]
websocket = [ "websockets~=12.0" ]
whisper = [ "faster-whisper~=1.0.2" ]
[tool.setuptools.packages.find]
# All the following settings are optional:
@@ -51,4 +54,4 @@ where = ["src"]
pythonpath = ["src"]
[tool.setuptools_scm]
# Empty
local_scheme = "no-local-version"

View File

@@ -1,392 +0,0 @@
import asyncio
import re
from dailyai.pipeline.frame_processor import FrameProcessor
from dailyai.pipeline.frames import (
EndFrame,
EndPipeFrame,
Frame,
LLMMessagesFrame,
LLMResponseEndFrame,
LLMResponseStartFrame,
TextFrame,
TranscriptionFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame,
)
from dailyai.pipeline.pipeline import Pipeline
from dailyai.services.ai_services import AIService
from typing import AsyncGenerator, Coroutine, List
class ResponseAggregator(FrameProcessor):
    """Fold the text seen between a start frame and an end frame into one chat message.

    While aggregating, the ``text`` of every accumulator frame is appended
    (space-prefixed) to an internal buffer. When the end frame arrives and the
    buffer is non-empty, a ``{"role": ..., "content": ...}`` entry is appended
    to ``messages`` and a fresh end frame followed by an ``LLMMessagesFrame``
    carrying ``messages`` is emitted.

    Args:
        messages: Conversation list that is mutated in place. ``None`` disables
            the aggregator: every frame is then dropped.
        role: Value stored under ``"role"`` in appended messages.
        start_frame: Frame class that starts an aggregation.
        end_frame: Frame class that ends an aggregation.
        accumulator_frame: Frame class whose ``text`` is accumulated.
        pass_through: Whether accumulator frames are also forwarded downstream
            while aggregating.
    """

    def __init__(
        self,
        *,
        messages: list[dict] | None,
        role: str,
        start_frame,
        end_frame,
        accumulator_frame,
        pass_through=True,
    ):
        self.aggregation = ""
        self.aggregating = False
        self.messages = messages
        self._role = role
        self._start_frame = start_frame
        self._end_frame = end_frame
        self._accumulator_frame = accumulator_frame
        self._pass_through = pass_through

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        """Process one frame, yielding zero or more frames downstream.

        Start frames are consumed. End frames are replaced by a new end frame
        plus an ``LLMMessagesFrame`` when text was collected, and swallowed
        otherwise. Any frame that is neither a start/end frame nor an
        accumulator frame received while aggregating is forwarded unchanged.
        """
        # Only a missing context disables the aggregator. An empty list is a
        # valid (not yet started) conversation and must not drop every frame,
        # which is what a plain truthiness check would do.
        if self.messages is None:
            return

        if isinstance(frame, self._start_frame):
            self.aggregating = True
        elif isinstance(frame, self._end_frame):
            self.aggregating = False
            # Sometimes VAD triggers quickly on and off. If we don't get any
            # transcription, emitting here would create empty LLM message
            # queue frames, so only emit when something was aggregated.
            if self.aggregation:
                self.messages.append(
                    {"role": self._role, "content": self.aggregation})
                self.aggregation = ""
                yield self._end_frame()
                yield LLMMessagesFrame(self.messages)
        elif isinstance(frame, self._accumulator_frame) and self.aggregating:
            self.aggregation += f" {frame.text}"
            if self._pass_through:
                yield frame
        else:
            yield frame
class LLMResponseAggregator(ResponseAggregator):
    """ResponseAggregator preset for the "assistant" role.

    Aggregates ``TextFrame`` text between ``LLMResponseStartFrame`` and
    ``LLMResponseEndFrame``; pass-through is left at the parent's default.
    """

    def __init__(self, messages: list[dict]):
        preset = {
            "role": "assistant",
            "start_frame": LLMResponseStartFrame,
            "end_frame": LLMResponseEndFrame,
            "accumulator_frame": TextFrame,
        }
        super().__init__(messages=messages, **preset)
class UserResponseAggregator(ResponseAggregator):
    """ResponseAggregator preset for the "user" role.

    Aggregates ``TranscriptionFrame`` text between ``UserStartedSpeakingFrame``
    and ``UserStoppedSpeakingFrame``. Pass-through is disabled, so the
    transcription frames collected while aggregating are not forwarded.
    """

    def __init__(self, messages: list[dict]):
        preset = {
            "role": "user",
            "start_frame": UserStartedSpeakingFrame,
            "end_frame": UserStoppedSpeakingFrame,
            "accumulator_frame": TranscriptionFrame,
            "pass_through": False,
        }
        super().__init__(messages=messages, **preset)
class LLMContextAggregator(AIService):
    """Append incoming text to a shared message list under a fixed role.

    Each time a message is appended, an LLMMessagesFrame wrapping the shared
    list is yielded. With ``complete_sentences`` enabled, text is buffered
    until the buffer ends in ".", "?" or "!".
    """

    def __init__(
        self,
        messages: list[dict],
        role: str,
        bot_participant_id=None,
        complete_sentences=True,
        pass_through=True,
    ):
        super().__init__()
        self.messages = messages
        self.role = role
        self.bot_participant_id = bot_participant_id
        self.complete_sentences = complete_sentences
        self.pass_through = pass_through
        # Text buffered so far for the sentence currently being assembled.
        self.sentence = ""

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        """Process one frame and yield zero or more resulting frames."""
        # Non-text frames are not ours to handle; forward them untouched.
        if not isinstance(frame, TextFrame):
            yield frame
            return

        # Transcriptions that originate from the bot itself are discarded.
        from_bot = (
            isinstance(frame, TranscriptionFrame)
            and frame.participantId == self.bot_participant_id
        )
        if from_bot:
            return

        # Typical pass-through use: LLM text both updates the "assistant"
        # messages and continues on to a TTS service.
        if self.pass_through:
            yield frame

        # TODO: split up transcription by participant
        if not self.complete_sentences:
            self.messages.append({"role": self.role, "content": frame.text})
            yield LLMMessagesFrame(self.messages)
            return

        self.sentence += frame.text
        if self.sentence.endswith((".", "?", "!")):
            self.messages.append(
                {"role": self.role, "content": self.sentence})
            self.sentence = ""
            yield LLMMessagesFrame(self.messages)
class LLMUserContextAggregator(LLMContextAggregator):
    """LLMContextAggregator fixed to the "user" role with pass-through off."""

    def __init__(
        self,
        messages: list[dict],
        bot_participant_id=None,
        complete_sentences=True,
    ):
        super().__init__(
            messages=messages,
            role="user",
            bot_participant_id=bot_participant_id,
            complete_sentences=complete_sentences,
            pass_through=False,
        )
class LLMAssistantContextAggregator(LLMContextAggregator):
    """LLMContextAggregator fixed to the "assistant" role with pass-through on."""

    def __init__(
        self,
        messages: list[dict],
        bot_participant_id=None,
        complete_sentences=True,
    ):
        super().__init__(
            messages=messages,
            role="assistant",
            bot_participant_id=bot_participant_id,
            complete_sentences=complete_sentences,
            pass_through=True,
        )
class SentenceAggregator(FrameProcessor):
    """This frame processor aggregates text frames into complete sentences.

    Text is buffered until a frame contains ".", "?" or "!". Everything up to
    and including the last such character in that frame is emitted (prefixed
    by the buffer); whatever follows it becomes the new buffer. On EndFrame
    any buffered remainder is emitted before the EndFrame itself.

    Frame input/output:
        TextFrame("Hello,") -> None
        TextFrame(" world.") -> TextFrame("Hello, world.")

    Doctest:
    >>> async def print_frames(aggregator, frame):
    ...     async for frame in aggregator.process_frame(frame):
    ...         print(frame.text)

    >>> aggregator = SentenceAggregator()
    >>> asyncio.run(print_frames(aggregator, TextFrame("Hello,")))
    >>> asyncio.run(print_frames(aggregator, TextFrame(" world.")))
    Hello, world.
    """

    def __init__(self):
        self.aggregation = ""

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        """Process one frame and yield zero or more resulting frames."""
        if isinstance(frame, TextFrame):
            # Greedy first group: split after the LAST terminator in the text.
            m = re.search(r"(.*[?.!])(.*)", frame.text)
            if m:
                yield TextFrame(self.aggregation + m.group(1))
                self.aggregation = m.group(2)
            else:
                self.aggregation += frame.text
        elif isinstance(frame, EndFrame):
            if self.aggregation:
                # Clear before yielding so the remainder is emitted only once
                # even if this instance keeps receiving frames afterwards.
                remainder = self.aggregation
                self.aggregation = ""
                yield TextFrame(remainder)
            yield frame
        else:
            yield frame
class LLMFullResponseAggregator(FrameProcessor):
    """Buffer TextFrames and emit them as one TextFrame at the end of a response.

    TextFrames are swallowed and their text concatenated. When an
    LLMResponseEndFrame arrives, a single TextFrame with the concatenated
    text is yielded, followed by the LLMResponseEndFrame. Any other frame is
    passed straight through.

    Given the following frames:

        TextFrame("Hello,")
        TextFrame(" world.")
        TextFrame(" I am")
        TextFrame(" an LLM.")
        LLMResponseEndFrame()

    this processor yields nothing for the first 4 frames, then

        TextFrame("Hello, world. I am an LLM.")
        LLMResponseEndFrame()

    when passed the last frame.

    >>> async def print_frames(aggregator, frame):
    ...     async for frame in aggregator.process_frame(frame):
    ...         if isinstance(frame, TextFrame):
    ...             print(frame.text)
    ...         else:
    ...             print(frame.__class__.__name__)

    >>> aggregator = LLMFullResponseAggregator()
    >>> asyncio.run(print_frames(aggregator, TextFrame("Hello,")))
    >>> asyncio.run(print_frames(aggregator, TextFrame(" world.")))
    >>> asyncio.run(print_frames(aggregator, TextFrame(" I am")))
    >>> asyncio.run(print_frames(aggregator, TextFrame(" an LLM.")))
    >>> asyncio.run(print_frames(aggregator, LLMResponseEndFrame()))
    Hello, world. I am an LLM.
    LLMResponseEndFrame
    """

    def __init__(self):
        self.aggregation = ""

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        """Process one frame and yield zero or more resulting frames."""
        if isinstance(frame, TextFrame):
            self.aggregation = self.aggregation + frame.text
            return

        if not isinstance(frame, LLMResponseEndFrame):
            yield frame
            return

        # End of response: flush the buffered text, then forward the marker.
        yield TextFrame(self.aggregation)
        yield frame
        self.aggregation = ""
class StatelessTextTransformer(FrameProcessor):
    """Apply ``transform_fn`` to the text of every TextFrame.

    The function may return the new text directly or a coroutine producing
    it. Non-text frames are forwarded unchanged.

    >>> async def print_frames(aggregator, frame):
    ...     async for frame in aggregator.process_frame(frame):
    ...         print(frame.text)

    >>> aggregator = StatelessTextTransformer(lambda x: x.upper())
    >>> asyncio.run(print_frames(aggregator, TextFrame("Hello")))
    HELLO
    """

    def __init__(self, transform_fn):
        self.transform_fn = transform_fn

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        """Process one frame and yield the transformed or original frame."""
        if not isinstance(frame, TextFrame):
            yield frame
            return

        transformed = self.transform_fn(frame.text)
        # Support async transform functions by awaiting their coroutine.
        if isinstance(transformed, Coroutine):
            transformed = await transformed
        yield TextFrame(transformed)
class ParallelPipeline(FrameProcessor):
    """Run multiple pipelines in parallel.

    Each incoming frame is put on every sub-pipeline's source queue, followed
    by an EndPipeFrame that tells the sub-pipeline runner to exit. All
    sub-pipelines write into one shared sink queue; no ordering guarantee is
    made between sub-pipelines (frames land on the sink in the order they
    are emitted).

    Since frame handlers pass through unhandled frames by convention, this
    class de-dupes frames in its sink before yielding them.
    """

    def __init__(self, pipeline_definitions: List[List[FrameProcessor]]):
        self.sources = [asyncio.Queue() for _ in pipeline_definitions]
        self.sink: asyncio.Queue[Frame] = asyncio.Queue()
        self.pipelines: list[Pipeline] = []
        for source, pipeline_definition in zip(self.sources, pipeline_definitions):
            self.pipelines.append(
                Pipeline(pipeline_definition, source, self.sink))

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        """Fan one frame out to every sub-pipeline and yield their output."""
        for queue in self.sources:
            await queue.put(frame)
            await queue.put(EndPipeFrame())

        runs = [pipeline.run_pipeline() for pipeline in self.pipelines]
        await asyncio.gather(*runs)

        # De-dup by object identity: a frame that no sub-pipeline processed
        # is passed through by each of them, so the very same object shows up
        # once per sub-pipeline.
        seen_ids = set()
        while not self.sink.empty():
            emitted = await self.sink.get()
            key = id(emitted)
            if key in seen_ids:
                continue
            seen_ids.add(key)

            # EndPipeFrames are our own flow control; don't leak them.
            if isinstance(emitted, EndPipeFrame):
                continue
            yield emitted
class GatedAggregator(FrameProcessor):
    """Accumulate frames, with custom functions to start and stop accumulation.

    While the gate is closed, frames are held back. The frame that opens the
    gate is yielded first, followed by everything held back so far; later
    frames flow through until a frame closes the gate (that frame is held
    back, not yielded).

    >>> from dailyai.pipeline.frames import ImageFrame

    >>> async def print_frames(aggregator, frame):
    ...     async for frame in aggregator.process_frame(frame):
    ...         if isinstance(frame, TextFrame):
    ...             print(frame.text)
    ...         else:
    ...             print(frame.__class__.__name__)

    >>> aggregator = GatedAggregator(
    ...     gate_close_fn=lambda x: isinstance(x, LLMResponseStartFrame),
    ...     gate_open_fn=lambda x: isinstance(x, ImageFrame),
    ...     start_open=False)
    >>> asyncio.run(print_frames(aggregator, TextFrame("Hello")))
    >>> asyncio.run(print_frames(aggregator, TextFrame("Hello again.")))
    >>> asyncio.run(print_frames(aggregator, ImageFrame(url='', image=bytes([]))))
    ImageFrame
    Hello
    Hello again.
    >>> asyncio.run(print_frames(aggregator, TextFrame("Goodbye.")))
    Goodbye.
    """

    def __init__(self, gate_open_fn, gate_close_fn, start_open):
        self.gate_open_fn = gate_open_fn
        self.gate_close_fn = gate_close_fn
        self.gate_open = start_open
        self.accumulator: List[Frame] = []

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        """Update the gate from ``frame``, then yield or hold it back."""
        # Only the predicate relevant to the current gate state is consulted.
        if self.gate_open:
            if self.gate_close_fn(frame):
                self.gate_open = False
        elif self.gate_open_fn(frame):
            self.gate_open = True

        if not self.gate_open:
            self.accumulator.append(frame)
            return

        yield frame
        for held in self.accumulator:
            yield held
        self.accumulator = []

View File

@@ -1,34 +0,0 @@
from abc import abstractmethod
from typing import AsyncGenerator
from dailyai.pipeline.frames import ControlFrame, Frame
class FrameProcessor:
    """Base class for all frame processors.

    A frame processor consumes one frame and yields 0 or more frames. They
    are generally chained in a pipeline: frames come from a source queue,
    run through a series of processors, and end up on a sink queue.

    By convention, a FrameProcessor immediately yields any frame it does not
    process. Stateful processors should watch for the EndFrame and finalize
    their output (e.g. yield an unfinished sentence when aggregating LLM
    output into sentences); EndFrame is also the moment to clean up any
    services that need to be closed, del'd, etc.
    """

    @abstractmethod
    async def process_frame(
        self, frame: Frame
    ) -> AsyncGenerator[Frame, None]:
        """Process a single frame and yield 0 or more frames.

        The default implementation yields the frame unchanged.
        """
        yield frame

    @abstractmethod
    async def interrupted(self) -> None:
        """Handle any cleanup if the pipeline was interrupted."""
        return None

    def __str__(self):
        return type(self).__name__

View File

@@ -1,187 +0,0 @@
from dataclasses import dataclass
from typing import Any, List
class Frame:
    """Root of the frame hierarchy; its string form is the class name."""

    def __str__(self):
        return type(self).__name__
class ControlFrame(Frame):
    """Base class for frames that carry no instance data.

    Control frames should contain no instance data, so equality is based
    solely on the class.
    """

    def __eq__(self, other):
        return isinstance(other, self.__class__)

    # Defining __eq__ alone sets __hash__ to None and makes every control
    # frame unhashable. Hash on the class so that instances that compare
    # equal because they share a class also hash equal.
    def __hash__(self):
        return hash(self.__class__)
class StartFrame(ControlFrame):
    """Optional frame that starts a pipeline.

    It also indicates that an interruption has ended and the transport
    should start processing frames again.
    """
class EndFrame(ControlFrame):
    """Signals that a pipeline has ended.

    Frame processors and pipelines should shut down. A transport receiving
    this frame stops sending frames to its output channel(s) and closes all
    its threads.
    """
class EndPipeFrame(ControlFrame):
    """Signals that a pipeline has ended while the transport keeps running.

    Used in parallel pipelines and other sub-pipelines.
    """
class PipelineStartedFrame(ControlFrame):
    """Sent by the transport when execution of a pipeline starts or restarts.

    It should be the first frame an app receives when it starts, or when an
    interruptible pipeline has been interrupted.
    """
class LLMResponseStartFrame(ControlFrame):
    """Marks the beginning of an LLM response.

    The TextFrames that follow belong to the response until an
    LLMResponseEndFrame is seen.
    """
class LLMResponseEndFrame(ControlFrame):
    """Marks the end of an LLM response."""
@dataclass
class AudioFrame(Frame):
    """A chunk of audio. Played by the transport if the transport's mic has
    been enabled."""

    data: bytes

    def __str__(self):
        name = self.__class__.__name__
        size = len(self.data)
        return f"{name}, size: {size} B"
@dataclass
class ImageFrame(Frame):
    """An image. Shown by the transport if the transport's camera is
    enabled."""

    url: str | None
    image: bytes

    def __str__(self):
        name = self.__class__.__name__
        size = len(self.image)
        return f"{name}, url: {self.url}, image size: {size} B"
@dataclass
class SpriteFrame(Frame):
    """An animated sprite. Shown by the transport if the transport's camera
    is enabled, played at the framerate given by the transport's `fps`
    constructor parameter."""

    images: list[bytes]

    def __str__(self):
        name = self.__class__.__name__
        count = len(self.images)
        return f"{name}, list size: {count}"
@dataclass
class TextFrame(Frame):
    """A chunk of text. Emitted by LLM services, consumed by TTS services,
    and usable for sending text through pipelines."""

    text: str

    def __str__(self):
        name = self.__class__.__name__
        return f'{name}: "{self.text}"'
@dataclass
class TranscriptionFrame(TextFrame):
    """A text frame with transcription-specific data. Placed in the
    transport's receive queue when a participant speaks."""

    participantId: str
    timestamp: str

    def __str__(self):
        name = self.__class__.__name__
        return (
            f"{name}, text: '{self.text}' "
            f"participantId: {self.participantId}, timestamp: {self.timestamp}"
        )
class TTSStartFrame(ControlFrame):
    """Marks the beginning of a TTS response.

    The AudioFrames that follow belong to the TTS response until a
    TTSEndFrame. These markers let a transport aggregate audio frames to
    optimize the size of frames sent to the session, without the TTS service
    having to control this.
    """
class TTSEndFrame(ControlFrame):
    """Marks the end of a TTS response."""
@dataclass
class LLMMessagesFrame(Frame):
    """A frame containing a list of LLM messages.

    Used to signal that an LLM service should run a chat completion and emit
    an LLMResponseStartFrame, TextFrames and an LLMResponseEndFrame.

    Note that the messages property on this class is mutable, and will be
    updated by various ResponseAggregator frame processors.
    """

    messages: List[dict]
@dataclass
class ReceivedAppMessageFrame(Frame):
    """An app message together with the identifier of its sender."""

    message: Any
    sender: str

    def __str__(self):
        sender, message = self.sender, self.message
        return f"ReceivedAppMessageFrame: sender: {sender}, message: {message}"
@dataclass
class SendAppMessageFrame(Frame):
    """An app message to send, with an optional target participant id."""

    message: Any
    participantId: str | None

    def __str__(self):
        target, message = self.participantId, self.message
        return f"SendAppMessageFrame: participantId: {target}, message: {message}"
class UserStartedSpeakingFrame(Frame):
    """Emitted by VAD when a participant has started speaking.

    Useful for interruptions, or whenever knowing that someone is speaking
    matters more than knowing what they are saying (which a
    TranscriptionFrame provides).
    """
class UserStoppedSpeakingFrame(Frame):
    """Emitted by the VAD when a user has stopped speaking."""
class BotStartedSpeakingFrame(Frame):
    """Marker frame with no payload.

    NOTE(review): presumably signals that the bot began speaking; the
    emitter is not visible here, so confirm against the transport code.
    """
class BotStoppedSpeakingFrame(Frame):
    """Marker frame with no payload.

    NOTE(review): presumably signals that the bot finished speaking; the
    emitter is not visible here, so confirm against the transport code.
    """
@dataclass
class LLMFunctionStartFrame(Frame):
    """Emitted when the LLM receives the beginning of a function call
    completion.

    A frame processor can use it as a cue to start preparing the function
    call, if it can do so in the absence of any arguments.
    """

    function_name: str
@dataclass
class LLMFunctionCallFrame(Frame):
    """Emitted once the LLM has received an entire function call completion."""

    function_name: str
    arguments: str

View File

@@ -1,12 +0,0 @@
from dataclasses import dataclass
from dailyai.pipeline.frames import Frame
from dailyai.services.openai_llm_context import OpenAILLMContext
@dataclass
class OpenAILLMContextFrame(Frame):
    """Counterpart of LLMMessagesFrame carrying extra context specific to the
    OpenAI API.

    The context in this frame is mutable and will be changed by the
    OpenAIContextAggregator frame processor.
    """

    context: OpenAILLMContext

View File

@@ -1,149 +0,0 @@
import asyncio
import logging
from typing import AsyncGenerator, AsyncIterable, Iterable, List
from dailyai.pipeline.frame_processor import FrameProcessor
from dailyai.pipeline.frames import AudioFrame, EndPipeFrame, EndFrame, Frame
class Pipeline:
    """Manage a pipe of FrameProcessors and run them in sequence.

    The "source" and "sink" queues are managed by the caller. The class can
    be used stand-alone for specialized processing, or through the
    Transport's run_pipeline method, which instantiates and runs a pipeline
    with the Transport's sink and source queues.
    """

    def __init__(
        self,
        processors: List[FrameProcessor],
        source: asyncio.Queue | None = None,
        sink: asyncio.Queue[Frame] | None = None,
        name: str | None = None,
    ):
        """Create a new pipeline.

        Source and sink queues are created when not provided, and can later
        be pointed at other queues. If this pipeline is run by a transport,
        its sink and source queues will be overridden.
        """
        self._processors: List[FrameProcessor] = processors

        self.source: asyncio.Queue[Frame] = source or asyncio.Queue()
        self.sink: asyncio.Queue[Frame] = sink or asyncio.Queue()

        self._name = name or str(id(self))

        # State used by _log_frame to collapse runs of identical log lines.
        self._logger = logging.getLogger("dailyai.pipeline")
        self._last_log_line = ""
        self._shown_repeated_log = False

    def set_source(self, source: asyncio.Queue[Frame]):
        """Set the source queue for this pipeline. Frames from this queue
        are processed by each frame_processor in the pipeline, in order from
        first to last."""
        self.source = source

    def set_sink(self, sink: asyncio.Queue[Frame]):
        """Set the sink queue for this pipeline. The output of the last
        frame_processor is placed on this queue."""
        self.sink = sink

    def add_processor(self, processor: FrameProcessor):
        """Append ``processor`` to the end of the processor list."""
        self._processors.append(processor)

    async def get_next_source_frame(self) -> AsyncGenerator[Frame, None]:
        """Yield the next frame from the source queue.

        Convenience wrapper so callers can consume frames from an
        AsyncGenerator whether they come from the source queue or from a
        frame_processor.
        """
        next_frame = await self.source.get()
        yield next_frame

    async def queue_frames(
        self,
        frames: Iterable[Frame] | AsyncIterable[Frame],
    ) -> None:
        """Insert frames directly into the pipeline's source queue.

        Typically used inside a transport participant_joined callback to
        prompt a bot to start a conversation, for example.

        Raises:
            Exception: if ``frames`` is neither iterable nor async iterable.
        """
        if isinstance(frames, AsyncIterable):
            async for frame in frames:
                await self.source.put(frame)
            return

        if not isinstance(frames, Iterable):
            raise Exception("Frames must be an iterable or async iterable")

        for frame in frames:
            await self.source.put(frame)

    async def run_pipeline(self):
        """Run the pipeline.

        Take each frame from the source queue, pass it to the first
        frame_processor, pass that processor's output to the next one, and
        so on; frames produced by the last frame_processor are placed on the
        sink queue.

        The source and sink queues must be set before calling this method.

        The loop exits after the frame taken from the source queue is an
        EndFrame or EndPipeFrame and its output has been placed on the sink.
        """
        try:
            while True:
                initial_frame = await self.source.get()
                outputs = self._run_pipeline_recursively(
                    initial_frame, self._processors
                )
                async for frame in outputs:
                    self._log_frame(frame, len(self._processors) + 1)
                    await self.sink.put(frame)

                if isinstance(initial_frame, (EndFrame, EndPipeFrame)):
                    break
        except asyncio.CancelledError:
            # There has been an interruption: give every processor a chance
            # to do any necessary cleanup.
            for processor in self._processors:
                await processor.interrupted()

    async def _run_pipeline_recursively(
        self, initial_frame: Frame, processors: List[FrameProcessor], depth=1
    ) -> AsyncGenerator[Frame, None]:
        """Feed ``initial_frame`` through ``processors``, yielding what
        comes out of the last one."""
        if not processors:
            yield initial_frame
            return

        self._log_frame(initial_frame, depth)
        async for frame in processors[0].process_frame(initial_frame):
            downstream = self._run_pipeline_recursively(
                frame, processors[1:], depth + 1
            )
            async for final_frame in downstream:
                yield final_frame

    def _log_frame(self, frame: Frame, depth: int):
        """Log a frame as it moves through the pipeline (useful for debugging).

        This function inherits the logging level from the "dailyai" logger.
        To get debug output from dailyai in general but not from this (noisy)
        function, silence it like this:

            # enable debug logging for the dailyai package.
            logger = logging.getLogger("dailyai")
            logger.setLevel(logging.DEBUG)

            # silence the pipeline logging
            logger = logging.getLogger("dailyai.pipeline")
            logger.setLevel(logging.WARNING)
        """
        if depth > 1:
            source = str(self._processors[depth - 2])
        else:
            source = "source"

        if depth < (len(self._processors) + 1):
            dest = str(self._processors[depth - 1])
        else:
            dest = "sink"

        prefix = self._name + " " * depth
        logline = prefix + " -> ".join([source, frame.__class__.__name__, dest])

        if logline != self._last_log_line:
            self._shown_repeated_log = False
            self._last_log_line = logline
            self._logger.debug(logline)
            return

        # Same line as last time: announce the repetition once, then stay quiet.
        if self._shown_repeated_log:
            return
        self._shown_repeated_log = True
        self._logger.debug(prefix + "... repeated")

View File

@@ -1,145 +0,0 @@
import io
import logging
import time
import wave
from dailyai.pipeline.frame_processor import FrameProcessor
from dailyai.pipeline.frames import (
AudioFrame,
EndFrame,
EndPipeFrame,
ImageFrame,
Frame,
TTSEndFrame,
TTSStartFrame,
TextFrame,
TranscriptionFrame,
)
from abc import abstractmethod
from typing import AsyncGenerator, BinaryIO
class AIService(FrameProcessor):
    """Frame processor base class that provides a ``logger`` attribute
    bound to the "dailyai" logger."""

    def __init__(self):
        logger_name = "dailyai"
        self.logger = logging.getLogger(logger_name)
class LLMService(AIService):
    """No-op class that serves as the common base class for LLM services."""

    def __init__(self):
        # Nothing LLM-specific to set up; only run the AIService initializer.
        super().__init__()
class TTSService(AIService):
    """Base class for text-to-speech services.

    Subclasses implement ``run_tts``. ``process_frame`` turns TextFrames into
    a TTSStartFrame, AudioFrames, a TTSEndFrame and finally a TextFrame
    carrying the spoken text. With ``aggregate_sentences`` enabled, text is
    buffered until the buffer (ignoring surrounding whitespace) ends in ".",
    "?" or "!".
    """

    def __init__(self, aggregate_sentences=True):
        super().__init__()
        self.aggregate_sentences: bool = aggregate_sentences
        self.current_sentence: str = ""

    # Some TTS services require a specific sample rate. We default to 16k
    def get_mic_sample_rate(self):
        return 16000

    # Converts the text to audio. Yields a list of audio frames that can
    # be sent to the microphone device
    @abstractmethod
    async def run_tts(self, text) -> AsyncGenerator[bytes, None]:
        # yield empty bytes here, so linting can infer what this method does
        yield bytes()

    async def wrap_tts(self, text) -> AsyncGenerator[Frame, None]:
        """Run TTS on ``text`` and bracket the audio with start/end frames."""
        yield TTSStartFrame()
        async for audio_chunk in self.run_tts(text):
            yield AudioFrame(audio_chunk)
        yield TTSEndFrame()
        yield TextFrame(text)

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        """Process one frame and yield zero or more resulting frames."""
        if isinstance(frame, (EndFrame, EndPipeFrame)):
            if self.current_sentence:
                # Clear the buffer before speaking it. An EndPipeFrame does
                # not end the service's life, and a buffer left in place
                # would be spoken again together with the next text.
                pending = self.current_sentence
                self.current_sentence = ""
                async for cleanup_frame in self.wrap_tts(pending):
                    yield cleanup_frame

        if not isinstance(frame, TextFrame):
            yield frame
            return

        text: str | None = None
        if not self.aggregate_sentences:
            text = frame.text
        else:
            self.current_sentence += frame.text
            if self.current_sentence.strip().endswith((".", "?", "!")):
                text = self.current_sentence
                self.current_sentence = ""

        if text:
            async for tts_frame in self.wrap_tts(text):
                yield tts_frame
class ImageGenService(AIService):
    """Base class for services that turn the text of a TextFrame into an
    ImageFrame. Non-text frames are forwarded unchanged."""

    def __init__(self, image_size, **kwargs):
        super().__init__(**kwargs)
        self.image_size = image_size

    @abstractmethod
    async def run_image_gen(self, sentence: str) -> tuple[str, bytes]:
        """Render an image for ``sentence``; return ``(url, image_bytes)``."""
        pass

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        """Process one frame and yield the generated or original frame."""
        if isinstance(frame, TextFrame):
            generated = await self.run_image_gen(frame.text)
            (url, image_data) = generated
            yield ImageFrame(url, image_data)
            return

        yield frame
class STTService(AIService):
    """STTService is a base class for speech-to-text services."""

    _frame_rate: int

    def __init__(self, frame_rate: int = 16000, **kwargs):
        super().__init__(**kwargs)
        self._frame_rate = frame_rate

    @abstractmethod
    async def run_stt(self, audio: BinaryIO) -> str:
        """Returns transcript as a string"""
        pass

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        """Transcribe one AudioFrame and yield a TranscriptionFrame.

        The raw audio is wrapped in a mono, 2-byte-sample WAV container at
        the configured frame rate before being handed to ``run_stt``.
        Frames that are not AudioFrames produce no output.
        """
        # NOTE(review): non-audio frames are dropped rather than passed
        # through, unlike most processors here; confirm that is intended.
        if not isinstance(frame, AudioFrame):
            return

        data = frame.data
        content = io.BufferedRandom(io.BytesIO())
        # Write into the local buffer; this class defines no `_content`.
        ww = wave.open(content, "wb")
        ww.setnchannels(1)
        ww.setsampwidth(2)
        ww.setframerate(self._frame_rate)
        ww.writeframesraw(data)
        ww.close()
        content.seek(0)
        text = await self.run_stt(content)
        yield TranscriptionFrame(text, "", str(time.time()))
class FrameLogger(AIService):
    """Report every frame that passes through, then forward it unchanged.

    Audio and image frames are logged by type at INFO level; every other
    frame is printed to stdout using its string form.
    """

    def __init__(self, prefix="Frame", **kwargs):
        super().__init__(**kwargs)
        self.prefix = prefix

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        """Report ``frame`` and yield it unchanged."""
        is_media = isinstance(frame, (AudioFrame, ImageFrame))
        if not is_media:
            print(f"{self.prefix}: {frame}")
        else:
            self.logger.info(f"{self.prefix}: {type(frame)}")

        yield frame

View File

@@ -1,44 +0,0 @@
from typing import AsyncGenerator
from dailyai.pipeline.frames import Frame, LLMMessagesFrame, TextFrame
from dailyai.services.ai_services import LLMService
try:
from anthropic import AsyncAnthropic
except ModuleNotFoundError as e:
print(f"Exception: {e}")
print(
"In order to use Anthropic, you need to `pip install dailyai[anthropic]`. Also, set `ANTHROPIC_API_KEY` environment variable.")
raise Exception(f"Missing module: {e}")
class AnthropicLLMService(LLMService):
    """LLM service that streams completions from the Anthropic Messages API.

    Args:
        api_key: Key passed to ``AsyncAnthropic``.
        model: Model name sent with each request.
        max_tokens: ``max_tokens`` value sent with each request.
    """

    def __init__(
            self,
            api_key,
            model="claude-3-opus-20240229",
            max_tokens=1024):
        super().__init__()
        self.client = AsyncAnthropic(api_key=api_key)
        self.model = model
        self.max_tokens = max_tokens

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        """Run a streamed completion for an LLMMessagesFrame.

        Any other frame is passed through untouched. Each
        ``content_block_delta`` event of the stream is yielded as a
        TextFrame.
        """
        if not isinstance(frame, LLMMessagesFrame):
            yield frame
            # Stop here: only LLMMessagesFrames should trigger a request.
            return

        # NOTE(review): assumes frame.messages holds only roles the Messages
        # API accepts in `messages` - confirm how system prompts are passed.
        stream = await self.client.messages.create(
            max_tokens=self.max_tokens,
            messages=frame.messages,
            model=self.model,
            stream=True,
        )

        async for event in stream:
            if event.type == "content_block_delta":
                yield TextFrame(event.delta.text)

View File

@@ -1,36 +0,0 @@
import aiohttp
from dailyai.services.ai_services import TTSService
class DeepgramAIService(TTSService):
    """TTS service that posts text to the Deepgram speak endpoint and
    streams back the response body."""

    def __init__(
        self,
        *,
        aiohttp_session: aiohttp.ClientSession,
        api_key,
        voice,
        sample_rate=16000
    ):
        super().__init__()
        self._aiohttp_session = aiohttp_session
        self._api_key = api_key
        self._voice = voice
        self._sample_rate = sample_rate

    async def run_tts(self, sentence):
        """Yield the non-empty chunks of the response for ``sentence``."""
        self.logger.info(f"Running deepgram tts for {sentence}")

        base_url = "https://api.beta.deepgram.com/v1/speak"
        request_url = f"{base_url}?model={self._voice}&encoding=linear16&container=none&sample_rate={self._sample_rate}"
        headers = {
            "authorization": f"token {self._api_key}",
            "Content-Type": "application/json",
        }
        data = {"text": sentence}

        request = self._aiohttp_session.post(
            request_url, headers=headers, json=data
        )
        async with request as r:
            async for chunk in r.content:
                if not chunk:
                    continue
                yield chunk

View File

@@ -1,46 +0,0 @@
import aiohttp
from typing import AsyncGenerator
from dailyai.services.ai_services import TTSService
class ElevenLabsTTSService(TTSService):
    """TTS service that streams audio from the ElevenLabs text-to-speech
    streaming endpoint (requested as ``pcm_16000``)."""

    def __init__(
        self,
        *,
        aiohttp_session: aiohttp.ClientSession,
        api_key,
        voice_id,
        model="eleven_turbo_v2",
    ):
        super().__init__()

        self._api_key = api_key
        self._voice_id = voice_id
        self._aiohttp_session = aiohttp_session
        self._model = model

    async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
        """Yield the non-empty chunks of the response for ``sentence``.

        On a non-200 response the error is logged and nothing is yielded.
        """
        url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._voice_id}/stream"
        payload = {"text": sentence, "model_id": self._model}
        querystring = {
            "output_format": "pcm_16000",
            "optimize_streaming_latency": 2}
        headers = {
            "xi-api-key": self._api_key,
            "Content-Type": "application/json",
        }

        async with self._aiohttp_session.post(
            url, json=payload, headers=headers, params=querystring
        ) as r:
            if r.status != 200:
                # `text` is a coroutine method; await it to log the actual
                # response body instead of the bound-method repr.
                error_text = await r.text()
                self.logger.error(
                    f"audio fetch status code: {r.status}, error: {error_text}"
                )
                return

            async for chunk in r.content:
                if chunk:
                    yield chunk

View File

@@ -1,58 +0,0 @@
import aiohttp
import asyncio
import io
import os
from PIL import Image
from dailyai.services.ai_services import ImageGenService
try:
import fal
except ModuleNotFoundError as e:
print(f"Exception: {e}")
print(
"In order to use Fal, you need to `pip install dailyai[fal]`. Also, set `FAL_KEY_ID` and `FAL_KEY_SECRET` environment variables.")
raise Exception(f"Missing module: {e}")
class FalImageGenService(ImageGenService):
    """Image generation service backed by a fal app.

    When given, ``key_id`` and ``key_secret`` are exported through the
    ``FAL_KEY_ID`` and ``FAL_KEY_SECRET`` environment variables.
    """

    def __init__(
        self,
        *,
        image_size,
        aiohttp_session: aiohttp.ClientSession,
        key_id=None,
        key_secret=None
    ):
        super().__init__(image_size)
        self._aiohttp_session = aiohttp_session
        credentials = (("FAL_KEY_ID", key_id), ("FAL_KEY_SECRET", key_secret))
        for env_name, value in credentials:
            if value:
                os.environ[env_name] = value

    async def run_image_gen(self, sentence) -> tuple[str, bytes]:
        """Generate an image for ``sentence``; return ``(url, image_bytes)``.

        Raises:
            Exception: if the result contains no image URL.
        """

        def get_image_url(sentence, size):
            # Blocking fal client calls; executed in a worker thread below.
            handler = fal.apps.submit(
                "110602490-fast-sdxl",
                # "fal-ai/fast-sdxl",
                arguments={"prompt": sentence},
            )
            # Drain the event stream; progress events need no handling.
            for event in handler.iter_events():
                if isinstance(event, fal.apps.InProgress):
                    continue

            result = handler.get()
            image_url = None
            if result:
                image_url = result["images"][0]["url"]
            if not image_url:
                raise Exception("Image generation failed")
            return image_url

        image_url = await asyncio.to_thread(
            get_image_url, sentence, self.image_size)

        # Load the image from the url
        async with self._aiohttp_session.get(image_url) as response:
            raw = await response.content.read()
            image = Image.open(io.BytesIO(raw))
            return (image_url, image.tobytes())

View File

@@ -1,74 +0,0 @@
import array
import io
import math
import time
from typing import AsyncGenerator
import wave
from dailyai.pipeline.frames import AudioFrame, Frame, TranscriptionFrame
from dailyai.services.ai_services import STTService
class LocalSTTService(STTService):
    """STT base class that buffers audio locally and transcribes on pauses.

    Audio frames whose RMS volume reaches ``min_rms`` are appended to an
    in-memory WAV buffer. Once more than ``max_silence_frames`` quieter
    frames have been seen and the buffer is non-empty, the buffer is passed
    to ``run_stt`` and a TranscriptionFrame is yielded.
    """

    _content: io.BufferedRandom
    _wave: wave.Wave_write
    _current_silence_frames: int

    # Configuration
    _min_rms: int
    _max_silence_frames: int
    _frame_rate: int

    def __init__(self,
                 min_rms: int = 400,
                 max_silence_frames: int = 3,
                 frame_rate: int = 16000,
                 **kwargs):
        super().__init__(frame_rate, **kwargs)
        self._current_silence_frames = 0
        self._min_rms = min_rms
        self._max_silence_frames = max_silence_frames
        self._frame_rate = frame_rate
        self._new_wave()

    def _new_wave(self):
        """Creates a new wave object and content buffer."""
        self._content = io.BufferedRandom(io.BytesIO())
        ww = wave.open(self._content, "wb")
        ww.setnchannels(1)
        ww.setsampwidth(2)
        ww.setframerate(self._frame_rate)
        self._wave = ww

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        """Processes a frame of audio data, either buffering or transcribing it."""
        if not isinstance(frame, AudioFrame):
            yield frame
            return

        data = frame.data
        # Try to filter out empty background noise
        # (Very rudimentary approach, can be improved)
        rms = self._get_volume(data)
        if rms >= self._min_rms:
            # If volume is high enough, write new data to wave file. This is
            # speech, so it also ends any pause that was being counted.
            self._wave.writeframesraw(data)
            self._current_silence_frames = 0
            return

        # If buffer is not empty and we detect a pause in speech longer than
        # the configured number of frames, transcribe the audio gathered so far.
        if self._content.tell() > 0 and self._current_silence_frames > self._max_silence_frames:
            self._current_silence_frames = 0
            self._wave.close()
            self._content.seek(0)
            text = await self.run_stt(self._content)
            self._new_wave()
            yield TranscriptionFrame(text, '', str(time.time()))

        # If we get this far, this is a frame of silence
        self._current_silence_frames += 1

    def _get_volume(self, audio: bytes) -> float:
        """Return the RMS of ``audio`` read as signed 16-bit samples.

        An empty buffer has a volume of 0.0.
        """
        # https://docs.python.org/3/library/array.html
        audio_array = array.array('h', audio)
        if not audio_array:
            # Avoid dividing by zero on an empty chunk.
            return 0.0
        squares = [sample**2 for sample in audio_array]
        mean = sum(squares) / len(audio_array)
        rms = math.sqrt(mean)
        return rms

View File

@@ -1,56 +0,0 @@
import aiohttp
from PIL import Image
import io
from dailyai.services.ai_services import ImageGenService
from dailyai.services.openai_api_llm_service import BaseOpenAILLMService
try:
from openai import AsyncOpenAI
except ModuleNotFoundError as e:
print(f"Exception: {e}")
print(
"In order to use OpenAI, you need to `pip install dailyai[openai]`. Also, set `OPENAI_API_KEY` environment variable.")
raise Exception(f"Missing module: {e}")
class OpenAILLMService(BaseOpenAILLMService):
    """BaseOpenAILLMService whose model defaults to "gpt-4"."""

    def __init__(self, model="gpt-4", *args, **kwargs):
        # Everything is forwarded unchanged to the base service.
        super().__init__(model, *args, **kwargs)
class OpenAIImageGenService(ImageGenService):
    """Image generation service backed by the OpenAI images API.

    Args:
        image_size: Size string forwarded to the image generation request.
        aiohttp_session: Session used to download the generated image.
        api_key: Key passed to ``AsyncOpenAI``.
        model: Image model name.
    """

    def __init__(
        self,
        *,
        image_size: str,
        aiohttp_session: aiohttp.ClientSession,
        api_key,
        model="dall-e-3",
    ):
        super().__init__(image_size=image_size)
        self._model = model
        self._client = AsyncOpenAI(api_key=api_key)
        self._aiohttp_session = aiohttp_session

    async def run_image_gen(self, sentence) -> tuple[str, bytes]:
        """Generate an image for ``sentence``; return ``(url, image_bytes)``.

        Raises:
            Exception: if the response carries no image URL.
        """
        # The message needs a placeholder for the argument; without one the
        # logging module reports a string-formatting error when emitting.
        self.logger.info("Generating OpenAI image: %s", sentence)

        image = await self._client.images.generate(
            prompt=sentence,
            model=self._model,
            n=1,
            size=self.image_size
        )
        image_url = image.data[0].url
        if not image_url:
            raise Exception("No image provided in response", image)

        # Load the image from the url
        async with self._aiohttp_session.get(image_url) as response:
            image_stream = io.BytesIO(await response.content.read())
            image = Image.open(image_stream)
            return (image_url, image.tobytes())

View File

@@ -1,61 +0,0 @@
from typing import List
try:
from openai._types import NOT_GIVEN, NotGiven
from openai.types.chat import (
ChatCompletionToolParam,
ChatCompletionToolChoiceOptionParam,
ChatCompletionMessageParam,
)
except ModuleNotFoundError as e:
print(f"Exception: {e}")
print(
"In order to use OpenAI, you need to `pip install dailyai[openai]`. Also, set `OPENAI_API_KEY` environment variable.")
raise Exception(f"Missing module: {e}")
class OpenAILLMContext:
    """Holds the messages, tools and tool choice for an OpenAI chat request."""

    def __init__(
        self,
        messages: List[ChatCompletionMessageParam] | None = None,
        tools: List[ChatCompletionToolParam] | NotGiven = NOT_GIVEN,
        tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven = NOT_GIVEN
    ):
        # A falsy ``messages`` (None or an empty list) is replaced by a new list.
        self.messages: List[ChatCompletionMessageParam] = messages or []
        self.tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven = tool_choice
        self.tools: List[ChatCompletionToolParam] | NotGiven = tools

    @staticmethod
    def from_messages(messages: List[dict]) -> "OpenAILLMContext":
        """Build a context from plain dicts with ``content`` and ``role`` keys.

        A missing ``name`` key falls back to the message's ``role``.
        """
        context = OpenAILLMContext()
        for entry in messages:
            role = entry["role"]
            context.add_message({
                "content": entry["content"],
                "role": role,
                "name": entry["name"] if "name" in entry else role
            })
        return context

    def add_message(self, message: ChatCompletionMessageParam):
        """Append one message to the context."""
        self.messages.append(message)

    def get_messages(self) -> List[ChatCompletionMessageParam]:
        """Return the underlying message list (not a copy)."""
        return self.messages

    def set_tool_choice(
        self, tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven
    ):
        """Replace the stored tool choice."""
        self.tool_choice = tool_choice

    def set_tools(
            self,
            tools: List[ChatCompletionToolParam] | NotGiven = NOT_GIVEN):
        """Replace the stored tools; an empty list is stored as ``NOT_GIVEN``."""
        is_empty_list = tools != NOT_GIVEN and len(tools) == 0
        self.tools = NOT_GIVEN if is_empty_list else tools

View File

@@ -1,41 +0,0 @@
from abc import abstractmethod
import asyncio
import logging
import time
from dailyai.pipeline.frame_processor import FrameProcessor
from dailyai.pipeline.pipeline import Pipeline
class AbstractTransport:
    """Common state shared by transports: three asyncio queues plus media options."""

    def __init__(self, **kwargs):
        def option(name, fallback):
            # A missing key and a falsy value (None, 0, False, "") both
            # resolve to the fallback.
            return kwargs.get(name) or fallback

        self.send_queue = asyncio.Queue()
        self.receive_queue = asyncio.Queue()
        self.completed_queue = asyncio.Queue()

        # Absolute wall-clock time after which the transport is expired.
        self._expiration = time.time() + option("duration_minutes", 10) * 60

        self._mic_enabled = option("mic_enabled", False)
        self._mic_sample_rate = option("mic_sample_rate", 16000)
        self._camera_enabled = option("camera_enabled", False)
        self._camera_width = option("camera_width", 1024)
        self._camera_height = option("camera_height", 768)
        self._speaker_enabled = option("speaker_enabled", False)
        self._speaker_sample_rate = option("speaker_sample_rate", 16000)
        self._fps = option("fps", 8)

        self._logger: logging.Logger = logging.getLogger("dailyai.transport")

    @abstractmethod
    async def run(self, pipeline: Pipeline, override_pipeline_source_queue=True):
        """Run the transport, optionally driving ``pipeline``."""

    @abstractmethod
    async def run_interruptible_pipeline(
        self,
        pipeline: Pipeline,
        pre_processor: FrameProcessor | None = None,
        post_processor: FrameProcessor | None = None,
    ):
        """Run ``pipeline`` with optional pre- and post-processors."""

View File

@@ -1,314 +0,0 @@
import asyncio
import inspect
import logging
import signal
import threading
import types
from functools import partial
from typing import Any
from dailyai.pipeline.frames import (
ReceivedAppMessageFrame,
TranscriptionFrame,
)
from threading import Event
try:
from daily import (
EventHandler,
CallClient,
Daily,
VirtualCameraDevice,
VirtualMicrophoneDevice,
VirtualSpeakerDevice,
)
except ModuleNotFoundError as e:
print(f"Exception: {e}")
print(
"In order to use the Daily transport, you need to `pip install dailyai[daily]`.")
raise Exception(f"Missing module: {e}")
from dailyai.transports.threaded_transport import ThreadedTransport
# Channel count handed to Daily.create_native_vad().
NUM_CHANNELS = 1
# A native-VAD confidence above this value is treated as speech.
SPEECH_THRESHOLD = 0.90
# Value passed as reset_period_ms to Daily.create_native_vad().
VAD_RESET_PERIOD_MS = 2000
# Transport built on the `daily` package: it is both a ThreadedTransport and
# the EventHandler that is handed to CallClient in _prerun().
class DailyTransport(ThreadedTransport, EventHandler):
# Class-level flag set by _prerun() once Daily.init() has been called.
_daily_initialized = False
# Lock taken by _prerun() around the Daily.init() call.
_lock = threading.Lock()
# Annotation-only declarations; no value is assigned at class level.
_speaker_enabled: bool
_speaker_sample_rate: int
_vad_enabled: bool
# This is necessary to override EventHandler's __new__ method.
# The constructor arguments are deliberately not forwarded to super().__new__.
def __new__(cls, *args, **kwargs):
return super().__new__(cls)
def __init__(
    self,
    room_url: str,
    token: str | None,
    bot_name: str,
    min_others_count: int = 1,
    start_transcription: bool = False,
    **kwargs,
):
    """Record the room settings and create the native VAD analyzer.

    Args:
        room_url: URL passed to ``CallClient.join`` by ``_prerun``.
        token: Meeting token, or ``None``.
        bot_name: User name set on the call client.
        min_others_count: Threshold used by ``on_participant_left``.
        start_transcription: Whether ``_prerun`` may start transcription.
        **kwargs: Forwarded to the parent initializer.
    """
    # Always tell the parent initializer that a webrtc VAD is available.
    kwargs['has_webrtc_vad'] = True
    # This will call ThreadedTransport.__init__ method, not EventHandler
    super().__init__(**kwargs)

    self._logger: logging.Logger = logging.getLogger("dailyai")

    # Connection settings.
    self._room_url: str = room_url
    self._token: str | None = token
    self._bot_name: str = bot_name
    self._min_others_count = min_others_count
    self._start_transcription = start_transcription

    # Runtime state.
    self._is_interrupted = Event()
    self._stop_threads = Event()
    self._other_participant_has_joined = False
    self._my_participant_id = None
    self._event_handlers = {}

    self.transcription_settings = {
        "language": "en",
        "tier": "nova",
        "model": "2-conversationalai",
        "profanity_filter": True,
        "redact": False,
        "extra": {
            "endpointing": True,
            "punctuate": False,
        },
    }

    self.webrtc_vad = Daily.create_native_vad(
        reset_period_ms=VAD_RESET_PERIOD_MS,
        sample_rate=self._speaker_sample_rate,
        channels=NUM_CHANNELS
    )
def _patch_method(self, event_name, *args, **kwargs):
try:
for handler in self._event_handlers[event_name]:
if inspect.iscoroutinefunction(handler):
if self._loop:
future = asyncio.run_coroutine_threadsafe(
handler(*args, **kwargs), self._loop)
# wait for the coroutine to finish. This will also
# raise any exceptions raised by the coroutine.
future.result()
else:
raise Exception(
"No event loop to run coroutine. In order to use async event handlers, you must run the DailyTransportService in an asyncio event loop.")
else:
handler(*args, **kwargs)
except Exception as e:
self._logger.error(f"Exception in event handler {event_name}: {e}")
raise e
def _webrtc_vad_analyze(self):
buffer = self.read_audio_frames(int(self._vad_samples))
if len(buffer) > 0:
confidence = self.webrtc_vad.analyze_frames(buffer)
# yeses = int(confidence * 20.0)
# nos = 20 - yeses
# out = "!" * yeses + "." * nos
# print(f"!!! confidence: {out} {confidence}")
talking = confidence > SPEECH_THRESHOLD
return talking
def add_event_handler(self, event_name: str, handler):
    """Register ``handler`` to run whenever ``event_name`` fires.

    The first registration replaces the instance's ``event_name`` attribute
    with a dispatcher that calls the original method followed by every
    registered handler. Handlers are bound to this instance, so they receive
    it as their first argument.

    Args:
        event_name: Name of an existing method; must start with ``on_``.
        handler: Callable taking the transport followed by the event arguments.

    Raises:
        Exception: If the name lacks the ``on_`` prefix or no such method exists.
    """
    if not event_name.startswith("on_"):
        raise Exception(
            f"Event handler {event_name} must start with 'on_'")

    bound_handler = types.MethodType(handler, self)

    # Once an event is patched, the instance attribute is a partial rather
    # than a method, so the existence check below would reject it. Handle
    # the already-registered case first.
    if event_name in self._event_handlers:
        self._event_handlers[event_name].append(bound_handler)
        return

    if not inspect.ismethod(getattr(self, event_name, None)):
        raise Exception(f"Event handler {event_name} not found")

    self._event_handlers[event_name] = [
        getattr(self, event_name), bound_handler]
    setattr(self, event_name, partial(self._patch_method, event_name))
def event_handler(self, event_name: str):
    """Decorator form of ``add_event_handler`` for ``event_name``.

    The decorated function is registered and returned unchanged.
    """
    def register(fn):
        self.add_event_handler(event_name, fn)
        return fn

    return register
def write_frame_to_camera(self, frame: bytes):
self.camera.write_frame(frame)
def write_frame_to_mic(self, frame: bytes):
self.mic.write_frames(frame)
def send_app_message(self, message: Any, participantId: str | None):
self.client.send_app_message(message, participantId)
def read_audio_frames(self, desired_frame_count):
bytes = self._speaker.read_frames(desired_frame_count)
return bytes
def _prerun(self):
    """Initialize Daily, create the virtual devices and join the room.

    Also records the local participant id, unsubscribes from remote camera
    tracks, optionally starts transcription, and installs a SIGINT handler
    that leaves the call before delegating to the previous handler.
    """
    # Only initialize Daily once. The flag is re-checked while holding the
    # lock so that two transports starting at the same time cannot both
    # call Daily.init().
    if not DailyTransport._daily_initialized:
        with DailyTransport._lock:
            if not DailyTransport._daily_initialized:
                Daily.init()
                DailyTransport._daily_initialized = True

    self.client = CallClient(event_handler=self)

    if self._mic_enabled:
        self.mic: VirtualMicrophoneDevice = Daily.create_microphone_device(
            "mic", sample_rate=self._mic_sample_rate, channels=1
        )

    if self._camera_enabled:
        self.camera: VirtualCameraDevice = Daily.create_camera_device(
            "camera", width=self._camera_width, height=self._camera_height, color_format="RGB")

    # The speaker device is also the audio source for VAD.
    if self._speaker_enabled or self._vad_enabled:
        self._speaker: VirtualSpeakerDevice = Daily.create_speaker_device(
            "speaker", sample_rate=self._speaker_sample_rate, channels=1
        )
        Daily.select_speaker_device("speaker")

    self.client.set_user_name(self._bot_name)
    self.client.join(
        self._room_url,
        self._token,
        completion=self.call_joined,
        client_settings={
            "inputs": {
                "camera": {
                    "isEnabled": True,
                    "settings": {
                        "deviceId": "camera",
                    },
                },
                "microphone": {
                    "isEnabled": True,
                    "settings": {
                        "deviceId": "mic",
                        "customConstraints": {
                            "autoGainControl": {"exact": False},
                            "echoCancellation": {"exact": False},
                            "noiseSuppression": {"exact": False},
                        },
                    },
                },
            },
            "publishing": {
                "camera": {
                    "sendSettings": {
                        "maxQuality": "low",
                        "encodings": {
                            "low": {
                                "maxBitrate": 250000,
                                "scaleResolutionDownBy": 1.333,
                                "maxFramerate": 8,
                            }
                        },
                    }
                }
            },
        },
    )
    self._my_participant_id = self.client.participants()["local"]["id"]

    self.client.update_subscription_profiles({
        "base": {
            "camera": "unsubscribed",
        }
    })

    # Transcription is only started when a token was supplied.
    if self._token and self._start_transcription:
        self.client.start_transcription(self.transcription_settings)

    # Remember the previous SIGINT handler so process_interrupt_handler can
    # chain to it.
    self.original_sigint_handler = signal.getsignal(signal.SIGINT)
    signal.signal(signal.SIGINT, self.process_interrupt_handler)
def process_interrupt_handler(self, signum, frame):
self._post_run()
if callable(self.original_sigint_handler):
self.original_sigint_handler(signum, frame)
def _post_run(self):
self.client.leave()
self.client.release()
def on_first_other_participant_joined(self):
    """Hook called by ``on_participant_joined`` for the first remote participant; no-op here."""


def call_joined(self, join_data, client_error):
    """Completion callback passed to ``client.join``; intentionally does nothing."""
    # self._logger.info(f"Call_joined: {join_data}, {client_error}")


def dialout(self, number):
    """Start a dial-out to ``number`` through the call client."""
    settings = {"phoneNumber": number}
    self.client.start_dialout(settings)


def start_recording(self):
    """Ask the call client to start recording."""
    self.client.start_recording()


def on_error(self, error):
    """Log an error reported through the event handler interface."""
    self._logger.error(f"on_error: {error}")


def on_call_state_updated(self, state):
    """Call-state callback; no-op here."""
def on_participant_joined(self, participant):
    """Fire ``on_first_other_participant_joined`` once, for the first non-local participant."""
    if self._other_participant_has_joined:
        return
    if participant["id"] == self._my_participant_id:
        return
    self._other_participant_has_joined = True
    self.on_first_other_participant_joined()


def on_participant_left(self, participant, reason):
    """Request shutdown when fewer than ``_min_others_count + 1`` participants remain."""
    remaining = len(self.client.participants())
    if remaining < self._min_others_count + 1:
        self._stop_threads.set()
def on_app_message(self, message: Any, sender: str):
    """Queue a received app message for the pipeline.

    The message is wrapped in a ``ReceivedAppMessageFrame`` and put on the
    receive queue via the event loop. Nothing happens when no loop is set.
    """
    if not self._loop:
        return
    frame = ReceivedAppMessageFrame(message, sender)
    # Debug output goes through the logger instead of stdout.
    self._logger.debug(f"Received app message frame: {frame}")
    asyncio.run_coroutine_threadsafe(
        self.receive_queue.put(frame), self._loop
    )
def on_transcription_message(self, message: dict):
    """Queue a ``TranscriptionFrame`` for transcriptions from other participants.

    The speaker id is read from ``participantId``, then ``session_id``, and
    defaults to an empty string. Messages are ignored when there is no event
    loop, when the local participant id is unknown, or when the speaker is
    the local participant.
    """
    if not self._loop:
        return

    if "participantId" in message:
        speaker_id = message["participantId"]
    elif "session_id" in message:
        speaker_id = message["session_id"]
    else:
        speaker_id = ""

    if not self._my_participant_id or speaker_id == self._my_participant_id:
        return

    frame = TranscriptionFrame(
        message["text"], speaker_id, message["timestamp"])
    asyncio.run_coroutine_threadsafe(
        self.receive_queue.put(frame), self._loop)
def on_transcription_error(self, message):
    """Log a transcription error."""
    self._logger.error(f"Transcription error: {message}")


def on_transcription_started(self, status):
    """Transcription-started callback; no-op here."""


def on_transcription_stopped(self, stopped_by, stopped_by_error):
    """Transcription-stopped callback; no-op here."""

View File

@@ -1,93 +0,0 @@
import asyncio
import numpy as np
import tkinter as tk
from dailyai.transports.threaded_transport import ThreadedTransport
try:
import pyaudio
except ModuleNotFoundError as e:
print(f"Exception: {e}")
print(
"In order to use the local transport, you need to `pip install dailyai[local]`. On MacOS, you also need to `brew install portaudio`.")
raise Exception(f"Missing module: {e}")
class LocalTransport(ThreadedTransport):
    """Transport that uses PyAudio streams for audio and a tkinter label for video."""

    def __init__(self, **kwargs):
        """Read the local options on top of the parent's.

        Keyword Args:
            sample_width: Bytes per audio sample (default 2).
            n_channels: Number of audio channels (default 1).
            tk_root: tkinter root; required when the camera is enabled.

        Raises:
            ValueError: If the camera is enabled without a tkinter root.
        """
        super().__init__(**kwargs)
        self._sample_width = kwargs.get("sample_width") or 2
        self._n_channels = kwargs.get("n_channels") or 1
        self._tk_root = kwargs.get("tk_root") or None

        if self._camera_enabled and not self._tk_root:
            raise ValueError(
                "If camera is enabled, a tkinter root must be provided")

        if self._speaker_enabled:
            self._speaker_buffer_pending = bytearray()

    async def _write_frame_to_tkinter(self, frame: bytes):
        """Show raw pixel data in the image label by wrapping it in a P6 header."""
        data = f"P6 {self._camera_width} {self._camera_height} 255 ".encode() + \
            frame
        photo = tk.PhotoImage(
            width=self._camera_width,
            height=self._camera_height,
            data=data,
            format="PPM")
        self._image_label.config(image=photo)

        # This holds a reference to the photo, preventing it from being garbage
        # collected.
        self._image_label.image = photo  # type: ignore

    def write_frame_to_camera(self, frame: bytes):
        """Schedule the frame for display on the event loop, if camera and loop exist."""
        if self._camera_enabled and self._loop:
            asyncio.run_coroutine_threadsafe(
                self._write_frame_to_tkinter(frame), self._loop
            )

    def write_frame_to_mic(self, frame: bytes):
        """Write audio bytes to the output stream when the mic is enabled."""
        if self._mic_enabled:
            self._audio_stream.write(frame)

    def read_frames(self, desired_frame_count):
        """Read from the input stream; returns ``b""`` when the speaker is disabled."""
        if not self._speaker_enabled:
            return b""
        return self._speaker_stream.read(
            desired_frame_count,
            exception_on_overflow=False,
        )

    def read_audio_frames(self, desired_frame_count):
        """Override of the ``ThreadedTransport`` hook; delegates to ``read_frames``.

        The parent's audio and VAD threads call ``read_audio_frames``, so
        without this override they would only ever see the parent's empty
        default.
        """
        return self.read_frames(desired_frame_count)

    def _prerun(self):
        """Open the audio streams and create the image label as configured."""
        # Both the mic and the speaker stream need the PyAudio instance.
        if self._mic_enabled or self._speaker_enabled:
            self._pyaudio = pyaudio.PyAudio()

        if self._mic_enabled:
            self._audio_stream = self._pyaudio.open(
                format=self._pyaudio.get_format_from_width(self._sample_width),
                channels=self._n_channels,
                # The stream that plays "mic" audio runs at the mic sample rate.
                rate=self._mic_sample_rate,
                output=True,
            )

        if self._camera_enabled:
            # Start with a neutral gray background.
            # NOTE(review): "P5" is the grayscale header while the buffer
            # holds three bytes per pixel; _write_frame_to_tkinter uses "P6".
            # Confirm which is intended.
            array = np.ones((1024, 1024, 3)) * 128
            data = f"P5 {1024} {1024} 255 ".encode(
            ) + array.astype(np.uint8).tobytes()
            photo = tk.PhotoImage(
                width=1024,
                height=1024,
                data=data,
                format="PPM")
            self._image_label = tk.Label(self._tk_root, image=photo)
            # Keep a reference so the image is not garbage collected once
            # this method returns.
            self._image_label.image = photo  # type: ignore
            self._image_label.pack()

        if self._speaker_enabled:
            self._speaker_stream = self._pyaudio.open(
                format=self._pyaudio.get_format_from_width(self._sample_width),
                channels=self._n_channels,
                rate=self._speaker_sample_rate,
                frames_per_buffer=self._speaker_sample_rate,
                input=True
            )

View File

@@ -1,497 +0,0 @@
from abc import abstractmethod
import asyncio
import itertools
import numpy as np
import queue
import threading
import time
from typing import Any, AsyncGenerator
from enum import Enum
from dailyai.pipeline.frame_processor import FrameProcessor
from dailyai.pipeline.frames import (
SendAppMessageFrame,
AudioFrame,
EndFrame,
ImageFrame,
Frame,
PipelineStartedFrame,
SpriteFrame,
StartFrame,
TextFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame,
)
from dailyai.pipeline.pipeline import Pipeline
from dailyai.services.ai_services import TTSService
from dailyai.transports.abstract_transport import AbstractTransport
# Provided by Alexander Veysov
def int2float(sound):
    """Convert an int16 sample array to squeezed float32 scaled by 1/32768.

    If a ``ValueError`` occurs (for example, taking the maximum of an empty
    array), the input is returned unchanged.
    """
    try:
        peak = np.abs(sound).max()
        converted = sound.astype("float32")
        # All-zero input needs no scaling.
        if peak > 0:
            converted *= 1 / 32768
        # depends on the use case
        return converted.squeeze()
    except ValueError:
        return sound
class VADState(Enum):
    """States of the voice-activity state machine."""

    # No speech is being detected.
    QUIET = 1
    # Speech was detected but not yet for long enough to count as speaking.
    STARTING = 2
    # The user is considered to be speaking.
    SPEAKING = 3
    # Speech stopped but not yet for long enough to count as quiet.
    STOPPING = 4
# Transport base that adds worker threads, a thread-safe send queue and
# voice-activity detection on top of AbstractTransport.
class ThreadedTransport(AbstractTransport):
# Read the VAD options from kwargs, try to load Silero when VAD is enabled,
# and set up the threading primitives.
def __init__(
self,
**kwargs,
) -> None:
super().__init__(**kwargs)
# `kwargs.get(...) or default` means a missing key and a falsy value both
# resolve to the default.
self._vad_start_s = kwargs.get("vad_start_s") or 0.2
self._vad_stop_s = kwargs.get("vad_stop_s") or 0.8
self._context = kwargs.get("context") or []
self._vad_enabled = kwargs.get("vad_enabled") or False
self._has_webrtc_vad = kwargs.get("has_webrtc_vad") or False
if self._vad_enabled and self._speaker_enabled:
raise Exception(
"Sorry, you can't use speaker_enabled and vad_enabled at the same time. Please set one to False."
)
# Number of samples read per VAD analysis; replaced below when falling back
# to the webrtc VAD.
self._vad_samples = 1536
if self._vad_enabled:
try:
# torch is imported lazily and published as a module global because
# _silero_vad_analyze refers to it by name.
global torch, torchaudio
import torch
# We don't use torchaudio here, but we need to try importing it because
# Silero uses it
import torchaudio
torch.set_num_threads(1)
(self.model, self.utils) = torch.hub.load(
repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=False
)
self._logger.debug("Loaded Silero VAD")
except ModuleNotFoundError as e:
# Without torch, fall back to the webrtc VAD when the subclass provides one.
if self._has_webrtc_vad:
self._logger.debug(
f"Couldn't load torch; using webrtc VAD")
# One hundredth of a second of audio at the speaker sample rate.
self._vad_samples = int(self._speaker_sample_rate / 100.0)
else:
self._logger.error(f"Exception: {e}")
self._logger.error(
"In order to use Silero VAD, you'll need to `pip install dailyai[silero].")
raise Exception(f"Missing module(s): {e}")
# Duration in seconds of one VAD chunk, then the start/stop windows expressed
# as a number of chunks.
vad_frame_s = self._vad_samples / self._speaker_sample_rate
self._vad_start_frames = round(self._vad_start_s / vad_frame_s)
self._vad_stop_frames = round(self._vad_stop_s / vad_frame_s)
self._vad_starting_count = 0
self._vad_stopping_count = 0
self._vad_state = VADState.QUIET
self._user_is_speaking = False
# Queue read by the frame consumer thread; filled by _marshal_frames.
self._threadsafe_send_queue = queue.Queue()
self._images = None
# Capture the running event loop if there is one; otherwise remember None.
try:
self._loop: asyncio.AbstractEventLoop | None = asyncio.get_running_loop()
except RuntimeError:
self._loop = None
self._stop_threads = threading.Event()
self._is_interrupted = threading.Event()
# Run the transport: call the _prerun hook, start the worker threads and the
# optional pipeline task, idle until the expiration time passes or
# _stop_threads is set, then shut everything down.
async def run(self, pipeline: Pipeline | None = None, override_pipeline_source_queue=True):
self._prerun()
# Task that moves frames from the asyncio send queue to the thread-safe queue.
async_output_queue_marshal_task = asyncio.create_task(
self._marshal_frames())
# All worker threads are daemon threads.
self._camera_thread = threading.Thread(
target=self._run_camera, daemon=True)
self._camera_thread.start()
self._frame_consumer_thread = threading.Thread(
target=self._frame_consumer, daemon=True
)
self._frame_consumer_thread.start()
# The audio-receive and VAD threads are only started when enabled.
if self._speaker_enabled:
self._receive_audio_thread = threading.Thread(
target=self._receive_audio, daemon=True
)
self._receive_audio_thread.start()
if self._vad_enabled:
self._vad_thread = threading.Thread(target=self._vad, daemon=True)
self._vad_thread.start()
pipeline_task = None
if pipeline:
pipeline_task = asyncio.create_task(
self.run_pipeline(pipeline, override_pipeline_source_queue)
)
# Poll once per second until expiry or an explicit stop.
try:
while time.time() < self._expiration and not self._stop_threads.is_set():
await asyncio.sleep(1)
except Exception as e:
self._logger.error(f"Exception {e}")
raise e
finally:
# Do anything that must be done to clean up
self._post_run()
self._stop_threads.set()
if pipeline_task:
pipeline_task.cancel()
# The EndFrame lets _marshal_frames and the frame consumer thread terminate.
await self.send_queue.put(EndFrame())
await async_output_queue_marshal_task
# Note: the camera thread is started above but not joined here.
self._frame_consumer_thread.join()
if self._speaker_enabled:
self._receive_audio_thread.join()
if self._vad_enabled:
self._vad_thread.join()
async def run_pipeline(self, pipeline: Pipeline, override_pipeline_source_queue=True):
    """Wire ``pipeline`` to this transport's queues and run it.

    The sink is always the send queue. The source is set to the receive
    queue unless ``override_pipeline_source_queue`` is false.
    """
    pipeline.set_sink(self.send_queue)
    if override_pipeline_source_queue:
        pipeline.set_source(self.receive_queue)

    await pipeline.run_pipeline()
async def run_interruptible_pipeline(
    self,
    pipeline: Pipeline,
    pre_processor: FrameProcessor | None = None,
    post_processor: FrameProcessor | None = None,
):
    """Run ``pipeline`` so that it is restarted whenever the user starts speaking.

    Received frames are optionally passed through ``pre_processor`` before
    being fed to the pipeline. Frames from the completed queue are
    optionally passed through ``post_processor`` purely to update its state.

    Args:
        pipeline: Pipeline whose sink is set to the send queue.
        pre_processor: Optional processor applied to every received frame.
        post_processor: Optional processor fed with completed frames.
    """
    pipeline.set_sink(self.send_queue)
    source_queue = asyncio.Queue()
    pipeline.set_source(source_queue)
    pipeline_task = asyncio.create_task(pipeline.run_pipeline())

    async def yield_frame(frame: Frame) -> AsyncGenerator[Frame, None]:
        # Pass-through used when there is no pre-processor.
        yield frame

    async def post_process(post_processor: FrameProcessor):
        while True:
            frame = await self.completed_queue.get()

            # We ignore the output of the post_processor's process frame;
            # this is called to update the post-processor's state.
            async for frame in post_processor.process_frame(frame):
                pass

            if isinstance(frame, EndFrame):
                break

    # Without a post-processor there is no such task. Keep an explicit None
    # so the final gather does not reference an unbound name.
    post_process_task = None
    if post_processor:
        post_process_task = asyncio.create_task(
            post_process(post_processor))

    # NOTE(review): `started` is reset on interruption but never set to
    # True, so a StartFrame is sent for every received frame. Confirm
    # whether that is intended.
    started = False

    async for frame in self.get_receive_frames():
        if isinstance(frame, UserStartedSpeakingFrame):
            # Abandon the current run and start a fresh one.
            pipeline_task.cancel()
            self.interrupt()
            pipeline_task = asyncio.create_task(pipeline.run_pipeline())
            started = False

        if not started:
            await self.send_queue.put(StartFrame())

        if pre_processor:
            frame_generator = pre_processor.process_frame(frame)
        else:
            frame_generator = yield_frame(frame)

        async for frame in frame_generator:
            await source_queue.put(frame)

        if isinstance(frame, EndFrame):
            break

    pending_tasks = [pipeline_task]
    if post_process_task is not None:
        pending_tasks.append(post_process_task)
    await asyncio.gather(*pending_tasks)
async def say(self, text: str, tts: TTSService):
    """Speak ``text`` by queuing the TTS output directly on the send queue.

    Use with caution; this bypasses any running pipelines.
    """
    spoken_frames = tts.process_frame(TextFrame(text))
    async for frame in spoken_frames:
        await self.send_queue.put(frame)
def _post_run(self):
# Note that this function must be idempotent! It can be called multiple times
# if, for example, a keyboard interrupt occurs.
pass
def stop(self):
self._stop_threads.set()
async def stop_when_done(self):
await self._wait_for_send_queue_to_empty()
self.stop()
async def _wait_for_send_queue_to_empty(self):
await self.send_queue.join()
self._threadsafe_send_queue.join()
@abstractmethod
def write_frame_to_camera(self, frame: bytes):
pass
@abstractmethod
def write_frame_to_mic(self, frame: bytes):
pass
@abstractmethod
def read_audio_frames(self, desired_frame_count):
return bytes()
@abstractmethod
def _prerun(self):
pass
def _silero_vad_analyze(self):
try:
audio_chunk = self.read_audio_frames(self._vad_samples)
audio_int16 = np.frombuffer(audio_chunk, np.int16)
audio_float32 = int2float(audio_int16)
new_confidence = self.model(
torch.from_numpy(audio_float32), 16000).item()
# yeses = int(new_confidence * 20.0)
# nos = 20 - yeses
# out = "!" * yeses + "." * nos
# print(f"!!! confidence: {out}")
speaking = new_confidence > 0.5
return speaking
except BaseException:
# This comes from an empty audio array
return False
# VAD thread body: classify audio chunk by chunk until _stop_threads is set,
# drive the VADState machine, and queue UserStartedSpeakingFrame /
# UserStoppedSpeakingFrame on the receive queue.
def _vad(self):
while not self._stop_threads.is_set():
# Silero is used when its model was loaded in __init__; otherwise the
# subclass-provided webrtc VAD is used.
if hasattr(self, 'model'): # we can use Silero
speaking = self._silero_vad_analyze()
elif self._has_webrtc_vad:
speaking = self._webrtc_vad_analyze()
else:
raise Exception("VAD is running with no VAD service available")
# Transitions for a chunk classified as speech. SPEAKING has no case, so it
# stays unchanged.
if speaking:
match self._vad_state:
case VADState.QUIET:
self._vad_state = VADState.STARTING
self._vad_starting_count = 1
case VADState.STARTING:
self._vad_starting_count += 1
case VADState.STOPPING:
self._vad_state = VADState.SPEAKING
self._vad_stopping_count = 0
# Transitions for a chunk classified as non-speech. QUIET has no case, so it
# stays unchanged.
else:
match self._vad_state:
case VADState.STARTING:
self._vad_state = VADState.QUIET
self._vad_starting_count = 0
case VADState.SPEAKING:
self._vad_state = VADState.STOPPING
self._vad_stopping_count = 1
case VADState.STOPPING:
self._vad_stopping_count += 1
# Enough consecutive speech chunks: announce that the user started speaking.
if (
self._vad_state == VADState.STARTING
and self._vad_starting_count >= self._vad_start_frames
):
if self._loop:
asyncio.run_coroutine_threadsafe(
self.receive_queue.put(
UserStartedSpeakingFrame()), self._loop)
# self.interrupt()
self._vad_state = VADState.SPEAKING
self._vad_starting_count = 0
# Enough consecutive non-speech chunks: announce that the user stopped speaking.
if (
self._vad_state == VADState.STOPPING
and self._vad_stopping_count >= self._vad_stop_frames
):
if self._loop:
asyncio.run_coroutine_threadsafe(
self.receive_queue.put(
UserStoppedSpeakingFrame()), self._loop)
self._vad_state = VADState.QUIET
self._vad_stopping_count = 0
async def _marshal_frames(self):
    """Copy frames from the asyncio send queue to the thread-safe queue.

    Stops after forwarding an ``EndFrame``.
    """
    finished = False
    while not finished:
        frame: Frame | list = await self.send_queue.get()
        self._threadsafe_send_queue.put(frame)
        self.send_queue.task_done()
        finished = isinstance(frame, EndFrame)
def interrupt(self):
    """Set the interrupted flag that the frame consumer checks before writing output."""
    self._logger.debug("### Interrupting")
    self._is_interrupted.set()
async def get_receive_frames(self) -> AsyncGenerator[Frame, None]:
    """Yield frames from the receive queue up to and including the first ``EndFrame``."""
    frame = None
    while not isinstance(frame, EndFrame):
        frame = await self.receive_queue.get()
        yield frame
def _receive_audio(self):
if not self._loop:
self._logger.error("No loop available for audio thread")
return
seconds = 1
desired_frame_count = self._speaker_sample_rate * seconds
while not self._stop_threads.is_set():
buffer = self.read_audio_frames(desired_frame_count)
if len(buffer) > 0:
frame = AudioFrame(buffer)
asyncio.run_coroutine_threadsafe(
self.receive_queue.put(frame), self._loop
)
asyncio.run_coroutine_threadsafe(
self.receive_queue.put(
EndFrame()), self._loop)
def _set_image(self, image: bytes):
self._images = itertools.cycle([image])
def _set_images(self, images: list[bytes], start_frame=0):
self._images = itertools.cycle(images)
def send_app_message(self, message: Any, participantId: str | None):
""" Child classes should override this to send a custom message to the room. """
pass
def _run_camera(self):
try:
while not self._stop_threads.is_set():
if self._images:
this_frame = next(self._images)
self.write_frame_to_camera(this_frame)
time.sleep(1.0 / self._fps)
except Exception as e:
self._logger.error(f"Exception {e} in camera thread.")
raise e
# Frame consumer thread body: take frames (or lists of frames) off the
# thread-safe send queue and dispatch them to the mic, the camera or the app
# message channel. Returns when an EndFrame is seen.
def _frame_consumer(self):
self._logger.info("🎬 Starting frame consumer thread")
# Accumulates audio bytes that have not been written to the mic yet.
b = bytearray()
# Mic writes are made in multiples of smallest_write_size bytes; audio
# frames longer than largest_write_size bytes are split up first.
smallest_write_size = 3200
largest_write_size = 8000
while True:
try:
frames_or_frame: Frame | list[Frame] = self._threadsafe_send_queue.get(
)
# Normalize the queue item into a list of frames.
if (
isinstance(frames_or_frame, AudioFrame)
and len(frames_or_frame.data) > largest_write_size
):
# subdivide large audio frames to enable interruption
frames = []
for i in range(0, len(frames_or_frame.data),
largest_write_size):
frames.append(AudioFrame(
frames_or_frame.data[i: i + largest_write_size]))
elif isinstance(frames_or_frame, Frame):
frames: list[Frame] = [frames_or_frame]
elif isinstance(frames_or_frame, list):
frames: list[Frame] = frames_or_frame
else:
raise Exception("Unknown type in output queue")
for frame in frames:
# An EndFrame stops the thread: the stop event is set and the frame is
# forwarded to the completed and receive queues.
if isinstance(frame, EndFrame):
self._logger.info("Stopping frame consumer thread")
self._stop_threads.set()
self._threadsafe_send_queue.task_done()
if self._loop:
asyncio.run_coroutine_threadsafe(
self.completed_queue.put(frame), self._loop
)
# Also send the EndFrame to the pipeline so it can stop
asyncio.run_coroutine_threadsafe(
self.receive_queue.put(frame), self._loop
)
return
# if interrupted, we just pull frames off the queue and
# discard them
if not self._is_interrupted.is_set():
if frame:
if isinstance(frame, AudioFrame):
chunk = frame.data
b.extend(chunk)
# Write only the largest prefix that is a whole multiple of
# smallest_write_size; the remainder stays buffered.
truncated_length: int = len(b) - (
len(b) % smallest_write_size
)
if truncated_length:
self.write_frame_to_mic(
bytes(b[:truncated_length]))
b = b[truncated_length:]
elif isinstance(frame, ImageFrame):
self._set_image(frame.image)
elif isinstance(frame, SpriteFrame):
self._set_images(frame.images)
elif isinstance(frame, SendAppMessageFrame):
self.send_app_message(
frame.message, frame.participantId)
elif len(b):
self.write_frame_to_mic(bytes(b))
b = bytearray()
else:
# if there are leftover audio bytes, write them now; failing to do so
# can cause static in the audio stream.
if len(b):
# The leftover is trimmed to a multiple of 160 bytes before writing; the
# rest is dropped with the buffer reset.
truncated_length = len(b) - (len(b) % 160)
self.write_frame_to_mic(
bytes(b[:truncated_length]))
b = bytearray()
# A StartFrame ends the interrupted state and is announced to the
# receive queue as a PipelineStartedFrame.
if isinstance(frame, StartFrame):
self._is_interrupted.clear()
# NOTE(review): unlike the other scheduling calls in this method, this one
# is not visibly guarded by `if self._loop` — confirm.
asyncio.run_coroutine_threadsafe(
self.receive_queue.put(PipelineStartedFrame()),
self._loop,
)
# Report the frame on the completed queue.
if self._loop:
asyncio.run_coroutine_threadsafe(
self.completed_queue.put(frame), self._loop
)
self._threadsafe_send_queue.task_done()
# NOTE(review): get() above is called without a timeout, so this handler
# looks unreachable — confirm.
except queue.Empty:
if len(b):
self.write_frame_to_mic(bytes(b))
b = bytearray()
except Exception as e:
self._logger.error(
f"Exception in frame_consumer: {e}, {len(b)}")
raise e

View File

@@ -1,125 +0,0 @@
import asyncio
import time
from typing import AsyncGenerator, List
from dailyai.pipeline.frame_processor import FrameProcessor
from dailyai.pipeline.frames import AudioFrame, ControlFrame, EndFrame, Frame, TTSEndFrame, TTSStartFrame, TextFrame
from dailyai.pipeline.pipeline import Pipeline
from dailyai.serializers.protobuf_serializer import ProtobufFrameSerializer
from dailyai.transports.abstract_transport import AbstractTransport
from dailyai.transports.threaded_transport import ThreadedTransport
try:
import websockets
except ModuleNotFoundError as e:
print(f"Exception: {e}")
print(
"In order to use the websocket transport, you need to `pip install dailyai[websocket]`.")
raise Exception(f"Missing module: {e}")
class WebSocketFrameProcessor(FrameProcessor):
    """Filters and re-chunks frames before they are sent over the websocket.

    Audio received between a ``TTSStartFrame`` and a ``TTSEndFrame`` is
    re-emitted in chunks of exactly ``audio_frame_size`` bytes, with any
    remainder flushed at the end. Other frames pass only if their exact type
    is in ``sendable_frames`` or they are ``ControlFrame`` instances.
    """

    def __init__(
            self,
            audio_frame_size: int | None = None,
            sendable_frames: List[Frame] | None = None):
        super().__init__()
        if not audio_frame_size:
            raise ValueError("audio_frame_size must be provided")

        self._audio_frame_size = audio_frame_size
        self._sendable_frames = sendable_frames or [TextFrame, AudioFrame]
        self._audio_buffer = bytes()
        self._in_tts_audio = False

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        if isinstance(frame, TTSStartFrame):
            # The marker itself is not forwarded.
            self._in_tts_audio = True
            return

        if isinstance(frame, AudioFrame):
            # Audio outside a TTS section is not forwarded.
            if not self._in_tts_audio:
                return
            self._audio_buffer += frame.data
            while len(self._audio_buffer) >= self._audio_frame_size:
                yield AudioFrame(self._audio_buffer[:self._audio_frame_size])
                self._audio_buffer = self._audio_buffer[self._audio_frame_size:]
            return

        if isinstance(frame, TTSEndFrame):
            self._in_tts_audio = False
            # Flush whatever did not fill a whole chunk.
            if self._audio_buffer:
                yield AudioFrame(self._audio_buffer)
                self._audio_buffer = bytes()
            return

        if type(frame) in self._sendable_frames or isinstance(frame, ControlFrame):
            yield frame
class WebsocketTransport(AbstractTransport):
    """Transport that serves a websocket and exchanges serialized frames over it."""

    def __init__(self, **kwargs):
        """Read the websocket options on top of the parent's.

        Keyword Args:
            sample_width: Default 2.
            n_channels: Default 1.
            port: Port to listen on (default 8765).
            host: Host to bind to (default ``"localhost"``).
            audio_frame_size: Chunk size given to ``WebSocketFrameProcessor``.
            sendable_frames: Frame types the processor lets through.
            serializer: Object with ``serialize``/``deserialize`` methods.
        """
        super().__init__(**kwargs)
        self._sample_width = kwargs.get("sample_width", 2)
        self._n_channels = kwargs.get("n_channels", 1)
        self._port = kwargs.get("port", 8765)
        self._host = kwargs.get("host", "localhost")
        self._audio_frame_size = kwargs.get("audio_frame_size", 16000)
        self._sendable_frames = kwargs.get(
            "sendable_frames", [
                TextFrame, AudioFrame, TTSEndFrame, TTSStartFrame])
        self._serializer = kwargs.get("serializer", ProtobufFrameSerializer())

        self._server: websockets.WebSocketServer | None = None
        self._websocket: websockets.WebSocketServerProtocol | None = None

        self._connection_handlers = []

    async def run(self, pipeline: Pipeline, override_pipeline_source_queue=True):
        """Serve the websocket and run ``pipeline`` until an EndFrame or expiry."""
        self._stop_server_event = asyncio.Event()
        pipeline.set_sink(self.send_queue)
        if override_pipeline_source_queue:
            pipeline.set_source(self.receive_queue)

        pipeline.add_processor(WebSocketFrameProcessor(
            audio_frame_size=self._audio_frame_size,
            sendable_frames=self._sendable_frames))

        async def timeout():
            # Stop the server once the transport's expiration time is reached.
            sleep_time = self._expiration - time.time()
            await asyncio.sleep(sleep_time)
            self._stop_server_event.set()

        async def send_task():
            while not self._stop_server_event.is_set():
                frame = await self.send_queue.get()
                if isinstance(frame, EndFrame):
                    self._stop_server_event.set()
                    break
                # Frames are dropped while no client is connected.
                if self._websocket and frame:
                    proto = self._serializer.serialize(frame)
                    await self._websocket.send(proto)

        async def start_server():
            async with websockets.serve(self._websocket_handler, self._host, self._port):
                self._logger.debug("Websocket server started.")
                await self._stop_server_event.wait()
                self._logger.debug("Websocket server stopped.")
                await self.receive_queue.put(EndFrame())

        timeout_task = asyncio.create_task(timeout())
        try:
            await asyncio.gather(start_server(), send_task(), pipeline.run_pipeline())
        finally:
            # Cancel the timer even when one of the tasks above raised, so
            # it is not left running in the background.
            timeout_task.cancel()

    def on_connection(self, handler):
        """Register an async callable to be awaited on every new connection."""
        self._connection_handlers.append(handler)

    async def _websocket_handler(self, websocket: websockets.WebSocketServerProtocol, path):
        """Handle one client: replace any previous connection and queue its frames."""
        if self._websocket:
            await self._websocket.close()
            self._logger.warning(
                "Got another websocket connection; closing first.")

        for handler in self._connection_handlers:
            await handler()

        self._websocket = websocket
        async for message in websocket:
            frame = self._serializer.deserialize(message)
            await self.receive_queue.put(frame)

View File

@@ -1,6 +1,12 @@
//
// Copyright (c) 2024, Daily
//
// SPDX-License-Identifier: BSD 2-Clause License
//
syntax = "proto3";
package dailyai_proto;
package pipecat_proto;
message TextFrame {
string text = 1;

View File

@@ -0,0 +1,276 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
from typing import List, Tuple
from dataclasses import dataclass, field
from pipecat.utils.utils import obj_count, obj_id
@dataclass
class Frame:
    """Base class for all frames.

    ``id`` and ``name`` are not constructor arguments; both are assigned in
    ``__post_init__``.
    """
    id: int = field(init=False)
    name: str = field(init=False)

    def __post_init__(self):
        self.id: int = obj_id()
        class_name = self.__class__.__name__
        self.name: str = f"{class_name}#{obj_count(self)}"

    def __str__(self):
        return self.name


@dataclass
class DataFrame(Frame):
    """Marker base class for frames that carry data."""
@dataclass
class AudioRawFrame(DataFrame):
    """Raw audio bytes together with their sample rate and channel count.

    Will be played by the transport if the transport's microphone has been
    enabled.
    """
    audio: bytes
    sample_rate: int
    num_channels: int

    def __post_init__(self):
        super().__post_init__()
        # Frame count is derived assuming two bytes per sample per channel.
        bytes_per_frame = self.num_channels * 2
        self.num_frames = int(len(self.audio) / bytes_per_frame)

    def __str__(self):
        details = f"size: {len(self.audio)}, frames: {self.num_frames}, sample_rate: {self.sample_rate}, channels: {self.num_channels}"
        return f"{self.name}({details})"
@dataclass
class ImageRawFrame(DataFrame):
    """Raw image data together with its size and format. The transport shows
    it when the transport's camera is enabled.
    """
    image: bytes
    size: Tuple[int, int]
    format: str

    def __str__(self):
        details = f"size: {self.size}, format: {self.format}"
        return f"{self.name}({details})"
@dataclass
class URLImageRawFrame(ImageRawFrame):
    """Image frame that also carries a URL (which may be `None`). The
    transport shows it when the transport's camera is enabled.
    """
    url: str | None

    def __str__(self):
        details = f"url: {self.url}, size: {self.size}, format: {self.format}"
        return f"{self.name}({details})"
@dataclass
class VisionImageRawFrame(ImageRawFrame):
    """Image frame paired with a text prompt asking for a description of the
    image. The transport shows it when the transport's camera is enabled.
    """
    text: str | None

    def __str__(self):
        details = f"text: {self.text}, size: {self.size}, format: {self.format}"
        return f"{self.name}({details})"
@dataclass
class UserImageRawFrame(ImageRawFrame):
    """Image frame tied to a specific user. The transport shows it when the
    transport's camera is enabled.
    """
    user_id: str

    def __str__(self):
        details = f"user: {self.user_id}, size: {self.size}, format: {self.format}"
        return f"{self.name}({details})"
@dataclass
class SpriteFrame(Frame):
    """Animated sprite made of a list of images. Shown by the transport when
    its camera is enabled, at the framerate given by the transport's `fps`
    constructor parameter.
    """
    images: List[ImageRawFrame]

    def __str__(self):
        image_count = len(self.images)
        return f"{self.name}(size: {image_count})"
@dataclass
class TextFrame(DataFrame):
    """A piece of text. LLM services emit these, TTS services consume them,
    and they can be used to move text through pipelines.
    """
    text: str

    def __str__(self):
        quoted = f'"{self.text}"'
        return f"{self.name}: {quoted}"
@dataclass
class TranscriptionFrame(TextFrame):
    """Text frame carrying transcription metadata (user and timestamp).
    Placed in the transport's receive queue when a participant speaks.
    """
    user_id: str
    timestamp: str

    def __str__(self):
        # The transcribed text itself is not part of the string form.
        details = f"user: {self.user_id}, timestamp: {self.timestamp}"
        return f"{self.name}({details})"
@dataclass
class InterimTranscriptionFrame(TextFrame):
    """Text frame carrying interim (not yet final) transcription metadata.
    Placed in the transport's receive queue when a participant speaks.
    """
    user_id: str
    timestamp: str

    def __str__(self):
        # The transcribed text itself is not part of the string form.
        details = f"user: {self.user_id}, timestamp: {self.timestamp}"
        return f"{self.name}({details})"
@dataclass
class LLMMessagesFrame(DataFrame):
    """A frame containing a list of LLM messages. Used to signal that an LLM
    service should run a chat completion and emit an LLMResponseStartFrame,
    TextFrames and an LLMResponseEndFrame. Note that the messages property on
    this class is mutable, and will be updated by various ResponseAggregator
    frame processors.
    """
    messages: List[dict]
#
# App frames. Application user-defined frames.
#
@dataclass
class AppFrame(Frame):
    """Base class for application (user-defined) frames. Adds no fields or
    behavior to `Frame`.
    """
    pass
#
# System frames
#
@dataclass
class SystemFrame(Frame):
    """Base class for system frames (`StartFrame`, `CancelFrame` and
    `ErrorFrame` in this module). Adds no fields or behavior to `Frame`.
    """
    pass
@dataclass
class StartFrame(SystemFrame):
    """This is the first frame that should be pushed down a pipeline. It
    carries no fields of its own.
    """
    pass
@dataclass
class CancelFrame(SystemFrame):
    """Indicates that a pipeline needs to stop right away. It carries no
    fields of its own.
    """
    pass
@dataclass
class ErrorFrame(SystemFrame):
    """Used to notify upstream that an error has occurred downstream in the
    pipeline.
    """
    error: str | None

    def __str__(self):
        details = f"error: {self.error}"
        return f"{self.name}({details})"
#
# Control frames
#
@dataclass
class ControlFrame(Frame):
    """Base class for the control frames defined below (end of pipeline, LLM
    response, user speaking and TTS boundaries, image requests). Adds no
    fields or behavior to `Frame`.
    """
    pass
@dataclass
class EndFrame(ControlFrame):
    """Indicates that a pipeline has ended and frame processors and pipelines
    should be shut down. If the transport receives this frame, it will stop
    sending frames to its output channel(s) and close all its threads. Note
    that this is a control frame, which means it will be received in the
    order it was sent (unlike system frames).
    """
    pass
@dataclass
class LLMResponseStartFrame(ControlFrame):
    """Used to indicate the beginning of an LLM response. The TextFrames that
    follow are part of the LLM response until an LLMResponseEndFrame.
    """
    pass
@dataclass
class LLMResponseEndFrame(ControlFrame):
    """Indicates the end of an LLM response (the counterpart of
    LLMResponseStartFrame).
    """
    pass
@dataclass
class UserStartedSpeakingFrame(ControlFrame):
    """Emitted by VAD to indicate that a user has started speaking. This can
    be used for interruptions or other times when detecting that someone is
    speaking is more important than knowing what they're saying (as you would
    with a TranscriptionFrame).
    """
    pass
@dataclass
class UserStoppedSpeakingFrame(ControlFrame):
    """Emitted by the VAD to indicate that a user stopped speaking (the
    counterpart of UserStartedSpeakingFrame).
    """
    pass
@dataclass
class TTSStartedFrame(ControlFrame):
    """Used to indicate the beginning of a TTS response. The AudioRawFrames
    that follow are part of the TTS response until a TTSStoppedFrame. These
    frames can be used for aggregating audio frames in a transport to optimize
    the size of frames sent to the session, without needing to control this in
    the TTS service.
    """
    pass
@dataclass
class TTSStoppedFrame(ControlFrame):
    """Indicates the end of a TTS response (the counterpart of
    TTSStartedFrame).
    """
    pass
@dataclass
class UserImageRequestFrame(ControlFrame):
    """A frame used to request an image from the given user."""
    user_id: str

    def __str__(self):
        suffix = f"user: {self.user_id}"
        return f"{self.name}, {suffix}"

View File

@@ -14,7 +14,7 @@ _sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0c\x66rames.proto\x12\rdailyai_proto\"\x19\n\tTextFrame\x12\x0c\n\x04text\x18\x01 \x01(\t\"\x1a\n\nAudioFrame\x12\x0c\n\x04\x64\x61ta\x18\x01 \x01(\x0c\"L\n\x12TranscriptionFrame\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x15\n\rparticipantId\x18\x02 \x01(\t\x12\x11\n\ttimestamp\x18\x03 \x01(\t\"\xa2\x01\n\x05\x46rame\x12(\n\x04text\x18\x01 \x01(\x0b\x32\x18.dailyai_proto.TextFrameH\x00\x12*\n\x05\x61udio\x18\x02 \x01(\x0b\x32\x19.dailyai_proto.AudioFrameH\x00\x12:\n\rtranscription\x18\x03 \x01(\x0b\x32!.dailyai_proto.TranscriptionFrameH\x00\x42\x07\n\x05\x66rameb\x06proto3')
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0c\x66rames.proto\x12\rpipecat_proto\"\x19\n\tTextFrame\x12\x0c\n\x04text\x18\x01 \x01(\t\"\x1a\n\nAudioFrame\x12\x0c\n\x04\x64\x61ta\x18\x01 \x01(\x0c\"L\n\x12TranscriptionFrame\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x15\n\rparticipantId\x18\x02 \x01(\t\x12\x11\n\ttimestamp\x18\x03 \x01(\t\"\xa2\x01\n\x05\x46rame\x12(\n\x04text\x18\x01 \x01(\x0b\x32\x18.pipecat_proto.TextFrameH\x00\x12*\n\x05\x61udio\x18\x02 \x01(\x0b\x32\x19.pipecat_proto.AudioFrameH\x00\x12:\n\rtranscription\x18\x03 \x01(\x0b\x32!.pipecat_proto.TranscriptionFrameH\x00\x42\x07\n\x05\x66rameb\x06proto3')
_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)

View File

@@ -1,6 +1,6 @@
from typing import List
from dailyai.pipeline.frames import EndFrame, EndPipeFrame
from dailyai.pipeline.pipeline import Pipeline
from pipecat.pipeline.frames import EndFrame, EndPipeFrame
from pipecat.pipeline.pipeline import Pipeline
class SequentialMergePipeline(Pipeline):

View File

@@ -0,0 +1,137 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
from pipecat.pipeline.pipeline import Pipeline
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.frames.frames import CancelFrame, EndFrame, Frame, StartFrame
from loguru import logger
class Source(FrameProcessor):
    """Processor linked in front of a pipeline.

    Frames travelling upstream are placed on the queue given at construction;
    frames travelling downstream are pushed on to the next processor.
    """

    def __init__(self, upstream_queue: asyncio.Queue):
        super().__init__()
        self._up_queue = upstream_queue

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        if direction == FrameDirection.UPSTREAM:
            await self._up_queue.put(frame)
        elif direction == FrameDirection.DOWNSTREAM:
            await self.push_frame(frame, direction)
class Sink(FrameProcessor):
    """Processor linked after a pipeline.

    Frames travelling downstream are placed on the queue given at
    construction; frames travelling upstream are pushed on to the previous
    processor.
    """

    def __init__(self, downstream_queue: asyncio.Queue):
        super().__init__()
        self._down_queue = downstream_queue

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        if direction == FrameDirection.DOWNSTREAM:
            await self._down_queue.put(frame)
        elif direction == FrameDirection.UPSTREAM:
            await self.push_frame(frame, direction)
class ParallelPipeline(FrameProcessor):
    """Run several pipelines side by side over the same stream of frames.

    Each positional argument is a list of processors that becomes its own
    `Pipeline`, wrapped between a `Source` and a `Sink`. Every frame received
    is handed to all branches; what the branches emit is collected on two
    shared queues (one per direction) and pushed on by two background tasks.
    A frame with an `id` already seen on a queue is pushed only once.

    Raises:
        Exception: if no argument is given.
        TypeError: if an argument is not a list.
    """

    def __init__(self, *args):
        super().__init__()

        if len(args) == 0:
            raise Exception("ParallelPipeline needs at least one argument")

        self._sources = []
        self._sinks = []

        self._up_queue = asyncio.Queue()
        self._down_queue = asyncio.Queue()
        self._up_task: asyncio.Task | None = None
        self._down_task: asyncio.Task | None = None

        self._pipelines = []

        logger.debug(f"Creating {self} pipelines")
        for processors in args:
            if not isinstance(processors, list):
                raise TypeError(f"ParallelPipeline argument {processors} is not a list")

            # We add a source before the pipeline and a sink after it.
            source = Source(self._up_queue)
            sink = Sink(self._down_queue)
            self._sources.append(source)
            self._sinks.append(sink)

            # Create pipeline
            pipeline = Pipeline(processors)
            source.link(pipeline)
            pipeline.link(sink)
            self._pipelines.append(pipeline)

        logger.debug(f"Finished creating {self} pipelines")

    #
    # Frame processor
    #

    async def cleanup(self):
        """Clean up every branch pipeline concurrently."""
        await asyncio.gather(*[p.cleanup() for p in self._pipelines])

    async def _start_tasks(self):
        """Start the two background tasks that forward queued frames."""
        loop = self.get_event_loop()
        self._up_task = loop.create_task(self._process_up_queue())
        self._down_task = loop.create_task(self._process_down_queue())

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Fan `frame` out to every branch, then handle start/stop frames."""
        if isinstance(frame, StartFrame):
            await self._start_tasks()

        if direction == FrameDirection.UPSTREAM:
            # If we get an upstream frame we process it in each sink.
            await asyncio.gather(*[s.process_frame(frame, direction) for s in self._sinks])
        elif direction == FrameDirection.DOWNSTREAM:
            # If we get a downstream frame we process it in each source.
            # TODO(aleix): We are creating task for each frame. For real-time
            # video/audio this might be too slow. We should use an already
            # created task instead.
            await asyncio.gather(*[s.process_frame(frame, direction) for s in self._sources])

        # If we get an EndFrame (or CancelFrame) we stop our queue processing
        # tasks and wait on all the pipelines to finish.
        if isinstance(frame, (CancelFrame, EndFrame)):
            # Use None to indicate when queues should be done processing.
            await self._up_queue.put(None)
            await self._down_queue.put(None)
            if self._up_task:
                await self._up_task
            if self._down_task:
                await self._down_task

    async def _process_queue(self, queue: asyncio.Queue, direction: FrameDirection):
        """Push frames taken from `queue` in `direction` until a None arrives.

        All branches feed the same queue, so a frame that every branch
        forwards shows up once per branch; `seen_ids` makes sure it is pushed
        only the first time.
        """
        seen_ids = set()
        running = True
        while running:
            frame = await queue.get()
            if frame is not None and frame.id not in seen_ids:
                await self.push_frame(frame, direction)
                seen_ids.add(frame.id)
            running = frame is not None
            queue.task_done()

    async def _process_up_queue(self):
        """Forward frames the branches sent upstream."""
        await self._process_queue(self._up_queue, FrameDirection.UPSTREAM)

    async def _process_down_queue(self):
        """Forward frames the branches sent downstream."""
        await self._process_queue(self._down_queue, FrameDirection.DOWNSTREAM)

View File

@@ -0,0 +1,76 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
from typing import Callable, Coroutine, List
from pipecat.frames.frames import Frame
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
class PipelineSource(FrameProcessor):
    """First processor of a `Pipeline`.

    Upstream frames are handed to the callback supplied at construction;
    downstream frames are pushed on to the next processor.
    """

    def __init__(self, upstream_push_frame: Callable[[Frame, FrameDirection], Coroutine]):
        super().__init__()
        self._upstream_push_frame = upstream_push_frame

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        if direction == FrameDirection.UPSTREAM:
            await self._upstream_push_frame(frame, direction)
        elif direction == FrameDirection.DOWNSTREAM:
            await self.push_frame(frame, direction)
class PipelineSink(FrameProcessor):
    """Last processor of a `Pipeline`.

    Downstream frames are handed to the callback supplied at construction;
    upstream frames are pushed on to the previous processor.
    """

    def __init__(self, downstream_push_frame: Callable[[Frame, FrameDirection], Coroutine]):
        super().__init__()
        self._downstream_push_frame = downstream_push_frame

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        if direction == FrameDirection.DOWNSTREAM:
            await self._downstream_push_frame(frame, direction)
        elif direction == FrameDirection.UPSTREAM:
            await self.push_frame(frame, direction)
class Pipeline(FrameProcessor):
    """A chain of frame processors that itself behaves as a frame processor.

    The given processors are wrapped between a `PipelineSource` and a
    `PipelineSink`, both wired to this pipeline's own `push_frame`, so frames
    leaving either end of the chain continue outside of the pipeline.
    """

    def __init__(self, processors: List[FrameProcessor]):
        super().__init__()

        # The source and sink let us forward frames upstream and downstream
        # outside of the pipeline.
        self._source = PipelineSource(self.push_frame)
        self._sink = PipelineSink(self.push_frame)
        chain: List[FrameProcessor] = [self._source] + processors + [self._sink]
        self._processors: List[FrameProcessor] = chain

        self._link_processors()

    #
    # Frame processor
    #

    async def cleanup(self):
        await self._cleanup_processors()

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        # Downstream frames enter at the source, upstream frames at the sink.
        if direction == FrameDirection.UPSTREAM:
            await self._sink.process_frame(frame, FrameDirection.UPSTREAM)
        elif direction == FrameDirection.DOWNSTREAM:
            await self._source.process_frame(frame, FrameDirection.DOWNSTREAM)

    async def _cleanup_processors(self):
        pending = [processor.cleanup() for processor in self._processors]
        await asyncio.gather(*pending)

    def _link_processors(self):
        # Link every processor to the one that follows it.
        for earlier, later in zip(self._processors, self._processors[1:]):
            earlier.link(later)

View File

@@ -0,0 +1,60 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import signal
from pipecat.pipeline.task import PipelineTask
from pipecat.utils.utils import obj_count, obj_id
from loguru import logger
class PipelineRunner:
    """Runs `PipelineTask`s and optionally cancels them on Ctrl-C.

    Must be constructed while an event loop is running, because the
    constructor calls `asyncio.get_running_loop()`.

    Args:
        name: Optional display name; a generated one is used when omitted.
        handle_sigint: When true, install a SIGINT handler on the running
            loop that cancels all tasks currently being run.
    """

    def __init__(self, name: str | None = None, handle_sigint: bool = True):
        self.id: int = obj_id()
        self.name: str = name or f"{self.__class__.__name__}#{obj_count(self)}"

        self._loop: asyncio.AbstractEventLoop = asyncio.get_running_loop()
        self._tasks = {}
        self._running = True
        # Strong reference to the task spawned by the SIGINT handler; the
        # event loop only keeps weak references to tasks.
        self._sigint_task: asyncio.Task | None = None

        if handle_sigint:
            self._setup_sigint()

    async def run(self, task: PipelineTask):
        """Run `task` to completion, tracking it while it is running."""
        logger.debug(f"Runner {self} started running {task}")
        self._running = True
        self._tasks[task.name] = task
        try:
            await task.run()
        finally:
            # Always unregister the task, even if it raised, so the runner
            # does not keep reporting itself as active or try to cancel a
            # task that is already gone.
            self._tasks.pop(task.name, None)
            self._running = False
        logger.debug(f"Runner {self} finished running {task}")

    async def stop_when_done(self):
        """Ask every tracked task to stop once its queued work is done."""
        logger.debug(f"Runner {self} scheduled to stop when all tasks are done")
        await asyncio.gather(*[t.stop_when_done() for t in self._tasks.values()])

    async def cancel(self):
        """Cancel every tracked task."""
        logger.debug(f"Canceling runner {self}")
        await asyncio.gather(*[t.cancel() for t in self._tasks.values()])

    def is_active(self):
        """Return the runner's running flag."""
        return self._running

    def _setup_sigint(self):
        self._loop.add_signal_handler(signal.SIGINT, self._on_sigint)

    def _on_sigint(self, *args):
        # Signal callbacks are plain functions, so the coroutine is scheduled
        # as a task; keep a reference so it cannot be garbage-collected
        # before it finishes.
        self._sigint_task = self._loop.create_task(self._sigint_handler())

    async def _sigint_handler(self):
        logger.warning(f"Ctrl-C detected. Canceling runner {self}")
        await self.cancel()

    def __str__(self):
        return self.name

View File

@@ -0,0 +1,93 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
from typing import AsyncIterable, Iterable
from pipecat.frames.frames import CancelFrame, EndFrame, ErrorFrame, Frame, StartFrame
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.utils.utils import obj_count, obj_id
from loguru import logger
class Source(FrameProcessor):
    """Processor linked in front of the pipeline run by a `PipelineTask`.

    Upstream frames are placed on the queue given at construction;
    downstream frames are pushed on to the next processor.
    """

    def __init__(self, up_queue: asyncio.Queue):
        super().__init__()
        self._up_queue = up_queue

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        if direction == FrameDirection.UPSTREAM:
            await self._up_queue.put(frame)
        elif direction == FrameDirection.DOWNSTREAM:
            await self.push_frame(frame, direction)
class PipelineTask:
    """Feeds frames into a pipeline and watches what comes back upstream.

    Frames queued through `queue_frame` / `queue_frames` are sent downstream
    one by one after an initial `StartFrame`. An `ErrorFrame` arriving
    upstream is logged and answered by queueing a `CancelFrame`.
    """

    def __init__(self, pipeline: FrameProcessor):
        self.id: int = obj_id()
        self.name: str = f"{self.__class__.__name__}#{obj_count(self)}"

        self._pipeline = pipeline

        self._task_queue = asyncio.Queue()
        self._up_queue = asyncio.Queue()

        # The source sits in front of the pipeline and captures upstream
        # frames on `_up_queue`.
        self._source = Source(self._up_queue)
        self._source.link(pipeline)

    async def stop_when_done(self):
        logger.debug(f"Task {self} scheduled to stop when done")
        end_frame = EndFrame()
        await self.queue_frame(end_frame)

    async def cancel(self):
        logger.debug(f"Canceling pipeline task {self}")
        cancel_frame = CancelFrame()
        await self.queue_frame(cancel_frame)

    async def run(self):
        # Both loops finish once a CancelFrame or EndFrame has gone through.
        await asyncio.gather(self._process_task_queue(), self._process_up_queue())

    async def queue_frame(self, frame: Frame):
        await self._task_queue.put(frame)

    async def queue_frames(self, frames: Iterable[Frame] | AsyncIterable[Frame]):
        if isinstance(frames, AsyncIterable):
            async for item in frames:
                await self.queue_frame(item)
            return
        if isinstance(frames, Iterable):
            for item in frames:
                await self.queue_frame(item)
            return
        raise Exception("Frames must be an iterable or async iterable")

    async def _process_task_queue(self):
        await self._source.process_frame(StartFrame(), FrameDirection.DOWNSTREAM)
        while True:
            frame = await self._task_queue.get()
            await self._source.process_frame(frame, FrameDirection.DOWNSTREAM)
            self._task_queue.task_done()
            if isinstance(frame, (CancelFrame, EndFrame)):
                break
        # We just enqueue None to terminate the upstream loop.
        await self._up_queue.put(None)

    async def _process_up_queue(self):
        while True:
            frame = await self._up_queue.get()
            if frame and isinstance(frame, ErrorFrame):
                logger.error(f"Error running app: {frame.error}")
                await self.queue_frame(CancelFrame())
            self._up_queue.task_done()
            if frame is None:
                break

    def __str__(self):
        return self.name

View File

@@ -0,0 +1,72 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
from typing import List
from pipecat.frames.frames import Frame, SystemFrame
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from loguru import logger
class GatedAggregator(FrameProcessor):
    """Hold frames back while a gate is closed and release them when it opens.

    For every non-system frame the gate state is re-evaluated first: while
    the gate is open `gate_close_fn(frame)` decides whether it closes, while
    it is closed `gate_open_fn(frame)` decides whether it opens. Then:

    - gate open: the frame is pushed, followed by every frame accumulated
      while the gate was closed (each in the direction it arrived with);
    - gate closed: the frame is accumulated. This includes the frame that
      closed the gate.

    System frames are never held back.

    Example with `gate_open_fn` matching image frames, `gate_close_fn`
    matching LLMResponseStartFrame and `start_open=False`:

        TextFrame("Hello")         -> nothing pushed
        TextFrame("Hello again.")  -> nothing pushed
        <image frame>              -> image frame, "Hello", "Hello again."
        TextFrame("Goodbye.")      -> "Goodbye."

    Args:
        gate_open_fn: Callable taking a frame; a truthy result opens the gate.
        gate_close_fn: Callable taking a frame; a truthy result closes it.
        start_open: Initial state of the gate.
    """

    def __init__(self, gate_open_fn, gate_close_fn, start_open):
        super().__init__()
        self._gate_open_fn = gate_open_fn
        self._gate_close_fn = gate_close_fn
        self._gate_open = start_open
        # Each held frame is stored with the direction it was travelling in,
        # so it can be released in that same direction later.
        self._accumulator: list[tuple[Frame, FrameDirection]] = []

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        # We must not block system frames.
        if isinstance(frame, SystemFrame):
            await self.push_frame(frame, direction)
            return

        old_state = self._gate_open
        if self._gate_open:
            self._gate_open = not self._gate_close_fn(frame)
        else:
            self._gate_open = self._gate_open_fn(frame)

        if old_state != self._gate_open:
            state = "open" if self._gate_open else "closed"
            logger.debug(f"Gate is now {state} because of {frame}")

        if self._gate_open:
            # The gate-opening frame goes first, then everything held back.
            await self.push_frame(frame, direction)
            for held_frame, held_direction in self._accumulator:
                await self.push_frame(held_frame, held_direction)
            self._accumulator = []
        else:
            self._accumulator.append((frame, direction))

View File

@@ -0,0 +1,82 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
from pipecat.frames.frames import Frame, InterimTranscriptionFrame, LLMMessagesFrame, TextFrame
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
class LLMContextAggregator(FrameProcessor):
    """Append incoming text to a shared list of LLM messages.

    Non-text frames are forwarded untouched and interim transcriptions are
    dropped. Every other text frame is optionally forwarded (`pass_through`)
    and then added to `messages` under `role`, either immediately or, with
    `complete_sentences`, once the accumulated text ends in ".", "?" or "!".
    Each time a message is appended an `LLMMessagesFrame` with the full list
    is pushed.
    """

    def __init__(
        self,
        messages: list[dict],
        role: str,
        complete_sentences=True,
        pass_through=True,
    ):
        super().__init__()
        self._messages = messages
        self._role = role
        self._sentence = ""
        self._complete_sentences = complete_sentences
        self._pass_through = pass_through

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        # Anything that is not text is simply passed along the pipeline.
        if not isinstance(frame, TextFrame):
            await self.push_frame(frame, direction)
            return

        # Interim results are ignored.
        if isinstance(frame, InterimTranscriptionFrame):
            return

        # The common case for "pass through" is receiving frames from the LLM
        # that update the "assistant" messages while the same text frames
        # continue on to a TTS service to be spoken to the user.
        if self._pass_through:
            await self.push_frame(frame, direction)

        # TODO: split up transcription by participant
        if self._complete_sentences:
            self._sentence += frame.text
            if not self._sentence.endswith((".", "?", "!")):
                # Sentence not finished yet; keep accumulating.
                return
            content, self._sentence = self._sentence, ""
        else:
            content = frame.text

        self._messages.append({"role": self._role, "content": content})
        await self.push_frame(LLMMessagesFrame(self._messages))
class LLMUserContextAggregator(LLMContextAggregator):
    """Context aggregator preset for the "user" role; text frames are not
    passed through.
    """

    def __init__(self, messages: list[dict], complete_sentences=True):
        super().__init__(
            messages=messages,
            role="user",
            complete_sentences=complete_sentences,
            pass_through=False,
        )
class LLMAssistantContextAggregator(LLMContextAggregator):
    """Context aggregator preset for the "assistant" role; text frames are
    also passed through.
    """

    def __init__(self, messages: list[dict], complete_sentences=True):
        super().__init__(
            messages=messages,
            role="assistant",
            complete_sentences=complete_sentences,
            pass_through=True,
        )

View File

@@ -0,0 +1,190 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.frames.frames import (
Frame,
InterimTranscriptionFrame,
LLMMessagesFrame,
LLMResponseStartFrame,
TextFrame,
LLMResponseEndFrame,
TranscriptionFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame)
class LLMResponseAggregator(FrameProcessor):
    """Aggregate text received between a start and an end frame into one
    message appended to a shared LLM context.

    Args:
        messages: Shared, mutable list of LLM messages the aggregation is
            appended to. With `None` nothing can be aggregated and every
            frame is forwarded unchanged.
        role: Value of the "role" key of the appended message.
        start_frame: Frame class that starts an aggregation.
        end_frame: Frame class that ends an aggregation.
        accumulator_frame: Frame class whose `text` is aggregated.
        interim_accumulator_frame: Optional frame class for interim results;
            while interim results are pending the aggregation stays open
            past the end frame until the final text arrives.
    """

    def __init__(
        self,
        *,
        messages: list[dict] | None,
        role: str,
        start_frame,
        end_frame,
        accumulator_frame: TextFrame,
        interim_accumulator_frame: TextFrame | None = None
    ):
        super().__init__()

        self._messages = messages
        self._role = role
        self._start_frame = start_frame
        self._end_frame = end_frame
        self._accumulator_frame = accumulator_frame
        self._interim_accumulator_frame = interim_accumulator_frame
        self._seen_start_frame = False
        self._seen_end_frame = False
        self._seen_interim_results = False

        self._aggregation = ""
        self._aggregating = False

    #
    # Frame processor
    #

    # Use cases implemented:
    #
    # S: Start, E: End, T: Transcription, I: Interim, X: Text
    #
    #        S E -> None
    #      S T E -> X
    #    S I T E -> X
    #    S I E T -> X
    #  S I E I T -> X
    #
    # The following case would not be supported:
    #
    #    S I E T1 I T2 -> X
    #
    # and T2 would be dropped.

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        # Only a missing context disables aggregation; an empty list is a
        # valid (fresh) context. Frames are forwarded rather than dropped so
        # that the rest of the pipeline still sees them.
        if self._messages is None:
            await self.push_frame(frame, direction)
            return

        send_aggregation = False

        if isinstance(frame, self._start_frame):
            self._seen_start_frame = True
            self._aggregating = True
        elif isinstance(frame, self._end_frame):
            self._seen_end_frame = True

            # We might have received the end frame but we might still be
            # aggregating (i.e. we have seen interim results but not the final
            # text).
            self._aggregating = self._seen_interim_results

            # Send the aggregation if we are not aggregating anymore (i.e. no
            # more interim results received).
            send_aggregation = not self._aggregating
        elif isinstance(frame, self._accumulator_frame):
            if self._aggregating:
                self._aggregation += f" {frame.text}"
                # We have received a complete sentence, so if we have seen the
                # end frame and we were still aggregating, it means we should
                # send the aggregation.
                send_aggregation = self._seen_end_frame

            # We just got our final result, so let's reset interim results.
            self._seen_interim_results = False
        elif self._interim_accumulator_frame and isinstance(frame, self._interim_accumulator_frame):
            self._seen_interim_results = True
        else:
            await self.push_frame(frame, direction)

        if send_aggregation:
            await self._push_aggregation()

    async def _push_aggregation(self):
        """Append the aggregated text to the context and push it, then reset."""
        if len(self._aggregation) > 0:
            self._messages.append({"role": self._role, "content": self._aggregation})
            frame = LLMMessagesFrame(self._messages)
            await self.push_frame(frame)

        # Reset for the next start/end pair. This happens even when nothing
        # was aggregated, so that a stale end-frame flag cannot make the next
        # turn send its aggregation before its own end frame arrives.
        self._aggregation = ""
        self._aggregating = False
        self._seen_start_frame = False
        self._seen_end_frame = False
        self._seen_interim_results = False
class LLMAssistantResponseAggregator(LLMResponseAggregator):
    """Response aggregator preset for the "assistant" role: aggregates
    TextFrames between an LLMResponseStartFrame and an LLMResponseEndFrame.
    No interim frame type is configured.
    """

    def __init__(self, messages: list[dict]):
        super().__init__(
            messages=messages,
            role="assistant",
            start_frame=LLMResponseStartFrame,
            end_frame=LLMResponseEndFrame,
            accumulator_frame=TextFrame
        )
class LLMUserResponseAggregator(LLMResponseAggregator):
    """Response aggregator preset for the "user" role: aggregates
    TranscriptionFrames between a UserStartedSpeakingFrame and a
    UserStoppedSpeakingFrame, treating InterimTranscriptionFrames as interim
    results.
    """

    def __init__(self, messages: list[dict]):
        super().__init__(
            messages=messages,
            role="user",
            start_frame=UserStartedSpeakingFrame,
            end_frame=UserStoppedSpeakingFrame,
            accumulator_frame=TranscriptionFrame,
            interim_accumulator_frame=InterimTranscriptionFrame
        )
class LLMFullResponseAggregator(FrameProcessor):
    """Collect text frames and emit them as one text frame when the LLM
    response ends.

    Text frames are concatenated (and not forwarded) until an
    LLMResponseEndFrame arrives; at that point a single TextFrame with the
    concatenated text is pushed, followed by the LLMResponseEndFrame itself.
    Any other frame is forwarded unchanged.

    Given the following frames:

        TextFrame("Hello,")
        TextFrame(" world.")
        TextFrame(" I am")
        TextFrame(" an LLM.")
        LLMResponseEndFrame()

    nothing is pushed for the first four frames, and the last one results in:

        TextFrame("Hello, world. I am an LLM.")
        LLMResponseEndFrame()
    """

    def __init__(self):
        super().__init__()
        self._aggregation = ""

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        if isinstance(frame, TextFrame):
            # Swallow the frame; its text is emitted later in one piece.
            self._aggregation += frame.text
            return

        if isinstance(frame, LLMResponseEndFrame):
            full_text = TextFrame(self._aggregation)
            await self.push_frame(full_text)
            await self.push_frame(frame)
            self._aggregation = ""
            return

        await self.push_frame(frame, direction)

View File

@@ -1,6 +1,14 @@
from typing import AsyncGenerator, Callable
from dailyai.pipeline.frame_processor import FrameProcessor
from dailyai.pipeline.frames import (
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
from dataclasses import dataclass
from typing import AsyncGenerator, Callable, List
from pipecat.frames.frames import (
Frame,
LLMResponseEndFrame,
LLMResponseStartFrame,
@@ -9,16 +17,58 @@ from dailyai.pipeline.frames import (
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame,
)
from dailyai.pipeline.openai_frames import OpenAILLMContextFrame
from dailyai.services.openai_llm_context import OpenAILLMContext
from pipecat.processors.frame_processor import FrameProcessor
try:
from openai.types.chat import ChatCompletionRole
except ModuleNotFoundError as e:
print(f"Exception: {e}")
print(
"In order to use OpenAI, you need to `pip install dailyai[openai]`. Also, set `OPENAI_API_KEY` environment variable.")
raise Exception(f"Missing module: {e}")
from openai._types import NOT_GIVEN, NotGiven
from openai.types.chat import (
ChatCompletionRole,
ChatCompletionToolParam,
ChatCompletionToolChoiceOptionParam,
ChatCompletionMessageParam
)
class OpenAILLMContext:
    """Mutable container for the messages, tools and tool choice sent to the
    OpenAI chat API.
    """

    def __init__(
        self,
        messages: List[ChatCompletionMessageParam] | None = None,
        tools: List[ChatCompletionToolParam] | NotGiven = NOT_GIVEN,
        tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven = NOT_GIVEN
    ):
        # A falsy `messages` (None or an empty list) is replaced by a new list.
        self.messages: List[ChatCompletionMessageParam] = messages or []
        self.tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven = tool_choice
        self.tools: List[ChatCompletionToolParam] | NotGiven = tools

    @staticmethod
    def from_messages(messages: List[dict]) -> "OpenAILLMContext":
        """Build a context from plain dicts; a missing "name" defaults to the
        message's role.
        """
        context = OpenAILLMContext()
        for message in messages:
            entry = {
                "content": message["content"],
                "role": message["role"],
                "name": message.get("name", message["role"]),
            }
            context.add_message(entry)
        return context

    def add_message(self, message: ChatCompletionMessageParam):
        self.messages.append(message)

    def get_messages(self) -> List[ChatCompletionMessageParam]:
        return self.messages

    def set_tool_choice(
        self, tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven
    ):
        self.tool_choice = tool_choice

    def set_tools(self, tools: List[ChatCompletionToolParam] | NotGiven = NOT_GIVEN):
        # An empty tool list is stored as NOT_GIVEN.
        is_empty_list = tools != NOT_GIVEN and len(tools) == 0
        self.tools = NOT_GIVEN if is_empty_list else tools
class OpenAIContextAggregator(FrameProcessor):
@@ -113,3 +163,13 @@ class OpenAIAssistantContextAggregator(OpenAIContextAggregator):
accumulator_frame=TextFrame,
pass_through=True,
)
@dataclass
class OpenAILLMContextFrame(Frame):
    """Like an LLMMessagesFrame, but with extra context specific to the OpenAI
    API. The context in this frame is also mutable, and will be changed by the
    OpenAIContextAggregator frame processor.
    """
    # The context object is carried by reference, not copied.
    context: OpenAILLMContext

View File

@@ -0,0 +1,104 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
from typing import List
from pipecat.pipeline.pipeline import Pipeline
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.frames.frames import Frame
from loguru import logger
class Source(FrameProcessor):
    """First processor of each `ParallelTask` branch.

    Upstream frames are placed on the queue given at construction;
    downstream frames are pushed on to the next processor.
    """

    def __init__(self, upstream_queue: asyncio.Queue):
        super().__init__()
        self._up_queue = upstream_queue

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        if direction == FrameDirection.UPSTREAM:
            await self._up_queue.put(frame)
        elif direction == FrameDirection.DOWNSTREAM:
            await self.push_frame(frame, direction)
class Sink(FrameProcessor):
    """Last processor of each `ParallelTask` branch.

    Downstream frames are placed on the queue given at construction;
    upstream frames are pushed on to the previous processor.
    """

    def __init__(self, downstream_queue: asyncio.Queue):
        super().__init__()
        self._down_queue = downstream_queue

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        if direction == FrameDirection.DOWNSTREAM:
            await self._down_queue.put(frame)
        elif direction == FrameDirection.UPSTREAM:
            await self.push_frame(frame, direction)
class ParallelTask(FrameProcessor):
    """Process each frame in several pipelines and forward their output once
    all of them have handled it.

    Each positional argument is a list of processors; it is wrapped between a
    `Source` and a `Sink` and turned into a `Pipeline`. For every incoming
    frame all branches are awaited, then whatever they left on the shared
    upstream and downstream queues is pushed on. Within one call a frame with
    an `id` already pushed from the same queue is pushed only once.

    Raises:
        Exception: if no argument is given.
        TypeError: if an argument is not a list.
    """

    def __init__(self, *args):
        super().__init__()

        if len(args) == 0:
            raise Exception("ParallelTask needs at least one argument")

        self._sinks = []
        self._pipelines = []

        self._up_queue = asyncio.Queue()
        self._down_queue = asyncio.Queue()

        logger.debug(f"Creating {self} pipelines")
        for processors in args:
            if not isinstance(processors, list):
                raise TypeError(f"ParallelTask argument {processors} is not a list")

            # We add a source at the beginning of the pipeline and a sink at
            # the end. A new list is built so the caller's list is untouched.
            source = Source(self._up_queue)
            sink = Sink(self._down_queue)
            branch: List[FrameProcessor] = [source] + processors + [sink]

            # Keep track of sinks. We access the source through the pipeline.
            self._sinks.append(sink)

            # Create pipeline
            pipeline = Pipeline(branch)
            self._pipelines.append(pipeline)
        logger.debug(f"Finished creating {self} pipelines")

    #
    # Frame processor
    #

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Run `frame` through every branch, then forward the results."""
        if direction == FrameDirection.UPSTREAM:
            # If we get an upstream frame we process it in each sink.
            await asyncio.gather(*[s.process_frame(frame, direction) for s in self._sinks])
        elif direction == FrameDirection.DOWNSTREAM:
            # If we get a downstream frame we process it in each source (using the pipeline).
            await asyncio.gather(*[p.process_frame(frame, direction) for p in self._pipelines])

        await self._drain_queue(self._up_queue, FrameDirection.UPSTREAM)
        await self._drain_queue(self._down_queue, FrameDirection.DOWNSTREAM)

    async def _drain_queue(self, queue: asyncio.Queue, direction: FrameDirection):
        """Push everything currently in `queue` in `direction`, skipping
        frames whose id was already pushed during this drain.
        """
        seen_ids = set()
        while not queue.empty():
            queued = await queue.get()
            if queued and queued.id not in seen_ids:
                await self.push_frame(queued, direction)
                seen_ids.add(queued.id)
            queue.task_done()

View File

@@ -0,0 +1,50 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import re
from typing import List
from pipecat.frames.frames import EndFrame, Frame, TextFrame
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
class SentenceAggregator(FrameProcessor):
    """This frame processor aggregates text frames into complete sentences.

    Incoming text is buffered until a chunk containing sentence-ending
    punctuation (``.``, ``?`` or ``!``) arrives. Everything up to and including
    the last such character of that chunk is pushed as a single TextFrame and
    the remainder of the chunk starts the next buffer. An EndFrame flushes any
    buffered text before being forwarded. All other frames pass through
    untouched.

    Frame input/output:
        TextFrame("Hello,") -> None
        TextFrame(" world.") -> TextFrame("Hello, world.")
    """

    def __init__(self):
        super().__init__()
        # Text received so far that does not yet end a sentence.
        self._aggregation = ""

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Buffer text frames, emit complete sentences, forward everything else."""
        if isinstance(frame, TextFrame):
            # DOTALL lets "." span newlines. Without it, text on other lines of
            # a multi-line chunk was silently dropped instead of being emitted
            # or carried over.
            m = re.search(r"(.*[?.!])(.*)", frame.text, re.DOTALL)
            if m:
                await self.push_frame(TextFrame(self._aggregation + m.group(1)))
                self._aggregation = m.group(2)
            else:
                self._aggregation += frame.text
        elif isinstance(frame, EndFrame):
            if self._aggregation:
                # Clear the buffer before pushing so the same text can never
                # be emitted twice if the processor keeps being used.
                remainder = self._aggregation
                self._aggregation = ""
                await self.push_frame(TextFrame(remainder))
            await self.push_frame(frame)
        else:
            await self.push_frame(frame, direction)

View File

@@ -0,0 +1,139 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.frames.frames import (
Frame,
InterimTranscriptionFrame,
TextFrame,
TranscriptionFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame)
class ResponseAggregator(FrameProcessor):
    """This frame processor aggregates frames between a start and an end frame
    into complete text frame sentences.

    The start, end, accumulator and (optional) interim accumulator arguments
    are frame *classes*; incoming frames are matched against them with
    ``isinstance``. Each accumulated text is prefixed with a space and the
    final aggregation is stripped before being pushed.

    For example, frame input/output:
        UserStartedSpeakingFrame() -> None
        TranscriptionFrame("Hello,") -> None
        TranscriptionFrame("world.") -> None
        UserStoppedSpeakingFrame() -> TextFrame("Hello, world.")

    Start, end, accumulator and interim frames are consumed; any other frame
    is forwarded in the direction it arrived.
    """

    def __init__(
        self,
        *,
        start_frame: type[Frame],
        end_frame: type[Frame],
        accumulator_frame: type[TextFrame],
        interim_accumulator_frame: type[TextFrame] | None = None
    ):
        super().__init__()

        self._start_frame = start_frame
        self._end_frame = end_frame
        self._accumulator_frame = accumulator_frame
        self._interim_accumulator_frame = interim_accumulator_frame

        self._seen_start_frame = False
        self._seen_end_frame = False
        self._seen_interim_results = False

        self._aggregation = ""
        self._aggregating = False

    #
    # Frame processor
    #

    # Use cases implemented:
    #
    # S: Start, E: End, T: Transcription, I: Interim, X: Text
    #
    #        S E -> None
    #      S T E -> X
    #    S I T E -> X
    #    S I E T -> X
    #  S I E I T -> X
    #
    # The following case would not be supported:
    #
    #    S I E T1 I T2 -> X
    #
    # and T2 would be dropped.

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Update the aggregation state with `frame` and emit when complete."""
        send_aggregation = False

        if isinstance(frame, self._start_frame):
            self._seen_start_frame = True
            self._aggregating = True
        elif isinstance(frame, self._end_frame):
            self._seen_end_frame = True

            # We might have received the end frame but we might still be
            # aggregating (i.e. we have seen interim results but not the final
            # text).
            self._aggregating = self._seen_interim_results

            # Send the aggregation if we are not aggregating anymore (i.e. no
            # more interim results received).
            send_aggregation = not self._aggregating
        elif isinstance(frame, self._accumulator_frame):
            if self._aggregating:
                self._aggregation += f" {frame.text}"
                # We have received a complete sentence, so if we have seen the
                # end frame and we were still aggregating, it means we should
                # send the aggregation.
                send_aggregation = self._seen_end_frame

            # We just got our final result, so let's reset interim results.
            self._seen_interim_results = False
        elif self._interim_accumulator_frame and isinstance(frame, self._interim_accumulator_frame):
            self._seen_interim_results = True
        else:
            await self.push_frame(frame, direction)

        if send_aggregation:
            await self._push_aggregation()

    async def _push_aggregation(self):
        """Emit the aggregated text (if any) and start a fresh utterance."""
        aggregation = self._aggregation

        # Always reset, even when there is nothing to send (the "S E" case).
        # Otherwise a stale `_seen_end_frame` would make the next utterance
        # emit on its first accumulator frame instead of waiting for its end
        # frame. Resetting before pushing also keeps the state consistent if
        # the push is interrupted.
        self._aggregation = ""
        self._seen_start_frame = False
        self._seen_end_frame = False
        self._seen_interim_results = False

        if len(aggregation) > 0:
            await self.push_frame(TextFrame(aggregation.strip()))
class UserResponseAggregator(ResponseAggregator):
    """ResponseAggregator preconfigured for user speech.

    Aggregation is delimited by UserStartedSpeakingFrame and
    UserStoppedSpeakingFrame; TranscriptionFrame carries the final text and
    InterimTranscriptionFrame the interim results.
    """

    def __init__(self):
        frame_types = {
            "start_frame": UserStartedSpeakingFrame,
            "end_frame": UserStoppedSpeakingFrame,
            "accumulator_frame": TranscriptionFrame,
            "interim_accumulator_frame": InterimTranscriptionFrame,
        }
        super().__init__(**frame_types)

View File

@@ -0,0 +1,45 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
from pipecat.frames.frames import Frame, ImageRawFrame, TextFrame, VisionImageRawFrame
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
class VisionImageFrameAggregator(FrameProcessor):
    """Pair a TextFrame with the ImageRawFrame that follows it.

    The text of the most recent TextFrame is remembered (the TextFrame itself
    is consumed). When an ImageRawFrame arrives while a text is pending, a
    VisionImageRawFrame carrying that text and the image's data, size and
    format is pushed and the pending text is cleared. An ImageRawFrame that
    arrives with no pending text is not forwarded. Every other frame passes
    through in its original direction.

    Frame input/output:
        TextFrame("What do you see?") -> None
        ImageRawFrame(...)            -> VisionImageRawFrame(text="What do you see?", ...)
    """

    def __init__(self):
        super().__init__()
        # Text waiting to be attached to the next image, if any.
        self._describe_text = None

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Remember text, combine it with the next image, forward the rest."""
        if isinstance(frame, TextFrame):
            self._describe_text = frame.text
            return

        if not isinstance(frame, ImageRawFrame):
            await self.push_frame(frame, direction)
            return

        # An image without a pending description is ignored.
        if not self._describe_text:
            return

        vision_frame = VisionImageRawFrame(
            text=self._describe_text,
            image=frame.image,
            size=frame.size,
            format=frame.format)
        await self.push_frame(vision_frame)
        self._describe_text = None

View File

@@ -0,0 +1,34 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
from typing import List
from pipecat.frames.frames import AppFrame, ControlFrame, Frame, SystemFrame
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
class Filter(FrameProcessor):
    """Forward only frames of the configured types.

    AppFrame, ControlFrame and SystemFrame instances are always forwarded,
    whatever types were configured. Frames that do not qualify are dropped.
    """

    def __init__(self, types: List[type]):
        super().__init__()
        # Frame classes that are allowed through.
        self._types = types

    #
    # Frame processor
    #

    def _should_passthrough_frame(self, frame):
        """Return True if `frame` must be forwarded."""
        if any(isinstance(frame, allowed) for allowed in self._types):
            return True
        return isinstance(frame, (AppFrame, ControlFrame, SystemFrame))

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Forward `frame` in `direction` when it passes the filter."""
        if not self._should_passthrough_frame(frame):
            return
        await self.push_frame(frame, direction)

View File

@@ -0,0 +1,54 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
from asyncio import AbstractEventLoop
from enum import Enum
from pipecat.frames.frames import Frame
from pipecat.utils.utils import obj_count, obj_id
from loguru import logger
class FrameDirection(Enum):
    """Direction in which a frame travels through linked processors."""

    # Towards the next linked processor.
    DOWNSTREAM = 1
    # Towards the previous linked processor.
    UPSTREAM = 2
class FrameProcessor:
    """Base class for a node in a doubly linked chain of frame processors.

    Subclasses override `process_frame` to handle incoming frames and call
    `push_frame` to hand frames to the neighbouring processor. The running
    asyncio event loop is captured at construction time, so instances have to
    be created while an event loop is running.
    """

    def __init__(self):
        self.id: int = obj_id()
        self.name = f"{self.__class__.__name__}#{obj_count(self)}"

        # Neighbours in the chain; populated by link().
        self._prev: "FrameProcessor | None" = None
        self._next: "FrameProcessor | None" = None

        self._loop: AbstractEventLoop = asyncio.get_running_loop()

    async def cleanup(self):
        """Hook for releasing resources; does nothing by default."""
        pass

    def link(self, processor: 'FrameProcessor'):
        """Make `processor` the next processor and this one its previous."""
        self._next = processor
        processor._prev = self
        logger.debug(f"Linking {self} -> {self._next}")

    def get_event_loop(self) -> AbstractEventLoop:
        """Return the event loop captured when this processor was created."""
        return self._loop

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Handle an incoming frame; the base implementation ignores it."""
        pass

    async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
        """Deliver `frame` to the neighbour in `direction`.

        Downstream frames go to the next processor and upstream frames to the
        previous one. Nothing happens when there is no neighbour on that side.
        """
        if direction == FrameDirection.DOWNSTREAM:
            if self._next:
                logger.trace(f"Pushing {frame} from {self} to {self._next}")
                await self._next.process_frame(frame, direction)
        elif direction == FrameDirection.UPSTREAM:
            if self._prev:
                logger.trace(f"Pushing {frame} upstream from {self} to {self._prev}")
                await self._prev.process_frame(frame, direction)

    def __str__(self):
        return self.name

View File

@@ -0,0 +1,22 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
from pipecat.frames.frames import Frame
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
class FrameLogger(FrameProcessor):
    """Print every frame passing through, then forward it unchanged.

    Upstream frames are printed with a leading ``<`` and downstream frames
    with a leading ``>``, followed by the configured prefix.
    """

    def __init__(self, prefix="Frame"):
        super().__init__()
        # Label printed in front of each frame.
        self._prefix = prefix

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Print `frame` with a direction marker and pass it along."""
        if direction == FrameDirection.UPSTREAM:
            print(f"< {self._prefix}: {frame}")
        elif direction == FrameDirection.DOWNSTREAM:
            print(f"> {self._prefix}: {frame}")
        await self.push_frame(frame, direction)

View File

@@ -0,0 +1,36 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
from typing import Coroutine
from pipecat.frames.frames import Frame, TextFrame
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
class StatelessTextTransformer(FrameProcessor):
    """This processor calls the given function on any text in a text frame.

    The function receives the frame's text and may be a plain function or
    return a coroutine, which is awaited. The transformed text is pushed as a
    new TextFrame. Frames that are not TextFrames pass through unchanged.

    Frame input/output, for ``StatelessTextTransformer(lambda x: x.upper())``:
        TextFrame("Hello") -> TextFrame("HELLO")
    """

    def __init__(self, transform_fn):
        super().__init__()
        # Callable applied to the text of every TextFrame.
        self._transform_fn = transform_fn

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Transform TextFrames and forward everything else as-is."""
        if isinstance(frame, TextFrame):
            result = self._transform_fn(frame.text)
            if isinstance(result, Coroutine):
                result = await result
            # The transform works on text, but only frames may travel between
            # processors: wrap the result instead of pushing a bare string. A
            # transform that already returns a frame is pushed untouched.
            if not isinstance(result, Frame):
                result = TextFrame(result)
            await self.push_frame(result)
        else:
            await self.push_frame(frame, direction)

View File

@@ -0,0 +1,25 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
from typing import List
from pipecat.frames.frames import AudioRawFrame
def maybe_split_audio_frame(frame: AudioRawFrame, largest_write_size: int) -> List[AudioRawFrame]:
    """Subdivide large audio frames to enable interruption.

    Args:
        frame: the audio frame to examine.
        largest_write_size: maximum length of ``frame.audio`` allowed in a
            single frame.

    Returns:
        A list holding `frame` itself when its audio fits in
        `largest_write_size`. Otherwise, new AudioRawFrames covering
        consecutive slices of at most `largest_write_size` of the audio, with
        the original sample rate and channel count.
    """
    if len(frame.audio) <= largest_write_size:
        return [frame]

    return [
        AudioRawFrame(
            audio=frame.audio[start: start + largest_write_size],
            sample_rate=frame.sample_rate,
            num_channels=frame.num_channels)
        for start in range(0, len(frame.audio), largest_write_size)
    ]

View File

@@ -1,6 +1,6 @@
from abc import abstractmethod
from dailyai.pipeline.frames import Frame
from pipecat.pipeline.frames import Frame
class FrameSerializer:

View File

@@ -1,8 +1,8 @@
import dataclasses
from typing import Text
from dailyai.pipeline.frames import AudioFrame, Frame, TextFrame, TranscriptionFrame
import dailyai.pipeline.protobufs.frames_pb2 as frame_protos
from dailyai.serializers.abstract_frame_serializer import FrameSerializer
from pipecat.pipeline.frames import AudioFrame, Frame, TextFrame, TranscriptionFrame
import pipecat.pipeline.protobufs.frames_pb2 as frame_protos
from pipecat.serializers.abstract_frame_serializer import FrameSerializer
class ProtobufFrameSerializer(FrameSerializer):

View File

View File

@@ -0,0 +1,171 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import array
import io
import math
import wave
from abc import abstractmethod
from typing import BinaryIO
from pipecat.frames.frames import (
AudioRawFrame,
EndFrame,
Frame,
TextFrame,
VisionImageRawFrame,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
class AIService(FrameProcessor):
    """Common base class of the AI services defined in this module."""

    def __init__(self):
        # No state of its own; just initialise the frame processor.
        super().__init__()
class LLMService(AIService):
    """Base class for LLM services; adds no behavior of its own."""

    def __init__(self):
        # Nothing to configure beyond the base service.
        super().__init__()
class TTSService(AIService):
    """Base class for text-to-speech services.

    TextFrames are converted to speech through `run_tts`, which subclasses
    implement. With `aggregate_sentences` enabled (the default) text is
    buffered until the buffer, ignoring surrounding whitespace, ends with
    ``.``, ``?`` or ``!``; otherwise every TextFrame is synthesized as it
    arrives. An EndFrame flushes any buffered text before being forwarded.
    All other frames pass through unchanged.
    """

    def __init__(self, aggregate_sentences: bool = True):
        super().__init__()
        self._aggregate_sentences: bool = aggregate_sentences
        # Text received so far that does not yet form a complete sentence.
        self._current_sentence: str = ""

    # Converts the text to audio.
    @abstractmethod
    async def run_tts(self, text: str):
        pass

    async def say(self, text: str):
        """Feed `text` to this service as a downstream TextFrame."""
        await self.process_frame(TextFrame(text=text), FrameDirection.DOWNSTREAM)

    async def _process_text_frame(self, frame: TextFrame):
        """Buffer or synthesize the text of `frame`, depending on configuration."""
        text: str | None = None
        if not self._aggregate_sentences:
            text = frame.text
        else:
            self._current_sentence += frame.text
            if self._current_sentence.strip().endswith((".", "?", "!")):
                text = self._current_sentence
                self._current_sentence = ""

        if text:
            await self.run_tts(text)

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Synthesize text frames, flush on EndFrame, forward the rest."""
        if isinstance(frame, TextFrame):
            await self._process_text_frame(frame)
        elif isinstance(frame, EndFrame):
            if self._current_sentence:
                # Clear the buffer before synthesizing so the leftover text is
                # not spoken a second time if the service keeps being used
                # (for example through say()).
                pending = self._current_sentence
                self._current_sentence = ""
                await self.run_tts(pending)
            await self.push_frame(frame)
        else:
            await self.push_frame(frame, direction)
class STTService(AIService):
    """STTService is a base class for speech-to-text services.

    Audio frames whose RMS volume reaches `min_rms` are appended to an
    in-memory WAV buffer. Once more than `max_silence_frames` consecutive
    quieter frames have been seen and the buffer is not empty, the buffered
    audio is handed to `run_stt` and a new buffer is started. Audio frames are
    consumed; every other frame passes through unchanged.

    Args:
        min_rms: minimum RMS volume for a frame to be buffered.
        max_silence_frames: number of consecutive quiet frames that must be
            exceeded before the buffered audio is transcribed.
        sample_rate: sample rate written to the WAV header.
        num_channels: channel count written to the WAV header.
    """

    def __init__(self,
                 min_rms: int = 400,
                 max_silence_frames: int = 3,
                 sample_rate: int = 16000,
                 num_channels: int = 1):
        super().__init__()
        self._min_rms = min_rms
        self._max_silence_frames = max_silence_frames
        self._sample_rate = sample_rate
        self._num_channels = num_channels
        self._current_silence_frames = 0
        (self._content, self._wave) = self._new_wave()

    @abstractmethod
    async def run_stt(self, audio: BinaryIO):
        """Transcribe the WAV data in `audio`.

        `process_frame` ignores the value returned by this method.
        """
        pass

    def _new_wave(self):
        """Create an empty in-memory WAV writer with 2-byte samples.

        Returns:
            A ``(content, writer)`` tuple: the underlying binary buffer and the
            `wave` writer bound to it.
        """
        content = io.BufferedRandom(io.BytesIO())
        ww = wave.open(content, "wb")
        ww.setsampwidth(2)
        ww.setnchannels(self._num_channels)
        ww.setframerate(self._sample_rate)
        return (content, ww)

    def _get_volume(self, audio: bytes) -> float:
        """Return the RMS of `audio` read as signed 16-bit samples."""
        # https://docs.python.org/3/library/array.html
        audio_array = array.array('h', audio)
        if len(audio_array) == 0:
            # An empty frame carries no signal; avoid dividing by zero.
            return 0.0
        squares = [sample**2 for sample in audio_array]
        mean = sum(squares) / len(audio_array)
        rms = math.sqrt(mean)
        return rms

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Processes a frame of audio data, either buffering or transcribing it."""
        if not isinstance(frame, AudioRawFrame):
            await self.push_frame(frame, direction)
            return

        audio = frame.audio

        # Try to filter out empty background noise
        # (Very rudimentary approach, can be improved)
        rms = self._get_volume(audio)
        if rms >= self._min_rms:
            # If volume is high enough, write new data to wave file. Speech
            # also ends the current run of silence.
            self._wave.writeframes(audio)
            self._current_silence_frames = 0
        else:
            # Only quiet frames count as silence. Counting every frame would
            # cut the buffer in the middle of continuous speech.
            self._current_silence_frames += 1

        # If buffer is not empty and we detect a pause in speech longer than
        # the configured number of frames, transcribe the audio gathered so
        # far.
        if self._content.tell() > 0 and self._current_silence_frames > self._max_silence_frames:
            self._current_silence_frames = 0
            self._wave.close()
            self._content.seek(0)
            await self.run_stt(self._content)
            (self._content, self._wave) = self._new_wave()
class ImageGenService(AIService):
    """Base class for image generation services.

    The text of every TextFrame is passed to `run_image_gen` as the prompt and
    the TextFrame is consumed. All other frames pass through unchanged.
    """

    def __init__(self):
        super().__init__()

    # Renders the image for the given prompt. Implemented by subclasses.
    @abstractmethod
    async def run_image_gen(self, prompt: str):
        pass

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Generate an image for text frames and forward everything else."""
        if not isinstance(frame, TextFrame):
            await self.push_frame(frame, direction)
            return
        await self.run_image_gen(frame.text)
class VisionService(AIService):
    """Base class for vision services.

    Every VisionImageRawFrame is handed to `run_vision` and consumed. All
    other frames pass through unchanged.
    """

    def __init__(self):
        super().__init__()
        self._describe_text = None

    @abstractmethod
    async def run_vision(self, frame: VisionImageRawFrame):
        pass

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Run vision on vision image frames and forward everything else."""
        if not isinstance(frame, VisionImageRawFrame):
            await self.push_frame(frame, direction)
            return
        await self.run_vision(frame)

View File

@@ -0,0 +1,51 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
from pipecat.frames.frames import Frame, LLMMessagesFrame, TextFrame
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.ai_services import LLMService
from loguru import logger
# The Anthropic client is an optional dependency: explain how to install it
# before failing the import of this module.
try:
    from anthropic import AsyncAnthropic
except ModuleNotFoundError as e:
    logger.error(f"Exception: {e}")
    logger.error(
        "In order to use Anthropic, you need to `pip install pipecat-ai[anthropic]`. Also, set `ANTHROPIC_API_KEY` environment variable.")
    # Chain explicitly so the original import error is reported as the cause.
    raise Exception(f"Missing module: {e}") from e
class AnthropicLLMService(LLMService):
    """LLM service that streams completions from Anthropic.

    Each LLMMessagesFrame triggers a streaming request. The text of every
    ``content_block_delta`` event is pushed as a TextFrame as it arrives. All
    other frames pass through unchanged.

    Args:
        api_key: key used to create the AsyncAnthropic client.
        model: name of the model to query.
        max_tokens: maximum number of tokens requested per completion.
    """

    def __init__(
            self,
            api_key,
            model="claude-3-opus-20240229",
            max_tokens=1024):
        super().__init__()
        self.client = AsyncAnthropic(api_key=api_key)
        self.model = model
        self.max_tokens = max_tokens

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Stream a completion for message frames and forward everything else."""
        if isinstance(frame, LLMMessagesFrame):
            # Send the conversation carried by the frame instead of a fixed
            # placeholder prompt.
            # NOTE(review): assumes LLMMessagesFrame exposes its messages as
            # `.messages` in the format the Anthropic API expects - confirm.
            stream = await self.client.messages.create(
                max_tokens=self.max_tokens,
                messages=frame.messages,
                model=self.model,
                stream=True,
            )
            async for event in stream:
                if event.type == "content_block_delta":
                    await self.push_frame(TextFrame(event.delta.text))
        else:
            await self.push_frame(frame, direction)

View File

@@ -5,9 +5,11 @@ from openai import AsyncAzureOpenAI
from collections.abc import AsyncGenerator
from dailyai.services.ai_services import TTSService, ImageGenService
from pipecat.services.ai_services import TTSService, ImageGenService
from PIL import Image
from loguru import logger
# See .env.example for Azure configuration needed
try:
from azure.cognitiveservices.speech import (
@@ -17,12 +19,12 @@ try:
CancellationReason,
)
except ModuleNotFoundError as e:
print(f"Exception: {e}")
print(
"In order to use Azure TTS, you need to `pip install dailyai[azure]`. Also, set `SPEECH_KEY` and `SPEECH_REGION` environment variables.")
logger.error(f"Exception: {e}")
logger.error(
"In order to use Azure TTS, you need to `pip install pipecat-ai[azure]`. Also, set `AZURE_SPEECH_API_KEY` and `AZURE_SPEECH_REGION` environment variables.")
raise Exception(f"Missing module: {e}")
from dailyai.services.openai_api_llm_service import BaseOpenAILLMService
from pipecat.services.openai_api_llm_service import BaseOpenAILLMService
class AzureTTSService(TTSService):
@@ -97,23 +99,24 @@ class AzureImageGenServiceREST(ImageGenService):
endpoint,
model,
):
super().__init__(image_size=image_size)
super().__init__()
self._api_key = api_key
self._azure_endpoint = endpoint
self._api_version = api_version
self._model = model
self._aiohttp_session = aiohttp_session
self._image_size = image_size
async def run_image_gen(self, sentence) -> tuple[str, bytes]:
async def run_image_gen(self, prompt: str) -> tuple[str, bytes, tuple[int, int]]:
url = f"{self._azure_endpoint}openai/images/generations:submit?api-version={self._api_version}"
headers = {
"api-key": self._api_key,
"Content-Type": "application/json"}
body = {
# Enter your prompt text here
"prompt": sentence,
"size": self.image_size,
"prompt": prompt,
"size": self._image_size,
"n": 1,
}
async with self._aiohttp_session.post(
@@ -146,4 +149,4 @@ class AzureImageGenServiceREST(ImageGenService):
async with self._aiohttp_session.get(image_url) as response:
image_stream = io.BytesIO(await response.content.read())
image = Image.open(image_stream)
return (image_url, image.tobytes())
return (image_url, image.tobytes(), image.size)

View File

@@ -1,8 +1,17 @@
from collections.abc import AsyncGenerator
from dailyai.services.ai_services import TTSService
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
from pipecat.frames.frames import AudioRawFrame
from pipecat.services.ai_services import TTSService
from loguru import logger
class DeepgramTTSService(TTSService):
def __init__(
self,
*,
@@ -15,15 +24,13 @@ class DeepgramTTSService(TTSService):
self._api_key = api_key
self._aiohttp_session = aiohttp_session
def get_mic_sample_rate(self):
return 24000
async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
self.logger.info(f"Running deepgram tts for {sentence}")
async def run_tts(self, text: str):
logger.info(f"Running Deepgram TTS for {text}")
base_url = "https://api.beta.deepgram.com/v1/speak"
request_url = f"{base_url}?model={self._voice}&encoding=linear16&container=none&sample_rate=16000"
headers = {"authorization": f"token {self._api_key}"}
body = {"text": sentence}
body = {"text": text}
async with self._aiohttp_session.post(request_url, headers=headers, json=body) as r:
async for data in r.content:
yield data
frame = AudioRawFrame(audio=data, sample_rate=16000, num_channels=1)
await self.push_frame(frame)

View File

@@ -0,0 +1,58 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import aiohttp
from pipecat.frames.frames import AudioRawFrame, TTSStartedFrame, TTSStoppedFrame
from pipecat.services.ai_services import TTSService
from loguru import logger
class ElevenLabsTTSService(TTSService):
    """Text-to-speech service backed by the ElevenLabs streaming HTTP API.

    Args:
        aiohttp_session: session used to perform the HTTP request.
        api_key: ElevenLabs API key, sent in the ``xi-api-key`` header.
        voice_id: identifier of the voice, used in the request URL.
        model: identifier of the model, sent as ``model_id``.
    """

    def __init__(
            self,
            *,
            aiohttp_session: aiohttp.ClientSession,
            api_key: str,
            voice_id: str,
            model: str = "eleven_turbo_v2",
    ):
        super().__init__()

        self._api_key = api_key
        self._voice_id = voice_id
        self._aiohttp_session = aiohttp_session
        self._model = model

    async def run_tts(self, text: str):
        """Request speech for `text` and push the audio as it is received.

        On success a TTSStartedFrame is pushed, followed by one AudioRawFrame
        (16000 Hz, 1 channel) per non-empty chunk of the response, followed by
        a TTSStoppedFrame. On a non-200 response the error is logged and
        nothing is pushed.
        """
        logger.debug(f"Transcribing text: {text}")

        url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._voice_id}/stream"

        payload = {"text": text, "model_id": self._model}

        querystring = {
            "output_format": "pcm_16000",
            "optimize_streaming_latency": 2}

        headers = {
            "xi-api-key": self._api_key,
            "Content-Type": "application/json",
        }

        async with self._aiohttp_session.post(url, json=payload, headers=headers, params=querystring) as r:
            if r.status != 200:
                # `text` is a coroutine method on the response: it has to be
                # called and awaited to obtain the error body. Formatting
                # `r.text` directly only logs the bound method's repr.
                error_text = await r.text()
                logger.error(f"Audio fetch status code: {r.status}, error: {error_text}")
                return

            await self.push_frame(TTSStartedFrame())
            async for chunk in r.content:
                if len(chunk) > 0:
                    frame = AudioRawFrame(chunk, 16000, 1)
                    await self.push_frame(frame)
            await self.push_frame(TTSStoppedFrame())

View File

@@ -0,0 +1,83 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import aiohttp
import io
import os
from PIL import Image
from numpy import result_type
from pydantic import BaseModel
from typing import Optional, Union, Dict
from pipecat.frames.frames import URLImageRawFrame
from pipecat.services.ai_services import ImageGenService
from loguru import logger
# The Fal client is an optional dependency: explain how to install it before
# failing the import of this module.
try:
    import fal_client
except ModuleNotFoundError as e:
    logger.error(f"Exception: {e}")
    logger.error(
        "In order to use Fal, you need to `pip install pipecat-ai[fal]`. Also, set `FAL_KEY` environment variable.")
    # Chain explicitly so the original import error is reported as the cause.
    raise Exception(f"Missing module: {e}") from e
class FalImageGenService(ImageGenService):
    """Image generation service backed by Fal.

    Args:
        aiohttp_session: session used to download the generated image.
        params: generation parameters sent along with the prompt.
        model: name of the Fal model to run.
        key: optional Fal key; when given it is stored in the ``FAL_KEY``
            environment variable.
    """

    # Generation parameters; dumped into the request arguments as-is.
    class InputParams(BaseModel):
        seed: Optional[int] = None
        num_inference_steps: int = 8
        num_images: int = 1
        image_size: Union[str, Dict[str, int]] = "square_hd"
        expand_prompt: bool = False
        enable_safety_checker: bool = True
        format: str = "png"

    def __init__(
        self,
        *,
        aiohttp_session: aiohttp.ClientSession,
        params: InputParams,
        model: str = "fal-ai/fast-sdxl",
        key: str | None = None,
    ):
        super().__init__()
        self._model = model
        self._params = params
        self._aiohttp_session = aiohttp_session
        if key:
            os.environ["FAL_KEY"] = key

    async def run_image_gen(self, prompt: str):
        """Generate an image for `prompt` and push it as a URLImageRawFrame.

        The URL of the first generated image is downloaded and decoded, and a
        frame with the URL, the image bytes, its size and its format is
        pushed. When no image URL is returned, the failure is logged and
        nothing is pushed.
        """
        logger.debug(f"Generating image from prompt: {prompt}")

        arguments = {"prompt": prompt, **self._params.model_dump()}
        result = await fal_client.run_async(self._model, arguments=arguments)

        image_url = None
        if result:
            image_url = result["images"][0]["url"]

        if not image_url:
            logger.error("Image generation failed")
            return

        logger.debug(f"Image generated at: {image_url}")

        # Load the image from the url
        logger.debug(f"Downloading image {image_url} ...")
        async with self._aiohttp_session.get(image_url) as download:
            logger.debug(f"Downloaded image {image_url}")
            raw = await download.content.read()
            image = Image.open(io.BytesIO(raw))

            await self.push_frame(
                URLImageRawFrame(
                    url=image_url,
                    image=image.tobytes(),
                    size=image.size,
                    format=image.format))

View File

@@ -0,0 +1,24 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
from pipecat.services.openai import BaseOpenAILLMService
from loguru import logger
# The OpenAI client is an optional dependency: explain how to install it
# before failing the import of this module.
try:
    from openai import AsyncOpenAI
except ModuleNotFoundError as e:
    logger.error(f"Exception: {e}")
    logger.error(
        "In order to use Fireworks, you need to `pip install pipecat-ai[fireworks]`. Also, set the `FIREWORKS_API_KEY` environment variable.")
    # Chain explicitly so the original import error is reported as the cause.
    raise Exception(f"Missing module: {e}") from e
class FireworksLLMService(BaseOpenAILLMService):
    """OpenAI-compatible LLM service configured with Fireworks defaults.

    Args:
        model: name of the Fireworks model to use.
        base_url: URL of the Fireworks inference endpoint.
    """

    def __init__(
            self,
            model="accounts/fireworks/models/firefunction-v1",
            base_url="https://api.fireworks.ai/inference/v1"):
        # NOTE(review): both values are passed positionally - confirm that the
        # second positional parameter of BaseOpenAILLMService.__init__ really
        # is the base URL.
        super().__init__(model, base_url)

Some files were not shown because too many files have changed in this diff Show More