Compare commits

...

90 Commits

Author SHA1 Message Date
James Hush
1884ff3f09 logging 2024-11-27 19:38:37 +08:00
James Hush
f34e6bce94 Switch questions 2024-11-27 15:10:50 +08:00
James Hush
909bb30517 Better recreation 2024-11-27 14:08:01 +08:00
James Hush
632bae7eee Interrupted? 2024-11-27 12:21:45 +08:00
James Hush
cedccdcbc0 Add interruptions 2024-11-27 11:50:28 +08:00
James Hush
1893784b89 Save race bot 2024-11-27 11:36:28 +08:00
James Hush
e2384e2484 fix: add logging and error handling for issue #721 2024-11-26 11:22:58 +08:00
Mark Backman
98c0a6e047 Merge pull request #749 from pipecat-ai/mb/pipecat-flows-standalone
Make Pipecat Flows an independent package
2024-11-25 17:09:11 -05:00
Mark Backman
f599e160de Make Pipecat Flows an independent package 2024-11-25 13:42:08 -05:00
Mark Backman
11c5d822f9 Merge pull request #746 from pipecat-ai/mb/update-flows
Bumping pipecat-ai-flows version
2024-11-22 11:25:03 -05:00
Mark Backman
c3e22f0931 Bumping pipecat-ai-flows version 2024-11-22 11:21:40 -05:00
Kwindla Hultman Kramer
9409546f90 Merge pull request #743 from pipecat-ai/khk/gemini-exp
Empty text content bug fix for Gemini
2024-11-21 14:04:28 -08:00
Kwindla Hultman Kramer
8ddac0ccd8 Testing with gemini-exp-1114. Bug fix 2024-11-21 10:33:12 -08:00
Mark Backman
f938960d50 Merge pull request #736 from pipecat-ai/mb/language-support
Make language support more robust
2024-11-20 13:03:47 -05:00
Mark Backman
2981d87bc1 Update changelog 2024-11-20 12:56:35 -05:00
Mark Backman
106042bbb2 Make language support more robust 2024-11-20 12:56:11 -05:00
Filipi da Silva Fuchter
d25ddeb962 Merge pull request #739 from pipecat-ai/krisp_v7
bumping krisp to support v7
2024-11-20 11:39:39 -03:00
Filipi Fuchter
c441baa692 bumping krisp to support v7 2024-11-20 11:37:45 -03:00
Mark Backman
676ff14913 Merge pull request #735 from pipecat-ai/vp-internal-push-frame-fix
internal push frame fix
2024-11-20 06:34:40 -05:00
Vanessa Pyne
14893ade92 Update src/pipecat/processors/frame_processor.py
Co-authored-by: Mark Backman <mark@daily.co>
2024-11-19 22:37:58 -06:00
Mark Backman
2a39ff69d6 Merge pull request #720 from pipecat-ai/mb/conversation-flow 2024-11-19 21:46:20 -05:00
Mark Backman
e79289454a Merge pull request #734 from pipecat-ai/mb/fix-cartesia 2024-11-19 21:27:52 -05:00
Mark Backman
25d02da1b2 Merge pull request #738 from pipecat-ai/mb/natural-conversation-demo 2024-11-19 21:27:38 -05:00
Mark Backman
a36fc370fa Improve the 22c foundational example 2024-11-19 15:49:40 -05:00
Mark Backman
e4c2f6d4c2 Update changelog 2024-11-18 21:32:53 -05:00
Mark Backman
97659ca3f0 Use the new pipecat-ai-flows module 2024-11-18 21:29:35 -05:00
vipyne
e00c75ce3f fix: raise exception in internal_push_frame 2024-11-18 16:01:04 -06:00
Mark Backman
cf62167f54 Revert: services(cartesia): generated TTSStoppedFrame after no more audio 2024-11-18 12:25:04 -05:00
Mark Backman
b3dfeb61c4 Add CHANGELOG entry 2024-11-18 12:18:20 -05:00
Mark Backman
bd020320cd Support a list of messages 2024-11-18 12:18:20 -05:00
Mark Backman
7a55d2d7db Add end session handler and update example 2024-11-18 12:18:20 -05:00
Mark Backman
b7308dca5d Fix issue where actions would execute on terminating nodes 2024-11-18 12:18:20 -05:00
Mark Backman
5301f44b3b Add pre- and post-actions 2024-11-18 12:18:20 -05:00
Mark Backman
686165b95a Add ability to register actions 2024-11-18 12:18:20 -05:00
Mark Backman
4e0ecdd673 Class name updates and remove FrameProcessor base class 2024-11-18 12:18:20 -05:00
Mark Backman
1b74560f9d Move function registration into the ConversationFlowProcessor class 2024-11-18 12:18:20 -05:00
Mark Backman
0c1070433f Clean up and commenting 2024-11-18 12:18:20 -05:00
Mark Backman
ece2c08cde debugging 2024-11-18 12:18:20 -05:00
Mark Backman
0b9742da9e Add a conversation flow processor 2024-11-18 12:18:20 -05:00
Aleix Conchillo Flaqué
635aa6eb5b Merge pull request #729 from pipecat-ai/aleix/fastapi-websocket-dont-close
transports(fastapi): don't try to close socket
2024-11-18 16:01:41 +01:00
Mark Backman
1ff17cc2b6 Merge pull request #733 from pipecat-ai/aleix/add-missing-init-files
processors: add missing __init__.py
2024-11-18 09:44:56 -05:00
Mark Backman
41ce9e9087 Merge pull request #697 from pipecat-ai/cst/leave-message
add handler for disconnect-bot message
2024-11-18 09:38:11 -05:00
Mark Backman
4803c54ecf Update CHANGELOG 2024-11-18 09:36:19 -05:00
Christian Stuff
5d7b3f2b38 add handler for disconnect-bot message 2024-11-18 09:33:30 -05:00
Aleix Conchillo Flaqué
23e5b1ec4d processors: add missing __init__.py 2024-11-18 11:32:20 +01:00
Aleix Conchillo Flaqué
7f5a8928b8 transports(fastapi): don't try to close socket
The websocket is passed from outside (in the transport constructor) so we should
not be trying to close it. FastAPI does actually close it later. We didn't see
any issue because these functions were not implemented properly. The value to
check was `application_state` instead of `client_state`. But in any case,
Pipecat should not be responsible for closing things passed from outside.
2024-11-18 01:15:19 +01:00
Aleix Conchillo Flaqué
53f675f5cf Merge pull request #727 from pipecat-ai/aleix/pipecat-0.0.49
update CHANGELOG for 0.0.49
2024-11-18 06:27:12 +08:00
Aleix Conchillo Flaqué
8173e4ce55 update CHANGELOG for 0.0.49 2024-11-17 23:26:09 +01:00
Aleix Conchillo Flaqué
5445bb0363 rtvi: add on_bot_started event 2024-11-17 22:40:00 +01:00
Mark Backman
a2a94724e5 Merge pull request #725 from pipecat-ai/mb/fix-simple-chatbot
Fix simple-chatbot example
2024-11-16 12:10:05 -05:00
Aleix Conchillo Flaqué
a8f9b0635a Merge pull request #722 from pipecat-ai/aleix/more-dailin-events
transports(daily): add more dial-in events
2024-11-17 01:09:01 +08:00
Mark Backman
4273a31fd5 Fix simple-chatbot example 2024-11-16 07:48:42 -05:00
Aleix Conchillo Flaqué
67f975a2c8 transports(daily): add more dial-in events 2024-11-16 01:22:50 +01:00
Mark Backman
d0bca67666 Merge pull request #716 from pipecat-ai/mb/mute-stt-service
Add STTMuteFilter to un/mute the STT
2024-11-14 19:55:00 -05:00
Mark Backman
966974bfc6 Change STTMuteProcessor to STTMuteFilter 2024-11-14 19:47:37 -05:00
Mark Backman
f807f233bd Suppress UserStartedSpeakingFrame and UserStoppedSpeakingFrame when muted 2024-11-14 17:11:51 -05:00
Mark Backman
33108f5798 Code review feedback 2024-11-14 17:05:08 -05:00
Mark Backman
52de825af8 Update CHANGELOG 2024-11-14 13:47:08 -05:00
Mark Backman
5fe679039c Add STTMuteProcessor to un/mute the STT 2024-11-14 13:35:02 -05:00
Kwindla Hultman Kramer
534f710f5d Merge pull request #688 from pipecat-ai/khk/natural-conversation
More work on llm-as-judge phrase endpointing
2024-11-14 09:15:16 -08:00
Mark Backman
53a11744a8 Merge pull request #712 from pipecat-ai/aleix/some-languages-tweaks
some languages tweaks
2024-11-14 09:33:26 -05:00
Mark Backman
72412cc0c4 Code review feedback 2024-11-14 09:31:04 -05:00
Mark Backman
b77ac07bc6 Merge pull request #715 from pipecat-ai/mb/update-readme-2
Add visual divider below Pipecat README image
2024-11-14 08:54:25 -05:00
Mark Backman
eb6926e0ce Add visual divider below Pipecat README image 2024-11-14 08:51:07 -05:00
Mark Backman
3b2c9de944 Merge pull request #713 from pipecat-ai/mb/update-readme
Update README
2024-11-14 08:45:28 -05:00
Mark Backman
27ff868e5a Move CONTRIBUTING to top directory 2024-11-14 08:43:03 -05:00
Mark Backman
57ef525a8e Update README 2024-11-14 08:43:03 -05:00
Aleix Conchillo Flaqué
d1db54d5fe examples(playht): use a 2.0 engine 2024-11-13 17:19:23 +01:00
Aleix Conchillo Flaqué
4f88fc0eb8 services(tts): initialize language to the proper language code 2024-11-13 17:19:23 +01:00
Aleix Conchillo Flaqué
37d1f4c4e1 services(tts): some language to service language cleanup 2024-11-13 17:19:23 +01:00
Aleix Conchillo Flaqué
ef9e86d997 services(playht): make sure we only skip wav header no matter the size 2024-11-13 17:19:23 +01:00
Aleix Conchillo Flaqué
2d2ef5a417 services(playht): voice engine is Play3.0-mini 2024-11-13 17:19:23 +01:00
Aleix Conchillo Flaqué
c1fff00586 services(playht): fix language codes 2024-11-13 17:19:23 +01:00
Mark Backman
0af2196f50 Merge pull request #708 from pipecat-ai/mb/add-rime-ai
Add RimeTTSService
2024-11-12 18:29:53 -05:00
Mark Backman
cd42320788 Update changelog 2024-11-12 18:28:04 -05:00
Mark Backman
70fce52499 Merge pull request #710 from pipecat-ai/mb/update-readme-krisp
Update Krisp README instructions
2024-11-12 11:15:25 -05:00
Mark Backman
70b60c0593 Update Krisp README instructions 2024-11-12 10:26:12 -05:00
Jon Taylor
2d8aa03f31 Merge pull request #706 from pipecat-ai/jpt/modal-example
barebones modal.com deployment example
2024-11-12 11:41:00 +00:00
Kwindla Hultman Kramer
581ff26704 Merge pull request #707 from pipecat-ai/khk/clean-up
tiny PR to remove old comment lines
2024-11-11 21:14:16 -08:00
Kwindla Hultman Kramer
335178ff06 some gemini audio input examples 2024-11-11 21:04:50 -08:00
Kwindla Hultman Kramer
ee53535f41 gemini audio-in with no transcription 2024-11-11 21:04:50 -08:00
Kwindla Hultman Kramer
91ac40307e small fix and more prompt examples 2024-11-11 21:04:50 -08:00
Kwindla Hultman Kramer
b6c2c1f730 anthropic natural conversation example using claude haiku 2024-11-11 21:04:50 -08:00
Kwindla Hultman Kramer
b56c789ae4 fixes for proposed judge pipeline 2024-11-11 21:04:50 -08:00
Kwindla Hultman Kramer
bd435d9e62 missing commit 2024-11-11 21:04:50 -08:00
Kwindla Hultman Kramer
55a81df84f contributing to llm-as-judge phrase endpointing work 2024-11-11 21:04:50 -08:00
Kwindla Hultman Kramer
87434460f5 temp hacking 2024-11-11 21:04:50 -08:00
Mark Backman
958ec42e8d Add Rime.ai TTS service 2024-11-11 21:58:09 -05:00
Jon Taylor
d1fff60d1d barebones modal.com deployment example 2024-11-11 22:30:07 +00:00
Kwindla Hultman Kramer
1438e5654a remove old comment 2024-11-10 16:08:10 -08:00
47 changed files with 4027 additions and 716 deletions

View File

@@ -5,6 +5,44 @@ All notable changes to **Pipecat** will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
### Added
- Added a new RTVI message called `disconnect-bot`, which when handled pushes
an `EndFrame` to trigger the pipeline to stop.
### Changed
- Expanded the transcriptions.language module to support a superset of
languages.
- Updated STT and TTS services with language options that match the supported
languages for each service.
## [0.0.49] - 2024-11-17
### Added
- Added RTVI `on_bot_started` event which is useful in a single turn
interaction.
- Added `DailyTransport` events `dialin-connected`, `dialin-stopped`,
`dialin-error` and `dialin-warning`. Needs daily-python >= 0.13.0.
- Added `RimeHttpTTSService` and the `07q-interruptible-rime.py` foundational
example.
- Added `STTMuteFilter`, a general-purpose processor that combines STT
muting and interruption control. When active, it prevents both transcription
and interruptions during bot speech. The processor supports multiple
strategies: `FIRST_SPEECH` (mute only during bot's first
speech), `ALWAYS` (mute during all bot speech), or `CUSTOM` (using provided
callback).
- Added `STTMuteFrame`, a control frame that enables/disables speech
transcription in STT services.
## [0.0.48] - 2024-11-10 "Antonio release"
### Added

103
README.md
View File

@@ -1,14 +1,21 @@
<div align="center">
<h1><div align="center">
 <img alt="pipecat" width="300px" height="auto" src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/pipecat.png">
</div>
# Pipecat
</div></h1>
[![PyPI](https://img.shields.io/pypi/v/pipecat-ai)](https://pypi.org/project/pipecat-ai) [![Discord](https://img.shields.io/discord/1239284677165056021)](https://discord.gg/pipecat) <a href="https://app.commanddash.io/agent/github_pipecat-ai_pipecat"><img src="https://img.shields.io/badge/AI-Code%20Agent-EB9FDA"></a>
`pipecat` is a framework for building voice (and multimodal) conversational agents. Things like personal coaches, meeting assistants, [story-telling toys for kids](https://storytelling-chatbot.fly.dev/), customer support bots, [intake flows](https://www.youtube.com/watch?v=lDevgsp9vn0), and snarky social companions.
Pipecat is an open source Python framework for building voice and multimodal conversational agents. It handles the complex orchestration of AI services, network transport, audio processing, and multimodal interactions, letting you focus on creating engaging experiences.
Take a look at some example apps:
## What you can build
- **Voice Assistants**: [Natural, real-time conversations with AI](https://demo.dailybots.ai/)
- **Interactive Agents**: Personal coaches and meeting assistants
- **Multimodal Apps**: Combine voice, video, images, and text
- **Creative Tools**: [Story-telling experiences](https://storytelling-chatbot.fly.dev/) and social companions
- **Business Solutions**: [Customer intake flows](https://www.youtube.com/watch?v=lDevgsp9vn0) and support bots
- **Complex conversational flows**: [Refer to Pipecat Flows](https://github.com/pipecat-ai/pipecat-flows) to learn more
## See it in action
<p float="left">
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/simple-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/simple-chatbot/image.png" width="280" /></a>&nbsp;
@@ -18,33 +25,54 @@ Take a look at some example apps:
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/moondream-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/moondream-chatbot/image.png" width="280" /></a>
</p>
## Getting started with voice agents
## Key features
- **Voice-first Design**: Built-in speech recognition, TTS, and conversation handling
- **Flexible Integration**: Works with popular AI services (OpenAI, ElevenLabs, etc.)
- **Pipeline Architecture**: Build complex apps from simple, reusable components
- **Real-time Processing**: Frame-based pipeline architecture for fluid interactions
- **Production Ready**: Enterprise-grade WebRTC and Websocket support
💡 Looking to build structured conversations? Check out [Pipecat Flows](https://github.com/pipecat-ai/pipecat-flows) for managing complex conversational states and transitions.
## Getting started
You can get started with Pipecat running on your local machine, then move your agent processes to the cloud when you're ready. You can also add a 📞 telephone number, 🖼️ image output, 📺 video input, use different LLMs, and more.
```shell
# install the module
# Install the module
pip install pipecat-ai
# set up an .env file with API keys
# Set up your environment
cp dot-env.template .env
```
By default, in order to minimize dependencies, only the basic framework functionality is available. Some third-party AI services require additional dependencies that you can install with:
To keep things lightweight, only the core framework is included by default. If you need support for third-party AI services, you can add the necessary dependencies with:
```shell
pip install "pipecat-ai[option,...]"
```
Your project may or may not need these, so they're made available as optional requirements. Here is a list:
Available options include:
- **AI services**: `anthropic`, `assemblyai`, `aws`, `azure`, `deepgram`, `gladia`, `google`, `fal`, `lmnt`, `moondream`, `openai`, `openpipe`, `playht`, `silero`, `whisper`, `xtts`
- **Transports**: `local`, `websocket`, `daily`
| Category | Services | Install Command Example |
| ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------- |
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/api-reference/services/stt/assemblyai), [Azure](https://docs.pipecat.ai/api-reference/services/stt/azure), [Deepgram](https://docs.pipecat.ai/api-reference/services/stt/deepgram), [Gladia](https://docs.pipecat.ai/api-reference/services/stt/gladia), [Whisper](https://docs.pipecat.ai/api-reference/services/stt/whisper) | `pip install "pipecat-ai[deepgram]"` |
| LLMs | [Anthropic](https://docs.pipecat.ai/api-reference/services/llm/anthropic), [Azure](https://docs.pipecat.ai/api-reference/services/llm/azure), [Fireworks AI](https://docs.pipecat.ai/api-reference/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/api-reference/services/llm/gemini), [Ollama](https://docs.pipecat.ai/api-reference/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/api-reference/services/llm/openai), [Together AI](https://docs.pipecat.ai/api-reference/services/llm/together) | `pip install "pipecat-ai[openai]"` |
| Text-to-Speech | [AWS](https://docs.pipecat.ai/api-reference/services/tts/aws), [Azure](https://docs.pipecat.ai/api-reference/services/tts/azure), [Cartesia](https://docs.pipecat.ai/api-reference/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/api-reference/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/api-reference/services/tts/elevenlabs), [Google](https://docs.pipecat.ai/api-reference/services/tts/google), [LMNT](https://docs.pipecat.ai/api-reference/services/tts/lmnt), [OpenAI](https://docs.pipecat.ai/api-reference/services/tts/openai), [PlayHT](https://docs.pipecat.ai/api-reference/services/tts/playht), [Rime](https://docs.pipecat.ai/api-reference/services/tts/rime), [XTTS](https://docs.pipecat.ai/api-reference/services/tts/xtts) | `pip install "pipecat-ai[cartesia]"` |
| Speech-to-Speech | [OpenAI Realtime](https://docs.pipecat.ai/api-reference/services/s2s/openai) | `pip install "pipecat-ai[openai]"` |
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/api-reference/services/transport/daily), WebSocket, Local | `pip install "pipecat-ai[daily]"` |
| Video | [Tavus](https://docs.pipecat.ai/api-reference/services/video/tavus) | `pip install "pipecat-ai[tavus]"` |
| Vision & Image | [Moondream](https://docs.pipecat.ai/api-reference/services/vision/moondream), [fal](https://docs.pipecat.ai/api-reference/services/image-generation/fal) | `pip install "pipecat-ai[moondream]"` |
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/api-reference/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/api-reference/utilities/audio/krisp-filter), [Noisereduce](https://docs.pipecat.ai/api-reference/utilities/audio/noisereduce-filter) | `pip install "pipecat-ai[silero]"` |
| Analytics & Metrics | [Canonical AI](https://docs.pipecat.ai/api-reference/services/analytics/canonical), [Sentry](https://docs.pipecat.ai/api-reference/services/analytics/sentry) | `pip install "pipecat-ai[canonical]"` |
📚 [View full services documentation →](https://docs.pipecat.ai/api-reference/services/supported-services)
## Code examples
- [foundational](https://github.com/pipecat-ai/pipecat/tree/main/examples/foundational) — small snippets that build on each other, introducing one or two concepts at a time
- [example apps](https://github.com/pipecat-ai/pipecat/tree/main/examples/) — complete applications that you can use as starting points for development
- [Foundational](https://github.com/pipecat-ai/pipecat/tree/main/examples/foundational) — small snippets that build on each other, introducing one or two concepts at a time
- [Example apps](https://github.com/pipecat-ai/pipecat/tree/main/examples/) — complete applications that you can use as starting points for development
## A simple voice agent running locally
@@ -109,7 +137,7 @@ Run it with:
python app.py
```
Daily provides a prebuilt WebRTC user interface. Whilst the app is running, you can visit at `https://<yourdomain>.daily.co/<room_url>` and listen to the bot say hello!
Daily provides a prebuilt WebRTC user interface. While the app is running, you can visit at `https://<yourdomain>.daily.co/<room_url>` and listen to the bot say hello!
## WebRTC for production use
@@ -119,34 +147,6 @@ One way to get up and running quickly with WebRTC is to sign up for a Daily deve
Sign up [here](https://dashboard.daily.co/u/signup) and [create a room](https://docs.daily.co/reference/rest-api/rooms) in the developer Dashboard.
## What is VAD?
Voice Activity Detection &mdash; very important for knowing when a user has finished speaking to your bot. If you are not using press-to-talk, and want Pipecat to detect when the user has finished talking, VAD is an essential component for a natural feeling conversation.
Pipecat makes use of WebRTC VAD by default when using a WebRTC transport layer. Optionally, you can use Silero VAD for improved accuracy at the cost of higher CPU usage.
```shell
pip install pipecat-ai[silero]
```
## Running the Krisp Audio Filter
To use the Krisp Filter in this project, you'll need access to the **Krisp C++ SDK**.
### Step 1: Obtain Access to the Krisp SDK
1. **Create a Krisp Account**: If you don't already have an account, [sign up at Krisp](https://krisp.ai/) to access the SDK.
2. **Download the SDK**: Once you have an account, follow the instructions on the Krisp platform to download the [Krisp's desktop SDKs](https://sdk.krisp.ai/sdk/desktop).
3. **Export the path to your Krisp SDK**:
`export KRISP_SDK_PATH=/PATH/TO/KRISP/SDK`
### Step 2: Install the `pipecat-krisp` Module
Once the environment variable `KRISP_SDK_PATH` is exported, activate your Python virtual environment and install it with `pip`:
```shell
source venv/bin/activate
pip install pipecat-ai[krisp]
```
## Hacking on the framework itself
_Note that you may need to set up a virtual environment before following the instructions below. For instance, you might need to run the following from the root of the repo:_
@@ -224,8 +224,23 @@ Install the
}
```
## Contributing
We welcome contributions from the community! Whether you're fixing bugs, improving documentation, or adding new features, here's how you can help:
- **Found a bug?** Open an [issue](https://github.com/pipecat-ai/pipecat/issues)
- **Have a feature idea?** Start a [discussion](https://discord.gg/pipecat)
- **Want to contribute code?** Check our [CONTRIBUTING.md](CONTRIBUTING.md) guide
- **Documentation improvements?** [Docs](https://github.com/pipecat-ai/docs) PRs are always welcome
Before submitting a pull request, please check existing issues and PRs to avoid duplicates.
We aim to review all contributions promptly and provide constructive feedback to help get your changes merged.
## Getting help
➡️ [Join our Discord](https://discord.gg/pipecat)
➡️ [Read the docs](https://docs.pipecat.ai)
➡️ [Reach us on X](https://x.com/pipecat_ai)

View File

@@ -0,0 +1,91 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
dist/
*.egg-info/
*.egg
.installed.cfg
.eggs/
downloads/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
MANIFEST
# Virtual Environments
venv/
env/
.env
.venv/
ENV/
env.bak/
venv.bak/
# IDE
.idea/
.vscode/
.spyderproject
.spyproject
.ropeproject
# Testing and Coverage
.coverage
.coverage.*
htmlcov/
.pytest_cache/
.tox/
.nox/
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
cover/
# Logs and Databases
*.log
*.db
db.sqlite3
db.sqlite3-journal
pip-log.txt
# System Files
.DS_Store
Thumbs.db
desktop.ini
*.swp
*.swo
*.bak
*.tmp
*~
# Build and Documentation
docs/_build/
.pybuilder/
target/
instance/
.webassets-cache
.pdm.toml
.pdm-python
.pdm-build/
__pypackages__/
# Other
*.mo
*.pot
*.sage.py
.mypy_cache/
.dmypy.json
dmypy.json
.pyre/
.pytype/
cython_debug/
.ipynb_checkpoints

View File

@@ -0,0 +1,37 @@
# Deploying Pipecat to Modal.com
Barebones deployment example for [modal.com](https://www.modal.com)
1. Install dependencies
```bash
python -m venv venv
source venv/bin/activate # or OS equivalent
pip install -r requirements.txt
```
2. Setup .env
```bash
cp env.example .env
```
Alternatively, you can configure your Modal app to use [secrets](https://modal.com/docs/guide/secrets)
3. Test the app locally
```bash
modal serve app.py
```
4. Deploy to production
```bash
modal deploy app.py
```
## Configuration options
This app sets some sensible defaults for reducing cold starts, such as `keep_warm=1`, which will keep at least 1 warm instance ready for your bot function.
It has been configured to only allow a concurrency of 1 (`max_inputs=1`) as each user will require their own running function.

View File

@@ -0,0 +1,75 @@
import os
import aiohttp
import modal
from fastapi import HTTPException
from fastapi.responses import JSONResponse
from loguru import logger
from bot import _voice_bot_process
# Value handed to DailyRESTHelper.get_token() in start() when the bot token is
# created; expressed in seconds (15 * 60).
MAX_SESSION_TIME = 15 * 60 # 15 minutes
# Modal application object; the functions below register themselves on it via
# the @app.function decorator.
app = modal.App("pipecat-modal")
# Container image shared by the functions below: Debian slim with Python 3.12
# and the packages listed in requirements.txt.
image = modal.Image.debian_slim(python_version="3.12").pip_install_from_requirements(
"requirements.txt"
)
@app.function(
    image=image,
    cpu=1.0,
    secrets=[modal.Secret.from_dotenv()],
    keep_warm=1,
    enable_memory_snapshot=True,
    max_inputs=1,  # Do not reuse instances across requests
    retries=0,
)
def launch_bot_process(room_url: str, token: str):
    """Run the voice bot for one Daily room inside a Modal function.

    Args:
        room_url: URL of the Daily room the bot should join.
        token: Daily token the bot uses to join that room.
    """
    _voice_bot_process(room_url=room_url, token=token)
@app.function(
    image=image,
    secrets=[modal.Secret.from_dotenv()],
)
@modal.web_endpoint(method="POST")
async def start():
    """Create a Daily room, spawn a bot into it and return the join details.

    Returns:
        JSONResponse whose body holds ``"room_url"`` and ``"token"``.

    Raises:
        HTTPException: with status 500 when the room has no URL or when no
            token could be obtained for it.
    """
    from pipecat.transports.services.helpers.daily_rest import (
        DailyRESTHelper,
        DailyRoomParams,
    )

    logger.info("Request received")

    async with aiohttp.ClientSession() as session:
        daily_rest_helper = DailyRESTHelper(
            daily_api_key=os.getenv("DAILY_API_KEY", ""),
            daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"),
            aiohttp_session=session,
        )

        # Create new Daily room
        room = await daily_rest_helper.create_room(DailyRoomParams())
        if not room.url:
            raise HTTPException(
                status_code=500,
                detail="Unable to create room",
            )
        logger.info(f"Created room: {room.url}")

        # Create bot token for room
        token = await daily_rest_helper.get_token(room.url, MAX_SESSION_TIME)
        if not token:
            raise HTTPException(status_code=500, detail=f"Failed to get token for room: {room.url}")
        # The token is a credential, so only the fact that it exists is logged.
        logger.info(f"Bot token created for room: {room.url}")

        # Spawn a new bot process
        launch_bot_process.spawn(room_url=room.url, token=token)

        # Return room URL to the user to join
        # Note: in production, you would want to return a token to the user
        # The key is the literal string "token"; an unquoted name here would
        # use the token's value as the JSON key.
        return JSONResponse(content={"room_url": room.url, "token": token})

View File

@@ -0,0 +1,90 @@
import asyncio
import os
import sys
from dotenv import load_dotenv
from loguru import logger
# Load variables from a .env file; override=True lets them replace values
# already present in the process environment.
load_dotenv(override=True)
# Remove loguru handler id 0 (the pre-installed default sink, per loguru's
# documentation) and log to stderr at DEBUG level instead.
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url: str, token: str):
    """Join a Daily room as a voice bot and run until a participant leaves.

    The pipeline is: transport input -> user context aggregator -> OpenAI LLM
    -> Cartesia TTS -> transport output -> assistant context aggregator.

    Args:
        room_url: URL of the Daily room to join.
        token: Daily token used to join the room.
    """
    from pipecat.audio.vad.silero import SileroVADAnalyzer
    from pipecat.frames.frames import EndFrame, LLMMessagesFrame
    from pipecat.pipeline.pipeline import Pipeline
    from pipecat.pipeline.runner import PipelineRunner
    from pipecat.pipeline.task import PipelineParams, PipelineTask
    from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
    from pipecat.services.cartesia import CartesiaTTSService
    from pipecat.services.openai import OpenAILLMService
    from pipecat.transports.services.daily import DailyParams, DailyTransport

    # Transport: audio out, transcription and Silero-based VAD all enabled.
    daily_params = DailyParams(
        audio_out_enabled=True,
        transcription_enabled=True,
        vad_enabled=True,
        vad_analyzer=SileroVADAnalyzer(),
    )
    transport = DailyTransport(room_url, token, "bot", daily_params)

    tts = CartesiaTTSService(
        api_key=os.getenv("CARTESIA_API_KEY", ""),
        voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22",
    )

    llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")

    # Conversation history; the join handler below appends to this same list.
    system_prompt = {
        "role": "system",
        "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
    }
    messages = [system_prompt]

    context = OpenAILLMContext(messages)
    context_aggregator = llm.create_context_aggregator(context)

    stages = [
        transport.input(),
        context_aggregator.user(),
        llm,
        tts,
        transport.output(),
        context_aggregator.assistant(),
    ]
    pipeline = Pipeline(stages)

    pipeline_params = PipelineParams(
        allow_interruptions=True,
        enable_metrics=True,
        enable_usage_metrics=True,
        report_only_initial_ttfb=True,
    )
    task = PipelineTask(pipeline, pipeline_params)

    @transport.event_handler("on_first_participant_joined")
    async def on_first_participant_joined(transport, participant):
        # Start transcribing the new participant, then ask the LLM to greet them.
        await transport.capture_participant_transcription(participant["id"])
        intro_request = {"role": "system", "content": "Please introduce yourself to the user."}
        messages.append(intro_request)
        await task.queue_frames([LLMMessagesFrame(messages)])

    @transport.event_handler("on_participant_left")
    async def on_participant_left(transport, participant, reason):
        # Queue an EndFrame on the task once a participant leaves.
        await task.queue_frame(EndFrame())

    runner = PipelineRunner()
    await runner.run(task)
def _voice_bot_process(room_url: str, token: str):
    """Synchronous entry point that runs ``main`` to completion via ``asyncio.run``.

    Args:
        room_url: URL of the Daily room, forwarded to ``main``.
        token: Daily token, forwarded to ``main``.
    """
    bot_coroutine = main(room_url, token)
    asyncio.run(bot_coroutine)

View File

@@ -0,0 +1,3 @@
DAILY_API_KEY=
OPENAI_API_KEY=
CARTESIA_API_KEY=

View File

@@ -0,0 +1,5 @@
python-dotenv==1.0.1
modal==0.65.48
pipecat-ai[daily,silero,cartesia,openai]==0.0.48
fastapi==0.115.4
aiohttp==3.10.10

View File

@@ -10,11 +10,12 @@ import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.frames.frames import BotSpeakingFrame, Frame, InputAudioRawFrame, LLMMessagesFrame, TTSAudioRawFrame, TextFrame, UserStoppedSpeakingFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
@@ -30,6 +31,22 @@ load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
class DebugProcessor(FrameProcessor):
    """Pass-through processor that logs the frames flowing through it.

    Frames of type InputAudioRawFrame, BotSpeakingFrame, TTSAudioRawFrame and
    TextFrame are not logged; every frame is pushed on unchanged.
    """

    def __init__(self, name, **kwargs):
        """Store the label used in log lines and initialise the base class.

        Args:
            name: Label included in every debug line this processor emits.
            **kwargs: Forwarded to ``FrameProcessor.__init__``.
        """
        self._name = name
        super().__init__(**kwargs)

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Log the frame unless it is one of the unlogged types, then forward it."""
        await super().process_frame(frame, direction)

        unlogged_types = (
            InputAudioRawFrame,
            BotSpeakingFrame,
            TTSAudioRawFrame,
            TextFrame,
        )
        if not isinstance(frame, unlogged_types):
            logger.debug(f"--- {self._name}: {frame} {direction}")

        await self.push_frame(frame, direction)
async def main():
async with aiohttp.ClientSession() as session:
@@ -63,11 +80,14 @@ async def main():
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
dp = DebugProcessor("dp")
pipeline = Pipeline(
[
transport.input(), # Transport user input
context_aggregator.user(), # User responses
dp,
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output

View File

@@ -49,7 +49,7 @@ async def main():
tts = PlayHTTTSService(
user_id=os.getenv("PLAYHT_USER_ID"),
api_key=os.getenv("PLAYHT_API_KEY"),
voice_url="s3://voice-cloning-zero-shot/801a663f-efd0-4254-98d0-5c175514c3e8/jennifer/manifest.json",
voice_url="s3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json",
params=PlayHTTTSService.InputParams(language=Language.EN),
)

View File

@@ -0,0 +1,278 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import aiohttp
import asyncio
import os
import sys
import google.ai.generativelanguage as glm
from dataclasses import dataclass
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.google import GoogleLLMService
from pipecat.processors.frame_processor import FrameProcessor
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.frames.frames import (
LLMFullResponseStartFrame,
LLMFullResponseEndFrame,
InputAudioRawFrame,
Frame,
StartInterruptionFrame,
TextFrame,
TranscriptionFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame,
)
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
# Separator the model is instructed to emit between its verbatim transcription of the user and
# its actual reply. The processors defined later in this file search for this exact string.
marker = "|----|"
# System prompt. It is an f-string so that the separator above is interpolated into the
# instructions and the worked example.
system_message = f"""
You are a helpful LLM in a WebRTC call. Your goals are to be helpful and brief in your responses.
You are expert at transcribing audio to text. You will receive a mixture of audio and text input. When
asked to transcribe what the user said, output an exact, word-for-word transcription.
Your output will be converted to audio so don't include special characters in your answers.
Each time you answer, you should respond in three parts.
1. Transcribe exactly what the user said.
2. Output the separator field '{marker}'.
3. Respond to the user's input in a helpful, creative way using only simple text and punctuation.
Example:
User: How many ounces are in a pound?
You: How many ounces are in a pound?
{marker}
There are 16 ounces in a pound.
"""
# Frame with a single `text` field. In this file it is created by TranscriptExtractor (holding the
# transcript portion of the model output) and read by TanscriptionContextFixup.
@dataclass
class MagicDemoTranscriptionFrame(Frame):
text: str
class UserAudioCollector(FrameProcessor):
    """Collect the user's raw audio and hand each finished utterance to the LLM context.

    While the user is not speaking, only a short rolling window of the most recent audio frames
    is kept. While the user is speaking, every audio frame is kept. When the user stops, the
    buffered frames are added to the context as an audio message and the user context aggregator
    is asked to push its context frame.

    Args:
        context: Context object; `add_audio_frames_message(audio_frames=...)` is called on it.
        user_context_aggregator: Object whose `get_context_frame()` result is pushed through
            its own `push_frame()` when an utterance ends.
    """

    # NOTE(review): assumes input audio is 16-bit PCM (2 bytes per sample) -- confirm against
    # the transport's audio format.
    _BYTES_PER_SAMPLE = 2

    def __init__(self, context, user_context_aggregator):
        super().__init__()
        self._context = context
        self._user_context_aggregator = user_context_aggregator
        self._audio_frames = []
        self._start_secs = 0.2  # this should match VAD start_secs (hardcoding for now)
        self._user_speaking = False

    async def process_frame(self, frame, direction):
        """Track speaking state, buffer audio, and forward every frame except transcriptions."""
        await super().process_frame(frame, direction)

        if isinstance(frame, TranscriptionFrame):
            # We could gracefully handle both audio input and text/transcription input ...
            # but let's leave that as an exercise to the reader. :-)
            return

        if isinstance(frame, UserStartedSpeakingFrame):
            self._user_speaking = True
        elif isinstance(frame, UserStoppedSpeakingFrame):
            self._user_speaking = False
            self._context.add_audio_frames_message(audio_frames=self._audio_frames)
            await self._user_context_aggregator.push_frame(
                self._user_context_aggregator.get_context_frame()
            )
        elif isinstance(frame, InputAudioRawFrame):
            # The frame is buffered in both states; only the trimming differs.
            self._audio_frames.append(frame)
            if not self._user_speaking:
                # Treat the buffer as a ring buffer, dropping the oldest frames as necessary.
                # Assume all audio frames have the same duration.
                #
                # Duration in seconds = bytes / (bytes per sample * channels * samples per second).
                # The earlier formula divided by 16 and multiplied by the channel count, which
                # under-reported the duration and kept far more than `_start_secs` of audio.
                bytes_per_second = self._BYTES_PER_SAMPLE * frame.num_channels * frame.sample_rate
                frame_duration = len(frame.audio) / bytes_per_second
                buffer_duration = frame_duration * len(self._audio_frames)
                while buffer_duration > self._start_secs:
                    self._audio_frames.pop(0)
                    buffer_duration -= frame_duration

        await self.push_frame(frame, direction)
class TranscriptExtractor(FrameProcessor):
    """Split streamed LLM text into the transcript part and the spoken reply.

    Text that arrives before the module-level `marker` is withheld and accumulated as the
    transcript. Text after the marker is pushed on. When the response ends, the accumulated
    transcript is pushed as a `MagicDemoTranscriptionFrame` ahead of the end frame.

    Args:
        context: Stored on the instance; not otherwise used by this class.
    """

    def __init__(self, context):
        super().__init__()
        self._context = context
        self._accumulator = ""
        self._processing_llm_response = False
        self._accumulating_transcript = False

    def reset(self):
        """Return to the idle state and discard any accumulated transcript text."""
        self._accumulator = ""
        self._processing_llm_response = False
        self._accumulating_transcript = False

    async def process_frame(self, frame, direction):
        """Withhold pre-marker text, forward post-marker text, emit the transcript at the end."""
        await super().process_frame(frame, direction)

        if isinstance(frame, LLMFullResponseStartFrame):
            # Start clean: if a previous response never delivered its end frame (presumably
            # possible on interruption -- confirm), its partial text must not leak into this one.
            self._accumulator = ""
            self._processing_llm_response = True
            self._accumulating_transcript = True
        elif isinstance(frame, TextFrame) and self._processing_llm_response:
            if self._accumulating_transcript:
                # Search the accumulated text rather than the single chunk, so a marker that
                # is split across two or more streamed text frames is still recognised.
                self._accumulator += frame.text
                split_index = self._accumulator.find(marker)
                if split_index < 0:
                    # do not push this frame
                    return
                reply_start = self._accumulator[split_index + len(marker) :]
                self._accumulator = self._accumulator[:split_index]
                self._accumulating_transcript = False
                # Reuse the frame to carry whatever followed the marker in this chunk.
                frame.text = reply_start
                await self.push_frame(frame)
                return
        elif isinstance(frame, LLMFullResponseEndFrame):
            await self.push_frame(MagicDemoTranscriptionFrame(text=self._accumulator.strip()))
            self.reset()

        await self.push_frame(frame, direction)
class TanscriptionContextFixup(FrameProcessor):
    """Rewrite the stored conversation so it holds transcript text instead of raw user audio.

    A `MagicDemoTranscriptionFrame` supplies the transcript. When the response ends or an
    interruption starts, the second-to-last context message is replaced by a text message if it
    is a user message ending in WAV audio, and the transcript is appended to the last message if
    it is a model message with text.

    Args:
        context: Context object whose `messages` list is edited in place.
    """

    def __init__(self, context):
        super().__init__()
        self._context = context
        # Empty until a real transcript arrives, so nothing is written into the context before
        # then (previously a debugging placeholder string could be written instead).
        self._transcript = ""

    def swap_user_audio(self):
        """Replace the second-to-last message with the transcript if it is user WAV audio."""
        if not self._transcript:
            return
        messages = self._context.messages
        if len(messages) < 2:
            return
        message = messages[-2]
        if not message.parts:
            return
        last_part = message.parts[-1]
        if (
            message.role == "user"
            and last_part.inline_data
            and last_part.inline_data.mime_type == "audio/wav"
        ):
            messages[-2] = glm.Content(role="user", parts=[glm.Part(text=self._transcript)])

    def add_transcript_back_to_inference_output(self):
        """Append the marker and the transcript to the last message if it is model text."""
        if not self._transcript:
            return
        messages = self._context.messages
        if not messages:
            return
        message = messages[-1]
        if not message.parts:
            return
        last_part = message.parts[-1]
        if message.role == "model" and last_part.text:
            # NOTE(review): this stores "<reply> <marker> <transcript>", whereas the system
            # prompt asks the model for "<transcript> <marker> <reply>" -- confirm the order.
            last_part.text += f"\n\n{marker}\n{self._transcript}\n"

    async def process_frame(self, frame, direction):
        """Record transcripts, fix up the context at end/interruption, and forward the frame."""
        await super().process_frame(frame, direction)

        if isinstance(frame, MagicDemoTranscriptionFrame):
            self._transcript = frame.text
        elif isinstance(frame, (LLMFullResponseEndFrame, StartInterruptionFrame)):
            self.swap_user_audio()
            self.add_transcript_back_to_inference_output()
            # Consume the transcript so it is applied at most once.
            self._transcript = ""

        await self.push_frame(frame, direction)
# Entry point: builds the transport, TTS and LLM services, wires them into one pipeline with the
# three custom processors defined above, and runs the pipeline task.
async def main():
async with aiohttp.ClientSession() as session:
# `configure` comes from the local `runner` module; its result is unpacked as (room_url, token).
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
"Respond bot",
DailyParams(
audio_out_enabled=True,
# No transcription at all. just audio input to Gemini!
# transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
vad_audio_passthrough=True,
),
)
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
llm = GoogleLLMService(
model="gemini-1.5-flash-latest",
# model="gemini-exp-1114",
api_key=os.getenv("GOOGLE_API_KEY"),
)
# Initial conversation: the system prompt plus a first user turn that triggers a greeting.
messages = [
{
"role": "system",
"content": system_message,
},
{
"role": "user",
"content": "Start by saying hello.",
},
]
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
# All three custom processors are given the same context object.
audio_collector = UserAudioCollector(context, context_aggregator.user())
pull_transcript_out_of_llm_output = TranscriptExtractor(context)
fixup_context_messages = TanscriptionContextFixup(context)
pipeline = Pipeline(
[
transport.input(), # Transport user input
audio_collector,
context_aggregator.user(), # User responses
llm, # LLM
pull_transcript_out_of_llm_output,
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
fixup_context_messages,
]
)
task = PipelineTask(
pipeline,
PipelineParams(
allow_interruptions=True,
enable_metrics=True,
enable_usage_metrics=True,
),
)
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
# NOTE(review): transcription is commented out in DailyParams above, yet transcription capture
# is still requested here -- confirm whether this call is needed.
await transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
await task.queue_frames([context_aggregator.user().get_context_frame()])
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,100 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import os
import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.openai import OpenAILLMService
from pipecat.services.rime import RimeHttpTTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
# Entry point: builds the transport, a Rime TTS service and an OpenAI LLM service, chains them
# into a pipeline, and runs the pipeline task.
async def main():
async with aiohttp.ClientSession() as session:
# `configure` comes from the local `runner` module; its result is unpacked as (room_url, token).
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
"Respond bot",
DailyParams(
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
)
tts = RimeHttpTTSService(
# Falls back to an empty string when RIME_API_KEY is not set.
api_key=os.getenv("RIME_API_KEY", ""),
voice_id="rex",
params=RimeHttpTTSService.InputParams(reduce_latency=True),
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(), # Transport user input
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(
pipeline,
PipelineParams(
allow_interruptions=True,
enable_metrics=True,
enable_usage_metrics=True,
report_only_initial_ttfb=True,
),
)
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
# The extra system message is appended to the same `messages` list the context was built from.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -64,7 +64,11 @@ async def main():
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
llm = GoogleLLMService(model="gemini-1.5-flash-latest", api_key=os.getenv("GOOGLE_API_KEY"))
llm = GoogleLLMService(
model="gemini-1.5-flash-latest",
# model="gemini-exp-1114",
api_key=os.getenv("GOOGLE_API_KEY"),
)
llm.register_function("get_weather", get_weather)
llm.register_function("get_image", get_image)
@@ -151,7 +155,6 @@ indicate you should use the get_image tool are:
allow_interruptions=True,
enable_metrics=True,
enable_usage_metrics=True,
report_only_initial_ttfb=True,
),
)

View File

@@ -0,0 +1,339 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import aiohttp
import asyncio
import os
import sys
import time
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame, TextFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.parallel_pipeline import ParallelPipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import (
OpenAILLMContext,
)
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.deepgram import DeepgramSTTService
from pipecat.services.openai import OpenAILLMService
from pipecat.sync.event_notifier import EventNotifier
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.processors.frame_processor import FrameProcessor, FrameDirection
from pipecat.frames.frames import (
CancelFrame,
EndFrame,
Frame,
StartFrame,
StartInterruptionFrame,
StopInterruptionFrame,
SystemFrame,
TranscriptionFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame,
)
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContextFrame
from pipecat.sync.base_notifier import BaseNotifier
from pipecat.processors.filters.function_filter import FunctionFilter
from pipecat.processors.user_idle_processor import UserIdleProcessor
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
classifier_statement = "Determine if the user's statement ends with a complete thought and you should respond. The user text is transcribed speech. It may contain multiple fragments concatentated together. You are trying to determine only the completeness of the last user statement. The previous assistant statement is provided only for context. Categorize the text as either complete with the user now expecting a response, or incomplete. Return 'YES' if text is likely complete and the user is expecting a response. Return 'NO' if the text seems to be a partial expression or unfinished thought."
class StatementJudgeContextFilter(FrameProcessor):
    """Turn a conversation context into a classification request for the statement-judge LLM.

    System frames are forwarded untouched. An `LLMMessagesFrame` is treated as an already
    complete statement: the notifier is triggered and the frame is dropped. For an
    `OpenAILLMContextFrame`, the text of the trailing run of user messages is joined and pushed
    as a new `LLMMessagesFrame` together with the classifier prompt. Every other frame is dropped.

    Args:
        notifier: Notifier triggered when an `LLMMessagesFrame` is received.
        **kwargs: Forwarded to the base class constructor.
    """

    def __init__(self, notifier: BaseNotifier, **kwargs):
        super().__init__(**kwargs)
        self._notifier = notifier

    @staticmethod
    def _trailing_user_text(messages):
        """Collect text from the run of user messages at the end of `messages`.

        Returns:
            A `(texts, last_assistant_message)` tuple. `texts` is in chronological order.
            `last_assistant_message` is the message immediately preceding the user run when its
            role is "assistant", otherwise None.
        """
        texts = []
        last_assistant_message = None
        for message in reversed(messages):
            if message["role"] != "user":
                if message["role"] == "assistant":
                    last_assistant_message = message
                break
            content = message["content"]
            if isinstance(content, str):
                message_texts = [content]
            elif isinstance(content, list):
                message_texts = [part["text"] for part in content if part["type"] == "text"]
            else:
                message_texts = []
            # We walk newest-to-oldest, so each message's texts go in front as one group. This
            # keeps both the message order and the order of parts inside a message intact
            # (mixing append/insert and reversing afterwards scrambled list-form content).
            texts[:0] = message_texts
        return texts, last_assistant_message

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Forward system frames, notify on message frames, classify context frames."""
        await super().process_frame(frame, direction)

        # We must not block system frames.
        if isinstance(frame, SystemFrame):
            await self.push_frame(frame, direction)
            return

        # Just treat an LLMMessagesFrame as complete, no matter what.
        if isinstance(frame, LLMMessagesFrame):
            await self._notifier.notify()
            return

        # Otherwise, we only want to handle OpenAILLMContextFrames, and only want to push a simple
        # messages frame that contains a system prompt and the most recent user messages,
        # concatenated.
        if isinstance(frame, OpenAILLMContextFrame):
            logger.debug(f"Context Frame: {frame}")
            user_text_messages, last_assistant_message = self._trailing_user_text(
                frame.context.messages
            )

            # If we have any user text content, push an LLMMessagesFrame
            if user_text_messages:
                logger.debug(f"User text messages: {user_text_messages}")
                user_message = " ".join(user_text_messages)
                logger.debug(f"User message: {user_message}")
                messages = [
                    {
                        "role": "system",
                        "content": classifier_statement,
                    }
                ]
                if last_assistant_message:
                    messages.append(last_assistant_message)
                messages.append({"role": "user", "content": user_message})
                await self.push_frame(LLMMessagesFrame(messages))
class CompletenessCheck(FrameProcessor):
    """React to the classifier's verdict text.

    A text frame whose text is exactly "YES" causes a `UserStoppedSpeakingFrame` to be pushed and
    the notifier to be triggered. A text frame whose text is exactly "NO" is only logged. Nothing
    else is pushed by this processor.

    Args:
        notifier: Notifier triggered on a "YES" verdict.
    """

    def __init__(self, notifier: BaseNotifier):
        super().__init__()
        self._notifier = notifier

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Inspect text frames for a YES/NO verdict; all incoming frames are consumed here."""
        await super().process_frame(frame, direction)

        # Only text frames can carry a verdict.
        if not isinstance(frame, TextFrame):
            return

        verdict = frame.text
        if verdict == "YES":
            logger.debug("Completeness check YES")
            await self.push_frame(UserStoppedSpeakingFrame())
            await self._notifier.notify()
        elif verdict == "NO":
            logger.debug("Completeness check NO")
class OutputGate(FrameProcessor):
    """Hold back downstream frames until the notifier fires.

    While the gate is closed, non-system downstream frames are buffered. A background task waits
    on the notifier; each time it fires the gate is opened and the buffer is flushed. A
    `StartInterruptionFrame` empties the buffer and closes the gate again.

    Args:
        notifier: Notifier the background task waits on.
        **kwargs: Forwarded to the base class constructor.
    """

    def __init__(self, notifier: BaseNotifier, **kwargs):
        super().__init__(**kwargs)
        self._gate_open = False
        self._frames_buffer = []
        self._notifier = notifier
        # Created by _start(); None until then so _stop() is safe to call at any time.
        self._gate_task = None

    def close_gate(self):
        """Start buffering downstream frames."""
        self._gate_open = False

    def open_gate(self):
        """Let downstream frames pass straight through."""
        self._gate_open = True

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Forward, buffer, or act on `frame` depending on its type and the gate state."""
        await super().process_frame(frame, direction)

        # We must not block system frames.
        if isinstance(frame, SystemFrame):
            if isinstance(frame, StartFrame):
                await self._start()
            # NOTE(review): this check is only reached for frames that are SystemFrame
            # instances -- confirm that EndFrame is one, otherwise it is buffered below.
            if isinstance(frame, (EndFrame, CancelFrame)):
                await self._stop()
            if isinstance(frame, StartInterruptionFrame):
                self._frames_buffer = []
                self.close_gate()
            await self.push_frame(frame, direction)
            return

        # Ignore frames that are not following the direction of this gate.
        if direction != FrameDirection.DOWNSTREAM:
            await self.push_frame(frame, direction)
            return

        if self._gate_open:
            await self.push_frame(frame, direction)
            return

        self._frames_buffer.append((frame, direction))

    async def _start(self):
        """Clear the buffer and launch the task that waits on the notifier."""
        self._frames_buffer = []
        self._gate_task = self.get_event_loop().create_task(self._gate_task_handler())

    async def _stop(self):
        """Cancel the notifier task and wait for it; a no-op if it was never started."""
        if self._gate_task is None:
            return
        # Detach first so a second stop request does not touch the finished task again.
        gate_task, self._gate_task = self._gate_task, None
        gate_task.cancel()
        await gate_task

    async def _gate_task_handler(self):
        """Open the gate and flush buffered frames each time the notifier fires."""
        while True:
            try:
                await self._notifier.wait()
                self.open_gate()
                for frame, direction in self._frames_buffer:
                    await self.push_frame(frame, direction)
                self._frames_buffer = []
            except asyncio.CancelledError:
                break
# Entry point: builds the transport, STT, TTS and two LLM services, wires them into a pipeline
# with three parallel branches synchronised through one notifier, and runs the pipeline task.
async def main():
async with aiohttp.ClientSession() as session:
# `configure` comes from the local `runner` module; only the room URL of its result is used.
(room_url, _) = await configure(session)
transport = DailyTransport(
room_url,
None,
"Respond bot",
DailyParams(
audio_out_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
vad_audio_passthrough=True,
),
)
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
# This is the LLM that will be used to detect if the user has finished a
# statement. This doesn't really need to be an LLM, we could use NLP
# libraries for that, but we have the machinery to use an LLM, so we might as well!
statement_llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
# This is the regular LLM.
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
# We have instructed the LLM to return 'YES' if it thinks the user
# completed a sentence. So, if it's 'YES' we will return true in this
# predicate which will wake up the notifier.
# NOTE(review): wake_check_filter is defined here but not referenced anywhere else in this
# function -- confirm whether it can be removed.
async def wake_check_filter(frame):
logger.debug(f"Completeness check frame: {frame}")
return frame.text == "YES"
# This is a notifier that we use to synchronize the two LLMs.
notifier = EventNotifier()
# This turns the LLM context into an inference request to classify the user's speech
# as complete or incomplete.
statement_judge_context_filter = StatementJudgeContextFilter(notifier=notifier)
# This sends a UserStoppedSpeakingFrame and triggers the notifier event
completeness_check = CompletenessCheck(notifier=notifier)
# Notify if the user hasn't said anything.
async def user_idle_notifier(frame):
await notifier.notify()
# Sometimes the LLM will fail detecting if a user has completed a
# sentence, this will wake up the notifier if that happens.
user_idle = UserIdleProcessor(callback=user_idle_notifier, timeout=5.0)
bot_output_gate = OutputGate(notifier=notifier)
# Filter predicate: true for every frame except UserStoppedSpeakingFrame.
async def block_user_stopped_speaking(frame):
return not isinstance(frame, UserStoppedSpeakingFrame)
# Filter predicate: true only for context frames, messages frames and the two interruption frames.
async def pass_only_llm_trigger_frames(frame):
return (
isinstance(frame, OpenAILLMContextFrame)
or isinstance(frame, LLMMessagesFrame)
or isinstance(frame, StartInterruptionFrame)
or isinstance(frame, StopInterruptionFrame)
)
pipeline = Pipeline(
[
transport.input(),
stt,
context_aggregator.user(),
ParallelPipeline(
[
# Pass everything except UserStoppedSpeaking to the elements after
# this ParallelPipeline
FunctionFilter(filter=block_user_stopped_speaking),
],
[
# Ignore everything except an OpenAILLMContextFrame. Pass a specially constructed
# LLMMessagesFrame to the statement classifier LLM. The only frame this
# sub-pipeline will output is a UserStoppedSpeakingFrame.
statement_judge_context_filter,
statement_llm,
completeness_check,
],
[
# Block everything except OpenAILLMContextFrame, LLMMessagesFrame,
# StartInterruptionFrame and StopInterruptionFrame
FunctionFilter(filter=pass_only_llm_trigger_frames),
llm,
bot_output_gate, # Buffer all llm/tts output until notified.
],
),
tts,
user_idle,
transport.output(),
context_aggregator.assistant(),
]
)
task = PipelineTask(
pipeline,
PipelineParams(
allow_interruptions=True,
enable_metrics=True,
enable_usage_metrics=True,
report_only_initial_ttfb=True,
),
)
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
# The extra system message is appended to the same `messages` list the context was built from.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])
@transport.event_handler("on_app_message")
async def on_app_message(transport, message, sender):
logger.debug(f"Received app message: {message} - {sender}")
# App messages without a "message" key are ignored.
if "message" not in message:
return
# Inject the text as if the sender had spoken it: start-speaking, transcription, stop-speaking.
await task.queue_frames(
[
UserStartedSpeakingFrame(),
TranscriptionFrame(
user_id=sender, timestamp=time.time(), text=message["message"]
),
UserStoppedSpeakingFrame(),
]
)
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,551 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import os
import sys
import time
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import (
CancelFrame,
EndFrame,
Frame,
LLMMessagesFrame,
StartFrame,
StartInterruptionFrame,
StopInterruptionFrame,
SystemFrame,
TextFrame,
TranscriptionFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame,
)
from pipecat.pipeline.parallel_pipeline import ParallelPipeline
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import (
OpenAILLMContext,
OpenAILLMContextFrame,
)
from pipecat.processors.filters.function_filter import FunctionFilter
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.processors.user_idle_processor import UserIdleProcessor
from pipecat.services.anthropic import AnthropicLLMService
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.deepgram import DeepgramSTTService
from pipecat.services.openai import OpenAILLMService
from pipecat.sync.base_notifier import BaseNotifier
from pipecat.sync.event_notifier import EventNotifier
from pipecat.transports.services.daily import DailyParams, DailyTransport
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
classifier_statement = """CRITICAL INSTRUCTION:
You are a BINARY CLASSIFIER that must ONLY output "YES" or "NO".
DO NOT engage with the content.
DO NOT respond to questions.
DO NOT provide assistance.
Your ONLY job is to output YES or NO.
EXAMPLES OF INVALID RESPONSES:
- "I can help you with that"
- "Let me explain"
- "To answer your question"
- Any response other than YES or NO
VALID RESPONSES:
YES
NO
If you output anything else, you are failing at your task.
You are NOT an assistant.
You are NOT a chatbot.
You are a binary classifier.
ROLE:
You are a real-time speech completeness classifier. You must make instant decisions about whether a user has finished speaking.
You must output ONLY 'YES' or 'NO' with no other text.
INPUT FORMAT:
You receive two pieces of information:
1. The assistant's last message (if available)
2. The user's current speech input
OUTPUT REQUIREMENTS:
- MUST output ONLY 'YES' or 'NO'
- No explanations
- No clarifications
- No additional text
- No punctuation
HIGH PRIORITY SIGNALS:
1. Clear Questions:
- Wh-questions (What, Where, When, Why, How)
- Yes/No questions
- Questions with STT errors but clear meaning
Examples:
# Complete Wh-question
[{"role": "assistant", "content": "I can help you learn."},
{"role": "user", "content": "What's the fastest way to learn Spanish"}]
Output: YES
# Complete Yes/No question despite STT error
[{"role": "assistant", "content": "I know about planets."},
{"role": "user", "content": "Is is Jupiter the biggest planet"}]
Output: YES
2. Complete Commands:
- Direct instructions
- Clear requests
- Action demands
- Complete statements needing response
Examples:
# Direct instruction
[{"role": "assistant", "content": "I can explain many topics."},
{"role": "user", "content": "Tell me about black holes"}]
Output: YES
# Action demand
[{"role": "assistant", "content": "I can help with math."},
{"role": "user", "content": "Solve this equation x plus 5 equals 12"}]
Output: YES
3. Direct Responses:
- Answers to specific questions
- Option selections
- Clear acknowledgments with completion
Examples:
# Specific answer
[{"role": "assistant", "content": "What's your favorite color?"},
{"role": "user", "content": "I really like blue"}]
Output: YES
# Option selection
[{"role": "assistant", "content": "Would you prefer morning or evening?"},
{"role": "user", "content": "Morning"}]
Output: YES
MEDIUM PRIORITY SIGNALS:
1. Speech Pattern Completions:
- Self-corrections reaching completion
- False starts with clear ending
- Topic changes with complete thought
- Mid-sentence completions
Examples:
# Self-correction reaching completion
[{"role": "assistant", "content": "What would you like to know?"},
{"role": "user", "content": "Tell me about... no wait, explain how rainbows form"}]
Output: YES
# Topic change with complete thought
[{"role": "assistant", "content": "The weather is nice today."},
{"role": "user", "content": "Actually can you tell me who invented the telephone"}]
Output: YES
# Mid-sentence completion
[{"role": "assistant", "content": "Hello I'm ready."},
{"role": "user", "content": "What's the capital of? France"}]
Output: YES
2. Context-Dependent Brief Responses:
- Acknowledgments (okay, sure, alright)
- Agreements (yes, yeah)
- Disagreements (no, nah)
- Confirmations (correct, exactly)
Examples:
# Acknowledgment
[{"role": "assistant", "content": "Should we talk about history?"},
{"role": "user", "content": "Sure"}]
Output: YES
# Disagreement with completion
[{"role": "assistant", "content": "Is that what you meant?"},
{"role": "user", "content": "No not really"}]
Output: YES
LOW PRIORITY SIGNALS:
1. STT Artifacts (Consider but don't over-weight):
- Repeated words
- Unusual punctuation
- Capitalization errors
- Word insertions/deletions
Examples:
# Word repetition but complete
[{"role": "assistant", "content": "I can help with that."},
{"role": "user", "content": "What what is the time right now"}]
Output: YES
# Missing punctuation but complete
[{"role": "assistant", "content": "I can explain that."},
{"role": "user", "content": "Please tell me how computers work"}]
Output: YES
2. Speech Features:
- Filler words (um, uh, like)
- Thinking pauses
- Word repetitions
- Brief hesitations
Examples:
# Filler words but complete
[{"role": "assistant", "content": "What would you like to know?"},
{"role": "user", "content": "Um uh how do airplanes fly"}]
Output: YES
# Thinking pause but incomplete
[{"role": "assistant", "content": "I can explain anything."},
{"role": "user", "content": "Well um I want to know about the"}]
Output: NO
DECISION RULES:
1. Return YES if:
- ANY high priority signal shows clear completion
- Medium priority signals combine to show completion
- Meaning is clear despite low priority artifacts
2. Return NO if:
- No high priority signals present
- Thought clearly trails off
- Multiple incomplete indicators
- User appears mid-formulation
3. When uncertain:
- If you can understand the intent → YES
- If meaning is unclear → NO
- Always make a binary decision
- Never request clarification
Examples:
# Incomplete despite corrections
[{"role": "assistant", "content": "What would you like to know about?"},
{"role": "user", "content": "Can you tell me about"}]
Output: NO
# Complete despite multiple artifacts
[{"role": "assistant", "content": "I can help you learn."},
{"role": "user", "content": "How do you I mean what's the best way to learn programming"}]
Output: YES
# Trailing off incomplete
[{"role": "assistant", "content": "I can explain anything."},
{"role": "user", "content": "I was wondering if you could tell me why"}]
Output: NO
"""
conversational_system_message = """You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.
Please be very concise in your responses. Unless you are explicitly asked to do otherwise, give me the shortest complete answer possible without unnecessary elaboration. Generally you should answer with a single sentence.
"""
class StatementJudgeContextFilter(FrameProcessor):
    """Turn a conversation context into a classification request for the statement-judge LLM.

    System frames are forwarded untouched. An `LLMMessagesFrame` is treated as an already
    complete statement: the notifier is triggered and the frame is dropped. For an
    `OpenAILLMContextFrame`, the text of the trailing run of user messages is joined and pushed
    as a new `LLMMessagesFrame` together with the classifier prompt. Every other frame is dropped.

    Args:
        notifier: Notifier triggered when an `LLMMessagesFrame` is received.
        **kwargs: Forwarded to the base class constructor.
    """

    def __init__(self, notifier: BaseNotifier, **kwargs):
        super().__init__(**kwargs)
        self._notifier = notifier

    @staticmethod
    def _trailing_user_text(messages):
        """Collect text from the run of user messages at the end of `messages`.

        Returns:
            A `(texts, last_assistant_message)` tuple. `texts` is in chronological order.
            `last_assistant_message` is the message immediately preceding the user run when its
            role is "assistant", otherwise None.
        """
        texts = []
        last_assistant_message = None
        for message in reversed(messages):
            if message["role"] != "user":
                if message["role"] == "assistant":
                    last_assistant_message = message
                break
            content = message["content"]
            if isinstance(content, str):
                message_texts = [content]
            elif isinstance(content, list):
                message_texts = [part["text"] for part in content if part["type"] == "text"]
            else:
                message_texts = []
            # We walk newest-to-oldest, so each message's texts go in front as one group. This
            # keeps both the message order and the order of parts inside a message intact
            # (mixing append/insert and reversing afterwards scrambled list-form content).
            texts[:0] = message_texts
        return texts, last_assistant_message

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Forward system frames, notify on message frames, classify context frames."""
        await super().process_frame(frame, direction)

        # We must not block system frames.
        if isinstance(frame, SystemFrame):
            await self.push_frame(frame, direction)
            return

        # Just treat an LLMMessagesFrame as complete, no matter what.
        if isinstance(frame, LLMMessagesFrame):
            await self._notifier.notify()
            return

        # Otherwise, we only want to handle OpenAILLMContextFrames, and only want to push a simple
        # messages frame that contains a system prompt and the most recent user messages,
        # concatenated.
        if isinstance(frame, OpenAILLMContextFrame):
            user_text_messages, last_assistant_message = self._trailing_user_text(
                frame.context.messages
            )

            # If we have any user text content, push an LLMMessagesFrame
            if user_text_messages:
                user_message = " ".join(user_text_messages)
                logger.debug(f"!!! {user_message}")
                messages = [
                    {
                        "role": "system",
                        "content": classifier_statement,
                    }
                ]
                if last_assistant_message:
                    messages.append(last_assistant_message)
                messages.append({"role": "user", "content": user_message})
                await self.push_frame(LLMMessagesFrame(messages))
class CompletenessCheck(FrameProcessor):
    """React to the classifier's verdict text.

    A text frame whose text is exactly "YES" causes a `UserStoppedSpeakingFrame` to be pushed and
    the notifier to be triggered. A text frame whose text is exactly "NO" is only logged. Nothing
    else is pushed by this processor.

    Args:
        notifier: Notifier triggered on a "YES" verdict.
    """

    def __init__(self, notifier: BaseNotifier):
        super().__init__()
        self._notifier = notifier

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Inspect text frames for a YES/NO verdict; all incoming frames are consumed here."""
        await super().process_frame(frame, direction)

        # Only text frames can carry a verdict.
        if not isinstance(frame, TextFrame):
            return

        verdict = frame.text
        if verdict == "YES":
            logger.debug("!!! Completeness check YES")
            await self.push_frame(UserStoppedSpeakingFrame())
            await self._notifier.notify()
        elif verdict == "NO":
            logger.debug("!!! Completeness check NO")
class OutputGate(FrameProcessor):
    """Hold back downstream frames until the notifier fires.

    While the gate is closed, non-system downstream frames are buffered. A background task waits
    on the notifier; each time it fires the gate is opened and the buffer is flushed. A
    `StartInterruptionFrame` empties the buffer and closes the gate again.

    Args:
        notifier: Notifier the background task waits on.
        **kwargs: Forwarded to the base class constructor.
    """

    def __init__(self, notifier: BaseNotifier, **kwargs):
        super().__init__(**kwargs)
        self._gate_open = False
        self._frames_buffer = []
        self._notifier = notifier
        # Created by _start(); None until then so _stop() is safe to call at any time.
        self._gate_task = None

    def close_gate(self):
        """Start buffering downstream frames."""
        self._gate_open = False

    def open_gate(self):
        """Let downstream frames pass straight through."""
        self._gate_open = True

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Forward, buffer, or act on `frame` depending on its type and the gate state."""
        await super().process_frame(frame, direction)

        # We must not block system frames.
        if isinstance(frame, SystemFrame):
            if isinstance(frame, StartFrame):
                await self._start()
            # NOTE(review): this check is only reached for frames that are SystemFrame
            # instances -- confirm that EndFrame is one, otherwise it is buffered below.
            if isinstance(frame, (EndFrame, CancelFrame)):
                await self._stop()
            if isinstance(frame, StartInterruptionFrame):
                self._frames_buffer = []
                self.close_gate()
            await self.push_frame(frame, direction)
            return

        # Ignore frames that are not following the direction of this gate.
        if direction != FrameDirection.DOWNSTREAM:
            await self.push_frame(frame, direction)
            return

        if self._gate_open:
            await self.push_frame(frame, direction)
            return

        self._frames_buffer.append((frame, direction))

    async def _start(self):
        """Clear the buffer and launch the task that waits on the notifier."""
        self._frames_buffer = []
        self._gate_task = self.get_event_loop().create_task(self._gate_task_handler())

    async def _stop(self):
        """Cancel the notifier task and wait for it; a no-op if it was never started."""
        if self._gate_task is None:
            return
        # Detach first so a second stop request does not touch the finished task again.
        gate_task, self._gate_task = self._gate_task, None
        gate_task.cancel()
        await gate_task

    async def _gate_task_handler(self):
        """Open the gate and flush buffered frames each time the notifier fires."""
        while True:
            try:
                await self._notifier.wait()
                self.open_gate()
                for frame, direction in self._frames_buffer:
                    await self.push_frame(frame, direction)
                self._frames_buffer = []
            except asyncio.CancelledError:
                break
async def main():
    """Run the two-LLM "natural conversation" example bot in a Daily room.

    The pipeline fans out into three parallel branches after the user context
    aggregator:

    1. a pass-through branch that forwards everything except
       ``UserStoppedSpeakingFrame``;
    2. a classifier branch (``statement_llm``) that decides whether the user
       has finished speaking and, if so, emits ``UserStoppedSpeakingFrame``
       and fires the notifier;
    3. the conversational branch (``llm``) whose output is buffered by
       ``OutputGate`` until the notifier fires.
    """
    async with aiohttp.ClientSession() as session:
        (room_url, _) = await configure(session)

        transport = DailyTransport(
            room_url,
            None,
            "Respond bot",
            DailyParams(
                audio_out_enabled=True,
                vad_enabled=True,
                vad_analyzer=SileroVADAnalyzer(),
                vad_audio_passthrough=True,
            ),
        )

        stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))

        tts = CartesiaTTSService(
            api_key=os.getenv("CARTESIA_API_KEY"),
            voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22",  # British Lady
        )

        # This is the LLM that will be used to detect if the user has finished a
        # statement. This doesn't really need to be an LLM, we could use NLP
        # libraries for that, but we have the machinery to use an LLM, so we might as well!
        statement_llm = AnthropicLLMService(
            api_key=os.getenv("ANTHROPIC_API_KEY"),
            model="claude-3-5-sonnet-20241022",
        )

        # This is the regular LLM.
        llm = OpenAILLMService(
            api_key=os.getenv("OPENAI_API_KEY"),
            model="gpt-4o",
        )

        messages = [
            {
                "role": "system",
                "content": conversational_system_message,
            },
        ]

        context = OpenAILLMContext(messages)
        context_aggregator = llm.create_context_aggregator(context)

        # This is a notifier that we use to synchronize the two LLMs.
        notifier = EventNotifier()

        # This turns the LLM context into an inference request to classify the user's speech
        # as complete or incomplete.
        statement_judge_context_filter = StatementJudgeContextFilter(notifier=notifier)

        # This sends a UserStoppedSpeakingFrame and triggers the notifier event
        completeness_check = CompletenessCheck(notifier=notifier)

        # Notify if the user hasn't said anything.
        async def user_idle_notifier(frame):
            await notifier.notify()

        # Sometimes the LLM will fail detecting if a user has completed a
        # sentence, this will wake up the notifier if that happens.
        user_idle = UserIdleProcessor(callback=user_idle_notifier, timeout=5.0)

        bot_output_gate = OutputGate(notifier=notifier)

        async def block_user_stopped_speaking(frame):
            return not isinstance(frame, UserStoppedSpeakingFrame)

        async def pass_only_llm_trigger_frames(frame):
            return (
                isinstance(frame, OpenAILLMContextFrame)
                or isinstance(frame, LLMMessagesFrame)
                or isinstance(frame, StartInterruptionFrame)
                or isinstance(frame, StopInterruptionFrame)
            )

        pipeline = Pipeline(
            [
                transport.input(),
                stt,
                context_aggregator.user(),
                ParallelPipeline(
                    [
                        # Pass everything except UserStoppedSpeaking to the elements after
                        # this ParallelPipeline
                        FunctionFilter(filter=block_user_stopped_speaking),
                    ],
                    [
                        # Ignore everything except an OpenAILLMContextFrame. Pass a specially constructed
                        # LLMMessagesFrame to the statement classifier LLM. The only frame this
                        # sub-pipeline will output is a UserStoppedSpeakingFrame.
                        statement_judge_context_filter,
                        statement_llm,
                        completeness_check,
                    ],
                    [
                        # Block everything except the frames that trigger or interrupt
                        # an LLM run: OpenAILLMContextFrame, LLMMessagesFrame and the
                        # start/stop interruption frames.
                        FunctionFilter(filter=pass_only_llm_trigger_frames),
                        llm,
                        bot_output_gate,  # Buffer all llm/tts output until notified.
                    ],
                ),
                tts,
                user_idle,
                transport.output(),
                context_aggregator.assistant(),
            ]
        )

        task = PipelineTask(
            pipeline,
            PipelineParams(
                allow_interruptions=True,
                enable_metrics=True,
                enable_usage_metrics=True,
            ),
        )

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            await transport.capture_participant_transcription(participant["id"])
            # Kick off the conversation.
            messages.append(
                {
                    "role": "user",
                    "content": "Start by just saying \"Hello I'm ready.\" Don't say anything else.",
                }
            )
            await task.queue_frames([LLMMessagesFrame(messages)])

        @transport.event_handler("on_app_message")
        async def on_app_message(transport, message, sender):
            logger.debug(f"Received app message: {message} - {sender}")
            if "message" not in message:
                return

            # Inject the text message as if it were a complete spoken user turn.
            await task.queue_frames(
                [
                    UserStartedSpeakingFrame(),
                    TranscriptionFrame(
                        user_id=sender, timestamp=time.time(), text=message["message"]
                    ),
                    UserStoppedSpeakingFrame(),
                ]
            )

        runner = PipelineRunner()

        await runner.run(task)


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,355 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import aiohttp
import asyncio
import os
import sys
import time
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame, TextFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.parallel_pipeline import ParallelPipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.services.deepgram import DeepgramSTTService
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import (
OpenAILLMContext,
)
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.google import GoogleLLMService, GoogleLLMContext
from pipecat.sync.event_notifier import EventNotifier
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.processors.frame_processor import FrameProcessor, FrameDirection
from pipecat.frames.frames import (
CancelFrame,
EndFrame,
Frame,
InputAudioRawFrame,
StartFrame,
StartInterruptionFrame,
StopInterruptionFrame,
SystemFrame,
TranscriptionFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame,
)
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContextFrame
from pipecat.sync.base_notifier import BaseNotifier
from pipecat.processors.filters.function_filter import FunctionFilter
from pipecat.processors.user_idle_processor import UserIdleProcessor
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
# System prompt for the speech-completeness classifier. It is installed as the
# system message of the context that StatementJudgeAudioContextAccumulator
# sends to the classifier LLM; the model is asked to answer 'YES' or 'NO'.
classifier_statement = """You are an audio language classifier model. You are receiving audio from a user in a WebRTC call. Your job is to decide whether the user has finished speaking or not.
Categorize the input you receive as either:
1. a complete thought, statement, or question, or
2. an incomplete thought, statement, or question
Output 'YES' if the input is likely to be a completed thought, statement, or question.
Output 'NO' if the input indicates that the user is still speaking and does not yet expect a response yet.
If you are unsure, output 'YES'.
"""
# System prompt for the conversational LLM; main() uses it as the first
# message of the conversation context.
conversational_system_message = """You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.
Please be very concise in your responses. Unless you are explicitly asked to do otherwise, give me the shortest complete answer possible without unnecessary elaboration. Generally you should answer with a single sentence.
"""
class StatementJudgeAudioContextAccumulator(FrameProcessor):
    """Collect user audio and turn it into a request for the classifier LLM.

    While the user is not speaking, only the most recent ``_start_secs`` of
    audio is retained so that the beginning of the next utterance (which the
    VAD reports late) is not lost. While the user is speaking every audio
    frame is kept. On ``UserStoppedSpeakingFrame`` the accumulated audio is
    wrapped, together with ``classifier_statement`` as the system message, in
    an ``OpenAILLMContextFrame`` and pushed on.

    Incoming ``OpenAILLMContextFrame`` and ``TranscriptionFrame`` objects are
    swallowed; every other frame is forwarded unchanged.

    Args:
        notifier: Shared notifier; stored but not used by this class itself.
        **kwargs: Forwarded to ``FrameProcessor``.
    """

    def __init__(self, *, notifier: BaseNotifier, **kwargs):
        super().__init__(**kwargs)
        self._notifier = notifier
        self._audio_frames = []
        self._start_secs = 0.2  # this should match VAD start_secs (hardcoding for now)
        self._user_speaking = False

    async def reset(self):
        """Drop all accumulated audio and mark the user as not speaking."""
        self._audio_frames = []
        self._user_speaking = False

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Track speaking state, accumulate audio, and emit classifier requests."""
        await super().process_frame(frame, direction)

        # ignore context frame
        if isinstance(frame, OpenAILLMContextFrame):
            return

        if isinstance(frame, TranscriptionFrame):
            # We could gracefully handle both audio input and text/transcription input ...
            # but let's leave that as an exercise to the reader. :-)
            return

        if isinstance(frame, UserStartedSpeakingFrame):
            self._user_speaking = True
        elif isinstance(frame, UserStoppedSpeakingFrame):
            self._user_speaking = False
            context = GoogleLLMContext()
            context.set_messages([{"role": "system", "content": classifier_statement}])
            context.add_audio_frames_message(audio_frames=self._audio_frames)
            await self.push_frame(OpenAILLMContextFrame(context=context))
        elif isinstance(frame, InputAudioRawFrame):
            if self._user_speaking:
                self._audio_frames.append(frame)
            else:
                # Append the audio frame to our buffer. Treat the buffer as a ring buffer, dropping the oldest
                # frames as necessary. Assume all audio frames have the same duration.
                self._audio_frames.append(frame)
                # Duration in seconds of 16-bit PCM audio: bytes / (2 bytes per
                # sample * channels) gives samples per channel, divided by the
                # sample rate.
                frame_duration = len(frame.audio) / (2 * frame.num_channels) / frame.sample_rate
                buffer_duration = frame_duration * len(self._audio_frames)
                while buffer_duration > self._start_secs:
                    self._audio_frames.pop(0)
                    buffer_duration -= frame_duration

        await self.push_frame(frame, direction)
class CompletenessCheck(FrameProcessor):
    """React to the classifier LLM's verdict on whether the user finished speaking.

    A ``TextFrame`` whose text starts with ``"YES"`` causes a
    ``UserStoppedSpeakingFrame`` to be pushed, the audio accumulator to be
    reset and the notifier to fire. Any other non-blank text is logged as a
    "NO". No incoming frame is forwarded by this processor.
    """

    def __init__(
        self, notifier: BaseNotifier, audio_accumulator: StatementJudgeAudioContextAccumulator
    ):
        super().__init__()
        self._notifier = notifier
        self._audio_accumulator = audio_accumulator

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Inspect one frame and act on it if it carries a classifier verdict."""
        await super().process_frame(frame, direction)

        # Verdicts only ever arrive as text.
        if not isinstance(frame, TextFrame):
            return

        if frame.text.startswith("YES"):
            logger.debug("Completeness check YES")
            await self.push_frame(UserStoppedSpeakingFrame())
            await self._audio_accumulator.reset()
            await self._notifier.notify()
            return

        # Whitespace-only text is ignored without logging.
        if frame.text.strip():
            logger.debug(f"Completeness check NO - '{frame.text}'")
class OutputGate(FrameProcessor):
    """Buffer downstream frames until the notifier fires, then release them.

    Non-system downstream frames are queued while the gate is closed. A
    background task waits on the notifier and, when it fires, opens the gate
    and pushes the queued frames in order. A ``StartInterruptionFrame``
    clears the queue and closes the gate.

    Args:
        notifier: Notifier whose ``wait()`` signals that output may flow.
        **kwargs: Forwarded to ``FrameProcessor``.
    """

    def __init__(self, notifier: BaseNotifier, **kwargs):
        super().__init__(**kwargs)
        self._gate_open = False
        self._frames_buffer = []
        self._notifier = notifier
        # Only _start() creates the task; None lets _stop() cope with an
        # EndFrame/CancelFrame that arrives before any StartFrame.
        self._gate_task = None

    def close_gate(self):
        """Resume buffering of downstream frames."""
        self._gate_open = False

    def open_gate(self):
        """Allow downstream frames to pass straight through."""
        self._gate_open = True

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Forward, buffer, or act on ``frame`` depending on its type and the gate state."""
        await super().process_frame(frame, direction)

        # We must not block system frames.
        if isinstance(frame, SystemFrame):
            if isinstance(frame, StartFrame):
                await self._start()
            if isinstance(frame, (EndFrame, CancelFrame)):
                await self._stop()
            if isinstance(frame, StartInterruptionFrame):
                # Pending output is stale once the user interrupts.
                self._frames_buffer = []
                self.close_gate()
            await self.push_frame(frame, direction)
            return

        # Ignore frames that are not following the direction of this gate.
        if direction != FrameDirection.DOWNSTREAM:
            await self.push_frame(frame, direction)
            return

        if self._gate_open:
            await self.push_frame(frame, direction)
            return

        self._frames_buffer.append((frame, direction))

    async def _start(self):
        """Reset the buffer and launch the task that waits on the notifier."""
        self._frames_buffer = []
        self._gate_task = self.get_event_loop().create_task(self._gate_task_handler())

    async def _stop(self):
        """Cancel the notifier-wait task, if one was ever started."""
        if self._gate_task is None:
            return
        self._gate_task.cancel()
        await self._gate_task
        self._gate_task = None

    async def _gate_task_handler(self):
        """Open the gate and flush the buffer each time the notifier fires."""
        while True:
            try:
                await self._notifier.wait()
                self.open_gate()
                for frame, direction in self._frames_buffer:
                    await self.push_frame(frame, direction)
                self._frames_buffer = []
            except asyncio.CancelledError:
                break
async def main():
    """Run the audio-based "natural conversation" example bot in a Daily room.

    Transport input fans out into three parallel branches:

    1. a pass-through branch that forwards everything except
       ``UserStoppedSpeakingFrame``;
    2. a classifier branch that accumulates the user's audio, asks
       ``statement_llm`` whether the user has finished speaking and, if so,
       emits ``UserStoppedSpeakingFrame`` and fires the notifier;
    3. the conversational branch (STT, user context aggregation, ``llm``)
       whose output is buffered by ``OutputGate`` until the notifier fires.
    """
    async with aiohttp.ClientSession() as session:
        (room_url, _) = await configure(session)

        transport = DailyTransport(
            room_url,
            None,
            "Respond bot",
            DailyParams(
                audio_out_enabled=True,
                vad_enabled=True,
                vad_analyzer=SileroVADAnalyzer(),
                vad_audio_passthrough=True,
            ),
        )

        stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))

        tts = CartesiaTTSService(
            api_key=os.getenv("CARTESIA_API_KEY"),
            voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22",  # British Lady
        )

        # This is the LLM that will be used to detect if the user has finished a
        # statement. This doesn't really need to be an LLM, we could use NLP
        # libraries for that, but we have the machinery to use an LLM, so we might as well!
        statement_llm = GoogleLLMService(
            model="gemini-1.5-flash-latest", api_key=os.getenv("GOOGLE_API_KEY")
        )

        # This is the regular LLM.
        llm = GoogleLLMService(model="gemini-1.5-flash-latest", api_key=os.getenv("GOOGLE_API_KEY"))

        messages = [
            {
                "role": "system",
                "content": conversational_system_message,
            },
        ]

        context = OpenAILLMContext(messages)
        context_aggregator = llm.create_context_aggregator(context)

        # This is a notifier that we use to synchronize the two LLMs.
        notifier = EventNotifier()

        # This accumulates the user's audio and, when the user stops speaking,
        # turns it into an inference request to classify the speech as complete
        # or incomplete.
        statement_judge_context_filter = StatementJudgeAudioContextAccumulator(notifier=notifier)

        # This sends a UserStoppedSpeakingFrame and triggers the notifier event
        completeness_check = CompletenessCheck(
            notifier=notifier, audio_accumulator=statement_judge_context_filter
        )

        # Notify if the user hasn't said anything.
        async def user_idle_notifier(frame):
            await notifier.notify()

        # Sometimes the LLM will fail detecting if a user has completed a
        # sentence, this will wake up the notifier if that happens.
        user_idle = UserIdleProcessor(callback=user_idle_notifier, timeout=5.0)

        bot_output_gate = OutputGate(notifier=notifier)

        async def block_user_stopped_speaking(frame):
            return not isinstance(frame, UserStoppedSpeakingFrame)

        async def pass_only_llm_trigger_frames(frame):
            return (
                isinstance(frame, OpenAILLMContextFrame)
                or isinstance(frame, LLMMessagesFrame)
                or isinstance(frame, StartInterruptionFrame)
                or isinstance(frame, StopInterruptionFrame)
            )

        pipeline = Pipeline(
            [
                transport.input(),
                ParallelPipeline(
                    [
                        # Pass everything except UserStoppedSpeaking to the elements after
                        # this ParallelPipeline
                        FunctionFilter(filter=block_user_stopped_speaking),
                    ],
                    [
                        statement_judge_context_filter,
                        statement_llm,
                        completeness_check,
                    ],
                    [
                        stt,
                        context_aggregator.user(),
                        # Block everything except the frames that trigger or interrupt
                        # an LLM run: OpenAILLMContextFrame, LLMMessagesFrame and the
                        # start/stop interruption frames.
                        FunctionFilter(filter=pass_only_llm_trigger_frames),
                        llm,
                        bot_output_gate,  # Buffer all llm/tts output until notified.
                    ],
                ),
                tts,
                user_idle,
                transport.output(),
                context_aggregator.assistant(),
            ]
        )

        task = PipelineTask(
            pipeline,
            PipelineParams(
                allow_interruptions=True,
                enable_metrics=True,
                enable_usage_metrics=True,
            ),
        )

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            await transport.capture_participant_transcription(participant["id"])
            # Kick off the conversation.
            await task.queue_frames([context_aggregator.user().get_context_frame()])

        @transport.event_handler("on_app_message")
        async def on_app_message(transport, message, sender):
            logger.debug(f"Received app message: {message} - {sender}")
            if "message" not in message:
                return

            # Inject the text message as if it were a complete spoken user turn.
            await task.queue_frames(
                [
                    UserStartedSpeakingFrame(),
                    TranscriptionFrame(
                        user_id=sender, timestamp=time.time(), text=message["message"]
                    ),
                    UserStoppedSpeakingFrame(),
                ]
            )

        runner = PipelineRunner()

        await runner.run(task)


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,98 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import os
import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import (
LLMMessagesFrame,
)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.filters.stt_mute_filter import STTMuteConfig, STTMuteFilter, STTMuteStrategy
from pipecat.services.deepgram import DeepgramSTTService, DeepgramTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main():
    """Run a voice bot whose STT is muted during the bot's first speech.

    Builds a Daily transport, Deepgram STT/TTS and an OpenAI LLM, places an
    ``STTMuteFilter`` (``FIRST_SPEECH`` strategy) ahead of the STT service,
    and runs the pipeline until it ends. The bot is asked to introduce itself
    when the first participant joins.
    """
    async with aiohttp.ClientSession() as session:
        (room_url, _) = await configure(session)

        daily_params = DailyParams(
            audio_out_enabled=True,
            vad_enabled=True,
            vad_analyzer=SileroVADAnalyzer(),
            vad_audio_passthrough=True,
        )
        transport = DailyTransport(room_url, None, "Respond bot", daily_params)

        stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))

        # Configure the mute processor to mute only during first speech
        mute_config = STTMuteConfig(strategy=STTMuteStrategy.FIRST_SPEECH)
        stt_mute_processor = STTMuteFilter(stt_service=stt, config=mute_config)

        tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-helios-en")

        llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")

        system_prompt = {
            "role": "system",
            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
        }
        messages = [system_prompt]

        context = OpenAILLMContext(messages)
        context_aggregator = llm.create_context_aggregator(context)

        stages = [
            transport.input(),  # Transport user input
            stt_mute_processor,  # Add the mute processor before STT
            stt,  # STT
            context_aggregator.user(),  # User responses
            llm,  # LLM
            tts,  # TTS
            transport.output(),  # Transport bot output
            context_aggregator.assistant(),  # Assistant spoken responses
        ]
        pipeline = Pipeline(stages)

        task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            # Kick off the conversation.
            intro_request = {"role": "system", "content": "Please introduce yourself to the user."}
            messages.append(intro_request)
            await task.queue_frames([LLMMessagesFrame(messages)])

        runner = PipelineRunner()

        await runner.run(task)


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,191 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import os
import sys
import time
import aiohttp
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import (
BotSpeakingFrame,
EndFrame,
Frame,
InputAudioRawFrame,
StartInterruptionFrame,
StopInterruptionFrame,
TextFrame,
TranscriptionFrame,
TTSAudioRawFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame,
)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
class DebugProcessor(FrameProcessor):
    """Pass-through processor that logs the frames flowing past it.

    High-volume frame types (raw input audio, bot-speaking heartbeats,
    user-stopped-speaking, TTS audio and text) are not logged; every frame,
    logged or not, is forwarded unchanged in its original direction.

    Args:
        name: Label prefixed to each log line.
        **kwargs: Forwarded to ``FrameProcessor``.
    """

    def __init__(self, name, **kwargs):
        self._name = name
        super().__init__(**kwargs)

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Log ``frame`` unless it is one of the noisy types, then forward it."""
        await super().process_frame(frame, direction)

        noisy_types = (
            InputAudioRawFrame,
            BotSpeakingFrame,
            UserStoppedSpeakingFrame,
            TTSAudioRawFrame,
            TextFrame,
        )
        if not isinstance(frame, noisy_types):
            logger.debug(f"--- {self._name}: {frame} {direction}")

        await self.push_frame(frame, direction)
async def main():
    """Stress-test interruption handling by replaying scripted user turns.

    No transport input is wired into the pipeline. Once the first participant
    joins, the handler queues a fixed sequence of interruption, speaking and
    transcription frames directly onto the task, alternating between two
    prompts for about 300 seconds, and then queues an ``EndFrame``.
    """
    async with aiohttp.ClientSession() as session:
        (room_url, _) = await configure(session)

        transport = DailyTransport(
            room_url,
            None,
            "AI Bot",
            DailyParams(
                audio_out_enabled=True,
                transcription_enabled=True,
                vad_enabled=True,
                vad_analyzer=SileroVADAnalyzer(),
            ),
        )

        tts = CartesiaTTSService(
            api_key=os.getenv("CARTESIA_API_KEY"),
            voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22",  # British Lady
        )

        llm = OpenAILLMService(api_key=os.environ["OPENAI_API_KEY"], model="gpt-4o")

        messages = [
            {
                "role": "system",
                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
            },
        ]

        dp = DebugProcessor("dp")

        context = OpenAILLMContext(messages)
        context_aggregator = llm.create_context_aggregator(context)

        runner = PipelineRunner()

        task = PipelineTask(
            Pipeline(
                [
                    # transport.input() is intentionally left out: user input is
                    # simulated by the frames queued in the handler below.
                    context_aggregator.user(),
                    llm,
                    dp,
                    tts,
                    transport.output(),
                    context_aggregator.assistant(),
                ]
            ),
            PipelineParams(
                allow_interruptions=True,
            ),
        )

        # Register an event handler so we can start the scripted conversation
        # when the participant joins.
        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            participant_id = participant.get("info", {}).get("participantId", "")

            async def simulate_user_turn(text):
                """Queue one interrupting user turn that says ``text``."""
                await task.queue_frame(StartInterruptionFrame())
                await asyncio.sleep(1)
                await task.queue_frame(UserStartedSpeakingFrame())
                await asyncio.sleep(1)
                await task.queue_frame(TranscriptionFrame(text, participant_id, time.time()))
                await asyncio.sleep(1)
                # queue_frame (singular): a single frame is queued here, not an
                # iterable of frames.
                await task.queue_frame(StopInterruptionFrame())
                await asyncio.sleep(1)
                await task.queue_frame(UserStoppedSpeakingFrame())
                # Give the bot time to start answering before the next interruption.
                await asyncio.sleep(5)

            # Create frames for 300 seconds
            start_time = time.time()
            while time.time() - start_time < 300:
                elapsed_time = round(time.time() - start_time)
                logger.info(f"Running for {elapsed_time} seconds")

                await simulate_user_turn("Tell me more about your company.")
                await simulate_user_turn("Give me a list of appointment dates.")

            await task.queue_frame(EndFrame())

        await runner.run(task)


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -5,36 +5,33 @@
#
import asyncio
import aiohttp
import os
import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from PIL import Image
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import (
BotStartedSpeakingFrame,
BotStoppedSpeakingFrame,
Frame,
LLMMessagesFrame,
OutputImageRawFrame,
SpriteFrame,
)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.frames.frames import (
OutputImageRawFrame,
SpriteFrame,
Frame,
LLMMessagesFrame,
TTSAudioRawFrame,
TTSStoppedFrame,
)
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
@@ -73,15 +70,15 @@ class TalkingAnimation(FrameProcessor):
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
if isinstance(frame, TTSAudioRawFrame):
if isinstance(frame, BotStartedSpeakingFrame):
if not self._is_talking:
await self.push_frame(talking_frame)
self._is_talking = True
elif isinstance(frame, TTSStoppedFrame):
elif isinstance(frame, BotStoppedSpeakingFrame):
await self.push_frame(quiet_frame)
self._is_talking = False
await self.push_frame(frame)
await self.push_frame(frame, direction)
async def main():
@@ -162,7 +159,7 @@ async def main():
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
transport.capture_participant_transcription(participant["id"])
await transport.capture_participant_transcription(participant["id"])
await task.queue_frames([LLMMessagesFrame(messages)])
runner = PipelineRunner()

View File

@@ -42,7 +42,7 @@ aws = [ "boto3~=1.35.27" ]
azure = [ "azure-cognitiveservices-speech~=1.40.0" ]
canonical = [ "aiofiles~=24.1.0" ]
cartesia = [ "cartesia~=1.0.13", "websockets~=13.1" ]
daily = [ "daily-python~=0.12.0" ]
daily = [ "daily-python~=0.13.0" ]
deepgram = [ "deepgram-sdk~=3.7.3" ]
elevenlabs = [ "websockets~=13.1" ]
examples = [ "python-dotenv~=1.0.1", "flask~=3.0.3", "flask_cors~=4.0.1" ]
@@ -51,7 +51,7 @@ gladia = [ "websockets~=13.1" ]
google = [ "google-generativeai~=0.8.3", "google-cloud-texttospeech~=2.17.2" ]
gstreamer = [ "pygobject~=3.48.2" ]
fireworks = [ "openai~=1.37.2" ]
krisp = [ "pipecat-ai-krisp~=0.2.0" ]
krisp = [ "pipecat-ai-krisp~=0.3.0" ]
langchain = [ "langchain~=0.2.14", "langchain-community~=0.2.12", "langchain-openai~=0.1.20" ]
livekit = [ "livekit~=0.17.5", "livekit-api~=0.7.1", "tenacity~=8.5.0" ]
lmnt = [ "lmnt~=1.1.4" ]

View File

@@ -570,6 +570,13 @@ class TTSUpdateSettingsFrame(ServiceUpdateSettingsFrame):
pass
@dataclass
class STTMuteFrame(ControlFrame):
    """Control frame to mute/unmute the STT service.

    Attributes:
        mute: ``True`` to request muting, ``False`` to request unmuting.
    """

    mute: bool
@dataclass
class STTUpdateSettingsFrame(ServiceUpdateSettingsFrame):
    """Settings-update frame variant for STT services; adds no fields of its own."""

View File

@@ -15,6 +15,7 @@ from loguru import logger
from PIL import Image
from pipecat.frames.frames import (
AudioRawFrame,
Frame,
FunctionCallInProgressFrame,
FunctionCallResultFrame,
@@ -174,6 +175,10 @@ class OpenAILLMContext:
content.append({"type": "text", "text": text})
self.add_message({"role": "user", "content": content})
def add_audio_frames_message(self, *, audio_frames: list[AudioRawFrame], text: str = None):
    """Placeholder for adding a message built from audio frames to the context.

    This implementation does nothing and returns ``None``.

    Args:
        audio_frames: Audio frames that would make up the message.
        text: Optional text to accompany the audio.
    """
    # todo: implement for OpenAI models and others
    return None
async def call_function(
self,
f: Callable[
@@ -213,6 +218,29 @@ class OpenAILLMContext:
await f(function_name, tool_call_id, arguments, llm, self, function_call_result_callback)
def create_wav_header(self, sample_rate, num_channels, bits_per_sample, data_size):
    """Build the 44-byte RIFF/WAVE header for uncompressed PCM audio.

    Args:
        sample_rate: Samples per second.
        num_channels: Number of interleaved channels.
        bits_per_sample: Bit depth of one sample.
        data_size: Size in bytes of the audio payload that follows the header.

    Returns:
        A ``bytearray`` holding the header; all integers are little-endian.

    Raises:
        OverflowError: If a value does not fit its fixed-width header field.
    """

    def le(value, width):
        # Fixed-width little-endian encoding of one header field.
        return value.to_bytes(width, "little")

    bytes_per_sample = bits_per_sample // 8

    fields = (
        # RIFF chunk descriptor
        b"RIFF",  # ChunkID
        le(data_size + 36, 4),  # ChunkSize: total size - 8
        b"WAVE",  # Format
        # "fmt " sub-chunk
        b"fmt ",  # Subchunk1ID
        le(16, 4),  # Subchunk1Size (16 for PCM)
        le(1, 2),  # AudioFormat (1 for PCM)
        le(num_channels, 2),  # NumChannels
        le(sample_rate, 4),  # SampleRate
        le(sample_rate * num_channels * bytes_per_sample, 4),  # ByteRate
        le(num_channels * bytes_per_sample, 2),  # BlockAlign
        le(bits_per_sample, 2),  # BitsPerSample
        # "data" sub-chunk
        b"data",  # Subchunk2ID
        le(data_size, 4),  # Subchunk2Size
    )
    return bytearray(b"".join(fields))
@dataclass
class OpenAILLMContextFrame(Frame):

View File

View File

@@ -11,19 +11,27 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
class FunctionFilter(FrameProcessor):
def __init__(self, filter: Callable[[Frame], Awaitable[bool]]):
def __init__(
self,
filter: Callable[[Frame], Awaitable[bool]],
direction: FrameDirection = FrameDirection.DOWNSTREAM,
):
super().__init__()
self._filter = filter
self._direction = direction
#
# Frame processor
#
def _should_passthrough_frame(self, frame):
return isinstance(frame, SystemFrame)
# Ignore system frames and frames that are not following the direction of this gate
def _should_passthrough_frame(self, frame, direction):
return isinstance(frame, SystemFrame) or direction != self._direction
async def process_frame(self, frame: Frame, direction: FrameDirection):
passthrough = self._should_passthrough_frame(frame)
await super().process_frame(frame, direction)
passthrough = self._should_passthrough_frame(frame, direction)
allowed = await self._filter(frame)
if passthrough or allowed:
await self.push_frame(frame, direction)

View File

@@ -0,0 +1,111 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
from dataclasses import dataclass
from enum import Enum
from typing import Awaitable, Callable, Optional
from loguru import logger
from pipecat.frames.frames import (
BotStartedSpeakingFrame,
BotStoppedSpeakingFrame,
Frame,
StartInterruptionFrame,
StopInterruptionFrame,
STTMuteFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.ai_services import STTService
class STTMuteStrategy(Enum):
    """Policies for when STT is muted while the bot is speaking."""

    # Mute only during first bot speech
    FIRST_SPEECH = "first_speech"
    # Mute during all bot speech
    ALWAYS = "always"
    # Allow custom logic via callback
    CUSTOM = "custom"
@dataclass
class STTMuteConfig:
    """Configuration for STTMuteFilter.

    Attributes:
        strategy: Which muting policy the filter applies.
        should_mute_callback: Async predicate consulted when ``strategy`` is
            ``STTMuteStrategy.CUSTOM``; it receives the filter instance and
            returns whether STT should be muted. Defaults to ``None``.
    """

    strategy: STTMuteStrategy
    # Optional callback for custom muting logic
    should_mute_callback: Optional[Callable[["STTMuteFilter"], Awaitable[bool]]] = None
class STTMuteFilter(FrameProcessor):
    """A general-purpose processor that handles STT muting and interruption control.

    This processor combines the concepts of STT muting and interruption control,
    treating them as a single coordinated feature. When STT is muted, interruptions
    are automatically disabled.

    Args:
        stt_service: STT service whose ``is_muted`` state is read.
        config: Muting strategy and optional custom callback.
        **kwargs: Forwarded to ``FrameProcessor``.
    """

    def __init__(self, stt_service: STTService, config: STTMuteConfig, **kwargs):
        super().__init__(**kwargs)
        self._stt_service = stt_service
        self._config = config
        self._first_speech_handled = False
        self._bot_is_speaking = False

    @property
    def is_muted(self) -> bool:
        """Returns whether STT is currently muted."""
        return self._stt_service.is_muted

    async def _handle_mute_state(self, should_mute: bool):
        """Handles both STT muting and interruption control.

        An ``STTMuteFrame`` is pushed only when the requested state differs
        from the STT service's current state.
        """
        if should_mute != self.is_muted:
            logger.debug(f"STT {'muting' if should_mute else 'unmuting'}")
            await self.push_frame(STTMuteFrame(mute=should_mute))

    async def _should_mute(self) -> bool:
        """Determines if STT should be muted based on current state and strategy.

        Note that with ``FIRST_SPEECH`` this call has a side effect: the first
        time it returns ``True`` it records that the first speech was handled,
        so later bot speech no longer mutes.
        """
        if not self._bot_is_speaking:
            return False

        if self._config.strategy == STTMuteStrategy.ALWAYS:
            return True
        elif (
            self._config.strategy == STTMuteStrategy.FIRST_SPEECH and not self._first_speech_handled
        ):
            self._first_speech_handled = True
            return True
        elif self._config.strategy == STTMuteStrategy.CUSTOM and self._config.should_mute_callback:
            return await self._config.should_mute_callback(self)

        return False

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Update the mute state on bot-speech frames and filter VAD-related frames."""
        # Let the base class do its own per-frame bookkeeping first, as every
        # other processor in this code base does.
        await super().process_frame(frame, direction)

        # Handle bot speaking state changes
        if isinstance(frame, BotStartedSpeakingFrame):
            self._bot_is_speaking = True
            await self._handle_mute_state(await self._should_mute())
        elif isinstance(frame, BotStoppedSpeakingFrame):
            self._bot_is_speaking = False
            await self._handle_mute_state(await self._should_mute())

        # Handle frame propagation
        if isinstance(
            frame,
            (
                StartInterruptionFrame,
                StopInterruptionFrame,
                UserStartedSpeakingFrame,
                UserStoppedSpeakingFrame,
            ),
        ):
            # Only pass VAD-related frames when not muted
            if not self.is_muted:
                await self.push_frame(frame, direction)
            else:
                logger.debug(f"{frame.__class__.__name__} suppressed - STT currently muted")
        else:
            # Pass all other frames through
            await self.push_frame(frame, direction)

View File

@@ -246,6 +246,8 @@ class FrameProcessor:
await self._prev.queue_frame(frame, direction)
except Exception as e:
logger.exception(f"Uncaught exception in {self}: {e}")
await self.push_error(ErrorFrame(str(e)))
raise
def __create_input_task(self):
self.__input_queue = asyncio.Queue()

View File

@@ -591,6 +591,7 @@ class RTVIProcessor(FrameProcessor):
self._message_queue = asyncio.Queue()
self._message_task = self.get_event_loop().create_task(self._message_task_handler())
self._register_event_handler("on_bot_started")
self._register_event_handler("on_client_ready")
def register_action(self, action: RTVIAction):
@@ -679,7 +680,7 @@ class RTVIProcessor(FrameProcessor):
await self._pipeline.cleanup()
async def _start(self, frame: StartFrame):
pass
await self._call_event_handler("on_bot_started")
async def _stop(self, frame: EndFrame):
await self._cancel_tasks()
@@ -742,6 +743,8 @@ class RTVIProcessor(FrameProcessor):
case "update-config":
update_config = RTVIUpdateConfig.model_validate(message.data)
await self._handle_update_config(message.id, update_config)
case "disconnect-bot":
await self.push_frame(EndFrame())
case "action":
action = RTVIActionRun.model_validate(message.data)
action_frame = RTVIActionFrame(message_id=message.id, rtvi_action_run=action)

View File

@@ -22,6 +22,7 @@ from pipecat.frames.frames import (
LLMFullResponseEndFrame,
StartFrame,
StartInterruptionFrame,
STTMuteFrame,
STTUpdateSettingsFrame,
TextFrame,
TTSAudioRawFrame,
@@ -454,6 +455,12 @@ class STTService(AIService):
super().__init__(**kwargs)
self._audio_passthrough = audio_passthrough
self._settings: Dict[str, Any] = {}
self._muted: bool = False
@property
def is_muted(self) -> bool:
    """Whether this STT service is currently muted (value of ``self._muted``)."""
    muted = self._muted
    return muted
@abstractmethod
async def set_model(self, model: str):
@@ -482,7 +489,8 @@ class STTService(AIService):
logger.warning(f"Unknown setting for STT service: {key}")
async def process_audio_frame(self, frame: AudioRawFrame):
await self.process_generator(self.run_stt(frame.audio))
if not self._muted:
await self.process_generator(self.run_stt(frame.audio))
async def process_frame(self, frame: Frame, direction: FrameDirection):
"""Processes a frame of audio data, either buffering or transcribing it."""
@@ -497,6 +505,9 @@ class STTService(AIService):
await self.push_frame(frame, direction)
elif isinstance(frame, STTUpdateSettingsFrame):
await self._update_settings(frame.settings)
elif isinstance(frame, STTMuteFrame):
self._muted = frame.mute
logger.debug(f"STT service {'muted' if frame.mute else 'unmuted'}")
else:
await self.push_frame(frame, direction)

View File

@@ -5,7 +5,6 @@
#
import asyncio
from typing import AsyncGenerator, Optional
from loguru import logger
@@ -33,6 +32,82 @@ except ModuleNotFoundError as e:
raise Exception(f"Missing module: {e}")
def language_to_aws_language(language: Language) -> str | None:
    """Translate a ``Language`` member into the AWS language code string.

    Args:
        language: The language to translate.

    Returns:
        The mapped code (e.g. ``"en-US"``), or ``None`` when the language has
        no entry in the table below.
    """
    aws_codes = {
        # Arabic
        Language.AR: "arb",
        Language.AR_AE: "ar-AE",
        # Catalan
        Language.CA: "ca-ES",
        # Chinese
        Language.ZH: "cmn-CN",  # Mandarin
        Language.YUE: "yue-CN",  # Cantonese
        Language.YUE_CN: "yue-CN",
        # Czech
        Language.CS: "cs-CZ",
        # Danish
        Language.DA: "da-DK",
        # Dutch
        Language.NL: "nl-NL",
        Language.NL_BE: "nl-BE",
        # English (bare EN falls back to US English)
        Language.EN: "en-US",
        Language.EN_AU: "en-AU",
        Language.EN_GB: "en-GB",
        Language.EN_IN: "en-IN",
        Language.EN_NZ: "en-NZ",
        Language.EN_US: "en-US",
        Language.EN_ZA: "en-ZA",
        # Finnish
        Language.FI: "fi-FI",
        # French
        Language.FR: "fr-FR",
        Language.FR_BE: "fr-BE",
        Language.FR_CA: "fr-CA",
        # German
        Language.DE: "de-DE",
        Language.DE_AT: "de-AT",
        Language.DE_CH: "de-CH",
        # Hindi
        Language.HI: "hi-IN",
        # Icelandic
        Language.IS: "is-IS",
        # Italian
        Language.IT: "it-IT",
        # Japanese
        Language.JA: "ja-JP",
        # Korean
        Language.KO: "ko-KR",
        # Norwegian (all three members share the Bokmal code)
        Language.NO: "nb-NO",
        Language.NB: "nb-NO",
        Language.NB_NO: "nb-NO",
        # Polish
        Language.PL: "pl-PL",
        # Portuguese
        Language.PT: "pt-PT",
        Language.PT_BR: "pt-BR",
        Language.PT_PT: "pt-PT",
        # Romanian
        Language.RO: "ro-RO",
        # Russian
        Language.RU: "ru-RU",
        # Spanish
        Language.ES: "es-ES",
        Language.ES_MX: "es-MX",
        Language.ES_US: "es-US",
        # Swedish
        Language.SV: "sv-SE",
        # Turkish
        Language.TR: "tr-TR",
        # Welsh
        Language.CY: "cy-GB",
        Language.CY_GB: "cy-GB",
    }

    # dict.get yields None for languages that are not listed.
    return aws_codes.get(language)
class AWSTTSService(TTSService):
class InputParams(BaseModel):
engine: Optional[str] = None
@@ -65,7 +140,7 @@ class AWSTTSService(TTSService):
"engine": params.engine,
"language": self.language_to_service_language(params.language)
if params.language
else Language.EN,
else "en-US",
"pitch": params.pitch,
"rate": params.rate,
"volume": params.volume,
@@ -77,62 +152,7 @@ class AWSTTSService(TTSService):
return True
def language_to_service_language(self, language: Language) -> str | None:
match language:
case Language.CA:
return "ca-ES"
case Language.ZH:
return "cmn-CN"
case Language.DA:
return "da-DK"
case Language.NL:
return "nl-NL"
case Language.NL_BE:
return "nl-BE"
case Language.EN | Language.EN_US:
return "en-US"
case Language.EN_AU:
return "en-AU"
case Language.EN_GB:
return "en-GB"
case Language.EN_NZ:
return "en-NZ"
case Language.EN_IN:
return "en-IN"
case Language.FI:
return "fi-FI"
case Language.FR:
return "fr-FR"
case Language.FR_CA:
return "fr-CA"
case Language.DE:
return "de-DE"
case Language.HI:
return "hi-IN"
case Language.IT:
return "it-IT"
case Language.JA:
return "ja-JP"
case Language.KO:
return "ko-KR"
case Language.NO:
return "nb-NO"
case Language.PL:
return "pl-PL"
case Language.PT:
return "pt-PT"
case Language.PT_BR:
return "pt-BR"
case Language.RO:
return "ro-RO"
case Language.RU:
return "ru-RU"
case Language.ES:
return "es-ES"
case Language.SV:
return "sv-SE"
case Language.TR:
return "tr-TR"
return None
return language_to_aws_language(language)
def _construct_ssml(self, text: str) -> str:
ssml = "<speak>"

View File

@@ -61,6 +61,343 @@ except ModuleNotFoundError as e:
raise Exception(f"Missing module: {e}")
def language_to_azure_language(language: Language) -> str | None:
    """Translate a ``Language`` member into the Azure locale code string.

    Args:
        language: The language to translate.

    Returns:
        The mapped locale (e.g. ``"en-US"``), or ``None`` when the language
        has no entry in the table below.
    """
    azure_locales = {
        # Afrikaans
        Language.AF: "af-ZA",
        Language.AF_ZA: "af-ZA",
        # Amharic
        Language.AM: "am-ET",
        Language.AM_ET: "am-ET",
        # Arabic (bare AR falls back to UAE Arabic)
        Language.AR: "ar-AE",
        Language.AR_AE: "ar-AE",
        Language.AR_BH: "ar-BH",
        Language.AR_DZ: "ar-DZ",
        Language.AR_EG: "ar-EG",
        Language.AR_IQ: "ar-IQ",
        Language.AR_JO: "ar-JO",
        Language.AR_KW: "ar-KW",
        Language.AR_LB: "ar-LB",
        Language.AR_LY: "ar-LY",
        Language.AR_MA: "ar-MA",
        Language.AR_OM: "ar-OM",
        Language.AR_QA: "ar-QA",
        Language.AR_SA: "ar-SA",
        Language.AR_SY: "ar-SY",
        Language.AR_TN: "ar-TN",
        Language.AR_YE: "ar-YE",
        # Assamese
        Language.AS: "as-IN",
        Language.AS_IN: "as-IN",
        # Azerbaijani
        Language.AZ: "az-AZ",
        Language.AZ_AZ: "az-AZ",
        # Bulgarian
        Language.BG: "bg-BG",
        Language.BG_BG: "bg-BG",
        # Bengali (bare BN falls back to Indian Bengali)
        Language.BN: "bn-IN",
        Language.BN_BD: "bn-BD",
        Language.BN_IN: "bn-IN",
        # Bosnian
        Language.BS: "bs-BA",
        Language.BS_BA: "bs-BA",
        # Catalan
        Language.CA: "ca-ES",
        Language.CA_ES: "ca-ES",
        # Czech
        Language.CS: "cs-CZ",
        Language.CS_CZ: "cs-CZ",
        # Welsh
        Language.CY: "cy-GB",
        Language.CY_GB: "cy-GB",
        # Danish
        Language.DA: "da-DK",
        Language.DA_DK: "da-DK",
        # German
        Language.DE: "de-DE",
        Language.DE_AT: "de-AT",
        Language.DE_CH: "de-CH",
        Language.DE_DE: "de-DE",
        # Greek
        Language.EL: "el-GR",
        Language.EL_GR: "el-GR",
        # English (bare EN falls back to US English)
        Language.EN: "en-US",
        Language.EN_AU: "en-AU",
        Language.EN_CA: "en-CA",
        Language.EN_GB: "en-GB",
        Language.EN_HK: "en-HK",
        Language.EN_IE: "en-IE",
        Language.EN_IN: "en-IN",
        Language.EN_KE: "en-KE",
        Language.EN_NG: "en-NG",
        Language.EN_NZ: "en-NZ",
        Language.EN_PH: "en-PH",
        Language.EN_SG: "en-SG",
        Language.EN_TZ: "en-TZ",
        Language.EN_US: "en-US",
        Language.EN_ZA: "en-ZA",
        # Spanish (bare ES falls back to Spain Spanish)
        Language.ES: "es-ES",
        Language.ES_AR: "es-AR",
        Language.ES_BO: "es-BO",
        Language.ES_CL: "es-CL",
        Language.ES_CO: "es-CO",
        Language.ES_CR: "es-CR",
        Language.ES_CU: "es-CU",
        Language.ES_DO: "es-DO",
        Language.ES_EC: "es-EC",
        Language.ES_ES: "es-ES",
        Language.ES_GQ: "es-GQ",
        Language.ES_GT: "es-GT",
        Language.ES_HN: "es-HN",
        Language.ES_MX: "es-MX",
        Language.ES_NI: "es-NI",
        Language.ES_PA: "es-PA",
        Language.ES_PE: "es-PE",
        Language.ES_PR: "es-PR",
        Language.ES_PY: "es-PY",
        Language.ES_SV: "es-SV",
        Language.ES_US: "es-US",
        Language.ES_UY: "es-UY",
        Language.ES_VE: "es-VE",
        # Estonian
        Language.ET: "et-EE",
        Language.ET_EE: "et-EE",
        # Basque
        Language.EU: "eu-ES",
        Language.EU_ES: "eu-ES",
        # Persian
        Language.FA: "fa-IR",
        Language.FA_IR: "fa-IR",
        # Finnish
        Language.FI: "fi-FI",
        Language.FI_FI: "fi-FI",
        # Filipino
        Language.FIL: "fil-PH",
        Language.FIL_PH: "fil-PH",
        # French
        Language.FR: "fr-FR",
        Language.FR_BE: "fr-BE",
        Language.FR_CA: "fr-CA",
        Language.FR_CH: "fr-CH",
        Language.FR_FR: "fr-FR",
        # Irish
        Language.GA: "ga-IE",
        Language.GA_IE: "ga-IE",
        # Galician
        Language.GL: "gl-ES",
        Language.GL_ES: "gl-ES",
        # Gujarati
        Language.GU: "gu-IN",
        Language.GU_IN: "gu-IN",
        # Hebrew
        Language.HE: "he-IL",
        Language.HE_IL: "he-IL",
        # Hindi
        Language.HI: "hi-IN",
        Language.HI_IN: "hi-IN",
        # Croatian
        Language.HR: "hr-HR",
        Language.HR_HR: "hr-HR",
        # Hungarian
        Language.HU: "hu-HU",
        Language.HU_HU: "hu-HU",
        # Armenian
        Language.HY: "hy-AM",
        Language.HY_AM: "hy-AM",
        # Indonesian
        Language.ID: "id-ID",
        Language.ID_ID: "id-ID",
        # Icelandic
        Language.IS: "is-IS",
        Language.IS_IS: "is-IS",
        # Italian
        Language.IT: "it-IT",
        Language.IT_IT: "it-IT",
        # Inuktitut
        Language.IU_CANS_CA: "iu-Cans-CA",
        Language.IU_LATN_CA: "iu-Latn-CA",
        # Japanese
        Language.JA: "ja-JP",
        Language.JA_JP: "ja-JP",
        # Javanese
        Language.JV: "jv-ID",
        Language.JV_ID: "jv-ID",
        # Georgian
        Language.KA: "ka-GE",
        Language.KA_GE: "ka-GE",
        # Kazakh
        Language.KK: "kk-KZ",
        Language.KK_KZ: "kk-KZ",
        # Khmer
        Language.KM: "km-KH",
        Language.KM_KH: "km-KH",
        # Kannada
        Language.KN: "kn-IN",
        Language.KN_IN: "kn-IN",
        # Korean
        Language.KO: "ko-KR",
        Language.KO_KR: "ko-KR",
        # Lao
        Language.LO: "lo-LA",
        Language.LO_LA: "lo-LA",
        # Lithuanian
        Language.LT: "lt-LT",
        Language.LT_LT: "lt-LT",
        # Latvian
        Language.LV: "lv-LV",
        Language.LV_LV: "lv-LV",
        # Macedonian
        Language.MK: "mk-MK",
        Language.MK_MK: "mk-MK",
        # Malayalam
        Language.ML: "ml-IN",
        Language.ML_IN: "ml-IN",
        # Mongolian
        Language.MN: "mn-MN",
        Language.MN_MN: "mn-MN",
        # Marathi
        Language.MR: "mr-IN",
        Language.MR_IN: "mr-IN",
        # Malay
        Language.MS: "ms-MY",
        Language.MS_MY: "ms-MY",
        # Maltese
        Language.MT: "mt-MT",
        Language.MT_MT: "mt-MT",
        # Burmese
        Language.MY: "my-MM",
        Language.MY_MM: "my-MM",
        # Norwegian (all three members share the Bokmal locale)
        Language.NB: "nb-NO",
        Language.NB_NO: "nb-NO",
        Language.NO: "nb-NO",
        # Nepali
        Language.NE: "ne-NP",
        Language.NE_NP: "ne-NP",
        # Dutch
        Language.NL: "nl-NL",
        Language.NL_BE: "nl-BE",
        Language.NL_NL: "nl-NL",
        # Odia
        Language.OR: "or-IN",
        Language.OR_IN: "or-IN",
        # Punjabi
        Language.PA: "pa-IN",
        Language.PA_IN: "pa-IN",
        # Polish
        Language.PL: "pl-PL",
        Language.PL_PL: "pl-PL",
        # Pashto
        Language.PS: "ps-AF",
        Language.PS_AF: "ps-AF",
        # Portuguese
        Language.PT: "pt-PT",
        Language.PT_BR: "pt-BR",
        Language.PT_PT: "pt-PT",
        # Romanian
        Language.RO: "ro-RO",
        Language.RO_RO: "ro-RO",
        # Russian
        Language.RU: "ru-RU",
        Language.RU_RU: "ru-RU",
        # Sinhala
        Language.SI: "si-LK",
        Language.SI_LK: "si-LK",
        # Slovak
        Language.SK: "sk-SK",
        Language.SK_SK: "sk-SK",
        # Slovenian
        Language.SL: "sl-SI",
        Language.SL_SI: "sl-SI",
        # Somali
        Language.SO: "so-SO",
        Language.SO_SO: "so-SO",
        # Albanian
        Language.SQ: "sq-AL",
        Language.SQ_AL: "sq-AL",
        # Serbian
        Language.SR: "sr-RS",
        Language.SR_RS: "sr-RS",
        Language.SR_LATN: "sr-Latn-RS",
        Language.SR_LATN_RS: "sr-Latn-RS",
        # Sundanese
        Language.SU: "su-ID",
        Language.SU_ID: "su-ID",
        # Swedish
        Language.SV: "sv-SE",
        Language.SV_SE: "sv-SE",
        # Swahili
        Language.SW: "sw-KE",
        Language.SW_KE: "sw-KE",
        Language.SW_TZ: "sw-TZ",
        # Tamil
        Language.TA: "ta-IN",
        Language.TA_IN: "ta-IN",
        Language.TA_LK: "ta-LK",
        Language.TA_MY: "ta-MY",
        Language.TA_SG: "ta-SG",
        # Telugu
        Language.TE: "te-IN",
        Language.TE_IN: "te-IN",
        # Thai
        Language.TH: "th-TH",
        Language.TH_TH: "th-TH",
        # Turkish
        Language.TR: "tr-TR",
        Language.TR_TR: "tr-TR",
        # Ukrainian
        Language.UK: "uk-UA",
        Language.UK_UA: "uk-UA",
        # Urdu
        Language.UR: "ur-IN",
        Language.UR_IN: "ur-IN",
        Language.UR_PK: "ur-PK",
        # Uzbek
        Language.UZ: "uz-UZ",
        Language.UZ_UZ: "uz-UZ",
        # Vietnamese
        Language.VI: "vi-VN",
        Language.VI_VN: "vi-VN",
        # Wu Chinese
        Language.WUU: "wuu-CN",
        Language.WUU_CN: "wuu-CN",
        # Yue Chinese
        Language.YUE: "yue-CN",
        Language.YUE_CN: "yue-CN",
        # Chinese
        Language.ZH: "zh-CN",
        Language.ZH_CN: "zh-CN",
        Language.ZH_CN_GUANGXI: "zh-CN-guangxi",
        Language.ZH_CN_HENAN: "zh-CN-henan",
        Language.ZH_CN_LIAONING: "zh-CN-liaoning",
        Language.ZH_CN_SHAANXI: "zh-CN-shaanxi",
        Language.ZH_CN_SHANDONG: "zh-CN-shandong",
        Language.ZH_CN_SICHUAN: "zh-CN-sichuan",
        Language.ZH_HK: "zh-HK",
        Language.ZH_TW: "zh-TW",
        # Zulu
        Language.ZU: "zu-ZA",
        Language.ZU_ZA: "zu-ZA",
    }

    # dict.get yields None for languages that are not listed.
    return azure_locales.get(language)
def sample_rate_to_output_format(sample_rate: int) -> SpeechSynthesisOutputFormat:
    """Pick the raw 16-bit mono PCM output format matching a sample rate.

    Args:
        sample_rate: Requested sample rate in Hz.

    Returns:
        The matching ``SpeechSynthesisOutputFormat`` member; sample rates not
        listed below fall back to the 24 kHz format.
    """
    formats_by_rate = {
        8000: SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm,
        16000: SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm,
        22050: SpeechSynthesisOutputFormat.Raw22050Hz16BitMonoPcm,
        24000: SpeechSynthesisOutputFormat.Raw24Khz16BitMonoPcm,
        44100: SpeechSynthesisOutputFormat.Raw44100Hz16BitMonoPcm,
        48000: SpeechSynthesisOutputFormat.Raw48Khz16BitMonoPcm,
    }
    fallback = SpeechSynthesisOutputFormat.Raw24Khz16BitMonoPcm
    return formats_by_rate.get(sample_rate, fallback)
class AzureLLMService(BaseOpenAILLMService):
def __init__(
self, *, api_key: str, endpoint: str, model: str, api_version: str = "2023-12-01-preview"
@@ -89,23 +426,6 @@ class AzureLLMService(BaseOpenAILLMService):
return OpenAIContextAggregatorPair(_user=user, _assistant=assistant)
def sample_rate_to_output_format(sample_rate: int) -> SpeechSynthesisOutputFormat:
match sample_rate:
case 8000:
return SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm
case 16000:
return SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm
case 22050:
return SpeechSynthesisOutputFormat.Raw22050Hz16BitMonoPcm
case 24000:
return SpeechSynthesisOutputFormat.Raw24Khz16BitMonoPcm
case 44100:
return SpeechSynthesisOutputFormat.Raw44100Hz16BitMonoPcm
case 48000:
return SpeechSynthesisOutputFormat.Raw48Khz16BitMonoPcm
return SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm
class AzureBaseTTSService(TTSService):
class InputParams(BaseModel):
emphasis: Optional[str] = None
@@ -134,7 +454,7 @@ class AzureBaseTTSService(TTSService):
"emphasis": params.emphasis,
"language": self.language_to_service_language(params.language)
if params.language
else Language.EN_US,
else "en-US",
"pitch": params.pitch,
"rate": params.rate,
"role": params.role,
@@ -152,92 +472,7 @@ class AzureBaseTTSService(TTSService):
return True
def language_to_service_language(self, language: Language) -> str | None:
match language:
case Language.BG:
return "bg-BG"
case Language.CA:
return "ca-ES"
case Language.ZH:
return "zh-CN"
case Language.ZH_TW:
return "zh-TW"
case Language.CS:
return "cs-CZ"
case Language.DA:
return "da-DK"
case Language.NL:
return "nl-NL"
case Language.EN | Language.EN_US:
return "en-US"
case Language.EN_AU:
return "en-AU"
case Language.EN_GB:
return "en-GB"
case Language.EN_NZ:
return "en-NZ"
case Language.EN_IN:
return "en-IN"
case Language.ET:
return "et-EE"
case Language.FI:
return "fi-FI"
case Language.NL_BE:
return "nl-BE"
case Language.FR:
return "fr-FR"
case Language.FR_CA:
return "fr-CA"
case Language.DE:
return "de-DE"
case Language.DE_CH:
return "de-CH"
case Language.EL:
return "el-GR"
case Language.HI:
return "hi-IN"
case Language.HU:
return "hu-HU"
case Language.ID:
return "id-ID"
case Language.IT:
return "it-IT"
case Language.JA:
return "ja-JP"
case Language.KO:
return "ko-KR"
case Language.LV:
return "lv-LV"
case Language.LT:
return "lt-LT"
case Language.MS:
return "ms-MY"
case Language.NO:
return "nb-NO"
case Language.PL:
return "pl-PL"
case Language.PT:
return "pt-PT"
case Language.PT_BR:
return "pt-BR"
case Language.RO:
return "ro-RO"
case Language.RU:
return "ru-RU"
case Language.SK:
return "sk-SK"
case Language.ES:
return "es-ES"
case Language.SV:
return "sv-SE"
case Language.TH:
return "th-TH"
case Language.TR:
return "tr-TR"
case Language.UK:
return "uk-UA"
case Language.VI:
return "vi-VN"
return None
return language_to_azure_language(language)
def _construct_ssml(self, text: str) -> str:
language = self._settings["language"]

View File

@@ -7,6 +7,7 @@
import asyncio
import base64
import json
import random
import uuid
from typing import AsyncGenerator, List, Optional, Union
@@ -44,29 +45,27 @@ except ModuleNotFoundError as e:
def language_to_cartesia_language(language: Language) -> str | None:
match language:
case Language.DE:
return "de"
case (
Language.EN
| Language.EN_US
| Language.EN_GB
| Language.EN_AU
| Language.EN_NZ
| Language.EN_IN
):
return "en"
case Language.ES:
return "es"
case Language.FR | Language.FR_CA:
return "fr"
case Language.JA:
return "ja"
case Language.PT | Language.PT_BR:
return "pt"
case Language.ZH | Language.ZH_TW:
return "zh"
return None
BASE_LANGUAGES = {
Language.DE: "de",
Language.EN: "en",
Language.ES: "es",
Language.FR: "fr",
Language.JA: "ja",
Language.PT: "pt",
Language.ZH: "zh",
}
result = BASE_LANGUAGES.get(language)
# If not found in base languages, try to find the base language from a variant
if not result:
# Convert enum value to string and get the base language part (e.g. es-ES -> es)
lang_str = str(language.value)
base_code = lang_str.split("-")[0].lower()
# Look up the base code in our supported languages
result = base_code if base_code in BASE_LANGUAGES.values() else None
return result
class CartesiaTTSService(WordTTSService):
@@ -117,7 +116,7 @@ class CartesiaTTSService(WordTTSService):
},
"language": self.language_to_service_language(params.language)
if params.language
else Language.EN,
else "en",
"speed": params.speed,
"emotion": params.emotion,
}
@@ -224,17 +223,22 @@ class CartesiaTTSService(WordTTSService):
async def _receive_task_handler(self):
try:
async for message in self._get_websocket():
# Randomly cancel the asyncio task 1% of the time
if random.random() < 0.01:
logger.info(f"Cancelling task for {self} due to random chance")
asyncio.current_task().cancel()
msg = json.loads(message)
if not msg or msg["context_id"] != self._context_id:
continue
if msg["type"] == "done":
await self.push_frame(TTSStoppedFrame())
await self.stop_ttfb_metrics()
# Unset _context_id but not the _context_id_start_timestamp
# because we are likely still playing out audio and need the
# timestamp to set send context frames.
self._context_id = None
await self.add_word_timestamps([("LLMFullResponseEndFrame", 0), ("Reset", 0)])
await self.add_word_timestamps(
[("TTSStoppedFrame", 0), ("LLMFullResponseEndFrame", 0), ("Reset", 0)]
)
elif msg["type"] == "timestamps":
await self.add_word_timestamps(
list(zip(msg["word_timestamps"]["words"], msg["word_timestamps"]["start"]))
@@ -257,6 +261,7 @@ class CartesiaTTSService(WordTTSService):
logger.error(f"Cartesia error, unknown message type: {msg}")
except asyncio.CancelledError:
pass
# await self.push_error(ErrorFrame(f"{self} cancelled", True))
except Exception as e:
logger.error(f"{self} exception: {e}")
@@ -331,7 +336,7 @@ class CartesiaHttpTTSService(TTSService):
},
"language": self.language_to_service_language(params.language)
if params.language
else Language.EN,
else "en",
"speed": params.speed,
"emotion": params.emotion,
}

View File

@@ -42,6 +42,51 @@ except ModuleNotFoundError as e:
ElevenLabsOutputFormat = Literal["pcm_16000", "pcm_22050", "pcm_24000", "pcm_44100"]
def language_to_elevenlabs_language(language: Language) -> str | None:
    """Translate a ``Language`` member into a two-letter ElevenLabs language code.

    Base languages are looked up directly. Any other member is reduced to the
    part of its enum value before the first ``-`` (lower-cased) and accepted
    only if that code is one of the supported codes.

    Args:
        language: The language to translate.

    Returns:
        The language code, or ``None`` when neither lookup succeeds.
    """
    BASE_LANGUAGES = {
        Language.BG: "bg",
        Language.CS: "cs",
        Language.DA: "da",
        Language.DE: "de",
        Language.EL: "el",
        Language.EN: "en",
        Language.ES: "es",
        Language.FI: "fi",
        Language.FR: "fr",
        Language.HI: "hi",
        Language.HU: "hu",
        Language.ID: "id",
        Language.IT: "it",
        Language.JA: "ja",
        Language.KO: "ko",
        Language.MS: "ms",
        Language.NL: "nl",
        Language.NO: "no",
        Language.PL: "pl",
        Language.PT: "pt",
        Language.RO: "ro",
        Language.RU: "ru",
        Language.SK: "sk",
        Language.SV: "sv",
        Language.TR: "tr",
        Language.UK: "uk",
        Language.VI: "vi",
        Language.ZH: "zh",
    }

    # Fast path: the member is itself one of the base languages.
    direct_match = BASE_LANGUAGES.get(language)
    if direct_match:
        return direct_match

    # Regional variant: keep only the prefix of the enum value
    # (e.g. "es-ES" -> "es") and check it against the supported codes.
    # NOTE(review): a variant whose prefix differs from the service code
    # (presumably "nb" vs. "no" for Norwegian) resolves to None - confirm
    # that is intended.
    prefix = str(language.value).split("-")[0].lower()
    if prefix in BASE_LANGUAGES.values():
        return prefix
    return None
def sample_rate_from_output_format(output_format: str) -> int:
match output_format:
case "pcm_16000":
@@ -135,7 +180,7 @@ class ElevenLabsTTSService(WordTTSService):
"sample_rate": sample_rate_from_output_format(output_format),
"language": self.language_to_service_language(params.language)
if params.language
else Language.EN,
else "en",
"output_format": output_format,
"optimize_streaming_latency": params.optimize_streaming_latency,
"stability": params.stability,
@@ -158,73 +203,7 @@ class ElevenLabsTTSService(WordTTSService):
return True
def language_to_service_language(self, language: Language) -> str | None:
match language:
case Language.BG:
return "bg"
case Language.ZH:
return "zh"
case Language.CS:
return "cs"
case Language.DA:
return "da"
case Language.NL:
return "nl"
case (
Language.EN
| Language.EN_US
| Language.EN_AU
| Language.EN_GB
| Language.EN_NZ
| Language.EN_IN
):
return "en"
case Language.FI:
return "fi"
case Language.FR | Language.FR_CA:
return "fr"
case Language.DE | Language.DE_CH:
return "de"
case Language.EL:
return "el"
case Language.HI:
return "hi"
case Language.HU:
return "hu"
case Language.ID:
return "id"
case Language.IT:
return "it"
case Language.JA:
return "ja"
case Language.KO:
return "ko"
case Language.MS:
return "ms"
case Language.NO:
return "no"
case Language.PL:
return "pl"
case Language.PT:
return "pt-PT"
case Language.PT_BR:
return "pt-BR"
case Language.RO:
return "ro"
case Language.RU:
return "ru"
case Language.SK:
return "sk"
case Language.ES:
return "es"
case Language.SV:
return "sv"
case Language.TR:
return "tr"
case Language.UK:
return "uk"
case Language.VI:
return "vi"
return None
return language_to_elevenlabs_language(language)
def _set_voice_settings(self):
voice_settings = {}

View File

@@ -34,6 +34,101 @@ except ModuleNotFoundError as e:
raise Exception(f"Missing module: {e}")
def language_to_gladia_language(language: Language) -> str | None:
    """Translate a ``Language`` member into a two-letter Gladia language code.

    Base languages are looked up directly. Any other member is reduced to the
    part of its enum value before the first ``-`` (lower-cased) and accepted
    only if that code is one of the supported codes.

    Args:
        language: The language to translate.

    Returns:
        The language code, or ``None`` when neither lookup succeeds.
    """
    BASE_LANGUAGES = {
        Language.AF: "af",
        Language.AM: "am",
        Language.AR: "ar",
        Language.AS: "as",
        Language.AZ: "az",
        Language.BG: "bg",
        Language.BN: "bn",
        Language.BS: "bs",
        Language.CA: "ca",
        Language.CS: "cs",
        Language.CY: "cy",
        Language.DA: "da",
        Language.DE: "de",
        Language.EL: "el",
        Language.EN: "en",
        Language.ES: "es",
        Language.ET: "et",
        Language.EU: "eu",
        Language.FA: "fa",
        Language.FI: "fi",
        Language.FR: "fr",
        Language.GA: "ga",
        Language.GL: "gl",
        Language.GU: "gu",
        Language.HE: "he",
        Language.HI: "hi",
        Language.HR: "hr",
        Language.HU: "hu",
        Language.HY: "hy",
        Language.ID: "id",
        Language.IS: "is",
        Language.IT: "it",
        Language.JA: "ja",
        Language.JV: "jv",
        Language.KA: "ka",
        Language.KK: "kk",
        Language.KM: "km",
        Language.KN: "kn",
        Language.KO: "ko",
        Language.LO: "lo",
        Language.LT: "lt",
        Language.LV: "lv",
        Language.MK: "mk",
        Language.ML: "ml",
        Language.MN: "mn",
        Language.MR: "mr",
        Language.MS: "ms",
        Language.MT: "mt",
        Language.MY: "my",
        Language.NE: "ne",
        Language.NL: "nl",
        Language.NO: "no",
        Language.OR: "or",
        Language.PA: "pa",
        Language.PL: "pl",
        Language.PS: "ps",
        Language.PT: "pt",
        Language.RO: "ro",
        Language.RU: "ru",
        Language.SI: "si",
        Language.SK: "sk",
        Language.SL: "sl",
        Language.SO: "so",
        Language.SQ: "sq",
        Language.SR: "sr",
        Language.SU: "su",
        Language.SV: "sv",
        Language.SW: "sw",
        Language.TA: "ta",
        Language.TE: "te",
        Language.TH: "th",
        Language.TR: "tr",
        Language.UK: "uk",
        Language.UR: "ur",
        Language.UZ: "uz",
        Language.VI: "vi",
        Language.ZH: "zh",
        Language.ZU: "zu",
    }

    # Fast path: the member is itself one of the base languages.
    direct_match = BASE_LANGUAGES.get(language)
    if direct_match:
        return direct_match

    # Regional variant: keep only the prefix of the enum value
    # (e.g. "es-ES" -> "es") and check it against the supported codes.
    prefix = str(language.value).split("-")[0].lower()
    if prefix in BASE_LANGUAGES.values():
        return prefix
    return None
class GladiaSTTService(STTService):
class InputParams(BaseModel):
sample_rate: Optional[int] = 16000
@@ -79,50 +174,7 @@ class GladiaSTTService(STTService):
self._confidence = confidence
def language_to_service_language(self, language: Language) -> str | None:
language_map = {
Language.BG: "bg",
Language.CA: "ca",
Language.ZH: "zh",
Language.CS: "cs",
Language.DA: "da",
Language.NL: "nl",
Language.EN: "en",
Language.EN_US: "en",
Language.EN_AU: "en",
Language.EN_GB: "en",
Language.EN_NZ: "en",
Language.EN_IN: "en",
Language.ET: "et",
Language.FI: "fi",
Language.FR: "fr",
Language.FR_CA: "fr",
Language.DE: "de",
Language.DE_CH: "de",
Language.EL: "el",
Language.HI: "hi",
Language.HU: "hu",
Language.ID: "id",
Language.IT: "it",
Language.JA: "ja",
Language.KO: "ko",
Language.LV: "lv",
Language.LT: "lt",
Language.MS: "ms",
Language.NO: "no",
Language.PL: "pl",
Language.PT: "pt",
Language.PT_BR: "pt",
Language.RO: "ro",
Language.RU: "ru",
Language.SK: "sk",
Language.ES: "es",
Language.SV: "sv",
Language.TH: "th",
Language.TR: "tr",
Language.UK: "uk",
Language.VI: "vi",
}
return language_map.get(language)
return language_to_gladia_language(language)
async def start(self, frame: StartFrame):
await super().start(frame)

View File

@@ -16,6 +16,7 @@ from PIL import Image
from pydantic import BaseModel, Field
from pipecat.frames.frames import (
AudioRawFrame,
ErrorFrame,
Frame,
LLMFullResponseEndFrame,
@@ -55,6 +56,166 @@ except ModuleNotFoundError as e:
raise Exception(f"Missing module: {e}")
def language_to_google_language(language: Language) -> str | None:
    """Translate a ``Language`` member into the Google language code string.

    Args:
        language: The language to translate.

    Returns:
        The mapped code (e.g. ``"en-US"``), or ``None`` when the language has
        no entry in the table below.
    """
    google_codes = {
        # Afrikaans
        Language.AF: "af-ZA",
        Language.AF_ZA: "af-ZA",
        # Arabic
        Language.AR: "ar-XA",
        # Bengali
        Language.BN: "bn-IN",
        Language.BN_IN: "bn-IN",
        # Bulgarian
        Language.BG: "bg-BG",
        Language.BG_BG: "bg-BG",
        # Catalan
        Language.CA: "ca-ES",
        Language.CA_ES: "ca-ES",
        # Chinese (Mandarin and Cantonese)
        Language.ZH: "cmn-CN",
        Language.ZH_CN: "cmn-CN",
        Language.ZH_TW: "cmn-TW",
        Language.ZH_HK: "yue-HK",
        # Czech
        Language.CS: "cs-CZ",
        Language.CS_CZ: "cs-CZ",
        # Danish
        Language.DA: "da-DK",
        Language.DA_DK: "da-DK",
        # Dutch
        Language.NL: "nl-NL",
        Language.NL_BE: "nl-BE",
        Language.NL_NL: "nl-NL",
        # English
        Language.EN: "en-US",
        Language.EN_US: "en-US",
        Language.EN_AU: "en-AU",
        Language.EN_GB: "en-GB",
        Language.EN_IN: "en-IN",
        # Estonian
        Language.ET: "et-EE",
        Language.ET_EE: "et-EE",
        # Filipino
        Language.FIL: "fil-PH",
        Language.FIL_PH: "fil-PH",
        # Finnish
        Language.FI: "fi-FI",
        Language.FI_FI: "fi-FI",
        # French
        Language.FR: "fr-FR",
        Language.FR_CA: "fr-CA",
        Language.FR_FR: "fr-FR",
        # Galician
        Language.GL: "gl-ES",
        Language.GL_ES: "gl-ES",
        # German
        Language.DE: "de-DE",
        Language.DE_DE: "de-DE",
        # Greek
        Language.EL: "el-GR",
        Language.EL_GR: "el-GR",
        # Gujarati
        Language.GU: "gu-IN",
        Language.GU_IN: "gu-IN",
        # Hebrew
        Language.HE: "he-IL",
        Language.HE_IL: "he-IL",
        # Hindi
        Language.HI: "hi-IN",
        Language.HI_IN: "hi-IN",
        # Hungarian
        Language.HU: "hu-HU",
        Language.HU_HU: "hu-HU",
        # Icelandic
        Language.IS: "is-IS",
        Language.IS_IS: "is-IS",
        # Indonesian
        Language.ID: "id-ID",
        Language.ID_ID: "id-ID",
        # Italian
        Language.IT: "it-IT",
        Language.IT_IT: "it-IT",
        # Japanese
        Language.JA: "ja-JP",
        Language.JA_JP: "ja-JP",
        # Kannada
        Language.KN: "kn-IN",
        Language.KN_IN: "kn-IN",
        # Korean
        Language.KO: "ko-KR",
        Language.KO_KR: "ko-KR",
        # Latvian
        Language.LV: "lv-LV",
        Language.LV_LV: "lv-LV",
        # Lithuanian
        Language.LT: "lt-LT",
        Language.LT_LT: "lt-LT",
        # Malay
        Language.MS: "ms-MY",
        Language.MS_MY: "ms-MY",
        # Malayalam
        Language.ML: "ml-IN",
        Language.ML_IN: "ml-IN",
        # Marathi
        Language.MR: "mr-IN",
        Language.MR_IN: "mr-IN",
        # Norwegian (all three members share the Bokmal code)
        Language.NO: "nb-NO",
        Language.NB: "nb-NO",
        Language.NB_NO: "nb-NO",
        # Polish
        Language.PL: "pl-PL",
        Language.PL_PL: "pl-PL",
        # Portuguese
        Language.PT: "pt-PT",
        Language.PT_BR: "pt-BR",
        Language.PT_PT: "pt-PT",
        # Punjabi
        Language.PA: "pa-IN",
        Language.PA_IN: "pa-IN",
        # Romanian
        Language.RO: "ro-RO",
        Language.RO_RO: "ro-RO",
        # Russian
        Language.RU: "ru-RU",
        Language.RU_RU: "ru-RU",
        # Serbian
        Language.SR: "sr-RS",
        Language.SR_RS: "sr-RS",
        # Slovak
        Language.SK: "sk-SK",
        Language.SK_SK: "sk-SK",
        # Spanish
        Language.ES: "es-ES",
        Language.ES_ES: "es-ES",
        Language.ES_US: "es-US",
        # Swedish
        Language.SV: "sv-SE",
        Language.SV_SE: "sv-SE",
        # Tamil
        Language.TA: "ta-IN",
        Language.TA_IN: "ta-IN",
        # Telugu
        Language.TE: "te-IN",
        Language.TE_IN: "te-IN",
        # Thai
        Language.TH: "th-TH",
        Language.TH_TH: "th-TH",
        # Turkish
        Language.TR: "tr-TR",
        Language.TR_TR: "tr-TR",
        # Ukrainian
        Language.UK: "uk-UA",
        Language.UK_UA: "uk-UA",
        # Vietnamese
        Language.VI: "vi-VN",
        Language.VI_VN: "vi-VN",
    }

    # dict.get yields None for languages that are not listed.
    return google_codes.get(language)
class GoogleUserContextAggregator(OpenAIUserContextAggregator):
async def _push_aggregation(self):
if len(self._aggregation) > 0:
@@ -120,9 +281,10 @@ class GoogleAssistantContextAggregator(OpenAIAssistantContextAggregator):
)
run_llm = not bool(self._function_calls_in_progress)
else:
self._context.add_message(
glm.Content(role="model", parts=[glm.Part(text=aggregation)])
)
if aggregation.strip():
self._context.add_message(
glm.Content(role="model", parts=[glm.Part(text=aggregation)])
)
if self._pending_image_frame_message:
frame = self._pending_image_frame_message
@@ -184,11 +346,53 @@ class GoogleLLMContext(OpenAILLMContext):
msgs.append(obj)
return msgs
def add_image_frame_message(
    self, *, format: str, size: tuple[int, int], image: bytes, text: str = None
):
    """Append a user message holding a JPEG-encoded image and optional text.

    Args:
        format: Mode string handed to ``Image.frombytes`` for the raw bytes.
        size: Image dimensions handed to ``Image.frombytes``.
        image: Raw image bytes to re-encode as JPEG.
        text: Optional text; when truthy it is added as a part placed
            before the image part.
    """
    # Re-encode the raw bytes as JPEG into an in-memory buffer.
    jpeg_buffer = io.BytesIO()
    Image.frombytes(format, size, image).save(jpeg_buffer, format="JPEG")

    # The text part (if any) precedes the image part.
    parts = [glm.Part(text=text)] if text else []
    jpeg_blob = glm.Blob(mime_type="image/jpeg", data=jpeg_buffer.getvalue())
    parts.append(glm.Part(inline_data=jpeg_blob))

    self.add_message(glm.Content(role="user", parts=parts))
def add_audio_frames_message(self, *, audio_frames: list[AudioRawFrame], text: str = None):
    """Append a user message holding the given audio as one WAV blob and optional text.

    Does nothing when ``audio_frames`` is empty.

    Args:
        audio_frames: Frames whose ``audio`` bytes are concatenated in order.
            The sample rate and channel count are taken from the first frame.
        text: Optional text; when truthy it is added as a part placed
            before the audio part.
    """
    if not audio_frames:
        return

    first_frame = audio_frames[0]
    sample_rate = first_frame.sample_rate
    num_channels = first_frame.num_channels

    # Join every frame's audio payload into one contiguous byte string.
    pcm = b"".join(frame.audio for frame in audio_frames)

    # The text part (if any) precedes the audio part.
    parts = [glm.Part(text=text)] if text else []

    # Header first, then the samples. 16 is passed as the third argument
    # (presumably bits per sample - confirm against create_wav_header).
    wav_header = self.create_wav_header(sample_rate, num_channels, 16, len(pcm))
    wav_blob = glm.Blob(mime_type="audio/wav", data=bytes(wav_header + pcm))
    parts.append(glm.Part(inline_data=wav_blob))

    self.add_message(glm.Content(role="user", parts=parts))
def from_standard_message(self, message):
role = message["role"]
content = message.get("content", [])
if role == "system":
role = "user"
self.system_message = content
return None
elif role == "assistant":
role = "model"
@@ -232,20 +436,6 @@ class GoogleLLMContext(OpenAILLMContext):
message = glm.Content(role=role, parts=parts)
return message
def add_image_frame_message(
self, *, format: str, size: tuple[int, int], image: bytes, text: str = None
):
buffer = io.BytesIO()
Image.frombytes(format, size, image).save(buffer, format="JPEG")
parts = []
if text:
parts.append(glm.Part(text=text))
parts.append(
glm.Part(inline_data=glm.Blob(mime_type="image/jpeg", data=buffer.getvalue())),
)
self.add_message(glm.Content(role="user", parts=parts))
def to_standard_messages(self, obj) -> list:
msg = {"role": obj.role, "content": []}
if msg["role"] == "model":
@@ -289,9 +479,20 @@ class GoogleLLMContext(OpenAILLMContext):
return [msg]
def _restructure_from_openai_messages(self):
self.system_message = None
# first, map across self._messages calling self.from_standard_message(m) to modify messages in place
try:
self._messages[:] = [self.from_standard_message(m) for m in self._messages]
self._messages[:] = [
msg
for msg in (self.from_standard_message(m) for m in self._messages)
if msg is not None
]
# We might have been given a messages list with only a system message. If so, let's put that back in
# the messages list as a user message.
if self.system_message and not self._messages:
self.add_message(
glm.Content(role="user", parts=[glm.Part(text=self.system_message)])
)
except Exception as e:
logger.error(f"Error mapping messages: {e}")
# iterate over messages and remove any messages that have an empty content list
@@ -319,11 +520,14 @@ class GoogleLLMService(LLMService):
api_key: str,
model: str = "gemini-1.5-flash-latest",
params: InputParams = InputParams(),
system_instruction: Optional[str] = None,
**kwargs,
):
super().__init__(**kwargs)
gai.configure(api_key=api_key)
self._create_client(model)
self.set_model_name(model)
self._system_instruction = system_instruction
self._create_client()
self._settings = {
"max_tokens": params.max_tokens,
"temperature": params.temperature,
@@ -335,34 +539,10 @@ class GoogleLLMService(LLMService):
def can_generate_metrics(self) -> bool:
    """Report that this service can generate metrics (always ``True``)."""
    return True
def _create_client(self, model: str):
self.set_model_name(model)
self._client = gai.GenerativeModel(model)
def _get_messages_from_openai_context(self, context: OpenAILLMContext) -> List[glm.Content]:
openai_messages = context.get_messages()
google_messages = []
for message in openai_messages:
role = message["role"]
content = message["content"]
if role == "system":
role = "user"
elif role == "assistant":
role = "model"
parts = [glm.Part(text=content)]
if "mime_type" in message:
parts.append(
glm.Part(
inline_data=glm.Blob(
mime_type=message["mime_type"], data=message["data"].getvalue()
)
)
)
google_messages.append({"role": role, "parts": parts})
return google_messages
def _create_client(self):
    """Build ``self._client`` from the current model name and system instruction."""
    client = gai.GenerativeModel(
        self._model_name,
        system_instruction=self._system_instruction,
    )
    self._client = client
async def _async_generator_wrapper(self, sync_generator):
for item in sync_generator:
@@ -374,10 +554,11 @@ class GoogleLLMService(LLMService):
try:
logger.debug(f"Generating chat: {context.get_messages_for_logging()}")
# todo: move this into the new context code structure, convert from openai context one time
# todo: add system instructions
# messages = self._get_messages_from_openai_context(context)
messages = context.messages
if self._system_instruction != context.system_message:
logger.debug(f"System instruction changed: {context.system_message}")
self._system_instruction = context.system_message
self._create_client()
# Filter out None values and create GenerationConfig
generation_params = {
@@ -394,24 +575,21 @@ class GoogleLLMService(LLMService):
generation_config = GenerationConfig(**generation_params) if generation_params else None
await self.start_ttfb_metrics()
tools = context.tools if context.tools else []
response = self._client.generate_content(
contents=messages, tools=tools, stream=True, generation_config=generation_config
)
tokens = LLMTokenUsage(
prompt_tokens=response.usage_metadata.prompt_token_count,
completion_tokens=response.usage_metadata.candidates_token_count,
total_tokens=response.usage_metadata.total_token_count,
)
await self.start_llm_usage_metrics(tokens)
await self.stop_ttfb_metrics()
prompt_tokens = response.usage_metadata.prompt_token_count
completion_tokens = response.usage_metadata.candidates_token_count
total_tokens = response.usage_metadata.total_token_count
async for chunk in self._async_generator_wrapper(response):
# todo: usage
if chunk.usage_metadata:
prompt_tokens += response.usage_metadata.prompt_token_count
completion_tokens += response.usage_metadata.candidates_token_count
total_tokens += response.usage_metadata.total_token_count
try:
for c in chunk.parts:
if c.text:
@@ -436,6 +614,13 @@ class GoogleLLMService(LLMService):
except Exception as e:
logger.exception(f"{self} exception: {e}")
finally:
await self.start_llm_usage_metrics(
LLMTokenUsage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=total_tokens,
)
)
await self.push_frame(LLMFullResponseEndFrame())
async def process_frame(self, frame: Frame, direction: FrameDirection):
@@ -499,7 +684,7 @@ class GoogleTTSService(TTSService):
"emphasis": params.emphasis,
"language": self.language_to_service_language(params.language)
if params.language
else Language.EN,
else "en-US",
"gender": params.gender,
"google_style": params.google_style,
}
@@ -530,88 +715,7 @@ class GoogleTTSService(TTSService):
return True
def language_to_service_language(self, language: Language) -> str | None:
match language:
case Language.BG:
return "bg-BG"
case Language.CA:
return "ca-ES"
case Language.ZH:
return "cmn-CN"
case Language.ZH_TW:
return "cmn-TW"
case Language.CS:
return "cs-CZ"
case Language.DA:
return "da-DK"
case Language.NL:
return "nl-NL"
case Language.EN | Language.EN_US:
return "en-US"
case Language.EN_AU:
return "en-AU"
case Language.EN_GB:
return "en-GB"
case Language.EN_IN:
return "en-IN"
case Language.ET:
return "et-EE"
case Language.FI:
return "fi-FI"
case Language.NL_BE:
return "nl-BE"
case Language.FR:
return "fr-FR"
case Language.FR_CA:
return "fr-CA"
case Language.DE:
return "de-DE"
case Language.EL:
return "el-GR"
case Language.HI:
return "hi-IN"
case Language.HU:
return "hu-HU"
case Language.ID:
return "id-ID"
case Language.IT:
return "it-IT"
case Language.JA:
return "ja-JP"
case Language.KO:
return "ko-KR"
case Language.LV:
return "lv-LV"
case Language.LT:
return "lt-LT"
case Language.MS:
return "ms-MY"
case Language.NO:
return "nb-NO"
case Language.PL:
return "pl-PL"
case Language.PT:
return "pt-PT"
case Language.PT_BR:
return "pt-BR"
case Language.RO:
return "ro-RO"
case Language.RU:
return "ru-RU"
case Language.SK:
return "sk-SK"
case Language.ES:
return "es-ES"
case Language.SV:
return "sv-SE"
case Language.TH:
return "th-TH"
case Language.TR:
return "tr-TR"
case Language.UK:
return "uk-UA"
case Language.VI:
return "vi-VN"
return None
return language_to_google_language(language)
def _construct_ssml(self, text: str) -> str:
ssml = "<speak>"

View File

@@ -35,6 +35,30 @@ except ModuleNotFoundError as e:
raise Exception(f"Missing module: {e}")
def language_to_lmnt_language(language: Language) -> str | None:
    """Map a ``Language`` to the short language code used for LMNT.

    An exact match in the table wins. Otherwise the part of the enum value
    before the first ``-`` is lower-cased (e.g. ``es-ES`` -> ``es``) and
    returned if it is one of the table's codes.

    Returns:
        The language code, or ``None`` when the language is not supported.
    """
    supported = {
        Language.DE: "de",
        Language.EN: "en",
        Language.ES: "es",
        Language.FR: "fr",
        Language.KO: "ko",
        Language.PT: "pt",
        Language.ZH: "zh",
    }

    exact = supported.get(language)
    if exact:
        return exact

    # Regional variant: fall back to the base language part of the value.
    primary_subtag = str(language.value).split("-")[0].lower()
    if primary_subtag in supported.values():
        return primary_subtag
    return None
class LmntTTSService(TTSService):
def __init__(
self,
@@ -72,29 +96,7 @@ class LmntTTSService(TTSService):
return True
def language_to_service_language(self, language: Language) -> str | None:
match language:
case Language.DE:
return "de"
case (
Language.EN
| Language.EN_US
| Language.EN_AU
| Language.EN_GB
| Language.EN_NZ
| Language.EN_IN
):
return "en"
case Language.ES:
return "es"
case Language.FR | Language.FR_CA:
return "fr"
case Language.PT | Language.PT_BR:
return "pt"
case Language.ZH | Language.ZH_TW:
return "zh"
case Language.KO:
return "ko"
return None
return language_to_lmnt_language(language)
async def start(self, frame: StartFrame):
await super().start(frame)

View File

@@ -441,8 +441,6 @@ class OpenAIRealtimeBetaLLMService(LLMService):
async def _handle_evt_speech_started(self, evt):
await self._truncate_current_audio_response()
# todo: might need to guard sending these when we fully support using either openai
# turn detection of Pipecat turn detection
await self._start_interruption() # cancels this processor task
await self.push_frame(StartInterruptionFrame()) # cancels downstream tasks
await self.push_frame(UserStartedSpeakingFrame())

View File

@@ -47,63 +47,40 @@ except ModuleNotFoundError as e:
def language_to_playht_language(language: Language) -> str | None:
match language:
case Language.BG:
return "BULGARIAN"
case Language.CA:
return "CATALAN"
case Language.CS:
return "CZECH"
case Language.DA:
return "DANISH"
case Language.DE:
return "GERMAN"
case (
Language.EN
| Language.EN_US
| Language.EN_GB
| Language.EN_AU
| Language.EN_NZ
| Language.EN_IN
):
return "ENGLISH"
case Language.ES:
return "SPANISH"
case Language.FR | Language.FR_CA:
return "FRENCH"
case Language.EL:
return "GREEK"
case Language.HI:
return "HINDI"
case Language.HU:
return "HUNGARIAN"
case Language.ID:
return "INDONESIAN"
case Language.IT:
return "ITALIAN"
case Language.JA:
return "JAPANESE"
case Language.KO:
return "KOREAN"
case Language.MS:
return "MALAY"
case Language.NL:
return "DUTCH"
case Language.PL:
return "POLISH"
case Language.PT | Language.PT_BR:
return "PORTUGUESE"
case Language.RU:
return "RUSSIAN"
case Language.SV:
return "SWEDISH"
case Language.TH:
return "THAI"
case Language.TR:
return "TURKISH"
case Language.UK:
return "UKRAINIAN"
return None
language_map = {
Language.BG: "bulgarian",
Language.CA: "catalan",
Language.CS: "czech",
Language.DA: "danish",
Language.DE: "german",
Language.EN: "english",
Language.EN_US: "english",
Language.EN_GB: "english",
Language.EN_AU: "english",
Language.EN_NZ: "english",
Language.EN_IN: "english",
Language.ES: "spanish",
Language.FR: "french",
Language.FR_CA: "french",
Language.EL: "greek",
Language.HI: "hindi",
Language.HU: "hungarian",
Language.ID: "indonesian",
Language.IT: "italian",
Language.JA: "japanese",
Language.KO: "korean",
Language.MS: "malay",
Language.NL: "dutch",
Language.PL: "polish",
Language.PT: "portuguese",
Language.PT_BR: "portuguese",
Language.RU: "russian",
Language.SV: "swedish",
Language.TH: "thai",
Language.TR: "turkish",
Language.UK: "ukrainian",
}
return language_map.get(language)
class PlayHTTTSService(TTSService):
@@ -118,7 +95,7 @@ class PlayHTTTSService(TTSService):
api_key: str,
user_id: str,
voice_url: str,
voice_engine: str = "PlayHT3.0-mini",
voice_engine: str = "Play3.0-mini",
sample_rate: int = 24000,
output_format: str = "wav",
params: InputParams = InputParams(),
@@ -140,7 +117,7 @@ class PlayHTTTSService(TTSService):
"sample_rate": sample_rate,
"language": self.language_to_service_language(params.language)
if params.language
else Language.EN,
else "english",
"output_format": output_format,
"voice_engine": voice_engine,
"speed": params.speed,
@@ -153,8 +130,7 @@ class PlayHTTTSService(TTSService):
return True
def language_to_service_language(self, language: Language) -> str | None:
# Keep your existing language mapping logic here
pass
return language_to_playht_language(language)
async def start(self, frame: StartFrame):
await super().start(frame)
@@ -234,17 +210,11 @@ class PlayHTTTSService(TTSService):
async def _receive_task_handler(self):
try:
header_size = 78 # Size of the WAV header + extra bytes we want to skip
header_received = False
async for message in self._get_websocket():
if isinstance(message, bytes):
chunk_size = len(message)
# Skip the WAV header
if not header_received and chunk_size == header_size:
header_received = True
# Skip the WAV header message
if message.startswith(b"RIFF"):
continue
await self.stop_ttfb_metrics()
frame = TTSAudioRawFrame(message, self._settings["sample_rate"], 1)
await self.push_frame(frame)
@@ -254,7 +224,6 @@ class PlayHTTTSService(TTSService):
msg = json.loads(message)
if "request_id" in msg and msg["request_id"] == self._request_id:
await self.push_frame(TTSStoppedFrame())
header_received = False # Reset for the next audio stream
self._request_id = None
elif "error" in msg:
logger.error(f"{self} error: {msg}")
@@ -334,7 +303,7 @@ class PlayHTHttpTTSService(TTSService):
api_key: str,
user_id: str,
voice_url: str,
voice_engine: str = "PlayHT3.0-mini",
voice_engine: str = "Play3.0-mini",
sample_rate: int = 24000,
params: InputParams = InputParams(),
**kwargs,
@@ -352,7 +321,7 @@ class PlayHTHttpTTSService(TTSService):
"sample_rate": sample_rate,
"language": self.language_to_service_language(params.language)
if params.language
else Language.EN,
else "english",
"format": Format.FORMAT_WAV,
"voice_engine": voice_engine,
"speed": params.speed,

View File

@@ -0,0 +1,101 @@
from typing import AsyncGenerator, Optional
import aiohttp
from loguru import logger
from pydantic import BaseModel
from pipecat.frames.frames import (
ErrorFrame,
Frame,
TTSAudioRawFrame,
TTSStartedFrame,
TTSStoppedFrame,
)
from pipecat.services.ai_services import TTSService
class RimeHttpTTSService(TTSService):
    """Text-to-speech service that streams audio from Rime's HTTP endpoint.

    Each ``run_tts`` call POSTs the text together with the configured
    settings and re-emits the response body as ``TTSAudioRawFrame`` chunks.
    """

    class InputParams(BaseModel):
        """Optional tuning values copied into the request payload."""

        pause_between_brackets: Optional[bool] = False
        phonemize_between_brackets: Optional[bool] = False
        inline_speed_alpha: Optional[str] = None
        speed_alpha: Optional[float] = 1.0
        reduce_latency: Optional[bool] = False

    def __init__(
        self,
        *,
        api_key: str,
        voice_id: str = "eva",
        model: str = "mist",
        sample_rate: int = 24000,
        params: InputParams = InputParams(),
        **kwargs,
    ):
        """Store credentials and build the static part of the request payload.

        Args:
            api_key: Sent as a bearer token in the ``Authorization`` header.
            voice_id: Sent as the ``speaker`` payload field.
            model: Sent as the ``modelId`` payload field.
            sample_rate: Sent as ``samplingRate`` and used for emitted frames.
            params: Extra tuning values; see ``InputParams``.
            **kwargs: Forwarded to ``TTSService.__init__``.
        """
        super().__init__(sample_rate=sample_rate, **kwargs)

        self._api_key = api_key
        self._base_url = "https://users.rime.ai/v1/rime-tts"
        self._settings = {
            "speaker": voice_id,
            "modelId": model,
            "samplingRate": sample_rate,
            "speedAlpha": params.speed_alpha,
            "reduceLatency": params.reduce_latency,
            "pauseBetweenBrackets": params.pause_between_brackets,
            "phonemizeBetweenBrackets": params.phonemize_between_brackets,
        }

        # Only sent when explicitly configured.
        if params.inline_speed_alpha:
            self._settings["inlineSpeedAlpha"] = params.inline_speed_alpha

    def can_generate_metrics(self) -> bool:
        """Report that this service can generate metrics (always ``True``)."""
        return True

    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
        """Synthesize ``text`` and yield the resulting frames.

        Yields ``TTSStartedFrame``, then one ``TTSAudioRawFrame`` per
        non-empty response chunk (or an ``ErrorFrame`` on a non-200 status or
        an exception), and finally exactly one ``TTSStoppedFrame``.

        Args:
            text: The text to synthesize.
        """
        logger.debug(f"Generating TTS: [{text}]")

        headers = {
            "Accept": "audio/pcm",
            "Authorization": f"Bearer {self._api_key}",
            "Content-Type": "application/json",
        }

        payload = self._settings.copy()
        payload["text"] = text

        try:
            await self.start_ttfb_metrics()
            await self.start_tts_usage_metrics(text)

            yield TTSStartedFrame()

            async with aiohttp.ClientSession() as session:
                async with session.post(self._base_url, json=payload, headers=headers) as response:
                    if response.status != 200:
                        error_message = f"Rime TTS error: HTTP {response.status}"
                        logger.error(error_message)
                        yield ErrorFrame(error=error_message)
                    else:
                        # Process the streaming response
                        chunk_size = 8192
                        first_chunk = True
                        async for chunk in response.content.iter_chunked(chunk_size):
                            if first_chunk:
                                await self.stop_ttfb_metrics()
                                first_chunk = False

                            if chunk:
                                yield TTSAudioRawFrame(chunk, self._settings["samplingRate"], 1)
        except Exception as e:
            logger.exception(f"Error generating TTS: {e}")
            yield ErrorFrame(error=f"Rime TTS error: {str(e)}")

        # Emit the stop frame once, outside any ``finally``: the previous
        # version yielded it both at the end of the ``try`` and in ``finally``
        # (two stop frames per utterance), and a ``yield`` inside ``finally``
        # raises RuntimeError when the generator is closed early.
        yield TTSStoppedFrame()

View File

@@ -7,6 +7,7 @@
from typing import Any, AsyncGenerator, Dict
import aiohttp
from loguru import logger
from pipecat.audio.utils import resample_audio
from pipecat.frames.frames import (
@@ -20,9 +21,6 @@ from pipecat.frames.frames import (
from pipecat.services.ai_services import TTSService
from pipecat.transcriptions.language import Language
from loguru import logger
# The server below can connect to XTTS through a local running docker
#
# Docker command: $ docker run --gpus=all -e COQUI_TOS_AGREED=1 --rm -p 8000:80 ghcr.io/coqui-ai/xtts-streaming-server:latest-cuda121
@@ -31,6 +29,45 @@ from loguru import logger
# https://github.com/coqui-ai/xtts-streaming-server
def language_to_xtts_language(language: Language) -> str | None:
    """Map a ``Language`` to the language code used for XTTS.

    An exact match in the table wins. Otherwise the part of the enum value
    before the first ``-`` is lower-cased (e.g. ``es-ES`` -> ``es``); any
    ``zh`` variant maps to ``zh-cn``, and other base codes are returned only
    if they appear among the table's codes.

    Returns:
        The language code, or ``None`` when the language is not supported.
    """
    supported = {
        Language.CS: "cs",
        Language.DE: "de",
        Language.EN: "en",
        Language.ES: "es",
        Language.FR: "fr",
        Language.HI: "hi",
        Language.HU: "hu",
        Language.IT: "it",
        Language.JA: "ja",
        Language.KO: "ko",
        Language.NL: "nl",
        Language.PL: "pl",
        Language.PT: "pt",
        Language.RU: "ru",
        Language.TR: "tr",
        # Special case for Chinese base language
        Language.ZH: "zh-cn",
    }

    exact = supported.get(language)
    if exact:
        return exact

    # Regional variant: fall back to the base language part of the value.
    primary_subtag = str(language.value).split("-")[0].lower()

    # Every Chinese variant shares the single "zh-cn" code.
    if primary_subtag == "zh":
        return "zh-cn"

    if primary_subtag in supported.values():
        return primary_subtag
    return None
class XTTSService(TTSService):
def __init__(
self,
@@ -56,47 +93,7 @@ class XTTSService(TTSService):
return True
def language_to_service_language(self, language: Language) -> str | None:
match language:
case Language.CS:
return "cs"
case Language.DE:
return "de"
case (
Language.EN
| Language.EN_US
| Language.EN_AU
| Language.EN_GB
| Language.EN_NZ
| Language.EN_IN
):
return "en"
case Language.ES:
return "es"
case Language.FR:
return "fr"
case Language.HI:
return "hi"
case Language.HU:
return "hu"
case Language.IT:
return "it"
case Language.JA:
return "ja"
case Language.KO:
return "ko"
case Language.NL:
return "nl"
case Language.PL:
return "pl"
case Language.PT | Language.PT_BR:
return "pt"
case Language.RU:
return "ru"
case Language.TR:
return "tr"
case Language.ZH:
return "zh-cn"
return None
return language_to_xtts_language(language)
async def start(self, frame: StartFrame):
await super().start(frame)

View File

@@ -5,7 +5,6 @@
#
import sys
from enum import Enum
if sys.version_info < (3, 11):
@@ -20,46 +19,411 @@ else:
class Language(StrEnum):
BG = "bg" # Bulgarian
CA = "ca" # Catalan
ZH = "zh" # Chinese simplified
ZH_TW = "zh-TW" # Chinese traditional
CS = "cs" # Czech
DA = "da" # Danish
NL = "nl" # Dutch
EN = "en" # English
EN_US = "en-US" # English (USA)
EN_AU = "en-AU" # English (Australia)
EN_GB = "en-GB" # English (Great Britain)
EN_NZ = "en-NZ" # English (New Zealand)
EN_IN = "en-IN" # English (India)
ET = "et" # Estonian
FI = "fi" # Finnish
NL_BE = "nl-BE" # Flemmish
FR = "fr" # French
FR_CA = "fr-CA" # French (Canada)
DE = "de" # German
DE_CH = "de-CH" # German (Switzerland)
EL = "el" # Greek
HI = "hi" # Hindi
HU = "hu" # Hungarian
ID = "id" # Indonesian
IT = "it" # Italian
JA = "ja" # Japanese
KO = "ko" # Korean
LV = "lv" # Latvian
LT = "lt" # Lithuanian
MS = "ms" # Malay
NO = "no" # Norwegian
PL = "pl" # Polish
PT = "pt" # Portuguese
PT_BR = "pt-BR" # Portuguese (Brazil)
RO = "ro" # Romanian
RU = "ru" # Russian
SK = "sk" # Slovak
ES = "es" # Spanish
SV = "sv" # Swedish
TH = "th" # Thai
TR = "tr" # Turkish
UK = "uk" # Ukrainian
VI = "vi" # Vietnamese
# Afrikaans
AF = "af"
AF_ZA = "af-ZA"
# Amharic
AM = "am"
AM_ET = "am-ET"
# Arabic
AR = "ar"
AR_AE = "ar-AE"
AR_BH = "ar-BH"
AR_DZ = "ar-DZ"
AR_EG = "ar-EG"
AR_IQ = "ar-IQ"
AR_JO = "ar-JO"
AR_KW = "ar-KW"
AR_LB = "ar-LB"
AR_LY = "ar-LY"
AR_MA = "ar-MA"
AR_OM = "ar-OM"
AR_QA = "ar-QA"
AR_SA = "ar-SA"
AR_SY = "ar-SY"
AR_TN = "ar-TN"
AR_YE = "ar-YE"
# Assamese
AS = "as"
AS_IN = "as-IN"
# Azerbaijani
AZ = "az"
AZ_AZ = "az-AZ"
# Bulgarian
BG = "bg"
BG_BG = "bg-BG"
# Bengali
BN = "bn"
BN_BD = "bn-BD"
BN_IN = "bn-IN"
# Bosnian
BS = "bs"
BS_BA = "bs-BA"
# Catalan
CA = "ca"
CA_ES = "ca-ES"
# Czech
CS = "cs"
CS_CZ = "cs-CZ"
# Welsh
CY = "cy"
CY_GB = "cy-GB"
# Danish
DA = "da"
DA_DK = "da-DK"
# German
DE = "de"
DE_AT = "de-AT"
DE_CH = "de-CH"
DE_DE = "de-DE"
# Greek
EL = "el"
EL_GR = "el-GR"
# English
EN = "en"
EN_AU = "en-AU"
EN_CA = "en-CA"
EN_GB = "en-GB"
EN_HK = "en-HK"
EN_IE = "en-IE"
EN_IN = "en-IN"
EN_KE = "en-KE"
EN_NG = "en-NG"
EN_NZ = "en-NZ"
EN_PH = "en-PH"
EN_SG = "en-SG"
EN_TZ = "en-TZ"
EN_US = "en-US"
EN_ZA = "en-ZA"
# Spanish
ES = "es"
ES_AR = "es-AR"
ES_BO = "es-BO"
ES_CL = "es-CL"
ES_CO = "es-CO"
ES_CR = "es-CR"
ES_CU = "es-CU"
ES_DO = "es-DO"
ES_EC = "es-EC"
ES_ES = "es-ES"
ES_GQ = "es-GQ"
ES_GT = "es-GT"
ES_HN = "es-HN"
ES_MX = "es-MX"
ES_NI = "es-NI"
ES_PA = "es-PA"
ES_PE = "es-PE"
ES_PR = "es-PR"
ES_PY = "es-PY"
ES_SV = "es-SV"
ES_US = "es-US"
ES_UY = "es-UY"
ES_VE = "es-VE"
# Estonian
ET = "et"
ET_EE = "et-EE"
# Basque
EU = "eu"
EU_ES = "eu-ES"
# Persian
FA = "fa"
FA_IR = "fa-IR"
# Finnish
FI = "fi"
FI_FI = "fi-FI"
# Filipino
FIL = "fil"
FIL_PH = "fil-PH"
# French
FR = "fr"
FR_BE = "fr-BE"
FR_CA = "fr-CA"
FR_CH = "fr-CH"
FR_FR = "fr-FR"
# Irish
GA = "ga"
GA_IE = "ga-IE"
# Galician
GL = "gl"
GL_ES = "gl-ES"
# Gujarati
GU = "gu"
GU_IN = "gu-IN"
# Hebrew
HE = "he"
HE_IL = "he-IL"
# Hindi
HI = "hi"
HI_IN = "hi-IN"
# Croatian
HR = "hr"
HR_HR = "hr-HR"
# Hungarian
HU = "hu"
HU_HU = "hu-HU"
# Armenian
HY = "hy"
HY_AM = "hy-AM"
# Indonesian
ID = "id"
ID_ID = "id-ID"
# Icelandic
IS = "is"
IS_IS = "is-IS"
# Italian
IT = "it"
IT_IT = "it-IT"
# Inuktitut
IU_CANS = "iu-Cans"
IU_CANS_CA = "iu-Cans-CA"
IU_LATN = "iu-Latn"
IU_LATN_CA = "iu-Latn-CA"
# Japanese
JA = "ja"
JA_JP = "ja-JP"
# Javanese
JV = "jv"
JV_ID = "jv-ID"
# Georgian
KA = "ka"
KA_GE = "ka-GE"
# Kazakh
KK = "kk"
KK_KZ = "kk-KZ"
# Khmer
KM = "km"
KM_KH = "km-KH"
# Kannada
KN = "kn"
KN_IN = "kn-IN"
# Korean
KO = "ko"
KO_KR = "ko-KR"
# Lao
LO = "lo"
LO_LA = "lo-LA"
# Lithuanian
LT = "lt"
LT_LT = "lt-LT"
# Latvian
LV = "lv"
LV_LV = "lv-LV"
# Macedonian
MK = "mk"
MK_MK = "mk-MK"
# Malayalam
ML = "ml"
ML_IN = "ml-IN"
# Mongolian
MN = "mn"
MN_MN = "mn-MN"
# Marathi
MR = "mr"
MR_IN = "mr-IN"
# Malay
MS = "ms"
MS_MY = "ms-MY"
# Maltese
MT = "mt"
MT_MT = "mt-MT"
# Burmese
MY = "my"
MY_MM = "my-MM"
# Norwegian
NB = "nb"
NB_NO = "nb-NO"
NO = "no"
# Nepali
NE = "ne"
NE_NP = "ne-NP"
# Dutch
NL = "nl"
NL_BE = "nl-BE"
NL_NL = "nl-NL"
# Odia
OR = "or"
OR_IN = "or-IN"
# Punjabi
PA = "pa"
PA_IN = "pa-IN"
# Polish
PL = "pl"
PL_PL = "pl-PL"
# Pashto
PS = "ps"
PS_AF = "ps-AF"
# Portuguese
PT = "pt"
PT_BR = "pt-BR"
PT_PT = "pt-PT"
# Romanian
RO = "ro"
RO_RO = "ro-RO"
# Russian
RU = "ru"
RU_RU = "ru-RU"
# Sinhala
SI = "si"
SI_LK = "si-LK"
# Slovak
SK = "sk"
SK_SK = "sk-SK"
# Slovenian
SL = "sl"
SL_SI = "sl-SI"
# Somali
SO = "so"
SO_SO = "so-SO"
# Albanian
SQ = "sq"
SQ_AL = "sq-AL"
# Serbian
SR = "sr"
SR_RS = "sr-RS"
SR_LATN = "sr-Latn"
SR_LATN_RS = "sr-Latn-RS"
# Sundanese
SU = "su"
SU_ID = "su-ID"
# Swedish
SV = "sv"
SV_SE = "sv-SE"
# Swahili
SW = "sw"
SW_KE = "sw-KE"
SW_TZ = "sw-TZ"
# Tagalog
TL = "tl"
# Tamil
TA = "ta"
TA_IN = "ta-IN"
TA_LK = "ta-LK"
TA_MY = "ta-MY"
TA_SG = "ta-SG"
# Telugu
TE = "te"
TE_IN = "te-IN"
# Thai
TH = "th"
TH_TH = "th-TH"
# Turkish
TR = "tr"
TR_TR = "tr-TR"
# Ukrainian
UK = "uk"
UK_UA = "uk-UA"
# Urdu
UR = "ur"
UR_IN = "ur-IN"
UR_PK = "ur-PK"
# Uzbek
UZ = "uz"
UZ_UZ = "uz-UZ"
# Vietnamese
VI = "vi"
VI_VN = "vi-VN"
# Wu Chinese
WUU = "wuu"
WUU_CN = "wuu-CN"
# Yue Chinese
YUE = "yue"
YUE_CN = "yue-CN"
# Chinese
ZH = "zh"
ZH_CN = "zh-CN"
ZH_CN_GUANGXI = "zh-CN-guangxi"
ZH_CN_HENAN = "zh-CN-henan"
ZH_CN_LIAONING = "zh-CN-liaoning"
ZH_CN_SHAANXI = "zh-CN-shaanxi"
ZH_CN_SHANDONG = "zh-CN-shandong"
ZH_CN_SICHUAN = "zh-CN-sichuan"
ZH_HK = "zh-HK"
ZH_TW = "zh-TW"
# Xhosa
XH = "xh"
# Zulu
ZU = "zu"
ZU_ZA = "zu-ZA"

View File

@@ -71,6 +71,7 @@ class BaseInputTransport(FrameProcessor):
return self._params.vad_analyzer
async def push_audio_frame(self, frame: InputAudioRawFrame):
logger.info(f"Pushing audio qsize: {self._audio_in_queue.qsize()}")
if self._params.audio_in_enabled or self._params.vad_enabled:
await self._audio_in_queue.put(frame)
@@ -167,6 +168,7 @@ class BaseInputTransport(FrameProcessor):
return vad_state
async def _audio_task_handler(self):
logger.info("_audio_task_handler started")
vad_state: VADState = VADState.QUIET
while True:
try:

View File

@@ -70,16 +70,6 @@ class FastAPIWebsocketInputTransport(BaseInputTransport):
await self._callbacks.on_client_connected(self._websocket)
self._receive_task = self.get_event_loop().create_task(self._receive_messages())
async def stop(self, frame: EndFrame):
await super().stop(frame)
if self._websocket.client_state != WebSocketState.DISCONNECTED:
await self._websocket.close()
async def cancel(self, frame: CancelFrame):
await super().cancel(frame)
if self._websocket.client_state != WebSocketState.DISCONNECTED:
await self._websocket.close()
async def _receive_messages(self):
async for message in self._websocket.iter_text():
frame = self._params.serializer.deserialize(message)

View File

@@ -106,6 +106,7 @@ class WebsocketServerInputTransport(BaseInputTransport):
continue
if isinstance(frame, AudioRawFrame):
logger.info("websocket_server")
await self.push_audio_frame(
InputAudioRawFrame(
audio=frame.audio,

View File

@@ -128,7 +128,11 @@ class DailyCallbacks(BaseModel):
on_error: Callable[[str], Awaitable[None]]
on_app_message: Callable[[Any, str], Awaitable[None]]
on_call_state_updated: Callable[[str], Awaitable[None]]
on_dialin_connected: Callable[[Any], Awaitable[None]]
on_dialin_ready: Callable[[str], Awaitable[None]]
on_dialin_stopped: Callable[[Any], Awaitable[None]]
on_dialin_error: Callable[[Any], Awaitable[None]]
on_dialin_warning: Callable[[Any], Awaitable[None]]
on_dialout_answered: Callable[[Any], Awaitable[None]]
on_dialout_connected: Callable[[Any], Awaitable[None]]
on_dialout_stopped: Callable[[Any], Awaitable[None]]
@@ -536,9 +540,21 @@ class DailyTransportClient(EventHandler):
def on_call_state_updated(self, state: str):
    """Pass ``state`` to the ``on_call_state_updated`` callback via ``_call_async_callback``."""
    callback = self._callbacks.on_call_state_updated
    self._call_async_callback(callback, state)
def on_dialin_connected(self, data: Any):
    """Pass ``data`` to the ``on_dialin_connected`` callback via ``_call_async_callback``."""
    callback = self._callbacks.on_dialin_connected
    self._call_async_callback(callback, data)
def on_dialin_ready(self, sip_endpoint: str):
    """Pass ``sip_endpoint`` to the ``on_dialin_ready`` callback via ``_call_async_callback``."""
    callback = self._callbacks.on_dialin_ready
    self._call_async_callback(callback, sip_endpoint)
def on_dialin_stopped(self, data: Any):
    """Pass ``data`` to the ``on_dialin_stopped`` callback via ``_call_async_callback``."""
    callback = self._callbacks.on_dialin_stopped
    self._call_async_callback(callback, data)
def on_dialin_error(self, data: Any):
    """Pass ``data`` to the ``on_dialin_error`` callback via ``_call_async_callback``."""
    callback = self._callbacks.on_dialin_error
    self._call_async_callback(callback, data)
def on_dialin_warning(self, data: Any):
    """Pass ``data`` to the ``on_dialin_warning`` callback via ``_call_async_callback``."""
    callback = self._callbacks.on_dialin_warning
    self._call_async_callback(callback, data)
def on_dialout_answered(self, data: Any):
    """Pass ``data`` to the ``on_dialout_answered`` callback via ``_call_async_callback``."""
    callback = self._callbacks.on_dialout_answered
    self._call_async_callback(callback, data)
@@ -822,7 +838,11 @@ class DailyTransport(BaseTransport):
on_error=self._on_error,
on_app_message=self._on_app_message,
on_call_state_updated=self._on_call_state_updated,
on_dialin_connected=self._on_dialin_connected,
on_dialin_ready=self._on_dialin_ready,
on_dialin_stopped=self._on_dialin_stopped,
on_dialin_error=self._on_dialin_error,
on_dialin_warning=self._on_dialin_warning,
on_dialout_answered=self._on_dialout_answered,
on_dialout_connected=self._on_dialout_connected,
on_dialout_stopped=self._on_dialout_stopped,
@@ -851,7 +871,11 @@ class DailyTransport(BaseTransport):
self._register_event_handler("on_left")
self._register_event_handler("on_app_message")
self._register_event_handler("on_call_state_updated")
self._register_event_handler("on_dialin_connected")
self._register_event_handler("on_dialin_ready")
self._register_event_handler("on_dialin_stopped")
self._register_event_handler("on_dialin_error")
self._register_event_handler("on_dialin_warning")
self._register_event_handler("on_dialout_answered")
self._register_event_handler("on_dialout_connected")
self._register_event_handler("on_dialout_stopped")
@@ -987,11 +1011,23 @@ class DailyTransport(BaseTransport):
except Exception as e:
logger.exception(f"Error handling dialin-ready event ({url}): {e}")
async def _on_dialin_connected(self, data):
    """Await ``_call_event_handler`` for the "on_dialin_connected" event with ``data``."""
    event_name = "on_dialin_connected"
    await self._call_event_handler(event_name, data)
async def _on_dialin_ready(self, sip_endpoint):
    """Handle a dial-in ready notification for ``sip_endpoint``.

    When ``self._params.dialin_settings`` is truthy, ``_handle_dialin_ready``
    is awaited first; the "on_dialin_ready" event handler is then invoked
    in every case.
    """
    dialin_settings = self._params.dialin_settings
    if dialin_settings:
        await self._handle_dialin_ready(sip_endpoint)

    event_name = "on_dialin_ready"
    await self._call_event_handler(event_name, sip_endpoint)
async def _on_dialin_stopped(self, data):
    """Await ``_call_event_handler`` for the "on_dialin_stopped" event with ``data``."""
    event_name = "on_dialin_stopped"
    await self._call_event_handler(event_name, data)
async def _on_dialin_error(self, data):
    """Await ``_call_event_handler`` for the "on_dialin_error" event with ``data``."""
    event_name = "on_dialin_error"
    await self._call_event_handler(event_name, data)
async def _on_dialin_warning(self, data):
    """Await ``_call_event_handler`` for the "on_dialin_warning" event with ``data``."""
    event_name = "on_dialin_warning"
    await self._call_event_handler(event_name, data)
async def _on_dialout_answered(self, data):
    """Await ``_call_event_handler`` for the "on_dialout_answered" event with ``data``."""
    event_name = "on_dialout_answered"
    await self._call_event_handler(event_name, data)