Compare commits
284 Commits
hush/aggre
...
mb/cli
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4d02d886f3 | ||
|
|
42289eb30d | ||
|
|
a0c93ab6de | ||
|
|
4bec566bbf | ||
|
|
ec3cd24182 | ||
|
|
e36e64c2e8 | ||
|
|
02a88022dd | ||
|
|
6cae61f2cc | ||
|
|
3b40079120 | ||
|
|
ff0b38859b | ||
|
|
4d499324d1 | ||
|
|
f13e006db2 | ||
|
|
87d9e8c9cd | ||
|
|
4820f1c059 | ||
|
|
860c39d1b1 | ||
|
|
ae5c5ed7f6 | ||
|
|
7aa01c1ca8 | ||
|
|
4d6356748f | ||
|
|
5b1a182421 | ||
|
|
6ac0c34413 | ||
|
|
c115422dbf | ||
|
|
a2a973be27 | ||
|
|
0407744950 | ||
|
|
7ce370ccc6 | ||
|
|
a4867f61aa | ||
|
|
a67a765783 | ||
|
|
81221668b1 | ||
|
|
cc9c264940 | ||
|
|
f2c61ac9fd | ||
|
|
88f8c10f63 | ||
|
|
855f4842dd | ||
|
|
2bf44fe2af | ||
|
|
3e8a7cc254 | ||
|
|
a600c05570 | ||
|
|
3ba6b55659 | ||
|
|
d5f2dcfac0 | ||
|
|
d12134038b | ||
|
|
a22af3a7e0 | ||
|
|
76e07c6c48 | ||
|
|
8d8503bca7 | ||
|
|
a444097060 | ||
|
|
1b9e96c016 | ||
|
|
7967bc53c3 | ||
|
|
6381335346 | ||
|
|
0fd5d26104 | ||
|
|
41f817bf04 | ||
|
|
27115e6565 | ||
|
|
3c4807d7d4 | ||
|
|
8902f1dc94 | ||
|
|
a25333ee51 | ||
|
|
82c7d7ad83 | ||
|
|
ba2ab51ef7 | ||
|
|
22557fa668 | ||
|
|
3fbf59e7c6 | ||
|
|
129ab5ea0e | ||
|
|
dc917523d0 | ||
|
|
5ea7cc9d32 | ||
|
|
e11ede475b | ||
|
|
90d29e04af | ||
|
|
4c67136a8d | ||
|
|
9d78402a33 | ||
|
|
73877218e9 | ||
|
|
6a1be90cbb | ||
|
|
fbac959ecb | ||
|
|
18dd85431c | ||
|
|
abc569b3d2 | ||
|
|
fa5d4ecf86 | ||
|
|
83b0dc39f7 | ||
|
|
0c31b5ef19 | ||
|
|
d16c36c56d | ||
|
|
8fe3bcd484 | ||
|
|
be2858bfbb | ||
|
|
b6b0997553 | ||
|
|
3b751322d3 | ||
|
|
fce6f55ddb | ||
|
|
d9580f72a9 | ||
|
|
cc66ac14f1 | ||
|
|
9ddec0f8b4 | ||
|
|
9babfe9fd9 | ||
|
|
21d8d148b8 | ||
|
|
0588c82bbf | ||
|
|
16e9093d5a | ||
|
|
91a5d580fd | ||
|
|
0473556992 | ||
|
|
fdaa4e476e | ||
|
|
502e7e42a7 | ||
|
|
2ab3d4fb42 | ||
|
|
55014bdd77 | ||
|
|
334796bd65 | ||
|
|
1c25b6fb72 | ||
|
|
91b29de7ca | ||
|
|
21d610cd30 | ||
|
|
f7fe673ad1 | ||
|
|
4b415721e2 | ||
|
|
8d2a98e0e7 | ||
|
|
523e890c8c | ||
|
|
3c748fe772 | ||
|
|
d293cee372 | ||
|
|
8b62a96878 | ||
|
|
0c102ce70b | ||
|
|
3894d2a4b9 | ||
|
|
1f6b61c0db | ||
|
|
8ee28b37cd | ||
|
|
e85e7e4d84 | ||
|
|
1b3afb5511 | ||
|
|
7cec013666 | ||
|
|
86127167fb | ||
|
|
9935a68018 | ||
|
|
5679dde70f | ||
|
|
d81b0f6368 | ||
|
|
9698b008da | ||
|
|
7b05c9283b | ||
|
|
303dd2ec35 | ||
|
|
aa6e81648a | ||
|
|
1a87870ef3 | ||
|
|
aac4ce2d12 | ||
|
|
2a79b2c853 | ||
|
|
15bf5b1533 | ||
|
|
cdc86db8ce | ||
|
|
9d2ad750b5 | ||
|
|
19ceb1a48f | ||
|
|
59217eae38 | ||
|
|
bea0aee835 | ||
|
|
aeace9b9be | ||
|
|
2994640f47 | ||
|
|
10069719e4 | ||
|
|
046b76df60 | ||
|
|
f2d9063984 | ||
|
|
7c1e2793c5 | ||
|
|
99f008e927 | ||
|
|
2699f0c2a6 | ||
|
|
0b6dd98000 | ||
|
|
a14fb20d15 | ||
|
|
728361a6a7 | ||
|
|
106db69e8e | ||
|
|
cf90071926 | ||
|
|
deaeb75a1f | ||
|
|
a666327d70 | ||
|
|
13a0522546 | ||
|
|
7da37a0d1f | ||
|
|
7efb22a323 | ||
|
|
8084e2f909 | ||
|
|
86127c6a6e | ||
|
|
402e019ae2 | ||
|
|
f09e4e238b | ||
|
|
2921162b3b | ||
|
|
ac1582c906 | ||
|
|
e4b01a5844 | ||
|
|
fa663abbbc | ||
|
|
d19e6111c3 | ||
|
|
8a6d504a7e | ||
|
|
43915937f2 | ||
|
|
48e92a22fe | ||
|
|
566af6b0b8 | ||
|
|
12e7613d5f | ||
|
|
04a68f2c57 | ||
|
|
9b4ca12f49 | ||
|
|
453ce715a6 | ||
|
|
d87b6189ba | ||
|
|
8293347b77 | ||
|
|
c85a3f0b94 | ||
|
|
233fb25e6c | ||
|
|
080978daa6 | ||
|
|
62b7c3d3b2 | ||
|
|
4b2379cba8 | ||
|
|
92087bdfa8 | ||
|
|
617919ac09 | ||
|
|
0669daec3d | ||
|
|
7c15a8c800 | ||
|
|
066b77fba0 | ||
|
|
d9aef5f916 | ||
|
|
91ae3f8a9b | ||
|
|
36da623352 | ||
|
|
31b9087ea6 | ||
|
|
1851fed22e | ||
|
|
eddce460da | ||
|
|
da4f30cb6d | ||
|
|
250cf2d8f1 | ||
|
|
7bbdb4f991 | ||
|
|
051c4782fb | ||
|
|
b1ccec74b2 | ||
|
|
92bf0d9eda | ||
|
|
f985550441 | ||
|
|
de8ee96927 | ||
|
|
2576d0f340 | ||
|
|
f38f4711ac | ||
|
|
c2f3ddd329 | ||
|
|
73ffe96228 | ||
|
|
bd13a80da7 | ||
|
|
312959f97e | ||
|
|
fe168e3c68 | ||
|
|
28929a47f7 | ||
|
|
03f5defbc3 | ||
|
|
b216648315 | ||
|
|
084b133a01 | ||
|
|
e589876176 | ||
|
|
a826313bf9 | ||
|
|
49f44aa7c8 | ||
|
|
64ceef9cf0 | ||
|
|
cd6567c1f1 | ||
|
|
ac67ca1555 | ||
|
|
8d38994756 | ||
|
|
607e3040d4 | ||
|
|
60604a9449 | ||
|
|
4abe4a6253 | ||
|
|
4c054af17b | ||
|
|
dcba940d42 | ||
|
|
ad2adb0c58 | ||
|
|
76923010b5 | ||
|
|
1b511557b2 | ||
|
|
fdadb12933 | ||
|
|
f1bbb7ba22 | ||
|
|
c1492c5275 | ||
|
|
4ffdabcfde | ||
|
|
b489de2fc3 | ||
|
|
d9656cbb1a | ||
|
|
05fb223985 | ||
|
|
62a5f07ad2 | ||
|
|
b669e3a481 | ||
|
|
99f1041a47 | ||
|
|
37b1345bfa | ||
|
|
8994ac17eb | ||
|
|
63bc825008 | ||
|
|
e7ffde1c4c | ||
|
|
1c88565725 | ||
|
|
07a6c2fb0e | ||
|
|
e99f3bf75a | ||
|
|
f09d780413 | ||
|
|
e370d23374 | ||
|
|
b68ec14146 | ||
|
|
c567fd71b1 | ||
|
|
2ca1b2d6f8 | ||
|
|
04041a9a9a | ||
|
|
6c498dc70f | ||
|
|
32b07c1720 | ||
|
|
ad507ce23d | ||
|
|
be562cedfc | ||
|
|
089e703e1f | ||
|
|
4dc1e15a99 | ||
|
|
c7dc2e886f | ||
|
|
11bc4ea854 | ||
|
|
029d76033d | ||
|
|
924d7dea9a | ||
|
|
244e94f3ce | ||
|
|
af1f51d49e | ||
|
|
9ba3c168b8 | ||
|
|
e6ee8f7a16 | ||
|
|
2ea2bd99e0 | ||
|
|
0c2ced7c52 | ||
|
|
fb160646b8 | ||
|
|
89fed57af2 | ||
|
|
feae3b6d2d | ||
|
|
92d3be8975 | ||
|
|
0f53e1db2c | ||
|
|
d398e8cc10 | ||
|
|
e5f263d380 | ||
|
|
3a4c303c54 | ||
|
|
54a1ef47d0 | ||
|
|
149ffa4f3c | ||
|
|
e5465034d9 | ||
|
|
568c7c782d | ||
|
|
9851334221 | ||
|
|
e79c4fc99d | ||
|
|
55c321f4ff | ||
|
|
a14a53a005 | ||
|
|
a71f937e8f | ||
|
|
032032df65 | ||
|
|
d0178edad0 | ||
|
|
795c5e55d9 | ||
|
|
8f8d8ae0d8 | ||
|
|
741f192d04 | ||
|
|
a5595b82ea | ||
|
|
4d1915eb41 | ||
|
|
b3a84fc772 | ||
|
|
403d22e62c | ||
|
|
ee00ee5c57 | ||
|
|
f53fd880dc | ||
|
|
de3461e4cc | ||
|
|
7bafc3a1bb | ||
|
|
22ef61fe8d | ||
|
|
7078fb53bd | ||
|
|
33447ad6f2 | ||
|
|
6faa50ae5b | ||
|
|
889dc19a27 |
13
.github/workflows/build.yaml
vendored
13
.github/workflows/build.yaml
vendored
@@ -21,20 +21,21 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
|
with:
|
||||||
|
fetch-depth: 0 # Fetch all history for setuptools_scm
|
||||||
- name: Install uv
|
- name: Install uv
|
||||||
uses: astral-sh/setup-uv@v3
|
uses: astral-sh/setup-uv@v3
|
||||||
with:
|
with:
|
||||||
version: "latest"
|
version: "latest"
|
||||||
|
|
||||||
- name: Set up Python
|
- name: Set up Python
|
||||||
run: uv python install 3.10
|
run: uv python install 3.10
|
||||||
|
|
||||||
- name: Install development dependencies
|
- name: Install development dependencies
|
||||||
run: uv sync --group dev
|
run: uv sync --group dev
|
||||||
|
|
||||||
- name: Build project
|
- name: Build project
|
||||||
run: uv build
|
run: uv build
|
||||||
|
|
||||||
- name: Install project in editable mode
|
- name: Install project in editable mode
|
||||||
run: uv pip install --editable .
|
run: uv pip install --editable .
|
||||||
|
|||||||
22
.github/workflows/publish.yaml
vendored
22
.github/workflows/publish.yaml
vendored
@@ -5,25 +5,25 @@ on:
|
|||||||
inputs:
|
inputs:
|
||||||
gitref:
|
gitref:
|
||||||
type: string
|
type: string
|
||||||
description: "what git tag to build (e.g. v0.0.74)"
|
description: 'what git tag to build (e.g. v0.0.74)'
|
||||||
required: true
|
required: true
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
build:
|
build:
|
||||||
name: "Build and upload wheels"
|
name: 'Build and upload wheels'
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout repo
|
- name: Checkout repo
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
ref: ${{ github.event.inputs.gitref }}
|
ref: ${{ github.event.inputs.gitref }}
|
||||||
|
|
||||||
- name: Install uv
|
- name: Install uv
|
||||||
uses: astral-sh/setup-uv@v3
|
uses: astral-sh/setup-uv@v3
|
||||||
with:
|
with:
|
||||||
version: "latest"
|
version: 'latest'
|
||||||
- name: Set up Python
|
- name: Set up Python
|
||||||
run: uv python install 3.10
|
run: uv python install 3.12
|
||||||
- name: Install development dependencies
|
- name: Install development dependencies
|
||||||
run: uv sync --group dev
|
run: uv sync --group dev
|
||||||
- name: Build project
|
- name: Build project
|
||||||
@@ -35,9 +35,9 @@ jobs:
|
|||||||
path: ./dist
|
path: ./dist
|
||||||
|
|
||||||
publish-to-pypi:
|
publish-to-pypi:
|
||||||
name: "Publish to PyPI"
|
name: 'Publish to PyPI'
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
needs: [ build ]
|
needs: [build]
|
||||||
environment:
|
environment:
|
||||||
name: pypi
|
name: pypi
|
||||||
url: https://pypi.org/p/pipecat-ai
|
url: https://pypi.org/p/pipecat-ai
|
||||||
@@ -56,12 +56,12 @@ jobs:
|
|||||||
print-hash: true
|
print-hash: true
|
||||||
|
|
||||||
publish-to-test-pypi:
|
publish-to-test-pypi:
|
||||||
name: "Publish to Test PyPI"
|
name: 'Publish to Test PyPI'
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
needs: [ build ]
|
needs: [build]
|
||||||
environment:
|
environment:
|
||||||
name: testpypi
|
name: testpypi
|
||||||
url: https://pypi.org/p/pipecat-ai
|
url: https://test.pypi.org/p/pipecat-ai
|
||||||
permissions:
|
permissions:
|
||||||
id-token: write
|
id-token: write
|
||||||
steps:
|
steps:
|
||||||
@@ -70,7 +70,7 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
name: wheels
|
name: wheels
|
||||||
path: ./dist
|
path: ./dist
|
||||||
- name: Publish to PyPI
|
- name: Publish to Test PyPI
|
||||||
uses: pypa/gh-action-pypi-publish@release/v1
|
uses: pypa/gh-action-pypi-publish@release/v1
|
||||||
with:
|
with:
|
||||||
verbose: true
|
verbose: true
|
||||||
|
|||||||
12
.github/workflows/publish_test.yaml
vendored
12
.github/workflows/publish_test.yaml
vendored
@@ -4,7 +4,7 @@ on: workflow_dispatch
|
|||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
build:
|
build:
|
||||||
name: "Build and upload wheels"
|
name: 'Build and upload wheels'
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout repo
|
- name: Checkout repo
|
||||||
@@ -15,9 +15,9 @@ jobs:
|
|||||||
- name: Install uv
|
- name: Install uv
|
||||||
uses: astral-sh/setup-uv@v3
|
uses: astral-sh/setup-uv@v3
|
||||||
with:
|
with:
|
||||||
version: "latest"
|
version: 'latest'
|
||||||
- name: Set up Python
|
- name: Set up Python
|
||||||
run: uv python install 3.10
|
run: uv python install 3.12
|
||||||
- name: Install development dependencies
|
- name: Install development dependencies
|
||||||
run: uv sync --group dev
|
run: uv sync --group dev
|
||||||
- name: Build project
|
- name: Build project
|
||||||
@@ -29,12 +29,12 @@ jobs:
|
|||||||
path: ./dist
|
path: ./dist
|
||||||
|
|
||||||
publish-to-test-pypi:
|
publish-to-test-pypi:
|
||||||
name: "Publish to Test PyPI"
|
name: 'Publish to Test PyPI'
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
needs: [build]
|
needs: [build]
|
||||||
environment:
|
environment:
|
||||||
name: testpypi
|
name: testpypi
|
||||||
url: https://pypi.org/p/pipecat-ai
|
url: https://test.pypi.org/p/pipecat-ai
|
||||||
permissions:
|
permissions:
|
||||||
id-token: write
|
id-token: write
|
||||||
steps:
|
steps:
|
||||||
@@ -43,7 +43,7 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
name: wheels
|
name: wheels
|
||||||
path: ./dist
|
path: ./dist
|
||||||
- name: Publish to PyPI
|
- name: Publish to Test PyPI
|
||||||
uses: pypa/gh-action-pypi-publish@release/v1
|
uses: pypa/gh-action-pypi-publish@release/v1
|
||||||
with:
|
with:
|
||||||
verbose: true
|
verbose: true
|
||||||
|
|||||||
314
CHANGELOG.md
314
CHANGELOG.md
@@ -5,6 +5,310 @@ All notable changes to **Pipecat** will be documented in this file.
|
|||||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||||
|
|
||||||
|
## [Unreleased]
|
||||||
|
|
||||||
|
### Added
|
||||||
|
|
||||||
|
- Added the [Pipecat CLI](https://github.com/pipecat-ai/pipecat-cli) to the
|
||||||
|
required dependencies, enabling you to scaffold a new project directly from
|
||||||
|
`pipecat-ai`. Get started with:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
uv run pipecat init
|
||||||
|
```
|
||||||
|
|
||||||
|
- Expanded support for universal `LLMContext` to `AWSNovaSonicLLMService`.
|
||||||
|
As a reminder, the context-setup pattern when using `LLMContext` is:
|
||||||
|
|
||||||
|
```python
|
||||||
|
context = LLMContext(messages, tools)
|
||||||
|
context_aggregator = LLMContextAggregatorPair(context)
|
||||||
|
```
|
||||||
|
|
||||||
|
(Note that even though `AWSNovaSonicLLMService` now supports the universal
|
||||||
|
`LLMContext`, it is not meant to be swapped out for another LLM service at
|
||||||
|
runtime.)
|
||||||
|
|
||||||
|
Worth noting: whether or not you use the new context-setup pattern with
|
||||||
|
`AWSNovaSonicLLMService`, some types have changed under the hood:
|
||||||
|
|
||||||
|
```python
|
||||||
|
## BEFORE:
|
||||||
|
|
||||||
|
# Context aggregator type
|
||||||
|
context_aggregator: AWSNovaSonicContextAggregatorPair
|
||||||
|
|
||||||
|
# Context frame type
|
||||||
|
frame: OpenAILLMContextFrame
|
||||||
|
|
||||||
|
# Context type
|
||||||
|
context: AWSNovaSonicLLMContext
|
||||||
|
# or
|
||||||
|
context: OpenAILLMContext
|
||||||
|
|
||||||
|
# Reading messages from context
|
||||||
|
messages = context.messages
|
||||||
|
|
||||||
|
## AFTER:
|
||||||
|
|
||||||
|
# Context aggregator type
|
||||||
|
context_aggregator: LLMContextAggregatorPair
|
||||||
|
|
||||||
|
# Context frame type
|
||||||
|
frame: LLMContextFrame
|
||||||
|
|
||||||
|
# Context type
|
||||||
|
context: LLMContext
|
||||||
|
|
||||||
|
# Reading messages from context
|
||||||
|
messages = context.get_messages()
|
||||||
|
```
|
||||||
|
|
||||||
|
- Added support for `bulbul:v3` model in `SarvamTTSService` and
|
||||||
|
`SarvamHttpTTSService`.
|
||||||
|
|
||||||
|
- Added `keyterms_prompt` parameter to `AssemblyAIConnectionParams`.
|
||||||
|
|
||||||
|
- Added `speech_model` parameter to `AssemblyAIConnectionParams` to access the
|
||||||
|
multilingual model.
|
||||||
|
|
||||||
|
- Added support for trickle ICE to the `SmallWebRTCTransport`.
|
||||||
|
|
||||||
|
- Added support for updating `OpenAITTSService` settings (`instructions` and
|
||||||
|
`speed`) at runtime via `TTSUpdateSettingsFrame`.
|
||||||
|
|
||||||
|
- Added `--whatsapp` flag to runner to better surface WhatsApp transport logs.
|
||||||
|
|
||||||
|
- Added `on_connected` and `on_disconnected` events to TTS and STT
|
||||||
|
websocket-based services.
|
||||||
|
|
||||||
|
- Added an `aggregate_sentences` arg in `ElevenLabsHttpTTSService`, where the
|
||||||
|
default value is True.
|
||||||
|
|
||||||
|
- Added a `room_properties` arg to the Daily runner's `configure()` method,
|
||||||
|
allowing `DailyRoomProperties` to be provided.
|
||||||
|
|
||||||
|
- The runner `--folder` argument now supports downloading files from
|
||||||
|
subdirectories.
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
|
||||||
|
- `CartesiaSTTService` now inherits from `WebsocketSTTService`.
|
||||||
|
|
||||||
|
- Package upgrades:
|
||||||
|
|
||||||
|
- `daily-python` upgraded to 0.20.0.
|
||||||
|
- `openai` upgraded to support up to 2.x.x.
|
||||||
|
- `openpipe` upgraded to support up to 5.x.x.
|
||||||
|
|
||||||
|
- `SpeechmaticsSTTService` updated dependencies for `speechmatics-rt>=0.5.0`.
|
||||||
|
|
||||||
|
### Deprecated
|
||||||
|
|
||||||
|
- The `send_transcription_frames` argument to `AWSNovaSonicLLMService` is
|
||||||
|
deprecated. Transcription frames are now always sent. They go upstream, to be
|
||||||
|
handled by the user context aggregator. See "Added" section for details.
|
||||||
|
|
||||||
|
- Types in `pipecat.services.aws.nova_sonic.context` have been deprecated due
|
||||||
|
to changes to support `LLMContext`. See "Added" section for details.
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
|
||||||
|
- Fixed an issue in `RivaSegmentedSTTService` where a runtime error occurred due
|
||||||
|
to a mismatch in the `_handle_transcription` method's signature.
|
||||||
|
|
||||||
|
- Fixed multiple pipeline task cancellation issues. `asyncio.CancelledError` is
|
||||||
|
now handled properly in `PipelineTask` making it possible to cancel an asyncio
|
||||||
|
task that is executing a `PipelineRunner` cleanly. Also,
|
||||||
|
`PipelineTask.cancel()` does not block anymore waiting for the `CancelFrame`
|
||||||
|
to reach the end of the pipeline (going back to the behavior in < 0.0.83).
|
||||||
|
|
||||||
|
- Fixed an issue in `ElevenLabsTTSService` and `ElevenLabsHttpTTSService` where
|
||||||
|
the Flash models would split words, resulting in a space being inserted
|
||||||
|
between words.
|
||||||
|
|
||||||
|
- Fixed an issue where audio filters' `stop()` would not be called when using
|
||||||
|
`CancelFrame`.
|
||||||
|
|
||||||
|
- Fixed an issue in `ElevenLabsHttpTTSService`, where
|
||||||
|
`apply_text_normalization` was incorrectly set as a query parameter. It's now
|
||||||
|
being added as a request parameter.
|
||||||
|
|
||||||
|
- Fixed an issue where `RimeHttpTTSService` and `PiperTTSService` could generate
|
||||||
|
audio frames that were not 16-bit aligned, potentially leading to internal
|
||||||
|
errors or static audio.
|
||||||
|
|
||||||
|
- Fixed an issue in `SpeechmaticsSTTService` where `AdditionalVocabEntry` items
|
||||||
|
needed to have `sounds_like` for the session to start.
|
||||||
|
|
||||||
|
### Other
|
||||||
|
|
||||||
|
- Added foundational example `47-sentry-metrics.py`, demonstrating how to use the
|
||||||
|
`SentryMetrics` processor.
|
||||||
|
|
||||||
|
- Added foundational example `14x-function-calling-openpipe.py`.
|
||||||
|
|
||||||
|
## [0.0.90] - 2025-10-10
|
||||||
|
|
||||||
|
### Added
|
||||||
|
|
||||||
|
- Added audio filter `KrispVivaFilter` using the Krisp VIVA SDK.
|
||||||
|
|
||||||
|
- Added `--folder` argument to the runner, allowing files saved in that folder
|
||||||
|
to be downloaded from `http://HOST:PORT/file/FILE`.
|
||||||
|
|
||||||
|
- Added `GeminiLiveVertexLLMService`, for accessing Gemini Live via Google
|
||||||
|
Vertex AI.
|
||||||
|
|
||||||
|
- Added some new configuration options to `GeminiLiveLLMService`:
|
||||||
|
|
||||||
|
- `thinking`
|
||||||
|
- `enable_affective_dialog`
|
||||||
|
- `proactivity`
|
||||||
|
|
||||||
|
Note that these new configuration options require using a newer model than
|
||||||
|
the default, like "gemini-2.5-flash-native-audio-preview-09-2025". The last
|
||||||
|
two require specifying `http_options=HttpOptions(api_version="v1alpha")`.
|
||||||
|
|
||||||
|
- Added `on_pipeline_error` event to `PipelineTask`. This event will get fired
|
||||||
|
when an `ErrorFrame` is pushed (use `FrameProcessor.push_error()`).
|
||||||
|
|
||||||
|
```python
|
||||||
|
@task.event_handler("on_pipeline_error")
|
||||||
|
async def on_pipeline_error(task: PipelineTask, frame: ErrorFrame):
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
- Added a `service_tier` `InputParam` to the `BaseOpenAILLMService`. This
|
||||||
|
parameter can influence the latency of the response. For example `"priority"`
|
||||||
|
will result in faster completions, but in exchange for a higher price.
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
|
||||||
|
- Updated `GeminiLiveLLMService` to use the `google-genai` library rather than
|
||||||
|
use WebSockets directly.
|
||||||
|
|
||||||
|
### Deprecated
|
||||||
|
|
||||||
|
- `LivekitFrameSerializer` is now deprecated. Use `LiveKitTransport` instead.
|
||||||
|
|
||||||
|
- `pipecat.service.openai_realtime` is now deprecated, use
|
||||||
|
`pipecat.services.openai.realtime` instead or
|
||||||
|
`pipecat.services.azure.realtime` for Azure Realtime.
|
||||||
|
|
||||||
|
- `pipecat.service.aws_nova_sonic` is now deprecated, use
|
||||||
|
`pipecat.services.aws.nova_sonic` instead.
|
||||||
|
|
||||||
|
- `GeminiMultimodalLiveLLMService` is now deprecated, use
|
||||||
|
`GeminiLiveLLMService`.
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
|
||||||
|
- Fixed a `GoogleVertexLLMService` issue that would generate an error if no
|
||||||
|
token information was returned.
|
||||||
|
|
||||||
|
- `GeminiLiveLLMService` will now end gracefully (i.e. after the bot has
|
||||||
|
finished) upon receiving an `EndFrame`.
|
||||||
|
|
||||||
|
- `GeminiLiveLLMService` will try to seamlessly reconnect when it loses its
|
||||||
|
connection.
|
||||||
|
|
||||||
|
## [0.0.89] - 2025-10-07
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
|
||||||
|
- Reverted a change introduced in 0.0.88 that was causing pipelines to be frozen
|
||||||
|
when using interruption strategies and processors that block interruption
|
||||||
|
frames (e.g. `STTMuteFilter`).
|
||||||
|
|
||||||
|
## [0.0.88] - 2025-10-07
|
||||||
|
|
||||||
|
### Added
|
||||||
|
|
||||||
|
- Added support for Nano Banana models to `GoogleLLMService`. For example, you
|
||||||
|
can now use the `gemini-2.5-flash-image` model to generate images.
|
||||||
|
|
||||||
|
- Added `HumeTTSService` for text-to-speech synthesis using Hume AI's expressive
|
||||||
|
voice models. Provides high-quality, emotionally expressive speech synthesis
|
||||||
|
with support for various voice models. Includes example in
|
||||||
|
`examples/foundational/07ad-interruptible-hume.py`. Use with:
|
||||||
|
`uv pip install pipecat-ai[hume]`.
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
|
||||||
|
- Updated default `GoogleLLMService` model to `gemini-2.5-flash`.
|
||||||
|
|
||||||
|
### Deprecated
|
||||||
|
|
||||||
|
- PlayHT is shutting down their API on December 31st, 2025. As a result,
|
||||||
|
`PlayHTTTSService` and `PlayHTHttpTTSService` are deprecated and will be
|
||||||
|
removed in a future version.
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
|
||||||
|
- Fixed an issue with `AWSNovaSonicLLMService` where the client wouldn't
|
||||||
|
connect due to a breaking change in the AWS dependency chain.
|
||||||
|
|
||||||
|
- `PermissionError` is now caught if NLTK's `punkt_tab` can't be downloaded.
|
||||||
|
|
||||||
|
- Fixed an issue that would cause wrong user/assistant context ordering when
|
||||||
|
using interruption strategies.
|
||||||
|
|
||||||
|
- Fixed RTVI incoming message handling, broken in 0.0.87.
|
||||||
|
|
||||||
|
## [0.0.87] - 2025-10-02
|
||||||
|
|
||||||
|
### Added
|
||||||
|
|
||||||
|
- Added `WebsocketSTTService` base class for websocket-based STT services.
|
||||||
|
Combines STT functionality with websocket connectivity, providing automatic
|
||||||
|
error handling and reconnection capabilities with exponential backoff.
|
||||||
|
|
||||||
|
- Added `DeepgramFluxSTTService` for real-time speech recognition using
|
||||||
|
Deepgram's Flux WebSocket API. Flux understands conversational flow and
|
||||||
|
automatically handles turn-taking.
|
||||||
|
|
||||||
|
- Added RTVI messages for user/bot audio levels and system logs.
|
||||||
|
|
||||||
|
- Added cached tokens from OpenAI-based LLM services to `MetricsFrame`.
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
|
||||||
|
- Updated the default model for `AnthropicLLMService` to
|
||||||
|
`claude-sonnet-4-5-20250929`.
|
||||||
|
|
||||||
|
### Deprecated
|
||||||
|
|
||||||
|
- `DailyTransportMessageFrame` and `DailyTransportMessageUrgentFrame` are
|
||||||
|
deprecated, use `DailyOutputTransportMessageFrame` and
|
||||||
|
`DailyOutputTransportMessageUrgentFrame` respectively instead.
|
||||||
|
|
||||||
|
- `LiveKitTransportMessageFrame` and `LiveKitTransportMessageUrgentFrame` are
|
||||||
|
deprecated, use `LiveKitOutputTransportMessageFrame` and
|
||||||
|
`LiveKitOutputTransportMessageUrgentFrame` respectively instead.
|
||||||
|
|
||||||
|
- `TransportMessageFrame` and `TransportMessageUrgentFrame` are deprecated, use
|
||||||
|
`OutputTransportMessageFrame` and `OutputTransportMessageUrgentFrame`
|
||||||
|
respectively instead.
|
||||||
|
|
||||||
|
- `InputTransportMessageUrgentFrame` is deprecated, use
|
||||||
|
`InputTransportMessageFrame` instead.
|
||||||
|
|
||||||
|
- `DailyUpdateRemoteParticipantsFrame` is deprecated and will be removed in a
|
||||||
|
future version. Instead, create your own custom frame and handle it in the
|
||||||
|
`@transport.output().event_handler("on_after_push_frame")` event handler or a
|
||||||
|
custom processor.
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
|
||||||
|
- Fixed an issue in `AWSBedrockLLMService` where timeout exceptions weren't
|
||||||
|
being detected.
|
||||||
|
|
||||||
|
- Fixed a `PipelineTask` issue that could prevent the application from exiting if
|
||||||
|
`task.cancel()` was called when the task was already finished.
|
||||||
|
|
||||||
|
- Fixed an issue where local SmartTurn was not being run in a separate thread.
|
||||||
|
|
||||||
## [0.0.86] - 2025-09-24
|
## [0.0.86] - 2025-09-24
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
@@ -1326,7 +1630,7 @@ quality and critical bugs impacting `ParallelPipelines` functionality.**
|
|||||||
- Added `session_token` parameter to `AWSNovaSonicLLMService`.
|
- Added `session_token` parameter to `AWSNovaSonicLLMService`.
|
||||||
|
|
||||||
- Added Gemini Multimodal Live File API for uploading, fetching, listing, and
|
- Added Gemini Multimodal Live File API for uploading, fetching, listing, and
|
||||||
deleting files. See `26f-gemini-multimodal-live-files-api.py` for example usage.
|
deleting files. See `26f-gemini-live-files-api.py` for example usage.
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
|
|
||||||
@@ -3332,7 +3636,7 @@ stt = DeepgramSTTService(..., live_options=LiveOptions(model="nova-2-general"))
|
|||||||
- Added the new modalities option and helper function to set Gemini output
|
- Added the new modalities option and helper function to set Gemini output
|
||||||
modalities.
|
modalities.
|
||||||
|
|
||||||
- Added `examples/foundational/26d-gemini-multimodal-live-text.py` which is
|
- Added `examples/foundational/26d-gemini-live-text.py` which is
|
||||||
using Gemini as TEXT modality and using another TTS provider for TTS process.
|
using Gemini as TEXT modality and using another TTS provider for TTS process.
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
@@ -3519,9 +3823,9 @@ stt = DeepgramSTTService(..., live_options=LiveOptions(model="nova-2-general"))
|
|||||||
- Added new foundational examples for `GeminiMultimodalLiveLLMService`:
|
- Added new foundational examples for `GeminiMultimodalLiveLLMService`:
|
||||||
|
|
||||||
- `26-gemini-multimodal-live.py`
|
- `26-gemini-multimodal-live.py`
|
||||||
- `26a-gemini-multimodal-live-transcription.py`
|
- `26a-gemini-live-transcription.py`
|
||||||
- `26b-gemini-multimodal-live-video.py`
|
- `26b-gemini-live-video.py`
|
||||||
- `26c-gemini-multimodal-live-video.py`
|
- `26c-gemini-live-video.py`
|
||||||
|
|
||||||
- Added `SimliVideoService`. This is an integration for Simli AI avatars.
|
- Added `SimliVideoService`. This is an integration for Simli AI avatars.
|
||||||
(see https://www.simli.com)
|
(see https://www.simli.com)
|
||||||
|
|||||||
336
COMMUNITY_INTEGRATIONS.md
Normal file
336
COMMUNITY_INTEGRATIONS.md
Normal file
@@ -0,0 +1,336 @@
|
|||||||
|
# Community Integrations Guide
|
||||||
|
|
||||||
|
Pipecat welcomes community-maintained integrations! As our ecosystem grows, we've established a process for any developer to create and maintain their own service integrations while ensuring discoverability for the Pipecat community.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
**What we support:** Community-maintained integrations that live in separate repositories and are maintained by their authors.
|
||||||
|
|
||||||
|
**What we don't do:** The Pipecat team does not code review, test, or maintain community integrations. We provide guidance and list approved integrations for discoverability.
|
||||||
|
|
||||||
|
**Why this approach:** This allows the community to move quickly while keeping the Pipecat core team focused on maintaining the framework itself.
|
||||||
|
|
||||||
|
## Submitting your Integration
|
||||||
|
|
||||||
|
To be listed as an official community integration, follow these steps:
|
||||||
|
|
||||||
|
### Step 1: Build Your Integration
|
||||||
|
|
||||||
|
Create your integration following the patterns and examples shown in the "Integration Patterns and Examples" section below.
|
||||||
|
|
||||||
|
### Step 2: Set Up Your Repository
|
||||||
|
|
||||||
|
Your repository must contain these components:
|
||||||
|
|
||||||
|
- **Source code** - Complete implementation following Pipecat patterns
|
||||||
|
- **Foundational example** - Single file example showing basic usage (see [Pipecat examples](https://github.com/pipecat-ai/pipecat/tree/main/examples/foundational))
|
||||||
|
- **README.md** - Must include:
|
||||||
|
|
||||||
|
- Introduction and explanation of your integration
|
||||||
|
- Installation instructions
|
||||||
|
- Usage instructions with Pipecat Pipeline
|
||||||
|
- How to run your example
|
||||||
|
- Pipecat version compatibility (e.g., "Tested with Pipecat v0.0.86")
|
||||||
|
- Company attribution: If you work for the company providing the service, please mention this in your README. This helps build confidence that the integration will be actively maintained.
|
||||||
|
|
||||||
|
- **LICENSE** - Permissive license (BSD-2 like Pipecat, or equivalent open source terms)
|
||||||
|
- **Code documentation** - Source code with docstrings (we recommend following [Pipecat's docstring conventions](https://github.com/pipecat-ai/pipecat/blob/main/CONTRIBUTING.md#docstring-conventions))
|
||||||
|
- **Changelog** - Maintain a changelog for version updates
|
||||||
|
|
||||||
|
### Step 3: Join Discord
|
||||||
|
|
||||||
|
Join our Discord: https://discord.gg/pipecat
|
||||||
|
|
||||||
|
### Step 4: Submit for Listing
|
||||||
|
|
||||||
|
Submit a pull request to add your integration to our [Community Integrations documentation page](https://docs.pipecat.ai/server/services/community-integrations).
|
||||||
|
|
||||||
|
**To submit:**
|
||||||
|
|
||||||
|
1. Fork the [Pipecat docs repository](https://github.com/pipecat-ai/docs)
|
||||||
|
2. Edit the file `server/services/community-integrations.mdx`
|
||||||
|
3. Add your integration to the appropriate service category table with:
|
||||||
|
- Service name
|
||||||
|
- Link to your repository
|
||||||
|
- Maintainer GitHub username(s)
|
||||||
|
4. Include a link to your demo video (approx 30-60 seconds) in your PR description showing:
|
||||||
|
- Core functionality of your integration
|
||||||
|
- Handling of an interruption (if applicable to service type)
|
||||||
|
5. Submit your pull request
|
||||||
|
|
||||||
|
Once your PR is submitted, post in the `#community-integrations` Discord channel to let us know.
|
||||||
|
|
||||||
|
## Integration Patterns and Examples
|
||||||
|
|
||||||
|
### STT (Speech-to-Text) Services
|
||||||
|
|
||||||
|
#### Websocket-based Services
|
||||||
|
|
||||||
|
**Base class:** `STTService`
|
||||||
|
|
||||||
|
**Examples:**
|
||||||
|
|
||||||
|
- [DeepgramSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/deepgram/stt.py)
|
||||||
|
- [SpeechmaticsSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/speechmatics/stt.py)
|
||||||
|
|
||||||
|
#### File-based Services
|
||||||
|
|
||||||
|
**Base class:** `SegmentedSTTService`
|
||||||
|
|
||||||
|
**Examples:**
|
||||||
|
|
||||||
|
- [RivaSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/riva/stt.py)
|
||||||
|
- [FalSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/fal/stt.py)
|
||||||
|
|
||||||
|
#### Key requirements:
|
||||||
|
|
||||||
|
- STT services should push `InterimTranscriptionFrame`s and `TranscriptionFrame`s
|
||||||
|
- If confidence values are available, filter for values >50% confidence
|
||||||
|
|
||||||
|
### LLM (Large Language Model) Services
|
||||||
|
|
||||||
|
#### OpenAI-Compatible Services
|
||||||
|
|
||||||
|
**Base class:** `OpenAILLMService`
|
||||||
|
|
||||||
|
**Examples:**
|
||||||
|
|
||||||
|
- [AzureLLMService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/azure/llm.py)
|
||||||
|
- [GrokLLMService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/grok/llm.py) - Shows overriding the base class where needed
|
||||||
|
|
||||||
|
#### Non-OpenAI Compatible Services
|
||||||
|
|
||||||
|
**Requires:** Full implementation
|
||||||
|
|
||||||
|
**Examples:**
|
||||||
|
|
||||||
|
- [AnthropicLLMService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/anthropic/llm.py)
|
||||||
|
- [GoogleLLMService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/google/llm.py)
|
||||||
|
|
||||||
|
#### Key requirements:
|
||||||
|
|
||||||
|
- **Frame sequence:** Output must follow this frame sequence pattern:
|
||||||
|
|
||||||
|
- `LLMFullResponseStartFrame` - Signals the start of an LLM response
|
||||||
|
- `LLMTextFrame` - Contains LLM content, typically streamed as tokens
|
||||||
|
- `LLMFullResponseEndFrame` - Signals the end of an LLM response
|
||||||
|
|
||||||
|
- **Context aggregation:** Implement context aggregation to collect user and assistant content:
|
||||||
|
- Aggregators come in pairs with a `user()` instance and `assistant()` instance
|
||||||
|
- Context must adhere to the `LLMContext` universal format
|
||||||
|
- Aggregators should handle adding messages, function calls, and images to the context
|
||||||
|
|
||||||
|
### TTS (Text-to-Speech) Services
|
||||||
|
|
||||||
|
#### AudioContextWordTTSService
|
||||||
|
|
||||||
|
**Use for:** Websocket-based services supporting word/timestamp alignment
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
|
||||||
|
- [CartesiaTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/cartesia/tts.py)
|
||||||
|
|
||||||
|
#### InterruptibleTTSService
|
||||||
|
|
||||||
|
**Use for:** Websocket-based services without word/timestamp alignment, requiring disconnection on interruption
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
|
||||||
|
- [SarvamTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/sarvam/tts.py)
|
||||||
|
|
||||||
|
#### WordTTSService
|
||||||
|
|
||||||
|
**Use for:** HTTP-based services supporting word/timestamp alignment
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
|
||||||
|
- [ElevenLabsHttpTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/elevenlabs/tts.py)
|
||||||
|
|
||||||
|
#### TTSService
|
||||||
|
|
||||||
|
**Use for:** HTTP-based services without word/timestamp alignment
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
|
||||||
|
- [GoogleHttpTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/google/tts.py)
|
||||||
|
|
||||||
|
#### Key requirements:
|
||||||
|
|
||||||
|
- For websocket services, use the asyncio WebSocket implementation (required for `websockets` v13+ support)
|
||||||
|
- Handle idle service timeouts with keepalives
|
||||||
|
- TTS services push both audio (`TTSAudioRawFrame`) and text (`TTSTextFrame`) frames
|
||||||
|
|
||||||
|
### Telephony Serializers
|
||||||
|
|
||||||
|
Pipecat supports telephony provider integration using websocket connections to exchange MediaStreams. These services use a FrameSerializer to serialize and deserialize inputs from the FastAPIWebsocketTransport.
|
||||||
|
|
||||||
|
**Examples:**
|
||||||
|
|
||||||
|
- [Twilio](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/serializers/twilio.py)
|
||||||
|
- [Telnyx](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/serializers/telnyx.py)
|
||||||
|
|
||||||
|
#### Key requirements:
|
||||||
|
|
||||||
|
- Include hang-up functionality using the provider's native API, ideally using `aiohttp`
|
||||||
|
- Support DTMF (dual-tone multi-frequency) events if the provider supports them:
|
||||||
|
- Deserialize DTMF events from the provider's protocol to `InputDTMFFrame`
|
||||||
|
- Use `KeypadEntry` enum for valid keypad entries (0-9, \*, #, A-D)
|
||||||
|
- Handle invalid DTMF digits gracefully by returning `None`
|
||||||
|
|
||||||
|
### Image Generation Services
|
||||||
|
|
||||||
|
**Base class:** `ImageGenService`
|
||||||
|
|
||||||
|
**Examples:**
|
||||||
|
|
||||||
|
- [FalImageGenService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/fal/image.py)
|
||||||
|
- [GoogleImageGenService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/google/image.py)
|
||||||
|
|
||||||
|
#### Key requirements:
|
||||||
|
|
||||||
|
- Must implement `run_image_gen` method returning an `AsyncGenerator`
|
||||||
|
|
||||||
|
### Vision Services
|
||||||
|
|
||||||
|
Vision services process images and provide analysis such as descriptions, object detection, or visual question answering.
|
||||||
|
|
||||||
|
**Base class:** `VisionService`
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
|
||||||
|
- [MoondreamVisionService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/moondream/vision.py)
|
||||||
|
|
||||||
|
#### Key requirements:
|
||||||
|
|
||||||
|
- Must implement `run_vision` method that takes an `LLMContext` and returns an `AsyncGenerator[Frame, None]`
|
||||||
|
- The method processes the latest image in the context and yields frames with analysis results
|
||||||
|
- Typically yields `TextFrame` objects containing descriptions or answers
|
||||||
|
|
||||||
|
## Implementation Guidelines
|
||||||
|
|
||||||
|
### Naming Conventions
|
||||||
|
|
||||||
|
- **STT:** `VendorSTTService`
|
||||||
|
- **LLM:** `VendorLLMService`
|
||||||
|
- **TTS:**
|
||||||
|
- Websocket: `VendorTTSService`
|
||||||
|
- HTTP: `VendorHttpTTSService`
|
||||||
|
- **Image:** `VendorImageGenService`
|
||||||
|
- **Vision:** `VendorVisionService`
|
||||||
|
- **Telephony:** `VendorFrameSerializer`
|
||||||
|
|
||||||
|
### Metrics Support
|
||||||
|
|
||||||
|
Enable metrics in your service:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def can_generate_metrics(self) -> bool:
|
||||||
|
"""Check if this service can generate processing metrics.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True, as this service supports metrics.
|
||||||
|
"""
|
||||||
|
return True
|
||||||
|
```
|
||||||
|
|
||||||
|
### Dynamic Settings Updates
|
||||||
|
|
||||||
|
STT, LLM, and TTS services support `ServiceUpdateSettingsFrame` for dynamic configuration changes. The base `STTService` has an `_update_settings()` method that handles settings updates, and the private `_settings` `Dict` stores the settings and makes them accessible to subclasses.
|
||||||
|
|
||||||
|
```python
|
||||||
|
async def set_language(self, language: Language):
|
||||||
|
"""Set the recognition language and reconnect.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
language: The language to use for speech recognition.
|
||||||
|
"""
|
||||||
|
logger.info(f"Switching STT language to: [{language}]")
|
||||||
|
self._settings["language"] = language
|
||||||
|
await self._disconnect()
|
||||||
|
await self._connect()
|
||||||
|
```
|
||||||
|
|
||||||
|
Note that, in this example, Deepgram requires the websocket connection be disconnected and reconnected to reinitialize the service with the new value. Consider if your service requires reconnection.
|
||||||
|
|
||||||
|
### Sample Rate Handling
|
||||||
|
|
||||||
|
Sample rates are set via PipelineParams and passed to each frame processor at initialization. The pattern is to _not_ set the sample rate value in the constructor of a given service. Instead, use the `start()` method to initialize sample rates from the frame:
|
||||||
|
|
||||||
|
```python
|
||||||
|
async def start(self, frame: StartFrame):
|
||||||
|
"""Start the service."""
|
||||||
|
await super().start(frame)
|
||||||
|
self._settings["output_format"]["sample_rate"] = self.sample_rate
|
||||||
|
await self._connect()
|
||||||
|
```
|
||||||
|
|
||||||
|
Note that `self.sample_rate` is a `@property` set in the TTSService base class, which provides access to the private sample rate value obtained from the StartFrame.
|
||||||
|
|
||||||
|
### Tracing Decorators
|
||||||
|
|
||||||
|
Use Pipecat's tracing decorators:
|
||||||
|
|
||||||
|
- **STT:** `@traced_stt` - decorate a function that handles `transcript`, `is_final`, `language` as args
|
||||||
|
- **LLM:** `@traced_llm` - decorate the `_process_context()` method
|
||||||
|
- **TTS:** `@traced_tts` - decorate the `run_tts()` method
|
||||||
|
|
||||||
|
## Best Practices
|
||||||
|
|
||||||
|
### Packaging and Distribution
|
||||||
|
|
||||||
|
- Use [uv](https://docs.astral.sh/uv/) for packaging (encouraged)
|
||||||
|
- Consider releasing to PyPI for easier installation
|
||||||
|
- Follow semantic versioning principles
|
||||||
|
- Maintain a changelog
|
||||||
|
|
||||||
|
### HTTP Communication
|
||||||
|
|
||||||
|
For REST-based communication, use aiohttp. Pipecat includes this as a required dependency, so using it prevents adding an additional dependency to your integration.
|
||||||
|
|
||||||
|
### Error Handling
|
||||||
|
|
||||||
|
- Wrap API calls in appropriate try/except blocks
|
||||||
|
- Handle rate limits and network failures gracefully
|
||||||
|
- Provide meaningful error messages
|
||||||
|
- When errors occur, raise exceptions AND push `ErrorFrame`s to notify the pipeline:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from pipecat.frames.frames import ErrorFrame
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Your API call
|
||||||
|
result = await self._make_api_call()
|
||||||
|
except Exception as e:
|
||||||
|
# Push error frame to pipeline
|
||||||
|
await self.push_error(ErrorFrame(error=f"{self} error: {e}"))
|
||||||
|
# Raise or handle as appropriate
|
||||||
|
raise
|
||||||
|
```
|
||||||
|
|
||||||
|
### Testing
|
||||||
|
|
||||||
|
- Your foundational example serves as a valuable integration-level test
|
||||||
|
- Unit tests are nice to have. As the Pipecat team provides better guidance, we will encourage unit testing more
|
||||||
|
|
||||||
|
## Disclaimer
|
||||||
|
|
||||||
|
Community integrations are community-maintained and not officially supported by the Pipecat team. Users should evaluate these integrations independently. The Pipecat team reserves the right to remove listings that become unmaintained or problematic.
|
||||||
|
|
||||||
|
## Staying Up to Date
|
||||||
|
|
||||||
|
Pipecat evolves rapidly to support the latest AI technologies and patterns. While we strive to minimize breaking changes, they do occur as the framework matures.
|
||||||
|
|
||||||
|
**We strongly recommend:**
|
||||||
|
|
||||||
|
- Join our Discord at https://discord.gg/pipecat and monitor the `#announcements` channel for release notifications
|
||||||
|
- Follow our changelog: https://github.com/pipecat-ai/pipecat/blob/main/CHANGELOG.md
|
||||||
|
- Test your integration against new Pipecat releases promptly
|
||||||
|
- Update your README with the last tested Pipecat version
|
||||||
|
|
||||||
|
This helps ensure your integration remains compatible and your users have clear expectations about version support.
|
||||||
|
|
||||||
|
## Questions?
|
||||||
|
|
||||||
|
Join our Discord community at https://discord.gg/pipecat and post in the `#community-integrations` channel for guidance and support.
|
||||||
|
|
||||||
|
For additional questions, you can also reach out to us at pipecat-ai@daily.co.
|
||||||
@@ -1,5 +1,9 @@
|
|||||||
## Contributing to Pipecat
|
## Contributing to Pipecat
|
||||||
|
|
||||||
|
**Want to add a new service integration?**
|
||||||
|
We encourage community-maintained integrations! Please see our [Community Integrations Guide](COMMUNITY_INTEGRATIONS.md) for the process and requirements.
|
||||||
|
|
||||||
|
**Want to contribute to Pipecat core?**
|
||||||
We welcome contributions of all kinds! Your help is appreciated. Follow these steps to get involved:
|
We welcome contributions of all kinds! Your help is appreciated. Follow these steps to get involved:
|
||||||
|
|
||||||
1. **Fork this repository**: Start by forking the Pipecat Documentation repository to your GitHub account.
|
1. **Fork this repository**: Start by forking the Pipecat Documentation repository to your GitHub account.
|
||||||
|
|||||||
143
README.md
143
README.md
@@ -3,6 +3,7 @@
|
|||||||
</div></h1>
|
</div></h1>
|
||||||
|
|
||||||
[](https://pypi.org/project/pipecat-ai)  [](https://codecov.io/gh/pipecat-ai/pipecat) [](https://docs.pipecat.ai) [](https://discord.gg/pipecat) [](https://deepwiki.com/pipecat-ai/pipecat)
|
[](https://pypi.org/project/pipecat-ai)  [](https://codecov.io/gh/pipecat-ai/pipecat) [](https://docs.pipecat.ai) [](https://discord.gg/pipecat) [](https://deepwiki.com/pipecat-ai/pipecat)
|
||||||
|
[](https://getmanta.ai/pipecat)
|
||||||
|
|
||||||
# 🎙️ Pipecat: Real-Time Voice & Multimodal AI Agents
|
# 🎙️ Pipecat: Real-Time Voice & Multimodal AI Agents
|
||||||
|
|
||||||
@@ -19,10 +20,6 @@
|
|||||||
- **Business Agents** – customer intake, support bots, guided flows
|
- **Business Agents** – customer intake, support bots, guided flows
|
||||||
- **Complex Dialog Systems** – design logic with structured conversations
|
- **Complex Dialog Systems** – design logic with structured conversations
|
||||||
|
|
||||||
🧭 Looking to build structured conversations? Check out [Pipecat Flows](https://github.com/pipecat-ai/pipecat-flows) for managing complex conversational states and transitions.
|
|
||||||
|
|
||||||
🔍 Looking for help debugging your pipeline and processors? Check out [Whisker](https://github.com/pipecat-ai/whisker), a real-time Pipecat debugger.
|
|
||||||
|
|
||||||
## 🧠 Why Pipecat?
|
## 🧠 Why Pipecat?
|
||||||
|
|
||||||
- **Voice-first**: Integrates speech recognition, text-to-speech, and conversation handling
|
- **Voice-first**: Integrates speech recognition, text-to-speech, and conversation handling
|
||||||
@@ -30,40 +27,38 @@
|
|||||||
- **Composable Pipelines**: Build complex behavior from modular components
|
- **Composable Pipelines**: Build complex behavior from modular components
|
||||||
- **Real-Time**: Ultra-low latency interaction with different transports (e.g. WebSockets or WebRTC)
|
- **Real-Time**: Ultra-low latency interaction with different transports (e.g. WebSockets or WebRTC)
|
||||||
|
|
||||||
## 📱 Client SDKs
|
## 🌐 Pipecat Ecosystem
|
||||||
|
|
||||||
You can connect to Pipecat from any platform using our official SDKs:
|
### 📱 Client SDKs
|
||||||
|
|
||||||
<table>
|
Building client applications? You can connect to Pipecat from any platform using our official SDKs:
|
||||||
<tr>
|
|
||||||
<td>
|
<a href="https://docs.pipecat.ai/client/js/introduction">JavaScript</a> | <a href="https://docs.pipecat.ai/client/react/introduction">React</a> | <a href="https://docs.pipecat.ai/client/react-native/introduction">React Native</a> |
|
||||||
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/javascript/javascript-original.svg" width="40" height="40" alt="JavaScript"/>
|
<a href="https://docs.pipecat.ai/client/ios/introduction">Swift</a> | <a href="https://docs.pipecat.ai/client/android/introduction">Kotlin</a> | <a href="https://docs.pipecat.ai/client/c++/introduction">C++</a> | <a href="https://github.com/pipecat-ai/pipecat-esp32">ESP32</a>
|
||||||
<a href="https://docs.pipecat.ai/client/js/introduction">JavaScript</a>
|
|
||||||
</td>
|
### 🧭 Structured conversations
|
||||||
<td>
|
|
||||||
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/react/react-original.svg" width="40" height="40" alt="React"/>
|
Looking to build structured conversations? Check out [Pipecat Flows](https://github.com/pipecat-ai/pipecat-flows) for managing complex conversational states and transitions.
|
||||||
<a href="https://docs.pipecat.ai/client/react/introduction">React</a>
|
|
||||||
</td>
|
### 🪄 Beautiful UIs
|
||||||
<td>
|
|
||||||
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/react/react-original.svg" width="40" height="40" alt="React Native"/>
|
Want to build beautiful and engaging experiences? Check out the [Voice UI Kit](https://github.com/pipecat-ai/voice-ui-kit), a collection of components, hooks and templates for building voice AI applications quickly.
|
||||||
<a href="https://docs.pipecat.ai/client/react-native/introduction">React Native</a>
|
|
||||||
</td>
|
### 🛠️ CLI
|
||||||
</tr>
|
|
||||||
<tr>
|
Create a new project in under a minute with the [Pipecat CLI](https://github.com/pipecat-ai/pipecat-cli). Then use the CLI to monitor and deploy your agent to production.
|
||||||
<td>
|
|
||||||
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/swift/swift-original.svg" width="40" height="40" alt="Swift"/>
|
### 🔍 Debugging
|
||||||
<a href="https://docs.pipecat.ai/client/ios/introduction">Swift</a>
|
|
||||||
</td>
|
Looking for help debugging your pipeline and processors? Check out [Whisker](https://github.com/pipecat-ai/whisker), a real-time Pipecat debugger.
|
||||||
<td>
|
|
||||||
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/kotlin/kotlin-original.svg" width="40" height="40" alt="Kotlin"/>
|
### 🖥️ Terminal
|
||||||
<a href="https://docs.pipecat.ai/client/android/introduction">Kotlin</a>
|
|
||||||
</td>
|
Love terminal applications? Check out [Tail](https://github.com/pipecat-ai/tail), a terminal dashboard for Pipecat.
|
||||||
<td>
|
|
||||||
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/cplusplus/cplusplus-original.svg" width="40" height="40" alt="JavaScript"/>
|
### 📺️ Pipecat TV Channel
|
||||||
<a href="https://docs.pipecat.ai/client/c++/introduction">C++</a>
|
|
||||||
</td>
|
Catch new features, interviews, and how-tos on our [Pipecat TV](https://www.youtube.com/playlist?list=PLzU2zoMTQIHjqC3v4q2XVSR3hGSzwKFwH) channel.
|
||||||
</tr>
|
|
||||||
</table>
|
|
||||||
|
|
||||||
## 🎬 See it in action
|
## 🎬 See it in action
|
||||||
|
|
||||||
@@ -72,24 +67,24 @@ You can connect to Pipecat from any platform using our official SDKs:
|
|||||||
<a href="https://github.com/pipecat-ai/pipecat-examples/tree/main/storytelling-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat-examples/main/storytelling-chatbot/image.png" width="400" /></a>
|
<a href="https://github.com/pipecat-ai/pipecat-examples/tree/main/storytelling-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat-examples/main/storytelling-chatbot/image.png" width="400" /></a>
|
||||||
<br/>
|
<br/>
|
||||||
<a href="https://github.com/pipecat-ai/pipecat-examples/tree/main/translation-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat-examples/main/translation-chatbot/image.png" width="400" /></a>
|
<a href="https://github.com/pipecat-ai/pipecat-examples/tree/main/translation-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat-examples/main/translation-chatbot/image.png" width="400" /></a>
|
||||||
<a href="https://github.com/pipecat-ai/pipecat-examples/tree/main/moondream-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat-examples/main/moondream-chatbot/image.png" width="400" /></a>
|
<a href="https://github.com/pipecat-ai/pipecat/blob/main/examples/foundational/12-describe-video.py"><img src="https://github.com/pipecat-ai/pipecat/blob/main/examples/foundational/assets/moondream.png" width="400" /></a>
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
## 🧩 Available services
|
## 🧩 Available services
|
||||||
|
|
||||||
| Category | Services |
|
| Category | Services |
|
||||||
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||||
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
|
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova), [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
|
||||||
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||||
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) |
|
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) |
|
||||||
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
|
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
|
||||||
| Serializers | [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx) |
|
| Serializers | [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx) |
|
||||||
| Video | [HeyGen](https://docs.pipecat.ai/server/services/video/heygen), [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) |
|
| Video | [HeyGen](https://docs.pipecat.ai/server/services/video/heygen), [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) |
|
||||||
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) |
|
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) |
|
||||||
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/google-imagen), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) |
|
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/google-imagen), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) |
|
||||||
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [ai-coustics](https://docs.pipecat.ai/server/utilities/audio/aic-filter) |
|
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [ai-coustics](https://docs.pipecat.ai/server/utilities/audio/aic-filter) |
|
||||||
| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) |
|
| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) |
|
||||||
|
|
||||||
📚 [View full services documentation →](https://docs.pipecat.ai/server/services/supported-services)
|
📚 [View full services documentation →](https://docs.pipecat.ai/server/services/supported-services)
|
||||||
|
|
||||||
@@ -184,54 +179,6 @@ Run a specific test suite:
|
|||||||
uv run pytest tests/test_name.py
|
uv run pytest tests/test_name.py
|
||||||
```
|
```
|
||||||
|
|
||||||
### Setting up your editor
|
|
||||||
|
|
||||||
This project uses strict [PEP 8](https://peps.python.org/pep-0008/) formatting via [Ruff](https://github.com/astral-sh/ruff).
|
|
||||||
|
|
||||||
#### Emacs
|
|
||||||
|
|
||||||
You can use [use-package](https://github.com/jwiegley/use-package) to install [emacs-lazy-ruff](https://github.com/christophermadsen/emacs-lazy-ruff) package and configure `ruff` arguments:
|
|
||||||
|
|
||||||
```elisp
|
|
||||||
(use-package lazy-ruff
|
|
||||||
:ensure t
|
|
||||||
:hook ((python-mode . lazy-ruff-mode))
|
|
||||||
:config
|
|
||||||
(setq lazy-ruff-format-command "ruff format")
|
|
||||||
(setq lazy-ruff-check-command "ruff check --select I"))
|
|
||||||
```
|
|
||||||
|
|
||||||
`ruff` was installed in the `venv` environment described before, so you should be able to use [pyvenv-auto](https://github.com/ryotaro612/pyvenv-auto) to automatically load that environment inside Emacs.
|
|
||||||
|
|
||||||
```elisp
|
|
||||||
(use-package pyvenv-auto
|
|
||||||
:ensure t
|
|
||||||
:defer t
|
|
||||||
:hook ((python-mode . pyvenv-auto-run)))
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Visual Studio Code
|
|
||||||
|
|
||||||
Install the
|
|
||||||
[Ruff](https://marketplace.visualstudio.com/items?itemName=charliermarsh.ruff) extension. Then edit the user settings (_Ctrl-Shift-P_ `Open User Settings (JSON)`) and set it as the default Python formatter, and enable formatting on save:
|
|
||||||
|
|
||||||
```json
|
|
||||||
"[python]": {
|
|
||||||
"editor.defaultFormatter": "charliermarsh.ruff",
|
|
||||||
"editor.formatOnSave": true
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
#### PyCharm
|
|
||||||
|
|
||||||
`ruff` was installed in the `venv` environment described before, now to enable autoformatting on save, go to `File` -> `Settings` -> `Tools` -> `File Watchers` and add a new watcher with the following settings:
|
|
||||||
|
|
||||||
1. **Name**: `Ruff formatter`
|
|
||||||
2. **File type**: `Python`
|
|
||||||
3. **Working directory**: `$ContentRoot$`
|
|
||||||
4. **Arguments**: `format $FilePath$`
|
|
||||||
5. **Program**: `$PyInterpreterDirectory$/ruff`
|
|
||||||
|
|
||||||
## 🤝 Contributing
|
## 🤝 Contributing
|
||||||
|
|
||||||
We welcome contributions from the community! Whether you're fixing bugs, improving documentation, or adding new features, here's how you can help:
|
We welcome contributions from the community! Whether you're fixing bugs, improving documentation, or adding new features, here's how you can help:
|
||||||
|
|||||||
5
SECURITY.md
Normal file
5
SECURITY.md
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
# Security Policy
|
||||||
|
|
||||||
|
## Reporting a Vulnerability
|
||||||
|
|
||||||
|
Please email `disclosures@daily.co`.
|
||||||
@@ -50,6 +50,7 @@ autodoc_mock_imports = [
|
|||||||
# Krisp - has build issues on some platforms
|
# Krisp - has build issues on some platforms
|
||||||
"pipecat_ai_krisp",
|
"pipecat_ai_krisp",
|
||||||
"krisp",
|
"krisp",
|
||||||
|
"krisp_audio",
|
||||||
# System-specific GUI libraries
|
# System-specific GUI libraries
|
||||||
"_tkinter",
|
"_tkinter",
|
||||||
"tkinter",
|
"tkinter",
|
||||||
|
|||||||
12
env.example
12
env.example
@@ -58,6 +58,9 @@ GOOGLE_CLOUD_PROJECT_ID=...
|
|||||||
GOOGLE_TEST_CREDENTIALS=...
|
GOOGLE_TEST_CREDENTIALS=...
|
||||||
GOOGLE_VERTEX_TEST_CREDENTIALS=...
|
GOOGLE_VERTEX_TEST_CREDENTIALS=...
|
||||||
|
|
||||||
|
# Hume
|
||||||
|
HUME_API_KEY=...
|
||||||
|
|
||||||
# LMNT
|
# LMNT
|
||||||
LMNT_API_KEY=...
|
LMNT_API_KEY=...
|
||||||
LMNT_VOICE_ID=...
|
LMNT_VOICE_ID=...
|
||||||
@@ -87,6 +90,9 @@ SIMLI_FACE_ID=...
|
|||||||
# Krisp
|
# Krisp
|
||||||
KRISP_MODEL_PATH=...
|
KRISP_MODEL_PATH=...
|
||||||
|
|
||||||
|
# Krisp Viva
|
||||||
|
KRISP_VIVA_MODEL_PATH=...
|
||||||
|
|
||||||
# DeepSeek
|
# DeepSeek
|
||||||
DEEPSEEK_API_KEY=...
|
DEEPSEEK_API_KEY=...
|
||||||
|
|
||||||
@@ -155,3 +161,9 @@ NVIDIA_API_KEY=...
|
|||||||
|
|
||||||
# Qwen
|
# Qwen
|
||||||
QWEN_API_KEY=...
|
QWEN_API_KEY=...
|
||||||
|
|
||||||
|
# WhatsApp
|
||||||
|
WHATSAPP_TOKEN=
|
||||||
|
WHATSAPP_WEBHOOK_VERIFICATION_TOKEN=
|
||||||
|
WHATSAPP_PHONE_NUMBER_ID=
|
||||||
|
WHATSAPP_APP_SECRET=
|
||||||
@@ -25,7 +25,7 @@ from pipecat.processors.aggregators.llm_response_universal import LLMContextAggr
|
|||||||
from pipecat.runner.daily import configure
|
from pipecat.runner.daily import configure
|
||||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||||
from pipecat.services.openai.llm import OpenAILLMService
|
from pipecat.services.openai.llm import OpenAILLMService
|
||||||
from pipecat.transports.daily.transport import DailyLogLevel, DailyParams, DailyTransport
|
from pipecat.transports.daily.transport import DailyParams, DailyTransport
|
||||||
|
|
||||||
load_dotenv(override=True)
|
load_dotenv(override=True)
|
||||||
|
|
||||||
@@ -49,7 +49,6 @@ async def main():
|
|||||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
transport.set_log_level(DailyLogLevel.Info)
|
|
||||||
|
|
||||||
tts = CartesiaTTSService(
|
tts = CartesiaTTSService(
|
||||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||||
|
|||||||
@@ -21,8 +21,8 @@ from pipecat.processors.aggregators.llm_context import LLMContext
|
|||||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||||
from pipecat.runner.types import RunnerArguments
|
from pipecat.runner.types import RunnerArguments
|
||||||
from pipecat.runner.utils import create_transport
|
from pipecat.runner.utils import create_transport
|
||||||
|
from pipecat.services.cartesia.stt import CartesiaSTTService
|
||||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
|
||||||
from pipecat.services.openai.llm import OpenAILLMService
|
from pipecat.services.openai.llm import OpenAILLMService
|
||||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||||
from pipecat.transports.daily.transport import DailyParams
|
from pipecat.transports.daily.transport import DailyParams
|
||||||
@@ -58,7 +58,7 @@ transport_params = {
|
|||||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||||
logger.info(f"Starting bot")
|
logger.info(f"Starting bot")
|
||||||
|
|
||||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
stt = CartesiaSTTService(api_key=os.getenv("CARTESIA_API_KEY"))
|
||||||
|
|
||||||
tts = CartesiaTTSService(
|
tts = CartesiaTTSService(
|
||||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||||
|
|||||||
138
examples/foundational/07ae-interruptible-hume.py
Normal file
138
examples/foundational/07ae-interruptible-hume.py
Normal file
@@ -0,0 +1,138 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2024–2025, Daily
|
||||||
|
#
|
||||||
|
# SPDX-License-Identifier: BSD 2-Clause License
|
||||||
|
#
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||||
|
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||||
|
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||||
|
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||||
|
from pipecat.frames.frames import LLMRunFrame
|
||||||
|
from pipecat.pipeline.pipeline import Pipeline
|
||||||
|
from pipecat.pipeline.runner import PipelineRunner
|
||||||
|
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||||
|
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||||
|
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||||
|
from pipecat.processors.frameworks.rtvi import RTVIConfig, RTVIObserver, RTVIProcessor
|
||||||
|
from pipecat.runner.types import RunnerArguments
|
||||||
|
from pipecat.runner.utils import create_transport
|
||||||
|
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||||
|
from pipecat.services.hume.tts import HUME_SAMPLE_RATE, HumeTTSService
|
||||||
|
from pipecat.services.openai.llm import OpenAILLMService
|
||||||
|
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||||
|
from pipecat.transports.daily.transport import DailyParams
|
||||||
|
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||||
|
|
||||||
|
load_dotenv(override=True)
|
||||||
|
|
||||||
|
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||||
|
# instantiated. The function will be called when the desired transport gets
|
||||||
|
# selected.
|
||||||
|
transport_params = {
|
||||||
|
"daily": lambda: DailyParams(
|
||||||
|
audio_in_enabled=True,
|
||||||
|
audio_out_enabled=True,
|
||||||
|
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||||
|
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||||
|
),
|
||||||
|
"twilio": lambda: FastAPIWebsocketParams(
|
||||||
|
audio_in_enabled=True,
|
||||||
|
audio_out_enabled=True,
|
||||||
|
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||||
|
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||||
|
),
|
||||||
|
"webrtc": lambda: TransportParams(
|
||||||
|
audio_in_enabled=True,
|
||||||
|
audio_out_enabled=True,
|
||||||
|
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||||
|
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||||
|
logger.info(f"Starting bot")
|
||||||
|
|
||||||
|
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||||
|
|
||||||
|
tts = HumeTTSService(
|
||||||
|
api_key=os.getenv("HUME_API_KEY"),
|
||||||
|
# Replace with your Hume voice ID
|
||||||
|
voice_id="f898a92e-685f-43fa-985b-a46920f0650b",
|
||||||
|
)
|
||||||
|
|
||||||
|
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||||
|
|
||||||
|
messages = [
|
||||||
|
{
|
||||||
|
"role": "system",
|
||||||
|
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
context = LLMContext(messages)
|
||||||
|
context_aggregator = LLMContextAggregatorPair(context)
|
||||||
|
|
||||||
|
rtvi = RTVIProcessor(config=RTVIConfig(config=[]))
|
||||||
|
|
||||||
|
pipeline = Pipeline(
|
||||||
|
[
|
||||||
|
transport.input(), # Transport user input
|
||||||
|
rtvi,
|
||||||
|
stt,
|
||||||
|
context_aggregator.user(), # User responses
|
||||||
|
llm, # LLM
|
||||||
|
tts, # TTS
|
||||||
|
transport.output(), # Transport bot output
|
||||||
|
context_aggregator.assistant(), # Assistant spoken responses
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
task = PipelineTask(
|
||||||
|
pipeline,
|
||||||
|
params=PipelineParams(
|
||||||
|
enable_metrics=True,
|
||||||
|
enable_usage_metrics=True,
|
||||||
|
audio_out_sample_rate=HUME_SAMPLE_RATE,
|
||||||
|
),
|
||||||
|
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||||
|
observers=[RTVIObserver(rtvi)],
|
||||||
|
)
|
||||||
|
|
||||||
|
@rtvi.event_handler("on_client_ready")
|
||||||
|
async def on_client_ready(rtvi):
|
||||||
|
await rtvi.set_bot_ready()
|
||||||
|
|
||||||
|
@transport.event_handler("on_client_connected")
|
||||||
|
async def on_client_connected(transport, client):
|
||||||
|
logger.info(f"Client connected")
|
||||||
|
# Kick off the conversation.
|
||||||
|
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||||
|
await task.queue_frames([LLMRunFrame()])
|
||||||
|
|
||||||
|
@transport.event_handler("on_client_disconnected")
|
||||||
|
async def on_client_disconnected(transport, client):
|
||||||
|
logger.info(f"Client disconnected")
|
||||||
|
await task.cancel()
|
||||||
|
|
||||||
|
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||||
|
|
||||||
|
await runner.run(task)
|
||||||
|
|
||||||
|
|
||||||
|
async def bot(runner_args: RunnerArguments):
|
||||||
|
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||||
|
transport = await create_transport(runner_args, transport_params)
|
||||||
|
await run_bot(transport, runner_args)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
from pipecat.runner.run import main
|
||||||
|
|
||||||
|
main()
|
||||||
118
examples/foundational/07c-interruptible-deepgram-flux.py
Normal file
118
examples/foundational/07c-interruptible-deepgram-flux.py
Normal file
@@ -0,0 +1,118 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2024–2025, Daily
|
||||||
|
#
|
||||||
|
# SPDX-License-Identifier: BSD 2-Clause License
|
||||||
|
#
|
||||||
|
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
from pipecat.frames.frames import LLMRunFrame
|
||||||
|
from pipecat.pipeline.pipeline import Pipeline
|
||||||
|
from pipecat.pipeline.runner import PipelineRunner
|
||||||
|
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||||
|
from pipecat.processors.aggregators.llm_response_universal import (
|
||||||
|
LLMContext,
|
||||||
|
LLMContextAggregatorPair,
|
||||||
|
)
|
||||||
|
from pipecat.runner.types import RunnerArguments
|
||||||
|
from pipecat.runner.utils import create_transport
|
||||||
|
from pipecat.services.deepgram.flux.stt import DeepgramFluxSTTService
|
||||||
|
from pipecat.services.deepgram.tts import DeepgramTTSService
|
||||||
|
from pipecat.services.openai.llm import OpenAILLMService
|
||||||
|
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||||
|
from pipecat.transports.daily.transport import DailyParams
|
||||||
|
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||||
|
|
||||||
|
load_dotenv(override=True)
|
||||||
|
|
||||||
|
|
||||||
|
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||||
|
# instantiated. The function will be called when the desired transport gets
|
||||||
|
# selected.
|
||||||
|
transport_params = {
|
||||||
|
"daily": lambda: DailyParams(
|
||||||
|
audio_in_enabled=True,
|
||||||
|
audio_out_enabled=True,
|
||||||
|
),
|
||||||
|
"twilio": lambda: FastAPIWebsocketParams(
|
||||||
|
audio_in_enabled=True,
|
||||||
|
audio_out_enabled=True,
|
||||||
|
),
|
||||||
|
"webrtc": lambda: TransportParams(
|
||||||
|
audio_in_enabled=True,
|
||||||
|
audio_out_enabled=True,
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||||
|
logger.info(f"Starting bot")
|
||||||
|
|
||||||
|
stt = DeepgramFluxSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||||
|
|
||||||
|
tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-2-andromeda-en")
|
||||||
|
|
||||||
|
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||||
|
|
||||||
|
messages = [
|
||||||
|
{
|
||||||
|
"role": "system",
|
||||||
|
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
context = LLMContext(messages)
|
||||||
|
context_aggregator = LLMContextAggregatorPair(context)
|
||||||
|
|
||||||
|
pipeline = Pipeline(
|
||||||
|
[
|
||||||
|
transport.input(), # Transport user input
|
||||||
|
stt, # STT
|
||||||
|
context_aggregator.user(), # User responses
|
||||||
|
llm, # LLM
|
||||||
|
tts, # TTS
|
||||||
|
transport.output(), # Transport bot output
|
||||||
|
context_aggregator.assistant(), # Assistant spoken responses
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
task = PipelineTask(
|
||||||
|
pipeline,
|
||||||
|
params=PipelineParams(
|
||||||
|
enable_metrics=True,
|
||||||
|
enable_usage_metrics=True,
|
||||||
|
),
|
||||||
|
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||||
|
)
|
||||||
|
|
||||||
|
@transport.event_handler("on_client_connected")
|
||||||
|
async def on_client_connected(transport, client):
|
||||||
|
logger.info(f"Client connected")
|
||||||
|
# Kick off the conversation.
|
||||||
|
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||||
|
await task.queue_frames([LLMRunFrame()])
|
||||||
|
|
||||||
|
@transport.event_handler("on_client_disconnected")
|
||||||
|
async def on_client_disconnected(transport, client):
|
||||||
|
logger.info(f"Client disconnected")
|
||||||
|
await task.cancel()
|
||||||
|
|
||||||
|
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||||
|
|
||||||
|
await runner.run(task)
|
||||||
|
|
||||||
|
|
||||||
|
async def bot(runner_args: RunnerArguments):
|
||||||
|
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||||
|
transport = await create_transport(runner_args, transport_params)
|
||||||
|
await run_bot(transport, runner_args)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
from pipecat.runner.run import main
|
||||||
|
|
||||||
|
main()
|
||||||
@@ -23,7 +23,6 @@ from pipecat.processors.aggregators.llm_context import LLMContext
|
|||||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||||
from pipecat.runner.types import RunnerArguments
|
from pipecat.runner.types import RunnerArguments
|
||||||
from pipecat.runner.utils import create_transport
|
from pipecat.runner.utils import create_transport
|
||||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
|
||||||
from pipecat.services.elevenlabs.stt import ElevenLabsSTTService
|
from pipecat.services.elevenlabs.stt import ElevenLabsSTTService
|
||||||
from pipecat.services.elevenlabs.tts import ElevenLabsHttpTTSService
|
from pipecat.services.elevenlabs.tts import ElevenLabsHttpTTSService
|
||||||
from pipecat.services.openai.llm import OpenAILLMService
|
from pipecat.services.openai.llm import OpenAILLMService
|
||||||
|
|||||||
151
examples/foundational/07n-interruptible-gemini-image.py
Normal file
151
examples/foundational/07n-interruptible-gemini-image.py
Normal file
@@ -0,0 +1,151 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2024–2025, Daily
|
||||||
|
#
|
||||||
|
# SPDX-License-Identifier: BSD 2-Clause License
|
||||||
|
#
|
||||||
|
|
||||||
|
"""
|
||||||
|
A conversational AI bot using Gemini for the LLM and Google services for STT and TTS.
|
||||||
|
|
||||||
|
This example demonstrates how to use Gemini's image generation capabilities.
|
||||||
|
|
||||||
|
Features showcased:
|
||||||
|
- Gemini LLM for conversation and image generation
|
||||||
|
- Google TTS and STT
|
||||||
|
|
||||||
|
Run with:
|
||||||
|
python examples/foundational/07n-interruptible-gemini-image.py
|
||||||
|
|
||||||
|
Make sure to set your environment variables:
|
||||||
|
export GOOGLE_API_KEY=your_api_key_here
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||||
|
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||||
|
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||||
|
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||||
|
from pipecat.frames.frames import LLMRunFrame
|
||||||
|
from pipecat.pipeline.pipeline import Pipeline
|
||||||
|
from pipecat.pipeline.runner import PipelineRunner
|
||||||
|
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||||
|
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||||
|
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||||
|
from pipecat.runner.types import RunnerArguments
|
||||||
|
from pipecat.runner.utils import create_transport
|
||||||
|
from pipecat.services.google.llm import GoogleLLMService
|
||||||
|
from pipecat.services.google.stt import GoogleSTTService
|
||||||
|
from pipecat.services.google.tts import GoogleTTSService
|
||||||
|
from pipecat.transcriptions.language import Language
|
||||||
|
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||||
|
from pipecat.transports.daily.transport import DailyParams
|
||||||
|
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||||
|
|
||||||
|
load_dotenv(override=True)
|
||||||
|
|
||||||
|
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||||
|
# instantiated. The function will be called when the desired transport gets
|
||||||
|
# selected.
|
||||||
|
transport_params = {
|
||||||
|
"daily": lambda: DailyParams(
|
||||||
|
audio_in_enabled=True,
|
||||||
|
audio_out_enabled=True,
|
||||||
|
video_out_enabled=True,
|
||||||
|
video_out_width=1024,
|
||||||
|
video_out_height=1024,
|
||||||
|
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||||
|
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||||
|
),
|
||||||
|
"webrtc": lambda: TransportParams(
|
||||||
|
audio_in_enabled=True,
|
||||||
|
audio_out_enabled=True,
|
||||||
|
video_out_enabled=True,
|
||||||
|
video_out_width=1024,
|
||||||
|
video_out_height=1024,
|
||||||
|
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||||
|
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||||
|
logger.info(f"Starting bot")
|
||||||
|
|
||||||
|
stt = GoogleSTTService(
|
||||||
|
params=GoogleSTTService.InputParams(languages=Language.EN_US),
|
||||||
|
credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
|
||||||
|
)
|
||||||
|
|
||||||
|
tts = GoogleTTSService(
|
||||||
|
voice_id="en-US-Chirp3-HD-Charon",
|
||||||
|
params=GoogleTTSService.InputParams(language=Language.EN_US),
|
||||||
|
credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
|
||||||
|
)
|
||||||
|
|
||||||
|
llm = GoogleLLMService(
|
||||||
|
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||||
|
model="gemini-2.5-flash-image",
|
||||||
|
)
|
||||||
|
|
||||||
|
messages = [
|
||||||
|
{
|
||||||
|
"role": "system",
|
||||||
|
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
context = LLMContext(messages)
|
||||||
|
context_aggregator = LLMContextAggregatorPair(context)
|
||||||
|
|
||||||
|
pipeline = Pipeline(
|
||||||
|
[
|
||||||
|
transport.input(), # Transport user input
|
||||||
|
stt, # STT
|
||||||
|
context_aggregator.user(), # User responses
|
||||||
|
llm, # LLM
|
||||||
|
tts, # Google TTS
|
||||||
|
transport.output(), # Transport bot output
|
||||||
|
context_aggregator.assistant(), # Assistant spoken responses
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
task = PipelineTask(
|
||||||
|
pipeline,
|
||||||
|
params=PipelineParams(
|
||||||
|
enable_metrics=True,
|
||||||
|
enable_usage_metrics=True,
|
||||||
|
),
|
||||||
|
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||||
|
)
|
||||||
|
|
||||||
|
@transport.event_handler("on_client_connected")
|
||||||
|
async def on_client_connected(transport, client):
|
||||||
|
logger.info(f"Client connected")
|
||||||
|
# Kick off the conversation with a styled introduction
|
||||||
|
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||||
|
await task.queue_frames([LLMRunFrame()])
|
||||||
|
|
||||||
|
@transport.event_handler("on_client_disconnected")
|
||||||
|
async def on_client_disconnected(transport, client):
|
||||||
|
logger.info(f"Client disconnected")
|
||||||
|
await task.cancel()
|
||||||
|
|
||||||
|
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||||
|
|
||||||
|
await runner.run(task)
|
||||||
|
|
||||||
|
|
||||||
|
async def bot(runner_args: RunnerArguments):
|
||||||
|
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||||
|
transport = await create_transport(runner_args, transport_params)
|
||||||
|
await run_bot(transport, runner_args)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
from pipecat.runner.run import main
|
||||||
|
|
||||||
|
main()
|
||||||
129
examples/foundational/07p-interruptible-krisp-viva.py
Normal file
129
examples/foundational/07p-interruptible-krisp-viva.py
Normal file
@@ -0,0 +1,129 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2024–2025, Daily
|
||||||
|
#
|
||||||
|
# SPDX-License-Identifier: BSD 2-Clause License
|
||||||
|
#
|
||||||
|
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
from pipecat.audio.filters.krisp_viva_filter import KrispVivaFilter
|
||||||
|
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||||
|
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||||
|
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||||
|
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||||
|
from pipecat.frames.frames import LLMRunFrame
|
||||||
|
from pipecat.pipeline.pipeline import Pipeline
|
||||||
|
from pipecat.pipeline.runner import PipelineRunner
|
||||||
|
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||||
|
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||||
|
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||||
|
from pipecat.runner.types import RunnerArguments
|
||||||
|
from pipecat.runner.utils import create_transport
|
||||||
|
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||||
|
from pipecat.services.deepgram.tts import DeepgramTTSService
|
||||||
|
from pipecat.services.openai.llm import OpenAILLMService
|
||||||
|
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||||
|
from pipecat.transports.daily.transport import DailyParams
|
||||||
|
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||||
|
|
||||||
|
load_dotenv(override=True)
|
||||||
|
|
||||||
|
def _audio_transport_kwargs():
    """Build the audio settings shared by every transport in this example.

    New analyzer and filter objects are created on every call, so nothing is
    instantiated until one of the ``transport_params`` factories is invoked.
    """
    return {
        "audio_in_enabled": True,
        "audio_out_enabled": True,
        "vad_analyzer": SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
        "turn_analyzer": LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
        # Krisp VIVA is installed as the audio input filter.
        "audio_in_filter": KrispVivaFilter(),
    }


# We store functions so objects (e.g. SileroVADAnalyzer) don't get
# instantiated. The function will be called when the desired transport gets
# selected.
transport_params = {
    "daily": lambda: DailyParams(**_audio_transport_kwargs()),
    "twilio": lambda: FastAPIWebsocketParams(**_audio_transport_kwargs()),
    "webrtc": lambda: TransportParams(**_audio_transport_kwargs()),
}
|
||||||
|
|
||||||
|
|
||||||
|
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    """Run the interruptible voice bot on the given transport.

    Builds a pipeline of transport input -> Deepgram STT -> user context
    aggregator -> OpenAI LLM -> Deepgram TTS -> transport output -> assistant
    context aggregator, registers connect/disconnect handlers on the
    transport, and awaits the pipeline runner.

    Args:
        transport: Transport whose ``input()`` and ``output()`` processors
            bracket the pipeline and which emits the client events.
        runner_args: Supplies ``pipeline_idle_timeout_secs`` for the task and
            ``handle_sigint`` for the runner.
    """
    logger.info("Starting bot")

    stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))

    tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-helios-en")

    llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))

    messages = [
        {
            "role": "system",
            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
        },
    ]

    context = LLMContext(messages)
    context_aggregator = LLMContextAggregatorPair(context)

    pipeline = Pipeline(
        [
            transport.input(),  # Transport user input
            stt,  # STT
            context_aggregator.user(),  # User responses
            llm,  # LLM
            tts,  # TTS
            transport.output(),  # Transport bot output
            context_aggregator.assistant(),  # Assistant spoken responses
        ]
    )

    task = PipelineTask(
        pipeline,
        params=PipelineParams(
            enable_metrics=True,
            enable_usage_metrics=True,
        ),
        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
    )

    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        logger.info("Client connected")
        # Kick off the conversation.
        # NOTE(review): this assumes the LLMContext built above keeps a
        # reference to `messages` rather than a copy - confirm.
        messages.append({"role": "system", "content": "Please introduce yourself to the user."})
        await task.queue_frames([LLMRunFrame()])

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(transport, client):
        logger.info("Client disconnected")
        # Cancel the pipeline task once the client has disconnected.
        await task.cancel()

    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)

    await runner.run(task)
|
||||||
|
|
||||||
|
|
||||||
|
async def bot(runner_args: RunnerArguments):
    """Main bot entry point compatible with Pipecat Cloud.

    Creates the transport from ``runner_args`` and the module-level
    ``transport_params`` table, then runs the bot on it.
    """
    selected_transport = await create_transport(runner_args, transport_params)
    await run_bot(selected_transport, runner_args)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # The runner is imported inside the guard, so it is only loaded when this
    # file is executed as a script.
    from pipecat.runner.run import main

    main()
|
||||||
@@ -48,10 +48,7 @@ transport_params = {
|
|||||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||||
logger.info(f"Starting bot")
|
logger.info(f"Starting bot")
|
||||||
|
|
||||||
stt = CartesiaSTTService(
|
stt = CartesiaSTTService(api_key=os.getenv("CARTESIA_API_KEY"))
|
||||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
|
||||||
base_url=os.getenv("CARTESIA_BASE_URL"),
|
|
||||||
)
|
|
||||||
|
|
||||||
tl = TranscriptionLogger()
|
tl = TranscriptionLogger()
|
||||||
|
|
||||||
|
|||||||
@@ -76,9 +76,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
|||||||
|
|
||||||
llm = GoogleVertexLLMService(
|
llm = GoogleVertexLLMService(
|
||||||
credentials=os.getenv("GOOGLE_VERTEX_TEST_CREDENTIALS"),
|
credentials=os.getenv("GOOGLE_VERTEX_TEST_CREDENTIALS"),
|
||||||
params=GoogleVertexLLMService.InputParams(
|
project_id=os.getenv("GOOGLE_CLOUD_PROJECT_ID"),
|
||||||
project_id=os.getenv("GOOGLE_CLOUD_PROJECT_ID"),
|
location=os.getenv("GOOGLE_CLOUD_LOCATION"),
|
||||||
),
|
|
||||||
)
|
)
|
||||||
# You can also register a function_name of None to get all functions
|
# You can also register a function_name of None to get all functions
|
||||||
# sent to the same callback with an additional function_name parameter.
|
# sent to the same callback with an additional function_name parameter.
|
||||||
|
|||||||
182
examples/foundational/14x-function-calling-openpipe.py
Normal file
182
examples/foundational/14x-function-calling-openpipe.py
Normal file
@@ -0,0 +1,182 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2024–2025, Daily
|
||||||
|
#
|
||||||
|
# SPDX-License-Identifier: BSD 2-Clause License
|
||||||
|
#
|
||||||
|
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||||
|
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||||
|
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||||
|
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||||
|
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||||
|
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||||
|
from pipecat.frames.frames import LLMRunFrame, TTSSpeakFrame
|
||||||
|
from pipecat.pipeline.pipeline import Pipeline
|
||||||
|
from pipecat.pipeline.runner import PipelineRunner
|
||||||
|
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||||
|
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||||
|
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||||
|
from pipecat.runner.types import RunnerArguments
|
||||||
|
from pipecat.runner.utils import create_transport
|
||||||
|
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||||
|
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||||
|
from pipecat.services.llm_service import FunctionCallParams
|
||||||
|
from pipecat.services.openpipe.llm import OpenPipeLLMService
|
||||||
|
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||||
|
from pipecat.transports.daily.transport import DailyParams
|
||||||
|
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||||
|
|
||||||
|
load_dotenv(override=True)
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_weather_from_api(params: FunctionCallParams):
    """Answer a weather function call with a fixed, hard-coded payload.

    No external service is contacted; the canned result is handed straight to
    ``params.result_callback``.
    """
    canned_weather = {"conditions": "nice", "temperature": "75"}
    await params.result_callback(canned_weather)
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_restaurant_recommendation(params: FunctionCallParams):
    """Answer a restaurant function call with a fixed, hard-coded payload.

    No external service is contacted; the canned result is handed straight to
    ``params.result_callback``.
    """
    canned_recommendation = {"name": "The Golden Dragon"}
    await params.result_callback(canned_recommendation)
|
||||||
|
|
||||||
|
|
||||||
|
def _audio_transport_kwargs():
    """Build the audio settings shared by every transport in this example.

    New analyzer objects are created on every call, so nothing is
    instantiated until one of the ``transport_params`` factories is invoked.
    """
    return {
        "audio_in_enabled": True,
        "audio_out_enabled": True,
        "vad_analyzer": SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
        "turn_analyzer": LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
    }


# We store functions so objects (e.g. SileroVADAnalyzer) don't get
# instantiated. The function will be called when the desired transport gets
# selected.
transport_params = {
    "daily": lambda: DailyParams(**_audio_transport_kwargs()),
    "twilio": lambda: FastAPIWebsocketParams(**_audio_transport_kwargs()),
    "webrtc": lambda: TransportParams(**_audio_transport_kwargs()),
}
|
||||||
|
|
||||||
|
|
||||||
|
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    """Run the OpenPipe function-calling voice bot on the given transport.

    Builds a pipeline of transport input -> Deepgram STT -> user context
    aggregator -> OpenPipe LLM -> Cartesia TTS -> transport output ->
    assistant context aggregator. Two functions (weather and restaurant
    recommendation) are registered on the LLM and advertised to it through
    the context's tool schema. Connect/disconnect handlers are registered on
    the transport and the pipeline runner is awaited.

    Args:
        transport: Transport whose ``input()`` and ``output()`` processors
            bracket the pipeline and which emits the client events.
        runner_args: Supplies ``pipeline_idle_timeout_secs`` for the task and
            ``handle_sigint`` for the runner.
    """
    logger.info("Starting bot")

    stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))

    tts = CartesiaTTSService(
        api_key=os.getenv("CARTESIA_API_KEY"),
        voice_id="71a7ad14-091c-4e8e-a314-022ece01c121",  # British Reading Lady
    )

    # The start time (whole seconds) is used to build the conversation_id tag.
    timestamp = int(time.time())
    llm = OpenPipeLLMService(
        api_key=os.getenv("OPENAI_API_KEY"),
        openpipe_api_key=os.getenv("OPENPIPE_API_KEY"),
        tags={"conversation_id": f"pipecat-{timestamp}"},
    )

    # You can also register a function_name of None to get all functions
    # sent to the same callback with an additional function_name parameter.
    llm.register_function("get_current_weather", fetch_weather_from_api)
    llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)

    @llm.event_handler("on_function_calls_started")
    async def on_function_calls_started(service, function_calls):
        # Queue a short spoken acknowledgement when function calls start.
        await tts.queue_frame(TTSSpeakFrame("Let me check on that."))

    weather_function = FunctionSchema(
        name="get_current_weather",
        description="Get the current weather",
        properties={
            "location": {
                "type": "string",
                "description": "The city and state, e.g. San Francisco, CA",
            },
            "format": {
                "type": "string",
                "enum": ["celsius", "fahrenheit"],
                "description": "The temperature unit to use. Infer this from the user's location.",
            },
        },
        required=["location", "format"],
    )
    restaurant_function = FunctionSchema(
        name="get_restaurant_recommendation",
        description="Get a restaurant recommendation",
        properties={
            "location": {
                "type": "string",
                "description": "The city and state, e.g. San Francisco, CA",
            },
        },
        required=["location"],
    )
    tools = ToolsSchema(standard_tools=[weather_function, restaurant_function])

    messages = [
        {
            "role": "system",
            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
        },
    ]

    context = LLMContext(messages, tools)
    context_aggregator = LLMContextAggregatorPair(context)

    pipeline = Pipeline(
        [
            transport.input(),
            stt,
            context_aggregator.user(),
            llm,
            tts,
            transport.output(),
            context_aggregator.assistant(),
        ]
    )

    task = PipelineTask(
        pipeline,
        params=PipelineParams(
            enable_metrics=True,
            enable_usage_metrics=True,
        ),
        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
    )

    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        logger.info("Client connected")
        # Kick off the conversation.
        await task.queue_frames([LLMRunFrame()])

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(transport, client):
        logger.info("Client disconnected")
        # Cancel the pipeline task once the client has disconnected.
        await task.cancel()

    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)

    await runner.run(task)
|
||||||
|
|
||||||
|
|
||||||
|
async def bot(runner_args: RunnerArguments):
    """Main bot entry point compatible with Pipecat Cloud.

    Creates the transport from ``runner_args`` and the module-level
    ``transport_params`` table, then runs the bot on it.
    """
    selected_transport = await create_transport(runner_args, transport_params)
    await run_bot(selected_transport, runner_args)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # The runner is imported inside the guard, so it is only loaded when this
    # file is executed as a script.
    from pipecat.runner.run import main

    main()
|
||||||
@@ -26,7 +26,11 @@ from pipecat.services.deepgram.stt import DeepgramSTTService
|
|||||||
from pipecat.services.deepgram.tts import DeepgramTTSService
|
from pipecat.services.deepgram.tts import DeepgramTTSService
|
||||||
from pipecat.services.openai.llm import OpenAILLMService
|
from pipecat.services.openai.llm import OpenAILLMService
|
||||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||||
from pipecat.transports.daily.transport import DailyParams, DailyTransportMessageFrame
|
from pipecat.transports.daily.transport import (
|
||||||
|
DailyOutputTransportMessageFrame,
|
||||||
|
DailyOutputTransportMessageUrgentFrame,
|
||||||
|
DailyParams,
|
||||||
|
)
|
||||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||||
|
|
||||||
load_dotenv(override=True)
|
load_dotenv(override=True)
|
||||||
@@ -128,14 +132,14 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
|||||||
logger.debug(f"Received latency ping app message: {message}")
|
logger.debug(f"Received latency ping app message: {message}")
|
||||||
ts = message["latency-ping"]["ts"]
|
ts = message["latency-ping"]["ts"]
|
||||||
# Send immediately
|
# Send immediately
|
||||||
transport.output().send_message(
|
await task.queue_frame(
|
||||||
DailyTransportMessageFrame(
|
DailyOutputTransportMessageUrgentFrame(
|
||||||
message={"latency-pong-msg-handler": {"ts": ts}}, participant_id=sender
|
message={"latency-pong-msg-handler": {"ts": ts}}, participant_id=sender
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
# And push to the pipeline for the Daily transport.output to send
|
# And push to the pipeline for the Daily transport.output to send
|
||||||
await task.queue_frame(
|
await task.queue_frame(
|
||||||
DailyTransportMessageFrame(
|
DailyOutputTransportMessageFrame(
|
||||||
message={"latency-pong-pipeline-delivery": {"ts": ts}},
|
message={"latency-pong-pipeline-delivery": {"ts": ts}},
|
||||||
participant_id=sender,
|
participant_id=sender,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -24,14 +24,15 @@ from pipecat.processors.transcript_processor import TranscriptProcessor
|
|||||||
from pipecat.runner.types import RunnerArguments
|
from pipecat.runner.types import RunnerArguments
|
||||||
from pipecat.runner.utils import create_transport
|
from pipecat.runner.utils import create_transport
|
||||||
from pipecat.services.llm_service import FunctionCallParams
|
from pipecat.services.llm_service import FunctionCallParams
|
||||||
from pipecat.services.openai_realtime import (
|
from pipecat.services.openai.realtime.events import (
|
||||||
|
AudioConfiguration,
|
||||||
|
AudioInput,
|
||||||
InputAudioNoiseReduction,
|
InputAudioNoiseReduction,
|
||||||
InputAudioTranscription,
|
InputAudioTranscription,
|
||||||
OpenAIRealtimeLLMService,
|
|
||||||
SemanticTurnDetection,
|
SemanticTurnDetection,
|
||||||
SessionProperties,
|
SessionProperties,
|
||||||
)
|
)
|
||||||
from pipecat.services.openai_realtime.events import AudioConfiguration, AudioInput
|
from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService
|
||||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||||
from pipecat.transports.daily.transport import DailyParams
|
from pipecat.transports.daily.transport import DailyParams
|
||||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||||
|
|||||||
@@ -21,13 +21,14 @@ from pipecat.pipeline.task import PipelineParams, PipelineTask
|
|||||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||||
from pipecat.runner.types import RunnerArguments
|
from pipecat.runner.types import RunnerArguments
|
||||||
from pipecat.runner.utils import create_transport
|
from pipecat.runner.utils import create_transport
|
||||||
|
from pipecat.services.azure.realtime.llm import AzureRealtimeLLMService
|
||||||
from pipecat.services.llm_service import FunctionCallParams
|
from pipecat.services.llm_service import FunctionCallParams
|
||||||
from pipecat.services.openai_realtime import (
|
from pipecat.services.openai.realtime.events import (
|
||||||
AzureRealtimeLLMService,
|
AudioConfiguration,
|
||||||
|
AudioInput,
|
||||||
InputAudioTranscription,
|
InputAudioTranscription,
|
||||||
SessionProperties,
|
SessionProperties,
|
||||||
)
|
)
|
||||||
from pipecat.services.openai_realtime.events import AudioConfiguration, AudioInput
|
|
||||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||||
from pipecat.transports.daily.transport import DailyParams
|
from pipecat.transports.daily.transport import DailyParams
|
||||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||||
|
|||||||
@@ -22,16 +22,17 @@ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
|||||||
from pipecat.processors.transcript_processor import TranscriptProcessor
|
from pipecat.processors.transcript_processor import TranscriptProcessor
|
||||||
from pipecat.runner.types import RunnerArguments
|
from pipecat.runner.types import RunnerArguments
|
||||||
from pipecat.runner.utils import create_transport
|
from pipecat.runner.utils import create_transport
|
||||||
from pipecat.services.cartesia import CartesiaTTSService
|
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||||
from pipecat.services.llm_service import FunctionCallParams
|
from pipecat.services.llm_service import FunctionCallParams
|
||||||
from pipecat.services.openai_realtime import (
|
from pipecat.services.openai.realtime.events import (
|
||||||
|
AudioConfiguration,
|
||||||
|
AudioInput,
|
||||||
InputAudioNoiseReduction,
|
InputAudioNoiseReduction,
|
||||||
InputAudioTranscription,
|
InputAudioTranscription,
|
||||||
OpenAIRealtimeLLMService,
|
|
||||||
SemanticTurnDetection,
|
SemanticTurnDetection,
|
||||||
SessionProperties,
|
SessionProperties,
|
||||||
)
|
)
|
||||||
from pipecat.services.openai_realtime.events import AudioConfiguration, AudioInput
|
from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService
|
||||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||||
from pipecat.transports.daily.transport import DailyParams
|
from pipecat.transports.daily.transport import DailyParams
|
||||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||||
|
|||||||
@@ -25,13 +25,14 @@ from pipecat.runner.types import RunnerArguments
|
|||||||
from pipecat.runner.utils import create_transport
|
from pipecat.runner.utils import create_transport
|
||||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||||
from pipecat.services.llm_service import FunctionCallParams
|
from pipecat.services.llm_service import FunctionCallParams
|
||||||
from pipecat.services.openai_realtime import (
|
from pipecat.services.openai.realtime.events import (
|
||||||
|
AudioConfiguration,
|
||||||
|
AudioInput,
|
||||||
InputAudioTranscription,
|
InputAudioTranscription,
|
||||||
OpenAIRealtimeLLMService,
|
|
||||||
SessionProperties,
|
SessionProperties,
|
||||||
TurnDetection,
|
TurnDetection,
|
||||||
)
|
)
|
||||||
from pipecat.services.openai_realtime.events import AudioConfiguration, AudioInput
|
from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService
|
||||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||||
from pipecat.transports.daily.transport import DailyParams
|
from pipecat.transports.daily.transport import DailyParams
|
||||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||||
|
|||||||
@@ -72,7 +72,6 @@ async def save_conversation(params: FunctionCallParams):
|
|||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
with open(filename, "w") as file:
|
with open(filename, "w") as file:
|
||||||
# todo: extract 'system' into the first message in the list
|
|
||||||
messages = params.context.get_messages()
|
messages = params.context.get_messages()
|
||||||
# remove the last message, which is the instruction we just gave to save the conversation
|
# remove the last message, which is the instruction we just gave to save the conversation
|
||||||
messages.pop()
|
messages.pop()
|
||||||
|
|||||||
@@ -90,7 +90,6 @@ async def save_conversation(params: FunctionCallParams):
|
|||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
with open(filename, "w") as file:
|
with open(filename, "w") as file:
|
||||||
# todo: extract 'system' into the first message in the list
|
|
||||||
messages = params.context.get_messages()
|
messages = params.context.get_messages()
|
||||||
# remove the last message (the instruction to save the context)
|
# remove the last message (the instruction to save the context)
|
||||||
messages.pop()
|
messages.pop()
|
||||||
|
|||||||
@@ -20,10 +20,12 @@ from pipecat.frames.frames import LLMRunFrame
|
|||||||
from pipecat.pipeline.pipeline import Pipeline
|
from pipecat.pipeline.pipeline import Pipeline
|
||||||
from pipecat.pipeline.runner import PipelineRunner
|
from pipecat.pipeline.runner import PipelineRunner
|
||||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||||
|
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||||
|
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||||
from pipecat.runner.types import RunnerArguments
|
from pipecat.runner.types import RunnerArguments
|
||||||
from pipecat.runner.utils import create_transport
|
from pipecat.runner.utils import create_transport
|
||||||
from pipecat.services.aws_nova_sonic.aws import AWSNovaSonicLLMService
|
from pipecat.services.aws.nova_sonic.llm import AWSNovaSonicLLMService
|
||||||
from pipecat.services.llm_service import FunctionCallParams
|
from pipecat.services.llm_service import FunctionCallParams
|
||||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||||
from pipecat.transports.daily.transport import DailyParams
|
from pipecat.transports.daily.transport import DailyParams
|
||||||
@@ -75,7 +77,7 @@ async def save_conversation(params: FunctionCallParams):
|
|||||||
filename = f"{BASE_FILENAME}{timestamp}.json"
|
filename = f"{BASE_FILENAME}{timestamp}.json"
|
||||||
try:
|
try:
|
||||||
with open(filename, "w") as file:
|
with open(filename, "w") as file:
|
||||||
messages = params.context.get_messages_for_persistent_storage()
|
messages = params.context.get_messages()
|
||||||
# remove the last few messages. in reverse order, they are:
|
# remove the last few messages. in reverse order, they are:
|
||||||
# - the in progress save tool call
|
# - the in progress save tool call
|
||||||
# - the invocation of the save tool call
|
# - the invocation of the save tool call
|
||||||
@@ -223,13 +225,13 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
|||||||
llm.register_function("get_saved_conversation_filenames", get_saved_conversation_filenames)
|
llm.register_function("get_saved_conversation_filenames", get_saved_conversation_filenames)
|
||||||
llm.register_function("load_conversation", load_conversation)
|
llm.register_function("load_conversation", load_conversation)
|
||||||
|
|
||||||
context = OpenAILLMContext(
|
context = LLMContext(
|
||||||
messages=[
|
messages=[
|
||||||
{"role": "system", "content": f"{system_instruction}"},
|
{"role": "system", "content": f"{system_instruction}"},
|
||||||
],
|
],
|
||||||
tools=tools,
|
tools=tools,
|
||||||
)
|
)
|
||||||
context_aggregator = llm.create_context_aggregator(context)
|
context_aggregator = LLMContextAggregatorPair(context)
|
||||||
|
|
||||||
pipeline = Pipeline(
|
pipeline = Pipeline(
|
||||||
[
|
[
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ from pipecat.pipeline.runner import PipelineRunner
|
|||||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||||
from pipecat.runner.types import RunnerArguments
|
from pipecat.runner.types import RunnerArguments
|
||||||
from pipecat.runner.utils import create_transport
|
from pipecat.runner.utils import create_transport
|
||||||
from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
|
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||||
from pipecat.transports.daily.transport import DailyParams
|
from pipecat.transports.daily.transport import DailyParams
|
||||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||||
@@ -65,7 +65,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
|||||||
Respond to what the user said in a creative and helpful way.
|
Respond to what the user said in a creative and helpful way.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
llm = GeminiMultimodalLiveLLMService(
|
llm = GeminiLiveLLMService(
|
||||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||||
system_instruction=system_instruction,
|
system_instruction=system_instruction,
|
||||||
voice_id="Puck", # Aoede, Charon, Fenrir, Kore, Puck
|
voice_id="Puck", # Aoede, Charon, Fenrir, Kore, Puck
|
||||||
@@ -20,7 +20,7 @@ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
|||||||
from pipecat.processors.transcript_processor import TranscriptProcessor
|
from pipecat.processors.transcript_processor import TranscriptProcessor
|
||||||
from pipecat.runner.types import RunnerArguments
|
from pipecat.runner.types import RunnerArguments
|
||||||
from pipecat.runner.utils import create_transport
|
from pipecat.runner.utils import create_transport
|
||||||
from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
|
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||||
from pipecat.transports.daily.transport import DailyParams
|
from pipecat.transports.daily.transport import DailyParams
|
||||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||||
@@ -65,7 +65,7 @@ transport_params = {
|
|||||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||||
logger.info(f"Starting bot")
|
logger.info(f"Starting bot")
|
||||||
|
|
||||||
llm = GeminiMultimodalLiveLLMService(
|
llm = GeminiLiveLLMService(
|
||||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||||
voice_id="Aoede", # Puck, Charon, Kore, Fenrir, Aoede
|
voice_id="Aoede", # Puck, Charon, Kore, Fenrir, Aoede
|
||||||
# system_instruction="Talk like a pirate."
|
# system_instruction="Talk like a pirate."
|
||||||
@@ -22,7 +22,7 @@ from pipecat.pipeline.task import PipelineParams, PipelineTask
|
|||||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||||
from pipecat.runner.types import RunnerArguments
|
from pipecat.runner.types import RunnerArguments
|
||||||
from pipecat.runner.utils import create_transport
|
from pipecat.runner.utils import create_transport
|
||||||
from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
|
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||||
from pipecat.services.llm_service import FunctionCallParams
|
from pipecat.services.llm_service import FunctionCallParams
|
||||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||||
from pipecat.transports.daily.transport import DailyParams
|
from pipecat.transports.daily.transport import DailyParams
|
||||||
@@ -122,12 +122,15 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
|||||||
required=["location"],
|
required=["location"],
|
||||||
)
|
)
|
||||||
search_tool = {"google_search": {}}
|
search_tool = {"google_search": {}}
|
||||||
|
# KNOWN ISSUE: If using GeminiLiveVertexLLMService, it appears
|
||||||
|
# you cannot use the "google_search" tool alongside other tools.
|
||||||
|
# See https://github.com/googleapis/python-genai/issues/941.
|
||||||
tools = ToolsSchema(
|
tools = ToolsSchema(
|
||||||
standard_tools=[weather_function, restaurant_function],
|
standard_tools=[weather_function, restaurant_function],
|
||||||
custom_tools={AdapterType.GEMINI: [search_tool]},
|
custom_tools={AdapterType.GEMINI: [search_tool]},
|
||||||
)
|
)
|
||||||
|
|
||||||
llm = GeminiMultimodalLiveLLMService(
|
llm = GeminiLiveLLMService(
|
||||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||||
system_instruction=system_instruction,
|
system_instruction=system_instruction,
|
||||||
tools=tools,
|
tools=tools,
|
||||||
@@ -24,7 +24,7 @@ from pipecat.runner.utils import (
|
|||||||
maybe_capture_participant_camera,
|
maybe_capture_participant_camera,
|
||||||
maybe_capture_participant_screen,
|
maybe_capture_participant_screen,
|
||||||
)
|
)
|
||||||
from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
|
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||||
from pipecat.transports.daily.transport import DailyParams
|
from pipecat.transports.daily.transport import DailyParams
|
||||||
|
|
||||||
@@ -58,7 +58,7 @@ transport_params = {
|
|||||||
|
|
||||||
|
|
||||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||||
llm = GeminiMultimodalLiveLLMService(
|
llm = GeminiLiveLLMService(
|
||||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||||
voice_id="Aoede", # Puck, Charon, Kore, Fenrir, Aoede
|
voice_id="Aoede", # Puck, Charon, Kore, Fenrir, Aoede
|
||||||
# system_instruction="Talk like a pirate."
|
# system_instruction="Talk like a pirate."
|
||||||
@@ -20,9 +20,9 @@ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
|||||||
from pipecat.runner.types import RunnerArguments
|
from pipecat.runner.types import RunnerArguments
|
||||||
from pipecat.runner.utils import create_transport
|
from pipecat.runner.utils import create_transport
|
||||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||||
from pipecat.services.gemini_multimodal_live.gemini import (
|
from pipecat.services.google.gemini_live.llm import (
|
||||||
GeminiMultimodalLiveLLMService,
|
GeminiLiveLLMService,
|
||||||
GeminiMultimodalModalities,
|
GeminiModalities,
|
||||||
InputParams,
|
InputParams,
|
||||||
)
|
)
|
||||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||||
@@ -80,11 +80,15 @@ transport_params = {
|
|||||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||||
logger.info(f"Starting bot")
|
logger.info(f"Starting bot")
|
||||||
|
|
||||||
llm = GeminiMultimodalLiveLLMService(
|
# KNOWN ISSUE: If using GeminiLiveVertexLLMService, you cannot specify a
|
||||||
|
# modality other than AUDIO (at least not if using the service's default
|
||||||
|
# model, which is a native audio model:
|
||||||
|
# https://cloud.google.com/vertex-ai/generative-ai/docs/live-api/tools#native-audio).
|
||||||
|
llm = GeminiLiveLLMService(
|
||||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||||
system_instruction=SYSTEM_INSTRUCTION,
|
system_instruction=SYSTEM_INSTRUCTION,
|
||||||
tools=[{"google_search": {}}, {"code_execution": {}}],
|
tools=[{"google_search": {}}, {"code_execution": {}}],
|
||||||
params=InputParams(modalities=GeminiMultimodalModalities.TEXT),
|
params=InputParams(modalities=GeminiModalities.TEXT),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Optionally, you can set the response modalities via a function
|
# Optionally, you can set the response modalities via a function
|
||||||
@@ -19,7 +19,7 @@ from pipecat.pipeline.task import PipelineParams, PipelineTask
|
|||||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||||
from pipecat.runner.types import RunnerArguments
|
from pipecat.runner.types import RunnerArguments
|
||||||
from pipecat.runner.utils import create_transport
|
from pipecat.runner.utils import create_transport
|
||||||
from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
|
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||||
from pipecat.transports.daily.transport import DailyParams
|
from pipecat.transports.daily.transport import DailyParams
|
||||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||||
@@ -83,7 +83,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
|||||||
logger.info(f"Starting bot")
|
logger.info(f"Starting bot")
|
||||||
|
|
||||||
# Initialize the Gemini Multimodal Live model
|
# Initialize the Gemini Multimodal Live model
|
||||||
llm = GeminiMultimodalLiveLLMService(
|
llm = GeminiLiveLLMService(
|
||||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||||
voice_id="Puck", # Aoede, Charon, Fenrir, Kore, Puck
|
voice_id="Puck", # Aoede, Charon, Fenrir, Kore, Puck
|
||||||
system_instruction=system_instruction,
|
system_instruction=system_instruction,
|
||||||
@@ -19,9 +19,7 @@ from pipecat.pipeline.task import PipelineParams, PipelineTask
|
|||||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||||
from pipecat.runner.types import RunnerArguments
|
from pipecat.runner.types import RunnerArguments
|
||||||
from pipecat.runner.utils import create_transport
|
from pipecat.runner.utils import create_transport
|
||||||
from pipecat.services.gemini_multimodal_live.gemini import (
|
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||||
GeminiMultimodalLiveLLMService,
|
|
||||||
)
|
|
||||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||||
from pipecat.transports.daily.transport import DailyParams
|
from pipecat.transports.daily.transport import DailyParams
|
||||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||||
@@ -110,7 +108,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
# Initialize Gemini service with File API support
|
# Initialize Gemini service with File API support
|
||||||
llm = GeminiMultimodalLiveLLMService(
|
llm = GeminiLiveLLMService(
|
||||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||||
system_instruction=system_instruction,
|
system_instruction=system_instruction,
|
||||||
voice_id="Charon", # Aoede, Charon, Fenrir, Kore, Puck
|
voice_id="Charon", # Aoede, Charon, Fenrir, Kore, Puck
|
||||||
@@ -9,13 +9,13 @@ from pipecat.audio.vad.vad_analyzer import VADParams
|
|||||||
from pipecat.frames.frames import Frame, LLMRunFrame
|
from pipecat.frames.frames import Frame, LLMRunFrame
|
||||||
from pipecat.pipeline.pipeline import Pipeline
|
from pipecat.pipeline.pipeline import Pipeline
|
||||||
from pipecat.pipeline.runner import PipelineRunner
|
from pipecat.pipeline.runner import PipelineRunner
|
||||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
from pipecat.pipeline.task import PipelineTask
|
||||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||||
from pipecat.runner.types import RunnerArguments
|
from pipecat.runner.types import RunnerArguments
|
||||||
from pipecat.runner.utils import create_transport
|
from pipecat.runner.utils import create_transport
|
||||||
from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
|
|
||||||
from pipecat.services.google.frames import LLMSearchResponseFrame
|
from pipecat.services.google.frames import LLMSearchResponseFrame
|
||||||
|
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||||
from pipecat.transports.daily.transport import DailyParams
|
from pipecat.transports.daily.transport import DailyParams
|
||||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||||
@@ -105,7 +105,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
|||||||
custom_tools={AdapterType.GEMINI: [{"google_search": {}}, {"code_execution": {}}]},
|
custom_tools={AdapterType.GEMINI: [{"google_search": {}}, {"code_execution": {}}]},
|
||||||
)
|
)
|
||||||
|
|
||||||
llm = GeminiMultimodalLiveLLMService(
|
llm = GeminiLiveLLMService(
|
||||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||||
system_instruction=SYSTEM_INSTRUCTION,
|
system_instruction=SYSTEM_INSTRUCTION,
|
||||||
voice_id="Charon", # Aoede, Charon, Fenrir, Kore, Puck
|
voice_id="Charon", # Aoede, Charon, Fenrir, Kore, Puck
|
||||||
191
examples/foundational/26h-gemini-live-vertex-function-calling.py
Normal file
191
examples/foundational/26h-gemini-live-vertex-function-calling.py
Normal file
@@ -0,0 +1,191 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2024–2025, Daily
|
||||||
|
#
|
||||||
|
# SPDX-License-Identifier: BSD 2-Clause License
|
||||||
|
#
|
||||||
|
|
||||||
|
|
||||||
|
import os
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from google.genai.types import HttpOptions
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||||
|
from pipecat.adapters.schemas.tools_schema import AdapterType, ToolsSchema
|
||||||
|
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||||
|
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||||
|
from pipecat.frames.frames import LLMRunFrame
|
||||||
|
from pipecat.pipeline.pipeline import Pipeline
|
||||||
|
from pipecat.pipeline.runner import PipelineRunner
|
||||||
|
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||||
|
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||||
|
from pipecat.runner.types import RunnerArguments
|
||||||
|
from pipecat.runner.utils import create_transport
|
||||||
|
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||||
|
from pipecat.services.google.gemini_live.llm_vertex import GeminiLiveVertexLLMService
|
||||||
|
from pipecat.services.llm_service import FunctionCallParams
|
||||||
|
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||||
|
from pipecat.transports.daily.transport import DailyParams
|
||||||
|
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||||
|
|
||||||
|
load_dotenv(override=True)
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_weather_from_api(params: FunctionCallParams):
|
||||||
|
temperature = 75 if params.arguments["format"] == "fahrenheit" else 24
|
||||||
|
await params.result_callback(
|
||||||
|
{
|
||||||
|
"conditions": "nice",
|
||||||
|
"temperature": temperature,
|
||||||
|
"format": params.arguments["format"],
|
||||||
|
"timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_restaurant_recommendation(params: FunctionCallParams):
|
||||||
|
await params.result_callback({"name": "The Golden Dragon"})
|
||||||
|
|
||||||
|
|
||||||
|
system_instruction = """
|
||||||
|
You are a helpful assistant who can answer questions and use tools.
|
||||||
|
|
||||||
|
You have two tools available to you:
|
||||||
|
1. get_current_weather: Use this tool to get the current weather in a specific location.
|
||||||
|
2. get_restaurant_recommendation: Use this tool to get a restaurant recommendation in a specific location.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||||
|
# instantiated. The function will be called when the desired transport gets
|
||||||
|
# selected.
|
||||||
|
transport_params = {
|
||||||
|
"daily": lambda: DailyParams(
|
||||||
|
audio_in_enabled=True,
|
||||||
|
audio_out_enabled=True,
|
||||||
|
# set stop_secs to something roughly similar to the internal setting
|
||||||
|
# of the Multimodal Live api, just to align events. This doesn't really
|
||||||
|
# matter because we can only use the Multimodal Live API's phrase
|
||||||
|
# endpointing, for now.
|
||||||
|
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
|
||||||
|
),
|
||||||
|
"twilio": lambda: FastAPIWebsocketParams(
|
||||||
|
audio_in_enabled=True,
|
||||||
|
audio_out_enabled=True,
|
||||||
|
# set stop_secs to something roughly similar to the internal setting
|
||||||
|
# of the Multimodal Live api, just to align events. This doesn't really
|
||||||
|
# matter because we can only use the Multimodal Live API's phrase
|
||||||
|
# endpointing, for now.
|
||||||
|
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
|
||||||
|
),
|
||||||
|
"webrtc": lambda: TransportParams(
|
||||||
|
audio_in_enabled=True,
|
||||||
|
audio_out_enabled=True,
|
||||||
|
# set stop_secs to something roughly similar to the internal setting
|
||||||
|
# of the Multimodal Live api, just to align events. This doesn't really
|
||||||
|
# matter because we can only use the Multimodal Live API's phrase
|
||||||
|
# endpointing, for now.
|
||||||
|
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||||
|
logger.info(f"Starting bot")
|
||||||
|
|
||||||
|
weather_function = FunctionSchema(
|
||||||
|
name="get_current_weather",
|
||||||
|
description="Get the current weather",
|
||||||
|
properties={
|
||||||
|
"location": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The city and state, e.g. San Francisco, CA",
|
||||||
|
},
|
||||||
|
"format": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["celsius", "fahrenheit"],
|
||||||
|
"description": "The temperature unit to use. Infer this from the user's location.",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
required=["location", "format"],
|
||||||
|
)
|
||||||
|
restaurant_function = FunctionSchema(
|
||||||
|
name="get_restaurant_recommendation",
|
||||||
|
description="Get a restaurant recommendation",
|
||||||
|
properties={
|
||||||
|
"location": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The city and state, e.g. San Francisco, CA",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
required=["location"],
|
||||||
|
)
|
||||||
|
# KNOWN ISSUE: If using GeminiLiveVertexLLMService, it appears
|
||||||
|
# you cannot use the "google_search" tool alongside other tools.
|
||||||
|
# See https://github.com/googleapis/python-genai/issues/941.
|
||||||
|
tools = ToolsSchema(standard_tools=[weather_function, restaurant_function])
|
||||||
|
|
||||||
|
llm = GeminiLiveVertexLLMService(
|
||||||
|
credentials=os.getenv("GOOGLE_VERTEX_TEST_CREDENTIALS"),
|
||||||
|
project_id=os.getenv("GOOGLE_CLOUD_PROJECT_ID"),
|
||||||
|
location=os.getenv("GOOGLE_CLOUD_LOCATION"),
|
||||||
|
system_instruction=system_instruction,
|
||||||
|
voice_id="Puck", # Aoede, Charon, Fenrir, Kore, Puck
|
||||||
|
tools=tools,
|
||||||
|
)
|
||||||
|
|
||||||
|
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||||
|
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
|
||||||
|
|
||||||
|
context = OpenAILLMContext(
|
||||||
|
[{"role": "user", "content": "Say hello."}],
|
||||||
|
)
|
||||||
|
context_aggregator = llm.create_context_aggregator(context)
|
||||||
|
|
||||||
|
pipeline = Pipeline(
|
||||||
|
[
|
||||||
|
transport.input(),
|
||||||
|
context_aggregator.user(),
|
||||||
|
llm,
|
||||||
|
transport.output(),
|
||||||
|
context_aggregator.assistant(),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
task = PipelineTask(
|
||||||
|
pipeline,
|
||||||
|
params=PipelineParams(
|
||||||
|
enable_metrics=True,
|
||||||
|
enable_usage_metrics=True,
|
||||||
|
),
|
||||||
|
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||||
|
)
|
||||||
|
|
||||||
|
@transport.event_handler("on_client_connected")
|
||||||
|
async def on_client_connected(transport, client):
|
||||||
|
logger.info(f"Client connected")
|
||||||
|
# Kick off the conversation.
|
||||||
|
await task.queue_frames([LLMRunFrame()])
|
||||||
|
|
||||||
|
@transport.event_handler("on_client_disconnected")
|
||||||
|
async def on_client_disconnected(transport, client):
|
||||||
|
logger.info(f"Client disconnected")
|
||||||
|
await task.cancel()
|
||||||
|
|
||||||
|
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||||
|
|
||||||
|
await runner.run(task)
|
||||||
|
|
||||||
|
|
||||||
|
async def bot(runner_args: RunnerArguments):
|
||||||
|
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||||
|
transport = await create_transport(runner_args, transport_params)
|
||||||
|
await run_bot(transport, runner_args)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
from pipecat.runner.run import main
|
||||||
|
|
||||||
|
main()
|
||||||
204
examples/foundational/26i-gemini-live-graceful-end.py
Normal file
204
examples/foundational/26i-gemini-live-graceful-end.py
Normal file
@@ -0,0 +1,204 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2024–2025, Daily
|
||||||
|
#
|
||||||
|
# SPDX-License-Identifier: BSD 2-Clause License
|
||||||
|
#
|
||||||
|
|
||||||
|
import os
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||||
|
from pipecat.adapters.schemas.tools_schema import AdapterType, ToolsSchema
|
||||||
|
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||||
|
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||||
|
from pipecat.frames.frames import EndTaskFrame, LLMRunFrame
|
||||||
|
from pipecat.pipeline.pipeline import Pipeline
|
||||||
|
from pipecat.pipeline.runner import PipelineRunner
|
||||||
|
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||||
|
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||||
|
from pipecat.processors.frame_processor import FrameDirection
|
||||||
|
from pipecat.runner.types import RunnerArguments
|
||||||
|
from pipecat.runner.utils import create_transport
|
||||||
|
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||||
|
from pipecat.services.llm_service import FunctionCallParams
|
||||||
|
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||||
|
from pipecat.transports.daily.transport import DailyParams
|
||||||
|
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||||
|
|
||||||
|
load_dotenv(override=True)
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_weather_from_api(params: FunctionCallParams):
|
||||||
|
temperature = 75 if params.arguments["format"] == "fahrenheit" else 24
|
||||||
|
await params.result_callback(
|
||||||
|
{
|
||||||
|
"conditions": "nice",
|
||||||
|
"temperature": temperature,
|
||||||
|
"format": params.arguments["format"],
|
||||||
|
"timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_restaurant_recommendation(params: FunctionCallParams):
|
||||||
|
await params.result_callback({"name": "The Golden Dragon"})
|
||||||
|
|
||||||
|
|
||||||
|
async def end_conversation(params: FunctionCallParams):
|
||||||
|
await params.result_callback({"success": True})
|
||||||
|
await params.llm.push_frame(EndTaskFrame(), FrameDirection.UPSTREAM)
|
||||||
|
|
||||||
|
|
||||||
|
system_instruction = """
|
||||||
|
You are a helpful assistant who can answer questions and use tools.
|
||||||
|
|
||||||
|
You have three tools available to you:
|
||||||
|
1. get_current_weather: Use this tool to get the current weather in a specific location.
|
||||||
|
2. get_restaurant_recommendation: Use this tool to get a restaurant recommendation in a specific location.
|
||||||
|
3. end_conversation: Use this tool to gracefully end the conversation.
|
||||||
|
|
||||||
|
After you've responded to the user three times, do two things, in order:
|
||||||
|
1. Politely let them know that that's all the time you have today and say goodbye.
|
||||||
|
2. Call the end_conversation tool to gracefully end the conversation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||||
|
# instantiated. The function will be called when the desired transport gets
|
||||||
|
# selected.
|
||||||
|
transport_params = {
|
||||||
|
"daily": lambda: DailyParams(
|
||||||
|
audio_in_enabled=True,
|
||||||
|
audio_out_enabled=True,
|
||||||
|
# set stop_secs to something roughly similar to the internal setting
|
||||||
|
# of the Multimodal Live api, just to align events. This doesn't really
|
||||||
|
# matter because we can only use the Multimodal Live API's phrase
|
||||||
|
# endpointing, for now.
|
||||||
|
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
|
||||||
|
),
|
||||||
|
"twilio": lambda: FastAPIWebsocketParams(
|
||||||
|
audio_in_enabled=True,
|
||||||
|
audio_out_enabled=True,
|
||||||
|
# set stop_secs to something roughly similar to the internal setting
|
||||||
|
# of the Multimodal Live api, just to align events. This doesn't really
|
||||||
|
# matter because we can only use the Multimodal Live API's phrase
|
||||||
|
# endpointing, for now.
|
||||||
|
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
|
||||||
|
),
|
||||||
|
"webrtc": lambda: TransportParams(
|
||||||
|
audio_in_enabled=True,
|
||||||
|
audio_out_enabled=True,
|
||||||
|
# set stop_secs to something roughly similar to the internal setting
|
||||||
|
# of the Multimodal Live api, just to align events. This doesn't really
|
||||||
|
# matter because we can only use the Multimodal Live API's phrase
|
||||||
|
# endpointing, for now.
|
||||||
|
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||||
|
logger.info(f"Starting bot")
|
||||||
|
|
||||||
|
weather_function = FunctionSchema(
|
||||||
|
name="get_current_weather",
|
||||||
|
description="Get the current weather",
|
||||||
|
properties={
|
||||||
|
"location": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The city and state, e.g. San Francisco, CA",
|
||||||
|
},
|
||||||
|
"format": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["celsius", "fahrenheit"],
|
||||||
|
"description": "The temperature unit to use. Infer this from the user's location.",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
required=["location", "format"],
|
||||||
|
)
|
||||||
|
restaurant_function = FunctionSchema(
|
||||||
|
name="get_restaurant_recommendation",
|
||||||
|
description="Get a restaurant recommendation",
|
||||||
|
properties={
|
||||||
|
"location": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The city and state, e.g. San Francisco, CA",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
required=["location"],
|
||||||
|
)
|
||||||
|
end_conversation_function = FunctionSchema(
|
||||||
|
name="end_conversation",
|
||||||
|
description="Gracefully end the conversation",
|
||||||
|
properties={},
|
||||||
|
required=[],
|
||||||
|
)
|
||||||
|
search_tool = {"google_search": {}}
|
||||||
|
tools = ToolsSchema(
|
||||||
|
standard_tools=[weather_function, restaurant_function, end_conversation_function],
|
||||||
|
custom_tools={AdapterType.GEMINI: [search_tool]},
|
||||||
|
)
|
||||||
|
|
||||||
|
llm = GeminiLiveLLMService(
|
||||||
|
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||||
|
system_instruction=system_instruction,
|
||||||
|
tools=tools,
|
||||||
|
)
|
||||||
|
|
||||||
|
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||||
|
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
|
||||||
|
llm.register_function("end_conversation", end_conversation)
|
||||||
|
|
||||||
|
context = OpenAILLMContext(
|
||||||
|
[{"role": "user", "content": "Say hello."}],
|
||||||
|
)
|
||||||
|
context_aggregator = llm.create_context_aggregator(context)
|
||||||
|
|
||||||
|
pipeline = Pipeline(
|
||||||
|
[
|
||||||
|
transport.input(),
|
||||||
|
context_aggregator.user(),
|
||||||
|
llm,
|
||||||
|
transport.output(),
|
||||||
|
context_aggregator.assistant(),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
task = PipelineTask(
|
||||||
|
pipeline,
|
||||||
|
params=PipelineParams(
|
||||||
|
enable_metrics=True,
|
||||||
|
enable_usage_metrics=True,
|
||||||
|
),
|
||||||
|
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||||
|
)
|
||||||
|
|
||||||
|
@transport.event_handler("on_client_connected")
|
||||||
|
async def on_client_connected(transport, client):
|
||||||
|
logger.info(f"Client connected")
|
||||||
|
# Kick off the conversation.
|
||||||
|
await task.queue_frames([LLMRunFrame()])
|
||||||
|
|
||||||
|
@transport.event_handler("on_client_disconnected")
|
||||||
|
async def on_client_disconnected(transport, client):
|
||||||
|
logger.info(f"Client disconnected")
|
||||||
|
await task.cancel()
|
||||||
|
|
||||||
|
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||||
|
|
||||||
|
await runner.run(task)
|
||||||
|
|
||||||
|
|
||||||
|
async def bot(runner_args: RunnerArguments):
|
||||||
|
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||||
|
transport = await create_transport(runner_args, transport_params)
|
||||||
|
await run_bot(transport, runner_args)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
from pipecat.runner.run import main
|
||||||
|
|
||||||
|
main()
|
||||||
@@ -206,6 +206,14 @@ async def bot(runner_args: RunnerArguments):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
if not os.getenv("NASA_API_KEY"):
|
||||||
|
logger.error(
|
||||||
|
f"Please set NASA_API_KEY environment variable for this example. See https://api.nasa.gov"
|
||||||
|
)
|
||||||
|
import sys
|
||||||
|
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
from pipecat.runner.run import main
|
from pipecat.runner.run import main
|
||||||
|
|
||||||
main()
|
main()
|
||||||
|
|||||||
@@ -141,6 +141,14 @@ async def bot(runner_args: RunnerArguments):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
if not os.getenv("MCP_RUN_SSE_URL"):
|
||||||
|
logger.error(
|
||||||
|
f"Please set MCP_RUN_SSE_URL environment variable for this example. See https://mcp.run"
|
||||||
|
)
|
||||||
|
import sys
|
||||||
|
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
from pipecat.runner.run import main
|
from pipecat.runner.run import main
|
||||||
|
|
||||||
main()
|
main()
|
||||||
|
|||||||
@@ -219,6 +219,14 @@ async def bot(runner_args: RunnerArguments):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
if not os.getenv("NASA_API_KEY") or not os.getenv("MCP_RUN_SSE_URL"):
|
||||||
|
logger.error(
|
||||||
|
f"Please set NASA_API_KEY and MCP_RUN_SSE_URL environment variables. See https://api.nasa.gov and https://mcp.run"
|
||||||
|
)
|
||||||
|
import sys
|
||||||
|
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
from pipecat.runner.run import main
|
from pipecat.runner.run import main
|
||||||
|
|
||||||
main()
|
main()
|
||||||
|
|||||||
@@ -145,6 +145,14 @@ async def bot(runner_args: RunnerArguments):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
if not os.getenv("GITHUB_PERSONAL_ACCESS_TOKEN"):
|
||||||
|
logger.error(
|
||||||
|
f"Please set GITHUB_PERSONAL_ACCESS_TOKEN environment variable for this example."
|
||||||
|
)
|
||||||
|
import sys
|
||||||
|
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
from pipecat.runner.run import main
|
from pipecat.runner.run import main
|
||||||
|
|
||||||
main()
|
main()
|
||||||
|
|||||||
@@ -18,10 +18,11 @@ from pipecat.frames.frames import LLMRunFrame
|
|||||||
from pipecat.pipeline.pipeline import Pipeline
|
from pipecat.pipeline.pipeline import Pipeline
|
||||||
from pipecat.pipeline.runner import PipelineRunner
|
from pipecat.pipeline.runner import PipelineRunner
|
||||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||||
|
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||||
from pipecat.runner.types import RunnerArguments
|
from pipecat.runner.types import RunnerArguments
|
||||||
from pipecat.runner.utils import create_transport
|
from pipecat.runner.utils import create_transport
|
||||||
from pipecat.services.aws_nova_sonic import AWSNovaSonicLLMService
|
from pipecat.services.aws.nova_sonic.llm import AWSNovaSonicLLMService
|
||||||
from pipecat.services.llm_service import FunctionCallParams
|
from pipecat.services.llm_service import FunctionCallParams
|
||||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||||
from pipecat.transports.daily.transport import DailyParams
|
from pipecat.transports.daily.transport import DailyParams
|
||||||
@@ -119,9 +120,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
|||||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||||
|
|
||||||
# Set up context and context management.
|
# Set up context and context management.
|
||||||
# AWSNovaSonicService will adapt OpenAI LLM context objects with standard message format to
|
context = LLMContext(
|
||||||
# what's expected by Nova Sonic.
|
|
||||||
context = OpenAILLMContext(
|
|
||||||
messages=[
|
messages=[
|
||||||
{"role": "system", "content": f"{system_instruction}"},
|
{"role": "system", "content": f"{system_instruction}"},
|
||||||
{
|
{
|
||||||
@@ -131,7 +130,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
|||||||
],
|
],
|
||||||
tools=tools,
|
tools=tools,
|
||||||
)
|
)
|
||||||
context_aggregator = llm.create_context_aggregator(context)
|
context_aggregator = LLMContextAggregatorPair(context)
|
||||||
|
|
||||||
# Build the pipeline
|
# Build the pipeline
|
||||||
pipeline = Pipeline(
|
pipeline = Pipeline(
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
|||||||
from pipecat.processors.frameworks.rtvi import RTVIObserver, RTVIProcessor
|
from pipecat.processors.frameworks.rtvi import RTVIObserver, RTVIProcessor
|
||||||
from pipecat.runner.types import RunnerArguments
|
from pipecat.runner.types import RunnerArguments
|
||||||
from pipecat.runner.utils import create_transport
|
from pipecat.runner.utils import create_transport
|
||||||
from pipecat.services.gemini_multimodal_live import GeminiMultimodalLiveLLMService
|
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||||
from pipecat.transports.base_transport import TransportParams
|
from pipecat.transports.base_transport import TransportParams
|
||||||
from pipecat.transports.daily.transport import DailyParams, DailyTransport
|
from pipecat.transports.daily.transport import DailyParams, DailyTransport
|
||||||
|
|
||||||
@@ -94,7 +94,7 @@ Respond to what the user said in a creative and helpful way. Keep your responses
|
|||||||
|
|
||||||
|
|
||||||
async def run_bot(pipecat_transport):
|
async def run_bot(pipecat_transport):
|
||||||
llm = GeminiMultimodalLiveLLMService(
|
llm = GeminiLiveLLMService(
|
||||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||||
voice_id="Puck", # Aoede, Charon, Fenrir, Kore, Puck
|
voice_id="Puck", # Aoede, Charon, Fenrir, Kore, Puck
|
||||||
transcribe_user_audio=True,
|
transcribe_user_audio=True,
|
||||||
|
|||||||
142
examples/foundational/47-sentry-metrics.py
Normal file
142
examples/foundational/47-sentry-metrics.py
Normal file
@@ -0,0 +1,142 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2024–2025, Daily
|
||||||
|
#
|
||||||
|
# SPDX-License-Identifier: BSD 2-Clause License
|
||||||
|
#
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
import sentry_sdk
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||||
|
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||||
|
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||||
|
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||||
|
from pipecat.frames.frames import LLMRunFrame
|
||||||
|
from pipecat.pipeline.pipeline import Pipeline
|
||||||
|
from pipecat.pipeline.runner import PipelineRunner
|
||||||
|
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||||
|
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||||
|
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||||
|
from pipecat.processors.metrics.sentry import SentryMetrics
|
||||||
|
from pipecat.runner.types import RunnerArguments
|
||||||
|
from pipecat.runner.utils import create_transport
|
||||||
|
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||||
|
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||||
|
from pipecat.services.openai.llm import OpenAILLMService
|
||||||
|
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||||
|
from pipecat.transports.daily.transport import DailyParams
|
||||||
|
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||||
|
|
||||||
|
load_dotenv(override=True)


def _audio_transport_factory(params_cls):
    """Build a deferred constructor for one transport's parameters.

    Args:
        params_cls: Transport parameter class to instantiate (``DailyParams``,
            ``FastAPIWebsocketParams`` or ``TransportParams``).

    Returns:
        A zero-argument callable that creates ``params_cls`` with audio input
        and output enabled, a Silero VAD analyzer (``stop_secs=0.2``) and a
        local smart-turn v3 analyzer. New analyzer objects are created on
        every call.
    """

    def build():
        return params_cls(
            audio_in_enabled=True,
            audio_out_enabled=True,
            vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
            turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
        )

    return build


# The values are factories rather than instances so that objects such as
# SileroVADAnalyzer are only created once the desired transport is selected.
transport_params = {
    "daily": _audio_transport_factory(DailyParams),
    "twilio": _audio_transport_factory(FastAPIWebsocketParams),
    "webrtc": _audio_transport_factory(TransportParams),
}
|
||||||
|
|
||||||
|
|
||||||
|
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    """Run a speech-to-text -> LLM -> text-to-speech bot that reports metrics to Sentry.

    Args:
        transport: Transport supplying the audio input/output processors and
            the client connected/disconnected events.
        runner_args: Runner settings; provides the pipeline idle timeout and
            whether the pipeline runner should handle SIGINT.
    """
    logger.info(f"Starting bot")

    # Initialize Sentry. The DSN is read from the SENTRY_DSN environment variable.
    sentry_sdk.init(dsn=os.getenv("SENTRY_DSN"), traces_sample_rate=1.0)

    # Every service receives its own SentryMetrics instance.
    speech_to_text = DeepgramSTTService(
        api_key=os.getenv("DEEPGRAM_API_KEY"),
        metrics=SentryMetrics(),
    )
    text_to_speech = CartesiaTTSService(
        api_key=os.getenv("CARTESIA_API_KEY"),
        voice_id="71a7ad14-091c-4e8e-a314-022ece01c121",  # British Reading Lady
        metrics=SentryMetrics(),
    )
    language_model = OpenAILLMService(
        api_key=os.getenv("OPENAI_API_KEY"),
        metrics=SentryMetrics(),
    )

    system_prompt = {
        "role": "system",
        "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
    }
    conversation = [system_prompt]

    context = LLMContext(conversation)
    aggregators = LLMContextAggregatorPair(context)

    processors = [
        transport.input(),  # Transport user input
        speech_to_text,
        aggregators.user(),  # User responses
        language_model,  # LLM
        text_to_speech,  # TTS
        transport.output(),  # Transport bot output
        aggregators.assistant(),  # Assistant spoken responses
    ]
    pipeline = Pipeline(processors)

    # Metrics collection must be switched on at the task level as well.
    task_params = PipelineParams(enable_metrics=True, enable_usage_metrics=True)
    task = PipelineTask(
        pipeline,
        params=task_params,
        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
    )

    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        logger.info(f"Client connected")
        # Kick off the conversation.
        # NOTE(review): this appends to the list handed to LLMContext above;
        # presumably the context keeps a reference to that same list - confirm.
        conversation.append(
            {"role": "system", "content": "Please introduce yourself to the user."}
        )
        await task.queue_frames([LLMRunFrame()])

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(transport, client):
        logger.info(f"Client disconnected")
        await task.cancel()

    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
    await runner.run(task)
|
||||||
|
|
||||||
|
|
||||||
|
async def bot(runner_args: RunnerArguments):
    """Entry point for the bot, compatible with Pipecat Cloud.

    Creates the transport described by ``runner_args`` (using the
    ``transport_params`` factories) and then runs the bot on it.

    Args:
        runner_args: Arguments supplied by the runner.
    """
    await run_bot(await create_transport(runner_args, transport_params), runner_args)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # The runner CLI is only imported when this file is executed as a script.
    from pipecat.runner.run import main as run_cli

    run_cli()
|
||||||
@@ -105,7 +105,7 @@ uv run 07-interruptible.py -t twilio -x NGROK_HOST_NAME
|
|||||||
### Vision & Multimodal
|
### Vision & Multimodal
|
||||||
|
|
||||||
- **[12a-describe-video-gemini-flash.py](./12a-describe-video-gemini-flash.py)**: Bot describes user's video (Video input, Multimodal LLMs)
|
- **[12a-describe-video-gemini-flash.py](./12a-describe-video-gemini-flash.py)**: Bot describes user's video (Video input, Multimodal LLMs)
|
||||||
- **[26c-gemini-multimodal-live-video.py](./26c-gemini-multimodal-live-video.py)**: Gemini with video input (Streaming video, Function calls)
|
- **[26c-gemini-live-video.py](./26c-gemini-live-video.py)**: Gemini with video input (Streaming video, Function calls)
|
||||||
|
|
||||||
### Voice & Language
|
### Voice & Language
|
||||||
|
|
||||||
|
|||||||
BIN
examples/foundational/assets/moondream.png
Normal file
BIN
examples/foundational/assets/moondream.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 1.1 MiB |
@@ -4,7 +4,7 @@ version = "0.1.0"
|
|||||||
description = "Quickstart example for building voice AI bots with Pipecat"
|
description = "Quickstart example for building voice AI bots with Pipecat"
|
||||||
requires-python = ">=3.10"
|
requires-python = ">=3.10"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"pipecat-ai[webrtc,daily,silero,deepgram,openai,cartesia,local-smart-turn-v3,runner]>=0.0.85",
|
"pipecat-ai[webrtc,daily,silero,deepgram,openai,cartesia,local-smart-turn-v3,runner]>=0.0.86",
|
||||||
"pipecatcloud>=0.2.4"
|
"pipecatcloud>=0.2.4"
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@@ -34,10 +34,11 @@ dependencies = [
|
|||||||
"pyloudnorm~=0.1.1",
|
"pyloudnorm~=0.1.1",
|
||||||
"resampy~=0.4.3",
|
"resampy~=0.4.3",
|
||||||
"soxr~=0.5.0",
|
"soxr~=0.5.0",
|
||||||
"openai>=1.74.0,<=1.99.1",
|
"openai>=1.74.0,<3",
|
||||||
# Pinning numba to resolve package dependencies
|
# Pinning numba to resolve package dependencies
|
||||||
"numba==0.61.2",
|
"numba==0.61.2",
|
||||||
"wait_for2>=0.4.1; python_version<'3.12'",
|
"wait_for2>=0.4.1; python_version<'3.12'",
|
||||||
|
"pipecat-ai-cli"
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.urls]
|
[project.urls]
|
||||||
@@ -50,23 +51,24 @@ anthropic = [ "anthropic~=0.49.0" ]
|
|||||||
assemblyai = [ "pipecat-ai[websockets-base]" ]
|
assemblyai = [ "pipecat-ai[websockets-base]" ]
|
||||||
asyncai = [ "pipecat-ai[websockets-base]" ]
|
asyncai = [ "pipecat-ai[websockets-base]" ]
|
||||||
aws = [ "aioboto3~=15.0.0", "pipecat-ai[websockets-base]" ]
|
aws = [ "aioboto3~=15.0.0", "pipecat-ai[websockets-base]" ]
|
||||||
aws-nova-sonic = [ "aws_sdk_bedrock_runtime~=0.0.2; python_version>='3.12'" ]
|
aws-nova-sonic = [ "aws_sdk_bedrock_runtime~=0.1.0; python_version>='3.12'" ]
|
||||||
azure = [ "azure-cognitiveservices-speech~=1.42.0"]
|
azure = [ "azure-cognitiveservices-speech~=1.42.0"]
|
||||||
cartesia = [ "cartesia~=2.0.3", "pipecat-ai[websockets-base]" ]
|
cartesia = [ "cartesia~=2.0.3", "pipecat-ai[websockets-base]" ]
|
||||||
cerebras = []
|
cerebras = []
|
||||||
deepseek = []
|
deepseek = []
|
||||||
daily = [ "daily-python~=0.19.9" ]
|
daily = [ "daily-python~=0.20.0" ]
|
||||||
deepgram = [ "deepgram-sdk~=4.7.0" ]
|
deepgram = [ "deepgram-sdk~=4.7.0" ]
|
||||||
elevenlabs = [ "pipecat-ai[websockets-base]" ]
|
elevenlabs = [ "pipecat-ai[websockets-base]" ]
|
||||||
fal = [ "fal-client~=0.5.9" ]
|
fal = [ "fal-client~=0.5.9" ]
|
||||||
fireworks = []
|
fireworks = []
|
||||||
fish = [ "ormsgpack~=1.7.0", "pipecat-ai[websockets-base]" ]
|
fish = [ "ormsgpack~=1.7.0", "pipecat-ai[websockets-base]" ]
|
||||||
gladia = [ "pipecat-ai[websockets-base]" ]
|
gladia = [ "pipecat-ai[websockets-base]" ]
|
||||||
google = [ "google-cloud-speech~=2.32.0", "google-cloud-texttospeech~=2.26.0", "google-genai~=1.24.0", "pipecat-ai[websockets-base]" ]
|
google = [ "google-cloud-speech>=2.33.0,<3", "google-cloud-texttospeech>=2.31.0,<3", "google-genai>=1.41.0,<2", "pipecat-ai[websockets-base]" ]
|
||||||
grok = []
|
grok = []
|
||||||
groq = [ "groq~=0.23.0" ]
|
groq = [ "groq~=0.23.0" ]
|
||||||
gstreamer = [ "pygobject~=3.50.0" ]
|
gstreamer = [ "pygobject~=3.50.0" ]
|
||||||
heygen = [ "livekit>=1.0.13", "pipecat-ai[websockets-base]" ]
|
heygen = [ "livekit>=1.0.13", "pipecat-ai[websockets-base]" ]
|
||||||
|
hume = [ "hume>=0.11.2" ]
|
||||||
inworld = []
|
inworld = []
|
||||||
krisp = [ "pipecat-ai-krisp~=0.4.0" ]
|
krisp = [ "pipecat-ai-krisp~=0.4.0" ]
|
||||||
koala = [ "pvkoala~=2.0.3" ]
|
koala = [ "pvkoala~=2.0.3" ]
|
||||||
@@ -83,7 +85,7 @@ nim = []
|
|||||||
neuphonic = [ "pipecat-ai[websockets-base]" ]
|
neuphonic = [ "pipecat-ai[websockets-base]" ]
|
||||||
noisereduce = [ "noisereduce~=3.0.3" ]
|
noisereduce = [ "noisereduce~=3.0.3" ]
|
||||||
openai = [ "pipecat-ai[websockets-base]" ]
|
openai = [ "pipecat-ai[websockets-base]" ]
|
||||||
openpipe = [ "openpipe~=4.50.0" ]
|
openpipe = [ "openpipe>=4.50.0,<6" ]
|
||||||
openrouter = []
|
openrouter = []
|
||||||
perplexity = []
|
perplexity = []
|
||||||
playht = [ "pipecat-ai[websockets-base]" ]
|
playht = [ "pipecat-ai[websockets-base]" ]
|
||||||
@@ -101,7 +103,7 @@ silero = [ "onnxruntime>=1.20.1,<2" ]
|
|||||||
simli = [ "simli-ai~=0.1.10"]
|
simli = [ "simli-ai~=0.1.10"]
|
||||||
soniox = [ "pipecat-ai[websockets-base]" ]
|
soniox = [ "pipecat-ai[websockets-base]" ]
|
||||||
soundfile = [ "soundfile~=0.13.0" ]
|
soundfile = [ "soundfile~=0.13.0" ]
|
||||||
speechmatics = [ "speechmatics-rt>=0.4.0" ]
|
speechmatics = [ "speechmatics-rt>=0.5.0" ]
|
||||||
strands = [ "strands-agents>=1.9.1,<2" ]
|
strands = [ "strands-agents>=1.9.1,<2" ]
|
||||||
tavus=[]
|
tavus=[]
|
||||||
together = []
|
together = []
|
||||||
|
|||||||
@@ -34,7 +34,8 @@ from pipecat.frames.frames import EndTaskFrame, LLMRunFrame, OutputImageRawFrame
|
|||||||
from pipecat.pipeline.pipeline import Pipeline
|
from pipecat.pipeline.pipeline import Pipeline
|
||||||
from pipecat.pipeline.runner import PipelineRunner
|
from pipecat.pipeline.runner import PipelineRunner
|
||||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||||
|
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||||
from pipecat.processors.audio.audio_buffer_processor import AudioBufferProcessor
|
from pipecat.processors.audio.audio_buffer_processor import AudioBufferProcessor
|
||||||
from pipecat.processors.frame_processor import FrameDirection
|
from pipecat.processors.frame_processor import FrameDirection
|
||||||
from pipecat.runner.types import RunnerArguments
|
from pipecat.runner.types import RunnerArguments
|
||||||
@@ -283,8 +284,8 @@ async def run_eval_pipeline(
|
|||||||
},
|
},
|
||||||
]
|
]
|
||||||
|
|
||||||
context = OpenAILLMContext(messages, tools)
|
context = LLMContext(messages, tools)
|
||||||
context_aggregator = llm.create_context_aggregator(context)
|
context_aggregator = LLMContextAggregatorPair(context)
|
||||||
|
|
||||||
audio_buffer = AudioBufferProcessor()
|
audio_buffer = AudioBufferProcessor()
|
||||||
|
|
||||||
|
|||||||
@@ -67,6 +67,7 @@ TESTS_07 = [
|
|||||||
("07ac-interruptible-asyncai-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
("07ac-interruptible-asyncai-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||||
("07b-interruptible-langchain.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
("07b-interruptible-langchain.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||||
("07c-interruptible-deepgram.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
("07c-interruptible-deepgram.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||||
|
("07c-interruptible-deepgram-flux.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||||
("07d-interruptible-elevenlabs.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
("07d-interruptible-elevenlabs.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||||
(
|
(
|
||||||
"07d-interruptible-elevenlabs-http.py",
|
"07d-interruptible-elevenlabs-http.py",
|
||||||
@@ -74,8 +75,6 @@ TESTS_07 = [
|
|||||||
EVAL_SIMPLE_MATH,
|
EVAL_SIMPLE_MATH,
|
||||||
BOT_SPEAKS_FIRST,
|
BOT_SPEAKS_FIRST,
|
||||||
),
|
),
|
||||||
("07e-interruptible-playht.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
|
||||||
("07e-interruptible-playht-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
|
||||||
("07f-interruptible-azure.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
("07f-interruptible-azure.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||||
("07g-interruptible-openai.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
("07g-interruptible-openai.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||||
("07h-interruptible-openpipe.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
("07h-interruptible-openpipe.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||||
@@ -102,6 +101,7 @@ TESTS_07 = [
|
|||||||
("07w-interruptible-fal.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
("07w-interruptible-fal.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||||
("07y-interruptible-minimax.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
("07y-interruptible-minimax.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||||
("07z-interruptible-sarvam.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
("07z-interruptible-sarvam.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||||
|
("07ae-interruptible-hume.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||||
# Needs a local XTTS docker instance running.
|
# Needs a local XTTS docker instance running.
|
||||||
# ("07i-interruptible-xtts.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
# ("07i-interruptible-xtts.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||||
# Needs a Krisp license.
|
# Needs a Krisp license.
|
||||||
@@ -136,6 +136,7 @@ TESTS_14 = [
|
|||||||
("14r-function-calling-aws.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
("14r-function-calling-aws.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||||
("14v-function-calling-openai.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
("14v-function-calling-openai.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||||
("14w-function-calling-mistral.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
("14w-function-calling-mistral.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||||
|
("14x-function-calling-openpipe.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||||
# Currently not working.
|
# Currently not working.
|
||||||
# ("14c-function-calling-together.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
# ("14c-function-calling-together.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||||
# ("14l-function-calling-deepseek.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
# ("14l-function-calling-deepseek.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||||
@@ -147,7 +148,10 @@ TESTS_15 = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
TESTS_19 = [
|
TESTS_19 = [
|
||||||
|
("19-openai-realtime.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||||
("19-openai-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
("19-openai-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||||
|
# OpenAI Realtime not released on Azure yet
|
||||||
|
# ("19a-azure-realtime.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||||
("19a-azure-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
("19a-azure-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||||
("19b-openai-realtime-text.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
("19b-openai-realtime-text.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||||
("19b-openai-realtime-beta-text.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
("19b-openai-realtime-beta-text.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||||
@@ -160,18 +164,18 @@ TESTS_21 = [
|
|||||||
TESTS_26 = [
|
TESTS_26 = [
|
||||||
("26-gemini-multimodal-live.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
("26-gemini-multimodal-live.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||||
(
|
(
|
||||||
"26a-gemini-multimodal-live-transcription.py",
|
"26a-gemini-live-transcription.py",
|
||||||
PROMPT_SIMPLE_MATH,
|
PROMPT_SIMPLE_MATH,
|
||||||
EVAL_SIMPLE_MATH,
|
EVAL_SIMPLE_MATH,
|
||||||
BOT_SPEAKS_FIRST,
|
BOT_SPEAKS_FIRST,
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
"26b-gemini-multimodal-live-function-calling.py",
|
"26b-gemini-live-function-calling.py",
|
||||||
PROMPT_WEATHER,
|
PROMPT_WEATHER,
|
||||||
EVAL_WEATHER,
|
EVAL_WEATHER,
|
||||||
BOT_SPEAKS_FIRST,
|
BOT_SPEAKS_FIRST,
|
||||||
),
|
),
|
||||||
("26c-gemini-multimodal-live-video.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
("26c-gemini-live-video.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||||
(
|
(
|
||||||
"26e-gemini-multimodal-google-search.py",
|
"26e-gemini-multimodal-google-search.py",
|
||||||
PROMPT_ONLINE_SEARCH,
|
PROMPT_ONLINE_SEARCH,
|
||||||
@@ -179,7 +183,13 @@ TESTS_26 = [
|
|||||||
BOT_SPEAKS_FIRST,
|
BOT_SPEAKS_FIRST,
|
||||||
),
|
),
|
||||||
# Currently not working.
|
# Currently not working.
|
||||||
# ("26d-gemini-multimodal-live-text.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
# ("26d-gemini-live-text.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||||
|
(
|
||||||
|
"26h-gemini-live-vertex-function-calling.py",
|
||||||
|
PROMPT_WEATHER,
|
||||||
|
EVAL_WEATHER,
|
||||||
|
BOT_SPEAKS_FIRST,
|
||||||
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
TESTS_27 = [
|
TESTS_27 = [
|
||||||
|
|||||||
@@ -6,13 +6,47 @@
|
|||||||
|
|
||||||
"""AWS Nova Sonic LLM adapter for Pipecat."""
|
"""AWS Nova Sonic LLM adapter for Pipecat."""
|
||||||
|
|
||||||
|
import copy
|
||||||
import json
|
import json
|
||||||
from typing import Any, Dict, List, TypedDict
|
from dataclasses import dataclass
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Any, Dict, List, Optional, TypedDict
|
||||||
|
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
from pipecat.adapters.base_llm_adapter import BaseLLMAdapter
|
from pipecat.adapters.base_llm_adapter import BaseLLMAdapter
|
||||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
from pipecat.processors.aggregators.llm_context import LLMContext, LLMContextMessage
|
||||||
|
|
||||||
|
|
||||||
|
class Role(Enum):
    """Speaker roles used for AWS Nova Sonic conversation content.

    Each member's value is its own upper-case name.

    Parameters:
        SYSTEM: System-level messages (not used in conversation history).
        USER: Messages sent by the user.
        ASSISTANT: Messages sent by the assistant.
        TOOL: Messages sent by tools (not used in conversation history).
    """

    SYSTEM = "SYSTEM"
    USER = "USER"
    ASSISTANT = "ASSISTANT"
    TOOL = "TOOL"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class AWSNovaSonicConversationHistoryMessage:
    """One entry of the conversation history sent to AWS Nova Sonic.

    Parameters:
        role: Who sent the message. Only ``Role.USER`` and ``Role.ASSISTANT``
            are expected here.
        text: The text content of the message.
    """

    # Only USER and ASSISTANT are meaningful for history entries.
    role: Role
    text: str
|
||||||
|
|
||||||
|
|
||||||
class AWSNovaSonicLLMInvocationParams(TypedDict):
|
class AWSNovaSonicLLMInvocationParams(TypedDict):
|
||||||
@@ -21,7 +55,9 @@ class AWSNovaSonicLLMInvocationParams(TypedDict):
|
|||||||
This is a placeholder until support for universal LLMContext machinery is added for AWS Nova Sonic.
|
This is a placeholder until support for universal LLMContext machinery is added for AWS Nova Sonic.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
pass
|
system_instruction: Optional[str]
|
||||||
|
messages: List[AWSNovaSonicConversationHistoryMessage]
|
||||||
|
tools: List[Dict[str, Any]]
|
||||||
|
|
||||||
|
|
||||||
class AWSNovaSonicLLMAdapter(BaseLLMAdapter[AWSNovaSonicLLMInvocationParams]):
|
class AWSNovaSonicLLMAdapter(BaseLLMAdapter[AWSNovaSonicLLMInvocationParams]):
|
||||||
@@ -34,7 +70,7 @@ class AWSNovaSonicLLMAdapter(BaseLLMAdapter[AWSNovaSonicLLMInvocationParams]):
|
|||||||
@property
|
@property
|
||||||
def id_for_llm_specific_messages(self) -> str:
|
def id_for_llm_specific_messages(self) -> str:
|
||||||
"""Get the identifier used in LLMSpecificMessage instances for AWS Nova Sonic."""
|
"""Get the identifier used in LLMSpecificMessage instances for AWS Nova Sonic."""
|
||||||
raise NotImplementedError("Universal LLMContext is not yet supported for AWS Nova Sonic.")
|
return "aws-nova-sonic"
|
||||||
|
|
||||||
def get_llm_invocation_params(self, context: LLMContext) -> AWSNovaSonicLLMInvocationParams:
|
def get_llm_invocation_params(self, context: LLMContext) -> AWSNovaSonicLLMInvocationParams:
|
||||||
"""Get AWS Nova Sonic-specific LLM invocation parameters from a universal LLM context.
|
"""Get AWS Nova Sonic-specific LLM invocation parameters from a universal LLM context.
|
||||||
@@ -47,7 +83,13 @@ class AWSNovaSonicLLMAdapter(BaseLLMAdapter[AWSNovaSonicLLMInvocationParams]):
|
|||||||
Returns:
|
Returns:
|
||||||
Dictionary of parameters for invoking AWS Nova Sonic's LLM API.
|
Dictionary of parameters for invoking AWS Nova Sonic's LLM API.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError("Universal LLMContext is not yet supported for AWS Nova Sonic.")
|
messages = self._from_universal_context_messages(self.get_messages(context))
|
||||||
|
return {
|
||||||
|
"system_instruction": messages.system_instruction,
|
||||||
|
"messages": messages.messages,
|
||||||
|
# NOTE: LLMContext's tools are guaranteed to be a ToolsSchema (or NOT_GIVEN)
|
||||||
|
"tools": self.from_standard_tools(context.tools) or [],
|
||||||
|
}
|
||||||
|
|
||||||
def get_messages_for_logging(self, context) -> List[Dict[str, Any]]:
|
def get_messages_for_logging(self, context) -> List[Dict[str, Any]]:
|
||||||
"""Get messages from a universal LLM context in a format ready for logging about AWS Nova Sonic.
|
"""Get messages from a universal LLM context in a format ready for logging about AWS Nova Sonic.
|
||||||
@@ -62,7 +104,75 @@ class AWSNovaSonicLLMAdapter(BaseLLMAdapter[AWSNovaSonicLLMInvocationParams]):
|
|||||||
Returns:
|
Returns:
|
||||||
List of messages in a format ready for logging about AWS Nova Sonic.
|
List of messages in a format ready for logging about AWS Nova Sonic.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError("Universal LLMContext is not yet supported for AWS Nova Sonic.")
|
return self._from_universal_context_messages(self.get_messages(context)).messages
|
||||||
|
|
||||||
|
@dataclass
class ConvertedMessages:
    """Container for AWS Nova Sonic messages converted from universal context.

    Parameters:
        messages: Conversation history entries in Nova Sonic format. This
            field has no default and must always be supplied.
        system_instruction: System instruction extracted from the context, or
            None when the context did not start with a system message.
    """

    messages: List[AWSNovaSonicConversationHistoryMessage]
    system_instruction: Optional[str] = None
|
||||||
|
|
||||||
|
def _from_universal_context_messages(
    self, universal_context_messages: List[LLMContextMessage]
) -> ConvertedMessages:
    """Split universal context messages into a system instruction and history.

    A leading "system" message, if present, is removed from the history and
    returned as the system instruction. Every remaining message is converted
    with ``_from_universal_context_message``; messages that cannot be
    represented in Nova Sonic history are dropped.

    Args:
        universal_context_messages: Messages taken from the universal context.

    Returns:
        A ``ConvertedMessages`` with the converted history and the extracted
        system instruction (None when there is none).
    """
    # Bail if there are no messages. ``messages`` is a required field of
    # ConvertedMessages, so it has to be passed explicitly; calling the
    # constructor with no arguments raises TypeError.
    if not universal_context_messages:
        return self.ConvertedMessages(messages=[])

    # Work on a copy so that popping the system message below never mutates
    # the list owned by the caller.
    universal_context_messages = copy.deepcopy(universal_context_messages)

    # If we have a "system" message as our first message, pull it out as the
    # system instruction.
    system_instruction = None
    if universal_context_messages[0].get("role") == "system":
        system = universal_context_messages.pop(0)
        content = system.get("content")
        if isinstance(content, str):
            system_instruction = content
        elif isinstance(content, list) and content:
            # Guarded against an empty content list, which would otherwise
            # raise IndexError.
            system_instruction = content[0].get("text")
        if system_instruction:
            # Also stored on the adapter, as before; code outside this view
            # may read it.
            self._system_instruction = system_instruction

    # Process remaining messages to fill out conversation history.
    # Nova Sonic supports "user" and "assistant" messages in history.
    converted = (
        self._from_universal_context_message(message)
        for message in universal_context_messages
    )
    messages = [message for message in converted if message]

    return self.ConvertedMessages(messages=messages, system_instruction=system_instruction)
|
||||||
|
|
||||||
|
def _from_universal_context_message(
    self, message
) -> Optional[AWSNovaSonicConversationHistoryMessage]:
    """Convert standard message format to Nova Sonic format.

    Args:
        message: Standard message dictionary to convert.

    Returns:
        Nova Sonic conversation history message, or None if not convertible:
        the role is neither "user" nor "assistant", or the message has no
        text content.
    """
    role = message.get("role")
    # NOTE: messages with role "tool" (or any other role) are ignored since
    # they can't be loaded into AWS Nova Sonic conversation history.
    if role not in ("user", "assistant"):
        return None

    content = message.get("content")
    if isinstance(content, list):
        text_parts = []
        for part in content:
            if part.get("type") == "text":
                text = part.get("text")
                # Skip parts without text instead of failing on str + None.
                if text:
                    text_parts.append(text)
            else:
                logger.error(
                    f"Unhandled content type in context message: {part.get('type')} - {message}"
                )
        # Join with single spaces; no leading space is added to the result.
        content = " ".join(text_parts)

    # There won't be content if this is an assistant tool call entry.
    # We're ignoring those since they can't be loaded into AWS Nova Sonic
    # conversation history.
    if not content:
        return None

    return AWSNovaSonicConversationHistoryMessage(role=Role[role.upper()], text=content)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _to_aws_nova_sonic_function_format(function: FunctionSchema) -> Dict[str, Any]:
|
def _to_aws_nova_sonic_function_format(function: FunctionSchema) -> Dict[str, Any]:
|
||||||
|
|||||||
@@ -87,9 +87,11 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
|||||||
Includes both converted standard tools and any custom Gemini-specific tools.
|
Includes both converted standard tools and any custom Gemini-specific tools.
|
||||||
"""
|
"""
|
||||||
functions_schema = tools_schema.standard_tools
|
functions_schema = tools_schema.standard_tools
|
||||||
formatted_standard_tools = [
|
formatted_standard_tools = (
|
||||||
{"function_declarations": [func.to_default_dict() for func in functions_schema]}
|
[{"function_declarations": [func.to_default_dict() for func in functions_schema]}]
|
||||||
]
|
if functions_schema
|
||||||
|
else []
|
||||||
|
)
|
||||||
custom_gemini_tools = []
|
custom_gemini_tools = []
|
||||||
if tools_schema.custom_tools:
|
if tools_schema.custom_tools:
|
||||||
custom_gemini_tools = tools_schema.custom_tools.get(AdapterType.GEMINI, [])
|
custom_gemini_tools = tools_schema.custom_tools.get(AdapterType.GEMINI, [])
|
||||||
|
|||||||
193
src/pipecat/audio/filters/krisp_viva_filter.py
Normal file
193
src/pipecat/audio/filters/krisp_viva_filter.py
Normal file
@@ -0,0 +1,193 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2024–2025, Daily
|
||||||
|
#
|
||||||
|
# SPDX-License-Identifier: BSD 2-Clause License
|
||||||
|
#
|
||||||
|
|
||||||
|
"""Krisp noise reduction audio filter for Pipecat.
|
||||||
|
|
||||||
|
This module provides an audio filter implementation using Krisp VIVA SDK.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
from pipecat.audio.filters.base_audio_filter import BaseAudioFilter
|
||||||
|
from pipecat.frames.frames import FilterControlFrame, FilterEnableFrame
|
||||||
|
|
||||||
|
try:
|
||||||
|
import krisp_audio
|
||||||
|
except ModuleNotFoundError as e:
|
||||||
|
logger.error(f"Exception: {e}")
|
||||||
|
logger.error("In order to use the Krisp filter, you need to install krisp_audio.")
|
||||||
|
raise Exception(f"Missing module: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
def _log_callback(log_message, log_level):
    """Log a message handed to this callback at INFO level, prefixed with its level.

    Args:
        log_message: The message text to log.
        log_level: The level reported alongside the message; rendered in brackets.
    """
    formatted = f"[{log_level}] {log_message}"
    logger.info(formatted)
|
||||||
|
|
||||||
|
|
||||||
|
class KrispVivaFilter(BaseAudioFilter):
    """Audio filter using the Krisp VIVA SDK.

    Provides real-time noise reduction for audio streams using Krisp's
    proprietary noise suppression algorithms. This filter requires a
    valid Krisp model file to operate.

    Incoming audio is buffered and processed in whole 10 ms frames; any
    trailing partial frame is kept until more audio arrives.

    Supported sample rates:

    - 8000 Hz
    - 16000 Hz
    - 24000 Hz
    - 32000 Hz
    - 44100 Hz
    - 48000 Hz
    """

    # Initialize Krisp Audio SDK globally. This runs once, when the class body
    # is executed at import time.
    krisp_audio.globalInit("", _log_callback, krisp_audio.LogLevel.Off)
    SDK_VERSION = krisp_audio.getVersion()
    logger.debug(
        f"Krisp Audio Python SDK Version: {SDK_VERSION.major}."
        f"{SDK_VERSION.minor}.{SDK_VERSION.patch}"
    )

    # Maps integer sample rates (Hz) to the SDK's SamplingRate enum values.
    SAMPLE_RATES = {
        8000: krisp_audio.SamplingRate.Sr8000Hz,
        16000: krisp_audio.SamplingRate.Sr16000Hz,
        24000: krisp_audio.SamplingRate.Sr24000Hz,
        32000: krisp_audio.SamplingRate.Sr32000Hz,
        44100: krisp_audio.SamplingRate.Sr44100Hz,
        48000: krisp_audio.SamplingRate.Sr48000Hz,
    }

    FRAME_SIZE_MS = 10  # Krisp requires audio frames of 10ms duration for processing.

    def __init__(self, model_path: str = None, noise_suppression_level: int = 100) -> None:
        """Initialize the Krisp noise reduction filter.

        Args:
            model_path: Path to the Krisp model file (.kef extension).
                If None, uses KRISP_VIVA_MODEL_PATH environment variable.
            noise_suppression_level: Noise suppression level, passed unchanged
                to the Krisp session for every processed frame.

        Raises:
            ValueError: If model_path is not provided and KRISP_VIVA_MODEL_PATH is not set.
            Exception: If model file doesn't have .kef extension.
            FileNotFoundError: If model file doesn't exist.
        """
        super().__init__()

        # Set model path, checking environment if not specified
        self._model_path = model_path or os.getenv("KRISP_VIVA_MODEL_PATH")
        if not self._model_path:
            logger.error("Model path is not provided and KRISP_VIVA_MODEL_PATH is not set.")
            raise ValueError("Model path for KrispAudioProcessor must be provided.")

        if not self._model_path.endswith(".kef"):
            raise Exception("Model is expected with .kef extension")

        if not os.path.isfile(self._model_path):
            raise FileNotFoundError(f"Model file not found: {self._model_path}")

        self._filtering = True
        self._session = None
        self._samples_per_frame = None
        self._noise_suppression_level = noise_suppression_level

        # Audio buffer to accumulate samples for complete frames
        self._audio_buffer = bytearray()

    def _int_to_sample_rate(self, sample_rate):
        """Convert integer sample rate to krisp_audio SamplingRate enum.

        Args:
            sample_rate: Sample rate as integer

        Returns:
            krisp_audio.SamplingRate enum value

        Raises:
            ValueError: If sample rate is not supported
        """
        if sample_rate not in self.SAMPLE_RATES:
            raise ValueError(
                f"Unsupported sample rate: {sample_rate}. "
                f"Supported rates: {sorted(self.SAMPLE_RATES)}"
            )
        return self.SAMPLE_RATES[sample_rate]

    async def start(self, sample_rate: int):
        """Initialize the Krisp processor with the transport's sample rate.

        Args:
            sample_rate: The sample rate of the input transport in Hz.

        Raises:
            ValueError: If the sample rate is not one of the supported rates.
        """
        model_info = krisp_audio.ModelInfo()
        model_info.path = self._model_path

        nc_cfg = krisp_audio.NcSessionConfig()
        nc_cfg.inputSampleRate = self._int_to_sample_rate(sample_rate)
        nc_cfg.inputFrameDuration = krisp_audio.FrameDuration.Fd10ms
        nc_cfg.outputSampleRate = nc_cfg.inputSampleRate
        nc_cfg.modelInfo = model_info

        # Any bytes left over from a previous session were captured at that
        # session's sample rate and must not be mixed into this one.
        self._audio_buffer.clear()

        self._samples_per_frame = int((sample_rate * self.FRAME_SIZE_MS) / 1000)
        self._session = krisp_audio.NcInt16.create(nc_cfg)

    async def stop(self):
        """Clean up the Krisp processor when stopping."""
        self._session = None
        self._samples_per_frame = None
        # Drop the unprocessed partial frame so it cannot leak into a later session.
        self._audio_buffer.clear()

    async def process_frame(self, frame: FilterControlFrame):
        """Process control frames to enable/disable filtering.

        Args:
            frame: The control frame containing filter commands.
        """
        if isinstance(frame, FilterEnableFrame):
            self._filtering = frame.enable
            if not self._filtering:
                # While disabled, audio bypasses the buffer entirely; discard the
                # pending partial frame so stale samples are not prepended to
                # fresh audio once filtering is re-enabled.
                self._audio_buffer.clear()

    async def filter(self, audio: bytes) -> bytes:
        """Apply Krisp noise reduction to audio data.

        Args:
            audio: Raw audio data as bytes to be filtered (16-bit samples).

        Returns:
            Noise-reduced audio data as bytes. Returns the input unchanged when
            filtering is disabled or no session is active, and empty bytes while
            less than one complete 10 ms frame has been buffered.
        """
        if not self._filtering:
            return audio

        # Without an active session (start() not called yet, or stop() already
        # called) there is nothing to process with; pass the audio through
        # instead of failing on the unset session/frame size.
        if self._session is None or not self._samples_per_frame:
            return audio

        # Add incoming audio to our buffer
        self._audio_buffer.extend(audio)

        # Calculate how many complete frames we can process
        frame_size_bytes = self._samples_per_frame * 2  # 2 bytes per int16 sample
        num_complete_frames = len(self._audio_buffer) // frame_size_bytes

        if num_complete_frames == 0:
            # Not enough samples for a complete frame yet, return empty
            return b""

        bytes_to_process = num_complete_frames * frame_size_bytes

        # Extract the bytes we can process, then trim them from the buffer in
        # place, keeping only the remainder.
        audio_to_process = bytes(self._audio_buffer[:bytes_to_process])
        del self._audio_buffer[:bytes_to_process]

        # Process the complete frames, one row per 10 ms frame
        frames = np.frombuffer(audio_to_process, dtype=np.int16).reshape(
            -1, self._samples_per_frame
        )
        processed_frames = np.empty_like(frames)

        for i, frame in enumerate(frames):
            processed_frames[i] = self._session.process(frame, self._noise_suppression_level)

        return processed_frames.tobytes()
|
||||||
@@ -14,6 +14,8 @@ from abc import ABC, abstractmethod
|
|||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import Optional, Tuple
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from pipecat.metrics.metrics import MetricsData
|
from pipecat.metrics.metrics import MetricsData
|
||||||
|
|
||||||
|
|
||||||
@@ -29,6 +31,12 @@ class EndOfTurnState(Enum):
|
|||||||
INCOMPLETE = 2
|
INCOMPLETE = 2
|
||||||
|
|
||||||
|
|
||||||
|
class BaseTurnParams(BaseModel):
    """Base class for turn analyzer parameters.

    Declares no fields of its own; concrete turn analyzers define their
    parameter models by subclassing it.
    """
|
||||||
|
|
||||||
|
|
||||||
class BaseTurnAnalyzer(ABC):
|
class BaseTurnAnalyzer(ABC):
|
||||||
"""Abstract base class for analyzing user end of turn.
|
"""Abstract base class for analyzing user end of turn.
|
||||||
|
|
||||||
@@ -78,7 +86,7 @@ class BaseTurnAnalyzer(ABC):
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def params(self):
|
def params(self) -> BaseTurnParams:
|
||||||
"""Get the current turn analyzer parameters.
|
"""Get the current turn analyzer parameters.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|||||||
@@ -11,15 +11,17 @@ machine learning models to determine when a user has finished speaking, going
|
|||||||
beyond simple silence-based detection.
|
beyond simple silence-based detection.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
import time
|
import time
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from typing import Any, Dict, Optional, Tuple
|
from typing import Any, Dict, Optional, Tuple
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from pipecat.audio.turn.base_turn_analyzer import BaseTurnAnalyzer, EndOfTurnState
|
from pipecat.audio.turn.base_turn_analyzer import BaseTurnAnalyzer, BaseTurnParams, EndOfTurnState
|
||||||
from pipecat.metrics.metrics import MetricsData, SmartTurnMetricsData
|
from pipecat.metrics.metrics import MetricsData, SmartTurnMetricsData
|
||||||
|
|
||||||
# Default timing parameters
|
# Default timing parameters
|
||||||
@@ -29,7 +31,7 @@ MAX_DURATION_SECONDS = 8 # Max allowed segment duration
|
|||||||
USE_ONLY_LAST_VAD_SEGMENT = True
|
USE_ONLY_LAST_VAD_SEGMENT = True
|
||||||
|
|
||||||
|
|
||||||
class SmartTurnParams(BaseModel):
|
class SmartTurnParams(BaseTurnParams):
|
||||||
"""Configuration parameters for smart turn analysis.
|
"""Configuration parameters for smart turn analysis.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
@@ -77,6 +79,9 @@ class BaseSmartTurn(BaseTurnAnalyzer):
|
|||||||
self._speech_triggered = False
|
self._speech_triggered = False
|
||||||
self._silence_ms = 0
|
self._silence_ms = 0
|
||||||
self._speech_start_time = 0
|
self._speech_start_time = 0
|
||||||
|
# Thread executor that will run the model. We only need one thread per
|
||||||
|
# analyzer because one analyzer just handles one audio stream.
|
||||||
|
self._executor = ThreadPoolExecutor(max_workers=1)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def speech_triggered(self) -> bool:
|
def speech_triggered(self) -> bool:
|
||||||
@@ -151,7 +156,10 @@ class BaseSmartTurn(BaseTurnAnalyzer):
|
|||||||
Tuple containing the end-of-turn state and optional metrics data
|
Tuple containing the end-of-turn state and optional metrics data
|
||||||
from the ML model analysis.
|
from the ML model analysis.
|
||||||
"""
|
"""
|
||||||
state, result = await self._process_speech_segment(self._audio_buffer)
|
loop = asyncio.get_running_loop()
|
||||||
|
state, result = await loop.run_in_executor(
|
||||||
|
self._executor, self._process_speech_segment, self._audio_buffer
|
||||||
|
)
|
||||||
if state == EndOfTurnState.COMPLETE or USE_ONLY_LAST_VAD_SEGMENT:
|
if state == EndOfTurnState.COMPLETE or USE_ONLY_LAST_VAD_SEGMENT:
|
||||||
self._clear(state)
|
self._clear(state)
|
||||||
logger.debug(f"End of Turn result: {state}")
|
logger.debug(f"End of Turn result: {state}")
|
||||||
@@ -169,9 +177,7 @@ class BaseSmartTurn(BaseTurnAnalyzer):
|
|||||||
self._speech_start_time = 0
|
self._speech_start_time = 0
|
||||||
self._silence_ms = 0
|
self._silence_ms = 0
|
||||||
|
|
||||||
async def _process_speech_segment(
|
def _process_speech_segment(self, audio_buffer) -> Tuple[EndOfTurnState, Optional[MetricsData]]:
|
||||||
self, audio_buffer
|
|
||||||
) -> Tuple[EndOfTurnState, Optional[MetricsData]]:
|
|
||||||
"""Process accumulated audio segment using ML model."""
|
"""Process accumulated audio segment using ML model."""
|
||||||
state = EndOfTurnState.INCOMPLETE
|
state = EndOfTurnState.INCOMPLETE
|
||||||
|
|
||||||
@@ -203,7 +209,7 @@ class BaseSmartTurn(BaseTurnAnalyzer):
|
|||||||
if len(segment_audio) > 0:
|
if len(segment_audio) > 0:
|
||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
try:
|
try:
|
||||||
result = await self._predict_endpoint(segment_audio)
|
result = self._predict_endpoint(segment_audio)
|
||||||
state = (
|
state = (
|
||||||
EndOfTurnState.COMPLETE
|
EndOfTurnState.COMPLETE
|
||||||
if result["prediction"] == 1
|
if result["prediction"] == 1
|
||||||
@@ -249,6 +255,6 @@ class BaseSmartTurn(BaseTurnAnalyzer):
|
|||||||
return state, result_data
|
return state, result_data
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
||||||
"""Predict end-of-turn using ML model from audio data."""
|
"""Predict end-of-turn using ML model from audio data."""
|
||||||
pass
|
pass
|
||||||
|
|||||||
@@ -104,11 +104,15 @@ class HttpSmartTurnAnalyzer(BaseSmartTurn):
|
|||||||
logger.error(f"Failed to send raw request to Daily Smart Turn: {e}")
|
logger.error(f"Failed to send raw request to Daily Smart Turn: {e}")
|
||||||
raise Exception("Failed to send raw request to Daily Smart Turn.")
|
raise Exception("Failed to send raw request to Daily Smart Turn.")
|
||||||
|
|
||||||
async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
||||||
"""Predict end-of-turn using remote HTTP ML service."""
|
"""Predict end-of-turn using remote HTTP ML service."""
|
||||||
try:
|
try:
|
||||||
serialized_array = self._serialize_array(audio_array)
|
serialized_array = self._serialize_array(audio_array)
|
||||||
return await self._send_raw_request(serialized_array)
|
loop = asyncio.get_running_loop()
|
||||||
|
future = asyncio.run_coroutine_threadsafe(
|
||||||
|
self._send_raw_request(serialized_array), loop
|
||||||
|
)
|
||||||
|
return future.result()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Smart turn prediction failed: {str(e)}")
|
logger.error(f"Smart turn prediction failed: {str(e)}")
|
||||||
# Return an incomplete prediction when a failure occurs
|
# Return an incomplete prediction when a failure occurs
|
||||||
|
|||||||
@@ -64,7 +64,7 @@ class LocalSmartTurnAnalyzer(BaseSmartTurn):
|
|||||||
self._turn_model.eval()
|
self._turn_model.eval()
|
||||||
logger.debug("Loaded Local Smart Turn")
|
logger.debug("Loaded Local Smart Turn")
|
||||||
|
|
||||||
async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
||||||
"""Predict end-of-turn using local PyTorch model."""
|
"""Predict end-of-turn using local PyTorch model."""
|
||||||
inputs = self._turn_processor(
|
inputs = self._turn_processor(
|
||||||
audio_array,
|
audio_array,
|
||||||
|
|||||||
@@ -73,7 +73,7 @@ class LocalSmartTurnAnalyzerV2(BaseSmartTurn):
|
|||||||
self._turn_model.eval()
|
self._turn_model.eval()
|
||||||
logger.debug("Loaded Local Smart Turn v2")
|
logger.debug("Loaded Local Smart Turn v2")
|
||||||
|
|
||||||
async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
||||||
"""Predict end-of-turn using local PyTorch model."""
|
"""Predict end-of-turn using local PyTorch model."""
|
||||||
inputs = self._turn_processor(
|
inputs = self._turn_processor(
|
||||||
audio_array,
|
audio_array,
|
||||||
|
|||||||
@@ -77,7 +77,7 @@ class LocalSmartTurnAnalyzerV3(BaseSmartTurn):
|
|||||||
|
|
||||||
logger.debug("Loaded Local Smart Turn v3")
|
logger.debug("Loaded Local Smart Turn v3")
|
||||||
|
|
||||||
async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
||||||
"""Predict end-of-turn using local ONNX model."""
|
"""Predict end-of-turn using local ONNX model."""
|
||||||
|
|
||||||
def truncate_audio_to_last_n_seconds(audio_array, n_seconds=8, sample_rate=16000):
|
def truncate_audio_to_last_n_seconds(audio_array, n_seconds=8, sample_rate=16000):
|
||||||
|
|||||||
@@ -11,7 +11,9 @@ data structures for voice activity detection in audio streams. Includes state
|
|||||||
management, parameter configuration, and audio analysis framework.
|
management, parameter configuration, and audio analysis framework.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
@@ -84,6 +86,10 @@ class VADAnalyzer(ABC):
|
|||||||
self._smoothing_factor = 0.2
|
self._smoothing_factor = 0.2
|
||||||
self._prev_volume = 0
|
self._prev_volume = 0
|
||||||
|
|
||||||
|
# Thread executor that will run the model. We only need one thread per
|
||||||
|
# analyzer because one analyzer just handles one audio stream.
|
||||||
|
self._executor = ThreadPoolExecutor(max_workers=1)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def sample_rate(self) -> int:
|
def sample_rate(self) -> int:
|
||||||
"""Get the current sample rate.
|
"""Get the current sample rate.
|
||||||
@@ -165,7 +171,7 @@ class VADAnalyzer(ABC):
|
|||||||
volume = calculate_audio_volume(audio, self.sample_rate)
|
volume = calculate_audio_volume(audio, self.sample_rate)
|
||||||
return exp_smoothing(volume, self._prev_volume, self._smoothing_factor)
|
return exp_smoothing(volume, self._prev_volume, self._smoothing_factor)
|
||||||
|
|
||||||
def analyze_audio(self, buffer) -> VADState:
|
async def analyze_audio(self, buffer: bytes) -> VADState:
|
||||||
"""Analyze audio buffer and return current VAD state.
|
"""Analyze audio buffer and return current VAD state.
|
||||||
|
|
||||||
Processes incoming audio data, maintains internal state, and determines
|
Processes incoming audio data, maintains internal state, and determines
|
||||||
@@ -177,6 +183,12 @@ class VADAnalyzer(ABC):
|
|||||||
Returns:
|
Returns:
|
||||||
Current VAD state after processing the buffer.
|
Current VAD state after processing the buffer.
|
||||||
"""
|
"""
|
||||||
|
loop = asyncio.get_running_loop()
|
||||||
|
state = await loop.run_in_executor(self._executor, self._run_analyzer, buffer)
|
||||||
|
return state
|
||||||
|
|
||||||
|
def _run_analyzer(self, buffer: bytes) -> VADState:
|
||||||
|
"""Analyze audio buffer and return current VAD state."""
|
||||||
self._vad_buffer += buffer
|
self._vad_buffer += buffer
|
||||||
|
|
||||||
num_required_bytes = self._vad_frames_num_bytes
|
num_required_bytes = self._vad_frames_num_bytes
|
||||||
|
|||||||
@@ -672,7 +672,7 @@ class TTSSpeakFrame(DataFrame):
|
|||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class TransportMessageFrame(DataFrame):
|
class OutputTransportMessageFrame(DataFrame):
|
||||||
"""Frame containing transport-specific message data.
|
"""Frame containing transport-specific message data.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
@@ -685,6 +685,32 @@ class TransportMessageFrame(DataFrame):
|
|||||||
return f"{self.name}(message: {self.message})"
|
return f"{self.name}(message: {self.message})"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class TransportMessageFrame(OutputTransportMessageFrame):
    """Deprecated alias of `OutputTransportMessageFrame`.

    .. deprecated:: 0.0.87
        This frame is deprecated and will be removed in a future version.
        Instead, use `OutputTransportMessageFrame`.

    Parameters:
        message: The transport message payload.
    """

    def __post_init__(self):
        import warnings

        super().__post_init__()

        deprecation_notice = (
            "TransportMessageFrame is deprecated and will be removed in a future version. "
            "Instead, use OutputTransportMessageFrame."
        )

        # Temporarily force the "always" filter so the warning is emitted on
        # every instantiation; the previous filters are restored on exit.
        with warnings.catch_warnings():
            warnings.simplefilter("always")
            warnings.warn(deprecation_notice, category=DeprecationWarning, stacklevel=2)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class DTMFFrame:
|
class DTMFFrame:
|
||||||
"""Base class for DTMF (Dual-Tone Multi-Frequency) keypad frames.
|
"""Base class for DTMF (Dual-Tone Multi-Frequency) keypad frames.
|
||||||
@@ -1092,8 +1118,8 @@ class STTMuteFrame(SystemFrame):
|
|||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class TransportMessageUrgentFrame(SystemFrame):
|
class InputTransportMessageFrame(SystemFrame):
|
||||||
"""Frame for urgent transport messages that need immediate processing.
|
"""Frame for transport messages received from external sources.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
message: The urgent transport message payload.
|
message: The transport message payload.
|
||||||
@@ -1106,20 +1132,69 @@ class TransportMessageUrgentFrame(SystemFrame):
|
|||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class InputTransportMessageUrgentFrame(TransportMessageUrgentFrame):
|
class InputTransportMessageUrgentFrame(InputTransportMessageFrame):
|
||||||
"""Frame for transport messages received from external sources.
|
"""Frame for transport messages received from external sources.
|
||||||
|
|
||||||
This frame wraps incoming transport messages to distinguish them from outgoing
|
.. deprecated:: 0.0.87
|
||||||
urgent transport messages (TransportMessageUrgentFrame), preventing infinite
|
This frame is deprecated and will be removed in a future version.
|
||||||
message loops in the transport layer. It inherits the message payload from
|
Instead, use `InputTransportMessageFrame`.
|
||||||
TransportMessageFrame while marking the message as having been received
|
|
||||||
rather than generated locally.
|
|
||||||
|
|
||||||
Used by transport implementations to properly handle bidirectional message
|
Parameters:
|
||||||
flow without creating feedback loops.
|
message: The urgent transport message payload.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
pass
|
def __post_init__(self):
|
||||||
|
super().__post_init__()
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
warnings.simplefilter("always")
|
||||||
|
warnings.warn(
|
||||||
|
"InputTransportMessageUrgentFrame is deprecated and will be removed in a future version. "
|
||||||
|
"Instead, use InputTransportMessageFrame.",
|
||||||
|
DeprecationWarning,
|
||||||
|
stacklevel=2,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class OutputTransportMessageUrgentFrame(SystemFrame):
    """System frame carrying an urgent transport message to be sent immediately.

    Parameters:
        message: The urgent transport message payload.
    """

    message: Any

    def __str__(self):
        frame_name, payload = self.name, self.message
        return f"{frame_name}(message: {payload})"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class TransportMessageUrgentFrame(OutputTransportMessageUrgentFrame):
    """Frame for urgent transport messages that need to be sent immediately.

    .. deprecated:: 0.0.87
        This frame is deprecated and will be removed in a future version.
        Instead, use `OutputTransportMessageUrgentFrame`.

    Parameters:
        message: The urgent transport message payload.
    """

    def __post_init__(self):
        super().__post_init__()
        import warnings

        # Temporarily force the "always" filter so the warning is emitted on
        # every instantiation; the previous filters are restored on exit.
        with warnings.catch_warnings():
            warnings.simplefilter("always")
            warnings.warn(
                "TransportMessageUrgentFrame is deprecated and will be removed in a future version. "
                # Point at the urgent replacement (this class's base), not the
                # non-urgent OutputTransportMessageFrame.
                "Instead, use OutputTransportMessageUrgentFrame.",
                DeprecationWarning,
                stacklevel=2,
            )
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|||||||
@@ -70,11 +70,15 @@ class PipelineRunner(BaseObject):
|
|||||||
"""
|
"""
|
||||||
logger.debug(f"Runner {self} started running {task}")
|
logger.debug(f"Runner {self} started running {task}")
|
||||||
self._tasks[task.name] = task
|
self._tasks[task.name] = task
|
||||||
params = PipelineTaskParams(loop=self._loop)
|
|
||||||
|
# PipelineTask handles asyncio.CancelledError to shutdown the pipeline
|
||||||
|
# properly and re-raises it in case there's more cleanup to do.
|
||||||
try:
|
try:
|
||||||
|
params = PipelineTaskParams(loop=self._loop)
|
||||||
await task.run(params)
|
await task.run(params)
|
||||||
except asyncio.CancelledError:
|
except asyncio.CancelledError:
|
||||||
await self._cancel()
|
pass
|
||||||
|
|
||||||
del self._tasks[task.name]
|
del self._tasks[task.name]
|
||||||
|
|
||||||
# Cleanup base object.
|
# Cleanup base object.
|
||||||
|
|||||||
@@ -13,8 +13,7 @@ including heartbeats, idle detection, and observer integration.
|
|||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import time
|
import time
|
||||||
from collections import deque
|
from typing import Any, AsyncIterable, Dict, Iterable, List, Optional, Tuple, Type
|
||||||
from typing import Any, AsyncIterable, Deque, Dict, Iterable, List, Optional, Tuple, Type
|
|
||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from pydantic import BaseModel, ConfigDict, Field
|
from pydantic import BaseModel, ConfigDict, Field
|
||||||
@@ -31,7 +30,6 @@ from pipecat.frames.frames import (
|
|||||||
ErrorFrame,
|
ErrorFrame,
|
||||||
Frame,
|
Frame,
|
||||||
HeartbeatFrame,
|
HeartbeatFrame,
|
||||||
InputAudioRawFrame,
|
|
||||||
InterruptionFrame,
|
InterruptionFrame,
|
||||||
InterruptionTaskFrame,
|
InterruptionTaskFrame,
|
||||||
MetricsFrame,
|
MetricsFrame,
|
||||||
@@ -132,12 +130,16 @@ class PipelineTask(BasePipelineTask):
|
|||||||
|
|
||||||
- on_pipeline_finished: Called after the pipeline has reached any terminal state.
|
- on_pipeline_finished: Called after the pipeline has reached any terminal state.
|
||||||
This includes:
|
This includes:
|
||||||
|
|
||||||
- StopFrame: pipeline was stopped (processors keep connections open)
|
- StopFrame: pipeline was stopped (processors keep connections open)
|
||||||
- EndFrame: pipeline ended normally
|
- EndFrame: pipeline ended normally
|
||||||
- CancelFrame: pipeline was cancelled
|
- CancelFrame: pipeline was cancelled
|
||||||
|
|
||||||
Use this event for cleanup, logging, or post-processing tasks. Users can inspect
|
Use this event for cleanup, logging, or post-processing tasks. Users can inspect
|
||||||
the frame if they need to handle specific cases.
|
the frame if they need to handle specific cases.
|
||||||
|
|
||||||
|
- on_pipeline_error: Called when an error occurs with ErrorFrame
|
||||||
|
|
||||||
Example::
|
Example::
|
||||||
|
|
||||||
@task.event_handler("on_frame_reached_upstream")
|
@task.event_handler("on_frame_reached_upstream")
|
||||||
@@ -148,9 +150,17 @@ class PipelineTask(BasePipelineTask):
|
|||||||
async def on_pipeline_idle_timeout(task):
|
async def on_pipeline_idle_timeout(task):
|
||||||
...
|
...
|
||||||
|
|
||||||
|
@task.event_handler("on_pipeline_started")
|
||||||
|
async def on_pipeline_started(task, frame):
|
||||||
|
...
|
||||||
|
|
||||||
@task.event_handler("on_pipeline_finished")
|
@task.event_handler("on_pipeline_finished")
|
||||||
async def on_pipeline_finished(task, frame):
|
async def on_pipeline_finished(task, frame):
|
||||||
...
|
...
|
||||||
|
|
||||||
|
@task.event_handler("on_pipeline_error")
|
||||||
|
async def on_pipeline_error(task, frame):
|
||||||
|
...
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -259,6 +269,9 @@ class PipelineTask(BasePipelineTask):
|
|||||||
# StopFrame) has been received at the end of the pipeline.
|
# StopFrame) has been received at the end of the pipeline.
|
||||||
self._pipeline_end_event = asyncio.Event()
|
self._pipeline_end_event = asyncio.Event()
|
||||||
|
|
||||||
|
# This event is set when the pipeline truly finishes.
|
||||||
|
self._pipeline_finished_event = asyncio.Event()
|
||||||
|
|
||||||
# This is the final pipeline. It is composed of a source processor,
|
# This is the final pipeline. It is composed of a source processor,
|
||||||
# followed by the user pipeline, and ending with a sink processor. The
|
# followed by the user pipeline, and ending with a sink processor. The
|
||||||
# source allows us to receive and react to upstream frames, and the sink
|
# source allows us to receive and react to upstream frames, and the sink
|
||||||
@@ -288,6 +301,7 @@ class PipelineTask(BasePipelineTask):
|
|||||||
self._register_event_handler("on_pipeline_ended")
|
self._register_event_handler("on_pipeline_ended")
|
||||||
self._register_event_handler("on_pipeline_cancelled")
|
self._register_event_handler("on_pipeline_cancelled")
|
||||||
self._register_event_handler("on_pipeline_finished")
|
self._register_event_handler("on_pipeline_finished")
|
||||||
|
self._register_event_handler("on_pipeline_error")
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def params(self) -> PipelineParams:
|
def params(self) -> PipelineParams:
|
||||||
@@ -390,12 +404,9 @@ class PipelineTask(BasePipelineTask):
|
|||||||
await self.queue_frame(EndFrame())
|
await self.queue_frame(EndFrame())
|
||||||
|
|
||||||
async def cancel(self):
|
async def cancel(self):
|
||||||
"""Immediately stop the running pipeline.
|
"""Request the running pipeline to cancel."""
|
||||||
|
if not self._finished:
|
||||||
Cancels all running tasks and stops frame processing without
|
await self._cancel()
|
||||||
waiting for completion.
|
|
||||||
"""
|
|
||||||
await self._cancel()
|
|
||||||
|
|
||||||
async def run(self, params: PipelineTaskParams):
|
async def run(self, params: PipelineTaskParams):
|
||||||
"""Start and manage the pipeline execution until completion or cancellation.
|
"""Start and manage the pipeline execution until completion or cancellation.
|
||||||
@@ -405,51 +416,38 @@ class PipelineTask(BasePipelineTask):
|
|||||||
"""
|
"""
|
||||||
if self.has_finished():
|
if self.has_finished():
|
||||||
return
|
return
|
||||||
cleanup_pipeline = True
|
|
||||||
|
# Setup processors.
|
||||||
|
await self._setup(params)
|
||||||
|
|
||||||
|
# Create all main tasks and wait for the main push task. This is the
|
||||||
|
# task that pushes frames to the very beginning of our pipeline (i.e. to
|
||||||
|
# our controlled source processor).
|
||||||
|
await self._create_tasks()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Setup processors.
|
# Wait for pipeline to finish.
|
||||||
await self._setup(params)
|
await self._wait_for_pipeline_finished()
|
||||||
|
|
||||||
# Create all main tasks and wait of the main push task. This is the
|
|
||||||
# task that pushes frames to the very beginning of our pipeline (our
|
|
||||||
# controlled source processor).
|
|
||||||
push_task = await self._create_tasks()
|
|
||||||
await push_task
|
|
||||||
|
|
||||||
# We have already cleaned up the pipeline inside the task.
|
|
||||||
cleanup_pipeline = False
|
|
||||||
|
|
||||||
# Pipeline has finished nicely.
|
|
||||||
self._finished = True
|
|
||||||
except asyncio.CancelledError:
|
except asyncio.CancelledError:
|
||||||
# Raise exception back to the pipeline runner so it can cancel this
|
logger.debug(f"Pipeline task {self} got cancelled from outside...")
|
||||||
# task properly.
|
# We have been cancelled from outside, let's just cancel everything.
|
||||||
|
await self._cancel()
|
||||||
|
# Wait again for pipeline to finish. This time we have really
|
||||||
|
# cancelled, so it should really finish.
|
||||||
|
await self._wait_for_pipeline_finished()
|
||||||
|
# Re-raise in case there's more cleanup to do.
|
||||||
raise
|
raise
|
||||||
finally:
|
finally:
|
||||||
# We can reach this point for different reasons:
|
# We can reach this point for different reasons:
|
||||||
#
|
#
|
||||||
# 1. The task has finished properly (e.g. `EndFrame`).
|
# 1. The pipeline task has finished (try case).
|
||||||
# 2. By calling `PipelineTask.cancel()`.
|
# 2. By an asyncio task cancellation (except case).
|
||||||
# 3. By asyncio task cancellation.
|
logger.debug(f"Pipeline task {self} is finishing...")
|
||||||
#
|
await self._cancel_tasks()
|
||||||
# Case (1) will execute the code below without issues because
|
if self._check_dangling_tasks:
|
||||||
# `self._finished` is true.
|
self._print_dangling_tasks()
|
||||||
#
|
self._finished = True
|
||||||
# Case (2) will execute the code below without issues because
|
logger.debug(f"Pipeline task {self} has finished")
|
||||||
# `self._cancelled` is true.
|
|
||||||
#
|
|
||||||
# Case (3) will raise the exception above (because we are cancelling
|
|
||||||
# the asyncio task). This will be then captured by the
|
|
||||||
# `PipelineRunner` which will call `PipelineTask.cancel()` and
|
|
||||||
# therefore becoming case (2).
|
|
||||||
if self._finished or self._cancelled:
|
|
||||||
logger.debug(f"Pipeline task {self} is finishing cleanup...")
|
|
||||||
await self._cancel_tasks()
|
|
||||||
await self._cleanup(cleanup_pipeline)
|
|
||||||
if self._check_dangling_tasks:
|
|
||||||
self._print_dangling_tasks()
|
|
||||||
self._finished = True
|
|
||||||
logger.debug(f"Pipeline task {self} has finished")
|
|
||||||
|
|
||||||
async def queue_frame(self, frame: Frame):
|
async def queue_frame(self, frame: Frame):
|
||||||
"""Queue a single frame to be pushed down the pipeline.
|
"""Queue a single frame to be pushed down the pipeline.
|
||||||
@@ -477,19 +475,7 @@ class PipelineTask(BasePipelineTask):
|
|||||||
if not self._cancelled:
|
if not self._cancelled:
|
||||||
logger.debug(f"Cancelling pipeline task {self}")
|
logger.debug(f"Cancelling pipeline task {self}")
|
||||||
self._cancelled = True
|
self._cancelled = True
|
||||||
cancel_frame = CancelFrame()
|
await self.queue_frame(CancelFrame())
|
||||||
# Make sure everything is cleaned up downstream. This is sent
|
|
||||||
# out-of-band from the main streaming task which is what we want since
|
|
||||||
# we want to cancel right away.
|
|
||||||
await self._pipeline.queue_frame(cancel_frame)
|
|
||||||
# Wait for CancelFrame to make it through the pipeline.
|
|
||||||
await self._wait_for_pipeline_end(cancel_frame)
|
|
||||||
# Only cancel the push task, we don't want to be able to process any
|
|
||||||
# other frame after cancel. Everything else will be cancelled in
|
|
||||||
# run().
|
|
||||||
if self._process_push_task:
|
|
||||||
await self._task_manager.cancel_task(self._process_push_task)
|
|
||||||
self._process_push_task = None
|
|
||||||
|
|
||||||
async def _create_tasks(self):
|
async def _create_tasks(self):
|
||||||
"""Create and start all pipeline processing tasks."""
|
"""Create and start all pipeline processing tasks."""
|
||||||
@@ -591,6 +577,17 @@ class PipelineTask(BasePipelineTask):
|
|||||||
|
|
||||||
self._pipeline_end_event.clear()
|
self._pipeline_end_event.clear()
|
||||||
|
|
||||||
|
# We are really done.
|
||||||
|
self._pipeline_finished_event.set()
|
||||||
|
|
||||||
|
async def _wait_for_pipeline_finished(self):
|
||||||
|
await self._pipeline_finished_event.wait()
|
||||||
|
self._pipeline_finished_event.clear()
|
||||||
|
# Make sure we wait for the main task to complete.
|
||||||
|
if self._process_push_task:
|
||||||
|
await self._process_push_task
|
||||||
|
self._process_push_task = None
|
||||||
|
|
||||||
async def _setup(self, params: PipelineTaskParams):
|
async def _setup(self, params: PipelineTaskParams):
|
||||||
"""Set up the pipeline task and all processors."""
|
"""Set up the pipeline task and all processors."""
|
||||||
mgr_params = TaskManagerParams(loop=params.loop)
|
mgr_params = TaskManagerParams(loop=params.loop)
|
||||||
@@ -693,12 +690,11 @@ class PipelineTask(BasePipelineTask):
|
|||||||
logger.debug(f"{self}: received interruption task frame {frame}")
|
logger.debug(f"{self}: received interruption task frame {frame}")
|
||||||
await self._pipeline.queue_frame(InterruptionFrame())
|
await self._pipeline.queue_frame(InterruptionFrame())
|
||||||
elif isinstance(frame, ErrorFrame):
|
elif isinstance(frame, ErrorFrame):
|
||||||
|
await self._call_event_handler("on_pipeline_error", frame)
|
||||||
if frame.fatal:
|
if frame.fatal:
|
||||||
logger.error(f"A fatal error occurred: {frame}")
|
logger.error(f"A fatal error occurred: {frame}")
|
||||||
# Cancel all tasks downstream.
|
# Cancel all tasks downstream.
|
||||||
await self.queue_frame(CancelFrame())
|
await self.queue_frame(CancelFrame())
|
||||||
# Tell the task we should stop.
|
|
||||||
await self.queue_frame(StopTaskFrame())
|
|
||||||
else:
|
else:
|
||||||
logger.warning(f"{self}: Something went wrong: {frame}")
|
logger.warning(f"{self}: Something went wrong: {frame}")
|
||||||
|
|
||||||
|
|||||||
@@ -15,9 +15,10 @@ service-specific adapter.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import base64
|
import base64
|
||||||
|
import copy
|
||||||
import io
|
import io
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Any, List, Optional, TypeAlias, Union
|
from typing import TYPE_CHECKING, Any, List, Optional, TypeAlias, Union
|
||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from openai._types import NOT_GIVEN as OPEN_AI_NOT_GIVEN
|
from openai._types import NOT_GIVEN as OPEN_AI_NOT_GIVEN
|
||||||
@@ -31,6 +32,9 @@ from PIL import Image
|
|||||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||||
from pipecat.frames.frames import AudioRawFrame
|
from pipecat.frames.frames import AudioRawFrame
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||||
|
|
||||||
# "Re-export" types from OpenAI that we're using as universal context types.
|
# "Re-export" types from OpenAI that we're using as universal context types.
|
||||||
# NOTE: if universal message types need to someday diverge from OpenAI's, we
|
# NOTE: if universal message types need to someday diverge from OpenAI's, we
|
||||||
# should consider managing our own definitions. But we should do so carefully,
|
# should consider managing our own definitions. But we should do so carefully,
|
||||||
@@ -65,6 +69,26 @@ class LLMContext:
|
|||||||
and content formatting.
|
and content formatting.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def from_openai_context(openai_context: "OpenAILLMContext") -> "LLMContext":
|
||||||
|
"""Create a universal LLM context from an OpenAI-specific context.
|
||||||
|
|
||||||
|
NOTE: this should only be used internally, for facilitating migration
|
||||||
|
from OpenAILLMContext to LLMContext. New user code should use
|
||||||
|
LLMContext directly.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
openai_context: The OpenAI LLM context to convert.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
New LLMContext instance with converted messages and settings.
|
||||||
|
"""
|
||||||
|
return LLMContext(
|
||||||
|
messages=openai_context.get_messages(),
|
||||||
|
tools=openai_context.tools,
|
||||||
|
tool_choice=openai_context.tool_choice,
|
||||||
|
)
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
messages: Optional[List[LLMContextMessage]] = None,
|
messages: Optional[List[LLMContextMessage]] = None,
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ LLM processing, and text-to-speech components in conversational AI pipelines.
|
|||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
|
from abc import abstractmethod
|
||||||
from typing import Any, Dict, List, Literal, Optional, Set
|
from typing import Any, Dict, List, Literal, Optional, Set
|
||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
@@ -169,6 +170,11 @@ class LLMContextAggregator(FrameProcessor):
|
|||||||
"""Reset the aggregation state."""
|
"""Reset the aggregation state."""
|
||||||
self._aggregation = ""
|
self._aggregation = ""
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
async def push_aggregation(self):
|
||||||
|
"""Push the current aggregation downstream."""
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class LLMUserAggregator(LLMContextAggregator):
|
class LLMUserAggregator(LLMContextAggregator):
|
||||||
"""User LLM aggregator that processes speech-to-text transcriptions.
|
"""User LLM aggregator that processes speech-to-text transcriptions.
|
||||||
@@ -301,7 +307,7 @@ class LLMUserAggregator(LLMContextAggregator):
|
|||||||
frame = LLMContextFrame(self._context)
|
frame = LLMContextFrame(self._context)
|
||||||
await self.push_frame(frame)
|
await self.push_frame(frame)
|
||||||
|
|
||||||
async def _push_aggregation(self):
|
async def push_aggregation(self):
|
||||||
"""Push the current aggregation based on interruption strategies and conditions."""
|
"""Push the current aggregation based on interruption strategies and conditions."""
|
||||||
if len(self._aggregation) > 0:
|
if len(self._aggregation) > 0:
|
||||||
if self.interruption_strategies and self._bot_speaking:
|
if self.interruption_strategies and self._bot_speaking:
|
||||||
@@ -392,7 +398,7 @@ class LLMUserAggregator(LLMContextAggregator):
|
|||||||
# pushing the aggregation as we will probably get a final transcription.
|
# pushing the aggregation as we will probably get a final transcription.
|
||||||
if len(self._aggregation) > 0:
|
if len(self._aggregation) > 0:
|
||||||
if not self._seen_interim_results:
|
if not self._seen_interim_results:
|
||||||
await self._push_aggregation()
|
await self.push_aggregation()
|
||||||
# Handles the case where both the user and the bot are not speaking,
|
# Handles the case where both the user and the bot are not speaking,
|
||||||
# and the bot was previously speaking before the user interruption.
|
# and the bot was previously speaking before the user interruption.
|
||||||
# So in this case we are resetting the aggregation timer
|
# So in this case we are resetting the aggregation timer
|
||||||
@@ -471,7 +477,7 @@ class LLMUserAggregator(LLMContextAggregator):
|
|||||||
await self._maybe_emulate_user_speaking()
|
await self._maybe_emulate_user_speaking()
|
||||||
except asyncio.TimeoutError:
|
except asyncio.TimeoutError:
|
||||||
if not self._user_speaking:
|
if not self._user_speaking:
|
||||||
await self._push_aggregation()
|
await self.push_aggregation()
|
||||||
|
|
||||||
# If we are emulating VAD we still need to send the user stopped
|
# If we are emulating VAD we still need to send the user stopped
|
||||||
# speaking frame.
|
# speaking frame.
|
||||||
@@ -607,12 +613,12 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
|||||||
elif isinstance(frame, UserImageRawFrame) and frame.request and frame.request.tool_call_id:
|
elif isinstance(frame, UserImageRawFrame) and frame.request and frame.request.tool_call_id:
|
||||||
await self._handle_user_image_frame(frame)
|
await self._handle_user_image_frame(frame)
|
||||||
elif isinstance(frame, BotStoppedSpeakingFrame):
|
elif isinstance(frame, BotStoppedSpeakingFrame):
|
||||||
await self._push_aggregation()
|
await self.push_aggregation()
|
||||||
await self.push_frame(frame, direction)
|
await self.push_frame(frame, direction)
|
||||||
else:
|
else:
|
||||||
await self.push_frame(frame, direction)
|
await self.push_frame(frame, direction)
|
||||||
|
|
||||||
async def _push_aggregation(self):
|
async def push_aggregation(self):
|
||||||
"""Push the current assistant aggregation with timestamp."""
|
"""Push the current assistant aggregation with timestamp."""
|
||||||
if not self._aggregation:
|
if not self._aggregation:
|
||||||
return
|
return
|
||||||
@@ -644,7 +650,7 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
|||||||
await self.push_context_frame(FrameDirection.UPSTREAM)
|
await self.push_context_frame(FrameDirection.UPSTREAM)
|
||||||
|
|
||||||
async def _handle_interruptions(self, frame: InterruptionFrame):
|
async def _handle_interruptions(self, frame: InterruptionFrame):
|
||||||
await self._push_aggregation()
|
await self.push_aggregation()
|
||||||
self._started = 0
|
self._started = 0
|
||||||
await self.reset()
|
await self.reset()
|
||||||
|
|
||||||
@@ -778,7 +784,7 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
|||||||
text=frame.request.context,
|
text=frame.request.context,
|
||||||
)
|
)
|
||||||
|
|
||||||
await self._push_aggregation()
|
await self.push_aggregation()
|
||||||
await self.push_context_frame(FrameDirection.UPSTREAM)
|
await self.push_context_frame(FrameDirection.UPSTREAM)
|
||||||
|
|
||||||
async def _handle_llm_start(self, _: LLMFullResponseStartFrame):
|
async def _handle_llm_start(self, _: LLMFullResponseStartFrame):
|
||||||
@@ -786,7 +792,7 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
|||||||
|
|
||||||
async def _handle_llm_end(self, _: LLMFullResponseEndFrame):
|
async def _handle_llm_end(self, _: LLMFullResponseEndFrame):
|
||||||
self._started -= 1
|
self._started -= 1
|
||||||
await self._push_aggregation()
|
await self.push_aggregation()
|
||||||
|
|
||||||
async def _handle_text(self, frame: TextFrame):
|
async def _handle_text(self, frame: TextFrame):
|
||||||
if not self._started:
|
if not self._started:
|
||||||
|
|||||||
@@ -12,14 +12,14 @@ in conversational pipelines.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
from pipecat.frames.frames import TextFrame
|
from pipecat.frames.frames import TextFrame
|
||||||
from pipecat.processors.aggregators.llm_response import LLMUserContextAggregator
|
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
from pipecat.processors.aggregators.llm_response_universal import LLMUserAggregator
|
||||||
|
|
||||||
|
|
||||||
class UserResponseAggregator(LLMUserContextAggregator):
|
class UserResponseAggregator(LLMUserAggregator):
|
||||||
"""Aggregates user responses into TextFrame objects.
|
"""Aggregates user responses into TextFrame objects.
|
||||||
|
|
||||||
This aggregator extends LLMUserContextAggregator to specifically handle
|
This aggregator extends LLMUserAggregator to specifically handle
|
||||||
user input by collecting text responses and outputting them as TextFrame
|
user input by collecting text responses and outputting them as TextFrame
|
||||||
objects when the aggregation is complete.
|
objects when the aggregation is complete.
|
||||||
"""
|
"""
|
||||||
@@ -28,9 +28,9 @@ class UserResponseAggregator(LLMUserContextAggregator):
|
|||||||
"""Initialize the user response aggregator.
|
"""Initialize the user response aggregator.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
**kwargs: Additional arguments passed to parent LLMUserContextAggregator.
|
**kwargs: Additional arguments passed to parent LLMUserAggregator.
|
||||||
"""
|
"""
|
||||||
super().__init__(context=OpenAILLMContext(), **kwargs)
|
super().__init__(context=LLMContext(), **kwargs)
|
||||||
|
|
||||||
async def push_aggregation(self):
|
async def push_aggregation(self):
|
||||||
"""Push the aggregated user response as a TextFrame.
|
"""Push the aggregated user response as a TextFrame.
|
||||||
|
|||||||
@@ -877,6 +877,8 @@ class FrameProcessor(BaseObject):
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
while True:
|
while True:
|
||||||
|
(frame, direction, callback) = await self.__input_queue.get()
|
||||||
|
|
||||||
if self.__should_block_system_frames and self.__input_event:
|
if self.__should_block_system_frames and self.__input_event:
|
||||||
logger.trace(f"{self}: system frame processing paused")
|
logger.trace(f"{self}: system frame processing paused")
|
||||||
await self.__input_event.wait()
|
await self.__input_event.wait()
|
||||||
@@ -884,8 +886,6 @@ class FrameProcessor(BaseObject):
|
|||||||
self.__should_block_system_frames = False
|
self.__should_block_system_frames = False
|
||||||
logger.trace(f"{self}: system frame processing resumed")
|
logger.trace(f"{self}: system frame processing resumed")
|
||||||
|
|
||||||
(frame, direction, callback) = await self.__input_queue.get()
|
|
||||||
|
|
||||||
if isinstance(frame, SystemFrame):
|
if isinstance(frame, SystemFrame):
|
||||||
await self.__process_frame(frame, direction, callback)
|
await self.__process_frame(frame, direction, callback)
|
||||||
elif self.__process_queue:
|
elif self.__process_queue:
|
||||||
@@ -900,6 +900,8 @@ class FrameProcessor(BaseObject):
|
|||||||
async def __process_frame_task_handler(self):
|
async def __process_frame_task_handler(self):
|
||||||
"""Handle non-system frames from the process queue."""
|
"""Handle non-system frames from the process queue."""
|
||||||
while True:
|
while True:
|
||||||
|
(frame, direction, callback) = await self.__process_queue.get()
|
||||||
|
|
||||||
if self.__should_block_frames and self.__process_event:
|
if self.__should_block_frames and self.__process_event:
|
||||||
logger.trace(f"{self}: frame processing paused")
|
logger.trace(f"{self}: frame processing paused")
|
||||||
await self.__process_event.wait()
|
await self.__process_event.wait()
|
||||||
@@ -907,8 +909,6 @@ class FrameProcessor(BaseObject):
|
|||||||
self.__should_block_frames = False
|
self.__should_block_frames = False
|
||||||
logger.trace(f"{self}: frame processing resumed")
|
logger.trace(f"{self}: frame processing resumed")
|
||||||
|
|
||||||
(frame, direction, callback) = await self.__process_queue.get()
|
|
||||||
|
|
||||||
await self.__process_frame(frame, direction, callback)
|
await self.__process_frame(frame, direction, callback)
|
||||||
|
|
||||||
self.__process_queue.task_done()
|
self.__process_queue.task_done()
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ and frame observation for the RTVI protocol.
|
|||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import base64
|
import base64
|
||||||
|
import time
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import (
|
from typing import (
|
||||||
Any,
|
Any,
|
||||||
@@ -29,6 +30,7 @@ from typing import (
|
|||||||
from loguru import logger
|
from loguru import logger
|
||||||
from pydantic import BaseModel, Field, PrivateAttr, ValidationError
|
from pydantic import BaseModel, Field, PrivateAttr, ValidationError
|
||||||
|
|
||||||
|
from pipecat.audio.utils import calculate_audio_volume
|
||||||
from pipecat.frames.frames import (
|
from pipecat.frames.frames import (
|
||||||
BotStartedSpeakingFrame,
|
BotStartedSpeakingFrame,
|
||||||
BotStoppedSpeakingFrame,
|
BotStoppedSpeakingFrame,
|
||||||
@@ -40,6 +42,7 @@ from pipecat.frames.frames import (
|
|||||||
Frame,
|
Frame,
|
||||||
FunctionCallResultFrame,
|
FunctionCallResultFrame,
|
||||||
InputAudioRawFrame,
|
InputAudioRawFrame,
|
||||||
|
InputTransportMessageFrame,
|
||||||
InterimTranscriptionFrame,
|
InterimTranscriptionFrame,
|
||||||
LLMConfigureOutputFrame,
|
LLMConfigureOutputFrame,
|
||||||
LLMContextFrame,
|
LLMContextFrame,
|
||||||
@@ -48,10 +51,11 @@ from pipecat.frames.frames import (
|
|||||||
LLMMessagesAppendFrame,
|
LLMMessagesAppendFrame,
|
||||||
LLMTextFrame,
|
LLMTextFrame,
|
||||||
MetricsFrame,
|
MetricsFrame,
|
||||||
|
OutputTransportMessageUrgentFrame,
|
||||||
StartFrame,
|
StartFrame,
|
||||||
SystemFrame,
|
SystemFrame,
|
||||||
TranscriptionFrame,
|
TranscriptionFrame,
|
||||||
TransportMessageUrgentFrame,
|
TTSAudioRawFrame,
|
||||||
TTSStartedFrame,
|
TTSStartedFrame,
|
||||||
TTSStoppedFrame,
|
TTSStoppedFrame,
|
||||||
TTSTextFrame,
|
TTSTextFrame,
|
||||||
@@ -613,9 +617,9 @@ class RTVIAppendToContextData(BaseModel):
|
|||||||
|
|
||||||
Contains the role, content, and whether to run the message immediately.
|
Contains the role, content, and whether to run the message immediately.
|
||||||
|
|
||||||
.. deprecated:: 0.0.85
|
.. deprecated:: 0.0.85
|
||||||
The RTVI message, append-to-context, has been deprecated. Use send-text
|
The RTVI message, append-to-context, has been deprecated. Use send-text
|
||||||
or custom client and server messages instead.
|
or custom client and server messages instead.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
role: Literal["user", "assistant"] | str
|
role: Literal["user", "assistant"] | str
|
||||||
@@ -839,6 +843,36 @@ class RTVIServerMessage(BaseModel):
|
|||||||
data: Any
|
data: Any
|
||||||
|
|
||||||
|
|
||||||
|
class RTVIAudioLevelMessageData(BaseModel):
|
||||||
|
"""Data format for sending audio levels."""
|
||||||
|
|
||||||
|
value: float
|
||||||
|
|
||||||
|
|
||||||
|
class RTVIUserAudioLevelMessage(BaseModel):
|
||||||
|
"""Message indicating user audio level."""
|
||||||
|
|
||||||
|
label: RTVIMessageLiteral = RTVI_MESSAGE_LABEL
|
||||||
|
type: Literal["user-audio-level"] = "user-audio-level"
|
||||||
|
data: RTVIAudioLevelMessageData
|
||||||
|
|
||||||
|
|
||||||
|
class RTVIBotAudioLevelMessage(BaseModel):
|
||||||
|
"""Message indicating bot audio level."""
|
||||||
|
|
||||||
|
label: RTVIMessageLiteral = RTVI_MESSAGE_LABEL
|
||||||
|
type: Literal["bot-audio-level"] = "bot-audio-level"
|
||||||
|
data: RTVIAudioLevelMessageData
|
||||||
|
|
||||||
|
|
||||||
|
class RTVISystemLogMessage(BaseModel):
|
||||||
|
"""Message including a system log."""
|
||||||
|
|
||||||
|
label: RTVIMessageLiteral = RTVI_MESSAGE_LABEL
|
||||||
|
type: Literal["system-log"] = "system-log"
|
||||||
|
data: RTVITextMessageData
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class RTVIServerMessageFrame(SystemFrame):
|
class RTVIServerMessageFrame(SystemFrame):
|
||||||
"""A frame for sending server messages to the client.
|
"""A frame for sending server messages to the client.
|
||||||
@@ -858,25 +892,36 @@ class RTVIServerMessageFrame(SystemFrame):
|
|||||||
class RTVIObserverParams:
|
class RTVIObserverParams:
|
||||||
"""Parameters for configuring RTVI Observer behavior.
|
"""Parameters for configuring RTVI Observer behavior.
|
||||||
|
|
||||||
|
.. deprecated:: 0.0.87
|
||||||
|
Parameter `errors_enabled` is deprecated. Error messages are always enabled.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
bot_llm_enabled: Indicates if the bot's LLM messages should be sent.
|
bot_llm_enabled: Indicates if the bot's LLM messages should be sent.
|
||||||
bot_tts_enabled: Indicates if the bot's TTS messages should be sent.
|
bot_tts_enabled: Indicates if the bot's TTS messages should be sent.
|
||||||
bot_speaking_enabled: Indicates if the bot's started/stopped speaking messages should be sent.
|
bot_speaking_enabled: Indicates if the bot's started/stopped speaking messages should be sent.
|
||||||
|
bot_audio_level_enabled: Indicates if bot's audio level messages should be sent.
|
||||||
user_llm_enabled: Indicates if the user's LLM input messages should be sent.
|
user_llm_enabled: Indicates if the user's LLM input messages should be sent.
|
||||||
user_speaking_enabled: Indicates if the user's started/stopped speaking messages should be sent.
|
user_speaking_enabled: Indicates if the user's started/stopped speaking messages should be sent.
|
||||||
user_transcription_enabled: Indicates if user's transcription messages should be sent.
|
user_transcription_enabled: Indicates if user's transcription messages should be sent.
|
||||||
|
user_audio_level_enabled: Indicates if user's audio level messages should be sent.
|
||||||
metrics_enabled: Indicates if metrics messages should be sent.
|
metrics_enabled: Indicates if metrics messages should be sent.
|
||||||
errors_enabled: Indicates if errors messages should be sent.
|
system_logs_enabled: Indicates if system logs should be sent.
|
||||||
|
errors_enabled: [Deprecated] Indicates if errors messages should be sent.
|
||||||
|
audio_level_period_secs: How often audio levels should be sent if enabled.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
bot_llm_enabled: bool = True
|
bot_llm_enabled: bool = True
|
||||||
bot_tts_enabled: bool = True
|
bot_tts_enabled: bool = True
|
||||||
bot_speaking_enabled: bool = True
|
bot_speaking_enabled: bool = True
|
||||||
|
bot_audio_level_enabled: bool = False
|
||||||
user_llm_enabled: bool = True
|
user_llm_enabled: bool = True
|
||||||
user_speaking_enabled: bool = True
|
user_speaking_enabled: bool = True
|
||||||
user_transcription_enabled: bool = True
|
user_transcription_enabled: bool = True
|
||||||
|
user_audio_level_enabled: bool = False
|
||||||
metrics_enabled: bool = True
|
metrics_enabled: bool = True
|
||||||
errors_enabled: bool = True
|
system_logs_enabled: bool = False
|
||||||
|
errors_enabled: Optional[bool] = None
|
||||||
|
audio_level_period_secs: float = 0.15
|
||||||
|
|
||||||
|
|
||||||
class RTVIObserver(BaseObserver):
|
class RTVIObserver(BaseObserver):
|
||||||
@@ -892,7 +937,11 @@ class RTVIObserver(BaseObserver):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, rtvi: "RTVIProcessor", *, params: Optional[RTVIObserverParams] = None, **kwargs
|
self,
|
||||||
|
rtvi: Optional["RTVIProcessor"] = None,
|
||||||
|
*,
|
||||||
|
params: Optional[RTVIObserverParams] = None,
|
||||||
|
**kwargs,
|
||||||
):
|
):
|
||||||
"""Initialize the RTVI observer.
|
"""Initialize the RTVI observer.
|
||||||
|
|
||||||
@@ -904,9 +953,50 @@ class RTVIObserver(BaseObserver):
|
|||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self._rtvi = rtvi
|
self._rtvi = rtvi
|
||||||
self._params = params or RTVIObserverParams()
|
self._params = params or RTVIObserverParams()
|
||||||
self._bot_transcription = ""
|
|
||||||
self._frames_seen = set()
|
self._frames_seen = set()
|
||||||
rtvi.set_errors_enabled(self._params.errors_enabled)
|
|
||||||
|
self._bot_transcription = ""
|
||||||
|
self._last_user_audio_level = 0
|
||||||
|
self._last_bot_audio_level = 0
|
||||||
|
|
||||||
|
if self._params.system_logs_enabled:
|
||||||
|
self._system_logger_id = logger.add(self._logger_sink)
|
||||||
|
|
||||||
|
if self._params.errors_enabled is not None:
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
warnings.simplefilter("always")
|
||||||
|
warnings.warn(
|
||||||
|
"Parameter `errors_enabled` is deprecated. Error messages are always enabled.",
|
||||||
|
DeprecationWarning,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def _logger_sink(self, message):
|
||||||
|
"""Logger sink so we can send system logs to RTVI clients."""
|
||||||
|
message = RTVISystemLogMessage(data=RTVITextMessageData(text=message))
|
||||||
|
await self.send_rtvi_message(message)
|
||||||
|
|
||||||
|
async def cleanup(self):
|
||||||
|
"""Cleanup RTVI observer resources."""
|
||||||
|
await super().cleanup()
|
||||||
|
if self._params.system_logs_enabled:
|
||||||
|
logger.remove(self._system_logger_id)
|
||||||
|
|
||||||
|
async def send_rtvi_message(self, model: BaseModel, exclude_none: bool = True):
|
||||||
|
"""Send an RTVI message.
|
||||||
|
|
||||||
|
By default, we push a transport frame. But this function can be
|
||||||
|
overridden by subclasses to send RTVI messages in different ways.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model: The message to send.
|
||||||
|
exclude_none: Whether to exclude None values from the model dump.
|
||||||
|
|
||||||
|
"""
|
||||||
|
if self._rtvi:
|
||||||
|
await self._rtvi.push_transport_message(model, exclude_none)
|
||||||
|
|
||||||
async def on_push_frame(self, data: FramePushed):
|
async def on_push_frame(self, data: FramePushed):
|
||||||
"""Process a frame being pushed through the pipeline.
|
"""Process a frame being pushed through the pipeline.
|
||||||
@@ -948,52 +1038,58 @@ class RTVIObserver(BaseObserver):
|
|||||||
):
|
):
|
||||||
await self._handle_context(frame)
|
await self._handle_context(frame)
|
||||||
elif isinstance(frame, LLMFullResponseStartFrame) and self._params.bot_llm_enabled:
|
elif isinstance(frame, LLMFullResponseStartFrame) and self._params.bot_llm_enabled:
|
||||||
await self.push_transport_message_urgent(RTVIBotLLMStartedMessage())
|
await self.send_rtvi_message(RTVIBotLLMStartedMessage())
|
||||||
elif isinstance(frame, LLMFullResponseEndFrame) and self._params.bot_llm_enabled:
|
elif isinstance(frame, LLMFullResponseEndFrame) and self._params.bot_llm_enabled:
|
||||||
await self.push_transport_message_urgent(RTVIBotLLMStoppedMessage())
|
await self.send_rtvi_message(RTVIBotLLMStoppedMessage())
|
||||||
elif isinstance(frame, LLMTextFrame) and self._params.bot_llm_enabled:
|
elif isinstance(frame, LLMTextFrame) and self._params.bot_llm_enabled:
|
||||||
await self._handle_llm_text_frame(frame)
|
await self._handle_llm_text_frame(frame)
|
||||||
elif isinstance(frame, TTSStartedFrame) and self._params.bot_tts_enabled:
|
elif isinstance(frame, TTSStartedFrame) and self._params.bot_tts_enabled:
|
||||||
await self.push_transport_message_urgent(RTVIBotTTSStartedMessage())
|
await self.send_rtvi_message(RTVIBotTTSStartedMessage())
|
||||||
elif isinstance(frame, TTSStoppedFrame) and self._params.bot_tts_enabled:
|
elif isinstance(frame, TTSStoppedFrame) and self._params.bot_tts_enabled:
|
||||||
await self.push_transport_message_urgent(RTVIBotTTSStoppedMessage())
|
await self.send_rtvi_message(RTVIBotTTSStoppedMessage())
|
||||||
elif isinstance(frame, TTSTextFrame) and self._params.bot_tts_enabled:
|
elif isinstance(frame, TTSTextFrame) and self._params.bot_tts_enabled:
|
||||||
if isinstance(src, BaseOutputTransport):
|
if isinstance(src, BaseOutputTransport):
|
||||||
message = RTVIBotTTSTextMessage(data=RTVITextMessageData(text=frame.text))
|
message = RTVIBotTTSTextMessage(data=RTVITextMessageData(text=frame.text))
|
||||||
await self.push_transport_message_urgent(message)
|
await self.send_rtvi_message(message)
|
||||||
else:
|
else:
|
||||||
mark_as_seen = False
|
mark_as_seen = False
|
||||||
elif isinstance(frame, MetricsFrame) and self._params.metrics_enabled:
|
elif isinstance(frame, MetricsFrame) and self._params.metrics_enabled:
|
||||||
await self._handle_metrics(frame)
|
await self._handle_metrics(frame)
|
||||||
elif isinstance(frame, RTVIServerMessageFrame):
|
elif isinstance(frame, RTVIServerMessageFrame):
|
||||||
message = RTVIServerMessage(data=frame.data)
|
message = RTVIServerMessage(data=frame.data)
|
||||||
await self.push_transport_message_urgent(message)
|
await self.send_rtvi_message(message)
|
||||||
elif isinstance(frame, RTVIServerResponseFrame):
|
elif isinstance(frame, RTVIServerResponseFrame):
|
||||||
if frame.error is not None:
|
if frame.error is not None:
|
||||||
await self._send_error_response(frame)
|
await self._send_error_response(frame)
|
||||||
else:
|
else:
|
||||||
await self._send_server_response(frame)
|
await self._send_server_response(frame)
|
||||||
|
elif isinstance(frame, InputAudioRawFrame) and self._params.user_audio_level_enabled:
|
||||||
|
curr_time = time.time()
|
||||||
|
diff_time = curr_time - self._last_user_audio_level
|
||||||
|
if diff_time > self._params.audio_level_period_secs:
|
||||||
|
level = calculate_audio_volume(frame.audio, frame.sample_rate)
|
||||||
|
message = RTVIUserAudioLevelMessage(data=RTVIAudioLevelMessageData(value=level))
|
||||||
|
await self.send_rtvi_message(message)
|
||||||
|
self._last_user_audio_level = curr_time
|
||||||
|
elif isinstance(frame, TTSAudioRawFrame) and self._params.bot_audio_level_enabled:
|
||||||
|
curr_time = time.time()
|
||||||
|
diff_time = curr_time - self._last_bot_audio_level
|
||||||
|
if diff_time > self._params.audio_level_period_secs:
|
||||||
|
level = calculate_audio_volume(frame.audio, frame.sample_rate)
|
||||||
|
message = RTVIBotAudioLevelMessage(data=RTVIAudioLevelMessageData(value=level))
|
||||||
|
await self.send_rtvi_message(message)
|
||||||
|
self._last_bot_audio_level = curr_time
|
||||||
|
|
||||||
if mark_as_seen:
|
if mark_as_seen:
|
||||||
self._frames_seen.add(frame.id)
|
self._frames_seen.add(frame.id)
|
||||||
|
|
||||||
async def push_transport_message_urgent(self, model: BaseModel, exclude_none: bool = True):
|
|
||||||
"""Push an urgent transport message to the RTVI processor.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
model: The message model to send.
|
|
||||||
exclude_none: Whether to exclude None values from the model dump.
|
|
||||||
"""
|
|
||||||
frame = TransportMessageUrgentFrame(message=model.model_dump(exclude_none=exclude_none))
|
|
||||||
await self._rtvi.push_frame(frame)
|
|
||||||
|
|
||||||
async def _push_bot_transcription(self):
|
async def _push_bot_transcription(self):
|
||||||
"""Push accumulated bot transcription as a message."""
|
"""Push accumulated bot transcription as a message."""
|
||||||
if len(self._bot_transcription) > 0:
|
if len(self._bot_transcription) > 0:
|
||||||
message = RTVIBotTranscriptionMessage(
|
message = RTVIBotTranscriptionMessage(
|
||||||
data=RTVITextMessageData(text=self._bot_transcription)
|
data=RTVITextMessageData(text=self._bot_transcription)
|
||||||
)
|
)
|
||||||
await self.push_transport_message_urgent(message)
|
await self.send_rtvi_message(message)
|
||||||
self._bot_transcription = ""
|
self._bot_transcription = ""
|
||||||
|
|
||||||
async def _handle_interruptions(self, frame: Frame):
|
async def _handle_interruptions(self, frame: Frame):
|
||||||
@@ -1005,7 +1101,7 @@ class RTVIObserver(BaseObserver):
|
|||||||
message = RTVIUserStoppedSpeakingMessage()
|
message = RTVIUserStoppedSpeakingMessage()
|
||||||
|
|
||||||
if message:
|
if message:
|
||||||
await self.push_transport_message_urgent(message)
|
await self.send_rtvi_message(message)
|
||||||
|
|
||||||
async def _handle_bot_speaking(self, frame: Frame):
|
async def _handle_bot_speaking(self, frame: Frame):
|
||||||
"""Handle bot speaking event frames."""
|
"""Handle bot speaking event frames."""
|
||||||
@@ -1016,12 +1112,12 @@ class RTVIObserver(BaseObserver):
|
|||||||
message = RTVIBotStoppedSpeakingMessage()
|
message = RTVIBotStoppedSpeakingMessage()
|
||||||
|
|
||||||
if message:
|
if message:
|
||||||
await self.push_transport_message_urgent(message)
|
await self.send_rtvi_message(message)
|
||||||
|
|
||||||
async def _handle_llm_text_frame(self, frame: LLMTextFrame):
|
async def _handle_llm_text_frame(self, frame: LLMTextFrame):
|
||||||
"""Handle LLM text output frames."""
|
"""Handle LLM text output frames."""
|
||||||
message = RTVIBotLLMTextMessage(data=RTVITextMessageData(text=frame.text))
|
message = RTVIBotLLMTextMessage(data=RTVITextMessageData(text=frame.text))
|
||||||
await self.push_transport_message_urgent(message)
|
await self.send_rtvi_message(message)
|
||||||
|
|
||||||
self._bot_transcription += frame.text
|
self._bot_transcription += frame.text
|
||||||
if match_endofsentence(self._bot_transcription):
|
if match_endofsentence(self._bot_transcription):
|
||||||
@@ -1044,7 +1140,7 @@ class RTVIObserver(BaseObserver):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if message:
|
if message:
|
||||||
await self.push_transport_message_urgent(message)
|
await self.send_rtvi_message(message)
|
||||||
|
|
||||||
async def _handle_context(self, frame: OpenAILLMContextFrame | LLMContextFrame):
|
async def _handle_context(self, frame: OpenAILLMContextFrame | LLMContextFrame):
|
||||||
"""Process LLM context frames to extract user messages for the RTVI client."""
|
"""Process LLM context frames to extract user messages for the RTVI client."""
|
||||||
@@ -1064,7 +1160,7 @@ class RTVIObserver(BaseObserver):
|
|||||||
text = "".join(part.text for part in message.parts if hasattr(part, "text"))
|
text = "".join(part.text for part in message.parts if hasattr(part, "text"))
|
||||||
if text:
|
if text:
|
||||||
rtvi_message = RTVIUserLLMTextMessage(data=RTVITextMessageData(text=text))
|
rtvi_message = RTVIUserLLMTextMessage(data=RTVITextMessageData(text=text))
|
||||||
await self.push_transport_message_urgent(rtvi_message)
|
await self.send_rtvi_message(rtvi_message)
|
||||||
|
|
||||||
# Handle OpenAI format (original implementation)
|
# Handle OpenAI format (original implementation)
|
||||||
elif isinstance(message, dict):
|
elif isinstance(message, dict):
|
||||||
@@ -1075,7 +1171,7 @@ class RTVIObserver(BaseObserver):
|
|||||||
else:
|
else:
|
||||||
text = content
|
text = content
|
||||||
rtvi_message = RTVIUserLLMTextMessage(data=RTVITextMessageData(text=text))
|
rtvi_message = RTVIUserLLMTextMessage(data=RTVITextMessageData(text=text))
|
||||||
await self.push_transport_message_urgent(rtvi_message)
|
await self.send_rtvi_message(rtvi_message)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Caught an error while trying to handle context: {e}")
|
logger.warning(f"Caught an error while trying to handle context: {e}")
|
||||||
@@ -1102,7 +1198,7 @@ class RTVIObserver(BaseObserver):
|
|||||||
metrics["characters"].append(d.model_dump(exclude_none=True))
|
metrics["characters"].append(d.model_dump(exclude_none=True))
|
||||||
|
|
||||||
message = RTVIMetricsMessage(data=metrics)
|
message = RTVIMetricsMessage(data=metrics)
|
||||||
await self.push_transport_message_urgent(message)
|
await self.send_rtvi_message(message)
|
||||||
|
|
||||||
async def _send_server_response(self, frame: RTVIServerResponseFrame):
|
async def _send_server_response(self, frame: RTVIServerResponseFrame):
|
||||||
"""Send a response to the client for a specific request."""
|
"""Send a response to the client for a specific request."""
|
||||||
@@ -1110,15 +1206,14 @@ class RTVIObserver(BaseObserver):
|
|||||||
id=str(frame.client_msg.msg_id),
|
id=str(frame.client_msg.msg_id),
|
||||||
data=RTVIRawServerResponseData(t=frame.client_msg.type, d=frame.data),
|
data=RTVIRawServerResponseData(t=frame.client_msg.type, d=frame.data),
|
||||||
)
|
)
|
||||||
await self.push_transport_message_urgent(message)
|
await self.send_rtvi_message(message)
|
||||||
|
|
||||||
async def _send_error_response(self, frame: RTVIServerResponseFrame):
|
async def _send_error_response(self, frame: RTVIServerResponseFrame):
|
||||||
"""Send a response to the client for a specific request."""
|
"""Send a response to the client for a specific request."""
|
||||||
if self._params.errors_enabled:
|
message = RTVIErrorResponse(
|
||||||
message = RTVIErrorResponse(
|
id=str(frame.client_msg.msg_id), data=RTVIErrorResponseData(error=frame.error)
|
||||||
id=str(frame.client_msg.msg_id), data=RTVIErrorResponseData(error=frame.error)
|
)
|
||||||
)
|
await self.send_rtvi_message(message)
|
||||||
await self.push_transport_message_urgent(message)
|
|
||||||
|
|
||||||
|
|
||||||
class RTVIProcessor(FrameProcessor):
|
class RTVIProcessor(FrameProcessor):
|
||||||
@@ -1152,7 +1247,6 @@ class RTVIProcessor(FrameProcessor):
|
|||||||
# Default to 0.3.0 which is the last version before actually having a
|
# Default to 0.3.0 which is the last version before actually having a
|
||||||
# "client-version".
|
# "client-version".
|
||||||
self._client_version = [0, 3, 0]
|
self._client_version = [0, 3, 0]
|
||||||
self._errors_enabled = True
|
|
||||||
self._skip_tts: bool = False # Keep in sync with llm_service.py
|
self._skip_tts: bool = False # Keep in sync with llm_service.py
|
||||||
|
|
||||||
self._registered_actions: Dict[str, RTVIAction] = {}
|
self._registered_actions: Dict[str, RTVIAction] = {}
|
||||||
@@ -1222,14 +1316,6 @@ class RTVIProcessor(FrameProcessor):
|
|||||||
await self._update_config(self._config, False)
|
await self._update_config(self._config, False)
|
||||||
await self._send_bot_ready()
|
await self._send_bot_ready()
|
||||||
|
|
||||||
def set_errors_enabled(self, enabled: bool):
|
|
||||||
"""Enable or disable error message sending.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
enabled: Whether to send error messages.
|
|
||||||
"""
|
|
||||||
self._errors_enabled = enabled
|
|
||||||
|
|
||||||
async def interrupt_bot(self):
|
async def interrupt_bot(self):
|
||||||
"""Send a bot interruption frame upstream."""
|
"""Send a bot interruption frame upstream."""
|
||||||
await self.push_interruption_task_frame_and_wait()
|
await self.push_interruption_task_frame_and_wait()
|
||||||
@@ -1258,6 +1344,13 @@ class RTVIProcessor(FrameProcessor):
|
|||||||
"""
|
"""
|
||||||
await self._send_error_frame(ErrorFrame(error=error))
|
await self._send_error_frame(ErrorFrame(error=error))
|
||||||
|
|
||||||
|
async def push_transport_message(self, model: BaseModel, exclude_none: bool = True):
|
||||||
|
"""Push a transport message frame."""
|
||||||
|
frame = OutputTransportMessageUrgentFrame(
|
||||||
|
message=model.model_dump(exclude_none=exclude_none)
|
||||||
|
)
|
||||||
|
await self.push_frame(frame)
|
||||||
|
|
||||||
async def handle_message(self, message: RTVIMessage):
|
async def handle_message(self, message: RTVIMessage):
|
||||||
"""Handle an incoming RTVI message.
|
"""Handle an incoming RTVI message.
|
||||||
|
|
||||||
@@ -1278,7 +1371,7 @@ class RTVIProcessor(FrameProcessor):
|
|||||||
args=params.arguments,
|
args=params.arguments,
|
||||||
)
|
)
|
||||||
message = RTVILLMFunctionCallMessage(data=fn)
|
message = RTVILLMFunctionCallMessage(data=fn)
|
||||||
await self._push_transport_message(message, exclude_none=False)
|
await self.push_transport_message(message, exclude_none=False)
|
||||||
|
|
||||||
async def handle_function_call_start(
|
async def handle_function_call_start(
|
||||||
self, function_name: str, llm: FrameProcessor, context: OpenAILLMContext
|
self, function_name: str, llm: FrameProcessor, context: OpenAILLMContext
|
||||||
@@ -1305,7 +1398,7 @@ class RTVIProcessor(FrameProcessor):
|
|||||||
|
|
||||||
fn = RTVILLMFunctionCallStartMessageData(function_name=function_name)
|
fn = RTVILLMFunctionCallStartMessageData(function_name=function_name)
|
||||||
message = RTVILLMFunctionCallStartMessage(data=fn)
|
message = RTVILLMFunctionCallStartMessage(data=fn)
|
||||||
await self._push_transport_message(message, exclude_none=False)
|
await self.push_transport_message(message, exclude_none=False)
|
||||||
|
|
||||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||||
"""Process incoming frames through the RTVI processor.
|
"""Process incoming frames through the RTVI processor.
|
||||||
@@ -1328,7 +1421,7 @@ class RTVIProcessor(FrameProcessor):
|
|||||||
elif isinstance(frame, ErrorFrame):
|
elif isinstance(frame, ErrorFrame):
|
||||||
await self._send_error_frame(frame)
|
await self._send_error_frame(frame)
|
||||||
await self.push_frame(frame, direction)
|
await self.push_frame(frame, direction)
|
||||||
elif isinstance(frame, TransportMessageUrgentFrame):
|
elif isinstance(frame, InputTransportMessageFrame):
|
||||||
await self._handle_transport_message(frame)
|
await self._handle_transport_message(frame)
|
||||||
# All other system frames
|
# All other system frames
|
||||||
elif isinstance(frame, SystemFrame):
|
elif isinstance(frame, SystemFrame):
|
||||||
@@ -1377,11 +1470,6 @@ class RTVIProcessor(FrameProcessor):
|
|||||||
await self.cancel_task(self._message_task)
|
await self.cancel_task(self._message_task)
|
||||||
self._message_task = None
|
self._message_task = None
|
||||||
|
|
||||||
async def _push_transport_message(self, model: BaseModel, exclude_none: bool = True):
|
|
||||||
"""Push a transport message frame."""
|
|
||||||
frame = TransportMessageUrgentFrame(message=model.model_dump(exclude_none=exclude_none))
|
|
||||||
await self.push_frame(frame)
|
|
||||||
|
|
||||||
async def _action_task_handler(self):
|
async def _action_task_handler(self):
|
||||||
"""Handle incoming action frames."""
|
"""Handle incoming action frames."""
|
||||||
while True:
|
while True:
|
||||||
@@ -1396,7 +1484,7 @@ class RTVIProcessor(FrameProcessor):
|
|||||||
await self._handle_message(message)
|
await self._handle_message(message)
|
||||||
self._message_queue.task_done()
|
self._message_queue.task_done()
|
||||||
|
|
||||||
async def _handle_transport_message(self, frame: TransportMessageUrgentFrame):
|
async def _handle_transport_message(self, frame: InputTransportMessageFrame):
|
||||||
"""Handle an incoming transport message frame."""
|
"""Handle an incoming transport message frame."""
|
||||||
try:
|
try:
|
||||||
transport_message = frame.message
|
transport_message = frame.message
|
||||||
@@ -1518,7 +1606,7 @@ class RTVIProcessor(FrameProcessor):
|
|||||||
|
|
||||||
services = list(self._registered_services.values())
|
services = list(self._registered_services.values())
|
||||||
message = RTVIDescribeConfig(id=request_id, data=RTVIDescribeConfigData(config=services))
|
message = RTVIDescribeConfig(id=request_id, data=RTVIDescribeConfigData(config=services))
|
||||||
await self._push_transport_message(message)
|
await self.push_transport_message(message)
|
||||||
|
|
||||||
async def _handle_describe_actions(self, request_id: str):
|
async def _handle_describe_actions(self, request_id: str):
|
||||||
"""Handle a describe-actions request."""
|
"""Handle a describe-actions request."""
|
||||||
@@ -1533,7 +1621,7 @@ class RTVIProcessor(FrameProcessor):
|
|||||||
|
|
||||||
actions = list(self._registered_actions.values())
|
actions = list(self._registered_actions.values())
|
||||||
message = RTVIDescribeActions(id=request_id, data=RTVIDescribeActionsData(actions=actions))
|
message = RTVIDescribeActions(id=request_id, data=RTVIDescribeActionsData(actions=actions))
|
||||||
await self._push_transport_message(message)
|
await self.push_transport_message(message)
|
||||||
|
|
||||||
async def _handle_get_config(self, request_id: str):
|
async def _handle_get_config(self, request_id: str):
|
||||||
"""Handle a get-config request."""
|
"""Handle a get-config request."""
|
||||||
@@ -1547,7 +1635,7 @@ class RTVIProcessor(FrameProcessor):
|
|||||||
)
|
)
|
||||||
|
|
||||||
message = RTVIConfigResponse(id=request_id, data=self._config)
|
message = RTVIConfigResponse(id=request_id, data=self._config)
|
||||||
await self._push_transport_message(message)
|
await self.push_transport_message(message)
|
||||||
|
|
||||||
def _update_config_option(self, service: str, config: RTVIServiceOptionConfig):
|
def _update_config_option(self, service: str, config: RTVIServiceOptionConfig):
|
||||||
"""Update a specific configuration option."""
|
"""Update a specific configuration option."""
|
||||||
@@ -1672,7 +1760,7 @@ class RTVIProcessor(FrameProcessor):
|
|||||||
# action responses (such as webhooks) don't set a request_id
|
# action responses (such as webhooks) don't set a request_id
|
||||||
if request_id:
|
if request_id:
|
||||||
message = RTVIActionResponse(id=request_id, data=RTVIActionResponseData(result=result))
|
message = RTVIActionResponse(id=request_id, data=RTVIActionResponseData(result=result))
|
||||||
await self._push_transport_message(message)
|
await self.push_transport_message(message)
|
||||||
|
|
||||||
async def _send_bot_ready(self):
|
async def _send_bot_ready(self):
|
||||||
"""Send the bot-ready message to the client."""
|
"""Send the bot-ready message to the client."""
|
||||||
@@ -1683,23 +1771,21 @@ class RTVIProcessor(FrameProcessor):
|
|||||||
id=self._client_ready_id,
|
id=self._client_ready_id,
|
||||||
data=RTVIBotReadyData(version=RTVI_PROTOCOL_VERSION, config=config),
|
data=RTVIBotReadyData(version=RTVI_PROTOCOL_VERSION, config=config),
|
||||||
)
|
)
|
||||||
await self._push_transport_message(message)
|
await self.push_transport_message(message)
|
||||||
|
|
||||||
async def _send_server_message(self, message: RTVIServerMessage | RTVIServerResponse):
|
async def _send_server_message(self, message: RTVIServerMessage | RTVIServerResponse):
|
||||||
"""Send a message or response to the client."""
|
"""Send a message or response to the client."""
|
||||||
await self._push_transport_message(message)
|
await self.push_transport_message(message)
|
||||||
|
|
||||||
async def _send_error_frame(self, frame: ErrorFrame):
|
async def _send_error_frame(self, frame: ErrorFrame):
|
||||||
"""Send an error frame as an RTVI error message."""
|
"""Send an error frame as an RTVI error message."""
|
||||||
if self._errors_enabled:
|
message = RTVIError(data=RTVIErrorData(error=frame.error, fatal=frame.fatal))
|
||||||
message = RTVIError(data=RTVIErrorData(error=frame.error, fatal=frame.fatal))
|
await self.push_transport_message(message)
|
||||||
await self._push_transport_message(message)
|
|
||||||
|
|
||||||
async def _send_error_response(self, id: str, error: str):
|
async def _send_error_response(self, id: str, error: str):
|
||||||
"""Send an error response message."""
|
"""Send an error response message."""
|
||||||
if self._errors_enabled:
|
message = RTVIErrorResponse(id=id, data=RTVIErrorResponseData(error=error))
|
||||||
message = RTVIErrorResponse(id=id, data=RTVIErrorResponseData(error=error))
|
await self.push_transport_message(message)
|
||||||
await self._push_transport_message(message)
|
|
||||||
|
|
||||||
def _action_id(self, service: str, action: str) -> str:
|
def _action_id(self, service: str, action: str) -> str:
|
||||||
"""Generate an action ID from service and action names."""
|
"""Generate an action ID from service and action names."""
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ from pipecat.frames.frames import (
|
|||||||
Frame,
|
Frame,
|
||||||
InputAudioRawFrame,
|
InputAudioRawFrame,
|
||||||
OutputAudioRawFrame,
|
OutputAudioRawFrame,
|
||||||
TransportMessageFrame,
|
UserSpeakingFrame,
|
||||||
)
|
)
|
||||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||||
|
|
||||||
@@ -36,9 +36,9 @@ class FrameLogger(FrameProcessor):
|
|||||||
color: Optional[str] = None,
|
color: Optional[str] = None,
|
||||||
ignored_frame_types: Tuple[Type[Frame], ...] = (
|
ignored_frame_types: Tuple[Type[Frame], ...] = (
|
||||||
BotSpeakingFrame,
|
BotSpeakingFrame,
|
||||||
|
UserSpeakingFrame,
|
||||||
InputAudioRawFrame,
|
InputAudioRawFrame,
|
||||||
OutputAudioRawFrame,
|
OutputAudioRawFrame,
|
||||||
TransportMessageFrame,
|
|
||||||
),
|
),
|
||||||
):
|
):
|
||||||
"""Initialize the frame logger.
|
"""Initialize the frame logger.
|
||||||
|
|||||||
@@ -82,6 +82,7 @@ async def configure(
|
|||||||
sip_enable_video: Optional[bool] = False,
|
sip_enable_video: Optional[bool] = False,
|
||||||
sip_num_endpoints: Optional[int] = 1,
|
sip_num_endpoints: Optional[int] = 1,
|
||||||
sip_codecs: Optional[Dict[str, List[str]]] = None,
|
sip_codecs: Optional[Dict[str, List[str]]] = None,
|
||||||
|
room_properties: Optional[DailyRoomProperties] = None,
|
||||||
) -> DailyRoomConfig:
|
) -> DailyRoomConfig:
|
||||||
"""Configure Daily room URL and token with optional SIP capabilities.
|
"""Configure Daily room URL and token with optional SIP capabilities.
|
||||||
|
|
||||||
@@ -99,6 +100,10 @@ async def configure(
|
|||||||
sip_num_endpoints: Number of allowed SIP endpoints.
|
sip_num_endpoints: Number of allowed SIP endpoints.
|
||||||
sip_codecs: Codecs to support for audio and video. If None, uses Daily defaults.
|
sip_codecs: Codecs to support for audio and video. If None, uses Daily defaults.
|
||||||
Example: {"audio": ["OPUS"], "video": ["H264"]}
|
Example: {"audio": ["OPUS"], "video": ["H264"]}
|
||||||
|
room_properties: Optional DailyRoomProperties to use instead of building from
|
||||||
|
individual parameters. When provided, this overrides room_exp_duration and
|
||||||
|
SIP-related parameters. If not provided, properties are built from the
|
||||||
|
individual parameters as before.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
DailyRoomConfig: Object with room_url, token, and optional sip_endpoint.
|
DailyRoomConfig: Object with room_url, token, and optional sip_endpoint.
|
||||||
@@ -115,6 +120,13 @@ async def configure(
|
|||||||
# SIP-enabled room
|
# SIP-enabled room
|
||||||
sip_config = await configure(session, sip_caller_phone="+15551234567")
|
sip_config = await configure(session, sip_caller_phone="+15551234567")
|
||||||
print(f"SIP endpoint: {sip_config.sip_endpoint}")
|
print(f"SIP endpoint: {sip_config.sip_endpoint}")
|
||||||
|
|
||||||
|
# Custom room properties with recording enabled
|
||||||
|
custom_props = DailyRoomProperties(
|
||||||
|
enable_recording="cloud",
|
||||||
|
max_participants=2,
|
||||||
|
)
|
||||||
|
config = await configure(session, room_properties=custom_props)
|
||||||
"""
|
"""
|
||||||
# Check for required API key
|
# Check for required API key
|
||||||
api_key = os.getenv("DAILY_API_KEY")
|
api_key = os.getenv("DAILY_API_KEY")
|
||||||
@@ -124,9 +136,32 @@ async def configure(
|
|||||||
"Get your API key from https://dashboard.daily.co/developers"
|
"Get your API key from https://dashboard.daily.co/developers"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Warn if both room_properties and individual parameters are provided
|
||||||
|
if room_properties is not None:
|
||||||
|
individual_params_provided = any(
|
||||||
|
[
|
||||||
|
room_exp_duration != 2.0,
|
||||||
|
token_exp_duration != 2.0,
|
||||||
|
sip_caller_phone is not None,
|
||||||
|
sip_enable_video is not False,
|
||||||
|
sip_num_endpoints != 1,
|
||||||
|
sip_codecs is not None,
|
||||||
|
]
|
||||||
|
)
|
||||||
|
if individual_params_provided:
|
||||||
|
logger.warning(
|
||||||
|
"Both room_properties and individual parameters (room_exp_duration, token_exp_duration, "
|
||||||
|
"sip_*) were provided. The room_properties will be used and individual parameters "
|
||||||
|
"will be ignored."
|
||||||
|
)
|
||||||
|
|
||||||
# Determine if SIP mode is enabled
|
# Determine if SIP mode is enabled
|
||||||
sip_enabled = sip_caller_phone is not None
|
sip_enabled = sip_caller_phone is not None
|
||||||
|
|
||||||
|
# If room_properties is provided, check if it has SIP configuration
|
||||||
|
if room_properties and room_properties.sip:
|
||||||
|
sip_enabled = True
|
||||||
|
|
||||||
daily_rest_helper = DailyRESTHelper(
|
daily_rest_helper = DailyRESTHelper(
|
||||||
daily_api_key=api_key,
|
daily_api_key=api_key,
|
||||||
daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"),
|
daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"),
|
||||||
@@ -150,27 +185,29 @@ async def configure(
|
|||||||
room_name = f"{room_prefix}-{uuid.uuid4().hex[:8]}"
|
room_name = f"{room_prefix}-{uuid.uuid4().hex[:8]}"
|
||||||
logger.info(f"Creating new Daily room: {room_name}")
|
logger.info(f"Creating new Daily room: {room_name}")
|
||||||
|
|
||||||
# Calculate expiration time
|
# Use provided room_properties or build from parameters
|
||||||
expiration_time = time.time() + (room_exp_duration * 60 * 60)
|
if room_properties is None:
|
||||||
|
# Calculate expiration time
|
||||||
|
expiration_time = time.time() + (room_exp_duration * 60 * 60)
|
||||||
|
|
||||||
# Create room properties
|
# Create room properties
|
||||||
room_properties = DailyRoomProperties(
|
room_properties = DailyRoomProperties(
|
||||||
exp=expiration_time,
|
exp=expiration_time,
|
||||||
eject_at_room_exp=True,
|
eject_at_room_exp=True,
|
||||||
)
|
|
||||||
|
|
||||||
# Add SIP configuration if enabled
|
|
||||||
if sip_enabled:
|
|
||||||
sip_params = DailyRoomSipParams(
|
|
||||||
display_name=sip_caller_phone,
|
|
||||||
video=sip_enable_video,
|
|
||||||
sip_mode="dial-in",
|
|
||||||
num_endpoints=sip_num_endpoints,
|
|
||||||
codecs=sip_codecs,
|
|
||||||
)
|
)
|
||||||
room_properties.sip = sip_params
|
|
||||||
room_properties.enable_dialout = True # Enable outbound calls if needed
|
# Add SIP configuration if enabled
|
||||||
room_properties.start_video_off = not sip_enable_video # Voice-only by default
|
if sip_enabled:
|
||||||
|
sip_params = DailyRoomSipParams(
|
||||||
|
display_name=sip_caller_phone,
|
||||||
|
video=sip_enable_video,
|
||||||
|
sip_mode="dial-in",
|
||||||
|
num_endpoints=sip_num_endpoints,
|
||||||
|
codecs=sip_codecs,
|
||||||
|
)
|
||||||
|
room_properties.sip = sip_params
|
||||||
|
room_properties.enable_dialout = True # Enable outbound calls if needed
|
||||||
|
room_properties.start_video_off = not sip_enable_video # Voice-only by default
|
||||||
|
|
||||||
# Create room parameters
|
# Create room parameters
|
||||||
room_params = DailyRoomParams(name=room_name, properties=room_properties)
|
room_params = DailyRoomParams(name=room_name, properties=room_properties)
|
||||||
|
|||||||
@@ -67,10 +67,17 @@ To run locally:
|
|||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import mimetypes
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import uuid
|
||||||
from contextlib import asynccontextmanager
|
from contextlib import asynccontextmanager
|
||||||
|
from http import HTTPMethod
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List, Optional, TypedDict
|
||||||
|
|
||||||
|
import aiohttp
|
||||||
|
from fastapi.responses import FileResponse, Response
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from pipecat.runner.types import (
|
from pipecat.runner.types import (
|
||||||
@@ -82,7 +89,7 @@ from pipecat.runner.types import (
|
|||||||
try:
|
try:
|
||||||
import uvicorn
|
import uvicorn
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
from fastapi import BackgroundTasks, FastAPI, Request, WebSocket
|
from fastapi import BackgroundTasks, FastAPI, Header, HTTPException, Request, WebSocket
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
from fastapi.responses import HTMLResponse, RedirectResponse
|
from fastapi.responses import HTMLResponse, RedirectResponse
|
||||||
except ImportError as e:
|
except ImportError as e:
|
||||||
@@ -96,6 +103,12 @@ except ImportError as e:
|
|||||||
load_dotenv(override=True)
|
load_dotenv(override=True)
|
||||||
os.environ["ENV"] = "local"
|
os.environ["ENV"] = "local"
|
||||||
|
|
||||||
|
TELEPHONY_TRANSPORTS = ["twilio", "telnyx", "plivo", "exotel"]
|
||||||
|
|
||||||
|
RUNNER_DOWNLOADS_FOLDER: Optional[str] = None
|
||||||
|
RUNNER_HOST: str = "localhost"
|
||||||
|
RUNNER_PORT: int = 7860
|
||||||
|
|
||||||
|
|
||||||
def _get_bot_module():
|
def _get_bot_module():
|
||||||
"""Get the bot module from the calling script."""
|
"""Get the bot module from the calling script."""
|
||||||
@@ -150,7 +163,13 @@ async def _run_telephony_bot(websocket: WebSocket):
|
|||||||
|
|
||||||
|
|
||||||
def _create_server_app(
|
def _create_server_app(
|
||||||
transport_type: str, host: str = "localhost", proxy: str = None, esp32_mode: bool = False
|
*,
|
||||||
|
transport_type: str,
|
||||||
|
host: str = "localhost",
|
||||||
|
proxy: str,
|
||||||
|
esp32_mode: bool = False,
|
||||||
|
whatsapp_enabled: bool = False,
|
||||||
|
folder: Optional[str] = None,
|
||||||
):
|
):
|
||||||
"""Create FastAPI app with transport-specific routes."""
|
"""Create FastAPI app with transport-specific routes."""
|
||||||
app = FastAPI()
|
app = FastAPI()
|
||||||
@@ -165,24 +184,30 @@ def _create_server_app(
|
|||||||
|
|
||||||
# Set up transport-specific routes
|
# Set up transport-specific routes
|
||||||
if transport_type == "webrtc":
|
if transport_type == "webrtc":
|
||||||
_setup_webrtc_routes(app, esp32_mode=esp32_mode, host=host)
|
_setup_webrtc_routes(app, esp32_mode=esp32_mode, host=host, folder=folder)
|
||||||
|
if whatsapp_enabled:
|
||||||
|
_setup_whatsapp_routes(app)
|
||||||
elif transport_type == "daily":
|
elif transport_type == "daily":
|
||||||
_setup_daily_routes(app)
|
_setup_daily_routes(app)
|
||||||
elif transport_type in ["twilio", "telnyx", "plivo", "exotel"]:
|
elif transport_type in TELEPHONY_TRANSPORTS:
|
||||||
_setup_telephony_routes(app, transport_type, proxy)
|
_setup_telephony_routes(app, transport_type=transport_type, proxy=proxy)
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Unknown transport type: {transport_type}")
|
logger.warning(f"Unknown transport type: {transport_type}")
|
||||||
|
|
||||||
return app
|
return app
|
||||||
|
|
||||||
|
|
||||||
def _setup_webrtc_routes(app: FastAPI, esp32_mode: bool = False, host: str = "localhost"):
|
def _setup_webrtc_routes(
|
||||||
|
app: FastAPI, *, esp32_mode: bool = False, host: str = "localhost", folder: Optional[str] = None
|
||||||
|
):
|
||||||
"""Set up WebRTC-specific routes."""
|
"""Set up WebRTC-specific routes."""
|
||||||
try:
|
try:
|
||||||
from pipecat_ai_small_webrtc_prebuilt.frontend import SmallWebRTCPrebuiltUI
|
from pipecat_ai_small_webrtc_prebuilt.frontend import SmallWebRTCPrebuiltUI
|
||||||
|
|
||||||
from pipecat.transports.smallwebrtc.connection import SmallWebRTCConnection
|
from pipecat.transports.smallwebrtc.connection import IceServer, SmallWebRTCConnection
|
||||||
from pipecat.transports.smallwebrtc.request_handler import (
|
from pipecat.transports.smallwebrtc.request_handler import (
|
||||||
|
IceCandidate,
|
||||||
|
SmallWebRTCPatchRequest,
|
||||||
SmallWebRTCRequest,
|
SmallWebRTCRequest,
|
||||||
SmallWebRTCRequestHandler,
|
SmallWebRTCRequestHandler,
|
||||||
)
|
)
|
||||||
@@ -190,6 +215,16 @@ def _setup_webrtc_routes(app: FastAPI, esp32_mode: bool = False, host: str = "lo
|
|||||||
logger.error(f"WebRTC transport dependencies not installed: {e}")
|
logger.error(f"WebRTC transport dependencies not installed: {e}")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
class IceConfig(TypedDict):
|
||||||
|
iceServers: List[IceServer]
|
||||||
|
|
||||||
|
class StartBotResult(TypedDict, total=False):
|
||||||
|
sessionId: str
|
||||||
|
iceConfig: Optional[IceConfig]
|
||||||
|
|
||||||
|
# In-memory store of active sessions: session_id -> session info
|
||||||
|
active_sessions: Dict[str, Dict[str, Any]] = {}
|
||||||
|
|
||||||
# Mount the frontend
|
# Mount the frontend
|
||||||
app.mount("/client", SmallWebRTCPrebuiltUI)
|
app.mount("/client", SmallWebRTCPrebuiltUI)
|
||||||
|
|
||||||
@@ -198,6 +233,21 @@ def _setup_webrtc_routes(app: FastAPI, esp32_mode: bool = False, host: str = "lo
|
|||||||
"""Redirect root requests to client interface."""
|
"""Redirect root requests to client interface."""
|
||||||
return RedirectResponse(url="/client/")
|
return RedirectResponse(url="/client/")
|
||||||
|
|
||||||
|
@app.get("/files/{filename:path}")
|
||||||
|
async def download_file(filename: str):
|
||||||
|
"""Handle file downloads."""
|
||||||
|
if not folder:
|
||||||
|
logger.warning(f"Attempting to dowload {filename}, but downloads folder not setup.")
|
||||||
|
return
|
||||||
|
|
||||||
|
file_path = Path(folder) / filename
|
||||||
|
if not os.path.exists(file_path):
|
||||||
|
raise HTTPException(404)
|
||||||
|
|
||||||
|
media_type, _ = mimetypes.guess_type(file_path)
|
||||||
|
|
||||||
|
return FileResponse(path=file_path, media_type=media_type, filename=filename)
|
||||||
|
|
||||||
# Initialize the SmallWebRTC request handler
|
# Initialize the SmallWebRTC request handler
|
||||||
small_webrtc_handler: SmallWebRTCRequestHandler = SmallWebRTCRequestHandler(
|
small_webrtc_handler: SmallWebRTCRequestHandler = SmallWebRTCRequestHandler(
|
||||||
esp32_mode=esp32_mode, host=host
|
esp32_mode=esp32_mode, host=host
|
||||||
@@ -220,13 +270,259 @@ def _setup_webrtc_routes(app: FastAPI, esp32_mode: bool = False, host: str = "lo
|
|||||||
)
|
)
|
||||||
return answer
|
return answer
|
||||||
|
|
||||||
|
@app.patch("/api/offer")
|
||||||
|
async def ice_candidate(request: SmallWebRTCPatchRequest):
|
||||||
|
"""Handle WebRTC new ice candidate requests."""
|
||||||
|
logger.debug(f"Received patch request: {request}")
|
||||||
|
await small_webrtc_handler.handle_patch_request(request)
|
||||||
|
return {"status": "success"}
|
||||||
|
|
||||||
|
@app.post("/start")
|
||||||
|
async def rtvi_start(request: Request):
|
||||||
|
"""Mimic Pipecat Cloud's /start endpoint."""
|
||||||
|
# Parse the request body
|
||||||
|
try:
|
||||||
|
request_data = await request.json()
|
||||||
|
logger.debug(f"Received request: {request_data}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to parse request body: {e}")
|
||||||
|
request_data = {}
|
||||||
|
|
||||||
|
# Store session info immediately in memory, replicate the behavior expected on Pipecat Cloud
|
||||||
|
session_id = str(uuid.uuid4())
|
||||||
|
active_sessions[session_id] = request_data
|
||||||
|
|
||||||
|
result: StartBotResult = {"sessionId": session_id}
|
||||||
|
if request_data.get("enableDefaultIceServers"):
|
||||||
|
result["iceConfig"] = IceConfig(
|
||||||
|
iceServers=[IceServer(urls="stun:stun.l.google.com:19302")]
|
||||||
|
)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
@app.api_route(
|
||||||
|
"/sessions/{session_id}/{path:path}",
|
||||||
|
methods=["GET", "POST", "PUT", "PATCH", "DELETE"],
|
||||||
|
)
|
||||||
|
async def proxy_request(
|
||||||
|
session_id: str, path: str, request: Request, background_tasks: BackgroundTasks
|
||||||
|
):
|
||||||
|
"""Mimic Pipecat Cloud's proxy."""
|
||||||
|
active_session = active_sessions.get(session_id)
|
||||||
|
if not active_session:
|
||||||
|
return Response(content="Invalid or not-yet-ready session_id", status_code=404)
|
||||||
|
|
||||||
|
if path.endswith("api/offer"):
|
||||||
|
# Parse the request body and convert to SmallWebRTCRequest
|
||||||
|
try:
|
||||||
|
request_data = await request.json()
|
||||||
|
if request.method == HTTPMethod.POST.value:
|
||||||
|
webrtc_request = SmallWebRTCRequest(
|
||||||
|
sdp=request_data["sdp"],
|
||||||
|
type=request_data["type"],
|
||||||
|
pc_id=request_data.get("pc_id"),
|
||||||
|
restart_pc=request_data.get("restart_pc"),
|
||||||
|
request_data=request_data,
|
||||||
|
)
|
||||||
|
return await offer(webrtc_request, background_tasks)
|
||||||
|
elif request.method == HTTPMethod.PATCH.value:
|
||||||
|
patch_request = SmallWebRTCPatchRequest(
|
||||||
|
pc_id=request_data["pc_id"],
|
||||||
|
candidates=[IceCandidate(**c) for c in request_data.get("candidates", [])],
|
||||||
|
)
|
||||||
|
return await ice_candidate(patch_request)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to parse WebRTC request: {e}")
|
||||||
|
return Response(content="Invalid WebRTC request", status_code=400)
|
||||||
|
|
||||||
|
logger.info(f"Received request for path: {path}")
|
||||||
|
return Response(status_code=200)
|
||||||
|
|
||||||
@asynccontextmanager
|
@asynccontextmanager
|
||||||
async def lifespan(app: FastAPI):
|
async def smallwebrtc_lifespan(app: FastAPI):
|
||||||
"""Manage FastAPI application lifecycle and cleanup connections."""
|
"""Manage FastAPI application lifecycle and cleanup connections."""
|
||||||
yield
|
yield
|
||||||
await small_webrtc_handler.close()
|
await small_webrtc_handler.close()
|
||||||
|
|
||||||
app.router.lifespan_context = lifespan
|
# Add the SmallWebRTC lifespan to the app
|
||||||
|
_add_lifespan_to_app(app, smallwebrtc_lifespan)
|
||||||
|
|
||||||
|
|
||||||
|
def _add_lifespan_to_app(app: FastAPI, new_lifespan):
|
||||||
|
"""Add a new lifespan context manager to the app, combining with existing if present.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
app: The FastAPI application instance
|
||||||
|
new_lifespan: The new lifespan context manager to add
|
||||||
|
"""
|
||||||
|
if hasattr(app.router, "lifespan_context") and app.router.lifespan_context is not None:
|
||||||
|
# If there's already a lifespan context, combine them
|
||||||
|
existing_lifespan = app.router.lifespan_context
|
||||||
|
|
||||||
|
@asynccontextmanager
|
||||||
|
async def combined_lifespan(app: FastAPI):
|
||||||
|
async with existing_lifespan(app):
|
||||||
|
async with new_lifespan(app):
|
||||||
|
yield
|
||||||
|
|
||||||
|
app.router.lifespan_context = combined_lifespan
|
||||||
|
else:
|
||||||
|
# No existing lifespan, use the new one
|
||||||
|
app.router.lifespan_context = new_lifespan
|
||||||
|
|
||||||
|
|
||||||
|
def _setup_whatsapp_routes(app: FastAPI):
|
||||||
|
"""Set up WebRTC-specific routes."""
|
||||||
|
WHATSAPP_APP_SECRET = os.getenv("WHATSAPP_APP_SECRET")
|
||||||
|
WHATSAPP_PHONE_NUMBER_ID = os.getenv("WHATSAPP_PHONE_NUMBER_ID")
|
||||||
|
WHATSAPP_TOKEN = os.getenv("WHATSAPP_TOKEN")
|
||||||
|
WHATSAPP_WEBHOOK_VERIFICATION_TOKEN = os.getenv("WHATSAPP_WEBHOOK_VERIFICATION_TOKEN")
|
||||||
|
|
||||||
|
if not all(
|
||||||
|
[
|
||||||
|
WHATSAPP_APP_SECRET,
|
||||||
|
WHATSAPP_PHONE_NUMBER_ID,
|
||||||
|
WHATSAPP_TOKEN,
|
||||||
|
WHATSAPP_WEBHOOK_VERIFICATION_TOKEN,
|
||||||
|
]
|
||||||
|
):
|
||||||
|
logger.error(
|
||||||
|
"""Missing required environment variables for WhatsApp transport:
|
||||||
|
WHATSAPP_APP_SECRET
|
||||||
|
WHATSAPP_PHONE_NUMBER_ID
|
||||||
|
WHATSAPP_TOKEN
|
||||||
|
WHATSAPP_WEBHOOK_VERIFICATION_TOKEN
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
try:
|
||||||
|
from pipecat_ai_small_webrtc_prebuilt.frontend import SmallWebRTCPrebuiltUI
|
||||||
|
|
||||||
|
from pipecat.transports.smallwebrtc.connection import SmallWebRTCConnection
|
||||||
|
from pipecat.transports.smallwebrtc.request_handler import (
|
||||||
|
SmallWebRTCRequest,
|
||||||
|
SmallWebRTCRequestHandler,
|
||||||
|
)
|
||||||
|
from pipecat.transports.whatsapp.api import WhatsAppWebhookRequest
|
||||||
|
from pipecat.transports.whatsapp.client import WhatsAppClient
|
||||||
|
except ImportError as e:
|
||||||
|
logger.error(f"WhatsApp transport dependencies not installed: {e}")
|
||||||
|
return
|
||||||
|
|
||||||
|
# Global WhatsApp client instance
|
||||||
|
whatsapp_client: Optional[WhatsAppClient] = None
|
||||||
|
|
||||||
|
@app.get(
|
||||||
|
"/whatsapp",
|
||||||
|
summary="Verify WhatsApp webhook",
|
||||||
|
description="Handles WhatsApp webhook verification requests from Meta",
|
||||||
|
)
|
||||||
|
async def verify_webhook(request: Request):
|
||||||
|
"""Verify WhatsApp webhook endpoint.
|
||||||
|
|
||||||
|
This endpoint is called by Meta's WhatsApp Business API to verify
|
||||||
|
the webhook URL during setup. It validates the verification token
|
||||||
|
and returns the challenge parameter if successful.
|
||||||
|
"""
|
||||||
|
if whatsapp_client is None:
|
||||||
|
logger.error("WhatsApp client is not initialized")
|
||||||
|
raise HTTPException(status_code=503, detail="Service unavailable")
|
||||||
|
|
||||||
|
params = dict(request.query_params)
|
||||||
|
logger.debug(f"Webhook verification request received with params: {list(params.keys())}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
result = await whatsapp_client.handle_verify_webhook_request(
|
||||||
|
params=params, expected_verification_token=WHATSAPP_WEBHOOK_VERIFICATION_TOKEN
|
||||||
|
)
|
||||||
|
logger.info("Webhook verification successful")
|
||||||
|
return result
|
||||||
|
except ValueError as e:
|
||||||
|
logger.warning(f"Webhook verification failed: {e}")
|
||||||
|
raise HTTPException(status_code=403, detail="Verification failed")
|
||||||
|
|
||||||
|
@app.post(
|
||||||
|
"/whatsapp",
|
||||||
|
summary="Handle WhatsApp webhook events",
|
||||||
|
description="Processes incoming WhatsApp messages and call events",
|
||||||
|
)
|
||||||
|
async def whatsapp_webhook(
|
||||||
|
body: WhatsAppWebhookRequest,
|
||||||
|
background_tasks: BackgroundTasks,
|
||||||
|
request: Request,
|
||||||
|
x_hub_signature_256: str = Header(None),
|
||||||
|
):
|
||||||
|
"""Handle incoming WhatsApp webhook events.
|
||||||
|
|
||||||
|
For call events, establishes WebRTC connections and spawns bot instances
|
||||||
|
in the background to handle real-time communication.
|
||||||
|
"""
|
||||||
|
if whatsapp_client is None:
|
||||||
|
logger.error("WhatsApp client is not initialized")
|
||||||
|
raise HTTPException(status_code=503, detail="Service unavailable")
|
||||||
|
|
||||||
|
# Validate webhook object type
|
||||||
|
if body.object != "whatsapp_business_account":
|
||||||
|
logger.warning(f"Invalid webhook object type: {body.object}")
|
||||||
|
raise HTTPException(status_code=400, detail="Invalid object type")
|
||||||
|
|
||||||
|
logger.debug(f"Processing WhatsApp webhook: {body.model_dump()}")
|
||||||
|
|
||||||
|
async def connection_callback(connection: SmallWebRTCConnection):
|
||||||
|
"""Handle new WebRTC connections from WhatsApp calls.
|
||||||
|
|
||||||
|
Called when a WebRTC connection is established for a WhatsApp call.
|
||||||
|
Spawns a bot instance to handle the conversation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
connection: The established WebRTC connection
|
||||||
|
"""
|
||||||
|
bot_module = _get_bot_module()
|
||||||
|
runner_args = SmallWebRTCRunnerArguments(webrtc_connection=connection)
|
||||||
|
background_tasks.add_task(bot_module.bot, runner_args)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Process the webhook request
|
||||||
|
raw_body = await request.body()
|
||||||
|
result = await whatsapp_client.handle_webhook_request(
|
||||||
|
body, connection_callback, sha256_signature=x_hub_signature_256, raw_body=raw_body
|
||||||
|
)
|
||||||
|
logger.debug(f"Webhook processed successfully: {result}")
|
||||||
|
return {"status": "success", "message": "Webhook processed successfully"}
|
||||||
|
except ValueError as ve:
|
||||||
|
logger.warning(f"Invalid webhook request format: {ve}")
|
||||||
|
raise HTTPException(status_code=400, detail=f"Invalid request: {str(ve)}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Internal error processing webhook: {e}")
|
||||||
|
raise HTTPException(status_code=500, detail="Internal server error processing webhook")
|
||||||
|
|
||||||
|
@asynccontextmanager
|
||||||
|
async def whatsapp_lifespan(app: FastAPI):
|
||||||
|
"""Manage WhatsApp client lifecycle and cleanup connections."""
|
||||||
|
nonlocal whatsapp_client
|
||||||
|
|
||||||
|
# Initialize WhatsApp client with persistent HTTP session
|
||||||
|
async with aiohttp.ClientSession() as session:
|
||||||
|
whatsapp_client = WhatsAppClient(
|
||||||
|
whatsapp_token=WHATSAPP_TOKEN,
|
||||||
|
whatsapp_secret=WHATSAPP_APP_SECRET,
|
||||||
|
phone_number_id=WHATSAPP_PHONE_NUMBER_ID,
|
||||||
|
session=session,
|
||||||
|
)
|
||||||
|
logger.info("WhatsApp client initialized successfully")
|
||||||
|
|
||||||
|
try:
|
||||||
|
yield # Run the application
|
||||||
|
finally:
|
||||||
|
# Cleanup all active calls on shutdown
|
||||||
|
logger.info("Cleaning up WhatsApp client resources...")
|
||||||
|
if whatsapp_client:
|
||||||
|
await whatsapp_client.terminate_all_calls()
|
||||||
|
logger.info("WhatsApp cleanup completed")
|
||||||
|
|
||||||
|
# Add the WhatsApp lifespan to the app
|
||||||
|
_add_lifespan_to_app(app, whatsapp_lifespan)
|
||||||
|
|
||||||
|
|
||||||
def _setup_daily_routes(app: FastAPI):
|
def _setup_daily_routes(app: FastAPI):
|
||||||
@@ -281,8 +577,6 @@ def _setup_daily_routes(app: FastAPI):
|
|||||||
else:
|
else:
|
||||||
logger.debug("No body data provided in request")
|
logger.debug("No body data provided in request")
|
||||||
|
|
||||||
import aiohttp
|
|
||||||
|
|
||||||
from pipecat.runner.daily import configure
|
from pipecat.runner.daily import configure
|
||||||
|
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession() as session:
|
||||||
@@ -314,7 +608,7 @@ def _setup_daily_routes(app: FastAPI):
|
|||||||
return await _handle_rtvi_request(request)
|
return await _handle_rtvi_request(request)
|
||||||
|
|
||||||
|
|
||||||
def _setup_telephony_routes(app: FastAPI, transport_type: str, proxy: str):
|
def _setup_telephony_routes(app: FastAPI, *, transport_type: str, proxy: str):
|
||||||
"""Set up telephony-specific routes."""
|
"""Set up telephony-specific routes."""
|
||||||
# XML response templates (Exotel doesn't use XML webhooks)
|
# XML response templates (Exotel doesn't use XML webhooks)
|
||||||
XML_TEMPLATES = {
|
XML_TEMPLATES = {
|
||||||
@@ -370,8 +664,6 @@ def _setup_telephony_routes(app: FastAPI, transport_type: str, proxy: str):
|
|||||||
async def _run_daily_direct():
|
async def _run_daily_direct():
|
||||||
"""Run Daily bot with direct connection (no FastAPI server)."""
|
"""Run Daily bot with direct connection (no FastAPI server)."""
|
||||||
try:
|
try:
|
||||||
import aiohttp
|
|
||||||
|
|
||||||
from pipecat.runner.daily import configure
|
from pipecat.runner.daily import configure
|
||||||
except ImportError as e:
|
except ImportError as e:
|
||||||
logger.error("Daily transport dependencies not installed.")
|
logger.error("Daily transport dependencies not installed.")
|
||||||
@@ -417,6 +709,21 @@ def _validate_and_clean_proxy(proxy: str) -> str:
|
|||||||
return proxy
|
return proxy
|
||||||
|
|
||||||
|
|
||||||
|
def runner_downloads_folder() -> Optional[str]:
|
||||||
|
"""Returns the folder where files are stored for later download."""
|
||||||
|
return RUNNER_DOWNLOADS_FOLDER
|
||||||
|
|
||||||
|
|
||||||
|
def runner_host() -> str:
|
||||||
|
"""Returns the host name of this runner."""
|
||||||
|
return RUNNER_HOST
|
||||||
|
|
||||||
|
|
||||||
|
def runner_port() -> int:
|
||||||
|
"""Returns the port of this runner."""
|
||||||
|
return RUNNER_PORT
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
"""Start the Pipecat development runner.
|
"""Start the Pipecat development runner.
|
||||||
|
|
||||||
@@ -437,14 +744,16 @@ def main():
|
|||||||
|
|
||||||
The bot file must contain a `bot(runner_args)` function as the entry point.
|
The bot file must contain a `bot(runner_args)` function as the entry point.
|
||||||
"""
|
"""
|
||||||
|
global RUNNER_DOWNLOADS_FOLDER, RUNNER_HOST, RUNNER_PORT
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description="Pipecat Development Runner")
|
parser = argparse.ArgumentParser(description="Pipecat Development Runner")
|
||||||
parser.add_argument("--host", type=str, default="localhost", help="Host address")
|
parser.add_argument("--host", type=str, default=RUNNER_HOST, help="Host address")
|
||||||
parser.add_argument("--port", type=int, default=7860, help="Port number")
|
parser.add_argument("--port", type=int, default=RUNNER_PORT, help="Port number")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-t",
|
"-t",
|
||||||
"--transport",
|
"--transport",
|
||||||
type=str,
|
type=str,
|
||||||
choices=["daily", "webrtc", "twilio", "telnyx", "plivo", "exotel"],
|
choices=["daily", "webrtc", *TELEPHONY_TRANSPORTS],
|
||||||
default="webrtc",
|
default="webrtc",
|
||||||
help="Transport type",
|
help="Transport type",
|
||||||
)
|
)
|
||||||
@@ -462,9 +771,16 @@ def main():
|
|||||||
default=False,
|
default=False,
|
||||||
help="Connect directly to Daily room (automatically sets transport to daily)",
|
help="Connect directly to Daily room (automatically sets transport to daily)",
|
||||||
)
|
)
|
||||||
|
parser.add_argument("-f", "--folder", type=str, help="Path to downloads folder")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--verbose", "-v", action="count", default=0, help="Increase logging verbosity"
|
"--verbose", "-v", action="count", default=0, help="Increase logging verbosity"
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--whatsapp",
|
||||||
|
action="store_true",
|
||||||
|
default=False,
|
||||||
|
help="Ensure requried WhatsApp environment variables are present",
|
||||||
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
@@ -484,6 +800,10 @@ def main():
|
|||||||
logger.error("For ESP32, you need to specify `--host IP` so we can do SDP munging.")
|
logger.error("For ESP32, you need to specify `--host IP` so we can do SDP munging.")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
if args.transport in TELEPHONY_TRANSPORTS and not args.proxy:
|
||||||
|
logger.error(f"For telephony transports, you need to specify `--proxy PROXY`.")
|
||||||
|
return
|
||||||
|
|
||||||
# Log level
|
# Log level
|
||||||
logger.remove()
|
logger.remove()
|
||||||
logger.add(sys.stderr, level="TRACE" if args.verbose else "DEBUG")
|
logger.add(sys.stderr, level="TRACE" if args.verbose else "DEBUG")
|
||||||
@@ -503,10 +823,11 @@ def main():
|
|||||||
print()
|
print()
|
||||||
if args.esp32:
|
if args.esp32:
|
||||||
print(f"🚀 Bot ready! (ESP32 mode)")
|
print(f"🚀 Bot ready! (ESP32 mode)")
|
||||||
print(f" → Open http://{args.host}:{args.port}/client in your browser")
|
elif args.whatsapp:
|
||||||
|
print(f"🚀 Bot ready! (WhatsApp)")
|
||||||
else:
|
else:
|
||||||
print(f"🚀 Bot ready!")
|
print(f"🚀 Bot ready!")
|
||||||
print(f" → Open http://{args.host}:{args.port}/client in your browser")
|
print(f" → Open http://{args.host}:{args.port}/client in your browser")
|
||||||
print()
|
print()
|
||||||
elif args.transport == "daily":
|
elif args.transport == "daily":
|
||||||
print()
|
print()
|
||||||
@@ -514,8 +835,19 @@ def main():
|
|||||||
print(f" → Open http://{args.host}:{args.port} in your browser to start a session")
|
print(f" → Open http://{args.host}:{args.port} in your browser to start a session")
|
||||||
print()
|
print()
|
||||||
|
|
||||||
|
RUNNER_DOWNLOADS_FOLDER = args.folder
|
||||||
|
RUNNER_HOST = args.host
|
||||||
|
RUNNER_PORT = args.port
|
||||||
|
|
||||||
# Create the app with transport-specific setup
|
# Create the app with transport-specific setup
|
||||||
app = _create_server_app(args.transport, args.host, args.proxy, args.esp32)
|
app = _create_server_app(
|
||||||
|
transport_type=args.transport,
|
||||||
|
host=args.host,
|
||||||
|
proxy=args.proxy,
|
||||||
|
esp32_mode=args.esp32,
|
||||||
|
whatsapp_enabled=args.whatsapp,
|
||||||
|
folder=args.folder,
|
||||||
|
)
|
||||||
|
|
||||||
# Run the server
|
# Run the server
|
||||||
uvicorn.run(app, host=args.host, port=args.port)
|
uvicorn.run(app, host=args.host, port=args.port)
|
||||||
|
|||||||
@@ -99,29 +99,41 @@ async def parse_telephony_websocket(websocket: WebSocket):
|
|||||||
tuple: (transport_type: str, call_data: dict)
|
tuple: (transport_type: str, call_data: dict)
|
||||||
|
|
||||||
call_data contains provider-specific fields:
|
call_data contains provider-specific fields:
|
||||||
- Twilio: {
|
|
||||||
"stream_id": str,
|
- Twilio::
|
||||||
"call_id": str,
|
|
||||||
"body": dict
|
{
|
||||||
}
|
"stream_id": str,
|
||||||
- Telnyx: {
|
"call_id": str,
|
||||||
"stream_id": str,
|
"body": dict
|
||||||
"call_control_id": str,
|
}
|
||||||
"outbound_encoding": str,
|
|
||||||
"from": str,
|
- Telnyx::
|
||||||
"to": str,
|
|
||||||
}
|
{
|
||||||
- Plivo: {
|
"stream_id": str,
|
||||||
"stream_id": str,
|
"call_control_id": str,
|
||||||
"call_id": str,
|
"outbound_encoding": str,
|
||||||
}
|
"from": str,
|
||||||
- Exotel: {
|
"to": str,
|
||||||
"stream_id": str,
|
}
|
||||||
"call_id": str,
|
|
||||||
"account_sid": str,
|
- Plivo::
|
||||||
"from": str,
|
|
||||||
"to": str,
|
{
|
||||||
}
|
"stream_id": str,
|
||||||
|
"call_id": str,
|
||||||
|
}
|
||||||
|
|
||||||
|
- Exotel::
|
||||||
|
|
||||||
|
{
|
||||||
|
"stream_id": str,
|
||||||
|
"call_id": str,
|
||||||
|
"account_sid": str,
|
||||||
|
"from": str,
|
||||||
|
"to": str,
|
||||||
|
}
|
||||||
|
|
||||||
Example usage::
|
Example usage::
|
||||||
|
|
||||||
@@ -301,6 +313,7 @@ def _smallwebrtc_sdp_cleanup_ice_candidates(text: str, pattern: str) -> str:
|
|||||||
Returns:
|
Returns:
|
||||||
Cleaned SDP text with filtered ICE candidates.
|
Cleaned SDP text with filtered ICE candidates.
|
||||||
"""
|
"""
|
||||||
|
logger.debug("Removing unsupported ICE candidates from SDP")
|
||||||
result = []
|
result = []
|
||||||
lines = text.splitlines()
|
lines = text.splitlines()
|
||||||
for line in lines:
|
for line in lines:
|
||||||
@@ -309,7 +322,7 @@ def _smallwebrtc_sdp_cleanup_ice_candidates(text: str, pattern: str) -> str:
|
|||||||
result.append(line)
|
result.append(line)
|
||||||
else:
|
else:
|
||||||
result.append(line)
|
result.append(line)
|
||||||
return "\r\n".join(result)
|
return "\r\n".join(result) + "\r\n"
|
||||||
|
|
||||||
|
|
||||||
def _smallwebrtc_sdp_cleanup_fingerprints(text: str) -> str:
|
def _smallwebrtc_sdp_cleanup_fingerprints(text: str) -> str:
|
||||||
@@ -321,15 +334,16 @@ def _smallwebrtc_sdp_cleanup_fingerprints(text: str) -> str:
|
|||||||
Returns:
|
Returns:
|
||||||
SDP text with sha-384 and sha-512 fingerprints removed.
|
SDP text with sha-384 and sha-512 fingerprints removed.
|
||||||
"""
|
"""
|
||||||
|
logger.debug("Removing unsupported fingerprints from SDP")
|
||||||
result = []
|
result = []
|
||||||
lines = text.splitlines()
|
lines = text.splitlines()
|
||||||
for line in lines:
|
for line in lines:
|
||||||
if not re.search("sha-384", line) and not re.search("sha-512", line):
|
if not re.search("sha-384", line) and not re.search("sha-512", line):
|
||||||
result.append(line)
|
result.append(line)
|
||||||
return "\r\n".join(result)
|
return "\r\n".join(result) + "\r\n"
|
||||||
|
|
||||||
|
|
||||||
def smallwebrtc_sdp_munging(sdp: str, host: str) -> str:
|
def smallwebrtc_sdp_munging(sdp: str, host: Optional[str]) -> str:
|
||||||
"""Apply SDP modifications for SmallWebRTC compatibility.
|
"""Apply SDP modifications for SmallWebRTC compatibility.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@@ -340,7 +354,8 @@ def smallwebrtc_sdp_munging(sdp: str, host: str) -> str:
|
|||||||
Modified SDP string with fingerprint and ICE candidate cleanup.
|
Modified SDP string with fingerprint and ICE candidate cleanup.
|
||||||
"""
|
"""
|
||||||
sdp = _smallwebrtc_sdp_cleanup_fingerprints(sdp)
|
sdp = _smallwebrtc_sdp_cleanup_fingerprints(sdp)
|
||||||
sdp = _smallwebrtc_sdp_cleanup_ice_candidates(sdp, host)
|
if host:
|
||||||
|
sdp = _smallwebrtc_sdp_cleanup_ice_candidates(sdp, host)
|
||||||
return sdp
|
return sdp
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -21,9 +21,9 @@ from pipecat.frames.frames import (
|
|||||||
InputAudioRawFrame,
|
InputAudioRawFrame,
|
||||||
InputDTMFFrame,
|
InputDTMFFrame,
|
||||||
InterruptionFrame,
|
InterruptionFrame,
|
||||||
|
OutputTransportMessageFrame,
|
||||||
|
OutputTransportMessageUrgentFrame,
|
||||||
StartFrame,
|
StartFrame,
|
||||||
TransportMessageFrame,
|
|
||||||
TransportMessageUrgentFrame,
|
|
||||||
)
|
)
|
||||||
from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializerType
|
from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializerType
|
||||||
|
|
||||||
@@ -121,7 +121,7 @@ class ExotelFrameSerializer(FrameSerializer):
|
|||||||
}
|
}
|
||||||
|
|
||||||
return json.dumps(answer)
|
return json.dumps(answer)
|
||||||
elif isinstance(frame, (TransportMessageFrame, TransportMessageUrgentFrame)):
|
elif isinstance(frame, (OutputTransportMessageFrame, OutputTransportMessageUrgentFrame)):
|
||||||
return json.dumps(frame.message)
|
return json.dumps(frame.message)
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|||||||
@@ -25,11 +25,31 @@ except ModuleNotFoundError as e:
|
|||||||
class LivekitFrameSerializer(FrameSerializer):
|
class LivekitFrameSerializer(FrameSerializer):
|
||||||
"""Serializer for converting between Pipecat frames and LiveKit audio frames.
|
"""Serializer for converting between Pipecat frames and LiveKit audio frames.
|
||||||
|
|
||||||
|
.. deprecated:: 0.0.90
|
||||||
|
|
||||||
|
This class is deprecated and will be removed in a future version.
|
||||||
|
Please use LiveKitTransport instead, which handles audio streaming
|
||||||
|
and frame conversion natively.
|
||||||
|
|
||||||
This serializer handles the conversion of Pipecat's OutputAudioRawFrame objects
|
This serializer handles the conversion of Pipecat's OutputAudioRawFrame objects
|
||||||
to LiveKit AudioFrame objects for transmission, and the reverse conversion
|
to LiveKit AudioFrame objects for transmission, and the reverse conversion
|
||||||
for received audio data.
|
for received audio data.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
"""Initialize the LiveKit frame serializer."""
|
||||||
|
super().__init__()
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
warnings.simplefilter("always")
|
||||||
|
warnings.warn(
|
||||||
|
"LivekitFrameSerializer is deprecated and will be removed in a future version. "
|
||||||
|
"Please use LiveKitTransport instead, which handles audio streaming natively.",
|
||||||
|
DeprecationWarning,
|
||||||
|
stacklevel=2,
|
||||||
|
)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def type(self) -> FrameSerializerType:
|
def type(self) -> FrameSerializerType:
|
||||||
"""Get the serializer type.
|
"""Get the serializer type.
|
||||||
|
|||||||
@@ -23,9 +23,9 @@ from pipecat.frames.frames import (
|
|||||||
InputAudioRawFrame,
|
InputAudioRawFrame,
|
||||||
InputDTMFFrame,
|
InputDTMFFrame,
|
||||||
InterruptionFrame,
|
InterruptionFrame,
|
||||||
|
OutputTransportMessageFrame,
|
||||||
|
OutputTransportMessageUrgentFrame,
|
||||||
StartFrame,
|
StartFrame,
|
||||||
TransportMessageFrame,
|
|
||||||
TransportMessageUrgentFrame,
|
|
||||||
)
|
)
|
||||||
from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializerType
|
from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializerType
|
||||||
|
|
||||||
@@ -148,7 +148,7 @@ class PlivoFrameSerializer(FrameSerializer):
|
|||||||
}
|
}
|
||||||
|
|
||||||
return json.dumps(answer)
|
return json.dumps(answer)
|
||||||
elif isinstance(frame, (TransportMessageFrame, TransportMessageUrgentFrame)):
|
elif isinstance(frame, (OutputTransportMessageFrame, OutputTransportMessageUrgentFrame)):
|
||||||
return json.dumps(frame.message)
|
return json.dumps(frame.message)
|
||||||
|
|
||||||
# Return None for unhandled frames
|
# Return None for unhandled frames
|
||||||
|
|||||||
@@ -15,11 +15,12 @@ import pipecat.frames.protobufs.frames_pb2 as frame_protos
|
|||||||
from pipecat.frames.frames import (
|
from pipecat.frames.frames import (
|
||||||
Frame,
|
Frame,
|
||||||
InputAudioRawFrame,
|
InputAudioRawFrame,
|
||||||
|
InputTransportMessageFrame,
|
||||||
OutputAudioRawFrame,
|
OutputAudioRawFrame,
|
||||||
|
OutputTransportMessageFrame,
|
||||||
|
OutputTransportMessageUrgentFrame,
|
||||||
TextFrame,
|
TextFrame,
|
||||||
TranscriptionFrame,
|
TranscriptionFrame,
|
||||||
TransportMessageFrame,
|
|
||||||
TransportMessageUrgentFrame,
|
|
||||||
)
|
)
|
||||||
from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializerType
|
from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializerType
|
||||||
|
|
||||||
@@ -82,7 +83,7 @@ class ProtobufFrameSerializer(FrameSerializer):
|
|||||||
Serialized frame as bytes, or None if frame type is not serializable.
|
Serialized frame as bytes, or None if frame type is not serializable.
|
||||||
"""
|
"""
|
||||||
# Wrapping this messages as a JSONFrame to send
|
# Wrapping this messages as a JSONFrame to send
|
||||||
if isinstance(frame, (TransportMessageFrame, TransportMessageUrgentFrame)):
|
if isinstance(frame, (OutputTransportMessageFrame, OutputTransportMessageUrgentFrame)):
|
||||||
frame = MessageFrame(
|
frame = MessageFrame(
|
||||||
data=json.dumps(frame.message),
|
data=json.dumps(frame.message),
|
||||||
)
|
)
|
||||||
@@ -134,11 +135,11 @@ class ProtobufFrameSerializer(FrameSerializer):
|
|||||||
if "pts" in args_dict:
|
if "pts" in args_dict:
|
||||||
del args_dict["pts"]
|
del args_dict["pts"]
|
||||||
|
|
||||||
# Special handling for MessageFrame -> TransportMessageUrgentFrame
|
# Special handling for MessageFrame -> OutputTransportMessageUrgentFrame
|
||||||
if class_name == MessageFrame:
|
if class_name == MessageFrame:
|
||||||
try:
|
try:
|
||||||
msg = json.loads(args_dict["data"])
|
msg = json.loads(args_dict["data"])
|
||||||
instance = TransportMessageUrgentFrame(message=msg)
|
instance = InputTransportMessageFrame(message=msg)
|
||||||
logger.debug(f"ProtobufFrameSerializer: Transport message {instance}")
|
logger.debug(f"ProtobufFrameSerializer: Transport message {instance}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error parsing MessageFrame data: {e}")
|
logger.error(f"Error parsing MessageFrame data: {e}")
|
||||||
|
|||||||
@@ -23,9 +23,9 @@ from pipecat.frames.frames import (
|
|||||||
InputAudioRawFrame,
|
InputAudioRawFrame,
|
||||||
InputDTMFFrame,
|
InputDTMFFrame,
|
||||||
InterruptionFrame,
|
InterruptionFrame,
|
||||||
|
OutputTransportMessageFrame,
|
||||||
|
OutputTransportMessageUrgentFrame,
|
||||||
StartFrame,
|
StartFrame,
|
||||||
TransportMessageFrame,
|
|
||||||
TransportMessageUrgentFrame,
|
|
||||||
)
|
)
|
||||||
from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializerType
|
from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializerType
|
||||||
|
|
||||||
@@ -175,7 +175,7 @@ class TwilioFrameSerializer(FrameSerializer):
|
|||||||
}
|
}
|
||||||
|
|
||||||
return json.dumps(answer)
|
return json.dumps(answer)
|
||||||
elif isinstance(frame, (TransportMessageFrame, TransportMessageUrgentFrame)):
|
elif isinstance(frame, (OutputTransportMessageFrame, OutputTransportMessageUrgentFrame)):
|
||||||
return json.dumps(frame.message)
|
return json.dumps(frame.message)
|
||||||
|
|
||||||
# Return None for unhandled frames
|
# Return None for unhandled frames
|
||||||
|
|||||||
@@ -97,9 +97,7 @@ class AIService(FrameProcessor):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
async def _update_settings(self, settings: Mapping[str, Any]):
|
async def _update_settings(self, settings: Mapping[str, Any]):
|
||||||
from pipecat.services.openai_realtime_beta.events import (
|
from pipecat.services.openai.realtime.events import SessionProperties
|
||||||
SessionProperties,
|
|
||||||
)
|
|
||||||
|
|
||||||
for key, value in settings.items():
|
for key, value in settings.items():
|
||||||
logger.debug("Update request for:", key, value)
|
logger.debug("Update request for:", key, value)
|
||||||
@@ -111,9 +109,7 @@ class AIService(FrameProcessor):
|
|||||||
logger.debug("Attempting to update", key, value)
|
logger.debug("Attempting to update", key, value)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from pipecat.services.openai_realtime_beta.events import (
|
from pipecat.services.openai.realtime.events import TurnDetection
|
||||||
TurnDetection,
|
|
||||||
)
|
|
||||||
|
|
||||||
if isinstance(self._session_properties, SessionProperties):
|
if isinstance(self._session_properties, SessionProperties):
|
||||||
current_properties = self._session_properties
|
current_properties = self._session_properties
|
||||||
|
|||||||
@@ -151,7 +151,7 @@ class AnthropicLLMService(LLMService):
|
|||||||
self,
|
self,
|
||||||
*,
|
*,
|
||||||
api_key: str,
|
api_key: str,
|
||||||
model: str = "claude-sonnet-4-20250514",
|
model: str = "claude-sonnet-4-5-20250929",
|
||||||
params: Optional[InputParams] = None,
|
params: Optional[InputParams] = None,
|
||||||
client=None,
|
client=None,
|
||||||
retry_timeout_secs: Optional[float] = 5.0,
|
retry_timeout_secs: Optional[float] = 5.0,
|
||||||
@@ -162,7 +162,7 @@ class AnthropicLLMService(LLMService):
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
api_key: Anthropic API key for authentication.
|
api_key: Anthropic API key for authentication.
|
||||||
model: Model name to use. Defaults to "claude-sonnet-4-20250514".
|
model: Model name to use. Defaults to "claude-sonnet-4-5-20250929".
|
||||||
params: Optional model parameters for inference.
|
params: Optional model parameters for inference.
|
||||||
client: Optional custom Anthropic client instance.
|
client: Optional custom Anthropic client instance.
|
||||||
retry_timeout_secs: Request timeout in seconds for retry logic.
|
retry_timeout_secs: Request timeout in seconds for retry logic.
|
||||||
|
|||||||
@@ -108,6 +108,8 @@ class AssemblyAIConnectionParams(BaseModel):
|
|||||||
end_of_turn_confidence_threshold: Confidence threshold for end-of-turn detection.
|
end_of_turn_confidence_threshold: Confidence threshold for end-of-turn detection.
|
||||||
min_end_of_turn_silence_when_confident: Minimum silence duration when confident about end-of-turn.
|
min_end_of_turn_silence_when_confident: Minimum silence duration when confident about end-of-turn.
|
||||||
max_turn_silence: Maximum silence duration before forcing end-of-turn.
|
max_turn_silence: Maximum silence duration before forcing end-of-turn.
|
||||||
|
keyterms_prompt: List of key terms to guide transcription. Will be JSON serialized before sending.
|
||||||
|
speech_model: Select between English and multilingual models. Defaults to "universal-streaming-english".
|
||||||
"""
|
"""
|
||||||
|
|
||||||
sample_rate: int = 16000
|
sample_rate: int = 16000
|
||||||
@@ -117,3 +119,7 @@ class AssemblyAIConnectionParams(BaseModel):
|
|||||||
end_of_turn_confidence_threshold: Optional[float] = None
|
end_of_turn_confidence_threshold: Optional[float] = None
|
||||||
min_end_of_turn_silence_when_confident: Optional[int] = None
|
min_end_of_turn_silence_when_confident: Optional[int] = None
|
||||||
max_turn_silence: Optional[int] = None
|
max_turn_silence: Optional[int] = None
|
||||||
|
keyterms_prompt: Optional[List[str]] = None
|
||||||
|
speech_model: Literal["universal-streaming-english", "universal-streaming-multilingual"] = (
|
||||||
|
"universal-streaming-english"
|
||||||
|
)
|
||||||
|
|||||||
@@ -174,11 +174,16 @@ class AssemblyAISTTService(STTService):
|
|||||||
|
|
||||||
def _build_ws_url(self) -> str:
|
def _build_ws_url(self) -> str:
|
||||||
"""Build WebSocket URL with query parameters using urllib.parse.urlencode."""
|
"""Build WebSocket URL with query parameters using urllib.parse.urlencode."""
|
||||||
params = {
|
params = {}
|
||||||
k: str(v).lower() if isinstance(v, bool) else v
|
for k, v in self._connection_params.model_dump().items():
|
||||||
for k, v in self._connection_params.model_dump().items()
|
if v is not None:
|
||||||
if v is not None
|
if k == "keyterms_prompt":
|
||||||
}
|
params[k] = json.dumps(v)
|
||||||
|
elif isinstance(v, bool):
|
||||||
|
params[k] = str(v).lower()
|
||||||
|
else:
|
||||||
|
params[k] = v
|
||||||
|
|
||||||
if params:
|
if params:
|
||||||
query_string = urlencode(params)
|
query_string = urlencode(params)
|
||||||
return f"{self._api_endpoint_base_url}?{query_string}"
|
return f"{self._api_endpoint_base_url}?{query_string}"
|
||||||
@@ -197,6 +202,8 @@ class AssemblyAISTTService(STTService):
|
|||||||
)
|
)
|
||||||
self._connected = True
|
self._connected = True
|
||||||
self._receive_task = self.create_task(self._receive_task_handler())
|
self._receive_task = self.create_task(self._receive_task_handler())
|
||||||
|
|
||||||
|
await self._call_event_handler("on_connected")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to connect to AssemblyAI: {e}")
|
logger.error(f"Failed to connect to AssemblyAI: {e}")
|
||||||
self._connected = False
|
self._connected = False
|
||||||
@@ -238,6 +245,7 @@ class AssemblyAISTTService(STTService):
|
|||||||
self._websocket = None
|
self._websocket = None
|
||||||
self._connected = False
|
self._connected = False
|
||||||
self._receive_task = None
|
self._receive_task = None
|
||||||
|
await self._call_event_handler("on_disconnected")
|
||||||
|
|
||||||
async def _receive_task_handler(self):
|
async def _receive_task_handler(self):
|
||||||
"""Handle incoming WebSocket messages."""
|
"""Handle incoming WebSocket messages."""
|
||||||
|
|||||||
@@ -235,6 +235,8 @@ class AsyncAITTSService(InterruptibleTTSService):
|
|||||||
}
|
}
|
||||||
|
|
||||||
await self._get_websocket().send(json.dumps(init_msg))
|
await self._get_websocket().send(json.dumps(init_msg))
|
||||||
|
|
||||||
|
await self._call_event_handler("on_connected")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"{self} initialization error: {e}")
|
logger.error(f"{self} initialization error: {e}")
|
||||||
self._websocket = None
|
self._websocket = None
|
||||||
@@ -252,6 +254,7 @@ class AsyncAITTSService(InterruptibleTTSService):
|
|||||||
finally:
|
finally:
|
||||||
self._websocket = None
|
self._websocket = None
|
||||||
self._started = False
|
self._started = False
|
||||||
|
await self._call_event_handler("on_disconnected")
|
||||||
|
|
||||||
def _get_websocket(self):
|
def _get_websocket(self):
|
||||||
if self._websocket:
|
if self._websocket:
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ import sys
|
|||||||
from pipecat.services import DeprecatedModuleProxy
|
from pipecat.services import DeprecatedModuleProxy
|
||||||
|
|
||||||
from .llm import *
|
from .llm import *
|
||||||
|
from .nova_sonic import *
|
||||||
from .stt import *
|
from .stt import *
|
||||||
from .tts import *
|
from .tts import *
|
||||||
|
|
||||||
|
|||||||
@@ -61,7 +61,6 @@ from pipecat.utils.tracing.service_decorators import traced_llm
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
import aioboto3
|
import aioboto3
|
||||||
import httpx
|
|
||||||
from botocore.config import Config
|
from botocore.config import Config
|
||||||
from botocore.exceptions import ReadTimeoutError
|
from botocore.exceptions import ReadTimeoutError
|
||||||
except ModuleNotFoundError as e:
|
except ModuleNotFoundError as e:
|
||||||
@@ -1117,7 +1116,7 @@ class AWSBedrockLLMService(LLMService):
|
|||||||
# also get cancelled.
|
# also get cancelled.
|
||||||
use_completion_tokens_estimate = True
|
use_completion_tokens_estimate = True
|
||||||
raise
|
raise
|
||||||
except httpx.TimeoutException:
|
except (ReadTimeoutError, asyncio.TimeoutError):
|
||||||
await self._call_event_handler("on_completion_timeout")
|
await self._call_event_handler("on_completion_timeout")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.exception(f"{self} exception: {e}")
|
logger.exception(f"{self} exception: {e}")
|
||||||
|
|||||||
0
src/pipecat/services/aws/nova_sonic/__init__.py
Normal file
0
src/pipecat/services/aws/nova_sonic/__init__.py
Normal file
87
src/pipecat/services/aws/nova_sonic/context.py
Normal file
87
src/pipecat/services/aws/nova_sonic/context.py
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2025, Daily
|
||||||
|
#
|
||||||
|
# SPDX-License-Identifier: BSD 2-Clause License
|
||||||
|
#
|
||||||
|
|
||||||
|
"""Context management for AWS Nova Sonic LLM service.
|
||||||
|
|
||||||
|
This module provides specialized context aggregators and message handling for AWS Nova Sonic,
|
||||||
|
including conversation history management and role-specific message processing.
|
||||||
|
|
||||||
|
.. deprecated:: 0.0.91
|
||||||
|
AWS Nova Sonic now supports `LLMContext` and `LLMContextAggregatorPair`.
|
||||||
|
Using the new patterns should allow you to not need types from this module.
|
||||||
|
|
||||||
|
BEFORE:
|
||||||
|
```
|
||||||
|
# Setup
|
||||||
|
context = OpenAILLMContext(messages, tools)
|
||||||
|
context_aggregator = llm.create_context_aggregator(context)
|
||||||
|
|
||||||
|
# Context frame type
|
||||||
|
frame: OpenAILLMContextFrame
|
||||||
|
|
||||||
|
# Context type
|
||||||
|
context: AWSNovaSonicLLMContext
|
||||||
|
# or
|
||||||
|
context: OpenAILLMContext
|
||||||
|
|
||||||
|
# Reading messages from context
|
||||||
|
messages = context.messages
|
||||||
|
```
|
||||||
|
|
||||||
|
AFTER:
|
||||||
|
```
|
||||||
|
# Setup
|
||||||
|
context = LLMContext(messages, tools)
|
||||||
|
context_aggregator = LLMContextAggregatorPair(context)
|
||||||
|
|
||||||
|
# Context frame type
|
||||||
|
frame: LLMContextFrame
|
||||||
|
|
||||||
|
# Context type
|
||||||
|
context: LLMContext
|
||||||
|
|
||||||
|
# Reading messages from context
|
||||||
|
messages = context.get_messages()
|
||||||
|
```
|
||||||
|
"""
|
||||||
|
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
warnings.simplefilter("always")
|
||||||
|
warnings.warn(
|
||||||
|
"Types in pipecat.services.aws.nova_sonic.context are deprecated. \n"
|
||||||
|
"AWS Nova Sonic now supports `LLMContext` and `LLMContextAggregatorPair`. \n"
|
||||||
|
"Using the new patterns should allow you to not need types from this module.\n\n"
|
||||||
|
"BEFORE:\n"
|
||||||
|
"```\n"
|
||||||
|
"# Setup\n"
|
||||||
|
"context = OpenAILLMContext(messages, tools)\n"
|
||||||
|
"context_aggregator = llm.create_context_aggregator(context)\n\n"
|
||||||
|
"# Context frame type\n"
|
||||||
|
"frame: OpenAILLMContextFrame\n\n"
|
||||||
|
"# Context type\n"
|
||||||
|
"context: AWSNovaSonicLLMContext\n"
|
||||||
|
"# or\n"
|
||||||
|
"context: OpenAILLMContext\n\n"
|
||||||
|
"# Reading messages from context\n"
|
||||||
|
"messages = context.messages\n"
|
||||||
|
"```\n\n"
|
||||||
|
"AFTER:\n"
|
||||||
|
"```\n"
|
||||||
|
"# Setup\n"
|
||||||
|
"context = LLMContext(messages, tools)\n"
|
||||||
|
"context_aggregator = LLMContextAggregatorPair(context)\n\n"
|
||||||
|
"# Context frame type\n"
|
||||||
|
"frame: LLMContextFrame\n\n"
|
||||||
|
"# Context type\n"
|
||||||
|
"context: LLMContext\n\n"
|
||||||
|
"# Reading messages from context\n"
|
||||||
|
"messages = context.get_messages()\n"
|
||||||
|
"```",
|
||||||
|
DeprecationWarning,
|
||||||
|
stacklevel=2,
|
||||||
|
)
|
||||||
25
src/pipecat/services/aws/nova_sonic/frames.py
Normal file
25
src/pipecat/services/aws/nova_sonic/frames.py
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2025, Daily
|
||||||
|
#
|
||||||
|
# SPDX-License-Identifier: BSD 2-Clause License
|
||||||
|
#
|
||||||
|
|
||||||
|
"""Custom frames for AWS Nova Sonic LLM service."""
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from pipecat.frames.frames import DataFrame, FunctionCallResultFrame
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class AWSNovaSonicFunctionCallResultFrame(DataFrame):
|
||||||
|
"""Frame containing function call result for AWS Nova Sonic processing.
|
||||||
|
|
||||||
|
This frame wraps a standard function call result frame to enable
|
||||||
|
AWS Nova Sonic-specific handling and context updates.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
result_frame: The underlying function call result frame.
|
||||||
|
"""
|
||||||
|
|
||||||
|
result_frame: FunctionCallResultFrame
|
||||||
1265
src/pipecat/services/aws/nova_sonic/llm.py
Normal file
1265
src/pipecat/services/aws/nova_sonic/llm.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -286,6 +286,7 @@ class AWSTranscribeSTTService(STTService):
|
|||||||
|
|
||||||
logger.info(f"{self} Successfully connected to AWS Transcribe")
|
logger.info(f"{self} Successfully connected to AWS Transcribe")
|
||||||
|
|
||||||
|
await self._call_event_handler("on_connected")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"{self} Failed to connect to AWS Transcribe: {e}")
|
logger.error(f"{self} Failed to connect to AWS Transcribe: {e}")
|
||||||
await self._disconnect()
|
await self._disconnect()
|
||||||
@@ -310,6 +311,7 @@ class AWSTranscribeSTTService(STTService):
|
|||||||
logger.warning(f"{self} Error closing WebSocket connection: {e}")
|
logger.warning(f"{self} Error closing WebSocket connection: {e}")
|
||||||
finally:
|
finally:
|
||||||
self._ws_client = None
|
self._ws_client = None
|
||||||
|
await self._call_event_handler("on_disconnected")
|
||||||
|
|
||||||
def language_to_service_language(self, language: Language) -> str | None:
|
def language_to_service_language(self, language: Language) -> str | None:
|
||||||
"""Convert internal language enum to AWS Transcribe language code.
|
"""Convert internal language enum to AWS Transcribe language code.
|
||||||
|
|||||||
@@ -1 +1,19 @@
|
|||||||
from .aws import AWSNovaSonicLLMService, Params
|
#
|
||||||
|
# Copyright (c) 2025, Daily
|
||||||
|
#
|
||||||
|
# SPDX-License-Identifier: BSD 2-Clause License
|
||||||
|
#
|
||||||
|
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
from pipecat.services.aws.nova_sonic.llm import AWSNovaSonicLLMService, Params
|
||||||
|
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
warnings.simplefilter("always")
|
||||||
|
warnings.warn(
|
||||||
|
"Types in pipecat.services.aws_nova_sonic are deprecated. "
|
||||||
|
"Please use the equivalent types from "
|
||||||
|
"pipecat.services.aws.nova_sonic.llm instead.",
|
||||||
|
DeprecationWarning,
|
||||||
|
stacklevel=2,
|
||||||
|
)
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -8,360 +8,80 @@
|
|||||||
|
|
||||||
This module provides specialized context aggregators and message handling for AWS Nova Sonic,
|
This module provides specialized context aggregators and message handling for AWS Nova Sonic,
|
||||||
including conversation history management and role-specific message processing.
|
including conversation history management and role-specific message processing.
|
||||||
|
|
||||||
|
.. deprecated:: 0.0.91
|
||||||
|
AWS Nova Sonic now supports `LLMContext` and `LLMContextAggregatorPair`.
|
||||||
|
Using the new patterns should allow you to not need types from this module.
|
||||||
|
|
||||||
|
BEFORE:
|
||||||
|
```
|
||||||
|
# Setup
|
||||||
|
context = OpenAILLMContext(messages, tools)
|
||||||
|
context_aggregator = llm.create_context_aggregator(context)
|
||||||
|
|
||||||
|
# Context frame type
|
||||||
|
frame: OpenAILLMContextFrame
|
||||||
|
|
||||||
|
# Context type
|
||||||
|
context: AWSNovaSonicLLMContext
|
||||||
|
# or
|
||||||
|
context: OpenAILLMContext
|
||||||
|
|
||||||
|
# Reading messages from context
|
||||||
|
messages = context.messages
|
||||||
|
```
|
||||||
|
|
||||||
|
AFTER:
|
||||||
|
```
|
||||||
|
# Setup
|
||||||
|
context = LLMContext(messages, tools)
|
||||||
|
context_aggregator = LLMContextAggregatorPair(context)
|
||||||
|
|
||||||
|
# Context frame type
|
||||||
|
frame: LLMContextFrame
|
||||||
|
|
||||||
|
# Context type
|
||||||
|
context: LLMContext
|
||||||
|
|
||||||
|
# Reading messages from context
|
||||||
|
messages = context.get_messages()
|
||||||
|
```
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import copy
|
import warnings
|
||||||
from dataclasses import dataclass, field
|
|
||||||
from enum import Enum
|
with warnings.catch_warnings():
|
||||||
|
warnings.simplefilter("always")
|
||||||
from loguru import logger
|
warnings.warn(
|
||||||
|
"Types in pipecat.services.aws_nova_sonic.context are deprecated. \n"
|
||||||
from pipecat.frames.frames import (
|
"AWS Nova Sonic now supports `LLMContext` and `LLMContextAggregatorPair`. \n"
|
||||||
BotStoppedSpeakingFrame,
|
"Using the new patterns should allow you to not need types from this module.\n\n"
|
||||||
DataFrame,
|
"BEFORE:\n"
|
||||||
Frame,
|
"```\n"
|
||||||
FunctionCallResultFrame,
|
"# Setup\n"
|
||||||
InterruptionFrame,
|
"context = OpenAILLMContext(messages, tools)\n"
|
||||||
LLMFullResponseEndFrame,
|
"context_aggregator = llm.create_context_aggregator(context)\n\n"
|
||||||
LLMFullResponseStartFrame,
|
"# Context frame type\n"
|
||||||
LLMMessagesAppendFrame,
|
"frame: OpenAILLMContextFrame\n\n"
|
||||||
LLMMessagesUpdateFrame,
|
"# Context type\n"
|
||||||
LLMSetToolChoiceFrame,
|
"context: AWSNovaSonicLLMContext\n"
|
||||||
LLMSetToolsFrame,
|
"# or\n"
|
||||||
TextFrame,
|
"context: OpenAILLMContext\n\n"
|
||||||
UserImageRawFrame,
|
"# Reading messages from context\n"
|
||||||
)
|
"messages = context.messages\n"
|
||||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
"```\n\n"
|
||||||
from pipecat.processors.frame_processor import FrameDirection
|
"AFTER:\n"
|
||||||
from pipecat.services.aws_nova_sonic.frames import AWSNovaSonicFunctionCallResultFrame
|
"```\n"
|
||||||
from pipecat.services.openai.llm import (
|
"# Setup\n"
|
||||||
OpenAIAssistantContextAggregator,
|
"context = LLMContext(messages, tools)\n"
|
||||||
OpenAIUserContextAggregator,
|
"context_aggregator = LLMContextAggregatorPair(context)\n\n"
|
||||||
)
|
"# Context frame type\n"
|
||||||
|
"frame: LLMContextFrame\n\n"
|
||||||
|
"# Context type\n"
|
||||||
class Role(Enum):
|
"context: LLMContext\n\n"
|
||||||
"""Roles supported in AWS Nova Sonic conversations.
|
"# Reading messages from context\n"
|
||||||
|
"messages = context.get_messages()\n"
|
||||||
Parameters:
|
"```",
|
||||||
SYSTEM: System-level messages (not used in conversation history).
|
DeprecationWarning,
|
||||||
USER: Messages sent by the user.
|
stacklevel=2,
|
||||||
ASSISTANT: Messages sent by the assistant.
|
)
|
||||||
TOOL: Messages sent by tools (not used in conversation history).
|
|
||||||
"""
|
|
||||||
|
|
||||||
SYSTEM = "SYSTEM"
|
|
||||||
USER = "USER"
|
|
||||||
ASSISTANT = "ASSISTANT"
|
|
||||||
TOOL = "TOOL"
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class AWSNovaSonicConversationHistoryMessage:
|
|
||||||
"""A single message in AWS Nova Sonic conversation history.
|
|
||||||
|
|
||||||
Parameters:
|
|
||||||
role: The role of the message sender (USER or ASSISTANT only).
|
|
||||||
text: The text content of the message.
|
|
||||||
"""
|
|
||||||
|
|
||||||
role: Role # only USER and ASSISTANT
|
|
||||||
text: str
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class AWSNovaSonicConversationHistory:
|
|
||||||
"""Complete conversation history for AWS Nova Sonic initialization.
|
|
||||||
|
|
||||||
Parameters:
|
|
||||||
system_instruction: System-level instruction for the conversation.
|
|
||||||
messages: List of conversation messages between user and assistant.
|
|
||||||
"""
|
|
||||||
|
|
||||||
system_instruction: str = None
|
|
||||||
messages: list[AWSNovaSonicConversationHistoryMessage] = field(default_factory=list)
|
|
||||||
|
|
||||||
|
|
||||||
class AWSNovaSonicLLMContext(OpenAILLMContext):
|
|
||||||
"""Specialized LLM context for AWS Nova Sonic service.
|
|
||||||
|
|
||||||
Extends OpenAI context with Nova Sonic-specific message handling,
|
|
||||||
conversation history management, and text buffering capabilities.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, messages=None, tools=None, **kwargs):
|
|
||||||
"""Initialize AWS Nova Sonic LLM context.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
messages: Initial messages for the context.
|
|
||||||
tools: Available tools for the context.
|
|
||||||
**kwargs: Additional arguments passed to parent class.
|
|
||||||
"""
|
|
||||||
super().__init__(messages=messages, tools=tools, **kwargs)
|
|
||||||
self.__setup_local()
|
|
||||||
|
|
||||||
def __setup_local(self, system_instruction: str = ""):
|
|
||||||
self._assistant_text = ""
|
|
||||||
self._user_text = ""
|
|
||||||
self._system_instruction = system_instruction
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def upgrade_to_nova_sonic(
|
|
||||||
obj: OpenAILLMContext, system_instruction: str
|
|
||||||
) -> "AWSNovaSonicLLMContext":
|
|
||||||
"""Upgrade an OpenAI context to AWS Nova Sonic context.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
obj: The OpenAI context to upgrade.
|
|
||||||
system_instruction: System instruction for the context.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
The upgraded AWS Nova Sonic context.
|
|
||||||
"""
|
|
||||||
if isinstance(obj, OpenAILLMContext) and not isinstance(obj, AWSNovaSonicLLMContext):
|
|
||||||
obj.__class__ = AWSNovaSonicLLMContext
|
|
||||||
obj.__setup_local(system_instruction)
|
|
||||||
return obj
|
|
||||||
|
|
||||||
# NOTE: this method has the side-effect of updating _system_instruction from messages
|
|
||||||
def get_messages_for_initializing_history(self) -> AWSNovaSonicConversationHistory:
|
|
||||||
"""Get conversation history for initializing AWS Nova Sonic session.
|
|
||||||
|
|
||||||
Processes stored messages and extracts system instruction and conversation
|
|
||||||
history in the format expected by AWS Nova Sonic.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Formatted conversation history with system instruction and messages.
|
|
||||||
"""
|
|
||||||
history = AWSNovaSonicConversationHistory(system_instruction=self._system_instruction)
|
|
||||||
|
|
||||||
# Bail if there are no messages
|
|
||||||
if not self.messages:
|
|
||||||
return history
|
|
||||||
|
|
||||||
messages = copy.deepcopy(self.messages)
|
|
||||||
|
|
||||||
# If we have a "system" message as our first message, let's pull that out into "instruction"
|
|
||||||
if messages[0].get("role") == "system":
|
|
||||||
system = messages.pop(0)
|
|
||||||
content = system.get("content")
|
|
||||||
if isinstance(content, str):
|
|
||||||
history.system_instruction = content
|
|
||||||
elif isinstance(content, list):
|
|
||||||
history.system_instruction = content[0].get("text")
|
|
||||||
if history.system_instruction:
|
|
||||||
self._system_instruction = history.system_instruction
|
|
||||||
|
|
||||||
# Process remaining messages to fill out conversation history.
|
|
||||||
# Nova Sonic supports "user" and "assistant" messages in history.
|
|
||||||
for message in messages:
|
|
||||||
history_message = self.from_standard_message(message)
|
|
||||||
if history_message:
|
|
||||||
history.messages.append(history_message)
|
|
||||||
|
|
||||||
return history
|
|
||||||
|
|
||||||
def get_messages_for_persistent_storage(self):
|
|
||||||
"""Get messages formatted for persistent storage.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of messages including system instruction if present.
|
|
||||||
"""
|
|
||||||
messages = super().get_messages_for_persistent_storage()
|
|
||||||
# If we have a system instruction and messages doesn't already contain it, add it
|
|
||||||
if self._system_instruction and not (messages and messages[0].get("role") == "system"):
|
|
||||||
messages.insert(0, {"role": "system", "content": self._system_instruction})
|
|
||||||
return messages
|
|
||||||
|
|
||||||
def from_standard_message(self, message) -> AWSNovaSonicConversationHistoryMessage:
|
|
||||||
"""Convert standard message format to Nova Sonic format.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
message: Standard message dictionary to convert.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Nova Sonic conversation history message, or None if not convertible.
|
|
||||||
"""
|
|
||||||
role = message.get("role")
|
|
||||||
if message.get("role") == "user" or message.get("role") == "assistant":
|
|
||||||
content = message.get("content")
|
|
||||||
if isinstance(message.get("content"), list):
|
|
||||||
content = ""
|
|
||||||
for c in message.get("content"):
|
|
||||||
if c.get("type") == "text":
|
|
||||||
content += " " + c.get("text")
|
|
||||||
else:
|
|
||||||
logger.error(
|
|
||||||
f"Unhandled content type in context message: {c.get('type')} - {message}"
|
|
||||||
)
|
|
||||||
# There won't be content if this is an assistant tool call entry.
|
|
||||||
# We're ignoring those since they can't be loaded into AWS Nova Sonic conversation
|
|
||||||
# history
|
|
||||||
if content:
|
|
||||||
return AWSNovaSonicConversationHistoryMessage(role=Role[role.upper()], text=content)
|
|
||||||
# NOTE: we're ignoring messages with role "tool" since they can't be loaded into AWS Nova
|
|
||||||
# Sonic conversation history
|
|
||||||
|
|
||||||
def buffer_user_text(self, text):
|
|
||||||
"""Buffer user text for later flushing to context.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
text: User text to buffer.
|
|
||||||
"""
|
|
||||||
self._user_text += f" {text}" if self._user_text else text
|
|
||||||
# logger.debug(f"User text buffered: {self._user_text}")
|
|
||||||
|
|
||||||
def flush_aggregated_user_text(self) -> str:
|
|
||||||
"""Flush buffered user text to context as a complete message.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
The flushed user text, or empty string if no text was buffered.
|
|
||||||
"""
|
|
||||||
if not self._user_text:
|
|
||||||
return ""
|
|
||||||
user_text = self._user_text
|
|
||||||
message = {
|
|
||||||
"role": "user",
|
|
||||||
"content": [{"type": "text", "text": user_text}],
|
|
||||||
}
|
|
||||||
self._user_text = ""
|
|
||||||
self.add_message(message)
|
|
||||||
# logger.debug(f"Context updated (user): {self.get_messages_for_logging()}")
|
|
||||||
return user_text
|
|
||||||
|
|
||||||
def buffer_assistant_text(self, text):
|
|
||||||
"""Buffer assistant text for later flushing to context.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
text: Assistant text to buffer.
|
|
||||||
"""
|
|
||||||
self._assistant_text += text
|
|
||||||
# logger.debug(f"Assistant text buffered: {self._assistant_text}")
|
|
||||||
|
|
||||||
def flush_aggregated_assistant_text(self):
|
|
||||||
"""Flush buffered assistant text to context as a complete message."""
|
|
||||||
if not self._assistant_text:
|
|
||||||
return
|
|
||||||
message = {
|
|
||||||
"role": "assistant",
|
|
||||||
"content": [{"type": "text", "text": self._assistant_text}],
|
|
||||||
}
|
|
||||||
self._assistant_text = ""
|
|
||||||
self.add_message(message)
|
|
||||||
# logger.debug(f"Context updated (assistant): {self.get_messages_for_logging()}")
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class AWSNovaSonicMessagesUpdateFrame(DataFrame):
|
|
||||||
"""Frame containing updated AWS Nova Sonic context.
|
|
||||||
|
|
||||||
Parameters:
|
|
||||||
context: The updated AWS Nova Sonic LLM context.
|
|
||||||
"""
|
|
||||||
|
|
||||||
context: AWSNovaSonicLLMContext
|
|
||||||
|
|
||||||
|
|
||||||
class AWSNovaSonicUserContextAggregator(OpenAIUserContextAggregator):
|
|
||||||
"""Context aggregator for user messages in AWS Nova Sonic conversations.
|
|
||||||
|
|
||||||
Extends the OpenAI user context aggregator to emit Nova Sonic-specific
|
|
||||||
context update frames.
|
|
||||||
"""
|
|
||||||
|
|
||||||
async def process_frame(
|
|
||||||
self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM
|
|
||||||
):
|
|
||||||
"""Process frames and emit Nova Sonic-specific context updates.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
frame: The frame to process.
|
|
||||||
direction: The direction the frame is traveling.
|
|
||||||
"""
|
|
||||||
await super().process_frame(frame, direction)
|
|
||||||
|
|
||||||
# Parent does not push LLMMessagesUpdateFrame
|
|
||||||
if isinstance(frame, LLMMessagesUpdateFrame):
|
|
||||||
await self.push_frame(AWSNovaSonicMessagesUpdateFrame(context=self._context))
|
|
||||||
|
|
||||||
|
|
||||||
class AWSNovaSonicAssistantContextAggregator(OpenAIAssistantContextAggregator):
|
|
||||||
"""Context aggregator for assistant messages in AWS Nova Sonic conversations.
|
|
||||||
|
|
||||||
Provides specialized handling for assistant responses and function calls
|
|
||||||
in AWS Nova Sonic context, with custom frame processing logic.
|
|
||||||
"""
|
|
||||||
|
|
||||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
|
||||||
"""Process frames with Nova Sonic-specific logic.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
frame: The frame to process.
|
|
||||||
direction: The direction the frame is traveling.
|
|
||||||
"""
|
|
||||||
# HACK: For now, disable the context aggregator by making it just pass through all frames
|
|
||||||
# that the parent handles (except the function call stuff, which we still need).
|
|
||||||
# For an explanation of this hack, see
|
|
||||||
# AWSNovaSonicLLMService._report_assistant_response_text_added.
|
|
||||||
if isinstance(
|
|
||||||
frame,
|
|
||||||
(
|
|
||||||
InterruptionFrame,
|
|
||||||
LLMFullResponseStartFrame,
|
|
||||||
LLMFullResponseEndFrame,
|
|
||||||
TextFrame,
|
|
||||||
LLMMessagesAppendFrame,
|
|
||||||
LLMMessagesUpdateFrame,
|
|
||||||
LLMSetToolsFrame,
|
|
||||||
LLMSetToolChoiceFrame,
|
|
||||||
UserImageRawFrame,
|
|
||||||
BotStoppedSpeakingFrame,
|
|
||||||
),
|
|
||||||
):
|
|
||||||
await self.push_frame(frame, direction)
|
|
||||||
else:
|
|
||||||
await super().process_frame(frame, direction)
|
|
||||||
|
|
||||||
async def handle_function_call_result(self, frame: FunctionCallResultFrame):
|
|
||||||
"""Handle function call results for AWS Nova Sonic.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
frame: The function call result frame to handle.
|
|
||||||
"""
|
|
||||||
await super().handle_function_call_result(frame)
|
|
||||||
|
|
||||||
# The standard function callback code path pushes the FunctionCallResultFrame from the LLM
|
|
||||||
# itself, so we didn't have a chance to add the result to the AWS Nova Sonic server-side
|
|
||||||
# context. Let's push a special frame to do that.
|
|
||||||
await self.push_frame(
|
|
||||||
AWSNovaSonicFunctionCallResultFrame(result_frame=frame), FrameDirection.UPSTREAM
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class AWSNovaSonicContextAggregatorPair:
|
|
||||||
"""Pair of user and assistant context aggregators for AWS Nova Sonic.
|
|
||||||
|
|
||||||
Parameters:
|
|
||||||
_user: The user context aggregator.
|
|
||||||
_assistant: The assistant context aggregator.
|
|
||||||
"""
|
|
||||||
|
|
||||||
_user: AWSNovaSonicUserContextAggregator
|
|
||||||
_assistant: AWSNovaSonicAssistantContextAggregator
|
|
||||||
|
|
||||||
def user(self) -> AWSNovaSonicUserContextAggregator:
|
|
||||||
"""Get the user context aggregator.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
The user context aggregator instance.
|
|
||||||
"""
|
|
||||||
return self._user
|
|
||||||
|
|
||||||
def assistant(self) -> AWSNovaSonicAssistantContextAggregator:
|
|
||||||
"""Get the assistant context aggregator.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
The assistant context aggregator instance.
|
|
||||||
"""
|
|
||||||
return self._assistant
|
|
||||||
|
|||||||
@@ -6,20 +6,16 @@
|
|||||||
|
|
||||||
"""Custom frames for AWS Nova Sonic LLM service."""
|
"""Custom frames for AWS Nova Sonic LLM service."""
|
||||||
|
|
||||||
from dataclasses import dataclass
|
import warnings
|
||||||
|
|
||||||
from pipecat.frames.frames import DataFrame, FunctionCallResultFrame
|
from pipecat.services.aws.nova_sonic.frames import *
|
||||||
|
|
||||||
|
with warnings.catch_warnings():
|
||||||
@dataclass
|
warnings.simplefilter("always")
|
||||||
class AWSNovaSonicFunctionCallResultFrame(DataFrame):
|
warnings.warn(
|
||||||
"""Frame containing function call result for AWS Nova Sonic processing.
|
"Types in pipecat.services.aws_nova_sonic.frames are deprecated. "
|
||||||
|
"Please use the equivalent types from "
|
||||||
This frame wraps a standard function call result frame to enable
|
"pipecat.services.aws.nova_sonic.frames instead.",
|
||||||
AWS Nova Sonic-specific handling and context updates.
|
DeprecationWarning,
|
||||||
|
stacklevel=2,
|
||||||
Parameters:
|
)
|
||||||
result_frame: The underlying function call result frame.
|
|
||||||
"""
|
|
||||||
|
|
||||||
result_frame: FunctionCallResultFrame
|
|
||||||
|
|||||||
0
src/pipecat/services/azure/realtime/__init__.py
Normal file
0
src/pipecat/services/azure/realtime/__init__.py
Normal file
65
src/pipecat/services/azure/realtime/llm.py
Normal file
65
src/pipecat/services/azure/realtime/llm.py
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2024–2025, Daily
|
||||||
|
#
|
||||||
|
# SPDX-License-Identifier: BSD 2-Clause License
|
||||||
|
#
|
||||||
|
|
||||||
|
"""Azure OpenAI Realtime LLM service implementation."""
|
||||||
|
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService
|
||||||
|
|
||||||
|
try:
|
||||||
|
from websockets.asyncio.client import connect as websocket_connect
|
||||||
|
except ModuleNotFoundError as e:
|
||||||
|
logger.error(f"Exception: {e}")
|
||||||
|
logger.error("In order to use Azure Realtime, you need to `pip install pipecat-ai[openai]`.")
|
||||||
|
raise Exception(f"Missing module: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
class AzureRealtimeLLMService(OpenAIRealtimeLLMService):
|
||||||
|
"""Azure OpenAI Realtime LLM service with Azure-specific authentication.
|
||||||
|
|
||||||
|
Extends the OpenAI Realtime service to work with Azure OpenAI endpoints,
|
||||||
|
using Azure's authentication headers and endpoint format. Provides the same
|
||||||
|
real-time audio and text communication capabilities as the base OpenAI service.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
api_key: str,
|
||||||
|
base_url: str,
|
||||||
|
**kwargs,
|
||||||
|
):
|
||||||
|
"""Initialize Azure Realtime LLM service.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
api_key: The API key for the Azure OpenAI service.
|
||||||
|
base_url: The full Azure WebSocket endpoint URL including api-version and deployment.
|
||||||
|
Example: "wss://my-project.openai.azure.com/openai/realtime?api-version=2024-10-01-preview&deployment=my-realtime-deployment"
|
||||||
|
**kwargs: Additional arguments passed to parent OpenAIRealtimeLLMService.
|
||||||
|
"""
|
||||||
|
super().__init__(base_url=base_url, api_key=api_key, **kwargs)
|
||||||
|
self.api_key = api_key
|
||||||
|
self.base_url = base_url
|
||||||
|
|
||||||
|
async def _connect(self):
|
||||||
|
try:
|
||||||
|
if self._websocket:
|
||||||
|
# Here we assume that if we have a websocket, we are connected. We
|
||||||
|
# handle disconnections in the send/recv code paths.
|
||||||
|
return
|
||||||
|
|
||||||
|
logger.info(f"Connecting to {self.base_url}, api key: {self.api_key}")
|
||||||
|
self._websocket = await websocket_connect(
|
||||||
|
uri=self.base_url,
|
||||||
|
additional_headers={
|
||||||
|
"api-key": self.api_key,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
self._receive_task = self.create_task(self._receive_task_handler())
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"{self} initialization error: {e}")
|
||||||
|
self._websocket = None
|
||||||
@@ -28,13 +28,12 @@ from pipecat.frames.frames import (
|
|||||||
UserStoppedSpeakingFrame,
|
UserStoppedSpeakingFrame,
|
||||||
)
|
)
|
||||||
from pipecat.processors.frame_processor import FrameDirection
|
from pipecat.processors.frame_processor import FrameDirection
|
||||||
from pipecat.services.stt_service import STTService
|
from pipecat.services.stt_service import WebsocketSTTService
|
||||||
from pipecat.transcriptions.language import Language
|
from pipecat.transcriptions.language import Language
|
||||||
from pipecat.utils.time import time_now_iso8601
|
from pipecat.utils.time import time_now_iso8601
|
||||||
from pipecat.utils.tracing.service_decorators import traced_stt
|
from pipecat.utils.tracing.service_decorators import traced_stt
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import websockets
|
|
||||||
from websockets.asyncio.client import connect as websocket_connect
|
from websockets.asyncio.client import connect as websocket_connect
|
||||||
from websockets.protocol import State
|
from websockets.protocol import State
|
||||||
except ModuleNotFoundError as e:
|
except ModuleNotFoundError as e:
|
||||||
@@ -124,7 +123,7 @@ class CartesiaLiveOptions:
|
|||||||
return cls(**json.loads(json_str))
|
return cls(**json.loads(json_str))
|
||||||
|
|
||||||
|
|
||||||
class CartesiaSTTService(STTService):
|
class CartesiaSTTService(WebsocketSTTService):
|
||||||
"""Speech-to-text service using Cartesia Live API.
|
"""Speech-to-text service using Cartesia Live API.
|
||||||
|
|
||||||
Provides real-time speech transcription through WebSocket connection
|
Provides real-time speech transcription through WebSocket connection
|
||||||
@@ -176,8 +175,7 @@ class CartesiaSTTService(STTService):
|
|||||||
self.set_model_name(merged_options.model)
|
self.set_model_name(merged_options.model)
|
||||||
self._api_key = api_key
|
self._api_key = api_key
|
||||||
self._base_url = base_url or "api.cartesia.ai"
|
self._base_url = base_url or "api.cartesia.ai"
|
||||||
self._connection = None
|
self._receive_task = None
|
||||||
self._receiver_task = None
|
|
||||||
|
|
||||||
def can_generate_metrics(self) -> bool:
|
def can_generate_metrics(self) -> bool:
|
||||||
"""Check if the service can generate processing metrics.
|
"""Check if the service can generate processing metrics.
|
||||||
@@ -214,6 +212,27 @@ class CartesiaSTTService(STTService):
|
|||||||
await super().cancel(frame)
|
await super().cancel(frame)
|
||||||
await self._disconnect()
|
await self._disconnect()
|
||||||
|
|
||||||
|
async def start_metrics(self):
|
||||||
|
"""Start performance metrics collection for transcription processing."""
|
||||||
|
await self.start_ttfb_metrics()
|
||||||
|
await self.start_processing_metrics()
|
||||||
|
|
||||||
|
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||||
|
"""Process incoming frames and handle speech events.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
frame: The frame to process.
|
||||||
|
direction: Direction of frame flow in the pipeline.
|
||||||
|
"""
|
||||||
|
await super().process_frame(frame, direction)
|
||||||
|
|
||||||
|
if isinstance(frame, UserStartedSpeakingFrame):
|
||||||
|
await self.start_metrics()
|
||||||
|
elif isinstance(frame, UserStoppedSpeakingFrame):
|
||||||
|
# Send finalize command to flush the transcription session
|
||||||
|
if self._websocket and self._websocket.state is State.OPEN:
|
||||||
|
await self._websocket.send("finalize")
|
||||||
|
|
||||||
async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
|
async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
|
||||||
"""Process audio data for speech-to-text transcription.
|
"""Process audio data for speech-to-text transcription.
|
||||||
|
|
||||||
@@ -224,45 +243,71 @@ class CartesiaSTTService(STTService):
|
|||||||
None - transcription results are handled via WebSocket responses.
|
None - transcription results are handled via WebSocket responses.
|
||||||
"""
|
"""
|
||||||
# If the connection is closed, due to timeout, we need to reconnect when the user starts speaking again
|
# If the connection is closed, due to timeout, we need to reconnect when the user starts speaking again
|
||||||
if not self._connection or self._connection.state is State.CLOSED:
|
if not self._websocket or self._websocket.state is State.CLOSED:
|
||||||
await self._connect()
|
await self._connect()
|
||||||
|
|
||||||
await self._connection.send(audio)
|
await self._websocket.send(audio)
|
||||||
yield None
|
yield None
|
||||||
|
|
||||||
async def _connect(self):
|
async def _connect(self):
|
||||||
params = self._settings.to_dict()
|
await self._connect_websocket()
|
||||||
ws_url = f"wss://{self._base_url}/stt/websocket?{urllib.parse.urlencode(params)}"
|
|
||||||
logger.debug(f"Connecting to Cartesia: {ws_url}")
|
|
||||||
headers = {"Cartesia-Version": "2025-04-16", "X-API-Key": self._api_key}
|
|
||||||
|
|
||||||
|
if self._websocket and not self._receive_task:
|
||||||
|
self._receive_task = asyncio.create_task(self._receive_task_handler(self._report_error))
|
||||||
|
|
||||||
|
async def _disconnect(self):
|
||||||
|
if self._receive_task:
|
||||||
|
await self.cancel_task(self._receive_task)
|
||||||
|
self._receive_task = None
|
||||||
|
|
||||||
|
await self._disconnect_websocket()
|
||||||
|
|
||||||
|
async def _connect_websocket(self):
|
||||||
try:
|
try:
|
||||||
self._connection = await websocket_connect(ws_url, additional_headers=headers)
|
if self._websocket and self._websocket.state is State.OPEN:
|
||||||
# Setup the receiver task to handle the incoming messages from the Cartesia server
|
return
|
||||||
if self._receiver_task is None or self._receiver_task.done():
|
logger.debug("Connecting to Cartesia STT")
|
||||||
self._receiver_task = asyncio.create_task(self._receive_messages())
|
|
||||||
logger.debug(f"Connected to Cartesia")
|
params = self._settings.to_dict()
|
||||||
|
ws_url = f"wss://{self._base_url}/stt/websocket?{urllib.parse.urlencode(params)}"
|
||||||
|
headers = {"Cartesia-Version": "2025-04-16", "X-API-Key": self._api_key}
|
||||||
|
|
||||||
|
self._websocket = await websocket_connect(ws_url, additional_headers=headers)
|
||||||
|
await self._call_event_handler("on_connected")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"{self}: unable to connect to Cartesia: {e}")
|
logger.error(f"{self}: unable to connect to Cartesia: {e}")
|
||||||
|
|
||||||
async def _receive_messages(self):
|
async def _disconnect_websocket(self):
|
||||||
try:
|
try:
|
||||||
while True:
|
if self._websocket and self._websocket.state is State.OPEN:
|
||||||
if not self._connection or self._connection.state is State.CLOSED:
|
logger.debug("Disconnecting from Cartesia STT")
|
||||||
break
|
await self._websocket.close()
|
||||||
|
|
||||||
message = await self._connection.recv()
|
|
||||||
try:
|
|
||||||
data = json.loads(message)
|
|
||||||
await self._process_response(data)
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
logger.warning(f"Received non-JSON message: {message}")
|
|
||||||
except asyncio.CancelledError:
|
|
||||||
pass
|
|
||||||
except websockets.exceptions.ConnectionClosed as e:
|
|
||||||
logger.debug(f"WebSocket connection closed: {e}")
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error in message receiver: {e}")
|
logger.error(f"{self} error closing websocket: {e}")
|
||||||
|
finally:
|
||||||
|
self._websocket = None
|
||||||
|
await self._call_event_handler("on_disconnected")
|
||||||
|
|
||||||
|
def _get_websocket(self):
|
||||||
|
if self._websocket:
|
||||||
|
return self._websocket
|
||||||
|
raise Exception("Websocket not connected")
|
||||||
|
|
||||||
|
async def _process_messages(self):
|
||||||
|
async for message in self._get_websocket():
|
||||||
|
try:
|
||||||
|
data = json.loads(message)
|
||||||
|
await self._process_response(data)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
logger.warning(f"Received non-JSON message: {message}")
|
||||||
|
|
||||||
|
async def _receive_messages(self):
|
||||||
|
while True:
|
||||||
|
await self._process_messages()
|
||||||
|
# Cartesia times out after 5 minutes of innactivity (no keepalive
|
||||||
|
# mechanism is available). So, we try to reconnect.
|
||||||
|
logger.debug(f"{self} Cartesia connection was disconnected (timeout?), reconnecting")
|
||||||
|
await self._connect_websocket()
|
||||||
|
|
||||||
async def _process_response(self, data):
|
async def _process_response(self, data):
|
||||||
if "type" in data:
|
if "type" in data:
|
||||||
@@ -316,41 +361,3 @@ class CartesiaSTTService(STTService):
|
|||||||
language,
|
language,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
async def _disconnect(self):
|
|
||||||
if self._receiver_task:
|
|
||||||
self._receiver_task.cancel()
|
|
||||||
try:
|
|
||||||
await self._receiver_task
|
|
||||||
except asyncio.CancelledError:
|
|
||||||
pass
|
|
||||||
except Exception as e:
|
|
||||||
logger.exception(f"Unexpected exception while cancelling task: {e}")
|
|
||||||
self._receiver_task = None
|
|
||||||
|
|
||||||
if self._connection and self._connection.state is State.OPEN:
|
|
||||||
logger.debug("Disconnecting from Cartesia")
|
|
||||||
|
|
||||||
await self._connection.close()
|
|
||||||
self._connection = None
|
|
||||||
|
|
||||||
async def start_metrics(self):
|
|
||||||
"""Start performance metrics collection for transcription processing."""
|
|
||||||
await self.start_ttfb_metrics()
|
|
||||||
await self.start_processing_metrics()
|
|
||||||
|
|
||||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
|
||||||
"""Process incoming frames and handle speech events.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
frame: The frame to process.
|
|
||||||
direction: Direction of frame flow in the pipeline.
|
|
||||||
"""
|
|
||||||
await super().process_frame(frame, direction)
|
|
||||||
|
|
||||||
if isinstance(frame, UserStartedSpeakingFrame):
|
|
||||||
await self.start_metrics()
|
|
||||||
elif isinstance(frame, UserStoppedSpeakingFrame):
|
|
||||||
# Send finalize command to flush the transcription session
|
|
||||||
if self._connection and self._connection.state is State.OPEN:
|
|
||||||
await self._connection.send("finalize")
|
|
||||||
|
|||||||
@@ -344,10 +344,11 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
|||||||
try:
|
try:
|
||||||
if self._websocket and self._websocket.state is State.OPEN:
|
if self._websocket and self._websocket.state is State.OPEN:
|
||||||
return
|
return
|
||||||
logger.debug("Connecting to Cartesia")
|
logger.debug("Connecting to Cartesia TTS")
|
||||||
self._websocket = await websocket_connect(
|
self._websocket = await websocket_connect(
|
||||||
f"{self._url}?api_key={self._api_key}&cartesia_version={self._cartesia_version}"
|
f"{self._url}?api_key={self._api_key}&cartesia_version={self._cartesia_version}"
|
||||||
)
|
)
|
||||||
|
await self._call_event_handler("on_connected")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"{self} initialization error: {e}")
|
logger.error(f"{self} initialization error: {e}")
|
||||||
self._websocket = None
|
self._websocket = None
|
||||||
@@ -365,6 +366,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
|||||||
finally:
|
finally:
|
||||||
self._context_id = None
|
self._context_id = None
|
||||||
self._websocket = None
|
self._websocket = None
|
||||||
|
await self._call_event_handler("on_disconnected")
|
||||||
|
|
||||||
def _get_websocket(self):
|
def _get_websocket(self):
|
||||||
if self._websocket:
|
if self._websocket:
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ import sys
|
|||||||
|
|
||||||
from pipecat.services import DeprecatedModuleProxy
|
from pipecat.services import DeprecatedModuleProxy
|
||||||
|
|
||||||
|
from .flux import *
|
||||||
from .stt import *
|
from .stt import *
|
||||||
from .tts import *
|
from .tts import *
|
||||||
|
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user