Compare commits
256 Commits
hush/prere
...
mb/cli
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4d02d886f3 | ||
|
|
42289eb30d | ||
|
|
a0c93ab6de | ||
|
|
4bec566bbf | ||
|
|
ec3cd24182 | ||
|
|
e36e64c2e8 | ||
|
|
02a88022dd | ||
|
|
6cae61f2cc | ||
|
|
3b40079120 | ||
|
|
ff0b38859b | ||
|
|
4d499324d1 | ||
|
|
f13e006db2 | ||
|
|
87d9e8c9cd | ||
|
|
4820f1c059 | ||
|
|
860c39d1b1 | ||
|
|
ae5c5ed7f6 | ||
|
|
7aa01c1ca8 | ||
|
|
4d6356748f | ||
|
|
5b1a182421 | ||
|
|
6ac0c34413 | ||
|
|
c115422dbf | ||
|
|
a2a973be27 | ||
|
|
0407744950 | ||
|
|
7ce370ccc6 | ||
|
|
a4867f61aa | ||
|
|
a67a765783 | ||
|
|
81221668b1 | ||
|
|
cc9c264940 | ||
|
|
f2c61ac9fd | ||
|
|
88f8c10f63 | ||
|
|
855f4842dd | ||
|
|
2bf44fe2af | ||
|
|
3e8a7cc254 | ||
|
|
a600c05570 | ||
|
|
3ba6b55659 | ||
|
|
d5f2dcfac0 | ||
|
|
d12134038b | ||
|
|
a22af3a7e0 | ||
|
|
76e07c6c48 | ||
|
|
8d8503bca7 | ||
|
|
a444097060 | ||
|
|
1b9e96c016 | ||
|
|
7967bc53c3 | ||
|
|
6381335346 | ||
|
|
0fd5d26104 | ||
|
|
41f817bf04 | ||
|
|
27115e6565 | ||
|
|
3c4807d7d4 | ||
|
|
8902f1dc94 | ||
|
|
a25333ee51 | ||
|
|
82c7d7ad83 | ||
|
|
ba2ab51ef7 | ||
|
|
22557fa668 | ||
|
|
3fbf59e7c6 | ||
|
|
129ab5ea0e | ||
|
|
dc917523d0 | ||
|
|
5ea7cc9d32 | ||
|
|
e11ede475b | ||
|
|
90d29e04af | ||
|
|
4c67136a8d | ||
|
|
9d78402a33 | ||
|
|
73877218e9 | ||
|
|
6a1be90cbb | ||
|
|
fbac959ecb | ||
|
|
18dd85431c | ||
|
|
abc569b3d2 | ||
|
|
fa5d4ecf86 | ||
|
|
83b0dc39f7 | ||
|
|
0c31b5ef19 | ||
|
|
d16c36c56d | ||
|
|
8fe3bcd484 | ||
|
|
be2858bfbb | ||
|
|
b6b0997553 | ||
|
|
3b751322d3 | ||
|
|
fce6f55ddb | ||
|
|
d9580f72a9 | ||
|
|
cc66ac14f1 | ||
|
|
9ddec0f8b4 | ||
|
|
9babfe9fd9 | ||
|
|
21d8d148b8 | ||
|
|
0588c82bbf | ||
|
|
16e9093d5a | ||
|
|
91a5d580fd | ||
|
|
0473556992 | ||
|
|
fdaa4e476e | ||
|
|
502e7e42a7 | ||
|
|
2ab3d4fb42 | ||
|
|
55014bdd77 | ||
|
|
334796bd65 | ||
|
|
1c25b6fb72 | ||
|
|
91b29de7ca | ||
|
|
21d610cd30 | ||
|
|
f7fe673ad1 | ||
|
|
4b415721e2 | ||
|
|
8d2a98e0e7 | ||
|
|
523e890c8c | ||
|
|
3c748fe772 | ||
|
|
d293cee372 | ||
|
|
8b62a96878 | ||
|
|
0c102ce70b | ||
|
|
3894d2a4b9 | ||
|
|
1f6b61c0db | ||
|
|
8ee28b37cd | ||
|
|
e85e7e4d84 | ||
|
|
1b3afb5511 | ||
|
|
7cec013666 | ||
|
|
86127167fb | ||
|
|
9935a68018 | ||
|
|
5679dde70f | ||
|
|
d81b0f6368 | ||
|
|
9698b008da | ||
|
|
7b05c9283b | ||
|
|
303dd2ec35 | ||
|
|
aa6e81648a | ||
|
|
1a87870ef3 | ||
|
|
aac4ce2d12 | ||
|
|
2a79b2c853 | ||
|
|
15bf5b1533 | ||
|
|
cdc86db8ce | ||
|
|
9d2ad750b5 | ||
|
|
19ceb1a48f | ||
|
|
59217eae38 | ||
|
|
bea0aee835 | ||
|
|
aeace9b9be | ||
|
|
2994640f47 | ||
|
|
10069719e4 | ||
|
|
046b76df60 | ||
|
|
f2d9063984 | ||
|
|
7c1e2793c5 | ||
|
|
99f008e927 | ||
|
|
2699f0c2a6 | ||
|
|
0b6dd98000 | ||
|
|
a14fb20d15 | ||
|
|
728361a6a7 | ||
|
|
106db69e8e | ||
|
|
cf90071926 | ||
|
|
deaeb75a1f | ||
|
|
a666327d70 | ||
|
|
13a0522546 | ||
|
|
7da37a0d1f | ||
|
|
7efb22a323 | ||
|
|
8084e2f909 | ||
|
|
86127c6a6e | ||
|
|
402e019ae2 | ||
|
|
f09e4e238b | ||
|
|
2921162b3b | ||
|
|
ac1582c906 | ||
|
|
e4b01a5844 | ||
|
|
fa663abbbc | ||
|
|
d19e6111c3 | ||
|
|
8a6d504a7e | ||
|
|
43915937f2 | ||
|
|
48e92a22fe | ||
|
|
566af6b0b8 | ||
|
|
12e7613d5f | ||
|
|
04a68f2c57 | ||
|
|
9b4ca12f49 | ||
|
|
453ce715a6 | ||
|
|
d87b6189ba | ||
|
|
8293347b77 | ||
|
|
c85a3f0b94 | ||
|
|
233fb25e6c | ||
|
|
080978daa6 | ||
|
|
62b7c3d3b2 | ||
|
|
4b2379cba8 | ||
|
|
92087bdfa8 | ||
|
|
617919ac09 | ||
|
|
0669daec3d | ||
|
|
7c15a8c800 | ||
|
|
066b77fba0 | ||
|
|
d9aef5f916 | ||
|
|
91ae3f8a9b | ||
|
|
36da623352 | ||
|
|
31b9087ea6 | ||
|
|
1851fed22e | ||
|
|
eddce460da | ||
|
|
da4f30cb6d | ||
|
|
250cf2d8f1 | ||
|
|
7bbdb4f991 | ||
|
|
051c4782fb | ||
|
|
b1ccec74b2 | ||
|
|
92bf0d9eda | ||
|
|
f985550441 | ||
|
|
de8ee96927 | ||
|
|
2576d0f340 | ||
|
|
f38f4711ac | ||
|
|
c2f3ddd329 | ||
|
|
73ffe96228 | ||
|
|
bd13a80da7 | ||
|
|
312959f97e | ||
|
|
fe168e3c68 | ||
|
|
28929a47f7 | ||
|
|
03f5defbc3 | ||
|
|
b216648315 | ||
|
|
084b133a01 | ||
|
|
e589876176 | ||
|
|
a826313bf9 | ||
|
|
49f44aa7c8 | ||
|
|
64ceef9cf0 | ||
|
|
cd6567c1f1 | ||
|
|
ac67ca1555 | ||
|
|
8d38994756 | ||
|
|
607e3040d4 | ||
|
|
60604a9449 | ||
|
|
4abe4a6253 | ||
|
|
4c054af17b | ||
|
|
dcba940d42 | ||
|
|
ad2adb0c58 | ||
|
|
76923010b5 | ||
|
|
1b511557b2 | ||
|
|
fdadb12933 | ||
|
|
f1bbb7ba22 | ||
|
|
c1492c5275 | ||
|
|
4ffdabcfde | ||
|
|
b489de2fc3 | ||
|
|
d9656cbb1a | ||
|
|
05fb223985 | ||
|
|
62a5f07ad2 | ||
|
|
b669e3a481 | ||
|
|
99f1041a47 | ||
|
|
37b1345bfa | ||
|
|
8994ac17eb | ||
|
|
63bc825008 | ||
|
|
e7ffde1c4c | ||
|
|
1c88565725 | ||
|
|
07a6c2fb0e | ||
|
|
e99f3bf75a | ||
|
|
f09d780413 | ||
|
|
e370d23374 | ||
|
|
b68ec14146 | ||
|
|
c567fd71b1 | ||
|
|
2ca1b2d6f8 | ||
|
|
04041a9a9a | ||
|
|
6c498dc70f | ||
|
|
32b07c1720 | ||
|
|
ad507ce23d | ||
|
|
be562cedfc | ||
|
|
089e703e1f | ||
|
|
4dc1e15a99 | ||
|
|
c7dc2e886f | ||
|
|
11bc4ea854 | ||
|
|
029d76033d | ||
|
|
924d7dea9a | ||
|
|
244e94f3ce | ||
|
|
af1f51d49e | ||
|
|
9ba3c168b8 | ||
|
|
e6ee8f7a16 | ||
|
|
2ea2bd99e0 | ||
|
|
0c2ced7c52 | ||
|
|
fb160646b8 | ||
|
|
89fed57af2 | ||
|
|
032032df65 | ||
|
|
a5595b82ea | ||
|
|
4d1915eb41 | ||
|
|
b3a84fc772 | ||
|
|
403d22e62c |
13
.github/workflows/build.yaml
vendored
13
.github/workflows/build.yaml
vendored
@@ -21,20 +21,21 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
with:
|
||||
fetch-depth: 0 # Fetch all history for setuptools_scm
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v3
|
||||
with:
|
||||
version: "latest"
|
||||
|
||||
|
||||
- name: Set up Python
|
||||
run: uv python install 3.10
|
||||
|
||||
|
||||
- name: Install development dependencies
|
||||
run: uv sync --group dev
|
||||
|
||||
|
||||
- name: Build project
|
||||
run: uv build
|
||||
|
||||
|
||||
- name: Install project in editable mode
|
||||
run: uv pip install --editable .
|
||||
run: uv pip install --editable .
|
||||
|
||||
22
.github/workflows/publish.yaml
vendored
22
.github/workflows/publish.yaml
vendored
@@ -5,25 +5,25 @@ on:
|
||||
inputs:
|
||||
gitref:
|
||||
type: string
|
||||
description: "what git tag to build (e.g. v0.0.74)"
|
||||
description: 'what git tag to build (e.g. v0.0.74)'
|
||||
required: true
|
||||
|
||||
jobs:
|
||||
build:
|
||||
name: "Build and upload wheels"
|
||||
name: 'Build and upload wheels'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repo
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event.inputs.gitref }}
|
||||
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v3
|
||||
with:
|
||||
version: "latest"
|
||||
version: 'latest'
|
||||
- name: Set up Python
|
||||
run: uv python install 3.10
|
||||
run: uv python install 3.12
|
||||
- name: Install development dependencies
|
||||
run: uv sync --group dev
|
||||
- name: Build project
|
||||
@@ -35,9 +35,9 @@ jobs:
|
||||
path: ./dist
|
||||
|
||||
publish-to-pypi:
|
||||
name: "Publish to PyPI"
|
||||
name: 'Publish to PyPI'
|
||||
runs-on: ubuntu-latest
|
||||
needs: [ build ]
|
||||
needs: [build]
|
||||
environment:
|
||||
name: pypi
|
||||
url: https://pypi.org/p/pipecat-ai
|
||||
@@ -56,12 +56,12 @@ jobs:
|
||||
print-hash: true
|
||||
|
||||
publish-to-test-pypi:
|
||||
name: "Publish to Test PyPI"
|
||||
name: 'Publish to Test PyPI'
|
||||
runs-on: ubuntu-latest
|
||||
needs: [ build ]
|
||||
needs: [build]
|
||||
environment:
|
||||
name: testpypi
|
||||
url: https://pypi.org/p/pipecat-ai
|
||||
url: https://test.pypi.org/p/pipecat-ai
|
||||
permissions:
|
||||
id-token: write
|
||||
steps:
|
||||
@@ -70,7 +70,7 @@ jobs:
|
||||
with:
|
||||
name: wheels
|
||||
path: ./dist
|
||||
- name: Publish to PyPI
|
||||
- name: Publish to Test PyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
verbose: true
|
||||
|
||||
12
.github/workflows/publish_test.yaml
vendored
12
.github/workflows/publish_test.yaml
vendored
@@ -4,7 +4,7 @@ on: workflow_dispatch
|
||||
|
||||
jobs:
|
||||
build:
|
||||
name: "Build and upload wheels"
|
||||
name: 'Build and upload wheels'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repo
|
||||
@@ -15,9 +15,9 @@ jobs:
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v3
|
||||
with:
|
||||
version: "latest"
|
||||
version: 'latest'
|
||||
- name: Set up Python
|
||||
run: uv python install 3.10
|
||||
run: uv python install 3.12
|
||||
- name: Install development dependencies
|
||||
run: uv sync --group dev
|
||||
- name: Build project
|
||||
@@ -29,12 +29,12 @@ jobs:
|
||||
path: ./dist
|
||||
|
||||
publish-to-test-pypi:
|
||||
name: "Publish to Test PyPI"
|
||||
name: 'Publish to Test PyPI'
|
||||
runs-on: ubuntu-latest
|
||||
needs: [build]
|
||||
environment:
|
||||
name: testpypi
|
||||
url: https://pypi.org/p/pipecat-ai
|
||||
url: https://test.pypi.org/p/pipecat-ai
|
||||
permissions:
|
||||
id-token: write
|
||||
steps:
|
||||
@@ -43,7 +43,7 @@ jobs:
|
||||
with:
|
||||
name: wheels
|
||||
path: ./dist
|
||||
- name: Publish to PyPI
|
||||
- name: Publish to Test PyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
verbose: true
|
||||
|
||||
287
CHANGELOG.md
287
CHANGELOG.md
@@ -9,6 +9,265 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
### Added
|
||||
|
||||
- Added the [Pipecat CLI](https://github.com/pipecat-ai/pipecat-cli) to the
|
||||
required dependencies, enabling you to scaffold a new project directly from
|
||||
`pipecat-ai`. Get started with:
|
||||
|
||||
```bash
|
||||
uv run pipecat init
|
||||
```
|
||||
|
||||
- Expanded support for universal `LLMContext` to `AWSNovaSonicLLMService`.
|
||||
As a reminder, the context-setup pattern when using `LLMContext` is:
|
||||
|
||||
```python
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
```
|
||||
|
||||
(Note that even though `AWSNovaSonicLLMService` now supports the universal
|
||||
`LLMContext`, it is not meant to be swapped out for another LLM service at
|
||||
runtime.)
|
||||
|
||||
Worth noting: whether or not you use the new context-setup pattern with
|
||||
`AWSNovaSonicLLMService`, some types have changed under the hood:
|
||||
|
||||
```python
|
||||
## BEFORE:
|
||||
|
||||
# Context aggregator type
|
||||
context_aggregator: AWSNovaSonicContextAggregatorPair
|
||||
|
||||
# Context frame type
|
||||
frame: OpenAILLMContextFrame
|
||||
|
||||
# Context type
|
||||
context: AWSNovaSonicLLMContext
|
||||
# or
|
||||
context: OpenAILLMContext
|
||||
|
||||
# Reading messages from context
|
||||
messages = context.messages
|
||||
|
||||
## AFTER:
|
||||
|
||||
# Context aggregator type
|
||||
context_aggregator: LLMContextAggregatorPair
|
||||
|
||||
# Context frame type
|
||||
frame: LLMContextFrame
|
||||
|
||||
# Context type
|
||||
context: LLMContext
|
||||
|
||||
# Reading messages from context
|
||||
messages = context.get_messages()
|
||||
```
|
||||
|
||||
- Added support for `bulbul:v3` model in `SarvamTTSService` and
|
||||
`SarvamHttpTTSService`.
|
||||
|
||||
- Added `keyterms_prompt` parameter to `AssemblyAIConnectionParams`.
|
||||
|
||||
- Added `speech_model` parameter to `AssemblyAIConnectionParams` to access the
|
||||
multilingual model.
|
||||
|
||||
- Added support for trickle ICE to the `SmallWebRTCTransport`.
|
||||
|
||||
- Added support for updating `OpenAITTSService` settings (`instructions` and
|
||||
`speed`) at runtime via `TTSUpdateSettingsFrame`.
|
||||
|
||||
- Added `--whatsapp` flag to runner to better surface WhatsApp transport logs.
|
||||
|
||||
- Added `on_connected` and `on_disconnected` events to TTS and STT
|
||||
websocket-based services.
|
||||
|
||||
- Added an `aggregate_sentences` arg in `ElevenLabsHttpTTSService`, where the
|
||||
default value is True.
|
||||
|
||||
- Added a `room_properties` arg to the Daily runner's `configure()` method,
|
||||
allowing `DailyRoomProperties` to be provided.
|
||||
|
||||
- The runner `--folder` argument now supports downloading files from
|
||||
subdirectories.
|
||||
|
||||
### Changed
|
||||
|
||||
- `CartesiaSTTService` now inherits from `WebsocketSTTService`.
|
||||
|
||||
- Package upgrades:
|
||||
|
||||
- `daily-python` upgraded to 0.20.0.
|
||||
- `openai` upgraded to support up to 2.x.x.
|
||||
- `openpipe` upgraded to support up to 5.x.x.
|
||||
|
||||
- `SpeechmaticsSTTService` updated dependencies for `speechmatics-rt>=0.5.0`.
|
||||
|
||||
### Deprecated
|
||||
|
||||
- The `send_transcription_frames` argument to `AWSNovaSonicLLMService` is
|
||||
deprecated. Transcription frames are now always sent. They go upstream, to be
|
||||
handled by the user context aggregator. See "Added" section for details.
|
||||
|
||||
- Types in `pipecat.services.aws.nova_sonic.context` have been deprecated due
|
||||
to changes to support `LLMContext`. See "Changed" section for details.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue in `RivaSegmentedSTTService` where a runtime error occurred due
|
||||
to a mismatch in the \_handle_transcription method's signature.
|
||||
|
||||
- Fixed multiple pipeline task cancellation issues. `asyncio.CancelledError` is
|
||||
now handled properly in `PipelineTask` making it possible to cancel an asyncio
|
||||
task that it's executing a `PipelineRunner` cleanly. Also,
|
||||
`PipelineTask.cancel()` does not block anymore waiting for the `CancelFrame`
|
||||
to reach the end of the pipeline (going back to the behavior in < 0.0.83).
|
||||
|
||||
- Fixed an issue in `ElevenLabsTTSService` and `ElevenLabsHttpTTSService` where
|
||||
the Flash models would split words, resulting in a space being inserted
|
||||
between words.
|
||||
|
||||
- Fixed an issue where audio filters' `stop()` would not be called when using
|
||||
`CancelFrame`.
|
||||
|
||||
- Fixed an issue in `ElevenLabsHttpTTSService`, where
|
||||
`apply_text_normalization` was incorrectly set as a query parameter. It's now
|
||||
being added as a request parameter.
|
||||
|
||||
- Fixed an issue where `RimeHttpTTSService` and `PiperTTSService` could generate
|
||||
incorrectly 16-bit aligned audio frames, potentially leading to internal
|
||||
errors or static audio.
|
||||
|
||||
- Fixed an issue in `SpeechmaticsSTTService` where `AdditionalVocabEntry` items
|
||||
needed to have `sounds_like` for the session to start.
|
||||
|
||||
### Other
|
||||
|
||||
- Added foundational example `47-sentry-metrics.py`, demonstrating how to use the
|
||||
`SentryMetrics` processor.
|
||||
|
||||
- Added foundational example `14x-function-calling-openpipe.py`.
|
||||
|
||||
## [0.0.90] - 2025-10-10
|
||||
|
||||
### Added
|
||||
|
||||
- Added audio filter `KrispVivaFilter` using the Krisp VIVA SDK.
|
||||
|
||||
- Added `--folder` argument to the runner, allowing files saved in that folder
|
||||
to be downloaded from `http://HOST:PORT/file/FILE`.
|
||||
|
||||
- Added `GeminiLiveVertexLLMService`, for accessing Gemini Live via Google
|
||||
Vertex AI.
|
||||
|
||||
- Added some new configuration options to `GeminiLiveLLMService`:
|
||||
|
||||
- `thinking`
|
||||
- `enable_affective_dialog`
|
||||
- `proactivity`
|
||||
|
||||
Note that these new configuration options require using a newer model than
|
||||
the default, like "gemini-2.5-flash-native-audio-preview-09-2025". The last
|
||||
two require specifying `http_options=HttpOptions(api_version="v1alpha")`.
|
||||
|
||||
- Added `on_pipeline_error` event to `PipelineTask`. This event will get fired
|
||||
when an `ErrorFrame` is pushed (use `FrameProcessor.push_error()`).
|
||||
|
||||
```python
|
||||
@task.event_handler("on_pipeline_error")
|
||||
async def on_pipeline_error(task: PipelineTask, frame: ErrorFrame):
|
||||
...
|
||||
```
|
||||
|
||||
- Added a `service_tier` `InputParam` to the `BaseOpenAILLMService`. This
|
||||
parameter can influence the latency of the response. For example `"priority"`
|
||||
will result in faster completions, but in exchange for a higher price.
|
||||
|
||||
### Changed
|
||||
|
||||
- Updated `GeminiLiveLLMService` to use the `google-genai` library rather than
|
||||
use WebSockets directly.
|
||||
|
||||
### Deprecated
|
||||
|
||||
- `LivekitFrameSerializer` is now deprecated. Use `LiveKitTransport` instead.
|
||||
|
||||
- `pipecat.service.openai_realtime` is now deprecated, use
|
||||
`pipecat.services.openai.realtime` instead or
|
||||
`pipecat.services.azure.realtime` for Azure Realtime.
|
||||
|
||||
- `pipecat.service.aws_nova_sonic` is now deprecated, use
|
||||
`pipecat.services.aws.nova_sonic` instead.
|
||||
|
||||
- `GeminiMultimodalLiveLLMService` is now deprecated, use
|
||||
`GeminiLiveLLMService`.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed a `GoogleVertexLLMService` issue that would generate an error if no
|
||||
token information was returned.
|
||||
|
||||
- `GeminiLiveLLMService` will now end gracefully (i.e. after the bot has
|
||||
finished) upon receiving an `EndFrame`.
|
||||
|
||||
- `GeminiLiveLLMService` will try to seamlessly reconnect when it loses its
|
||||
connection.
|
||||
|
||||
## [0.0.89] - 2025-10-07
|
||||
|
||||
### Fixed
|
||||
|
||||
- Reverted a change introduced in 0.0.88 that was causing pipelines to be frozen
|
||||
when using interruption strategies and processors that block interruption
|
||||
frames (e.g. `STTMuteFilter`).
|
||||
|
||||
## [0.0.88] - 2025-10-07
|
||||
|
||||
### Added
|
||||
|
||||
- Added support for Nano Banana models to `GoogleLLMService`. For example, you
|
||||
can now use the `gemini-2.5-flash-image` model to generate images.
|
||||
|
||||
- Added `HumeTTSService` for text-to-speech synthesis using Hume AI's expressive
|
||||
voice models. Provides high-quality, emotionally expressive speech synthesis
|
||||
with support for various voice models. Includes example in
|
||||
`examples/foundational/07ad-interruptible-hume.py`. Use with:
|
||||
`uv pip install pipecat-ai[hume]`.
|
||||
|
||||
### Changed
|
||||
|
||||
- Updated default `GoogleLLMService` model to `gemini-2.5-flash`.
|
||||
|
||||
### Deprecated
|
||||
|
||||
- PlayHT is shutting down their API on December 31st, 2025. As a result,
|
||||
`PlayHTTTSService` and `PlayHTHttpTTSService` are deprecated and will be
|
||||
removed in a future version.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue with `AWSNovaSonicLLMService` where the client wouldn't
|
||||
connect due to a breaking change in the AWS dependency chain.
|
||||
|
||||
- `PermissionError` is now caught if NLTK's `punkt_tab` can't be downloaded.
|
||||
|
||||
- Fixed an issue that would cause wrong user/assistant context ordering when
|
||||
using interruption strategies.
|
||||
|
||||
- Fixed RTVI incoming message handling, broken in 0.0.87.
|
||||
|
||||
## [0.0.87] - 2025-10-02
|
||||
|
||||
### Added
|
||||
|
||||
- Added `WebsocketSTTService` base class for websocket-based STT services.
|
||||
Combines STT functionality with websocket connectivity, providing automatic
|
||||
error handling and reconnection capabilities with exponential backoff.
|
||||
|
||||
- Added `DeepgramFluxSTTService` for real-time speech recognition using
|
||||
Deepgram's Flux WebSocket API. Flux understands conversational flow and
|
||||
automatically handles turn-taking.
|
||||
|
||||
- Added RTVI messages for user/bot audio levels and system logs.
|
||||
|
||||
- Include OpenAI-based LLM services cached tokens to `MetricsFrame`.
|
||||
@@ -20,6 +279,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
### Deprecated
|
||||
|
||||
- `DailyTransportMessageFrame` and `DailyTransportMessageUrgentFrame` are
|
||||
deprecated, use `DailyOutputTransportMessageFrame` and
|
||||
`DailyOutputTransportMessageUrgentFrame` respectively instead.
|
||||
|
||||
- `LiveKitTransportMessageFrame` and `LiveKitTransportMessageUrgentFrame` are
|
||||
deprecated, use `LiveKitOutputTransportMessageFrame` and
|
||||
`LiveKitOutputTransportMessageUrgentFrame` respectively instead.
|
||||
|
||||
- `TransportMessageFrame` and `TransportMessageUrgentFrame` are deprecated, use
|
||||
`OutputTransportMessageFrame` and `OutputTransportMessageUrgentFrame`
|
||||
respectively instead.
|
||||
|
||||
- `InputTransportMessageUrgentFrame` is deprecated, use
|
||||
`InputTransportMessageFrame` instead.
|
||||
|
||||
- `DailyUpdateRemoteParticipantsFrame` is deprecated and will be removed in a
|
||||
future version. Instead, create your own custom frame and handle it in the
|
||||
`@transport.output().event_handler("on_after_push_frame")` event handler or a
|
||||
@@ -27,6 +301,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
## Fixed
|
||||
|
||||
- Fixed an issue in `AWSBedrockLLMService` where timeout exceptions weren't
|
||||
being detected.
|
||||
|
||||
- Fixed a `PipelineTask` issue that could prevent the application to exit if
|
||||
`task.cancel()` was called when the task was already finished.
|
||||
|
||||
@@ -1353,7 +1630,7 @@ quality and critical bugs impacting `ParallelPipelines` functionality.**
|
||||
- Added `session_token` parameter to `AWSNovaSonicLLMService`.
|
||||
|
||||
- Added Gemini Multimodal Live File API for uploading, fetching, listing, and
|
||||
deleting files. See `26f-gemini-multimodal-live-files-api.py` for example usage.
|
||||
deleting files. See `26f-gemini-live-files-api.py` for example usage.
|
||||
|
||||
### Changed
|
||||
|
||||
@@ -3359,7 +3636,7 @@ stt = DeepgramSTTService(..., live_options=LiveOptions(model="nova-2-general"))
|
||||
- Added the new modalities option and helper function to set Gemini output
|
||||
modalities.
|
||||
|
||||
- Added `examples/foundational/26d-gemini-multimodal-live-text.py` which is
|
||||
- Added `examples/foundational/26d-gemini-live-text.py` which is
|
||||
using Gemini as TEXT modality and using another TTS provider for TTS process.
|
||||
|
||||
### Changed
|
||||
@@ -3546,9 +3823,9 @@ stt = DeepgramSTTService(..., live_options=LiveOptions(model="nova-2-general"))
|
||||
- Added new foundational examples for `GeminiMultimodalLiveLLMService`:
|
||||
|
||||
- `26-gemini-multimodal-live.py`
|
||||
- `26a-gemini-multimodal-live-transcription.py`
|
||||
- `26b-gemini-multimodal-live-video.py`
|
||||
- `26c-gemini-multimodal-live-video.py`
|
||||
- `26a-gemini-live-transcription.py`
|
||||
- `26b-gemini-live-video.py`
|
||||
- `26c-gemini-live-video.py`
|
||||
|
||||
- Added `SimliVideoService`. This is an integration for Simli AI avatars.
|
||||
(see https://www.simli.com)
|
||||
|
||||
336
COMMUNITY_INTEGRATIONS.md
Normal file
336
COMMUNITY_INTEGRATIONS.md
Normal file
@@ -0,0 +1,336 @@
|
||||
# Community Integrations Guide
|
||||
|
||||
Pipecat welcomes community-maintained integrations! As our ecosystem grows, we've established a process for any developer to create and maintain their own service integrations while ensuring discoverability for the Pipecat community.
|
||||
|
||||
## Overview
|
||||
|
||||
**What we support:** Community-maintained integrations that live in separate repositories and are maintained by their authors.
|
||||
|
||||
**What we don't do:** The Pipecat team does not code review, test, or maintain community integrations. We provide guidance and list approved integrations for discoverability.
|
||||
|
||||
**Why this approach:** This allows the community to move quickly while keeping the Pipecat core team focused on maintaining the framework itself.
|
||||
|
||||
## Submitting your Integration
|
||||
|
||||
To be listed as an official community integration, follow these steps:
|
||||
|
||||
### Step 1: Build Your Integration
|
||||
|
||||
Create your integration following the patterns and examples shown in the "Integration Patterns and Examples" section below.
|
||||
|
||||
### Step 2: Set Up Your Repository
|
||||
|
||||
Your repository must contain these components:
|
||||
|
||||
- **Source code** - Complete implementation following Pipecat patterns
|
||||
- **Foundational example** - Single file example showing basic usage (see [Pipecat examples](https://github.com/pipecat-ai/pipecat/tree/main/examples/foundational))
|
||||
- **README.md** - Must include:
|
||||
|
||||
- Introduction and explanation of your integration
|
||||
- Installation instructions
|
||||
- Usage instructions with Pipecat Pipeline
|
||||
- How to run your example
|
||||
- Pipecat version compatibility (e.g., "Tested with Pipecat v0.0.86")
|
||||
- Company attribution: If you work for the company providing the service, please mention this in your README. This helps build confidence that the integration will be actively maintained.
|
||||
|
||||
- **LICENSE** - Permissive license (BSD-2 like Pipecat, or equivalent open source terms)
|
||||
- **Code documentation** - Source code with docstrings (we recommend following [Pipecat's docstring conventions](https://github.com/pipecat-ai/pipecat/blob/main/CONTRIBUTING.md#docstring-conventions))
|
||||
- **Changelog** - Maintain a changelog for version updates
|
||||
|
||||
### Step 3: Join Discord
|
||||
|
||||
Join our Discord: https://discord.gg/pipecat
|
||||
|
||||
### Step 4: Submit for Listing
|
||||
|
||||
Submit a pull request to add your integration to our [Community Integrations documentation page](https://docs.pipecat.ai/server/services/community-integrations).
|
||||
|
||||
**To submit:**
|
||||
|
||||
1. Fork the [Pipecat docs repository](https://github.com/pipecat-ai/docs)
|
||||
2. Edit the file `server/services/community-integrations.mdx`
|
||||
3. Add your integration to the appropriate service category table with:
|
||||
- Service name
|
||||
- Link to your repository
|
||||
- Maintainer GitHub username(s)
|
||||
4. Include a link to your demo video (approx 30-60 seconds) in your PR description showing:
|
||||
- Core functionality of your integration
|
||||
- Handling of an interruption (if applicable to service type)
|
||||
5. Submit your pull request
|
||||
|
||||
Once your PR is submitted, post in the `#community-integrations` Discord channel to let us know.
|
||||
|
||||
## Integration Patterns and Examples
|
||||
|
||||
### STT (Speech-to-Text) Services
|
||||
|
||||
#### Websocket-based Services
|
||||
|
||||
**Base class:** `STTService`
|
||||
|
||||
**Examples:**
|
||||
|
||||
- [DeepgramSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/deepgram/stt.py)
|
||||
- [SpeechmaticsSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/speechmatics/stt.py)
|
||||
|
||||
#### File-based Services
|
||||
|
||||
**Base class:** `SegmentedSTTService`
|
||||
|
||||
**Examples:**
|
||||
|
||||
- [RivaSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/riva/stt.py)
|
||||
- [FalSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/fal/stt.py)
|
||||
|
||||
#### Key requirements:
|
||||
|
||||
- STT services should push `InterimTranscriptionFrames` and `TranscriptionFrames`
|
||||
- If confidence values are available, filter for values >50% confidence
|
||||
|
||||
### LLM (Large Language Model) Services
|
||||
|
||||
#### OpenAI-Compatible Services
|
||||
|
||||
**Base class:** `OpenAILLMService`
|
||||
|
||||
**Examples:**
|
||||
|
||||
- [AzureLLMService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/azure/llm.py)
|
||||
- [GrokLLMService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/grok/llm.py) - Shows overriding the base class where needed
|
||||
|
||||
#### Non-OpenAI Compatible Services
|
||||
|
||||
**Requires:** Full implementation
|
||||
|
||||
**Examples:**
|
||||
|
||||
- [AnthropicLLMService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/anthropic/llm.py)
|
||||
- [GoogleLLMService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/google/llm.py)
|
||||
|
||||
#### Key requirements:
|
||||
|
||||
- **Frame sequence:** Output must follow this frame sequence pattern:
|
||||
|
||||
- `LLMFullResponseStartFrame` - Signals the start of an LLM response
|
||||
- `LLMTextFrame` - Contains LLM content, typically streamed as tokens
|
||||
- `LLMFullResponseEndFrame` - Signals the end of an LLM response
|
||||
|
||||
- **Context aggregation:** Implement context aggregation to collect user and assistant content:
|
||||
- Aggregators come in pairs with a `user()` instance and `assistant()` instance
|
||||
- Context must adhere to the `LLMContext` universal format
|
||||
- Aggregators should handle adding messages, function calls, and images to the context
|
||||
|
||||
### TTS (Text-to-Speech) Services
|
||||
|
||||
#### AudioContextWordTTSService
|
||||
|
||||
**Use for:** Websocket-based services supporting word/timestamp alignment
|
||||
|
||||
**Example:**
|
||||
|
||||
- [CartesiaTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/cartesia/tts.py)
|
||||
|
||||
#### InterruptibleTTSService
|
||||
|
||||
**Use for:** Websocket-based services without word/timestamp alignment, requiring disconnection on interruption
|
||||
|
||||
**Example:**
|
||||
|
||||
- [SarvamTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/sarvam/tts.py)
|
||||
|
||||
#### WordTTSService
|
||||
|
||||
**Use for:** HTTP-based services supporting word/timestamp alignment
|
||||
|
||||
**Example:**
|
||||
|
||||
- [ElevenLabsHttpTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/elevenlabs/tts.py)
|
||||
|
||||
#### TTSService
|
||||
|
||||
**Use for:** HTTP-based services without word/timestamp alignment
|
||||
|
||||
**Example:**
|
||||
|
||||
- [GoogleHttpTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/google/tts.py)
|
||||
|
||||
#### Key requirements:
|
||||
|
||||
- For websocket services, use asyncio WebSocket implementation (required for v13+ support)
|
||||
- Handle idle service timeouts with keepalives
|
||||
- TTSServices push both audio (`TTSRawAudioFrame`) and text (`TTSTextFrame`) frames
|
||||
|
||||
### Telephony Serializers
|
||||
|
||||
Pipecat supports telephony provider integration using websocket connections to exchange MediaStreams. These services use a FrameSerializer to serialize and deserialize inputs from the FastAPIWebsocketTransport.
|
||||
|
||||
**Examples:**
|
||||
|
||||
- [Twilio](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/serializers/twilio.py)
|
||||
- [Telnyx](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/serializers/telnyx.py)
|
||||
|
||||
#### Key requirements:
|
||||
|
||||
- Include hang-up functionality using the provider's native API, ideally using `aiohttp`
|
||||
- Support DTMF (dual-tone multi-frequency) events if the provider supports them:
|
||||
- Deserialize DTMF events from the provider's protocol to `InputDTMFFrame`
|
||||
- Use `KeypadEntry` enum for valid keypad entries (0-9, \*, #, A-D)
|
||||
- Handle invalid DTMF digits gracefully by returning `None`
|
||||
|
||||
### Image Generation Services
|
||||
|
||||
**Base class:** `ImageGenService`
|
||||
|
||||
**Examples:**
|
||||
|
||||
- [FalImageGenService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/fal/image.py)
|
||||
- [GoogleImageGenService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/google/image.py)
|
||||
|
||||
#### Key requirements:
|
||||
|
||||
- Must implement `run_image_gen` method returning an `AsyncGenerator`
|
||||
|
||||
### Vision Services
|
||||
|
||||
Vision services process images and provide analysis such as descriptions, object detection, or visual question answering.
|
||||
|
||||
**Base class:** `VisionService`
|
||||
|
||||
**Example:**
|
||||
|
||||
- [MoondreamVisionService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/moondream/vision.py)
|
||||
|
||||
#### Key requirements:
|
||||
|
||||
- Must implement `run_vision` method that takes an `LLMContext` and returns an `AsyncGenerator[Frame, None]`
|
||||
- The method processes the latest image in the context and yields frames with analysis results
|
||||
- Typically yields `TextFrame` objects containing descriptions or answers
|
||||
|
||||
## Implementation Guidelines
|
||||
|
||||
### Naming Conventions
|
||||
|
||||
- **STT:** `VendorSTTService`
|
||||
- **LLM:** `VendorLLMService`
|
||||
- **TTS:**
|
||||
- Websocket: `VendorTTSService`
|
||||
- HTTP: `VendorHttpTTSService`
|
||||
- **Image:** `VendorImageGenService`
|
||||
- **Vision:** `VendorVisionService`
|
||||
- **Telephony:** `VendorFrameSerializer`
|
||||
|
||||
### Metrics Support
|
||||
|
||||
Enable metrics in your service:
|
||||
|
||||
```python
|
||||
def can_generate_metrics(self) -> bool:
|
||||
"""Check if this service can generate processing metrics.
|
||||
|
||||
Returns:
|
||||
True, as this service supports metrics.
|
||||
"""
|
||||
return True
|
||||
```
|
||||
|
||||
### Dynamic Settings Updates
|
||||
|
||||
STT, LLM, and TTS services support `ServiceUpdateSettingsFrame` for dynamic configuration changes. The base STTService has an `_update_settings()` method that handles settings, and the private `_settings` `Dict` is used to store settings and provide access to the subclass.
|
||||
|
||||
```python
|
||||
async def set_language(self, language: Language):
|
||||
"""Set the recognition language and reconnect.
|
||||
|
||||
Args:
|
||||
language: The language to use for speech recognition.
|
||||
"""
|
||||
logger.info(f"Switching STT language to: [{language}]")
|
||||
self._settings["language"] = language
|
||||
await self._disconnect()
|
||||
await self._connect()
|
||||
```
|
||||
|
||||
Note that, in this example, Deepgram requires the websocket connection be disconnected and reconnected to reinitialize the service with the new value. Consider if your service requires reconnection.
|
||||
|
||||
### Sample Rate Handling
|
||||
|
||||
Sample rates are set via PipelineParams and passed to each frame processor at initialization. The pattern is to _not_ set the sample rate value in the constructor of a given service. Instead, use the `start()` method to initialize sample rates from the frame:
|
||||
|
||||
```python
|
||||
async def start(self, frame: StartFrame):
|
||||
"""Start the service."""
|
||||
await super().start(frame)
|
||||
self._settings["output_format"]["sample_rate"] = self.sample_rate
|
||||
await self._connect()
|
||||
```
|
||||
|
||||
Note that `self.sample_rate` is a `@property` set in the TTSService base class, which provides access to the private sample rate value obtained from the StartFrame.
|
||||
|
||||
### Tracing Decorators
|
||||
|
||||
Use Pipecat's tracing decorators:
|
||||
|
||||
- **STT:** `@traced_stt` - decorate a function that handles `transcript`, `is_final`, `language` as args
|
||||
- **LLM:** `@traced_llm` - decorate the `_process_context()` method
|
||||
- **TTS:** `@traced_tts` - decorate the `run_tts()` method
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Packaging and Distribution
|
||||
|
||||
- Use [uv](https://docs.astral.sh/uv/) for packaging (encouraged)
|
||||
- Consider releasing to PyPI for easier installation
|
||||
- Follow semantic versioning principles
|
||||
- Maintain a changelog
|
||||
|
||||
### HTTP Communication
|
||||
|
||||
For REST-based communication, use aiohttp. Pipecat includes this as a required dependency, so using it prevents adding an additional dependency to your integration.
|
||||
|
||||
### Error Handling
|
||||
|
||||
- Wrap API calls in appropriate try/catch blocks
|
||||
- Handle rate limits and network failures gracefully
|
||||
- Provide meaningful error messages
|
||||
- When errors occur, raise exceptions AND push `ErrorFrame`s to notify the pipeline:
|
||||
|
||||
```python
|
||||
from pipecat.frames.frames import ErrorFrame
|
||||
|
||||
try:
|
||||
# Your API call
|
||||
result = await self._make_api_call()
|
||||
except Exception as e:
|
||||
# Push error frame to pipeline
|
||||
await self.push_error(ErrorFrame(error=f"{self} error: {e}"))
|
||||
# Raise or handle as appropriate
|
||||
raise
|
||||
```
|
||||
|
||||
### Testing
|
||||
|
||||
- Your foundational example serves as a valuable integration-level test
|
||||
- Unit tests are nice to have. As the Pipecat teams provides better guidance, we will encourage unit testing more
|
||||
|
||||
## Disclaimer
|
||||
|
||||
Community integrations are community-maintained and not officially supported by the Pipecat team. Users should evaluate these integrations independently. The Pipecat team reserves the right to remove listings that become unmaintained or problematic.
|
||||
|
||||
## Staying Up to Date
|
||||
|
||||
Pipecat evolves rapidly to support the latest AI technologies and patterns. While we strive to minimize breaking changes, they do occur as the framework matures.
|
||||
|
||||
**We strongly recommend:**
|
||||
|
||||
- Join our Discord at https://discord.gg/pipecat and monitor the `#announcements` channel for release notifications
|
||||
- Follow our changelog: https://github.com/pipecat-ai/pipecat/blob/main/CHANGELOG.md
|
||||
- Test your integration against new Pipecat releases promptly
|
||||
- Update your README with the last tested Pipecat version
|
||||
|
||||
This helps ensure your integration remains compatible and your users have clear expectations about version support.
|
||||
|
||||
## Questions?
|
||||
|
||||
Join our Discord community at https://discord.gg/pipecat and post in the `#community-integrations` channel for guidance and support.
|
||||
|
||||
For additional questions, you can also reach out to us at pipecat-ai@daily.co.
|
||||
@@ -1,5 +1,9 @@
|
||||
## Contributing to Pipecat
|
||||
|
||||
**Want to add a new service integration?**
|
||||
We encourage community-maintained integrations! Please see our [Community Integration Guide](COMMUNITY_INTEGRATIONS.md) for the process and requirements.
|
||||
|
||||
**Want to contribute to Pipecat core?**
|
||||
We welcome contributions of all kinds! Your help is appreciated. Follow these steps to get involved:
|
||||
|
||||
1. **Fork this repository**: Start by forking the Pipecat Documentation repository to your GitHub account.
|
||||
|
||||
143
README.md
143
README.md
@@ -3,6 +3,7 @@
|
||||
</div></h1>
|
||||
|
||||
[](https://pypi.org/project/pipecat-ai)  [](https://codecov.io/gh/pipecat-ai/pipecat) [](https://docs.pipecat.ai) [](https://discord.gg/pipecat) [](https://deepwiki.com/pipecat-ai/pipecat)
|
||||
[](https://getmanta.ai/pipecat)
|
||||
|
||||
# 🎙️ Pipecat: Real-Time Voice & Multimodal AI Agents
|
||||
|
||||
@@ -19,10 +20,6 @@
|
||||
- **Business Agents** – customer intake, support bots, guided flows
|
||||
- **Complex Dialog Systems** – design logic with structured conversations
|
||||
|
||||
🧭 Looking to build structured conversations? Check out [Pipecat Flows](https://github.com/pipecat-ai/pipecat-flows) for managing complex conversational states and transitions.
|
||||
|
||||
🔍 Looking for help debugging your pipeline and processors? Check out [Whisker](https://github.com/pipecat-ai/whisker), a real-time Pipecat debugger.
|
||||
|
||||
## 🧠 Why Pipecat?
|
||||
|
||||
- **Voice-first**: Integrates speech recognition, text-to-speech, and conversation handling
|
||||
@@ -30,40 +27,38 @@
|
||||
- **Composable Pipelines**: Build complex behavior from modular components
|
||||
- **Real-Time**: Ultra-low latency interaction with different transports (e.g. WebSockets or WebRTC)
|
||||
|
||||
## 📱 Client SDKs
|
||||
## 🌐 Pipecat Ecosystem
|
||||
|
||||
You can connect to Pipecat from any platform using our official SDKs:
|
||||
### 📱 Client SDKs
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<td>
|
||||
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/javascript/javascript-original.svg" width="40" height="40" alt="JavaScript"/>
|
||||
<a href="https://docs.pipecat.ai/client/js/introduction">JavaScript</a>
|
||||
</td>
|
||||
<td>
|
||||
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/react/react-original.svg" width="40" height="40" alt="React"/>
|
||||
<a href="https://docs.pipecat.ai/client/react/introduction">React</a>
|
||||
</td>
|
||||
<td>
|
||||
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/react/react-original.svg" width="40" height="40" alt="React Native"/>
|
||||
<a href="https://docs.pipecat.ai/client/react-native/introduction">React Native</a>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/swift/swift-original.svg" width="40" height="40" alt="Swift"/>
|
||||
<a href="https://docs.pipecat.ai/client/ios/introduction">Swift</a>
|
||||
</td>
|
||||
<td>
|
||||
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/kotlin/kotlin-original.svg" width="40" height="40" alt="Kotlin"/>
|
||||
<a href="https://docs.pipecat.ai/client/android/introduction">Kotlin</a>
|
||||
</td>
|
||||
<td>
|
||||
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/cplusplus/cplusplus-original.svg" width="40" height="40" alt="JavaScript"/>
|
||||
<a href="https://docs.pipecat.ai/client/c++/introduction">C++</a>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
Building client applications? You can connect to Pipecat from any platform using our official SDKs:
|
||||
|
||||
<a href="https://docs.pipecat.ai/client/js/introduction">JavaScript</a> | <a href="https://docs.pipecat.ai/client/react/introduction">React</a> | <a href="https://docs.pipecat.ai/client/react-native/introduction">React Native</a> |
|
||||
<a href="https://docs.pipecat.ai/client/ios/introduction">Swift</a> | <a href="https://docs.pipecat.ai/client/android/introduction">Kotlin</a> | <a href="https://docs.pipecat.ai/client/c++/introduction">C++</a> | <a href="https://github.com/pipecat-ai/pipecat-esp32">ESP32</a>
|
||||
|
||||
### 🧭 Structured conversations
|
||||
|
||||
Looking to build structured conversations? Check out [Pipecat Flows](https://github.com/pipecat-ai/pipecat-flows) for managing complex conversational states and transitions.
|
||||
|
||||
### 🪄 Beautiful UIs
|
||||
|
||||
Want to build beautiful and engaging experiences? Checkout the [Voice UI Kit](https://github.com/pipecat-ai/voice-ui-kit), a collection of components, hooks and templates for building voice AI applications quickly.
|
||||
|
||||
### 🛠️ CLI
|
||||
|
||||
Create a new project in under a minute with the [Pipecat CLI](https://github.com/pipecat-ai/pipecat-cli). Then use the CLI to monitor and deploy your agent to production.
|
||||
|
||||
### 🔍 Debugging
|
||||
|
||||
Looking for help debugging your pipeline and processors? Check out [Whisker](https://github.com/pipecat-ai/whisker), a real-time Pipecat debugger.
|
||||
|
||||
### 🖥️ Terminal
|
||||
|
||||
Love terminal applications? Check out [Tail](https://github.com/pipecat-ai/tail), a terminal dashboard for Pipecat.
|
||||
|
||||
### 📺️ Pipecat TV Channel
|
||||
|
||||
Catch new features, interviews, and how-tos on our [Pipecat TV](https://www.youtube.com/playlist?list=PLzU2zoMTQIHjqC3v4q2XVSR3hGSzwKFwH) channel.
|
||||
|
||||
## 🎬 See it in action
|
||||
|
||||
@@ -72,24 +67,24 @@ You can connect to Pipecat from any platform using our official SDKs:
|
||||
<a href="https://github.com/pipecat-ai/pipecat-examples/tree/main/storytelling-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat-examples/main/storytelling-chatbot/image.png" width="400" /></a>
|
||||
<br/>
|
||||
<a href="https://github.com/pipecat-ai/pipecat-examples/tree/main/translation-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat-examples/main/translation-chatbot/image.png" width="400" /></a>
|
||||
<a href="https://github.com/pipecat-ai/pipecat-examples/tree/main/moondream-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat-examples/main/moondream-chatbot/image.png" width="400" /></a>
|
||||
<a href="https://github.com/pipecat-ai/pipecat/blob/main/examples/foundational/12-describe-video.py"><img src="https://github.com/pipecat-ai/pipecat/blob/main/examples/foundational/assets/moondream.png" width="400" /></a>
|
||||
</p>
|
||||
|
||||
## 🧩 Available services
|
||||
|
||||
| Category | Services |
|
||||
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
|
||||
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) |
|
||||
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
|
||||
| Serializers | [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx) |
|
||||
| Video | [HeyGen](https://docs.pipecat.ai/server/services/video/heygen), [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) |
|
||||
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) |
|
||||
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/fal), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) |
|
||||
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [ai-coustics](https://docs.pipecat.ai/server/utilities/audio/aic-filter) |
|
||||
| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) |
|
||||
| Category | Services |
|
||||
| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
|
||||
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) |
|
||||
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
|
||||
| Serializers | [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx) |
|
||||
| Video | [HeyGen](https://docs.pipecat.ai/server/services/video/heygen), [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) |
|
||||
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) |
|
||||
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/fal), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) |
|
||||
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [ai-coustics](https://docs.pipecat.ai/server/utilities/audio/aic-filter) |
|
||||
| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) |
|
||||
|
||||
📚 [View full services documentation →](https://docs.pipecat.ai/server/services/supported-services)
|
||||
|
||||
@@ -184,54 +179,6 @@ Run a specific test suite:
|
||||
uv run pytest tests/test_name.py
|
||||
```
|
||||
|
||||
### Setting up your editor
|
||||
|
||||
This project uses strict [PEP 8](https://peps.python.org/pep-0008/) formatting via [Ruff](https://github.com/astral-sh/ruff).
|
||||
|
||||
#### Emacs
|
||||
|
||||
You can use [use-package](https://github.com/jwiegley/use-package) to install [emacs-lazy-ruff](https://github.com/christophermadsen/emacs-lazy-ruff) package and configure `ruff` arguments:
|
||||
|
||||
```elisp
|
||||
(use-package lazy-ruff
|
||||
:ensure t
|
||||
:hook ((python-mode . lazy-ruff-mode))
|
||||
:config
|
||||
(setq lazy-ruff-format-command "ruff format")
|
||||
(setq lazy-ruff-check-command "ruff check --select I"))
|
||||
```
|
||||
|
||||
`ruff` was installed in the `venv` environment described before, so you should be able to use [pyvenv-auto](https://github.com/ryotaro612/pyvenv-auto) to automatically load that environment inside Emacs.
|
||||
|
||||
```elisp
|
||||
(use-package pyvenv-auto
|
||||
:ensure t
|
||||
:defer t
|
||||
:hook ((python-mode . pyvenv-auto-run)))
|
||||
```
|
||||
|
||||
#### Visual Studio Code
|
||||
|
||||
Install the
|
||||
[Ruff](https://marketplace.visualstudio.com/items?itemName=charliermarsh.ruff) extension. Then edit the user settings (_Ctrl-Shift-P_ `Open User Settings (JSON)`) and set it as the default Python formatter, and enable formatting on save:
|
||||
|
||||
```json
|
||||
"[python]": {
|
||||
"editor.defaultFormatter": "charliermarsh.ruff",
|
||||
"editor.formatOnSave": true
|
||||
}
|
||||
```
|
||||
|
||||
#### PyCharm
|
||||
|
||||
`ruff` was installed in the `venv` environment described before, now to enable autoformatting on save, go to `File` -> `Settings` -> `Tools` -> `File Watchers` and add a new watcher with the following settings:
|
||||
|
||||
1. **Name**: `Ruff formatter`
|
||||
2. **File type**: `Python`
|
||||
3. **Working directory**: `$ContentRoot$`
|
||||
4. **Arguments**: `format $FilePath$`
|
||||
5. **Program**: `$PyInterpreterDirectory$/ruff`
|
||||
|
||||
## 🤝 Contributing
|
||||
|
||||
We welcome contributions from the community! Whether you're fixing bugs, improving documentation, or adding new features, here's how you can help:
|
||||
|
||||
5
SECURITY.md
Normal file
5
SECURITY.md
Normal file
@@ -0,0 +1,5 @@
|
||||
# Security Policy
|
||||
|
||||
## Reporting a Vulnerability
|
||||
|
||||
Please email `disclosures@daily.co`.
|
||||
@@ -50,6 +50,7 @@ autodoc_mock_imports = [
|
||||
# Krisp - has build issues on some platforms
|
||||
"pipecat_ai_krisp",
|
||||
"krisp",
|
||||
"krisp_audio",
|
||||
# System-specific GUI libraries
|
||||
"_tkinter",
|
||||
"tkinter",
|
||||
|
||||
12
env.example
12
env.example
@@ -58,6 +58,9 @@ GOOGLE_CLOUD_PROJECT_ID=...
|
||||
GOOGLE_TEST_CREDENTIALS=...
|
||||
GOOGLE_VERTEX_TEST_CREDENTIALS=...
|
||||
|
||||
# Hume
|
||||
HUME_API_KEY=...
|
||||
|
||||
# LMNT
|
||||
LMNT_API_KEY=...
|
||||
LMNT_VOICE_ID=...
|
||||
@@ -87,6 +90,9 @@ SIMLI_FACE_ID=...
|
||||
# Krisp
|
||||
KRISP_MODEL_PATH=...
|
||||
|
||||
# Krisp Viva
|
||||
KRISP_VIVA_MODEL_PATH=...
|
||||
|
||||
# DeepSeek
|
||||
DEEPSEEK_API_KEY=...
|
||||
|
||||
@@ -155,3 +161,9 @@ NVIDIA_API_KEY=...
|
||||
|
||||
# Qwen
|
||||
QWEN_API_KEY=...
|
||||
|
||||
# WhatsApp
|
||||
WHATSAPP_TOKEN=
|
||||
WHATSAPP_WEBHOOK_VERIFICATION_TOKEN=
|
||||
WHATSAPP_PHONE_NUMBER_ID=
|
||||
WHATSAPP_APP_SECRET=
|
||||
@@ -25,7 +25,7 @@ from pipecat.processors.aggregators.llm_response_universal import LLMContextAggr
|
||||
from pipecat.runner.daily import configure
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.daily.transport import DailyLogLevel, DailyParams, DailyTransport
|
||||
from pipecat.transports.daily.transport import DailyParams, DailyTransport
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
@@ -49,7 +49,6 @@ async def main():
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
)
|
||||
transport.set_log_level(DailyLogLevel.Info)
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
|
||||
@@ -21,8 +21,8 @@ from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.stt import CartesiaSTTService
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
@@ -58,7 +58,7 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = CartesiaSTTService(api_key=os.getenv("CARTESIA_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
|
||||
138
examples/foundational/07ae-interruptible-hume.py
Normal file
138
examples/foundational/07ae-interruptible-hume.py
Normal file
@@ -0,0 +1,138 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frameworks.rtvi import RTVIConfig, RTVIObserver, RTVIProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.hume.tts import HUME_SAMPLE_RATE, HumeTTSService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = HumeTTSService(
|
||||
api_key=os.getenv("HUME_API_KEY"),
|
||||
# Replace with your Hume voice ID
|
||||
voice_id="f898a92e-685f-43fa-985b-a46920f0650b",
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
rtvi = RTVIProcessor(config=RTVIConfig(config=[]))
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
rtvi,
|
||||
stt,
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
audio_out_sample_rate=HUME_SAMPLE_RATE,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
observers=[RTVIObserver(rtvi)],
|
||||
)
|
||||
|
||||
@rtvi.event_handler("on_client_ready")
|
||||
async def on_client_ready(rtvi):
|
||||
await rtvi.set_bot_ready()
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
118
examples/foundational/07c-interruptible-deepgram-flux.py
Normal file
118
examples/foundational/07c-interruptible-deepgram-flux.py
Normal file
@@ -0,0 +1,118 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContext,
|
||||
LLMContextAggregatorPair,
|
||||
)
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.flux.stt import DeepgramFluxSTTService
|
||||
from pipecat.services.deepgram.tts import DeepgramTTSService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramFluxSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-2-andromeda-en")
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -23,7 +23,6 @@ from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.elevenlabs.stt import ElevenLabsSTTService
|
||||
from pipecat.services.elevenlabs.tts import ElevenLabsHttpTTSService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
|
||||
151
examples/foundational/07n-interruptible-gemini-image.py
Normal file
151
examples/foundational/07n-interruptible-gemini-image.py
Normal file
@@ -0,0 +1,151 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""
|
||||
A conversational AI bot using Gemini for both LLM, STT and TTS.
|
||||
|
||||
This example demonstrates how to use Gemini's image generation capabilities.
|
||||
|
||||
Features showcased:
|
||||
- Gemini LLM for conversation and image generation
|
||||
- Google TTS and STT
|
||||
|
||||
Run with:
|
||||
python examples/foundational/07n-interruptible-gemini-image.py
|
||||
|
||||
Make sure to set your environment variables:
|
||||
export GOOGLE_API_KEY=your_api_key_here
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.google.llm import GoogleLLMService
|
||||
from pipecat.services.google.stt import GoogleSTTService
|
||||
from pipecat.services.google.tts import GoogleTTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_out_enabled=True,
|
||||
video_out_width=1024,
|
||||
video_out_height=1024,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_out_enabled=True,
|
||||
video_out_width=1024,
|
||||
video_out_height=1024,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = GoogleSTTService(
|
||||
params=GoogleSTTService.InputParams(languages=Language.EN_US),
|
||||
credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
|
||||
)
|
||||
|
||||
tts = GoogleTTSService(
|
||||
voice_id="en-US-Chirp3-HD-Charon",
|
||||
params=GoogleTTSService.InputParams(language=Language.EN_US),
|
||||
credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
|
||||
)
|
||||
|
||||
llm = GoogleLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
model="gemini-2.5-flash-image",
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # Gemini TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation with a styled introduction
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
129
examples/foundational/07p-interruptible-krisp-viva.py
Normal file
129
examples/foundational/07p-interruptible-krisp-viva.py
Normal file
@@ -0,0 +1,129 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.filters.krisp_viva_filter import KrispVivaFilter
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.deepgram.tts import DeepgramTTSService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
audio_in_filter=KrispVivaFilter(),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
audio_in_filter=KrispVivaFilter(),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
audio_in_filter=KrispVivaFilter(),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-helios-en")
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -48,10 +48,7 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = CartesiaSTTService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
base_url=os.getenv("CARTESIA_BASE_URL"),
|
||||
)
|
||||
stt = CartesiaSTTService(api_key=os.getenv("CARTESIA_API_KEY"))
|
||||
|
||||
tl = TranscriptionLogger()
|
||||
|
||||
|
||||
@@ -76,9 +76,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
llm = GoogleVertexLLMService(
|
||||
credentials=os.getenv("GOOGLE_VERTEX_TEST_CREDENTIALS"),
|
||||
params=GoogleVertexLLMService.InputParams(
|
||||
project_id=os.getenv("GOOGLE_CLOUD_PROJECT_ID"),
|
||||
),
|
||||
project_id=os.getenv("GOOGLE_CLOUD_PROJECT_ID"),
|
||||
location=os.getenv("GOOGLE_CLOUD_LOCATION"),
|
||||
)
|
||||
# You can aslo register a function_name of None to get all functions
|
||||
# sent to the same callback with an additional function_name parameter.
|
||||
|
||||
182
examples/foundational/14x-function-calling-openpipe.py
Normal file
182
examples/foundational/14x-function-calling-openpipe.py
Normal file
@@ -0,0 +1,182 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
import time
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame, TTSSpeakFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.openpipe.llm import OpenPipeLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
async def fetch_weather_from_api(params: FunctionCallParams):
|
||||
await params.result_callback({"conditions": "nice", "temperature": "75"})
|
||||
|
||||
|
||||
async def fetch_restaurant_recommendation(params: FunctionCallParams):
|
||||
await params.result_callback({"name": "The Golden Dragon"})
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
timestamp = int(time.time())
|
||||
llm = OpenPipeLLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
openpipe_api_key=os.getenv("OPENPIPE_API_KEY"),
|
||||
tags={"conversation_id": f"pipecat-{timestamp}"},
|
||||
)
|
||||
|
||||
# You can also register a function_name of None to get all functions
|
||||
# sent to the same callback with an additional function_name parameter.
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
|
||||
|
||||
@llm.event_handler("on_function_calls_started")
|
||||
async def on_function_calls_started(service, function_calls):
|
||||
await tts.queue_frame(TTSSpeakFrame("Let me check on that."))
|
||||
|
||||
weather_function = FunctionSchema(
|
||||
name="get_current_weather",
|
||||
description="Get the current weather",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
"description": "The temperature unit to use. Infer this from the user's location.",
|
||||
},
|
||||
},
|
||||
required=["location", "format"],
|
||||
)
|
||||
restaurant_function = FunctionSchema(
|
||||
name="get_restaurant_recommendation",
|
||||
description="Get a restaurant recommendation",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
},
|
||||
required=["location"],
|
||||
)
|
||||
tools = ToolsSchema(standard_tools=[weather_function, restaurant_function])
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
context_aggregator.user(),
|
||||
llm,
|
||||
tts,
|
||||
transport.output(),
|
||||
context_aggregator.assistant(),
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -26,7 +26,11 @@ from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.deepgram.tts import DeepgramTTSService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams, DailyTransportMessageFrame
|
||||
from pipecat.transports.daily.transport import (
|
||||
DailyOutputTransportMessageFrame,
|
||||
DailyOutputTransportMessageUrgentFrame,
|
||||
DailyParams,
|
||||
)
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
@@ -128,14 +132,14 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.debug(f"Received latency ping app message: {message}")
|
||||
ts = message["latency-ping"]["ts"]
|
||||
# Send immediately
|
||||
transport.output().send_message(
|
||||
DailyTransportMessageFrame(
|
||||
await task.queue_frame(
|
||||
DailyOutputTransportMessageUrgentFrame(
|
||||
message={"latency-pong-msg-handler": {"ts": ts}}, participant_id=sender
|
||||
)
|
||||
)
|
||||
# And push to the pipeline for the Daily transport.output to send
|
||||
await task.queue_frame(
|
||||
DailyTransportMessageFrame(
|
||||
DailyOutputTransportMessageFrame(
|
||||
message={"latency-pong-pipeline-delivery": {"ts": ts}},
|
||||
participant_id=sender,
|
||||
)
|
||||
|
||||
@@ -24,14 +24,15 @@ from pipecat.processors.transcript_processor import TranscriptProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.openai_realtime import (
|
||||
from pipecat.services.openai.realtime.events import (
|
||||
AudioConfiguration,
|
||||
AudioInput,
|
||||
InputAudioNoiseReduction,
|
||||
InputAudioTranscription,
|
||||
OpenAIRealtimeLLMService,
|
||||
SemanticTurnDetection,
|
||||
SessionProperties,
|
||||
)
|
||||
from pipecat.services.openai_realtime.events import AudioConfiguration, AudioInput
|
||||
from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
@@ -21,13 +21,14 @@ from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.azure.realtime.llm import AzureRealtimeLLMService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.openai_realtime import (
|
||||
AzureRealtimeLLMService,
|
||||
from pipecat.services.openai.realtime.events import (
|
||||
AudioConfiguration,
|
||||
AudioInput,
|
||||
InputAudioTranscription,
|
||||
SessionProperties,
|
||||
)
|
||||
from pipecat.services.openai_realtime.events import AudioConfiguration, AudioInput
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
@@ -22,16 +22,17 @@ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.transcript_processor import TranscriptProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.openai_realtime import (
|
||||
from pipecat.services.openai.realtime.events import (
|
||||
AudioConfiguration,
|
||||
AudioInput,
|
||||
InputAudioNoiseReduction,
|
||||
InputAudioTranscription,
|
||||
OpenAIRealtimeLLMService,
|
||||
SemanticTurnDetection,
|
||||
SessionProperties,
|
||||
)
|
||||
from pipecat.services.openai_realtime.events import AudioConfiguration, AudioInput
|
||||
from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
@@ -25,13 +25,14 @@ from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.openai_realtime import (
|
||||
from pipecat.services.openai.realtime.events import (
|
||||
AudioConfiguration,
|
||||
AudioInput,
|
||||
InputAudioTranscription,
|
||||
OpenAIRealtimeLLMService,
|
||||
SessionProperties,
|
||||
TurnDetection,
|
||||
)
|
||||
from pipecat.services.openai_realtime.events import AudioConfiguration, AudioInput
|
||||
from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
@@ -72,7 +72,6 @@ async def save_conversation(params: FunctionCallParams):
|
||||
)
|
||||
try:
|
||||
with open(filename, "w") as file:
|
||||
# todo: extract 'system' into the first message in the list
|
||||
messages = params.context.get_messages()
|
||||
# remove the last message, which is the instruction we just gave to save the conversation
|
||||
messages.pop()
|
||||
|
||||
@@ -90,7 +90,6 @@ async def save_conversation(params: FunctionCallParams):
|
||||
)
|
||||
try:
|
||||
with open(filename, "w") as file:
|
||||
# todo: extract 'system' into the first message in the list
|
||||
messages = params.context.get_messages()
|
||||
# remove the last message (the instruction to save the context)
|
||||
messages.pop()
|
||||
|
||||
@@ -20,10 +20,12 @@ from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.aws_nova_sonic.aws import AWSNovaSonicLLMService
|
||||
from pipecat.services.aws.nova_sonic.llm import AWSNovaSonicLLMService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
@@ -75,7 +77,7 @@ async def save_conversation(params: FunctionCallParams):
|
||||
filename = f"{BASE_FILENAME}{timestamp}.json"
|
||||
try:
|
||||
with open(filename, "w") as file:
|
||||
messages = params.context.get_messages_for_persistent_storage()
|
||||
messages = params.context.get_messages()
|
||||
# remove the last few messages. in reverse order, they are:
|
||||
# - the in progress save tool call
|
||||
# - the invocation of the save tool call
|
||||
@@ -223,13 +225,13 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
llm.register_function("get_saved_conversation_filenames", get_saved_conversation_filenames)
|
||||
llm.register_function("load_conversation", load_conversation)
|
||||
|
||||
context = OpenAILLMContext(
|
||||
context = LLMContext(
|
||||
messages=[
|
||||
{"role": "system", "content": f"{system_instruction}"},
|
||||
],
|
||||
tools=tools,
|
||||
)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
|
||||
@@ -17,7 +17,7 @@ from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
@@ -65,7 +65,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
Respond to what the user said in a creative and helpful way.
|
||||
"""
|
||||
|
||||
llm = GeminiMultimodalLiveLLMService(
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
system_instruction=system_instruction,
|
||||
voice_id="Puck", # Aoede, Charon, Fenrir, Kore, Puck
|
||||
@@ -20,7 +20,7 @@ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.transcript_processor import TranscriptProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
@@ -65,7 +65,7 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
llm = GeminiMultimodalLiveLLMService(
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
voice_id="Aoede", # Puck, Charon, Kore, Fenrir, Aoede
|
||||
# system_instruction="Talk like a pirate."
|
||||
@@ -22,7 +22,7 @@ from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
@@ -122,12 +122,15 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
required=["location"],
|
||||
)
|
||||
search_tool = {"google_search": {}}
|
||||
# KNOWN ISSUE: If using GeminiVertexLiveLLMService, it appears
|
||||
# you cannot use the "google_search" tool alongside other tools.
|
||||
# See https://github.com/googleapis/python-genai/issues/941.
|
||||
tools = ToolsSchema(
|
||||
standard_tools=[weather_function, restaurant_function],
|
||||
custom_tools={AdapterType.GEMINI: [search_tool]},
|
||||
)
|
||||
|
||||
llm = GeminiMultimodalLiveLLMService(
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
system_instruction=system_instruction,
|
||||
tools=tools,
|
||||
@@ -24,7 +24,7 @@ from pipecat.runner.utils import (
|
||||
maybe_capture_participant_camera,
|
||||
maybe_capture_participant_screen,
|
||||
)
|
||||
from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
|
||||
@@ -58,7 +58,7 @@ transport_params = {
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
llm = GeminiMultimodalLiveLLMService(
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
voice_id="Aoede", # Puck, Charon, Kore, Fenrir, Aoede
|
||||
# system_instruction="Talk like a pirate."
|
||||
@@ -20,9 +20,9 @@ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.gemini_multimodal_live.gemini import (
|
||||
GeminiMultimodalLiveLLMService,
|
||||
GeminiMultimodalModalities,
|
||||
from pipecat.services.google.gemini_live.llm import (
|
||||
GeminiLiveLLMService,
|
||||
GeminiModalities,
|
||||
InputParams,
|
||||
)
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
@@ -80,11 +80,15 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
llm = GeminiMultimodalLiveLLMService(
|
||||
# KNOWN ISSUE: If using GeminiLiveVertexLLMService, you cannot specify a
|
||||
# modality other than AUDIO (at least not if using the service's default
|
||||
# model, which is a native audio model:
|
||||
# https://cloud.google.com/vertex-ai/generative-ai/docs/live-api/tools#native-audio).
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
system_instruction=SYSTEM_INSTRUCTION,
|
||||
tools=[{"google_search": {}}, {"code_execution": {}}],
|
||||
params=InputParams(modalities=GeminiMultimodalModalities.TEXT),
|
||||
params=InputParams(modalities=GeminiModalities.TEXT),
|
||||
)
|
||||
|
||||
# Optionally, you can set the response modalities via a function
|
||||
@@ -19,7 +19,7 @@ from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
@@ -83,7 +83,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
# Initialize the Gemini Multimodal Live model
|
||||
llm = GeminiMultimodalLiveLLMService(
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
voice_id="Puck", # Aoede, Charon, Fenrir, Kore, Puck
|
||||
system_instruction=system_instruction,
|
||||
@@ -19,9 +19,7 @@ from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.gemini_multimodal_live.gemini import (
|
||||
GeminiMultimodalLiveLLMService,
|
||||
)
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
@@ -110,7 +108,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
"""
|
||||
|
||||
# Initialize Gemini service with File API support
|
||||
llm = GeminiMultimodalLiveLLMService(
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
system_instruction=system_instruction,
|
||||
voice_id="Charon", # Aoede, Charon, Fenrir, Kore, Puck
|
||||
@@ -9,13 +9,13 @@ from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import Frame, LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
|
||||
from pipecat.services.google.frames import LLMSearchResponseFrame
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
@@ -105,7 +105,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
custom_tools={AdapterType.GEMINI: [{"google_search": {}}, {"code_execution": {}}]},
|
||||
)
|
||||
|
||||
llm = GeminiMultimodalLiveLLMService(
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
system_instruction=SYSTEM_INSTRUCTION,
|
||||
voice_id="Charon", # Aoede, Charon, Fenrir, Kore, Puck
|
||||
191
examples/foundational/26h-gemini-live-vertex-function-calling.py
Normal file
191
examples/foundational/26h-gemini-live-vertex-function-calling.py
Normal file
@@ -0,0 +1,191 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from google.genai.types import HttpOptions
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import AdapterType, ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from pipecat.services.google.gemini_live.llm_vertex import GeminiLiveVertexLLMService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
async def fetch_weather_from_api(params: FunctionCallParams):
|
||||
temperature = 75 if params.arguments["format"] == "fahrenheit" else 24
|
||||
await params.result_callback(
|
||||
{
|
||||
"conditions": "nice",
|
||||
"temperature": temperature,
|
||||
"format": params.arguments["format"],
|
||||
"timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
async def fetch_restaurant_recommendation(params: FunctionCallParams):
|
||||
await params.result_callback({"name": "The Golden Dragon"})
|
||||
|
||||
|
||||
system_instruction = """
|
||||
You are a helpful assistant who can answer questions and use tools.
|
||||
|
||||
You have three tools available to you:
|
||||
1. get_current_weather: Use this tool to get the current weather in a specific location.
|
||||
2. get_restaurant_recommendation: Use this tool to get a restaurant recommendation in a specific location.
|
||||
"""
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
# set stop_secs to something roughly similar to the internal setting
|
||||
# of the Multimodal Live api, just to align events. This doesn't really
|
||||
# matter because we can only use the Multimodal Live API's phrase
|
||||
# endpointing, for now.
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
# set stop_secs to something roughly similar to the internal setting
|
||||
# of the Multimodal Live api, just to align events. This doesn't really
|
||||
# matter because we can only use the Multimodal Live API's phrase
|
||||
# endpointing, for now.
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
# set stop_secs to something roughly similar to the internal setting
|
||||
# of the Multimodal Live api, just to align events. This doesn't really
|
||||
# matter because we can only use the Multimodal Live API's phrase
|
||||
# endpointing, for now.
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
weather_function = FunctionSchema(
|
||||
name="get_current_weather",
|
||||
description="Get the current weather",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
"description": "The temperature unit to use. Infer this from the user's location.",
|
||||
},
|
||||
},
|
||||
required=["location", "format"],
|
||||
)
|
||||
restaurant_function = FunctionSchema(
|
||||
name="get_restaurant_recommendation",
|
||||
description="Get a restaurant recommendation",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
},
|
||||
required=["location"],
|
||||
)
|
||||
# KNOWN ISSUE: If using GeminiVertexLiveLLMService, it appears
|
||||
# you cannot use the "google_search" tool alongside other tools.
|
||||
# See https://github.com/googleapis/python-genai/issues/941.
|
||||
tools = ToolsSchema(standard_tools=[weather_function, restaurant_function])
|
||||
|
||||
llm = GeminiLiveVertexLLMService(
|
||||
credentials=os.getenv("GOOGLE_VERTEX_TEST_CREDENTIALS"),
|
||||
project_id=os.getenv("GOOGLE_CLOUD_PROJECT_ID"),
|
||||
location=os.getenv("GOOGLE_CLOUD_LOCATION"),
|
||||
system_instruction=system_instruction,
|
||||
voice_id="Puck", # Aoede, Charon, Fenrir, Kore, Puck
|
||||
tools=tools,
|
||||
)
|
||||
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
|
||||
|
||||
context = OpenAILLMContext(
|
||||
[{"role": "user", "content": "Say hello."}],
|
||||
)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
context_aggregator.user(),
|
||||
llm,
|
||||
transport.output(),
|
||||
context_aggregator.assistant(),
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
204
examples/foundational/26i-gemini-live-graceful-end.py
Normal file
204
examples/foundational/26i-gemini-live-graceful-end.py
Normal file
@@ -0,0 +1,204 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import AdapterType, ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import EndTaskFrame, LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
async def fetch_weather_from_api(params: FunctionCallParams):
|
||||
temperature = 75 if params.arguments["format"] == "fahrenheit" else 24
|
||||
await params.result_callback(
|
||||
{
|
||||
"conditions": "nice",
|
||||
"temperature": temperature,
|
||||
"format": params.arguments["format"],
|
||||
"timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
async def fetch_restaurant_recommendation(params: FunctionCallParams):
|
||||
await params.result_callback({"name": "The Golden Dragon"})
|
||||
|
||||
|
||||
async def end_conversation(params: FunctionCallParams):
|
||||
await params.result_callback({"success": True})
|
||||
await params.llm.push_frame(EndTaskFrame(), FrameDirection.UPSTREAM)
|
||||
|
||||
|
||||
system_instruction = """
|
||||
You are a helpful assistant who can answer questions and use tools.
|
||||
|
||||
You have three tools available to you:
|
||||
1. get_current_weather: Use this tool to get the current weather in a specific location.
|
||||
2. get_restaurant_recommendation: Use this tool to get a restaurant recommendation in a specific location.
|
||||
3. end_conversation: Use this tool to gracefully end the conversation.
|
||||
|
||||
After you've responded to the user three times, do two things, in order:
|
||||
1. Politely let them know that that's all the time you have today and say goodbye.
|
||||
2. Call the end_conversation tool to gracefully end the conversation.
|
||||
"""
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
# set stop_secs to something roughly similar to the internal setting
|
||||
# of the Multimodal Live api, just to align events. This doesn't really
|
||||
# matter because we can only use the Multimodal Live API's phrase
|
||||
# endpointing, for now.
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
# set stop_secs to something roughly similar to the internal setting
|
||||
# of the Multimodal Live api, just to align events. This doesn't really
|
||||
# matter because we can only use the Multimodal Live API's phrase
|
||||
# endpointing, for now.
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
# set stop_secs to something roughly similar to the internal setting
|
||||
# of the Multimodal Live api, just to align events. This doesn't really
|
||||
# matter because we can only use the Multimodal Live API's phrase
|
||||
# endpointing, for now.
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
weather_function = FunctionSchema(
|
||||
name="get_current_weather",
|
||||
description="Get the current weather",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
"description": "The temperature unit to use. Infer this from the user's location.",
|
||||
},
|
||||
},
|
||||
required=["location", "format"],
|
||||
)
|
||||
restaurant_function = FunctionSchema(
|
||||
name="get_restaurant_recommendation",
|
||||
description="Get a restaurant recommendation",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
},
|
||||
required=["location"],
|
||||
)
|
||||
end_conversation_function = FunctionSchema(
|
||||
name="end_conversation",
|
||||
description="Gracefully end the conversation",
|
||||
properties={},
|
||||
required=[],
|
||||
)
|
||||
search_tool = {"google_search": {}}
|
||||
tools = ToolsSchema(
|
||||
standard_tools=[weather_function, restaurant_function, end_conversation_function],
|
||||
custom_tools={AdapterType.GEMINI: [search_tool]},
|
||||
)
|
||||
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
system_instruction=system_instruction,
|
||||
tools=tools,
|
||||
)
|
||||
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
|
||||
llm.register_function("end_conversation", end_conversation)
|
||||
|
||||
context = OpenAILLMContext(
|
||||
[{"role": "user", "content": "Say hello."}],
|
||||
)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
context_aggregator.user(),
|
||||
llm,
|
||||
transport.output(),
|
||||
context_aggregator.assistant(),
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -18,10 +18,11 @@ from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.aws_nova_sonic import AWSNovaSonicLLMService
|
||||
from pipecat.services.aws.nova_sonic.llm import AWSNovaSonicLLMService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
@@ -119,9 +120,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
|
||||
# Set up context and context management.
|
||||
# AWSNovaSonicService will adapt OpenAI LLM context objects with standard message format to
|
||||
# what's expected by Nova Sonic.
|
||||
context = OpenAILLMContext(
|
||||
context = LLMContext(
|
||||
messages=[
|
||||
{"role": "system", "content": f"{system_instruction}"},
|
||||
{
|
||||
@@ -131,7 +130,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
],
|
||||
tools=tools,
|
||||
)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
# Build the pipeline
|
||||
pipeline = Pipeline(
|
||||
|
||||
@@ -20,7 +20,7 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.processors.frameworks.rtvi import RTVIObserver, RTVIProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.gemini_multimodal_live import GeminiMultimodalLiveLLMService
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from pipecat.transports.base_transport import TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams, DailyTransport
|
||||
|
||||
@@ -94,7 +94,7 @@ Respond to what the user said in a creative and helpful way. Keep your responses
|
||||
|
||||
|
||||
async def run_bot(pipecat_transport):
|
||||
llm = GeminiMultimodalLiveLLMService(
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
voice_id="Puck", # Aoede, Charon, Fenrir, Kore, Puck
|
||||
transcribe_user_audio=True,
|
||||
|
||||
142
examples/foundational/47-sentry-metrics.py
Normal file
142
examples/foundational/47-sentry-metrics.py
Normal file
@@ -0,0 +1,142 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
|
||||
import sentry_sdk
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.metrics.sentry import SentryMetrics
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
# Initialize Sentry
|
||||
sentry_sdk.init(
|
||||
dsn=os.getenv("SENTRY_DSN"),
|
||||
traces_sample_rate=1.0,
|
||||
)
|
||||
|
||||
stt = DeepgramSTTService(
|
||||
api_key=os.getenv("DEEPGRAM_API_KEY"),
|
||||
metrics=SentryMetrics(),
|
||||
)
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
metrics=SentryMetrics(),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
metrics=SentryMetrics(),
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt,
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -105,7 +105,7 @@ uv run 07-interruptible.py -t twilio -x NGROK_HOST_NAME
|
||||
### Vision & Multimodal
|
||||
|
||||
- **[12a-describe-video-gemini-flash.py](./12a-describe-video-gemini-flash.py)**: Bot describes user's video (Video input, Multimodal LLMs)
|
||||
- **[26c-gemini-multimodal-live-video.py](./26c-gemini-multimodal-live-video.py)**: Gemini with video input (Streaming video, Function calls)
|
||||
- **[26c-gemini-live-video.py](./26c-gemini-live-video.py)**: Gemini with video input (Streaming video, Function calls)
|
||||
|
||||
### Voice & Language
|
||||
|
||||
|
||||
BIN
examples/foundational/assets/moondream.png
Normal file
BIN
examples/foundational/assets/moondream.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 1.1 MiB |
@@ -34,10 +34,11 @@ dependencies = [
|
||||
"pyloudnorm~=0.1.1",
|
||||
"resampy~=0.4.3",
|
||||
"soxr~=0.5.0",
|
||||
"openai>=1.74.0,<=1.99.1",
|
||||
"openai>=1.74.0,<3",
|
||||
# Pinning numba to resolve package dependencies
|
||||
"numba==0.61.2",
|
||||
"wait_for2>=0.4.1; python_version<'3.12'",
|
||||
"pipecat-ai-cli"
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
@@ -50,23 +51,24 @@ anthropic = [ "anthropic~=0.49.0" ]
|
||||
assemblyai = [ "pipecat-ai[websockets-base]" ]
|
||||
asyncai = [ "pipecat-ai[websockets-base]" ]
|
||||
aws = [ "aioboto3~=15.0.0", "pipecat-ai[websockets-base]" ]
|
||||
aws-nova-sonic = [ "aws_sdk_bedrock_runtime~=0.0.2; python_version>='3.12'" ]
|
||||
aws-nova-sonic = [ "aws_sdk_bedrock_runtime~=0.1.0; python_version>='3.12'" ]
|
||||
azure = [ "azure-cognitiveservices-speech~=1.42.0"]
|
||||
cartesia = [ "cartesia~=2.0.3", "pipecat-ai[websockets-base]" ]
|
||||
cerebras = []
|
||||
deepseek = []
|
||||
daily = [ "daily-python~=0.19.9" ]
|
||||
daily = [ "daily-python~=0.20.0" ]
|
||||
deepgram = [ "deepgram-sdk~=4.7.0" ]
|
||||
elevenlabs = [ "pipecat-ai[websockets-base]" ]
|
||||
fal = [ "fal-client~=0.5.9" ]
|
||||
fireworks = []
|
||||
fish = [ "ormsgpack~=1.7.0", "pipecat-ai[websockets-base]" ]
|
||||
gladia = [ "pipecat-ai[websockets-base]" ]
|
||||
google = [ "google-cloud-speech~=2.32.0", "google-cloud-texttospeech~=2.26.0", "google-genai~=1.24.0", "pipecat-ai[websockets-base]" ]
|
||||
google = [ "google-cloud-speech>=2.33.0,<3", "google-cloud-texttospeech>=2.31.0,<3", "google-genai>=1.41.0,<2", "pipecat-ai[websockets-base]" ]
|
||||
grok = []
|
||||
groq = [ "groq~=0.23.0" ]
|
||||
gstreamer = [ "pygobject~=3.50.0" ]
|
||||
heygen = [ "livekit>=1.0.13", "pipecat-ai[websockets-base]" ]
|
||||
hume = [ "hume>=0.11.2" ]
|
||||
inworld = []
|
||||
krisp = [ "pipecat-ai-krisp~=0.4.0" ]
|
||||
koala = [ "pvkoala~=2.0.3" ]
|
||||
@@ -83,7 +85,7 @@ nim = []
|
||||
neuphonic = [ "pipecat-ai[websockets-base]" ]
|
||||
noisereduce = [ "noisereduce~=3.0.3" ]
|
||||
openai = [ "pipecat-ai[websockets-base]" ]
|
||||
openpipe = [ "openpipe~=4.50.0" ]
|
||||
openpipe = [ "openpipe>=4.50.0,<6" ]
|
||||
openrouter = []
|
||||
perplexity = []
|
||||
playht = [ "pipecat-ai[websockets-base]" ]
|
||||
@@ -101,7 +103,7 @@ silero = [ "onnxruntime>=1.20.1,<2" ]
|
||||
simli = [ "simli-ai~=0.1.10"]
|
||||
soniox = [ "pipecat-ai[websockets-base]" ]
|
||||
soundfile = [ "soundfile~=0.13.0" ]
|
||||
speechmatics = [ "speechmatics-rt>=0.4.0" ]
|
||||
speechmatics = [ "speechmatics-rt>=0.5.0" ]
|
||||
strands = [ "strands-agents>=1.9.1,<2" ]
|
||||
tavus=[]
|
||||
together = []
|
||||
|
||||
@@ -67,6 +67,7 @@ TESTS_07 = [
|
||||
("07ac-interruptible-asyncai-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07b-interruptible-langchain.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07c-interruptible-deepgram.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07c-interruptible-deepgram-flux.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07d-interruptible-elevenlabs.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
(
|
||||
"07d-interruptible-elevenlabs-http.py",
|
||||
@@ -74,8 +75,6 @@ TESTS_07 = [
|
||||
EVAL_SIMPLE_MATH,
|
||||
BOT_SPEAKS_FIRST,
|
||||
),
|
||||
("07e-interruptible-playht.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07e-interruptible-playht-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07f-interruptible-azure.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07g-interruptible-openai.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07h-interruptible-openpipe.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
@@ -102,6 +101,7 @@ TESTS_07 = [
|
||||
("07w-interruptible-fal.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07y-interruptible-minimax.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07z-interruptible-sarvam.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07ae-interruptible-hume.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
# Needs a local XTTS docker instance running.
|
||||
# ("07i-interruptible-xtts.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
# Needs a Krisp license.
|
||||
@@ -136,6 +136,7 @@ TESTS_14 = [
|
||||
("14r-function-calling-aws.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14v-function-calling-openai.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14w-function-calling-mistral.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14x-function-calling-openpipe.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
# Currently not working.
|
||||
# ("14c-function-calling-together.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
# ("14l-function-calling-deepseek.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
@@ -147,7 +148,10 @@ TESTS_15 = [
|
||||
]
|
||||
|
||||
TESTS_19 = [
|
||||
("19-openai-realtime.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("19-openai-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
# OpenAI Realtime not released on Azure yet
|
||||
# ("19a-azure-realtime.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("19a-azure-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("19b-openai-realtime-text.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("19b-openai-realtime-beta-text.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
@@ -160,18 +164,18 @@ TESTS_21 = [
|
||||
TESTS_26 = [
|
||||
("26-gemini-multimodal-live.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
(
|
||||
"26a-gemini-multimodal-live-transcription.py",
|
||||
"26a-gemini-live-transcription.py",
|
||||
PROMPT_SIMPLE_MATH,
|
||||
EVAL_SIMPLE_MATH,
|
||||
BOT_SPEAKS_FIRST,
|
||||
),
|
||||
(
|
||||
"26b-gemini-multimodal-live-function-calling.py",
|
||||
"26b-gemini-live-function-calling.py",
|
||||
PROMPT_WEATHER,
|
||||
EVAL_WEATHER,
|
||||
BOT_SPEAKS_FIRST,
|
||||
),
|
||||
("26c-gemini-multimodal-live-video.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("26c-gemini-live-video.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
(
|
||||
"26e-gemini-multimodal-google-search.py",
|
||||
PROMPT_ONLINE_SEARCH,
|
||||
@@ -179,7 +183,13 @@ TESTS_26 = [
|
||||
BOT_SPEAKS_FIRST,
|
||||
),
|
||||
# Currently not working.
|
||||
# ("26d-gemini-multimodal-live-text.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
# ("26d-gemini-live-text.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
(
|
||||
"26h-gemini-live-vertex-function-calling.py",
|
||||
PROMPT_WEATHER,
|
||||
EVAL_WEATHER,
|
||||
BOT_SPEAKS_FIRST,
|
||||
),
|
||||
]
|
||||
|
||||
TESTS_27 = [
|
||||
|
||||
@@ -6,13 +6,47 @@
|
||||
|
||||
"""AWS Nova Sonic LLM adapter for Pipecat."""
|
||||
|
||||
import copy
|
||||
import json
|
||||
from typing import Any, Dict, List, TypedDict
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Optional, TypedDict
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.base_llm_adapter import BaseLLMAdapter
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext, LLMContextMessage
|
||||
|
||||
|
||||
class Role(Enum):
|
||||
"""Roles supported in AWS Nova Sonic conversations.
|
||||
|
||||
Parameters:
|
||||
SYSTEM: System-level messages (not used in conversation history).
|
||||
USER: Messages sent by the user.
|
||||
ASSISTANT: Messages sent by the assistant.
|
||||
TOOL: Messages sent by tools (not used in conversation history).
|
||||
"""
|
||||
|
||||
SYSTEM = "SYSTEM"
|
||||
USER = "USER"
|
||||
ASSISTANT = "ASSISTANT"
|
||||
TOOL = "TOOL"
|
||||
|
||||
|
||||
@dataclass
|
||||
class AWSNovaSonicConversationHistoryMessage:
|
||||
"""A single message in AWS Nova Sonic conversation history.
|
||||
|
||||
Parameters:
|
||||
role: The role of the message sender (USER or ASSISTANT only).
|
||||
text: The text content of the message.
|
||||
"""
|
||||
|
||||
role: Role # only USER and ASSISTANT
|
||||
text: str
|
||||
|
||||
|
||||
class AWSNovaSonicLLMInvocationParams(TypedDict):
|
||||
@@ -21,7 +55,9 @@ class AWSNovaSonicLLMInvocationParams(TypedDict):
|
||||
This is a placeholder until support for universal LLMContext machinery is added for AWS Nova Sonic.
|
||||
"""
|
||||
|
||||
pass
|
||||
system_instruction: Optional[str]
|
||||
messages: List[AWSNovaSonicConversationHistoryMessage]
|
||||
tools: List[Dict[str, Any]]
|
||||
|
||||
|
||||
class AWSNovaSonicLLMAdapter(BaseLLMAdapter[AWSNovaSonicLLMInvocationParams]):
|
||||
@@ -34,7 +70,7 @@ class AWSNovaSonicLLMAdapter(BaseLLMAdapter[AWSNovaSonicLLMInvocationParams]):
|
||||
@property
|
||||
def id_for_llm_specific_messages(self) -> str:
|
||||
"""Get the identifier used in LLMSpecificMessage instances for AWS Nova Sonic."""
|
||||
raise NotImplementedError("Universal LLMContext is not yet supported for AWS Nova Sonic.")
|
||||
return "aws-nova-sonic"
|
||||
|
||||
def get_llm_invocation_params(self, context: LLMContext) -> AWSNovaSonicLLMInvocationParams:
|
||||
"""Get AWS Nova Sonic-specific LLM invocation parameters from a universal LLM context.
|
||||
@@ -47,7 +83,13 @@ class AWSNovaSonicLLMAdapter(BaseLLMAdapter[AWSNovaSonicLLMInvocationParams]):
|
||||
Returns:
|
||||
Dictionary of parameters for invoking AWS Nova Sonic's LLM API.
|
||||
"""
|
||||
raise NotImplementedError("Universal LLMContext is not yet supported for AWS Nova Sonic.")
|
||||
messages = self._from_universal_context_messages(self.get_messages(context))
|
||||
return {
|
||||
"system_instruction": messages.system_instruction,
|
||||
"messages": messages.messages,
|
||||
# NOTE: LLMContext's tools are guaranteed to be a ToolsSchema (or NOT_GIVEN)
|
||||
"tools": self.from_standard_tools(context.tools) or [],
|
||||
}
|
||||
|
||||
def get_messages_for_logging(self, context) -> List[Dict[str, Any]]:
|
||||
"""Get messages from a universal LLM context in a format ready for logging about AWS Nova Sonic.
|
||||
@@ -62,7 +104,75 @@ class AWSNovaSonicLLMAdapter(BaseLLMAdapter[AWSNovaSonicLLMInvocationParams]):
|
||||
Returns:
|
||||
List of messages in a format ready for logging about AWS Nova Sonic.
|
||||
"""
|
||||
raise NotImplementedError("Universal LLMContext is not yet supported for AWS Nova Sonic.")
|
||||
return self._from_universal_context_messages(self.get_messages(context)).messages
|
||||
|
||||
@dataclass
|
||||
class ConvertedMessages:
|
||||
"""Container for Google-formatted messages converted from universal context."""
|
||||
|
||||
messages: List[AWSNovaSonicConversationHistoryMessage]
|
||||
system_instruction: Optional[str] = None
|
||||
|
||||
def _from_universal_context_messages(
|
||||
self, universal_context_messages: List[LLMContextMessage]
|
||||
) -> ConvertedMessages:
|
||||
system_instruction = None
|
||||
messages = []
|
||||
|
||||
# Bail if there are no messages
|
||||
if not universal_context_messages:
|
||||
return self.ConvertedMessages()
|
||||
|
||||
universal_context_messages = copy.deepcopy(universal_context_messages)
|
||||
|
||||
# If we have a "system" message as our first message, let's pull that out into "instruction"
|
||||
if universal_context_messages[0].get("role") == "system":
|
||||
system = universal_context_messages.pop(0)
|
||||
content = system.get("content")
|
||||
if isinstance(content, str):
|
||||
system_instruction = content
|
||||
elif isinstance(content, list):
|
||||
system_instruction = content[0].get("text")
|
||||
if system_instruction:
|
||||
self._system_instruction = system_instruction
|
||||
|
||||
# Process remaining messages to fill out conversation history.
|
||||
# Nova Sonic supports "user" and "assistant" messages in history.
|
||||
for universal_context_message in universal_context_messages:
|
||||
message = self._from_universal_context_message(universal_context_message)
|
||||
if message:
|
||||
messages.append(message)
|
||||
|
||||
return self.ConvertedMessages(messages=messages, system_instruction=system_instruction)
|
||||
|
||||
def _from_universal_context_message(self, message) -> AWSNovaSonicConversationHistoryMessage:
|
||||
"""Convert standard message format to Nova Sonic format.
|
||||
|
||||
Args:
|
||||
message: Standard message dictionary to convert.
|
||||
|
||||
Returns:
|
||||
Nova Sonic conversation history message, or None if not convertible.
|
||||
"""
|
||||
role = message.get("role")
|
||||
if message.get("role") == "user" or message.get("role") == "assistant":
|
||||
content = message.get("content")
|
||||
if isinstance(message.get("content"), list):
|
||||
content = ""
|
||||
for c in message.get("content"):
|
||||
if c.get("type") == "text":
|
||||
content += " " + c.get("text")
|
||||
else:
|
||||
logger.error(
|
||||
f"Unhandled content type in context message: {c.get('type')} - {message}"
|
||||
)
|
||||
# There won't be content if this is an assistant tool call entry.
|
||||
# We're ignoring those since they can't be loaded into AWS Nova Sonic conversation
|
||||
# history
|
||||
if content:
|
||||
return AWSNovaSonicConversationHistoryMessage(role=Role[role.upper()], text=content)
|
||||
# NOTE: we're ignoring messages with role "tool" since they can't be loaded into AWS Nova
|
||||
# Sonic conversation history
|
||||
|
||||
@staticmethod
|
||||
def _to_aws_nova_sonic_function_format(function: FunctionSchema) -> Dict[str, Any]:
|
||||
|
||||
@@ -87,9 +87,11 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
Includes both converted standard tools and any custom Gemini-specific tools.
|
||||
"""
|
||||
functions_schema = tools_schema.standard_tools
|
||||
formatted_standard_tools = [
|
||||
{"function_declarations": [func.to_default_dict() for func in functions_schema]}
|
||||
]
|
||||
formatted_standard_tools = (
|
||||
[{"function_declarations": [func.to_default_dict() for func in functions_schema]}]
|
||||
if functions_schema
|
||||
else []
|
||||
)
|
||||
custom_gemini_tools = []
|
||||
if tools_schema.custom_tools:
|
||||
custom_gemini_tools = tools_schema.custom_tools.get(AdapterType.GEMINI, [])
|
||||
|
||||
193
src/pipecat/audio/filters/krisp_viva_filter.py
Normal file
193
src/pipecat/audio/filters/krisp_viva_filter.py
Normal file
@@ -0,0 +1,193 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Krisp noise reduction audio filter for Pipecat.
|
||||
|
||||
This module provides an audio filter implementation using Krisp VIVA SDK.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.filters.base_audio_filter import BaseAudioFilter
|
||||
from pipecat.frames.frames import FilterControlFrame, FilterEnableFrame
|
||||
|
||||
try:
|
||||
import krisp_audio
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error("In order to use the Krisp filter, you need to install krisp_audio.")
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
def _log_callback(log_message, log_level):
|
||||
logger.info(f"[{log_level}] {log_message}")
|
||||
|
||||
|
||||
class KrispVivaFilter(BaseAudioFilter):
|
||||
"""Audio filter using the Krisp VIVA SDK.
|
||||
|
||||
Provides real-time noise reduction for audio streams using Krisp's
|
||||
proprietary noise suppression algorithms. This filter requires a
|
||||
valid Krisp model file to operate.
|
||||
|
||||
Supported sample rates:
|
||||
- 8000 Hz
|
||||
- 16000 Hz
|
||||
- 24000 Hz
|
||||
- 32000 Hz
|
||||
- 44100 Hz
|
||||
- 48000 Hz
|
||||
"""
|
||||
|
||||
# Initialize Krisp Audio SDK globally
|
||||
krisp_audio.globalInit("", _log_callback, krisp_audio.LogLevel.Off)
|
||||
SDK_VERSION = krisp_audio.getVersion()
|
||||
logger.debug(
|
||||
f"Krisp Audio Python SDK Version: {SDK_VERSION.major}."
|
||||
f"{SDK_VERSION.minor}.{SDK_VERSION.patch}"
|
||||
)
|
||||
|
||||
SAMPLE_RATES = {
|
||||
8000: krisp_audio.SamplingRate.Sr8000Hz,
|
||||
16000: krisp_audio.SamplingRate.Sr16000Hz,
|
||||
24000: krisp_audio.SamplingRate.Sr24000Hz,
|
||||
32000: krisp_audio.SamplingRate.Sr32000Hz,
|
||||
44100: krisp_audio.SamplingRate.Sr44100Hz,
|
||||
48000: krisp_audio.SamplingRate.Sr48000Hz,
|
||||
}
|
||||
|
||||
FRAME_SIZE_MS = 10 # Krisp requires audio frames of 10ms duration for processing.
|
||||
|
||||
def __init__(self, model_path: str = None, noise_suppression_level: int = 100) -> None:
|
||||
"""Initialize the Krisp noise reduction filter.
|
||||
|
||||
Args:
|
||||
model_path: Path to the Krisp model file (.kef extension).
|
||||
If None, uses KRISP_VIVA_MODEL_PATH environment variable.
|
||||
noise_suppression_level: Noise suppression level.
|
||||
|
||||
Raises:
|
||||
ValueError: If model_path is not provided and KRISP_VIVA_MODEL_PATH is not set.
|
||||
Exception: If model file doesn't have .kef extension.
|
||||
FileNotFoundError: If model file doesn't exist.
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
# Set model path, checking environment if not specified
|
||||
self._model_path = model_path or os.getenv("KRISP_VIVA_MODEL_PATH")
|
||||
if not self._model_path:
|
||||
logger.error("Model path is not provided and KRISP_VIVA_MODEL_PATH is not set.")
|
||||
raise ValueError("Model path for KrispAudioProcessor must be provided.")
|
||||
|
||||
if not self._model_path.endswith(".kef"):
|
||||
raise Exception("Model is expected with .kef extension")
|
||||
|
||||
if not os.path.isfile(self._model_path):
|
||||
raise FileNotFoundError(f"Model file not found: {self._model_path}")
|
||||
|
||||
self._filtering = True
|
||||
self._session = None
|
||||
self._samples_per_frame = None
|
||||
self._noise_suppression_level = noise_suppression_level
|
||||
|
||||
# Audio buffer to accumulate samples for complete frames
|
||||
self._audio_buffer = bytearray()
|
||||
|
||||
def _int_to_sample_rate(self, sample_rate):
|
||||
"""Convert integer sample rate to krisp_audio SamplingRate enum.
|
||||
|
||||
Args:
|
||||
sample_rate: Sample rate as integer
|
||||
|
||||
Returns:
|
||||
krisp_audio.SamplingRate enum value
|
||||
|
||||
Raises:
|
||||
ValueError: If sample rate is not supported
|
||||
"""
|
||||
if sample_rate not in self.SAMPLE_RATES:
|
||||
raise ValueError("Unsupported sample rate")
|
||||
return self.SAMPLE_RATES[sample_rate]
|
||||
|
||||
async def start(self, sample_rate: int):
|
||||
"""Initialize the Krisp processor with the transport's sample rate.
|
||||
|
||||
Args:
|
||||
sample_rate: The sample rate of the input transport in Hz.
|
||||
"""
|
||||
model_info = krisp_audio.ModelInfo()
|
||||
model_info.path = self._model_path
|
||||
|
||||
nc_cfg = krisp_audio.NcSessionConfig()
|
||||
nc_cfg.inputSampleRate = self._int_to_sample_rate(sample_rate)
|
||||
nc_cfg.inputFrameDuration = krisp_audio.FrameDuration.Fd10ms
|
||||
nc_cfg.outputSampleRate = nc_cfg.inputSampleRate
|
||||
nc_cfg.modelInfo = model_info
|
||||
|
||||
self._samples_per_frame = int((sample_rate * self.FRAME_SIZE_MS) / 1000)
|
||||
self._session = krisp_audio.NcInt16.create(nc_cfg)
|
||||
|
||||
async def stop(self):
|
||||
"""Clean up the Krisp processor when stopping."""
|
||||
self._session = None
|
||||
|
||||
async def process_frame(self, frame: FilterControlFrame):
|
||||
"""Process control frames to enable/disable filtering.
|
||||
|
||||
Args:
|
||||
frame: The control frame containing filter commands.
|
||||
"""
|
||||
if isinstance(frame, FilterEnableFrame):
|
||||
self._filtering = frame.enable
|
||||
|
||||
async def filter(self, audio: bytes) -> bytes:
|
||||
"""Apply Krisp noise reduction to audio data.
|
||||
|
||||
Args:
|
||||
audio: Raw audio data as bytes to be filtered.
|
||||
|
||||
Returns:
|
||||
Noise-reduced audio data as bytes.
|
||||
"""
|
||||
if not self._filtering:
|
||||
return audio
|
||||
|
||||
# Add incoming audio to our buffer
|
||||
self._audio_buffer.extend(audio)
|
||||
|
||||
# Calculate how many complete frames we can process
|
||||
total_samples = len(self._audio_buffer) // 2 # 2 bytes per int16 sample
|
||||
num_complete_frames = total_samples // self._samples_per_frame
|
||||
|
||||
if num_complete_frames == 0:
|
||||
# Not enough samples for a complete frame yet, return empty
|
||||
return b""
|
||||
|
||||
# Calculate how many bytes we need for complete frames
|
||||
complete_samples_count = num_complete_frames * self._samples_per_frame
|
||||
bytes_to_process = complete_samples_count * 2 # 2 bytes per sample
|
||||
|
||||
# Extract the bytes we can process
|
||||
audio_to_process = bytes(self._audio_buffer[:bytes_to_process])
|
||||
|
||||
# Remove processed bytes from buffer, keep the remainder
|
||||
self._audio_buffer = self._audio_buffer[bytes_to_process:]
|
||||
|
||||
# Process the complete frames
|
||||
samples = np.frombuffer(audio_to_process, dtype=np.int16)
|
||||
frames = samples.reshape(-1, self._samples_per_frame)
|
||||
processed_samples = np.empty_like(samples)
|
||||
|
||||
for i, frame in enumerate(frames):
|
||||
cleaned_frame = self._session.process(frame, self._noise_suppression_level)
|
||||
processed_samples[i * self._samples_per_frame : (i + 1) * self._samples_per_frame] = (
|
||||
cleaned_frame
|
||||
)
|
||||
|
||||
return processed_samples.tobytes()
|
||||
@@ -672,7 +672,7 @@ class TTSSpeakFrame(DataFrame):
|
||||
|
||||
|
||||
@dataclass
|
||||
class TransportMessageFrame(DataFrame):
|
||||
class OutputTransportMessageFrame(DataFrame):
|
||||
"""Frame containing transport-specific message data.
|
||||
|
||||
Parameters:
|
||||
@@ -685,6 +685,32 @@ class TransportMessageFrame(DataFrame):
|
||||
return f"{self.name}(message: {self.message})"
|
||||
|
||||
|
||||
@dataclass
|
||||
class TransportMessageFrame(OutputTransportMessageFrame):
|
||||
"""Frame containing transport-specific message data.
|
||||
|
||||
.. deprecated:: 0.0.87
|
||||
This frame is deprecated and will be removed in a future version.
|
||||
Instead, use `OutputTransportMessageFrame`.
|
||||
|
||||
Parameters:
|
||||
message: The transport message payload.
|
||||
"""
|
||||
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"TransportMessageFrame is deprecated and will be removed in a future version. "
|
||||
"Instead, use OutputTransportMessageFrame.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class DTMFFrame:
|
||||
"""Base class for DTMF (Dual-Tone Multi-Frequency) keypad frames.
|
||||
@@ -1092,8 +1118,8 @@ class STTMuteFrame(SystemFrame):
|
||||
|
||||
|
||||
@dataclass
|
||||
class TransportMessageUrgentFrame(SystemFrame):
|
||||
"""Frame for urgent transport messages that need immediate processing.
|
||||
class InputTransportMessageFrame(SystemFrame):
|
||||
"""Frame for transport messages received from external sources.
|
||||
|
||||
Parameters:
|
||||
message: The urgent transport message payload.
|
||||
@@ -1106,20 +1132,69 @@ class TransportMessageUrgentFrame(SystemFrame):
|
||||
|
||||
|
||||
@dataclass
|
||||
class InputTransportMessageUrgentFrame(TransportMessageUrgentFrame):
|
||||
class InputTransportMessageUrgentFrame(InputTransportMessageFrame):
|
||||
"""Frame for transport messages received from external sources.
|
||||
|
||||
This frame wraps incoming transport messages to distinguish them from outgoing
|
||||
urgent transport messages (TransportMessageUrgentFrame), preventing infinite
|
||||
message loops in the transport layer. It inherits the message payload from
|
||||
TransportMessageFrame while marking the message as having been received
|
||||
rather than generated locally.
|
||||
.. deprecated:: 0.0.87
|
||||
This frame is deprecated and will be removed in a future version.
|
||||
Instead, use `InputTransportMessageFrame`.
|
||||
|
||||
Used by transport implementations to properly handle bidirectional message
|
||||
flow without creating feedback loops.
|
||||
Parameters:
|
||||
message: The urgent transport message payload.
|
||||
"""
|
||||
|
||||
pass
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"InputTransportMessageUrgentFrame is deprecated and will be removed in a future version. "
|
||||
"Instead, use InputTransportMessageFrame.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class OutputTransportMessageUrgentFrame(SystemFrame):
|
||||
"""Frame for urgent transport messages that need to be sent immediately.
|
||||
|
||||
Parameters:
|
||||
message: The urgent transport message payload.
|
||||
"""
|
||||
|
||||
message: Any
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.name}(message: {self.message})"
|
||||
|
||||
|
||||
@dataclass
|
||||
class TransportMessageUrgentFrame(OutputTransportMessageUrgentFrame):
|
||||
"""Frame for urgent transport messages that need to be sent immediately.
|
||||
|
||||
.. deprecated:: 0.0.87
|
||||
This frame is deprecated and will be removed in a future version.
|
||||
Instead, use `OutputTransportMessageUrgentFrame`.
|
||||
|
||||
Parameters:
|
||||
message: The urgent transport message payload.
|
||||
"""
|
||||
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"TransportMessageUrgentFrame is deprecated and will be removed in a future version. "
|
||||
"Instead, use OutputTransportMessageFrame.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@@ -70,11 +70,15 @@ class PipelineRunner(BaseObject):
|
||||
"""
|
||||
logger.debug(f"Runner {self} started running {task}")
|
||||
self._tasks[task.name] = task
|
||||
params = PipelineTaskParams(loop=self._loop)
|
||||
|
||||
# PipelineTask handles asyncio.CancelledError to shutdown the pipeline
|
||||
# properly and re-raises it in case there's more cleanup to do.
|
||||
try:
|
||||
params = PipelineTaskParams(loop=self._loop)
|
||||
await task.run(params)
|
||||
except asyncio.CancelledError:
|
||||
await self._cancel()
|
||||
pass
|
||||
|
||||
del self._tasks[task.name]
|
||||
|
||||
# Cleanup base object.
|
||||
|
||||
@@ -130,12 +130,16 @@ class PipelineTask(BasePipelineTask):
|
||||
|
||||
- on_pipeline_finished: Called after the pipeline has reached any terminal state.
|
||||
This includes:
|
||||
|
||||
- StopFrame: pipeline was stopped (processors keep connections open)
|
||||
- EndFrame: pipeline ended normally
|
||||
- CancelFrame: pipeline was cancelled
|
||||
|
||||
Use this event for cleanup, logging, or post-processing tasks. Users can inspect
|
||||
the frame if they need to handle specific cases.
|
||||
|
||||
- on_pipeline_error: Called when an error occurs with ErrorFrame
|
||||
|
||||
Example::
|
||||
|
||||
@task.event_handler("on_frame_reached_upstream")
|
||||
@@ -146,9 +150,17 @@ class PipelineTask(BasePipelineTask):
|
||||
async def on_pipeline_idle_timeout(task):
|
||||
...
|
||||
|
||||
@task.event_handler("on_pipeline_started")
|
||||
async def on_pipeline_started(task, frame):
|
||||
...
|
||||
|
||||
@task.event_handler("on_pipeline_finished")
|
||||
async def on_pipeline_finished(task, frame):
|
||||
...
|
||||
|
||||
@task.event_handler("on_pipeline_error")
|
||||
async def on_pipeline_error(task, frame):
|
||||
...
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -257,6 +269,9 @@ class PipelineTask(BasePipelineTask):
|
||||
# StopFrame) has been received at the end of the pipeline.
|
||||
self._pipeline_end_event = asyncio.Event()
|
||||
|
||||
# This event is set when the pipeline truly finishes.
|
||||
self._pipeline_finished_event = asyncio.Event()
|
||||
|
||||
# This is the final pipeline. It is composed of a source processor,
|
||||
# followed by the user pipeline, and ending with a sink processor. The
|
||||
# source allows us to receive and react to upstream frames, and the sink
|
||||
@@ -286,6 +301,7 @@ class PipelineTask(BasePipelineTask):
|
||||
self._register_event_handler("on_pipeline_ended")
|
||||
self._register_event_handler("on_pipeline_cancelled")
|
||||
self._register_event_handler("on_pipeline_finished")
|
||||
self._register_event_handler("on_pipeline_error")
|
||||
|
||||
@property
|
||||
def params(self) -> PipelineParams:
|
||||
@@ -388,11 +404,7 @@ class PipelineTask(BasePipelineTask):
|
||||
await self.queue_frame(EndFrame())
|
||||
|
||||
async def cancel(self):
|
||||
"""Immediately stop the running pipeline.
|
||||
|
||||
Cancels all running tasks and stops frame processing without
|
||||
waiting for completion.
|
||||
"""
|
||||
"""Request the running pipeline to cancel."""
|
||||
if not self._finished:
|
||||
await self._cancel()
|
||||
|
||||
@@ -404,51 +416,38 @@ class PipelineTask(BasePipelineTask):
|
||||
"""
|
||||
if self.has_finished():
|
||||
return
|
||||
cleanup_pipeline = True
|
||||
|
||||
# Setup processors.
|
||||
await self._setup(params)
|
||||
|
||||
# Create all main tasks and wait for the main push task. This is the
|
||||
# task that pushes frames to the very beginning of our pipeline (i.e. to
|
||||
# our controlled source processor).
|
||||
await self._create_tasks()
|
||||
|
||||
try:
|
||||
# Setup processors.
|
||||
await self._setup(params)
|
||||
|
||||
# Create all main tasks and wait of the main push task. This is the
|
||||
# task that pushes frames to the very beginning of our pipeline (our
|
||||
# controlled source processor).
|
||||
push_task = await self._create_tasks()
|
||||
await push_task
|
||||
|
||||
# We have already cleaned up the pipeline inside the task.
|
||||
cleanup_pipeline = False
|
||||
|
||||
# Pipeline has finished nicely.
|
||||
self._finished = True
|
||||
# Wait for pipeline to finish.
|
||||
await self._wait_for_pipeline_finished()
|
||||
except asyncio.CancelledError:
|
||||
# Raise exception back to the pipeline runner so it can cancel this
|
||||
# task properly.
|
||||
logger.debug(f"Pipeline task {self} got cancelled from outside...")
|
||||
# We have been cancelled from outside, let's just cancel everything.
|
||||
await self._cancel()
|
||||
# Wait again for pipeline to finish. This time we have really
|
||||
# cancelled, so it should really finish.
|
||||
await self._wait_for_pipeline_finished()
|
||||
# Re-raise in case there's more cleanup to do.
|
||||
raise
|
||||
finally:
|
||||
# We can reach this point for different reasons:
|
||||
#
|
||||
# 1. The task has finished properly (e.g. `EndFrame`).
|
||||
# 2. By calling `PipelineTask.cancel()`.
|
||||
# 3. By asyncio task cancellation.
|
||||
#
|
||||
# Case (1) will execute the code below without issues because
|
||||
# `self._finished` is true.
|
||||
#
|
||||
# Case (2) will execute the code below without issues because
|
||||
# `self._cancelled` is true.
|
||||
#
|
||||
# Case (3) will raise the exception above (because we are cancelling
|
||||
# the asyncio task). This will be then captured by the
|
||||
# `PipelineRunner` which will call `PipelineTask.cancel()` and
|
||||
# therefore becoming case (2).
|
||||
if self._finished or self._cancelled:
|
||||
logger.debug(f"Pipeline task {self} is finishing cleanup...")
|
||||
await self._cancel_tasks()
|
||||
await self._cleanup(cleanup_pipeline)
|
||||
if self._check_dangling_tasks:
|
||||
self._print_dangling_tasks()
|
||||
self._finished = True
|
||||
logger.debug(f"Pipeline task {self} has finished")
|
||||
# 1. The pipeline task has finished (try case).
|
||||
# 2. By an asyncio task cancellation (except case).
|
||||
logger.debug(f"Pipeline task {self} is finishing...")
|
||||
await self._cancel_tasks()
|
||||
if self._check_dangling_tasks:
|
||||
self._print_dangling_tasks()
|
||||
self._finished = True
|
||||
logger.debug(f"Pipeline task {self} has finished")
|
||||
|
||||
async def queue_frame(self, frame: Frame):
|
||||
"""Queue a single frame to be pushed down the pipeline.
|
||||
@@ -476,19 +475,7 @@ class PipelineTask(BasePipelineTask):
|
||||
if not self._cancelled:
|
||||
logger.debug(f"Cancelling pipeline task {self}")
|
||||
self._cancelled = True
|
||||
cancel_frame = CancelFrame()
|
||||
# Make sure everything is cleaned up downstream. This is sent
|
||||
# out-of-band from the main streaming task which is what we want since
|
||||
# we want to cancel right away.
|
||||
await self._pipeline.queue_frame(cancel_frame)
|
||||
# Wait for CancelFrame to make it through the pipeline.
|
||||
await self._wait_for_pipeline_end(cancel_frame)
|
||||
# Only cancel the push task, we don't want to be able to process any
|
||||
# other frame after cancel. Everything else will be cancelled in
|
||||
# run().
|
||||
if self._process_push_task:
|
||||
await self._task_manager.cancel_task(self._process_push_task)
|
||||
self._process_push_task = None
|
||||
await self.queue_frame(CancelFrame())
|
||||
|
||||
async def _create_tasks(self):
|
||||
"""Create and start all pipeline processing tasks."""
|
||||
@@ -590,6 +577,17 @@ class PipelineTask(BasePipelineTask):
|
||||
|
||||
self._pipeline_end_event.clear()
|
||||
|
||||
# We are really done.
|
||||
self._pipeline_finished_event.set()
|
||||
|
||||
async def _wait_for_pipeline_finished(self):
|
||||
await self._pipeline_finished_event.wait()
|
||||
self._pipeline_finished_event.clear()
|
||||
# Make sure we wait for the main task to complete.
|
||||
if self._process_push_task:
|
||||
await self._process_push_task
|
||||
self._process_push_task = None
|
||||
|
||||
async def _setup(self, params: PipelineTaskParams):
|
||||
"""Set up the pipeline task and all processors."""
|
||||
mgr_params = TaskManagerParams(loop=params.loop)
|
||||
@@ -692,12 +690,11 @@ class PipelineTask(BasePipelineTask):
|
||||
logger.debug(f"{self}: received interruption task frame {frame}")
|
||||
await self._pipeline.queue_frame(InterruptionFrame())
|
||||
elif isinstance(frame, ErrorFrame):
|
||||
await self._call_event_handler("on_pipeline_error", frame)
|
||||
if frame.fatal:
|
||||
logger.error(f"A fatal error occurred: {frame}")
|
||||
# Cancel all tasks downstream.
|
||||
await self.queue_frame(CancelFrame())
|
||||
# Tell the task we should stop.
|
||||
await self.queue_frame(StopTaskFrame())
|
||||
else:
|
||||
logger.warning(f"{self}: Something went wrong: {frame}")
|
||||
|
||||
|
||||
@@ -15,9 +15,10 @@ service-specific adapter.
|
||||
"""
|
||||
|
||||
import base64
|
||||
import copy
|
||||
import io
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, List, Optional, TypeAlias, Union
|
||||
from typing import TYPE_CHECKING, Any, List, Optional, TypeAlias, Union
|
||||
|
||||
from loguru import logger
|
||||
from openai._types import NOT_GIVEN as OPEN_AI_NOT_GIVEN
|
||||
@@ -31,6 +32,9 @@ from PIL import Image
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.frames.frames import AudioRawFrame
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
|
||||
# "Re-export" types from OpenAI that we're using as universal context types.
|
||||
# NOTE: if universal message types need to someday diverge from OpenAI's, we
|
||||
# should consider managing our own definitions. But we should do so carefully,
|
||||
@@ -65,6 +69,26 @@ class LLMContext:
|
||||
and content formatting.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def from_openai_context(openai_context: "OpenAILLMContext") -> "LLMContext":
|
||||
"""Create a universal LLM context from an OpenAI-specific context.
|
||||
|
||||
NOTE: this should only be used internally, for facilitating migration
|
||||
from OpenAILLMContext to LLMContext. New user code should use
|
||||
LLMContext directly.
|
||||
|
||||
Args:
|
||||
openai_context: The OpenAI LLM context to convert.
|
||||
|
||||
Returns:
|
||||
New LLMContext instance with converted messages and settings.
|
||||
"""
|
||||
return LLMContext(
|
||||
messages=openai_context.get_messages(),
|
||||
tools=openai_context.tools,
|
||||
tool_choice=openai_context.tool_choice,
|
||||
)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
messages: Optional[List[LLMContextMessage]] = None,
|
||||
|
||||
@@ -877,6 +877,8 @@ class FrameProcessor(BaseObject):
|
||||
|
||||
"""
|
||||
while True:
|
||||
(frame, direction, callback) = await self.__input_queue.get()
|
||||
|
||||
if self.__should_block_system_frames and self.__input_event:
|
||||
logger.trace(f"{self}: system frame processing paused")
|
||||
await self.__input_event.wait()
|
||||
@@ -884,8 +886,6 @@ class FrameProcessor(BaseObject):
|
||||
self.__should_block_system_frames = False
|
||||
logger.trace(f"{self}: system frame processing resumed")
|
||||
|
||||
(frame, direction, callback) = await self.__input_queue.get()
|
||||
|
||||
if isinstance(frame, SystemFrame):
|
||||
await self.__process_frame(frame, direction, callback)
|
||||
elif self.__process_queue:
|
||||
@@ -900,6 +900,8 @@ class FrameProcessor(BaseObject):
|
||||
async def __process_frame_task_handler(self):
|
||||
"""Handle non-system frames from the process queue."""
|
||||
while True:
|
||||
(frame, direction, callback) = await self.__process_queue.get()
|
||||
|
||||
if self.__should_block_frames and self.__process_event:
|
||||
logger.trace(f"{self}: frame processing paused")
|
||||
await self.__process_event.wait()
|
||||
@@ -907,8 +909,6 @@ class FrameProcessor(BaseObject):
|
||||
self.__should_block_frames = False
|
||||
logger.trace(f"{self}: frame processing resumed")
|
||||
|
||||
(frame, direction, callback) = await self.__process_queue.get()
|
||||
|
||||
await self.__process_frame(frame, direction, callback)
|
||||
|
||||
self.__process_queue.task_done()
|
||||
|
||||
@@ -42,6 +42,7 @@ from pipecat.frames.frames import (
|
||||
Frame,
|
||||
FunctionCallResultFrame,
|
||||
InputAudioRawFrame,
|
||||
InputTransportMessageFrame,
|
||||
InterimTranscriptionFrame,
|
||||
LLMConfigureOutputFrame,
|
||||
LLMContextFrame,
|
||||
@@ -50,10 +51,10 @@ from pipecat.frames.frames import (
|
||||
LLMMessagesAppendFrame,
|
||||
LLMTextFrame,
|
||||
MetricsFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
StartFrame,
|
||||
SystemFrame,
|
||||
TranscriptionFrame,
|
||||
TransportMessageUrgentFrame,
|
||||
TTSAudioRawFrame,
|
||||
TTSStartedFrame,
|
||||
TTSStoppedFrame,
|
||||
@@ -919,7 +920,7 @@ class RTVIObserverParams:
|
||||
user_audio_level_enabled: bool = False
|
||||
metrics_enabled: bool = True
|
||||
system_logs_enabled: bool = False
|
||||
errors_enabled: bool = True
|
||||
errors_enabled: Optional[bool] = None
|
||||
audio_level_period_secs: float = 0.15
|
||||
|
||||
|
||||
@@ -962,7 +963,7 @@ class RTVIObserver(BaseObserver):
|
||||
if self._params.system_logs_enabled:
|
||||
self._system_logger_id = logger.add(self._logger_sink)
|
||||
|
||||
if self._params.errors_enabled:
|
||||
if self._params.errors_enabled is not None:
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
@@ -1209,11 +1210,10 @@ class RTVIObserver(BaseObserver):
|
||||
|
||||
async def _send_error_response(self, frame: RTVIServerResponseFrame):
|
||||
"""Send a response to the client for a specific request."""
|
||||
if self._params.errors_enabled:
|
||||
message = RTVIErrorResponse(
|
||||
id=str(frame.client_msg.msg_id), data=RTVIErrorResponseData(error=frame.error)
|
||||
)
|
||||
await self.send_rtvi_message(message)
|
||||
message = RTVIErrorResponse(
|
||||
id=str(frame.client_msg.msg_id), data=RTVIErrorResponseData(error=frame.error)
|
||||
)
|
||||
await self.send_rtvi_message(message)
|
||||
|
||||
|
||||
class RTVIProcessor(FrameProcessor):
|
||||
@@ -1346,7 +1346,9 @@ class RTVIProcessor(FrameProcessor):
|
||||
|
||||
async def push_transport_message(self, model: BaseModel, exclude_none: bool = True):
|
||||
"""Push a transport message frame."""
|
||||
frame = TransportMessageUrgentFrame(message=model.model_dump(exclude_none=exclude_none))
|
||||
frame = OutputTransportMessageUrgentFrame(
|
||||
message=model.model_dump(exclude_none=exclude_none)
|
||||
)
|
||||
await self.push_frame(frame)
|
||||
|
||||
async def handle_message(self, message: RTVIMessage):
|
||||
@@ -1419,7 +1421,7 @@ class RTVIProcessor(FrameProcessor):
|
||||
elif isinstance(frame, ErrorFrame):
|
||||
await self._send_error_frame(frame)
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, TransportMessageUrgentFrame):
|
||||
elif isinstance(frame, InputTransportMessageFrame):
|
||||
await self._handle_transport_message(frame)
|
||||
# All other system frames
|
||||
elif isinstance(frame, SystemFrame):
|
||||
@@ -1482,7 +1484,7 @@ class RTVIProcessor(FrameProcessor):
|
||||
await self._handle_message(message)
|
||||
self._message_queue.task_done()
|
||||
|
||||
async def _handle_transport_message(self, frame: TransportMessageUrgentFrame):
|
||||
async def _handle_transport_message(self, frame: InputTransportMessageFrame):
|
||||
"""Handle an incoming transport message frame."""
|
||||
try:
|
||||
transport_message = frame.message
|
||||
|
||||
@@ -15,7 +15,7 @@ from pipecat.frames.frames import (
|
||||
Frame,
|
||||
InputAudioRawFrame,
|
||||
OutputAudioRawFrame,
|
||||
TransportMessageFrame,
|
||||
UserSpeakingFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
|
||||
@@ -36,9 +36,9 @@ class FrameLogger(FrameProcessor):
|
||||
color: Optional[str] = None,
|
||||
ignored_frame_types: Tuple[Type[Frame], ...] = (
|
||||
BotSpeakingFrame,
|
||||
UserSpeakingFrame,
|
||||
InputAudioRawFrame,
|
||||
OutputAudioRawFrame,
|
||||
TransportMessageFrame,
|
||||
),
|
||||
):
|
||||
"""Initialize the frame logger.
|
||||
|
||||
@@ -82,6 +82,7 @@ async def configure(
|
||||
sip_enable_video: Optional[bool] = False,
|
||||
sip_num_endpoints: Optional[int] = 1,
|
||||
sip_codecs: Optional[Dict[str, List[str]]] = None,
|
||||
room_properties: Optional[DailyRoomProperties] = None,
|
||||
) -> DailyRoomConfig:
|
||||
"""Configure Daily room URL and token with optional SIP capabilities.
|
||||
|
||||
@@ -99,6 +100,10 @@ async def configure(
|
||||
sip_num_endpoints: Number of allowed SIP endpoints.
|
||||
sip_codecs: Codecs to support for audio and video. If None, uses Daily defaults.
|
||||
Example: {"audio": ["OPUS"], "video": ["H264"]}
|
||||
room_properties: Optional DailyRoomProperties to use instead of building from
|
||||
individual parameters. When provided, this overrides room_exp_duration and
|
||||
SIP-related parameters. If not provided, properties are built from the
|
||||
individual parameters as before.
|
||||
|
||||
Returns:
|
||||
DailyRoomConfig: Object with room_url, token, and optional sip_endpoint.
|
||||
@@ -115,6 +120,13 @@ async def configure(
|
||||
# SIP-enabled room
|
||||
sip_config = await configure(session, sip_caller_phone="+15551234567")
|
||||
print(f"SIP endpoint: {sip_config.sip_endpoint}")
|
||||
|
||||
# Custom room properties with recording enabled
|
||||
custom_props = DailyRoomProperties(
|
||||
enable_recording="cloud",
|
||||
max_participants=2,
|
||||
)
|
||||
config = await configure(session, room_properties=custom_props)
|
||||
"""
|
||||
# Check for required API key
|
||||
api_key = os.getenv("DAILY_API_KEY")
|
||||
@@ -124,9 +136,32 @@ async def configure(
|
||||
"Get your API key from https://dashboard.daily.co/developers"
|
||||
)
|
||||
|
||||
# Warn if both room_properties and individual parameters are provided
|
||||
if room_properties is not None:
|
||||
individual_params_provided = any(
|
||||
[
|
||||
room_exp_duration != 2.0,
|
||||
token_exp_duration != 2.0,
|
||||
sip_caller_phone is not None,
|
||||
sip_enable_video is not False,
|
||||
sip_num_endpoints != 1,
|
||||
sip_codecs is not None,
|
||||
]
|
||||
)
|
||||
if individual_params_provided:
|
||||
logger.warning(
|
||||
"Both room_properties and individual parameters (room_exp_duration, token_exp_duration, "
|
||||
"sip_*) were provided. The room_properties will be used and individual parameters "
|
||||
"will be ignored."
|
||||
)
|
||||
|
||||
# Determine if SIP mode is enabled
|
||||
sip_enabled = sip_caller_phone is not None
|
||||
|
||||
# If room_properties is provided, check if it has SIP configuration
|
||||
if room_properties and room_properties.sip:
|
||||
sip_enabled = True
|
||||
|
||||
daily_rest_helper = DailyRESTHelper(
|
||||
daily_api_key=api_key,
|
||||
daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"),
|
||||
@@ -150,27 +185,29 @@ async def configure(
|
||||
room_name = f"{room_prefix}-{uuid.uuid4().hex[:8]}"
|
||||
logger.info(f"Creating new Daily room: {room_name}")
|
||||
|
||||
# Calculate expiration time
|
||||
expiration_time = time.time() + (room_exp_duration * 60 * 60)
|
||||
# Use provided room_properties or build from parameters
|
||||
if room_properties is None:
|
||||
# Calculate expiration time
|
||||
expiration_time = time.time() + (room_exp_duration * 60 * 60)
|
||||
|
||||
# Create room properties
|
||||
room_properties = DailyRoomProperties(
|
||||
exp=expiration_time,
|
||||
eject_at_room_exp=True,
|
||||
)
|
||||
|
||||
# Add SIP configuration if enabled
|
||||
if sip_enabled:
|
||||
sip_params = DailyRoomSipParams(
|
||||
display_name=sip_caller_phone,
|
||||
video=sip_enable_video,
|
||||
sip_mode="dial-in",
|
||||
num_endpoints=sip_num_endpoints,
|
||||
codecs=sip_codecs,
|
||||
# Create room properties
|
||||
room_properties = DailyRoomProperties(
|
||||
exp=expiration_time,
|
||||
eject_at_room_exp=True,
|
||||
)
|
||||
room_properties.sip = sip_params
|
||||
room_properties.enable_dialout = True # Enable outbound calls if needed
|
||||
room_properties.start_video_off = not sip_enable_video # Voice-only by default
|
||||
|
||||
# Add SIP configuration if enabled
|
||||
if sip_enabled:
|
||||
sip_params = DailyRoomSipParams(
|
||||
display_name=sip_caller_phone,
|
||||
video=sip_enable_video,
|
||||
sip_mode="dial-in",
|
||||
num_endpoints=sip_num_endpoints,
|
||||
codecs=sip_codecs,
|
||||
)
|
||||
room_properties.sip = sip_params
|
||||
room_properties.enable_dialout = True # Enable outbound calls if needed
|
||||
room_properties.start_video_off = not sip_enable_video # Voice-only by default
|
||||
|
||||
# Create room parameters
|
||||
room_params = DailyRoomParams(name=room_name, properties=room_properties)
|
||||
|
||||
@@ -67,10 +67,17 @@ To run locally:
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import mimetypes
|
||||
import os
|
||||
import sys
|
||||
import uuid
|
||||
from contextlib import asynccontextmanager
|
||||
from http import HTTPMethod
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, TypedDict
|
||||
|
||||
import aiohttp
|
||||
from fastapi.responses import FileResponse, Response
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.runner.types import (
|
||||
@@ -82,7 +89,7 @@ from pipecat.runner.types import (
|
||||
try:
|
||||
import uvicorn
|
||||
from dotenv import load_dotenv
|
||||
from fastapi import BackgroundTasks, FastAPI, Request, WebSocket
|
||||
from fastapi import BackgroundTasks, FastAPI, Header, HTTPException, Request, WebSocket
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import HTMLResponse, RedirectResponse
|
||||
except ImportError as e:
|
||||
@@ -96,6 +103,12 @@ except ImportError as e:
|
||||
load_dotenv(override=True)
|
||||
os.environ["ENV"] = "local"
|
||||
|
||||
TELEPHONY_TRANSPORTS = ["twilio", "telnyx", "plivo", "exotel"]
|
||||
|
||||
RUNNER_DOWNLOADS_FOLDER: Optional[str] = None
|
||||
RUNNER_HOST: str = "localhost"
|
||||
RUNNER_PORT: int = 7860
|
||||
|
||||
|
||||
def _get_bot_module():
|
||||
"""Get the bot module from the calling script."""
|
||||
@@ -150,7 +163,13 @@ async def _run_telephony_bot(websocket: WebSocket):
|
||||
|
||||
|
||||
def _create_server_app(
|
||||
transport_type: str, host: str = "localhost", proxy: str = None, esp32_mode: bool = False
|
||||
*,
|
||||
transport_type: str,
|
||||
host: str = "localhost",
|
||||
proxy: str,
|
||||
esp32_mode: bool = False,
|
||||
whatsapp_enabled: bool = False,
|
||||
folder: Optional[str] = None,
|
||||
):
|
||||
"""Create FastAPI app with transport-specific routes."""
|
||||
app = FastAPI()
|
||||
@@ -165,24 +184,30 @@ def _create_server_app(
|
||||
|
||||
# Set up transport-specific routes
|
||||
if transport_type == "webrtc":
|
||||
_setup_webrtc_routes(app, esp32_mode=esp32_mode, host=host)
|
||||
_setup_webrtc_routes(app, esp32_mode=esp32_mode, host=host, folder=folder)
|
||||
if whatsapp_enabled:
|
||||
_setup_whatsapp_routes(app)
|
||||
elif transport_type == "daily":
|
||||
_setup_daily_routes(app)
|
||||
elif transport_type in ["twilio", "telnyx", "plivo", "exotel"]:
|
||||
_setup_telephony_routes(app, transport_type, proxy)
|
||||
elif transport_type in TELEPHONY_TRANSPORTS:
|
||||
_setup_telephony_routes(app, transport_type=transport_type, proxy=proxy)
|
||||
else:
|
||||
logger.warning(f"Unknown transport type: {transport_type}")
|
||||
|
||||
return app
|
||||
|
||||
|
||||
def _setup_webrtc_routes(app: FastAPI, esp32_mode: bool = False, host: str = "localhost"):
|
||||
def _setup_webrtc_routes(
|
||||
app: FastAPI, *, esp32_mode: bool = False, host: str = "localhost", folder: Optional[str] = None
|
||||
):
|
||||
"""Set up WebRTC-specific routes."""
|
||||
try:
|
||||
from pipecat_ai_small_webrtc_prebuilt.frontend import SmallWebRTCPrebuiltUI
|
||||
|
||||
from pipecat.transports.smallwebrtc.connection import SmallWebRTCConnection
|
||||
from pipecat.transports.smallwebrtc.connection import IceServer, SmallWebRTCConnection
|
||||
from pipecat.transports.smallwebrtc.request_handler import (
|
||||
IceCandidate,
|
||||
SmallWebRTCPatchRequest,
|
||||
SmallWebRTCRequest,
|
||||
SmallWebRTCRequestHandler,
|
||||
)
|
||||
@@ -190,6 +215,16 @@ def _setup_webrtc_routes(app: FastAPI, esp32_mode: bool = False, host: str = "lo
|
||||
logger.error(f"WebRTC transport dependencies not installed: {e}")
|
||||
return
|
||||
|
||||
class IceConfig(TypedDict):
|
||||
iceServers: List[IceServer]
|
||||
|
||||
class StartBotResult(TypedDict, total=False):
|
||||
sessionId: str
|
||||
iceConfig: Optional[IceConfig]
|
||||
|
||||
# In-memory store of active sessions: session_id -> session info
|
||||
active_sessions: Dict[str, Dict[str, Any]] = {}
|
||||
|
||||
# Mount the frontend
|
||||
app.mount("/client", SmallWebRTCPrebuiltUI)
|
||||
|
||||
@@ -198,6 +233,21 @@ def _setup_webrtc_routes(app: FastAPI, esp32_mode: bool = False, host: str = "lo
|
||||
"""Redirect root requests to client interface."""
|
||||
return RedirectResponse(url="/client/")
|
||||
|
||||
@app.get("/files/{filename:path}")
|
||||
async def download_file(filename: str):
|
||||
"""Handle file downloads."""
|
||||
if not folder:
|
||||
logger.warning(f"Attempting to dowload {filename}, but downloads folder not setup.")
|
||||
return
|
||||
|
||||
file_path = Path(folder) / filename
|
||||
if not os.path.exists(file_path):
|
||||
raise HTTPException(404)
|
||||
|
||||
media_type, _ = mimetypes.guess_type(file_path)
|
||||
|
||||
return FileResponse(path=file_path, media_type=media_type, filename=filename)
|
||||
|
||||
# Initialize the SmallWebRTC request handler
|
||||
small_webrtc_handler: SmallWebRTCRequestHandler = SmallWebRTCRequestHandler(
|
||||
esp32_mode=esp32_mode, host=host
|
||||
@@ -220,13 +270,259 @@ def _setup_webrtc_routes(app: FastAPI, esp32_mode: bool = False, host: str = "lo
|
||||
)
|
||||
return answer
|
||||
|
||||
@app.patch("/api/offer")
|
||||
async def ice_candidate(request: SmallWebRTCPatchRequest):
|
||||
"""Handle WebRTC new ice candidate requests."""
|
||||
logger.debug(f"Received patch request: {request}")
|
||||
await small_webrtc_handler.handle_patch_request(request)
|
||||
return {"status": "success"}
|
||||
|
||||
@app.post("/start")
|
||||
async def rtvi_start(request: Request):
|
||||
"""Mimic Pipecat Cloud's /start endpoint."""
|
||||
# Parse the request body
|
||||
try:
|
||||
request_data = await request.json()
|
||||
logger.debug(f"Received request: {request_data}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to parse request body: {e}")
|
||||
request_data = {}
|
||||
|
||||
# Store session info immediately in memory, replicate the behavior expected on Pipecat Cloud
|
||||
session_id = str(uuid.uuid4())
|
||||
active_sessions[session_id] = request_data
|
||||
|
||||
result: StartBotResult = {"sessionId": session_id}
|
||||
if request_data.get("enableDefaultIceServers"):
|
||||
result["iceConfig"] = IceConfig(
|
||||
iceServers=[IceServer(urls="stun:stun.l.google.com:19302")]
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
@app.api_route(
|
||||
"/sessions/{session_id}/{path:path}",
|
||||
methods=["GET", "POST", "PUT", "PATCH", "DELETE"],
|
||||
)
|
||||
async def proxy_request(
|
||||
session_id: str, path: str, request: Request, background_tasks: BackgroundTasks
|
||||
):
|
||||
"""Mimic Pipecat Cloud's proxy."""
|
||||
active_session = active_sessions.get(session_id)
|
||||
if not active_session:
|
||||
return Response(content="Invalid or not-yet-ready session_id", status_code=404)
|
||||
|
||||
if path.endswith("api/offer"):
|
||||
# Parse the request body and convert to SmallWebRTCRequest
|
||||
try:
|
||||
request_data = await request.json()
|
||||
if request.method == HTTPMethod.POST.value:
|
||||
webrtc_request = SmallWebRTCRequest(
|
||||
sdp=request_data["sdp"],
|
||||
type=request_data["type"],
|
||||
pc_id=request_data.get("pc_id"),
|
||||
restart_pc=request_data.get("restart_pc"),
|
||||
request_data=request_data,
|
||||
)
|
||||
return await offer(webrtc_request, background_tasks)
|
||||
elif request.method == HTTPMethod.PATCH.value:
|
||||
patch_request = SmallWebRTCPatchRequest(
|
||||
pc_id=request_data["pc_id"],
|
||||
candidates=[IceCandidate(**c) for c in request_data.get("candidates", [])],
|
||||
)
|
||||
return await ice_candidate(patch_request)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to parse WebRTC request: {e}")
|
||||
return Response(content="Invalid WebRTC request", status_code=400)
|
||||
|
||||
logger.info(f"Received request for path: {path}")
|
||||
return Response(status_code=200)
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
async def smallwebrtc_lifespan(app: FastAPI):
|
||||
"""Manage FastAPI application lifecycle and cleanup connections."""
|
||||
yield
|
||||
await small_webrtc_handler.close()
|
||||
|
||||
app.router.lifespan_context = lifespan
|
||||
# Add the SmallWebRTC lifespan to the app
|
||||
_add_lifespan_to_app(app, smallwebrtc_lifespan)
|
||||
|
||||
|
||||
def _add_lifespan_to_app(app: FastAPI, new_lifespan):
|
||||
"""Add a new lifespan context manager to the app, combining with existing if present.
|
||||
|
||||
Args:
|
||||
app: The FastAPI application instance
|
||||
new_lifespan: The new lifespan context manager to add
|
||||
"""
|
||||
if hasattr(app.router, "lifespan_context") and app.router.lifespan_context is not None:
|
||||
# If there's already a lifespan context, combine them
|
||||
existing_lifespan = app.router.lifespan_context
|
||||
|
||||
@asynccontextmanager
|
||||
async def combined_lifespan(app: FastAPI):
|
||||
async with existing_lifespan(app):
|
||||
async with new_lifespan(app):
|
||||
yield
|
||||
|
||||
app.router.lifespan_context = combined_lifespan
|
||||
else:
|
||||
# No existing lifespan, use the new one
|
||||
app.router.lifespan_context = new_lifespan
|
||||
|
||||
|
||||
def _setup_whatsapp_routes(app: FastAPI):
|
||||
"""Set up WebRTC-specific routes."""
|
||||
WHATSAPP_APP_SECRET = os.getenv("WHATSAPP_APP_SECRET")
|
||||
WHATSAPP_PHONE_NUMBER_ID = os.getenv("WHATSAPP_PHONE_NUMBER_ID")
|
||||
WHATSAPP_TOKEN = os.getenv("WHATSAPP_TOKEN")
|
||||
WHATSAPP_WEBHOOK_VERIFICATION_TOKEN = os.getenv("WHATSAPP_WEBHOOK_VERIFICATION_TOKEN")
|
||||
|
||||
if not all(
|
||||
[
|
||||
WHATSAPP_APP_SECRET,
|
||||
WHATSAPP_PHONE_NUMBER_ID,
|
||||
WHATSAPP_TOKEN,
|
||||
WHATSAPP_WEBHOOK_VERIFICATION_TOKEN,
|
||||
]
|
||||
):
|
||||
logger.error(
|
||||
"""Missing required environment variables for WhatsApp transport:
|
||||
WHATSAPP_APP_SECRET
|
||||
WHATSAPP_PHONE_NUMBER_ID
|
||||
WHATSAPP_TOKEN
|
||||
WHATSAPP_WEBHOOK_VERIFICATION_TOKEN
|
||||
"""
|
||||
)
|
||||
return
|
||||
|
||||
try:
|
||||
from pipecat_ai_small_webrtc_prebuilt.frontend import SmallWebRTCPrebuiltUI
|
||||
|
||||
from pipecat.transports.smallwebrtc.connection import SmallWebRTCConnection
|
||||
from pipecat.transports.smallwebrtc.request_handler import (
|
||||
SmallWebRTCRequest,
|
||||
SmallWebRTCRequestHandler,
|
||||
)
|
||||
from pipecat.transports.whatsapp.api import WhatsAppWebhookRequest
|
||||
from pipecat.transports.whatsapp.client import WhatsAppClient
|
||||
except ImportError as e:
|
||||
logger.error(f"WhatsApp transport dependencies not installed: {e}")
|
||||
return
|
||||
|
||||
# Global WhatsApp client instance
|
||||
whatsapp_client: Optional[WhatsAppClient] = None
|
||||
|
||||
@app.get(
|
||||
"/whatsapp",
|
||||
summary="Verify WhatsApp webhook",
|
||||
description="Handles WhatsApp webhook verification requests from Meta",
|
||||
)
|
||||
async def verify_webhook(request: Request):
|
||||
"""Verify WhatsApp webhook endpoint.
|
||||
|
||||
This endpoint is called by Meta's WhatsApp Business API to verify
|
||||
the webhook URL during setup. It validates the verification token
|
||||
and returns the challenge parameter if successful.
|
||||
"""
|
||||
if whatsapp_client is None:
|
||||
logger.error("WhatsApp client is not initialized")
|
||||
raise HTTPException(status_code=503, detail="Service unavailable")
|
||||
|
||||
params = dict(request.query_params)
|
||||
logger.debug(f"Webhook verification request received with params: {list(params.keys())}")
|
||||
|
||||
try:
|
||||
result = await whatsapp_client.handle_verify_webhook_request(
|
||||
params=params, expected_verification_token=WHATSAPP_WEBHOOK_VERIFICATION_TOKEN
|
||||
)
|
||||
logger.info("Webhook verification successful")
|
||||
return result
|
||||
except ValueError as e:
|
||||
logger.warning(f"Webhook verification failed: {e}")
|
||||
raise HTTPException(status_code=403, detail="Verification failed")
|
||||
|
||||
@app.post(
|
||||
"/whatsapp",
|
||||
summary="Handle WhatsApp webhook events",
|
||||
description="Processes incoming WhatsApp messages and call events",
|
||||
)
|
||||
async def whatsapp_webhook(
|
||||
body: WhatsAppWebhookRequest,
|
||||
background_tasks: BackgroundTasks,
|
||||
request: Request,
|
||||
x_hub_signature_256: str = Header(None),
|
||||
):
|
||||
"""Handle incoming WhatsApp webhook events.
|
||||
|
||||
For call events, establishes WebRTC connections and spawns bot instances
|
||||
in the background to handle real-time communication.
|
||||
"""
|
||||
if whatsapp_client is None:
|
||||
logger.error("WhatsApp client is not initialized")
|
||||
raise HTTPException(status_code=503, detail="Service unavailable")
|
||||
|
||||
# Validate webhook object type
|
||||
if body.object != "whatsapp_business_account":
|
||||
logger.warning(f"Invalid webhook object type: {body.object}")
|
||||
raise HTTPException(status_code=400, detail="Invalid object type")
|
||||
|
||||
logger.debug(f"Processing WhatsApp webhook: {body.model_dump()}")
|
||||
|
||||
async def connection_callback(connection: SmallWebRTCConnection):
|
||||
"""Handle new WebRTC connections from WhatsApp calls.
|
||||
|
||||
Called when a WebRTC connection is established for a WhatsApp call.
|
||||
Spawns a bot instance to handle the conversation.
|
||||
|
||||
Args:
|
||||
connection: The established WebRTC connection
|
||||
"""
|
||||
bot_module = _get_bot_module()
|
||||
runner_args = SmallWebRTCRunnerArguments(webrtc_connection=connection)
|
||||
background_tasks.add_task(bot_module.bot, runner_args)
|
||||
|
||||
try:
|
||||
# Process the webhook request
|
||||
raw_body = await request.body()
|
||||
result = await whatsapp_client.handle_webhook_request(
|
||||
body, connection_callback, sha256_signature=x_hub_signature_256, raw_body=raw_body
|
||||
)
|
||||
logger.debug(f"Webhook processed successfully: {result}")
|
||||
return {"status": "success", "message": "Webhook processed successfully"}
|
||||
except ValueError as ve:
|
||||
logger.warning(f"Invalid webhook request format: {ve}")
|
||||
raise HTTPException(status_code=400, detail=f"Invalid request: {str(ve)}")
|
||||
except Exception as e:
|
||||
logger.error(f"Internal error processing webhook: {e}")
|
||||
raise HTTPException(status_code=500, detail="Internal server error processing webhook")
|
||||
|
||||
@asynccontextmanager
|
||||
async def whatsapp_lifespan(app: FastAPI):
|
||||
"""Manage WhatsApp client lifecycle and cleanup connections."""
|
||||
nonlocal whatsapp_client
|
||||
|
||||
# Initialize WhatsApp client with persistent HTTP session
|
||||
async with aiohttp.ClientSession() as session:
|
||||
whatsapp_client = WhatsAppClient(
|
||||
whatsapp_token=WHATSAPP_TOKEN,
|
||||
whatsapp_secret=WHATSAPP_APP_SECRET,
|
||||
phone_number_id=WHATSAPP_PHONE_NUMBER_ID,
|
||||
session=session,
|
||||
)
|
||||
logger.info("WhatsApp client initialized successfully")
|
||||
|
||||
try:
|
||||
yield # Run the application
|
||||
finally:
|
||||
# Cleanup all active calls on shutdown
|
||||
logger.info("Cleaning up WhatsApp client resources...")
|
||||
if whatsapp_client:
|
||||
await whatsapp_client.terminate_all_calls()
|
||||
logger.info("WhatsApp cleanup completed")
|
||||
|
||||
# Add the WhatsApp lifespan to the app
|
||||
_add_lifespan_to_app(app, whatsapp_lifespan)
|
||||
|
||||
|
||||
def _setup_daily_routes(app: FastAPI):
|
||||
@@ -281,8 +577,6 @@ def _setup_daily_routes(app: FastAPI):
|
||||
else:
|
||||
logger.debug("No body data provided in request")
|
||||
|
||||
import aiohttp
|
||||
|
||||
from pipecat.runner.daily import configure
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
@@ -314,7 +608,7 @@ def _setup_daily_routes(app: FastAPI):
|
||||
return await _handle_rtvi_request(request)
|
||||
|
||||
|
||||
def _setup_telephony_routes(app: FastAPI, transport_type: str, proxy: str):
|
||||
def _setup_telephony_routes(app: FastAPI, *, transport_type: str, proxy: str):
|
||||
"""Set up telephony-specific routes."""
|
||||
# XML response templates (Exotel doesn't use XML webhooks)
|
||||
XML_TEMPLATES = {
|
||||
@@ -370,8 +664,6 @@ def _setup_telephony_routes(app: FastAPI, transport_type: str, proxy: str):
|
||||
async def _run_daily_direct():
|
||||
"""Run Daily bot with direct connection (no FastAPI server)."""
|
||||
try:
|
||||
import aiohttp
|
||||
|
||||
from pipecat.runner.daily import configure
|
||||
except ImportError as e:
|
||||
logger.error("Daily transport dependencies not installed.")
|
||||
@@ -417,6 +709,21 @@ def _validate_and_clean_proxy(proxy: str) -> str:
|
||||
return proxy
|
||||
|
||||
|
||||
def runner_downloads_folder() -> Optional[str]:
|
||||
"""Returns the folder where files are stored for later download."""
|
||||
return RUNNER_DOWNLOADS_FOLDER
|
||||
|
||||
|
||||
def runner_host() -> str:
|
||||
"""Returns the host name of this runner."""
|
||||
return RUNNER_HOST
|
||||
|
||||
|
||||
def runner_port() -> int:
|
||||
"""Returns the port of this runner."""
|
||||
return RUNNER_PORT
|
||||
|
||||
|
||||
def main():
|
||||
"""Start the Pipecat development runner.
|
||||
|
||||
@@ -437,14 +744,16 @@ def main():
|
||||
|
||||
The bot file must contain a `bot(runner_args)` function as the entry point.
|
||||
"""
|
||||
global RUNNER_DOWNLOADS_FOLDER, RUNNER_HOST, RUNNER_PORT
|
||||
|
||||
parser = argparse.ArgumentParser(description="Pipecat Development Runner")
|
||||
parser.add_argument("--host", type=str, default="localhost", help="Host address")
|
||||
parser.add_argument("--port", type=int, default=7860, help="Port number")
|
||||
parser.add_argument("--host", type=str, default=RUNNER_HOST, help="Host address")
|
||||
parser.add_argument("--port", type=int, default=RUNNER_PORT, help="Port number")
|
||||
parser.add_argument(
|
||||
"-t",
|
||||
"--transport",
|
||||
type=str,
|
||||
choices=["daily", "webrtc", "twilio", "telnyx", "plivo", "exotel"],
|
||||
choices=["daily", "webrtc", *TELEPHONY_TRANSPORTS],
|
||||
default="webrtc",
|
||||
help="Transport type",
|
||||
)
|
||||
@@ -462,9 +771,16 @@ def main():
|
||||
default=False,
|
||||
help="Connect directly to Daily room (automatically sets transport to daily)",
|
||||
)
|
||||
parser.add_argument("-f", "--folder", type=str, help="Path to downloads folder")
|
||||
parser.add_argument(
|
||||
"--verbose", "-v", action="count", default=0, help="Increase logging verbosity"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--whatsapp",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Ensure requried WhatsApp environment variables are present",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -484,6 +800,10 @@ def main():
|
||||
logger.error("For ESP32, you need to specify `--host IP` so we can do SDP munging.")
|
||||
return
|
||||
|
||||
if args.transport in TELEPHONY_TRANSPORTS and not args.proxy:
|
||||
logger.error(f"For telephony transports, you need to specify `--proxy PROXY`.")
|
||||
return
|
||||
|
||||
# Log level
|
||||
logger.remove()
|
||||
logger.add(sys.stderr, level="TRACE" if args.verbose else "DEBUG")
|
||||
@@ -503,10 +823,11 @@ def main():
|
||||
print()
|
||||
if args.esp32:
|
||||
print(f"🚀 Bot ready! (ESP32 mode)")
|
||||
print(f" → Open http://{args.host}:{args.port}/client in your browser")
|
||||
elif args.whatsapp:
|
||||
print(f"🚀 Bot ready! (WhatsApp)")
|
||||
else:
|
||||
print(f"🚀 Bot ready!")
|
||||
print(f" → Open http://{args.host}:{args.port}/client in your browser")
|
||||
print(f" → Open http://{args.host}:{args.port}/client in your browser")
|
||||
print()
|
||||
elif args.transport == "daily":
|
||||
print()
|
||||
@@ -514,8 +835,19 @@ def main():
|
||||
print(f" → Open http://{args.host}:{args.port} in your browser to start a session")
|
||||
print()
|
||||
|
||||
RUNNER_DOWNLOADS_FOLDER = args.folder
|
||||
RUNNER_HOST = args.host
|
||||
RUNNER_PORT = args.port
|
||||
|
||||
# Create the app with transport-specific setup
|
||||
app = _create_server_app(args.transport, args.host, args.proxy, args.esp32)
|
||||
app = _create_server_app(
|
||||
transport_type=args.transport,
|
||||
host=args.host,
|
||||
proxy=args.proxy,
|
||||
esp32_mode=args.esp32,
|
||||
whatsapp_enabled=args.whatsapp,
|
||||
folder=args.folder,
|
||||
)
|
||||
|
||||
# Run the server
|
||||
uvicorn.run(app, host=args.host, port=args.port)
|
||||
|
||||
@@ -99,29 +99,41 @@ async def parse_telephony_websocket(websocket: WebSocket):
|
||||
tuple: (transport_type: str, call_data: dict)
|
||||
|
||||
call_data contains provider-specific fields:
|
||||
- Twilio: {
|
||||
"stream_id": str,
|
||||
"call_id": str,
|
||||
"body": dict
|
||||
}
|
||||
- Telnyx: {
|
||||
"stream_id": str,
|
||||
"call_control_id": str,
|
||||
"outbound_encoding": str,
|
||||
"from": str,
|
||||
"to": str,
|
||||
}
|
||||
- Plivo: {
|
||||
"stream_id": str,
|
||||
"call_id": str,
|
||||
}
|
||||
- Exotel: {
|
||||
"stream_id": str,
|
||||
"call_id": str,
|
||||
"account_sid": str,
|
||||
"from": str,
|
||||
"to": str,
|
||||
}
|
||||
|
||||
- Twilio::
|
||||
|
||||
{
|
||||
"stream_id": str,
|
||||
"call_id": str,
|
||||
"body": dict
|
||||
}
|
||||
|
||||
- Telnyx::
|
||||
|
||||
{
|
||||
"stream_id": str,
|
||||
"call_control_id": str,
|
||||
"outbound_encoding": str,
|
||||
"from": str,
|
||||
"to": str,
|
||||
}
|
||||
|
||||
- Plivo::
|
||||
|
||||
{
|
||||
"stream_id": str,
|
||||
"call_id": str,
|
||||
}
|
||||
|
||||
- Exotel::
|
||||
|
||||
{
|
||||
"stream_id": str,
|
||||
"call_id": str,
|
||||
"account_sid": str,
|
||||
"from": str,
|
||||
"to": str,
|
||||
}
|
||||
|
||||
Example usage::
|
||||
|
||||
@@ -301,6 +313,7 @@ def _smallwebrtc_sdp_cleanup_ice_candidates(text: str, pattern: str) -> str:
|
||||
Returns:
|
||||
Cleaned SDP text with filtered ICE candidates.
|
||||
"""
|
||||
logger.debug("Removing unsupported ICE candidates from SDP")
|
||||
result = []
|
||||
lines = text.splitlines()
|
||||
for line in lines:
|
||||
@@ -309,7 +322,7 @@ def _smallwebrtc_sdp_cleanup_ice_candidates(text: str, pattern: str) -> str:
|
||||
result.append(line)
|
||||
else:
|
||||
result.append(line)
|
||||
return "\r\n".join(result)
|
||||
return "\r\n".join(result) + "\r\n"
|
||||
|
||||
|
||||
def _smallwebrtc_sdp_cleanup_fingerprints(text: str) -> str:
|
||||
@@ -321,15 +334,16 @@ def _smallwebrtc_sdp_cleanup_fingerprints(text: str) -> str:
|
||||
Returns:
|
||||
SDP text with sha-384 and sha-512 fingerprints removed.
|
||||
"""
|
||||
logger.debug("Removing unsupported fingerprints from SDP")
|
||||
result = []
|
||||
lines = text.splitlines()
|
||||
for line in lines:
|
||||
if not re.search("sha-384", line) and not re.search("sha-512", line):
|
||||
result.append(line)
|
||||
return "\r\n".join(result)
|
||||
return "\r\n".join(result) + "\r\n"
|
||||
|
||||
|
||||
def smallwebrtc_sdp_munging(sdp: str, host: str) -> str:
|
||||
def smallwebrtc_sdp_munging(sdp: str, host: Optional[str]) -> str:
|
||||
"""Apply SDP modifications for SmallWebRTC compatibility.
|
||||
|
||||
Args:
|
||||
@@ -340,7 +354,8 @@ def smallwebrtc_sdp_munging(sdp: str, host: str) -> str:
|
||||
Modified SDP string with fingerprint and ICE candidate cleanup.
|
||||
"""
|
||||
sdp = _smallwebrtc_sdp_cleanup_fingerprints(sdp)
|
||||
sdp = _smallwebrtc_sdp_cleanup_ice_candidates(sdp, host)
|
||||
if host:
|
||||
sdp = _smallwebrtc_sdp_cleanup_ice_candidates(sdp, host)
|
||||
return sdp
|
||||
|
||||
|
||||
|
||||
@@ -21,9 +21,9 @@ from pipecat.frames.frames import (
|
||||
InputAudioRawFrame,
|
||||
InputDTMFFrame,
|
||||
InterruptionFrame,
|
||||
OutputTransportMessageFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
StartFrame,
|
||||
TransportMessageFrame,
|
||||
TransportMessageUrgentFrame,
|
||||
)
|
||||
from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializerType
|
||||
|
||||
@@ -121,7 +121,7 @@ class ExotelFrameSerializer(FrameSerializer):
|
||||
}
|
||||
|
||||
return json.dumps(answer)
|
||||
elif isinstance(frame, (TransportMessageFrame, TransportMessageUrgentFrame)):
|
||||
elif isinstance(frame, (OutputTransportMessageFrame, OutputTransportMessageUrgentFrame)):
|
||||
return json.dumps(frame.message)
|
||||
|
||||
return None
|
||||
|
||||
@@ -25,11 +25,31 @@ except ModuleNotFoundError as e:
|
||||
class LivekitFrameSerializer(FrameSerializer):
|
||||
"""Serializer for converting between Pipecat frames and LiveKit audio frames.
|
||||
|
||||
.. deprecated:: 0.0.90
|
||||
|
||||
This class is deprecated and will be removed in a future version.
|
||||
Please use LiveKitTransport instead, which handles audio streaming
|
||||
and frame conversion natively.
|
||||
|
||||
This serializer handles the conversion of Pipecat's OutputAudioRawFrame objects
|
||||
to LiveKit AudioFrame objects for transmission, and the reverse conversion
|
||||
for received audio data.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the LiveKit frame serializer."""
|
||||
super().__init__()
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"LivekitFrameSerializer is deprecated and will be removed in a future version. "
|
||||
"Please use LiveKitTransport instead, which handles audio streaming natively.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
@property
|
||||
def type(self) -> FrameSerializerType:
|
||||
"""Get the serializer type.
|
||||
|
||||
@@ -23,9 +23,9 @@ from pipecat.frames.frames import (
|
||||
InputAudioRawFrame,
|
||||
InputDTMFFrame,
|
||||
InterruptionFrame,
|
||||
OutputTransportMessageFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
StartFrame,
|
||||
TransportMessageFrame,
|
||||
TransportMessageUrgentFrame,
|
||||
)
|
||||
from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializerType
|
||||
|
||||
@@ -148,7 +148,7 @@ class PlivoFrameSerializer(FrameSerializer):
|
||||
}
|
||||
|
||||
return json.dumps(answer)
|
||||
elif isinstance(frame, (TransportMessageFrame, TransportMessageUrgentFrame)):
|
||||
elif isinstance(frame, (OutputTransportMessageFrame, OutputTransportMessageUrgentFrame)):
|
||||
return json.dumps(frame.message)
|
||||
|
||||
# Return None for unhandled frames
|
||||
|
||||
@@ -15,11 +15,12 @@ import pipecat.frames.protobufs.frames_pb2 as frame_protos
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
InputAudioRawFrame,
|
||||
InputTransportMessageFrame,
|
||||
OutputAudioRawFrame,
|
||||
OutputTransportMessageFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
TextFrame,
|
||||
TranscriptionFrame,
|
||||
TransportMessageFrame,
|
||||
TransportMessageUrgentFrame,
|
||||
)
|
||||
from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializerType
|
||||
|
||||
@@ -82,7 +83,7 @@ class ProtobufFrameSerializer(FrameSerializer):
|
||||
Serialized frame as bytes, or None if frame type is not serializable.
|
||||
"""
|
||||
# Wrapping this messages as a JSONFrame to send
|
||||
if isinstance(frame, (TransportMessageFrame, TransportMessageUrgentFrame)):
|
||||
if isinstance(frame, (OutputTransportMessageFrame, OutputTransportMessageUrgentFrame)):
|
||||
frame = MessageFrame(
|
||||
data=json.dumps(frame.message),
|
||||
)
|
||||
@@ -134,11 +135,11 @@ class ProtobufFrameSerializer(FrameSerializer):
|
||||
if "pts" in args_dict:
|
||||
del args_dict["pts"]
|
||||
|
||||
# Special handling for MessageFrame -> TransportMessageUrgentFrame
|
||||
# Special handling for MessageFrame -> OutputTransportMessageUrgentFrame
|
||||
if class_name == MessageFrame:
|
||||
try:
|
||||
msg = json.loads(args_dict["data"])
|
||||
instance = TransportMessageUrgentFrame(message=msg)
|
||||
instance = InputTransportMessageFrame(message=msg)
|
||||
logger.debug(f"ProtobufFrameSerializer: Transport message {instance}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error parsing MessageFrame data: {e}")
|
||||
|
||||
@@ -23,9 +23,9 @@ from pipecat.frames.frames import (
|
||||
InputAudioRawFrame,
|
||||
InputDTMFFrame,
|
||||
InterruptionFrame,
|
||||
OutputTransportMessageFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
StartFrame,
|
||||
TransportMessageFrame,
|
||||
TransportMessageUrgentFrame,
|
||||
)
|
||||
from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializerType
|
||||
|
||||
@@ -175,7 +175,7 @@ class TwilioFrameSerializer(FrameSerializer):
|
||||
}
|
||||
|
||||
return json.dumps(answer)
|
||||
elif isinstance(frame, (TransportMessageFrame, TransportMessageUrgentFrame)):
|
||||
elif isinstance(frame, (OutputTransportMessageFrame, OutputTransportMessageUrgentFrame)):
|
||||
return json.dumps(frame.message)
|
||||
|
||||
# Return None for unhandled frames
|
||||
|
||||
@@ -97,9 +97,7 @@ class AIService(FrameProcessor):
|
||||
pass
|
||||
|
||||
async def _update_settings(self, settings: Mapping[str, Any]):
|
||||
from pipecat.services.openai_realtime_beta.events import (
|
||||
SessionProperties,
|
||||
)
|
||||
from pipecat.services.openai.realtime.events import SessionProperties
|
||||
|
||||
for key, value in settings.items():
|
||||
logger.debug("Update request for:", key, value)
|
||||
@@ -111,9 +109,7 @@ class AIService(FrameProcessor):
|
||||
logger.debug("Attempting to update", key, value)
|
||||
|
||||
try:
|
||||
from pipecat.services.openai_realtime_beta.events import (
|
||||
TurnDetection,
|
||||
)
|
||||
from pipecat.services.openai.realtime.events import TurnDetection
|
||||
|
||||
if isinstance(self._session_properties, SessionProperties):
|
||||
current_properties = self._session_properties
|
||||
|
||||
@@ -108,6 +108,8 @@ class AssemblyAIConnectionParams(BaseModel):
|
||||
end_of_turn_confidence_threshold: Confidence threshold for end-of-turn detection.
|
||||
min_end_of_turn_silence_when_confident: Minimum silence duration when confident about end-of-turn.
|
||||
max_turn_silence: Maximum silence duration before forcing end-of-turn.
|
||||
keyterms_prompt: List of key terms to guide transcription. Will be JSON serialized before sending.
|
||||
speech_model: Select between English and multilingual models. Defaults to "universal-streaming-english".
|
||||
"""
|
||||
|
||||
sample_rate: int = 16000
|
||||
@@ -117,3 +119,7 @@ class AssemblyAIConnectionParams(BaseModel):
|
||||
end_of_turn_confidence_threshold: Optional[float] = None
|
||||
min_end_of_turn_silence_when_confident: Optional[int] = None
|
||||
max_turn_silence: Optional[int] = None
|
||||
keyterms_prompt: Optional[List[str]] = None
|
||||
speech_model: Literal["universal-streaming-english", "universal-streaming-multilingual"] = (
|
||||
"universal-streaming-english"
|
||||
)
|
||||
|
||||
@@ -174,11 +174,16 @@ class AssemblyAISTTService(STTService):
|
||||
|
||||
def _build_ws_url(self) -> str:
|
||||
"""Build WebSocket URL with query parameters using urllib.parse.urlencode."""
|
||||
params = {
|
||||
k: str(v).lower() if isinstance(v, bool) else v
|
||||
for k, v in self._connection_params.model_dump().items()
|
||||
if v is not None
|
||||
}
|
||||
params = {}
|
||||
for k, v in self._connection_params.model_dump().items():
|
||||
if v is not None:
|
||||
if k == "keyterms_prompt":
|
||||
params[k] = json.dumps(v)
|
||||
elif isinstance(v, bool):
|
||||
params[k] = str(v).lower()
|
||||
else:
|
||||
params[k] = v
|
||||
|
||||
if params:
|
||||
query_string = urlencode(params)
|
||||
return f"{self._api_endpoint_base_url}?{query_string}"
|
||||
@@ -197,6 +202,8 @@ class AssemblyAISTTService(STTService):
|
||||
)
|
||||
self._connected = True
|
||||
self._receive_task = self.create_task(self._receive_task_handler())
|
||||
|
||||
await self._call_event_handler("on_connected")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to connect to AssemblyAI: {e}")
|
||||
self._connected = False
|
||||
@@ -238,6 +245,7 @@ class AssemblyAISTTService(STTService):
|
||||
self._websocket = None
|
||||
self._connected = False
|
||||
self._receive_task = None
|
||||
await self._call_event_handler("on_disconnected")
|
||||
|
||||
async def _receive_task_handler(self):
|
||||
"""Handle incoming WebSocket messages."""
|
||||
|
||||
@@ -235,6 +235,8 @@ class AsyncAITTSService(InterruptibleTTSService):
|
||||
}
|
||||
|
||||
await self._get_websocket().send(json.dumps(init_msg))
|
||||
|
||||
await self._call_event_handler("on_connected")
|
||||
except Exception as e:
|
||||
logger.error(f"{self} initialization error: {e}")
|
||||
self._websocket = None
|
||||
@@ -252,6 +254,7 @@ class AsyncAITTSService(InterruptibleTTSService):
|
||||
finally:
|
||||
self._websocket = None
|
||||
self._started = False
|
||||
await self._call_event_handler("on_disconnected")
|
||||
|
||||
def _get_websocket(self):
|
||||
if self._websocket:
|
||||
|
||||
@@ -9,6 +9,7 @@ import sys
|
||||
from pipecat.services import DeprecatedModuleProxy
|
||||
|
||||
from .llm import *
|
||||
from .nova_sonic import *
|
||||
from .stt import *
|
||||
from .tts import *
|
||||
|
||||
|
||||
@@ -61,7 +61,6 @@ from pipecat.utils.tracing.service_decorators import traced_llm
|
||||
|
||||
try:
|
||||
import aioboto3
|
||||
import httpx
|
||||
from botocore.config import Config
|
||||
from botocore.exceptions import ReadTimeoutError
|
||||
except ModuleNotFoundError as e:
|
||||
@@ -1117,7 +1116,7 @@ class AWSBedrockLLMService(LLMService):
|
||||
# also get cancelled.
|
||||
use_completion_tokens_estimate = True
|
||||
raise
|
||||
except httpx.TimeoutException:
|
||||
except (ReadTimeoutError, asyncio.TimeoutError):
|
||||
await self._call_event_handler("on_completion_timeout")
|
||||
except Exception as e:
|
||||
logger.exception(f"{self} exception: {e}")
|
||||
|
||||
0
src/pipecat/services/aws/nova_sonic/__init__.py
Normal file
0
src/pipecat/services/aws/nova_sonic/__init__.py
Normal file
87
src/pipecat/services/aws/nova_sonic/context.py
Normal file
87
src/pipecat/services/aws/nova_sonic/context.py
Normal file
@@ -0,0 +1,87 @@
|
||||
#
|
||||
# Copyright (c) 2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Context management for AWS Nova Sonic LLM service.
|
||||
|
||||
This module provides specialized context aggregators and message handling for AWS Nova Sonic,
|
||||
including conversation history management and role-specific message processing.
|
||||
|
||||
.. deprecated:: 0.0.91
|
||||
AWS Nova Sonic now supports `LLMContext` and `LLMContextAggregatorPair`.
|
||||
Using the new patterns should allow you to not need types from this module.
|
||||
|
||||
BEFORE:
|
||||
```
|
||||
# Setup
|
||||
context = OpenAILLMContext(messages, tools)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
|
||||
# Context frame type
|
||||
frame: OpenAILLMContextFrame
|
||||
|
||||
# Context type
|
||||
context: AWSNovaSonicLLMContext
|
||||
# or
|
||||
context: OpenAILLMContext
|
||||
|
||||
# Reading messages from context
|
||||
messages = context.messages
|
||||
```
|
||||
|
||||
AFTER:
|
||||
```
|
||||
# Setup
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
# Context frame type
|
||||
frame: LLMContextFrame
|
||||
|
||||
# Context type
|
||||
context: LLMContext
|
||||
|
||||
# Reading messages from context
|
||||
messages = context.get_messages()
|
||||
```
|
||||
"""
|
||||
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"Types in pipecat.services.aws.nova_sonic.context are deprecated. \n"
|
||||
"AWS Nova Sonic now supports `LLMContext` and `LLMContextAggregatorPair`. \n"
|
||||
"Using the new patterns should allow you to not need types from this module.\n\n"
|
||||
"BEFORE:\n"
|
||||
"```\n"
|
||||
"# Setup\n"
|
||||
"context = OpenAILLMContext(messages, tools)\n"
|
||||
"context_aggregator = llm.create_context_aggregator(context)\n\n"
|
||||
"# Context frame type\n"
|
||||
"frame: OpenAILLMContextFrame\n\n"
|
||||
"# Context type\n"
|
||||
"context: AWSNovaSonicLLMContext\n"
|
||||
"# or\n"
|
||||
"context: OpenAILLMContext\n\n"
|
||||
"# Reading messages from context\n"
|
||||
"messages = context.messages\n"
|
||||
"```\n\n"
|
||||
"AFTER:\n"
|
||||
"```\n"
|
||||
"# Setup\n"
|
||||
"context = LLMContext(messages, tools)\n"
|
||||
"context_aggregator = LLMContextAggregatorPair(context)\n\n"
|
||||
"# Context frame type\n"
|
||||
"frame: LLMContextFrame\n\n"
|
||||
"# Context type\n"
|
||||
"context: LLMContext\n\n"
|
||||
"# Reading messages from context\n"
|
||||
"messages = context.messages\n"
|
||||
"```",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
25
src/pipecat/services/aws/nova_sonic/frames.py
Normal file
25
src/pipecat/services/aws/nova_sonic/frames.py
Normal file
@@ -0,0 +1,25 @@
|
||||
#
|
||||
# Copyright (c) 2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Custom frames for AWS Nova Sonic LLM service."""
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
from pipecat.frames.frames import DataFrame, FunctionCallResultFrame
|
||||
|
||||
|
||||
@dataclass
|
||||
class AWSNovaSonicFunctionCallResultFrame(DataFrame):
|
||||
"""Frame containing function call result for AWS Nova Sonic processing.
|
||||
|
||||
This frame wraps a standard function call result frame to enable
|
||||
AWS Nova Sonic-specific handling and context updates.
|
||||
|
||||
Parameters:
|
||||
result_frame: The underlying function call result frame.
|
||||
"""
|
||||
|
||||
result_frame: FunctionCallResultFrame
|
||||
1265
src/pipecat/services/aws/nova_sonic/llm.py
Normal file
1265
src/pipecat/services/aws/nova_sonic/llm.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -286,6 +286,7 @@ class AWSTranscribeSTTService(STTService):
|
||||
|
||||
logger.info(f"{self} Successfully connected to AWS Transcribe")
|
||||
|
||||
await self._call_event_handler("on_connected")
|
||||
except Exception as e:
|
||||
logger.error(f"{self} Failed to connect to AWS Transcribe: {e}")
|
||||
await self._disconnect()
|
||||
@@ -310,6 +311,7 @@ class AWSTranscribeSTTService(STTService):
|
||||
logger.warning(f"{self} Error closing WebSocket connection: {e}")
|
||||
finally:
|
||||
self._ws_client = None
|
||||
await self._call_event_handler("on_disconnected")
|
||||
|
||||
def language_to_service_language(self, language: Language) -> str | None:
|
||||
"""Convert internal language enum to AWS Transcribe language code.
|
||||
|
||||
@@ -1 +1,19 @@
|
||||
from .aws import AWSNovaSonicLLMService, Params
|
||||
#
|
||||
# Copyright (c) 2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import warnings
|
||||
|
||||
from pipecat.services.aws.nova_sonic.llm import AWSNovaSonicLLMService, Params
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"Types in pipecat.services.aws_nova_sonic are deprecated. "
|
||||
"Please use the equivalent types from "
|
||||
"pipecat.services.aws.nova_sonic.llm instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -8,360 +8,80 @@
|
||||
|
||||
This module provides specialized context aggregators and message handling for AWS Nova Sonic,
|
||||
including conversation history management and role-specific message processing.
|
||||
|
||||
.. deprecated:: 0.0.91
|
||||
AWS Nova Sonic now supports `LLMContext` and `LLMContextAggregatorPair`.
|
||||
Using the new patterns should allow you to not need types from this module.
|
||||
|
||||
BEFORE:
|
||||
```
|
||||
# Setup
|
||||
context = OpenAILLMContext(messages, tools)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
|
||||
# Context frame type
|
||||
frame: OpenAILLMContextFrame
|
||||
|
||||
# Context type
|
||||
context: AWSNovaSonicLLMContext
|
||||
# or
|
||||
context: OpenAILLMContext
|
||||
|
||||
# Reading messages from context
|
||||
messages = context.messages
|
||||
```
|
||||
|
||||
AFTER:
|
||||
```
|
||||
# Setup
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
# Context frame type
|
||||
frame: LLMContextFrame
|
||||
|
||||
# Context type
|
||||
context: LLMContext
|
||||
|
||||
# Reading messages from context
|
||||
messages = context.get_messages()
|
||||
```
|
||||
"""
|
||||
|
||||
import copy
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
BotStoppedSpeakingFrame,
|
||||
DataFrame,
|
||||
Frame,
|
||||
FunctionCallResultFrame,
|
||||
InterruptionFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMMessagesAppendFrame,
|
||||
LLMMessagesUpdateFrame,
|
||||
LLMSetToolChoiceFrame,
|
||||
LLMSetToolsFrame,
|
||||
TextFrame,
|
||||
UserImageRawFrame,
|
||||
)
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.aws_nova_sonic.frames import AWSNovaSonicFunctionCallResultFrame
|
||||
from pipecat.services.openai.llm import (
|
||||
OpenAIAssistantContextAggregator,
|
||||
OpenAIUserContextAggregator,
|
||||
)
|
||||
|
||||
|
||||
class Role(Enum):
|
||||
"""Roles supported in AWS Nova Sonic conversations.
|
||||
|
||||
Parameters:
|
||||
SYSTEM: System-level messages (not used in conversation history).
|
||||
USER: Messages sent by the user.
|
||||
ASSISTANT: Messages sent by the assistant.
|
||||
TOOL: Messages sent by tools (not used in conversation history).
|
||||
"""
|
||||
|
||||
SYSTEM = "SYSTEM"
|
||||
USER = "USER"
|
||||
ASSISTANT = "ASSISTANT"
|
||||
TOOL = "TOOL"
|
||||
|
||||
|
||||
@dataclass
|
||||
class AWSNovaSonicConversationHistoryMessage:
|
||||
"""A single message in AWS Nova Sonic conversation history.
|
||||
|
||||
Parameters:
|
||||
role: The role of the message sender (USER or ASSISTANT only).
|
||||
text: The text content of the message.
|
||||
"""
|
||||
|
||||
role: Role # only USER and ASSISTANT
|
||||
text: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class AWSNovaSonicConversationHistory:
|
||||
"""Complete conversation history for AWS Nova Sonic initialization.
|
||||
|
||||
Parameters:
|
||||
system_instruction: System-level instruction for the conversation.
|
||||
messages: List of conversation messages between user and assistant.
|
||||
"""
|
||||
|
||||
system_instruction: str = None
|
||||
messages: list[AWSNovaSonicConversationHistoryMessage] = field(default_factory=list)
|
||||
|
||||
|
||||
class AWSNovaSonicLLMContext(OpenAILLMContext):
|
||||
"""Specialized LLM context for AWS Nova Sonic service.
|
||||
|
||||
Extends OpenAI context with Nova Sonic-specific message handling,
|
||||
conversation history management, and text buffering capabilities.
|
||||
"""
|
||||
|
||||
def __init__(self, messages=None, tools=None, **kwargs):
|
||||
"""Initialize AWS Nova Sonic LLM context.
|
||||
|
||||
Args:
|
||||
messages: Initial messages for the context.
|
||||
tools: Available tools for the context.
|
||||
**kwargs: Additional arguments passed to parent class.
|
||||
"""
|
||||
super().__init__(messages=messages, tools=tools, **kwargs)
|
||||
self.__setup_local()
|
||||
|
||||
def __setup_local(self, system_instruction: str = ""):
|
||||
self._assistant_text = ""
|
||||
self._user_text = ""
|
||||
self._system_instruction = system_instruction
|
||||
|
||||
@staticmethod
|
||||
def upgrade_to_nova_sonic(
|
||||
obj: OpenAILLMContext, system_instruction: str
|
||||
) -> "AWSNovaSonicLLMContext":
|
||||
"""Upgrade an OpenAI context to AWS Nova Sonic context.
|
||||
|
||||
Args:
|
||||
obj: The OpenAI context to upgrade.
|
||||
system_instruction: System instruction for the context.
|
||||
|
||||
Returns:
|
||||
The upgraded AWS Nova Sonic context.
|
||||
"""
|
||||
if isinstance(obj, OpenAILLMContext) and not isinstance(obj, AWSNovaSonicLLMContext):
|
||||
obj.__class__ = AWSNovaSonicLLMContext
|
||||
obj.__setup_local(system_instruction)
|
||||
return obj
|
||||
|
||||
# NOTE: this method has the side-effect of updating _system_instruction from messages
|
||||
def get_messages_for_initializing_history(self) -> AWSNovaSonicConversationHistory:
|
||||
"""Get conversation history for initializing AWS Nova Sonic session.
|
||||
|
||||
Processes stored messages and extracts system instruction and conversation
|
||||
history in the format expected by AWS Nova Sonic.
|
||||
|
||||
Returns:
|
||||
Formatted conversation history with system instruction and messages.
|
||||
"""
|
||||
history = AWSNovaSonicConversationHistory(system_instruction=self._system_instruction)
|
||||
|
||||
# Bail if there are no messages
|
||||
if not self.messages:
|
||||
return history
|
||||
|
||||
messages = copy.deepcopy(self.messages)
|
||||
|
||||
# If we have a "system" message as our first message, let's pull that out into "instruction"
|
||||
if messages[0].get("role") == "system":
|
||||
system = messages.pop(0)
|
||||
content = system.get("content")
|
||||
if isinstance(content, str):
|
||||
history.system_instruction = content
|
||||
elif isinstance(content, list):
|
||||
history.system_instruction = content[0].get("text")
|
||||
if history.system_instruction:
|
||||
self._system_instruction = history.system_instruction
|
||||
|
||||
# Process remaining messages to fill out conversation history.
|
||||
# Nova Sonic supports "user" and "assistant" messages in history.
|
||||
for message in messages:
|
||||
history_message = self.from_standard_message(message)
|
||||
if history_message:
|
||||
history.messages.append(history_message)
|
||||
|
||||
return history
|
||||
|
||||
def get_messages_for_persistent_storage(self):
|
||||
"""Get messages formatted for persistent storage.
|
||||
|
||||
Returns:
|
||||
List of messages including system instruction if present.
|
||||
"""
|
||||
messages = super().get_messages_for_persistent_storage()
|
||||
# If we have a system instruction and messages doesn't already contain it, add it
|
||||
if self._system_instruction and not (messages and messages[0].get("role") == "system"):
|
||||
messages.insert(0, {"role": "system", "content": self._system_instruction})
|
||||
return messages
|
||||
|
||||
def from_standard_message(self, message) -> AWSNovaSonicConversationHistoryMessage:
|
||||
"""Convert standard message format to Nova Sonic format.
|
||||
|
||||
Args:
|
||||
message: Standard message dictionary to convert.
|
||||
|
||||
Returns:
|
||||
Nova Sonic conversation history message, or None if not convertible.
|
||||
"""
|
||||
role = message.get("role")
|
||||
if message.get("role") == "user" or message.get("role") == "assistant":
|
||||
content = message.get("content")
|
||||
if isinstance(message.get("content"), list):
|
||||
content = ""
|
||||
for c in message.get("content"):
|
||||
if c.get("type") == "text":
|
||||
content += " " + c.get("text")
|
||||
else:
|
||||
logger.error(
|
||||
f"Unhandled content type in context message: {c.get('type')} - {message}"
|
||||
)
|
||||
# There won't be content if this is an assistant tool call entry.
|
||||
# We're ignoring those since they can't be loaded into AWS Nova Sonic conversation
|
||||
# history
|
||||
if content:
|
||||
return AWSNovaSonicConversationHistoryMessage(role=Role[role.upper()], text=content)
|
||||
# NOTE: we're ignoring messages with role "tool" since they can't be loaded into AWS Nova
|
||||
# Sonic conversation history
|
||||
|
||||
def buffer_user_text(self, text):
|
||||
"""Buffer user text for later flushing to context.
|
||||
|
||||
Args:
|
||||
text: User text to buffer.
|
||||
"""
|
||||
self._user_text += f" {text}" if self._user_text else text
|
||||
# logger.debug(f"User text buffered: {self._user_text}")
|
||||
|
||||
def flush_aggregated_user_text(self) -> str:
|
||||
"""Flush buffered user text to context as a complete message.
|
||||
|
||||
Returns:
|
||||
The flushed user text, or empty string if no text was buffered.
|
||||
"""
|
||||
if not self._user_text:
|
||||
return ""
|
||||
user_text = self._user_text
|
||||
message = {
|
||||
"role": "user",
|
||||
"content": [{"type": "text", "text": user_text}],
|
||||
}
|
||||
self._user_text = ""
|
||||
self.add_message(message)
|
||||
# logger.debug(f"Context updated (user): {self.get_messages_for_logging()}")
|
||||
return user_text
|
||||
|
||||
def buffer_assistant_text(self, text):
|
||||
"""Buffer assistant text for later flushing to context.
|
||||
|
||||
Args:
|
||||
text: Assistant text to buffer.
|
||||
"""
|
||||
self._assistant_text += text
|
||||
# logger.debug(f"Assistant text buffered: {self._assistant_text}")
|
||||
|
||||
def flush_aggregated_assistant_text(self):
|
||||
"""Flush buffered assistant text to context as a complete message."""
|
||||
if not self._assistant_text:
|
||||
return
|
||||
message = {
|
||||
"role": "assistant",
|
||||
"content": [{"type": "text", "text": self._assistant_text}],
|
||||
}
|
||||
self._assistant_text = ""
|
||||
self.add_message(message)
|
||||
# logger.debug(f"Context updated (assistant): {self.get_messages_for_logging()}")
|
||||
|
||||
|
||||
@dataclass
|
||||
class AWSNovaSonicMessagesUpdateFrame(DataFrame):
|
||||
"""Frame containing updated AWS Nova Sonic context.
|
||||
|
||||
Parameters:
|
||||
context: The updated AWS Nova Sonic LLM context.
|
||||
"""
|
||||
|
||||
context: AWSNovaSonicLLMContext
|
||||
|
||||
|
||||
class AWSNovaSonicUserContextAggregator(OpenAIUserContextAggregator):
|
||||
"""Context aggregator for user messages in AWS Nova Sonic conversations.
|
||||
|
||||
Extends the OpenAI user context aggregator to emit Nova Sonic-specific
|
||||
context update frames.
|
||||
"""
|
||||
|
||||
async def process_frame(
|
||||
self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM
|
||||
):
|
||||
"""Process frames and emit Nova Sonic-specific context updates.
|
||||
|
||||
Args:
|
||||
frame: The frame to process.
|
||||
direction: The direction the frame is traveling.
|
||||
"""
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
# Parent does not push LLMMessagesUpdateFrame
|
||||
if isinstance(frame, LLMMessagesUpdateFrame):
|
||||
await self.push_frame(AWSNovaSonicMessagesUpdateFrame(context=self._context))
|
||||
|
||||
|
||||
class AWSNovaSonicAssistantContextAggregator(OpenAIAssistantContextAggregator):
|
||||
"""Context aggregator for assistant messages in AWS Nova Sonic conversations.
|
||||
|
||||
Provides specialized handling for assistant responses and function calls
|
||||
in AWS Nova Sonic context, with custom frame processing logic.
|
||||
"""
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process frames with Nova Sonic-specific logic.
|
||||
|
||||
Args:
|
||||
frame: The frame to process.
|
||||
direction: The direction the frame is traveling.
|
||||
"""
|
||||
# HACK: For now, disable the context aggregator by making it just pass through all frames
|
||||
# that the parent handles (except the function call stuff, which we still need).
|
||||
# For an explanation of this hack, see
|
||||
# AWSNovaSonicLLMService._report_assistant_response_text_added.
|
||||
if isinstance(
|
||||
frame,
|
||||
(
|
||||
InterruptionFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
TextFrame,
|
||||
LLMMessagesAppendFrame,
|
||||
LLMMessagesUpdateFrame,
|
||||
LLMSetToolsFrame,
|
||||
LLMSetToolChoiceFrame,
|
||||
UserImageRawFrame,
|
||||
BotStoppedSpeakingFrame,
|
||||
),
|
||||
):
|
||||
await self.push_frame(frame, direction)
|
||||
else:
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
async def handle_function_call_result(self, frame: FunctionCallResultFrame):
|
||||
"""Handle function call results for AWS Nova Sonic.
|
||||
|
||||
Args:
|
||||
frame: The function call result frame to handle.
|
||||
"""
|
||||
await super().handle_function_call_result(frame)
|
||||
|
||||
# The standard function callback code path pushes the FunctionCallResultFrame from the LLM
|
||||
# itself, so we didn't have a chance to add the result to the AWS Nova Sonic server-side
|
||||
# context. Let's push a special frame to do that.
|
||||
await self.push_frame(
|
||||
AWSNovaSonicFunctionCallResultFrame(result_frame=frame), FrameDirection.UPSTREAM
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class AWSNovaSonicContextAggregatorPair:
|
||||
"""Pair of user and assistant context aggregators for AWS Nova Sonic.
|
||||
|
||||
Parameters:
|
||||
_user: The user context aggregator.
|
||||
_assistant: The assistant context aggregator.
|
||||
"""
|
||||
|
||||
_user: AWSNovaSonicUserContextAggregator
|
||||
_assistant: AWSNovaSonicAssistantContextAggregator
|
||||
|
||||
def user(self) -> AWSNovaSonicUserContextAggregator:
|
||||
"""Get the user context aggregator.
|
||||
|
||||
Returns:
|
||||
The user context aggregator instance.
|
||||
"""
|
||||
return self._user
|
||||
|
||||
def assistant(self) -> AWSNovaSonicAssistantContextAggregator:
|
||||
"""Get the assistant context aggregator.
|
||||
|
||||
Returns:
|
||||
The assistant context aggregator instance.
|
||||
"""
|
||||
return self._assistant
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"Types in pipecat.services.aws_nova_sonic.context are deprecated. \n"
|
||||
"AWS Nova Sonic now supports `LLMContext` and `LLMContextAggregatorPair`. \n"
|
||||
"Using the new patterns should allow you to not need types from this module.\n\n"
|
||||
"BEFORE:\n"
|
||||
"```\n"
|
||||
"# Setup\n"
|
||||
"context = OpenAILLMContext(messages, tools)\n"
|
||||
"context_aggregator = llm.create_context_aggregator(context)\n\n"
|
||||
"# Context frame type\n"
|
||||
"frame: OpenAILLMContextFrame\n\n"
|
||||
"# Context type\n"
|
||||
"context: AWSNovaSonicLLMContext\n"
|
||||
"# or\n"
|
||||
"context: OpenAILLMContext\n\n"
|
||||
"# Reading messages from context\n"
|
||||
"messages = context.messages\n"
|
||||
"```\n\n"
|
||||
"AFTER:\n"
|
||||
"```\n"
|
||||
"# Setup\n"
|
||||
"context = LLMContext(messages, tools)\n"
|
||||
"context_aggregator = LLMContextAggregatorPair(context)\n\n"
|
||||
"# Context frame type\n"
|
||||
"frame: LLMContextFrame\n\n"
|
||||
"# Context type\n"
|
||||
"context: LLMContext\n\n"
|
||||
"# Reading messages from context\n"
|
||||
"messages = context.messages\n"
|
||||
"```",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
@@ -6,20 +6,16 @@
|
||||
|
||||
"""Custom frames for AWS Nova Sonic LLM service."""
|
||||
|
||||
from dataclasses import dataclass
|
||||
import warnings
|
||||
|
||||
from pipecat.frames.frames import DataFrame, FunctionCallResultFrame
|
||||
from pipecat.services.aws.nova_sonic.frames import *
|
||||
|
||||
|
||||
@dataclass
|
||||
class AWSNovaSonicFunctionCallResultFrame(DataFrame):
|
||||
"""Frame containing function call result for AWS Nova Sonic processing.
|
||||
|
||||
This frame wraps a standard function call result frame to enable
|
||||
AWS Nova Sonic-specific handling and context updates.
|
||||
|
||||
Parameters:
|
||||
result_frame: The underlying function call result frame.
|
||||
"""
|
||||
|
||||
result_frame: FunctionCallResultFrame
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"Types in pipecat.services.aws_nova_sonic.frames are deprecated. "
|
||||
"Please use the equivalent types from "
|
||||
"pipecat.services.aws.nova_sonic.frames instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
0
src/pipecat/services/azure/realtime/__init__.py
Normal file
0
src/pipecat/services/azure/realtime/__init__.py
Normal file
65
src/pipecat/services/azure/realtime/llm.py
Normal file
65
src/pipecat/services/azure/realtime/llm.py
Normal file
@@ -0,0 +1,65 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Azure OpenAI Realtime LLM service implementation."""
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService
|
||||
|
||||
try:
|
||||
from websockets.asyncio.client import connect as websocket_connect
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error("In order to use Azure Realtime, you need to `pip install pipecat-ai[openai]`.")
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
class AzureRealtimeLLMService(OpenAIRealtimeLLMService):
|
||||
"""Azure OpenAI Realtime LLM service with Azure-specific authentication.
|
||||
|
||||
Extends the OpenAI Realtime service to work with Azure OpenAI endpoints,
|
||||
using Azure's authentication headers and endpoint format. Provides the same
|
||||
real-time audio and text communication capabilities as the base OpenAI service.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
api_key: str,
|
||||
base_url: str,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize Azure Realtime LLM service.
|
||||
|
||||
Args:
|
||||
api_key: The API key for the Azure OpenAI service.
|
||||
base_url: The full Azure WebSocket endpoint URL including api-version and deployment.
|
||||
Example: "wss://my-project.openai.azure.com/openai/realtime?api-version=2024-10-01-preview&deployment=my-realtime-deployment"
|
||||
**kwargs: Additional arguments passed to parent OpenAIRealtimeLLMService.
|
||||
"""
|
||||
super().__init__(base_url=base_url, api_key=api_key, **kwargs)
|
||||
self.api_key = api_key
|
||||
self.base_url = base_url
|
||||
|
||||
async def _connect(self):
|
||||
try:
|
||||
if self._websocket:
|
||||
# Here we assume that if we have a websocket, we are connected. We
|
||||
# handle disconnections in the send/recv code paths.
|
||||
return
|
||||
|
||||
logger.info(f"Connecting to {self.base_url}, api key: {self.api_key}")
|
||||
self._websocket = await websocket_connect(
|
||||
uri=self.base_url,
|
||||
additional_headers={
|
||||
"api-key": self.api_key,
|
||||
},
|
||||
)
|
||||
self._receive_task = self.create_task(self._receive_task_handler())
|
||||
except Exception as e:
|
||||
logger.error(f"{self} initialization error: {e}")
|
||||
self._websocket = None
|
||||
@@ -28,13 +28,12 @@ from pipecat.frames.frames import (
|
||||
UserStoppedSpeakingFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.stt_service import STTService
|
||||
from pipecat.services.stt_service import WebsocketSTTService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
from pipecat.utils.tracing.service_decorators import traced_stt
|
||||
|
||||
try:
|
||||
import websockets
|
||||
from websockets.asyncio.client import connect as websocket_connect
|
||||
from websockets.protocol import State
|
||||
except ModuleNotFoundError as e:
|
||||
@@ -124,7 +123,7 @@ class CartesiaLiveOptions:
|
||||
return cls(**json.loads(json_str))
|
||||
|
||||
|
||||
class CartesiaSTTService(STTService):
|
||||
class CartesiaSTTService(WebsocketSTTService):
|
||||
"""Speech-to-text service using Cartesia Live API.
|
||||
|
||||
Provides real-time speech transcription through WebSocket connection
|
||||
@@ -176,8 +175,7 @@ class CartesiaSTTService(STTService):
|
||||
self.set_model_name(merged_options.model)
|
||||
self._api_key = api_key
|
||||
self._base_url = base_url or "api.cartesia.ai"
|
||||
self._connection = None
|
||||
self._receiver_task = None
|
||||
self._receive_task = None
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
"""Check if the service can generate processing metrics.
|
||||
@@ -214,6 +212,27 @@ class CartesiaSTTService(STTService):
|
||||
await super().cancel(frame)
|
||||
await self._disconnect()
|
||||
|
||||
async def start_metrics(self):
|
||||
"""Start performance metrics collection for transcription processing."""
|
||||
await self.start_ttfb_metrics()
|
||||
await self.start_processing_metrics()
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process incoming frames and handle speech events.
|
||||
|
||||
Args:
|
||||
frame: The frame to process.
|
||||
direction: Direction of frame flow in the pipeline.
|
||||
"""
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, UserStartedSpeakingFrame):
|
||||
await self.start_metrics()
|
||||
elif isinstance(frame, UserStoppedSpeakingFrame):
|
||||
# Send finalize command to flush the transcription session
|
||||
if self._websocket and self._websocket.state is State.OPEN:
|
||||
await self._websocket.send("finalize")
|
||||
|
||||
async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
|
||||
"""Process audio data for speech-to-text transcription.
|
||||
|
||||
@@ -224,45 +243,71 @@ class CartesiaSTTService(STTService):
|
||||
None - transcription results are handled via WebSocket responses.
|
||||
"""
|
||||
# If the connection is closed, due to timeout, we need to reconnect when the user starts speaking again
|
||||
if not self._connection or self._connection.state is State.CLOSED:
|
||||
if not self._websocket or self._websocket.state is State.CLOSED:
|
||||
await self._connect()
|
||||
|
||||
await self._connection.send(audio)
|
||||
await self._websocket.send(audio)
|
||||
yield None
|
||||
|
||||
async def _connect(self):
|
||||
params = self._settings.to_dict()
|
||||
ws_url = f"wss://{self._base_url}/stt/websocket?{urllib.parse.urlencode(params)}"
|
||||
logger.debug(f"Connecting to Cartesia: {ws_url}")
|
||||
headers = {"Cartesia-Version": "2025-04-16", "X-API-Key": self._api_key}
|
||||
await self._connect_websocket()
|
||||
|
||||
if self._websocket and not self._receive_task:
|
||||
self._receive_task = asyncio.create_task(self._receive_task_handler(self._report_error))
|
||||
|
||||
async def _disconnect(self):
|
||||
if self._receive_task:
|
||||
await self.cancel_task(self._receive_task)
|
||||
self._receive_task = None
|
||||
|
||||
await self._disconnect_websocket()
|
||||
|
||||
async def _connect_websocket(self):
|
||||
try:
|
||||
self._connection = await websocket_connect(ws_url, additional_headers=headers)
|
||||
# Setup the receiver task to handle the incoming messages from the Cartesia server
|
||||
if self._receiver_task is None or self._receiver_task.done():
|
||||
self._receiver_task = asyncio.create_task(self._receive_messages())
|
||||
logger.debug(f"Connected to Cartesia")
|
||||
if self._websocket and self._websocket.state is State.OPEN:
|
||||
return
|
||||
logger.debug("Connecting to Cartesia STT")
|
||||
|
||||
params = self._settings.to_dict()
|
||||
ws_url = f"wss://{self._base_url}/stt/websocket?{urllib.parse.urlencode(params)}"
|
||||
headers = {"Cartesia-Version": "2025-04-16", "X-API-Key": self._api_key}
|
||||
|
||||
self._websocket = await websocket_connect(ws_url, additional_headers=headers)
|
||||
await self._call_event_handler("on_connected")
|
||||
except Exception as e:
|
||||
logger.error(f"{self}: unable to connect to Cartesia: {e}")
|
||||
|
||||
async def _receive_messages(self):
|
||||
async def _disconnect_websocket(self):
|
||||
try:
|
||||
while True:
|
||||
if not self._connection or self._connection.state is State.CLOSED:
|
||||
break
|
||||
|
||||
message = await self._connection.recv()
|
||||
try:
|
||||
data = json.loads(message)
|
||||
await self._process_response(data)
|
||||
except json.JSONDecodeError:
|
||||
logger.warning(f"Received non-JSON message: {message}")
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
except websockets.exceptions.ConnectionClosed as e:
|
||||
logger.debug(f"WebSocket connection closed: {e}")
|
||||
if self._websocket and self._websocket.state is State.OPEN:
|
||||
logger.debug("Disconnecting from Cartesia STT")
|
||||
await self._websocket.close()
|
||||
except Exception as e:
|
||||
logger.error(f"Error in message receiver: {e}")
|
||||
logger.error(f"{self} error closing websocket: {e}")
|
||||
finally:
|
||||
self._websocket = None
|
||||
await self._call_event_handler("on_disconnected")
|
||||
|
||||
def _get_websocket(self):
|
||||
if self._websocket:
|
||||
return self._websocket
|
||||
raise Exception("Websocket not connected")
|
||||
|
||||
async def _process_messages(self):
|
||||
async for message in self._get_websocket():
|
||||
try:
|
||||
data = json.loads(message)
|
||||
await self._process_response(data)
|
||||
except json.JSONDecodeError:
|
||||
logger.warning(f"Received non-JSON message: {message}")
|
||||
|
||||
async def _receive_messages(self):
|
||||
while True:
|
||||
await self._process_messages()
|
||||
# Cartesia times out after 5 minutes of innactivity (no keepalive
|
||||
# mechanism is available). So, we try to reconnect.
|
||||
logger.debug(f"{self} Cartesia connection was disconnected (timeout?), reconnecting")
|
||||
await self._connect_websocket()
|
||||
|
||||
async def _process_response(self, data):
|
||||
if "type" in data:
|
||||
@@ -316,41 +361,3 @@ class CartesiaSTTService(STTService):
|
||||
language,
|
||||
)
|
||||
)
|
||||
|
||||
async def _disconnect(self):
|
||||
if self._receiver_task:
|
||||
self._receiver_task.cancel()
|
||||
try:
|
||||
await self._receiver_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
except Exception as e:
|
||||
logger.exception(f"Unexpected exception while cancelling task: {e}")
|
||||
self._receiver_task = None
|
||||
|
||||
if self._connection and self._connection.state is State.OPEN:
|
||||
logger.debug("Disconnecting from Cartesia")
|
||||
|
||||
await self._connection.close()
|
||||
self._connection = None
|
||||
|
||||
async def start_metrics(self):
|
||||
"""Start performance metrics collection for transcription processing."""
|
||||
await self.start_ttfb_metrics()
|
||||
await self.start_processing_metrics()
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process incoming frames and handle speech events.
|
||||
|
||||
Args:
|
||||
frame: The frame to process.
|
||||
direction: Direction of frame flow in the pipeline.
|
||||
"""
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, UserStartedSpeakingFrame):
|
||||
await self.start_metrics()
|
||||
elif isinstance(frame, UserStoppedSpeakingFrame):
|
||||
# Send finalize command to flush the transcription session
|
||||
if self._connection and self._connection.state is State.OPEN:
|
||||
await self._connection.send("finalize")
|
||||
|
||||
@@ -344,10 +344,11 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
||||
try:
|
||||
if self._websocket and self._websocket.state is State.OPEN:
|
||||
return
|
||||
logger.debug("Connecting to Cartesia")
|
||||
logger.debug("Connecting to Cartesia TTS")
|
||||
self._websocket = await websocket_connect(
|
||||
f"{self._url}?api_key={self._api_key}&cartesia_version={self._cartesia_version}"
|
||||
)
|
||||
await self._call_event_handler("on_connected")
|
||||
except Exception as e:
|
||||
logger.error(f"{self} initialization error: {e}")
|
||||
self._websocket = None
|
||||
@@ -365,6 +366,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
||||
finally:
|
||||
self._context_id = None
|
||||
self._websocket = None
|
||||
await self._call_event_handler("on_disconnected")
|
||||
|
||||
def _get_websocket(self):
|
||||
if self._websocket:
|
||||
|
||||
@@ -8,6 +8,7 @@ import sys
|
||||
|
||||
from pipecat.services import DeprecatedModuleProxy
|
||||
|
||||
from .flux import *
|
||||
from .stt import *
|
||||
from .tts import *
|
||||
|
||||
|
||||
0
src/pipecat/services/deepgram/flux/__init__.py
Normal file
0
src/pipecat/services/deepgram/flux/__init__.py
Normal file
640
src/pipecat/services/deepgram/flux/stt.py
Normal file
640
src/pipecat/services/deepgram/flux/stt.py
Normal file
@@ -0,0 +1,640 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Deepgram Flux speech-to-text service implementation."""
|
||||
|
||||
import json
|
||||
from enum import Enum
|
||||
from typing import Any, AsyncGenerator, Dict, Optional
|
||||
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
EndFrame,
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
InterimTranscriptionFrame,
|
||||
StartFrame,
|
||||
TranscriptionFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.stt_service import WebsocketSTTService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
from pipecat.utils.tracing.service_decorators import traced_stt
|
||||
|
||||
try:
|
||||
from websockets.asyncio.client import connect as websocket_connect
|
||||
from websockets.protocol import State
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error("In order to use Deepgram Flux, you need to `pip install pipecat-ai[deepgram]`.")
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
class FluxMessageType(str, Enum):
|
||||
"""Deepgram Flux WebSocket message types.
|
||||
|
||||
These are the top-level message types that can be received from the
|
||||
Deepgram Flux WebSocket connection.
|
||||
"""
|
||||
|
||||
RECEIVE_CONNECTED = "Connected"
|
||||
RECEIVE_FATAL_ERROR = "Error"
|
||||
TURN_INFO = "TurnInfo"
|
||||
|
||||
|
||||
class FluxEventType(str, Enum):
|
||||
"""Deepgram Flux TurnInfo event types.
|
||||
|
||||
These events are contained within TurnInfo messages and indicate
|
||||
different stages of speech processing and turn detection.
|
||||
"""
|
||||
|
||||
START_OF_TURN = "StartOfTurn"
|
||||
TURN_RESUMED = "TurnResumed"
|
||||
END_OF_TURN = "EndOfTurn"
|
||||
EAGER_END_OF_TURN = "EagerEndOfTurn"
|
||||
UPDATE = "Update"
|
||||
|
||||
|
||||
class DeepgramFluxSTTService(WebsocketSTTService):
|
||||
"""Deepgram Flux speech-to-text service.
|
||||
|
||||
Provides real-time speech recognition using Deepgram's WebSocket API with Flux capabilities.
|
||||
Supports configurable models, VAD events, and various audio processing options
|
||||
including advanced turn detection and EagerEndOfTurn events for improved conversational AI performance.
|
||||
"""
|
||||
|
||||
class InputParams(BaseModel):
|
||||
"""Configuration parameters for Deepgram Flux API.
|
||||
|
||||
This class defines all available connection parameters for the Deepgram Flux API
|
||||
based on the official documentation.
|
||||
|
||||
Parameters:
|
||||
eager_eot_threshold: Optional. EagerEndOfTurn/TurnResumed are off by default.
|
||||
You can turn them on by setting eager_eot_threshold to a valid value.
|
||||
Lower values = more aggressive EagerEndOfTurning (faster response, more LLM calls).
|
||||
Higher values = more conservative EagerEndOfTurning (slower response, fewer LLM calls).
|
||||
eot_threshold: Optional. End-of-turn confidence required to finish a turn (default 0.7).
|
||||
Lower values = turns end sooner (more interruptions, faster responses).
|
||||
Higher values = turns end later (fewer interruptions, more complete utterances).
|
||||
eot_timeout_ms: Optional. Time in milliseconds after speech to finish a turn
|
||||
regardless of EOT confidence (default 5000).
|
||||
keyterm: List of keyterms to boost recognition accuracy for specialized terminology.
|
||||
mip_opt_out: Optional. Opts out requests from the Deepgram Model Improvement Program
|
||||
(default False).
|
||||
tag: List of tags to label requests for identification during usage reporting.
|
||||
"""
|
||||
|
||||
eager_eot_threshold: Optional[float] = None
|
||||
eot_threshold: Optional[float] = None
|
||||
eot_timeout_ms: Optional[int] = None
|
||||
keyterm: list = []
|
||||
mip_opt_out: Optional[bool] = None
|
||||
tag: list = []
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
api_key: str,
|
||||
url: str = "wss://api.deepgram.com/v2/listen",
|
||||
sample_rate: Optional[int] = None,
|
||||
model: str = "flux-general-en",
|
||||
flux_encoding: str = "linear16",
|
||||
params: Optional[InputParams] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the Deepgram Flux STT service.
|
||||
|
||||
Args:
|
||||
api_key: Deepgram API key for authentication. Required for API access.
|
||||
url: WebSocket URL for the Deepgram Flux API. Defaults to the preview endpoint.
|
||||
sample_rate: Audio sample rate in Hz. If None, uses the rate from params or 16000.
|
||||
model: Deepgram Flux model to use for transcription. Currently only supports "flux-general-en".
|
||||
flux_encoding: Audio encoding format required by Flux API. Must be "linear16".
|
||||
Raw signed little-endian 16-bit PCM encoding.
|
||||
params: InputParams instance containing detailed API configuration options.
|
||||
If None, default parameters will be used.
|
||||
**kwargs: Additional arguments passed to the parent WebsocketSTTService class.
|
||||
|
||||
Examples:
|
||||
Basic usage with default parameters::
|
||||
|
||||
stt = DeepgramFluxSTTService(api_key="your-api-key")
|
||||
|
||||
Advanced usage with custom parameters::
|
||||
|
||||
params = DeepgramFluxSTTService.InputParams(
|
||||
eager_eot_threshold=0.5,
|
||||
eot_threshold=0.8,
|
||||
keyterm=["AI", "machine learning", "neural network"],
|
||||
tag=["production", "voice-agent"]
|
||||
)
|
||||
stt = DeepgramFluxSTTService(
|
||||
api_key="your-api-key",
|
||||
model="flux-general-en",
|
||||
params=params
|
||||
)
|
||||
"""
|
||||
super().__init__(sample_rate=sample_rate, **kwargs)
|
||||
|
||||
self._api_key = api_key
|
||||
self._url = url
|
||||
self._model = model
|
||||
self._params = params or DeepgramFluxSTTService.InputParams()
|
||||
self._flux_encoding = flux_encoding
|
||||
# This is the currently only supported language
|
||||
self._language = Language.EN
|
||||
self._websocket_url = None
|
||||
self._receive_task = None
|
||||
|
||||
async def _connect(self):
|
||||
"""Connect to WebSocket and start background tasks.
|
||||
|
||||
Establishes the WebSocket connection to the Deepgram Flux API and starts
|
||||
the background task for receiving transcription results.
|
||||
"""
|
||||
await self._connect_websocket()
|
||||
|
||||
if self._websocket and not self._receive_task:
|
||||
self._receive_task = self.create_task(self._receive_task_handler(self._report_error))
|
||||
|
||||
async def _disconnect(self):
|
||||
"""Disconnect from WebSocket and clean up tasks.
|
||||
|
||||
Gracefully disconnects from the Deepgram Flux API, cancels background tasks,
|
||||
and cleans up resources to prevent memory leaks.
|
||||
"""
|
||||
try:
|
||||
# Cancel background tasks BEFORE closing websocket
|
||||
if self._receive_task:
|
||||
await self.cancel_task(self._receive_task, timeout=2.0)
|
||||
self._receive_task = None
|
||||
|
||||
# Now close the websocket
|
||||
await self._disconnect_websocket()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error during disconnect: {e}")
|
||||
finally:
|
||||
# Reset state only after everything is cleaned up
|
||||
self._websocket = None
|
||||
|
||||
async def _connect_websocket(self):
|
||||
"""Establish WebSocket connection to API.
|
||||
|
||||
Creates a WebSocket connection to the Deepgram Flux API using the configured
|
||||
URL and authentication headers. Handles connection errors and reports them
|
||||
through the event handler system.
|
||||
"""
|
||||
try:
|
||||
if self._websocket and self._websocket.state is State.OPEN:
|
||||
return
|
||||
|
||||
self._websocket = await websocket_connect(
|
||||
self._websocket_url,
|
||||
additional_headers={"Authorization": f"Token {self._api_key}"},
|
||||
)
|
||||
logger.debug("Connected to Deepgram Flux Websocket")
|
||||
await self._call_event_handler("on_connected")
|
||||
except Exception as e:
|
||||
logger.error(f"{self} initialization error: {e}")
|
||||
self._websocket = None
|
||||
await self._call_event_handler("on_connection_error", f"{e}")
|
||||
|
||||
async def _disconnect_websocket(self):
|
||||
"""Close WebSocket connection and clean up state.
|
||||
|
||||
Closes the WebSocket connection to the Deepgram Flux API and stops all
|
||||
metrics collection. Handles disconnection errors gracefully.
|
||||
"""
|
||||
try:
|
||||
await self.stop_all_metrics()
|
||||
|
||||
if self._websocket:
|
||||
await self._send_close_stream()
|
||||
logger.debug("Disconnecting from Deepgram Flux Websocket")
|
||||
await self._websocket.close()
|
||||
except Exception as e:
|
||||
logger.error(f"{self} error closing websocket: {e}")
|
||||
finally:
|
||||
self._websocket = None
|
||||
await self._call_event_handler("on_disconnected")
|
||||
|
||||
async def _send_close_stream(self) -> None:
|
||||
"""Sends a CloseStream control message to the Deepgram Flux WebSocket API.
|
||||
|
||||
This signals to the server that no more audio data will be sent.
|
||||
"""
|
||||
if self._websocket:
|
||||
logger.debug("Sending CloseStream message to Deepgram Flux")
|
||||
message = {"type": "CloseStream"}
|
||||
await self._websocket.send(json.dumps(message))
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
"""Check if this service can generate processing metrics.
|
||||
|
||||
Returns:
|
||||
True, as Deepgram service supports metrics generation.
|
||||
"""
|
||||
return True
|
||||
|
||||
async def start(self, frame: StartFrame):
|
||||
"""Start the Deepgram Flux STT service.
|
||||
|
||||
Initializes the service by constructing the WebSocket URL with all configured
|
||||
parameters and establishing the connection to begin transcription processing.
|
||||
|
||||
Args:
|
||||
frame: The start frame containing initialization parameters and metadata.
|
||||
"""
|
||||
await super().start(frame)
|
||||
|
||||
url_params = [
|
||||
f"model={self._model}",
|
||||
f"sample_rate={self.sample_rate}",
|
||||
f"encoding={self._flux_encoding}",
|
||||
]
|
||||
|
||||
if self._params.eager_eot_threshold is not None:
|
||||
url_params.append(f"eager_eot_threshold={self._params.eager_eot_threshold}")
|
||||
|
||||
if self._params.eot_threshold is not None:
|
||||
url_params.append(f"eot_threshold={self._params.eot_threshold}")
|
||||
|
||||
if self._params.eot_timeout_ms is not None:
|
||||
url_params.append(f"eot_timeout_ms={self._params.eot_timeout_ms}")
|
||||
|
||||
if self._params.mip_opt_out is not None:
|
||||
url_params.append(f"mip_opt_out={str(self._params.mip_opt_out).lower()}")
|
||||
|
||||
# Add keyterm parameters (can have multiple)
|
||||
for keyterm in self._params.keyterm:
|
||||
url_params.append(f"keyterm={keyterm}")
|
||||
|
||||
# Add tag parameters (can have multiple)
|
||||
for tag_value in self._params.tag:
|
||||
url_params.append(f"tag={tag_value}")
|
||||
|
||||
self._websocket_url = f"{self._url}?{'&'.join(url_params)}"
|
||||
await self._connect()
|
||||
|
||||
async def stop(self, frame: EndFrame):
|
||||
"""Stop the Deepgram Flux STT service.
|
||||
|
||||
Args:
|
||||
frame: The end frame.
|
||||
"""
|
||||
await super().stop(frame)
|
||||
await self._disconnect()
|
||||
|
||||
async def cancel(self, frame: CancelFrame):
|
||||
"""Cancel the Deepgram Flux STT service.
|
||||
|
||||
Args:
|
||||
frame: The cancel frame.
|
||||
"""
|
||||
await super().cancel(frame)
|
||||
await self._disconnect()
|
||||
|
||||
async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
|
||||
"""Send audio data to Deepgram Flux for transcription.
|
||||
|
||||
Transmits raw audio bytes to the Deepgram Flux API for real-time speech
|
||||
recognition. Transcription results are received asynchronously through
|
||||
WebSocket callbacks and processed in the background.
|
||||
|
||||
Args:
|
||||
audio: Raw audio bytes in linear16 format (signed little-endian 16-bit PCM).
|
||||
|
||||
Yields:
|
||||
Frame: None (transcription results are delivered via WebSocket callbacks
|
||||
rather than as return values from this method).
|
||||
|
||||
Raises:
|
||||
Exception: If the WebSocket connection is not established or if there
|
||||
are issues sending the audio data.
|
||||
"""
|
||||
if not self._websocket:
|
||||
logger.error("Not connected to Deepgram Flux.")
|
||||
yield ErrorFrame("Not connected to Deepgram Flux.", fatal=True)
|
||||
return
|
||||
|
||||
try:
|
||||
await self._websocket.send(audio)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to send audio to Flux: {e}")
|
||||
yield ErrorFrame(f"Failed to send audio to Flux: {e}")
|
||||
return
|
||||
|
||||
yield None
|
||||
|
||||
async def start_metrics(self):
|
||||
"""Start TTFB and processing metrics collection."""
|
||||
# TTFB (Time To First Byte) metrics are currently disabled for Deepgram Flux.
|
||||
# Ideally, TTFB should measure the time from when a user starts speaking
|
||||
# until we receive the first transcript. However, Deepgram Flux delivers
|
||||
# both the "user started speaking" event and the first transcript simultaneously,
|
||||
# making this timing measurement meaningless in this context.
|
||||
# await self.start_ttfb_metrics()
|
||||
await self.start_processing_metrics()
|
||||
|
||||
@traced_stt
|
||||
async def _handle_transcription(
|
||||
self, transcript: str, is_final: bool, language: Optional[Language] = None
|
||||
):
|
||||
"""Handle a transcription result with tracing."""
|
||||
pass
|
||||
|
||||
def _get_websocket(self):
|
||||
"""Get the current WebSocket connection.
|
||||
|
||||
Returns the active WebSocket connection instance, raising an exception
|
||||
if no connection is currently established.
|
||||
|
||||
Returns:
|
||||
The active WebSocket connection instance.
|
||||
|
||||
Raises:
|
||||
Exception: If no WebSocket connection is currently active.
|
||||
"""
|
||||
if self._websocket:
|
||||
return self._websocket
|
||||
raise Exception("Websocket not connected")
|
||||
|
||||
def _validate_message(self, data: Dict[str, Any]) -> bool:
|
||||
"""Validate basic message structure from Deepgram Flux.
|
||||
|
||||
Ensures the received message has the expected structure before processing.
|
||||
|
||||
Args:
|
||||
data: The parsed JSON message data to validate.
|
||||
|
||||
Returns:
|
||||
True if the message structure is valid, False otherwise.
|
||||
"""
|
||||
if not isinstance(data, dict):
|
||||
logger.warning("Message is not a dictionary")
|
||||
return False
|
||||
|
||||
if "type" not in data:
|
||||
logger.warning("Message missing 'type' field")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
async def _receive_messages(self):
|
||||
"""Receive and process messages from WebSocket.
|
||||
|
||||
Continuously receives messages from the Deepgram Flux WebSocket connection
|
||||
and processes various message types including connection status, transcription
|
||||
results, turn information, and error conditions. Handles different event types
|
||||
such as StartOfTurn, EndOfTurn, EagerEndOfTurn, and Update events.
|
||||
"""
|
||||
async for message in self._get_websocket():
|
||||
if isinstance(message, str):
|
||||
try:
|
||||
data = json.loads(message)
|
||||
await self._handle_message(data)
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error(f"Failed to decode JSON message: {e}")
|
||||
# Skip malformed messages
|
||||
continue
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing message: {e}")
|
||||
# Error will be handled inside WebsocketService->_receive_task_handler
|
||||
raise
|
||||
else:
|
||||
logger.warning(f"Received non-string message: {type(message)}")
|
||||
|
||||
async def _handle_message(self, data: Dict[str, Any]):
|
||||
"""Handle a parsed WebSocket message from Deepgram Flux.
|
||||
|
||||
Routes messages to appropriate handlers based on their type. Validates
|
||||
message structure before processing.
|
||||
|
||||
Args:
|
||||
data: The parsed JSON message data from the WebSocket.
|
||||
"""
|
||||
if not self._validate_message(data):
|
||||
return
|
||||
|
||||
message_type = data.get("type")
|
||||
|
||||
try:
|
||||
flux_message_type = FluxMessageType(message_type)
|
||||
except ValueError:
|
||||
logger.debug(f"Unhandled message type: {message_type or 'unknown'}")
|
||||
return
|
||||
|
||||
match flux_message_type:
|
||||
case FluxMessageType.RECEIVE_CONNECTED:
|
||||
await self._handle_connection_established()
|
||||
case FluxMessageType.RECEIVE_FATAL_ERROR:
|
||||
await self._handle_fatal_error(data)
|
||||
case FluxMessageType.TURN_INFO:
|
||||
await self._handle_turn_info(data)
|
||||
|
||||
async def _handle_connection_established(self):
|
||||
"""Handle successful connection establishment to Deepgram Flux.
|
||||
|
||||
This event is fired when the WebSocket connection to Deepgram Flux
|
||||
is successfully established and ready to receive audio data for
|
||||
transcription processing.
|
||||
"""
|
||||
logger.info("Connected to Flux - ready to stream audio")
|
||||
|
||||
async def _handle_fatal_error(self, data: Dict[str, Any]):
|
||||
"""Handle fatal error messages from Deepgram Flux.
|
||||
|
||||
Fatal errors indicate unrecoverable issues with the connection or
|
||||
configuration that require intervention. These errors will cause
|
||||
the connection to be terminated.
|
||||
|
||||
Args:
|
||||
data: The error message data containing error details.
|
||||
|
||||
Raises:
|
||||
Exception: Always raises to trigger error handling in the parent service.
|
||||
"""
|
||||
error_msg = data.get("error", "Unknown error")
|
||||
deepgram_error = f"Fatal error: {error_msg}"
|
||||
logger.error(deepgram_error)
|
||||
# Error will be handled inside WebsocketService->_receive_task_handler
|
||||
raise Exception(deepgram_error)
|
||||
|
||||
async def _handle_turn_info(self, data: Dict[str, Any]):
|
||||
"""Handle TurnInfo events from Deepgram Flux.
|
||||
|
||||
TurnInfo messages contain various turn-based events that indicate
|
||||
the state of speech processing, including turn boundaries, interim
|
||||
results, and turn finalization events.
|
||||
|
||||
Args:
|
||||
data: The TurnInfo message data containing event type, transcript and some extra metadata.
|
||||
"""
|
||||
event = data.get("event")
|
||||
transcript = data.get("transcript", "")
|
||||
|
||||
try:
|
||||
flux_event_type = FluxEventType(event)
|
||||
except ValueError:
|
||||
logger.debug(f"Unhandled TurnInfo event: {event}")
|
||||
return
|
||||
|
||||
match flux_event_type:
|
||||
case FluxEventType.START_OF_TURN:
|
||||
await self._handle_start_of_turn(transcript)
|
||||
case FluxEventType.TURN_RESUMED:
|
||||
await self._handle_turn_resumed(event)
|
||||
case FluxEventType.END_OF_TURN:
|
||||
await self._handle_end_of_turn(transcript, data)
|
||||
case FluxEventType.EAGER_END_OF_TURN:
|
||||
await self._handle_eager_end_of_turn(transcript, data)
|
||||
case FluxEventType.UPDATE:
|
||||
await self._handle_update(transcript)
|
||||
|
||||
async def _handle_start_of_turn(self, transcript: str):
|
||||
"""Handle StartOfTurn events from Deepgram Flux.
|
||||
|
||||
StartOfTurn events are fired when Deepgram Flux detects the beginning
|
||||
of a new speaking turn. This triggers bot interruption to stop any
|
||||
ongoing speech synthesis and signals the start of user speech detection.
|
||||
|
||||
The service will:
|
||||
- Send a BotInterruptionFrame upstream to stop bot speech
|
||||
- Send a UserStartedSpeakingFrame downstream to notify other components
|
||||
- Start metrics collection for measuring response times
|
||||
|
||||
Args:
|
||||
transcript: maybe the first few words of the turn.
|
||||
"""
|
||||
logger.debug("User started speaking")
|
||||
await self.push_interruption_task_frame_and_wait()
|
||||
await self.push_frame(UserStartedSpeakingFrame(), FrameDirection.DOWNSTREAM)
|
||||
await self.push_frame(UserStartedSpeakingFrame(), FrameDirection.UPSTREAM)
|
||||
await self.start_metrics()
|
||||
if transcript:
|
||||
logger.trace(f"Start of turn transcript: {transcript}")
|
||||
|
||||
async def _handle_turn_resumed(self, event: str):
|
||||
"""Handle TurnResumed events from Deepgram Flux.
|
||||
|
||||
TurnResumed events indicate that speech has resumed after a brief pause
|
||||
within the same turn. This is primarily used for logging and debugging
|
||||
purposes and doesn't trigger any significant processing changes.
|
||||
|
||||
Args:
|
||||
event: The event type string for logging purposes.
|
||||
"""
|
||||
logger.trace(f"Received event TurnResumed: {event}")
|
||||
|
||||
async def _handle_end_of_turn(self, transcript: str, data: Dict[str, Any]):
|
||||
"""Handle EndOfTurn events from Deepgram Flux.
|
||||
|
||||
EndOfTurn events are fired when Deepgram Flux determines that a speaking
|
||||
turn has concluded, either due to sufficient silence or end-of-turn
|
||||
confidence thresholds being met. This provides the final transcript
|
||||
for the completed turn.
|
||||
|
||||
The service will:
|
||||
- Create and send a final TranscriptionFrame with the complete transcript
|
||||
- Trigger transcription handling with tracing for metrics
|
||||
- Stop processing metrics collection
|
||||
- Send a UserStoppedSpeakingFrame to signal turn completion
|
||||
|
||||
Args:
|
||||
transcript: The final transcript text for the completed turn.
|
||||
data: The TurnInfo message data containing event type, transcript and some extra metadata.
|
||||
"""
|
||||
logger.debug("User stopped speaking")
|
||||
|
||||
await self.push_frame(
|
||||
TranscriptionFrame(
|
||||
transcript,
|
||||
self._user_id,
|
||||
time_now_iso8601(),
|
||||
self._language,
|
||||
result=data,
|
||||
)
|
||||
)
|
||||
await self._handle_transcription(transcript, True, self._language)
|
||||
await self.stop_processing_metrics()
|
||||
await self.push_frame(UserStoppedSpeakingFrame(), FrameDirection.DOWNSTREAM)
|
||||
await self.push_frame(UserStoppedSpeakingFrame(), FrameDirection.UPSTREAM)
|
||||
|
||||
async def _handle_eager_end_of_turn(self, transcript: str, data: Dict[str, Any]):
|
||||
"""Handle EagerEndOfTurn events from Deepgram Flux.
|
||||
|
||||
EagerEndOfTurn events are fired when the end-of-turn confidence reaches the
|
||||
EagerEndOfTurn threshold but hasn't yet reached the full end-of-turn threshold.
|
||||
These provide interim transcripts that can be used for faster response
|
||||
generation while still allowing the user to continue speaking.
|
||||
|
||||
EagerEndOfTurn events enable more responsive conversational AI by allowing
|
||||
the LLM to start processing likely final transcripts before the turn
|
||||
is definitively ended.
|
||||
|
||||
Args:
|
||||
transcript: The interim transcript text that triggered the EagerEndOfTurn event.
|
||||
data: The TurnInfo message data containing event type, transcript and some extra metadata.
|
||||
"""
|
||||
logger.trace(f"EagerEndOfTurn - {transcript}")
|
||||
# Deepgram's EagerEndOfTurn feature enables lower-latency voice agents by sending
|
||||
# medium-confidence transcripts before EndOfTurn certainty, allowing LLM processing to
|
||||
# begin early.
|
||||
#
|
||||
# However, if speech resumes or the transcripts differ from the final EndOfTurn, the
|
||||
# EagerEndOfTurn response should be cancelled to avoid incorrect or partial responses.
|
||||
#
|
||||
# Pipecat doesn't yet provide built-in Gate/control mechanisms to:
|
||||
# 1. Start LLM/TTS processing early on EagerEndOfTurn events
|
||||
# 2. Cancel in-flight processing when TurnResumed occurs
|
||||
#
|
||||
# By pushing EagerEndOfTurn transcripts as InterimTranscriptionFrame, we enable
|
||||
# developers to implement custom EagerEndOfTurn handling in their applications while
|
||||
# maintaining compatibility with existing interim transcription workflows.
|
||||
#
|
||||
# TODO: Implement proper EagerEndOfTurn support with cancellable processing pipeline
|
||||
# that can start response generation on EagerEndOfTurn and cancel or confirm it.
|
||||
await self.push_frame(
|
||||
InterimTranscriptionFrame(
|
||||
transcript,
|
||||
self._user_id,
|
||||
time_now_iso8601(),
|
||||
self._language,
|
||||
result=data,
|
||||
)
|
||||
)
|
||||
|
||||
async def _handle_update(self, transcript: str):
|
||||
"""Handle Update events from Deepgram Flux.
|
||||
|
||||
Update events provide incremental transcript updates during an ongoing
|
||||
turn. These events allow for real-time display of transcription progress
|
||||
and can be used to provide visual feedback to users about what's being
|
||||
recognized.
|
||||
|
||||
The service stops TTFB (Time To First Byte) metrics when the first
|
||||
substantial update is received, indicating successful processing start.
|
||||
|
||||
Args:
|
||||
transcript: The current partial transcript text for the ongoing turn.
|
||||
"""
|
||||
if transcript:
|
||||
logger.trace(f"Update event: {transcript}")
|
||||
# TTFB (Time To First Byte) metrics are currently disabled for Deepgram Flux.
|
||||
# Ideally, TTFB should measure the time from when a user starts speaking
|
||||
# until we receive the first transcript. However, Deepgram Flux delivers
|
||||
# both the "user started speaking" event and the first transcript simultaneously,
|
||||
# making this timing measurement meaningless in this context.
|
||||
# await self.stop_ttfb_metrics()
|
||||
@@ -8,6 +8,7 @@ import sys
|
||||
|
||||
from pipecat.services import DeprecatedModuleProxy
|
||||
|
||||
from .stt import *
|
||||
from .tts import *
|
||||
|
||||
sys.modules[__name__] = DeprecatedModuleProxy(globals(), "elevenlabs", "elevenlabs.tts")
|
||||
sys.modules[__name__] = DeprecatedModuleProxy(globals(), "elevenlabs", "elevenlabs.[stt,tts]")
|
||||
|
||||
@@ -168,16 +168,24 @@ def build_elevenlabs_voice_settings(
|
||||
|
||||
|
||||
def calculate_word_times(
|
||||
alignment_info: Mapping[str, Any], cumulative_time: float
|
||||
) -> List[Tuple[str, float]]:
|
||||
alignment_info: Mapping[str, Any],
|
||||
cumulative_time: float,
|
||||
partial_word: str = "",
|
||||
partial_word_start_time: float = 0.0,
|
||||
) -> tuple[List[Tuple[str, float]], str, float]:
|
||||
"""Calculate word timestamps from character alignment information.
|
||||
|
||||
Args:
|
||||
alignment_info: Character alignment data from ElevenLabs API.
|
||||
cumulative_time: Base time offset for this chunk.
|
||||
partial_word: Partial word carried over from previous chunk.
|
||||
partial_word_start_time: Start time of the partial word.
|
||||
|
||||
Returns:
|
||||
List of (word, timestamp) tuples.
|
||||
Tuple of (word_times, new_partial_word, new_partial_word_start_time):
|
||||
- word_times: List of (word, timestamp) tuples for complete words
|
||||
- new_partial_word: Incomplete word at end of chunk (empty if chunk ends with space)
|
||||
- new_partial_word_start_time: Start time of the incomplete word
|
||||
"""
|
||||
chars = alignment_info["chars"]
|
||||
char_start_times_ms = alignment_info["charStartTimesMs"]
|
||||
@@ -186,41 +194,37 @@ def calculate_word_times(
|
||||
logger.error(
|
||||
f"calculate_word_times: length mismatch - chars={len(chars)}, times={len(char_start_times_ms)}"
|
||||
)
|
||||
return []
|
||||
return ([], partial_word, partial_word_start_time)
|
||||
|
||||
# Build words and track their start positions
|
||||
words = []
|
||||
word_start_indices = []
|
||||
current_word = ""
|
||||
word_start_index = None
|
||||
word_start_times = []
|
||||
current_word = partial_word # Start with any partial word from previous chunk
|
||||
word_start_time = partial_word_start_time if partial_word else None
|
||||
|
||||
for i, char in enumerate(chars):
|
||||
if char == " ":
|
||||
# End of current word
|
||||
if current_word: # Only add non-empty words
|
||||
words.append(current_word)
|
||||
word_start_indices.append(word_start_index)
|
||||
word_start_times.append(word_start_time)
|
||||
current_word = ""
|
||||
word_start_index = None
|
||||
word_start_time = None
|
||||
else:
|
||||
# Building a word
|
||||
if word_start_index is None: # First character of new word
|
||||
word_start_index = i
|
||||
if word_start_time is None: # First character of new word
|
||||
# Convert from milliseconds to seconds and add cumulative offset
|
||||
word_start_time = cumulative_time + (char_start_times_ms[i] / 1000.0)
|
||||
current_word += char
|
||||
|
||||
# Handle the last word if there's no trailing space
|
||||
if current_word and word_start_index is not None:
|
||||
words.append(current_word)
|
||||
word_start_indices.append(word_start_index)
|
||||
# Build result for complete words
|
||||
word_times = list(zip(words, word_start_times))
|
||||
|
||||
# Calculate timestamps for each word
|
||||
word_times = []
|
||||
for word, start_idx in zip(words, word_start_indices):
|
||||
# Convert from milliseconds to seconds and add cumulative offset
|
||||
start_time_seconds = cumulative_time + (char_start_times_ms[start_idx] / 1000.0)
|
||||
word_times.append((word, start_time_seconds))
|
||||
# Return any incomplete word at the end of this chunk
|
||||
new_partial_word = current_word if current_word else ""
|
||||
new_partial_word_start_time = word_start_time if word_start_time is not None else 0.0
|
||||
|
||||
return word_times
|
||||
return (word_times, new_partial_word, new_partial_word_start_time)
|
||||
|
||||
|
||||
class ElevenLabsTTSService(AudioContextWordTTSService):
|
||||
@@ -332,6 +336,9 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
|
||||
# there's an interruption or TTSStoppedFrame.
|
||||
self._started = False
|
||||
self._cumulative_time = 0
|
||||
# Track partial words that span across alignment chunks
|
||||
self._partial_word = ""
|
||||
self._partial_word_start_time = 0.0
|
||||
|
||||
# Context management for v1 multi API
|
||||
self._context_id = None
|
||||
@@ -521,6 +528,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
|
||||
url, max_size=16 * 1024 * 1024, additional_headers={"xi-api-key": self._api_key}
|
||||
)
|
||||
|
||||
await self._call_event_handler("on_connected")
|
||||
except Exception as e:
|
||||
logger.error(f"{self} initialization error: {e}")
|
||||
self._websocket = None
|
||||
@@ -543,6 +551,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
|
||||
self._started = False
|
||||
self._context_id = None
|
||||
self._websocket = None
|
||||
await self._call_event_handler("on_disconnected")
|
||||
|
||||
def _get_websocket(self):
|
||||
if self._websocket:
|
||||
@@ -570,6 +579,8 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
|
||||
logger.error(f"Error closing context on interruption: {e}")
|
||||
self._context_id = None
|
||||
self._started = False
|
||||
self._partial_word = ""
|
||||
self._partial_word_start_time = 0.0
|
||||
|
||||
async def _receive_messages(self):
|
||||
"""Handle incoming WebSocket messages from ElevenLabs."""
|
||||
@@ -609,7 +620,14 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
|
||||
|
||||
if msg.get("alignment"):
|
||||
alignment = msg["alignment"]
|
||||
word_times = calculate_word_times(alignment, self._cumulative_time)
|
||||
word_times, self._partial_word, self._partial_word_start_time = (
|
||||
calculate_word_times(
|
||||
alignment,
|
||||
self._cumulative_time,
|
||||
self._partial_word,
|
||||
self._partial_word_start_time,
|
||||
)
|
||||
)
|
||||
|
||||
if word_times:
|
||||
await self.add_word_timestamps(word_times)
|
||||
@@ -683,6 +701,8 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
|
||||
yield TTSStartedFrame()
|
||||
self._started = True
|
||||
self._cumulative_time = 0
|
||||
self._partial_word = ""
|
||||
self._partial_word_start_time = 0.0
|
||||
# If a context ID does not exist, create a new one and
|
||||
# register it. If an ID exists, that means the Pipeline is
|
||||
# configured for allow_interruptions=False, so continue
|
||||
@@ -756,6 +776,7 @@ class ElevenLabsHttpTTSService(WordTTSService):
|
||||
base_url: str = "https://api.elevenlabs.io",
|
||||
sample_rate: Optional[int] = None,
|
||||
params: Optional[InputParams] = None,
|
||||
aggregate_sentences: Optional[bool] = True,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the ElevenLabs HTTP TTS service.
|
||||
@@ -768,10 +789,11 @@ class ElevenLabsHttpTTSService(WordTTSService):
|
||||
base_url: Base URL for ElevenLabs HTTP API.
|
||||
sample_rate: Audio sample rate. If None, uses default.
|
||||
params: Additional input parameters for voice customization.
|
||||
aggregate_sentences: Whether to aggregate sentences within the TTSService.
|
||||
**kwargs: Additional arguments passed to the parent service.
|
||||
"""
|
||||
super().__init__(
|
||||
aggregate_sentences=True,
|
||||
aggregate_sentences=aggregate_sentences,
|
||||
push_text_frames=False,
|
||||
push_stop_frames=True,
|
||||
sample_rate=sample_rate,
|
||||
@@ -809,6 +831,10 @@ class ElevenLabsHttpTTSService(WordTTSService):
|
||||
# Store previous text for context within a turn
|
||||
self._previous_text = ""
|
||||
|
||||
# Track partial words that span across alignment chunks
|
||||
self._partial_word = ""
|
||||
self._partial_word_start_time = 0.0
|
||||
|
||||
def language_to_service_language(self, language: Language) -> Optional[str]:
|
||||
"""Convert pipecat Language to ElevenLabs language code.
|
||||
|
||||
@@ -836,6 +862,8 @@ class ElevenLabsHttpTTSService(WordTTSService):
|
||||
self._cumulative_time = 0
|
||||
self._started = False
|
||||
self._previous_text = ""
|
||||
self._partial_word = ""
|
||||
self._partial_word_start_time = 0.0
|
||||
logger.debug(f"{self}: Reset internal state")
|
||||
|
||||
async def start(self, frame: StartFrame):
|
||||
@@ -870,11 +898,13 @@ class ElevenLabsHttpTTSService(WordTTSService):
|
||||
def calculate_word_times(self, alignment_info: Mapping[str, Any]) -> List[Tuple[str, float]]:
|
||||
"""Calculate word timing from character alignment data.
|
||||
|
||||
This method handles partial words that may span across multiple alignment chunks.
|
||||
|
||||
Args:
|
||||
alignment_info: Character timing data from ElevenLabs.
|
||||
|
||||
Returns:
|
||||
List of (word, timestamp) pairs.
|
||||
List of (word, timestamp) pairs for complete words in this chunk.
|
||||
|
||||
Example input data::
|
||||
|
||||
@@ -900,30 +930,28 @@ class ElevenLabsHttpTTSService(WordTTSService):
|
||||
# Build the words and find their start times
|
||||
words = []
|
||||
word_start_times = []
|
||||
current_word = ""
|
||||
first_char_idx = -1
|
||||
# Start with any partial word from previous chunk
|
||||
current_word = self._partial_word
|
||||
word_start_time = self._partial_word_start_time if self._partial_word else None
|
||||
|
||||
for i, char in enumerate(chars):
|
||||
if char == " ":
|
||||
if current_word: # Only add non-empty words
|
||||
words.append(current_word)
|
||||
# Use time of the first character of the word, offset by cumulative time
|
||||
word_start_times.append(
|
||||
self._cumulative_time + char_start_times[first_char_idx]
|
||||
)
|
||||
word_start_times.append(word_start_time)
|
||||
current_word = ""
|
||||
first_char_idx = -1
|
||||
word_start_time = None
|
||||
else:
|
||||
if not current_word: # This is the first character of a new word
|
||||
first_char_idx = i
|
||||
if word_start_time is None: # First character of a new word
|
||||
# Use time of the first character of the word, offset by cumulative time
|
||||
word_start_time = self._cumulative_time + char_start_times[i]
|
||||
current_word += char
|
||||
|
||||
# Don't forget the last word if there's no trailing space
|
||||
if current_word and first_char_idx >= 0:
|
||||
words.append(current_word)
|
||||
word_start_times.append(self._cumulative_time + char_start_times[first_char_idx])
|
||||
# Store any incomplete word at the end of this chunk
|
||||
self._partial_word = current_word if current_word else ""
|
||||
self._partial_word_start_time = word_start_time if word_start_time is not None else 0.0
|
||||
|
||||
# Create word-time pairs
|
||||
# Create word-time pairs for complete words only
|
||||
word_times = list(zip(words, word_start_times))
|
||||
|
||||
return word_times
|
||||
@@ -959,6 +987,9 @@ class ElevenLabsHttpTTSService(WordTTSService):
|
||||
if self._voice_settings:
|
||||
payload["voice_settings"] = self._voice_settings
|
||||
|
||||
if self._settings["apply_text_normalization"] is not None:
|
||||
payload["apply_text_normalization"] = self._settings["apply_text_normalization"]
|
||||
|
||||
language = self._settings["language"]
|
||||
if self._model_name in ELEVENLABS_MULTILINGUAL_MODELS and language:
|
||||
payload["language_code"] = language
|
||||
@@ -979,8 +1010,6 @@ class ElevenLabsHttpTTSService(WordTTSService):
|
||||
}
|
||||
if self._settings["optimize_streaming_latency"] is not None:
|
||||
params["optimize_streaming_latency"] = self._settings["optimize_streaming_latency"]
|
||||
if self._settings["apply_text_normalization"] is not None:
|
||||
params["apply_text_normalization"] = self._settings["apply_text_normalization"]
|
||||
|
||||
try:
|
||||
await self.start_ttfb_metrics()
|
||||
@@ -1041,6 +1070,14 @@ class ElevenLabsHttpTTSService(WordTTSService):
|
||||
logger.error(f"Error processing response: {e}", exc_info=True)
|
||||
continue
|
||||
|
||||
# After processing all chunks, emit any remaining partial word
|
||||
# since this is the end of the utterance
|
||||
if self._partial_word:
|
||||
final_word_time = [(self._partial_word, self._partial_word_start_time)]
|
||||
await self.add_word_timestamps(final_word_time)
|
||||
self._partial_word = ""
|
||||
self._partial_word_start_time = 0.0
|
||||
|
||||
# After processing all chunks, add the total utterance duration
|
||||
# to the cumulative time to ensure next utterance starts after this one
|
||||
if utterance_duration > 0:
|
||||
|
||||
@@ -225,6 +225,8 @@ class FishAudioTTSService(InterruptibleTTSService):
|
||||
start_message = {"event": "start", "request": {"text": "", **self._settings}}
|
||||
await self._websocket.send(ormsgpack.packb(start_message))
|
||||
logger.debug("Sent start message to Fish Audio")
|
||||
|
||||
await self._call_event_handler("on_connected")
|
||||
except Exception as e:
|
||||
logger.error(f"Fish Audio initialization error: {e}")
|
||||
self._websocket = None
|
||||
@@ -245,6 +247,7 @@ class FishAudioTTSService(InterruptibleTTSService):
|
||||
self._request_id = None
|
||||
self._started = False
|
||||
self._websocket = None
|
||||
await self._call_event_handler("on_disconnected")
|
||||
|
||||
async def flush_audio(self):
|
||||
"""Flush any buffered audio by sending a flush event to Fish Audio."""
|
||||
|
||||
@@ -4,527 +4,41 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Event models and utilities for Google Gemini Multimodal Live API."""
|
||||
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
from enum import Enum
|
||||
from typing import List, Literal, Optional
|
||||
|
||||
from PIL import Image
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from pipecat.frames.frames import ImageRawFrame
|
||||
|
||||
#
|
||||
# Client events
|
||||
#
|
||||
|
||||
|
||||
class MediaChunk(BaseModel):
|
||||
"""Represents a chunk of media data for transmission.
|
||||
|
||||
Parameters:
|
||||
mimeType: MIME type of the media content.
|
||||
data: Base64-encoded media data.
|
||||
"""
|
||||
|
||||
mimeType: str
|
||||
data: str
|
||||
|
||||
|
||||
class ContentPart(BaseModel):
|
||||
"""Represents a part of content that can contain text or media.
|
||||
|
||||
Parameters:
|
||||
text: Text content. Defaults to None.
|
||||
inlineData: Inline media data. Defaults to None.
|
||||
"""
|
||||
|
||||
text: Optional[str] = Field(default=None, validate_default=False)
|
||||
inlineData: Optional[MediaChunk] = Field(default=None, validate_default=False)
|
||||
fileData: Optional["FileData"] = Field(default=None, validate_default=False)
|
||||
|
||||
|
||||
class FileData(BaseModel):
|
||||
"""Represents a file reference in the Gemini File API."""
|
||||
|
||||
mimeType: str
|
||||
fileUri: str
|
||||
|
||||
|
||||
ContentPart.model_rebuild() # Rebuild model to resolve forward reference
|
||||
|
||||
|
||||
class Turn(BaseModel):
|
||||
"""Represents a conversational turn in the dialogue.
|
||||
|
||||
Parameters:
|
||||
role: The role of the speaker, either "user" or "model". Defaults to "user".
|
||||
parts: List of content parts that make up the turn.
|
||||
"""
|
||||
|
||||
role: Literal["user", "model"] = "user"
|
||||
parts: List[ContentPart]
|
||||
|
||||
|
||||
class StartSensitivity(str, Enum):
|
||||
"""Determines how start of speech is detected."""
|
||||
|
||||
UNSPECIFIED = "START_SENSITIVITY_UNSPECIFIED" # Default is HIGH
|
||||
HIGH = "START_SENSITIVITY_HIGH" # Detect start of speech more often
|
||||
LOW = "START_SENSITIVITY_LOW" # Detect start of speech less often
|
||||
|
||||
|
||||
class EndSensitivity(str, Enum):
|
||||
"""Determines how end of speech is detected."""
|
||||
|
||||
UNSPECIFIED = "END_SENSITIVITY_UNSPECIFIED" # Default is HIGH
|
||||
HIGH = "END_SENSITIVITY_HIGH" # End speech more often
|
||||
LOW = "END_SENSITIVITY_LOW" # End speech less often
|
||||
|
||||
|
||||
class AutomaticActivityDetection(BaseModel):
|
||||
"""Configures automatic detection of voice activity.
|
||||
|
||||
Parameters:
|
||||
disabled: Whether automatic activity detection is disabled. Defaults to None.
|
||||
start_of_speech_sensitivity: Sensitivity for detecting speech start. Defaults to None.
|
||||
prefix_padding_ms: Padding before speech start in milliseconds. Defaults to None.
|
||||
end_of_speech_sensitivity: Sensitivity for detecting speech end. Defaults to None.
|
||||
silence_duration_ms: Duration of silence to detect speech end. Defaults to None.
|
||||
"""
|
||||
|
||||
disabled: Optional[bool] = None
|
||||
start_of_speech_sensitivity: Optional[StartSensitivity] = None
|
||||
prefix_padding_ms: Optional[int] = None
|
||||
end_of_speech_sensitivity: Optional[EndSensitivity] = None
|
||||
silence_duration_ms: Optional[int] = None
|
||||
|
||||
|
||||
class RealtimeInputConfig(BaseModel):
|
||||
"""Configures the realtime input behavior.
|
||||
|
||||
Parameters:
|
||||
automatic_activity_detection: Voice activity detection configuration. Defaults to None.
|
||||
"""
|
||||
|
||||
automatic_activity_detection: Optional[AutomaticActivityDetection] = None
|
||||
|
||||
|
||||
class RealtimeInput(BaseModel):
|
||||
"""Contains realtime input media chunks and text.
|
||||
|
||||
Parameters:
|
||||
mediaChunks: List of media chunks for realtime processing.
|
||||
text: Text for realtime processing.
|
||||
"""
|
||||
|
||||
mediaChunks: Optional[List[MediaChunk]] = None
|
||||
text: Optional[str] = None
|
||||
|
||||
|
||||
class ClientContent(BaseModel):
|
||||
"""Content sent from client to the Gemini Live API.
|
||||
|
||||
Parameters:
|
||||
turns: List of conversation turns. Defaults to None.
|
||||
turnComplete: Whether the client's turn is complete. Defaults to False.
|
||||
"""
|
||||
|
||||
turns: Optional[List[Turn]] = None
|
||||
turnComplete: bool = False
|
||||
|
||||
|
||||
class AudioInputMessage(BaseModel):
|
||||
"""Message containing audio input data.
|
||||
|
||||
Parameters:
|
||||
realtimeInput: Realtime input containing audio chunks.
|
||||
"""
|
||||
|
||||
realtimeInput: RealtimeInput
|
||||
|
||||
@classmethod
|
||||
def from_raw_audio(cls, raw_audio: bytes, sample_rate: int) -> "AudioInputMessage":
|
||||
"""Create an audio input message from raw audio data.
|
||||
|
||||
Args:
|
||||
raw_audio: Raw audio bytes.
|
||||
sample_rate: Audio sample rate in Hz.
|
||||
|
||||
Returns:
|
||||
AudioInputMessage instance with encoded audio data.
|
||||
"""
|
||||
data = base64.b64encode(raw_audio).decode("utf-8")
|
||||
return cls(
|
||||
realtimeInput=RealtimeInput(
|
||||
mediaChunks=[MediaChunk(mimeType=f"audio/pcm;rate={sample_rate}", data=data)]
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
class VideoInputMessage(BaseModel):
|
||||
"""Message containing video/image input data.
|
||||
|
||||
Parameters:
|
||||
realtimeInput: Realtime input containing video/image chunks.
|
||||
"""
|
||||
|
||||
realtimeInput: RealtimeInput
|
||||
|
||||
@classmethod
|
||||
def from_image_frame(cls, frame: ImageRawFrame) -> "VideoInputMessage":
|
||||
"""Create a video input message from an image frame.
|
||||
|
||||
Args:
|
||||
frame: Image frame to encode.
|
||||
|
||||
Returns:
|
||||
VideoInputMessage instance with encoded image data.
|
||||
"""
|
||||
buffer = io.BytesIO()
|
||||
Image.frombytes(frame.format, frame.size, frame.image).save(buffer, format="JPEG")
|
||||
data = base64.b64encode(buffer.getvalue()).decode("utf-8")
|
||||
return cls(
|
||||
realtimeInput=RealtimeInput(mediaChunks=[MediaChunk(mimeType=f"image/jpeg", data=data)])
|
||||
)
|
||||
|
||||
|
||||
class TextInputMessage(BaseModel):
|
||||
"""Message containing text input data."""
|
||||
|
||||
realtimeInput: RealtimeInput
|
||||
|
||||
@classmethod
|
||||
def from_text(cls, text: str) -> "TextInputMessage":
|
||||
"""Create a text input message from a string.
|
||||
|
||||
Args:
|
||||
text: The text to send.
|
||||
|
||||
Returns:
|
||||
A TextInputMessage instance.
|
||||
"""
|
||||
return cls(realtimeInput=RealtimeInput(text=text))
|
||||
|
||||
|
||||
class ClientContentMessage(BaseModel):
|
||||
"""Message containing client content for the API.
|
||||
|
||||
Parameters:
|
||||
clientContent: The client content to send.
|
||||
"""
|
||||
|
||||
clientContent: ClientContent
|
||||
|
||||
|
||||
class SystemInstruction(BaseModel):
|
||||
"""System instruction for the model.
|
||||
|
||||
Parameters:
|
||||
parts: List of content parts that make up the system instruction.
|
||||
"""
|
||||
|
||||
parts: List[ContentPart]
|
||||
|
||||
|
||||
class AudioTranscriptionConfig(BaseModel):
|
||||
"""Configuration for audio transcription."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class Setup(BaseModel):
|
||||
"""Setup configuration for the Gemini Live session.
|
||||
|
||||
Parameters:
|
||||
model: Model identifier to use.
|
||||
system_instruction: System instruction for the model. Defaults to None.
|
||||
tools: List of available tools/functions. Defaults to None.
|
||||
generation_config: Generation configuration parameters. Defaults to None.
|
||||
input_audio_transcription: Input audio transcription config. Defaults to None.
|
||||
output_audio_transcription: Output audio transcription config. Defaults to None.
|
||||
realtime_input_config: Realtime input configuration. Defaults to None.
|
||||
"""
|
||||
|
||||
model: str
|
||||
system_instruction: Optional[SystemInstruction] = None
|
||||
tools: Optional[List[dict]] = None
|
||||
generation_config: Optional[dict] = None
|
||||
input_audio_transcription: Optional[AudioTranscriptionConfig] = None
|
||||
output_audio_transcription: Optional[AudioTranscriptionConfig] = None
|
||||
realtime_input_config: Optional[RealtimeInputConfig] = None
|
||||
|
||||
|
||||
class Config(BaseModel):
|
||||
"""Configuration message for session setup.
|
||||
|
||||
Parameters:
|
||||
setup: Setup configuration for the session.
|
||||
"""
|
||||
|
||||
setup: Setup
|
||||
|
||||
|
||||
#
|
||||
# Grounding metadata models
|
||||
#
|
||||
|
||||
|
||||
class SearchEntryPoint(BaseModel):
|
||||
"""Represents the search entry point with rendered content for search suggestions."""
|
||||
|
||||
renderedContent: Optional[str] = None
|
||||
|
||||
|
||||
class WebSource(BaseModel):
|
||||
"""Represents a web source from grounding chunks."""
|
||||
|
||||
uri: Optional[str] = None
|
||||
title: Optional[str] = None
|
||||
|
||||
|
||||
class GroundingChunk(BaseModel):
|
||||
"""Represents a grounding chunk containing web source information."""
|
||||
|
||||
web: Optional[WebSource] = None
|
||||
|
||||
|
||||
class GroundingSegment(BaseModel):
|
||||
"""Represents a segment of text that is grounded."""
|
||||
|
||||
startIndex: Optional[int] = None
|
||||
endIndex: Optional[int] = None
|
||||
text: Optional[str] = None
|
||||
|
||||
|
||||
class GroundingSupport(BaseModel):
|
||||
"""Represents support information for grounded text segments."""
|
||||
|
||||
segment: Optional[GroundingSegment] = None
|
||||
groundingChunkIndices: Optional[List[int]] = None
|
||||
confidenceScores: Optional[List[float]] = None
|
||||
|
||||
|
||||
class GroundingMetadata(BaseModel):
|
||||
"""Represents grounding metadata from Google Search."""
|
||||
|
||||
searchEntryPoint: Optional[SearchEntryPoint] = None
|
||||
groundingChunks: Optional[List[GroundingChunk]] = None
|
||||
groundingSupports: Optional[List[GroundingSupport]] = None
|
||||
webSearchQueries: Optional[List[str]] = None
|
||||
|
||||
|
||||
#
|
||||
# Server events
|
||||
#
|
||||
|
||||
|
||||
class SetupComplete(BaseModel):
|
||||
"""Indicates that session setup is complete."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class InlineData(BaseModel):
|
||||
"""Inline data embedded in server responses.
|
||||
|
||||
Parameters:
|
||||
mimeType: MIME type of the data.
|
||||
data: Base64-encoded data content.
|
||||
"""
|
||||
|
||||
mimeType: str
|
||||
data: str
|
||||
|
||||
|
||||
class Part(BaseModel):
|
||||
"""Part of a server response containing data or text.
|
||||
|
||||
Parameters:
|
||||
inlineData: Inline binary data. Defaults to None.
|
||||
text: Text content. Defaults to None.
|
||||
"""
|
||||
|
||||
inlineData: Optional[InlineData] = None
|
||||
text: Optional[str] = None
|
||||
|
||||
|
||||
class ModelTurn(BaseModel):
|
||||
"""Represents a turn from the model in the conversation.
|
||||
|
||||
Parameters:
|
||||
parts: List of content parts in the model's response.
|
||||
"""
|
||||
|
||||
parts: List[Part]
|
||||
|
||||
|
||||
class ServerContentInterrupted(BaseModel):
|
||||
"""Indicates server content was interrupted.
|
||||
|
||||
Parameters:
|
||||
interrupted: Whether the content was interrupted.
|
||||
"""
|
||||
|
||||
interrupted: bool
|
||||
|
||||
|
||||
class ServerContentTurnComplete(BaseModel):
|
||||
"""Indicates the server's turn is complete.
|
||||
|
||||
Parameters:
|
||||
turnComplete: Whether the turn is complete.
|
||||
"""
|
||||
|
||||
turnComplete: bool
|
||||
|
||||
|
||||
class BidiGenerateContentTranscription(BaseModel):
|
||||
"""Transcription data from bidirectional content generation.
|
||||
|
||||
Parameters:
|
||||
text: The transcribed text content.
|
||||
"""
|
||||
|
||||
text: str
|
||||
|
||||
|
||||
class ServerContent(BaseModel):
|
||||
"""Content sent from server to client.
|
||||
|
||||
Parameters:
|
||||
modelTurn: Model's conversational turn. Defaults to None.
|
||||
interrupted: Whether content was interrupted. Defaults to None.
|
||||
turnComplete: Whether the turn is complete. Defaults to None.
|
||||
inputTranscription: Transcription of input audio. Defaults to None.
|
||||
outputTranscription: Transcription of output audio. Defaults to None.
|
||||
"""
|
||||
|
||||
modelTurn: Optional[ModelTurn] = None
|
||||
interrupted: Optional[bool] = None
|
||||
turnComplete: Optional[bool] = None
|
||||
inputTranscription: Optional[BidiGenerateContentTranscription] = None
|
||||
outputTranscription: Optional[BidiGenerateContentTranscription] = None
|
||||
groundingMetadata: Optional[GroundingMetadata] = None
|
||||
|
||||
|
||||
class FunctionCall(BaseModel):
|
||||
"""Represents a function call from the model.
|
||||
|
||||
Parameters:
|
||||
id: Unique identifier for the function call.
|
||||
name: Name of the function to call.
|
||||
args: Arguments to pass to the function.
|
||||
"""
|
||||
|
||||
id: str
|
||||
name: str
|
||||
args: dict
|
||||
|
||||
|
||||
class ToolCall(BaseModel):
|
||||
"""Contains one or more function calls.
|
||||
|
||||
Parameters:
|
||||
functionCalls: List of function calls to execute.
|
||||
"""
|
||||
|
||||
functionCalls: List[FunctionCall]
|
||||
|
||||
|
||||
class Modality(str, Enum):
|
||||
"""Modality types in token counts."""
|
||||
|
||||
UNSPECIFIED = "MODALITY_UNSPECIFIED"
|
||||
TEXT = "TEXT"
|
||||
IMAGE = "IMAGE"
|
||||
AUDIO = "AUDIO"
|
||||
VIDEO = "VIDEO"
|
||||
|
||||
|
||||
class ModalityTokenCount(BaseModel):
|
||||
"""Token count for a specific modality.
|
||||
|
||||
Parameters:
|
||||
modality: The modality type.
|
||||
tokenCount: Number of tokens for this modality.
|
||||
"""
|
||||
|
||||
modality: Modality
|
||||
tokenCount: int
|
||||
|
||||
|
||||
class UsageMetadata(BaseModel):
|
||||
"""Usage metadata about the API response.
|
||||
|
||||
Parameters:
|
||||
promptTokenCount: Number of tokens in the prompt. Defaults to None.
|
||||
cachedContentTokenCount: Number of cached content tokens. Defaults to None.
|
||||
responseTokenCount: Number of tokens in the response. Defaults to None.
|
||||
toolUsePromptTokenCount: Number of tokens for tool use prompts. Defaults to None.
|
||||
thoughtsTokenCount: Number of tokens for model thoughts. Defaults to None.
|
||||
totalTokenCount: Total number of tokens used. Defaults to None.
|
||||
promptTokensDetails: Detailed breakdown of prompt tokens by modality. Defaults to None.
|
||||
cacheTokensDetails: Detailed breakdown of cache tokens by modality. Defaults to None.
|
||||
responseTokensDetails: Detailed breakdown of response tokens by modality. Defaults to None.
|
||||
toolUsePromptTokensDetails: Detailed breakdown of tool use tokens by modality. Defaults to None.
|
||||
"""
|
||||
|
||||
promptTokenCount: Optional[int] = None
|
||||
cachedContentTokenCount: Optional[int] = None
|
||||
responseTokenCount: Optional[int] = None
|
||||
toolUsePromptTokenCount: Optional[int] = None
|
||||
thoughtsTokenCount: Optional[int] = None
|
||||
totalTokenCount: Optional[int] = None
|
||||
promptTokensDetails: Optional[List[ModalityTokenCount]] = None
|
||||
cacheTokensDetails: Optional[List[ModalityTokenCount]] = None
|
||||
responseTokensDetails: Optional[List[ModalityTokenCount]] = None
|
||||
toolUsePromptTokensDetails: Optional[List[ModalityTokenCount]] = None
|
||||
|
||||
|
||||
class ServerEvent(BaseModel):
|
||||
"""Server event received from the Gemini Live API.
|
||||
|
||||
Parameters:
|
||||
setupComplete: Setup completion notification. Defaults to None.
|
||||
serverContent: Content from the server. Defaults to None.
|
||||
toolCall: Tool/function call request. Defaults to None.
|
||||
usageMetadata: Token usage metadata. Defaults to None.
|
||||
"""
|
||||
|
||||
setupComplete: Optional[SetupComplete] = None
|
||||
serverContent: Optional[ServerContent] = None
|
||||
toolCall: Optional[ToolCall] = None
|
||||
usageMetadata: Optional[UsageMetadata] = None
|
||||
|
||||
|
||||
def parse_server_event(str):
|
||||
"""Parse a server event from JSON string.
|
||||
|
||||
Args:
|
||||
str: JSON string containing the server event.
|
||||
|
||||
Returns:
|
||||
ServerEvent instance if parsing succeeds, None otherwise.
|
||||
"""
|
||||
try:
|
||||
evt = json.loads(str)
|
||||
return ServerEvent.model_validate(evt)
|
||||
except Exception as e:
|
||||
print(f"Error parsing server event: {e}")
|
||||
return None
|
||||
|
||||
|
||||
class ContextWindowCompressionConfig(BaseModel):
|
||||
"""Configuration for context window compression.
|
||||
|
||||
Parameters:
|
||||
sliding_window: Whether to use sliding window compression. Defaults to True.
|
||||
trigger_tokens: Token count threshold to trigger compression. Defaults to None.
|
||||
"""
|
||||
|
||||
sliding_window: Optional[bool] = Field(default=True)
|
||||
trigger_tokens: Optional[int] = Field(default=None)
|
||||
"""Event models and utilities for Google Gemini Multimodal Live API.
|
||||
|
||||
.. deprecated:: 0.0.90
|
||||
Importing StartSensitivity and EndSensitivity from this module is deprecated.
|
||||
Import them directly from google.genai.types instead.
|
||||
"""
|
||||
|
||||
import warnings
|
||||
|
||||
from loguru import logger
|
||||
|
||||
try:
|
||||
from google.genai.types import (
|
||||
EndSensitivity as _EndSensitivity,
|
||||
)
|
||||
from google.genai.types import (
|
||||
StartSensitivity as _StartSensitivity,
|
||||
)
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error("In order to use Google AI, you need to `pip install pipecat-ai[google]`.")
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
# These aliases are just here for backward compatibility, since we used to
|
||||
# define public-facing StartSensitivity and EndSensitivity enums in this
|
||||
# module.
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"Importing StartSensitivity and EndSensitivity from "
|
||||
"pipecat.services.gemini_multimodal_live.events is deprecated. "
|
||||
"Please import them directly from google.genai.types instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
StartSensitivity = _StartSensitivity
|
||||
EndSensitivity = _EndSensitivity
|
||||
|
||||
@@ -9,181 +9,31 @@
|
||||
This module provides a client for Google's Gemini File API, enabling file
|
||||
uploads, metadata retrieval, listing, and deletion. Files uploaded through
|
||||
this API can be referenced in Gemini generative model calls.
|
||||
|
||||
.. deprecated:: 0.0.90
|
||||
Importing GeminiFileAPI from this module is deprecated.
|
||||
Import it from pipecat.services.google.gemini_live.file_api instead.
|
||||
"""
|
||||
|
||||
import mimetypes
|
||||
from typing import Any, Dict, Optional
|
||||
import warnings
|
||||
|
||||
import aiohttp
|
||||
from loguru import logger
|
||||
|
||||
try:
|
||||
from pipecat.services.google.gemini_live.file_api import GeminiFileAPI as _GeminiFileAPI
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error("In order to use Google AI, you need to `pip install pipecat-ai[google]`.")
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
class GeminiFileAPI:
|
||||
"""Client for the Gemini File API.
|
||||
|
||||
This class provides methods for uploading, fetching, listing, and deleting files
|
||||
through Google's Gemini File API.
|
||||
|
||||
Files uploaded through this API remain available for 48 hours and can be referenced
|
||||
in calls to the Gemini generative models. Maximum file size is 2GB, with total
|
||||
project storage limited to 20GB.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, api_key: str, base_url: str = "https://generativelanguage.googleapis.com/v1beta/files"
|
||||
):
|
||||
"""Initialize the Gemini File API client.
|
||||
|
||||
Args:
|
||||
api_key: Google AI API key
|
||||
base_url: Base URL for the Gemini File API (default is the v1beta endpoint)
|
||||
"""
|
||||
self._api_key = api_key
|
||||
self._base_url = base_url
|
||||
# Upload URL uses the /upload/ path
|
||||
self.upload_base_url = "https://generativelanguage.googleapis.com/upload/v1beta/files"
|
||||
|
||||
async def upload_file(
|
||||
self, file_path: str, display_name: Optional[str] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""Upload a file to the Gemini File API using the correct resumable upload protocol.
|
||||
|
||||
Args:
|
||||
file_path: Path to the file to upload
|
||||
display_name: Optional display name for the file
|
||||
|
||||
Returns:
|
||||
File metadata including uri, name, and display_name
|
||||
"""
|
||||
logger.info(f"Uploading file: {file_path}")
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
# Determine the file's MIME type
|
||||
mime_type, _ = mimetypes.guess_type(file_path)
|
||||
if not mime_type:
|
||||
mime_type = "application/octet-stream"
|
||||
|
||||
# Read the file
|
||||
with open(file_path, "rb") as f:
|
||||
file_data = f.read()
|
||||
|
||||
# Create the metadata payload
|
||||
metadata = {}
|
||||
if display_name:
|
||||
metadata = {"file": {"display_name": display_name}}
|
||||
|
||||
# Step 1: Initial resumable request to get upload URL
|
||||
headers = {
|
||||
"X-Goog-Upload-Protocol": "resumable",
|
||||
"X-Goog-Upload-Command": "start",
|
||||
"X-Goog-Upload-Header-Content-Length": str(len(file_data)),
|
||||
"X-Goog-Upload-Header-Content-Type": mime_type,
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
|
||||
logger.debug(f"Step 1: Getting upload URL from {self.upload_base_url}")
|
||||
async with session.post(
|
||||
f"{self.upload_base_url}?key={self._api_key}", headers=headers, json=metadata
|
||||
) as response:
|
||||
if response.status != 200:
|
||||
error_text = await response.text()
|
||||
logger.error(f"Error initiating file upload: {error_text}")
|
||||
raise Exception(f"Failed to initiate upload: {response.status} - {error_text}")
|
||||
|
||||
# Get the upload URL from the response header
|
||||
upload_url = response.headers.get("X-Goog-Upload-URL")
|
||||
if not upload_url:
|
||||
logger.error(f"Response headers: {dict(response.headers)}")
|
||||
raise Exception("No upload URL in response headers")
|
||||
|
||||
logger.debug(f"Got upload URL: {upload_url}")
|
||||
|
||||
# Step 2: Upload the actual file data
|
||||
upload_headers = {
|
||||
"Content-Length": str(len(file_data)),
|
||||
"X-Goog-Upload-Offset": "0",
|
||||
"X-Goog-Upload-Command": "upload, finalize",
|
||||
}
|
||||
|
||||
logger.debug(f"Step 2: Uploading file data to {upload_url}")
|
||||
async with session.post(upload_url, headers=upload_headers, data=file_data) as response:
|
||||
if response.status != 200:
|
||||
error_text = await response.text()
|
||||
logger.error(f"Error uploading file data: {error_text}")
|
||||
raise Exception(f"Failed to upload file: {response.status} - {error_text}")
|
||||
|
||||
file_info = await response.json()
|
||||
logger.info(f"File uploaded successfully: {file_info.get('file', {}).get('name')}")
|
||||
return file_info
|
||||
|
||||
async def get_file(self, name: str) -> Dict[str, Any]:
|
||||
"""Get metadata for a file.
|
||||
|
||||
Args:
|
||||
name: File name (or full path)
|
||||
|
||||
Returns:
|
||||
File metadata
|
||||
"""
|
||||
# Extract just the name part if a full path is provided
|
||||
if "/" in name:
|
||||
name = name.split("/")[-1]
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(f"{self._base_url}/{name}?key={self._api_key}") as response:
|
||||
if response.status != 200:
|
||||
error_text = await response.text()
|
||||
logger.error(f"Error getting file metadata: {error_text}")
|
||||
raise Exception(f"Failed to get file metadata: {response.status}")
|
||||
|
||||
file_info = await response.json()
|
||||
return file_info
|
||||
|
||||
async def list_files(
|
||||
self, page_size: int = 10, page_token: Optional[str] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""List uploaded files.
|
||||
|
||||
Args:
|
||||
page_size: Number of files to return per page
|
||||
page_token: Token for pagination
|
||||
|
||||
Returns:
|
||||
List of files and next page token if available
|
||||
"""
|
||||
params = {"key": self._api_key, "pageSize": page_size}
|
||||
|
||||
if page_token:
|
||||
params["pageToken"] = page_token
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(self._base_url, params=params) as response:
|
||||
if response.status != 200:
|
||||
error_text = await response.text()
|
||||
logger.error(f"Error listing files: {error_text}")
|
||||
raise Exception(f"Failed to list files: {response.status}")
|
||||
|
||||
result = await response.json()
|
||||
return result
|
||||
|
||||
async def delete_file(self, name: str) -> bool:
|
||||
"""Delete a file.
|
||||
|
||||
Args:
|
||||
name: File name (or full path)
|
||||
|
||||
Returns:
|
||||
True if deleted successfully
|
||||
"""
|
||||
# Extract just the name part if a full path is provided
|
||||
if "/" in name:
|
||||
name = name.split("/")[-1]
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.delete(f"{self._base_url}/{name}?key={self._api_key}") as response:
|
||||
if response.status != 200:
|
||||
error_text = await response.text()
|
||||
logger.error(f"Error deleting file: {error_text}")
|
||||
raise Exception(f"Failed to delete file: {response.status}")
|
||||
|
||||
return True
|
||||
# These aliases are just here for backward compatibility, since we used to
|
||||
# define public-facing StartSensitivity and EndSensitivity enums in this
|
||||
# module.
|
||||
warnings.warn(
|
||||
"Importing GeminiFileAPI from "
|
||||
"pipecat.services.gemini_multimodal_live.file_api is deprecated. "
|
||||
"Please import it from pipecat.services.google.gemini_live.file_api instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
GeminiFileAPI = _GeminiFileAPI
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -9,6 +9,7 @@ import sys
|
||||
from pipecat.services import DeprecatedModuleProxy
|
||||
|
||||
from .frames import *
|
||||
from .gemini_live import *
|
||||
from .image import *
|
||||
from .llm import *
|
||||
from .llm_openai import *
|
||||
|
||||
3
src/pipecat/services/google/gemini_live/__init__.py
Normal file
3
src/pipecat/services/google/gemini_live/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from .file_api import GeminiFileAPI
|
||||
from .llm import GeminiLiveLLMService
|
||||
from .llm_vertex import GeminiLiveVertexLLMService
|
||||
189
src/pipecat/services/google/gemini_live/file_api.py
Normal file
189
src/pipecat/services/google/gemini_live/file_api.py
Normal file
@@ -0,0 +1,189 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Gemini File API client for uploading and managing files.
|
||||
|
||||
This module provides a client for Google's Gemini File API, enabling file
|
||||
uploads, metadata retrieval, listing, and deletion. Files uploaded through
|
||||
this API can be referenced in Gemini generative model calls.
|
||||
"""
|
||||
|
||||
import mimetypes
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import aiohttp
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class GeminiFileAPI:
|
||||
"""Client for the Gemini File API.
|
||||
|
||||
This class provides methods for uploading, fetching, listing, and deleting files
|
||||
through Google's Gemini File API.
|
||||
|
||||
Files uploaded through this API remain available for 48 hours and can be referenced
|
||||
in calls to the Gemini generative models. Maximum file size is 2GB, with total
|
||||
project storage limited to 20GB.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, api_key: str, base_url: str = "https://generativelanguage.googleapis.com/v1beta/files"
|
||||
):
|
||||
"""Initialize the Gemini File API client.
|
||||
|
||||
Args:
|
||||
api_key: Google AI API key
|
||||
base_url: Base URL for the Gemini File API (default is the v1beta endpoint)
|
||||
"""
|
||||
self._api_key = api_key
|
||||
self._base_url = base_url
|
||||
# Upload URL uses the /upload/ path
|
||||
self.upload_base_url = "https://generativelanguage.googleapis.com/upload/v1beta/files"
|
||||
|
||||
async def upload_file(
|
||||
self, file_path: str, display_name: Optional[str] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""Upload a file to the Gemini File API using the correct resumable upload protocol.
|
||||
|
||||
Args:
|
||||
file_path: Path to the file to upload
|
||||
display_name: Optional display name for the file
|
||||
|
||||
Returns:
|
||||
File metadata including uri, name, and display_name
|
||||
"""
|
||||
logger.info(f"Uploading file: {file_path}")
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
# Determine the file's MIME type
|
||||
mime_type, _ = mimetypes.guess_type(file_path)
|
||||
if not mime_type:
|
||||
mime_type = "application/octet-stream"
|
||||
|
||||
# Read the file
|
||||
with open(file_path, "rb") as f:
|
||||
file_data = f.read()
|
||||
|
||||
# Create the metadata payload
|
||||
metadata = {}
|
||||
if display_name:
|
||||
metadata = {"file": {"display_name": display_name}}
|
||||
|
||||
# Step 1: Initial resumable request to get upload URL
|
||||
headers = {
|
||||
"X-Goog-Upload-Protocol": "resumable",
|
||||
"X-Goog-Upload-Command": "start",
|
||||
"X-Goog-Upload-Header-Content-Length": str(len(file_data)),
|
||||
"X-Goog-Upload-Header-Content-Type": mime_type,
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
|
||||
logger.debug(f"Step 1: Getting upload URL from {self.upload_base_url}")
|
||||
async with session.post(
|
||||
f"{self.upload_base_url}?key={self._api_key}", headers=headers, json=metadata
|
||||
) as response:
|
||||
if response.status != 200:
|
||||
error_text = await response.text()
|
||||
logger.error(f"Error initiating file upload: {error_text}")
|
||||
raise Exception(f"Failed to initiate upload: {response.status} - {error_text}")
|
||||
|
||||
# Get the upload URL from the response header
|
||||
upload_url = response.headers.get("X-Goog-Upload-URL")
|
||||
if not upload_url:
|
||||
logger.error(f"Response headers: {dict(response.headers)}")
|
||||
raise Exception("No upload URL in response headers")
|
||||
|
||||
logger.debug(f"Got upload URL: {upload_url}")
|
||||
|
||||
# Step 2: Upload the actual file data
|
||||
upload_headers = {
|
||||
"Content-Length": str(len(file_data)),
|
||||
"X-Goog-Upload-Offset": "0",
|
||||
"X-Goog-Upload-Command": "upload, finalize",
|
||||
}
|
||||
|
||||
logger.debug(f"Step 2: Uploading file data to {upload_url}")
|
||||
async with session.post(upload_url, headers=upload_headers, data=file_data) as response:
|
||||
if response.status != 200:
|
||||
error_text = await response.text()
|
||||
logger.error(f"Error uploading file data: {error_text}")
|
||||
raise Exception(f"Failed to upload file: {response.status} - {error_text}")
|
||||
|
||||
file_info = await response.json()
|
||||
logger.info(f"File uploaded successfully: {file_info.get('file', {}).get('name')}")
|
||||
return file_info
|
||||
|
||||
async def get_file(self, name: str) -> Dict[str, Any]:
|
||||
"""Get metadata for a file.
|
||||
|
||||
Args:
|
||||
name: File name (or full path)
|
||||
|
||||
Returns:
|
||||
File metadata
|
||||
"""
|
||||
# Extract just the name part if a full path is provided
|
||||
if "/" in name:
|
||||
name = name.split("/")[-1]
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(f"{self._base_url}/{name}?key={self._api_key}") as response:
|
||||
if response.status != 200:
|
||||
error_text = await response.text()
|
||||
logger.error(f"Error getting file metadata: {error_text}")
|
||||
raise Exception(f"Failed to get file metadata: {response.status}")
|
||||
|
||||
file_info = await response.json()
|
||||
return file_info
|
||||
|
||||
async def list_files(
|
||||
self, page_size: int = 10, page_token: Optional[str] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""List uploaded files.
|
||||
|
||||
Args:
|
||||
page_size: Number of files to return per page
|
||||
page_token: Token for pagination
|
||||
|
||||
Returns:
|
||||
List of files and next page token if available
|
||||
"""
|
||||
params = {"key": self._api_key, "pageSize": page_size}
|
||||
|
||||
if page_token:
|
||||
params["pageToken"] = page_token
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(self._base_url, params=params) as response:
|
||||
if response.status != 200:
|
||||
error_text = await response.text()
|
||||
logger.error(f"Error listing files: {error_text}")
|
||||
raise Exception(f"Failed to list files: {response.status}")
|
||||
|
||||
result = await response.json()
|
||||
return result
|
||||
|
||||
async def delete_file(self, name: str) -> bool:
|
||||
"""Delete a file.
|
||||
|
||||
Args:
|
||||
name: File name (or full path)
|
||||
|
||||
Returns:
|
||||
True if deleted successfully
|
||||
"""
|
||||
# Extract just the name part if a full path is provided
|
||||
if "/" in name:
|
||||
name = name.split("/")[-1]
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.delete(f"{self._base_url}/{name}?key={self._api_key}") as response:
|
||||
if response.status != 200:
|
||||
error_text = await response.text()
|
||||
logger.error(f"Error deleting file: {error_text}")
|
||||
raise Exception(f"Failed to delete file: {response.status}")
|
||||
|
||||
return True
|
||||
1582
src/pipecat/services/google/gemini_live/llm.py
Normal file
1582
src/pipecat/services/google/gemini_live/llm.py
Normal file
File diff suppressed because it is too large
Load Diff
184
src/pipecat/services/google/gemini_live/llm_vertex.py
Normal file
184
src/pipecat/services/google/gemini_live/llm_vertex.py
Normal file
@@ -0,0 +1,184 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Service for accessing Gemini Live via Google Vertex AI.
|
||||
|
||||
This module provides integration with Google's Gemini Live model via
|
||||
Vertex AI, supporting both text and audio modalities with voice transcription,
|
||||
streaming responses, and tool usage.
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.services.google.gemini_live.llm import (
|
||||
GeminiLiveLLMService,
|
||||
HttpOptions,
|
||||
InputParams,
|
||||
)
|
||||
|
||||
try:
|
||||
from google.auth import default
|
||||
from google.auth.exceptions import GoogleAuthError
|
||||
from google.auth.transport.requests import Request
|
||||
from google.genai import Client
|
||||
from google.oauth2 import service_account
|
||||
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error("In order to use Google Vertex AI, you need to `pip install pipecat-ai[google]`.")
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
class GeminiLiveVertexLLMService(GeminiLiveLLMService):
|
||||
"""Provides access to Google's Gemini Live model via Vertex AI.
|
||||
|
||||
This service enables real-time conversations with Gemini, supporting both
|
||||
text and audio modalities. It handles voice transcription, streaming audio
|
||||
responses, and tool usage.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
credentials: Optional[str] = None,
|
||||
credentials_path: Optional[str] = None,
|
||||
location: str,
|
||||
project_id: str,
|
||||
model="google/gemini-2.0-flash-live-preview-04-09",
|
||||
voice_id: str = "Charon",
|
||||
start_audio_paused: bool = False,
|
||||
start_video_paused: bool = False,
|
||||
system_instruction: Optional[str] = None,
|
||||
tools: Optional[Union[List[dict], ToolsSchema]] = None,
|
||||
params: Optional[InputParams] = None,
|
||||
inference_on_context_initialization: bool = True,
|
||||
file_api_base_url: str = "https://generativelanguage.googleapis.com/v1beta/files",
|
||||
http_options: Optional[HttpOptions] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the service for accessing Gemini Live via Google Vertex AI.
|
||||
|
||||
Args:
|
||||
credentials: JSON string of service account credentials.
|
||||
credentials_path: Path to the service account JSON file.
|
||||
location: GCP region for Vertex AI endpoint (e.g., "us-east4").
|
||||
project_id: Google Cloud project ID.
|
||||
model: Model identifier to use. Defaults to "models/gemini-2.0-flash-live-preview-04-09".
|
||||
voice_id: TTS voice identifier. Defaults to "Charon".
|
||||
start_audio_paused: Whether to start with audio input paused. Defaults to False.
|
||||
start_video_paused: Whether to start with video input paused. Defaults to False.
|
||||
system_instruction: System prompt for the model. Defaults to None.
|
||||
tools: Tools/functions available to the model. Defaults to None.
|
||||
params: Configuration parameters for the model along with Vertex AI
|
||||
location and project ID.
|
||||
inference_on_context_initialization: Whether to generate a response when context
|
||||
is first set. Defaults to True.
|
||||
file_api_base_url: Base URL for the Gemini File API. Defaults to the official endpoint.
|
||||
http_options: HTTP options for the client.
|
||||
**kwargs: Additional arguments passed to parent GeminiLiveLLMService.
|
||||
"""
|
||||
# Check if user incorrectly passed api_key, which is used by parent
|
||||
# class but not here.
|
||||
if "api_key" in kwargs:
|
||||
logger.error(
|
||||
"GeminiLiveVertexLLMService does not accept 'api_key' parameter. "
|
||||
"Use 'credentials' or 'credentials_path' instead for Vertex AI authentication."
|
||||
)
|
||||
raise ValueError(
|
||||
"Invalid parameter 'api_key'. Use 'credentials' or 'credentials_path' for Vertex AI authentication."
|
||||
)
|
||||
|
||||
# These need to be set before calling super().__init__() because
|
||||
# super().__init__() invokes create_client(), which needs these.
|
||||
self._credentials = self._get_credentials(credentials, credentials_path)
|
||||
self._project_id = project_id
|
||||
self._location = location
|
||||
|
||||
# Call parent constructor with the obtained API key
|
||||
super().__init__(
|
||||
# api_key is required by parent class, but actually not used with
|
||||
# Vertex
|
||||
api_key="dummy",
|
||||
model=model,
|
||||
voice_id=voice_id,
|
||||
start_audio_paused=start_audio_paused,
|
||||
start_video_paused=start_video_paused,
|
||||
system_instruction=system_instruction,
|
||||
tools=tools,
|
||||
params=params,
|
||||
inference_on_context_initialization=inference_on_context_initialization,
|
||||
file_api_base_url=file_api_base_url,
|
||||
http_options=http_options,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def create_client(self):
|
||||
"""Create the Gemini client instance."""
|
||||
self._client = Client(
|
||||
vertexai=True,
|
||||
credentials=self._credentials,
|
||||
project=self._project_id,
|
||||
location=self._location,
|
||||
)
|
||||
|
||||
@property
|
||||
def file_api(self):
|
||||
"""Gemini File API is not supported with Vertex AI."""
|
||||
raise NotImplementedError(
|
||||
"When using Vertex AI, the recommended approach is to use Google Cloud Storage for file handling. The Gemini File API is not directly supported in this context."
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _get_credentials(credentials: Optional[str], credentials_path: Optional[str]) -> str:
|
||||
"""Retrieve Credentials using Google service account credentials JSON.
|
||||
|
||||
Supports multiple authentication methods:
|
||||
1. Direct JSON credentials string
|
||||
2. Path to service account JSON file
|
||||
3. Default application credentials (ADC)
|
||||
|
||||
Args:
|
||||
credentials: JSON string of service account credentials.
|
||||
credentials_path: Path to the service account JSON file.
|
||||
|
||||
Returns:
|
||||
OAuth token for API authentication.
|
||||
|
||||
Raises:
|
||||
ValueError: If no valid credentials are provided or found.
|
||||
"""
|
||||
creds: Optional[service_account.Credentials] = None
|
||||
|
||||
if credentials:
|
||||
# Parse and load credentials from JSON string
|
||||
creds = service_account.Credentials.from_service_account_info(
|
||||
json.loads(credentials),
|
||||
scopes=["https://www.googleapis.com/auth/cloud-platform"],
|
||||
)
|
||||
elif credentials_path:
|
||||
# Load credentials from JSON file
|
||||
creds = service_account.Credentials.from_service_account_file(
|
||||
credentials_path,
|
||||
scopes=["https://www.googleapis.com/auth/cloud-platform"],
|
||||
)
|
||||
else:
|
||||
try:
|
||||
creds, project_id = default(
|
||||
scopes=["https://www.googleapis.com/auth/cloud-platform"]
|
||||
)
|
||||
except GoogleAuthError:
|
||||
pass
|
||||
|
||||
if not creds:
|
||||
raise ValueError("No valid credentials provided.")
|
||||
|
||||
creds.refresh(Request()) # Ensure token is up-to-date, lifetime is 1 hour.
|
||||
|
||||
return creds
|
||||
@@ -35,6 +35,7 @@ from pipecat.frames.frames import (
|
||||
LLMMessagesFrame,
|
||||
LLMTextFrame,
|
||||
LLMUpdateSettingsFrame,
|
||||
OutputImageRawFrame,
|
||||
UserImageRawFrame,
|
||||
)
|
||||
from pipecat.metrics.metrics import LLMTokenUsage
|
||||
@@ -72,6 +73,9 @@ try:
|
||||
HttpOptions,
|
||||
Part,
|
||||
)
|
||||
|
||||
# Temporary hack to be able to process Nano Banana returned images.
|
||||
genai._api_client.READ_BUFFER_SIZE = 5 * 1024 * 1024
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error("In order to use Google AI, you need to `pip install pipecat-ai[google]`.")
|
||||
@@ -682,7 +686,7 @@ class GoogleLLMService(LLMService):
|
||||
self,
|
||||
*,
|
||||
api_key: str,
|
||||
model: str = "gemini-2.0-flash",
|
||||
model: str = "gemini-2.5-flash",
|
||||
params: Optional[InputParams] = None,
|
||||
system_instruction: Optional[str] = None,
|
||||
tools: Optional[List[Dict[str, Any]]] = None,
|
||||
@@ -710,6 +714,7 @@ class GoogleLLMService(LLMService):
|
||||
self._api_key = api_key
|
||||
self._system_instruction = system_instruction
|
||||
self._http_options = http_options
|
||||
|
||||
self._create_client(api_key, http_options)
|
||||
self._settings = {
|
||||
"max_tokens": params.max_tokens,
|
||||
@@ -788,6 +793,9 @@ class GoogleLLMService(LLMService):
|
||||
# and can be configured to turn it off.
|
||||
if not self._model_name.startswith("gemini-2.5-flash"):
|
||||
return
|
||||
# If we have an image model, we don't use a budget either.
|
||||
if "image" in self._model_name:
|
||||
return
|
||||
# If thinking_config is already set, don't override it.
|
||||
if "thinking_config" in generation_params:
|
||||
return
|
||||
@@ -927,6 +935,12 @@ class GoogleLLMService(LLMService):
|
||||
arguments=function_call.args or {},
|
||||
)
|
||||
)
|
||||
elif part.inline_data and part.inline_data.data:
|
||||
image = Image.open(io.BytesIO(part.inline_data.data))
|
||||
frame = OutputImageRawFrame(
|
||||
image=image.tobytes(), size=image.size, format="RGB"
|
||||
)
|
||||
await self.push_frame(frame)
|
||||
|
||||
if (
|
||||
candidate.grounding_metadata
|
||||
|
||||
@@ -94,9 +94,9 @@ class GoogleLLMOpenAIBetaService(OpenAILLMService):
|
||||
async for chunk in chunk_stream:
|
||||
if chunk.usage:
|
||||
tokens = LLMTokenUsage(
|
||||
prompt_tokens=chunk.usage.prompt_tokens,
|
||||
completion_tokens=chunk.usage.completion_tokens,
|
||||
total_tokens=chunk.usage.total_tokens,
|
||||
prompt_tokens=chunk.usage.prompt_tokens or 0,
|
||||
completion_tokens=chunk.usage.completion_tokens or 0,
|
||||
total_tokens=chunk.usage.total_tokens or 0,
|
||||
)
|
||||
await self.start_llm_usage_metrics(tokens)
|
||||
|
||||
|
||||
@@ -53,12 +53,44 @@ class GoogleVertexLLMService(OpenAILLMService):
|
||||
|
||||
Parameters:
|
||||
location: GCP region for Vertex AI endpoint (e.g., "us-east4").
|
||||
|
||||
.. deprecated:: 0.0.90
|
||||
Use `location` as a direct argument to
|
||||
`GoogleVertexLLMService.__init__()` instead.
|
||||
|
||||
project_id: Google Cloud project ID.
|
||||
|
||||
.. deprecated:: 0.0.90
|
||||
Use `project_id` as a direct argument to
|
||||
`GoogleVertexLLMService.__init__()` instead.
|
||||
"""
|
||||
|
||||
# https://cloud.google.com/vertex-ai/generative-ai/docs/learn/locations
|
||||
location: str = "us-east4"
|
||||
project_id: str
|
||||
location: Optional[str] = None
|
||||
project_id: Optional[str] = None
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
"""Initializes the InputParams."""
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
if "location" in kwargs and kwargs["location"] is not None:
|
||||
warnings.warn(
|
||||
"GoogleVertexLLMService.InputParams.location is deprecated. "
|
||||
"Please provide 'location' as a direct argument to GoogleVertexLLMService.__init__() instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
if "project_id" in kwargs and kwargs["project_id"] is not None:
|
||||
warnings.warn(
|
||||
"GoogleVertexLLMService.InputParams.project_id is deprecated. "
|
||||
"Please provide 'project_id' as a direct argument to GoogleVertexLLMService.__init__() instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -66,7 +98,8 @@ class GoogleVertexLLMService(OpenAILLMService):
|
||||
credentials: Optional[str] = None,
|
||||
credentials_path: Optional[str] = None,
|
||||
model: str = "google/gemini-2.0-flash-001",
|
||||
params: Optional[InputParams] = None,
|
||||
location: Optional[str] = None,
|
||||
project_id: Optional[str] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initializes the VertexLLMService.
|
||||
@@ -75,33 +108,60 @@ class GoogleVertexLLMService(OpenAILLMService):
|
||||
credentials: JSON string of service account credentials.
|
||||
credentials_path: Path to the service account JSON file.
|
||||
model: Model identifier (e.g., "google/gemini-2.0-flash-001").
|
||||
params: Vertex AI input parameters including location and project.
|
||||
location: GCP region for Vertex AI endpoint (e.g., "us-east4").
|
||||
project_id: Google Cloud project ID.
|
||||
**kwargs: Additional arguments passed to OpenAILLMService.
|
||||
"""
|
||||
params = params or OpenAILLMService.InputParams()
|
||||
base_url = self._get_base_url(params)
|
||||
# Handle deprecated InputParams fields
|
||||
if "params" in kwargs and isinstance(kwargs["params"], GoogleVertexLLMService.InputParams):
|
||||
params = kwargs["params"]
|
||||
# Extract location and project_id from params if not provided
|
||||
# directly, for backward compatibility
|
||||
if project_id is None:
|
||||
project_id = params.project_id
|
||||
if location is None:
|
||||
location = params.location
|
||||
# Convert to base InputParams
|
||||
params = OpenAILLMService.InputParams(
|
||||
**params.model_dump(exclude={"location", "project_id"}, exclude_unset=True)
|
||||
)
|
||||
kwargs["params"] = params
|
||||
|
||||
# Validate project_id and location parameters
|
||||
# NOTE: once we remove Vertex-spcific InputParams class, we can update
|
||||
# __init__() signature as follows:
|
||||
# - location: str = "us-east4",
|
||||
# - project_id: str,
|
||||
# But for now, we need them as-is to maintain proper backward
|
||||
# compatibility.
|
||||
if project_id is None:
|
||||
raise ValueError("project_id is required")
|
||||
if location is None:
|
||||
# If location is not provided, default to "us-east4".
|
||||
# Note: this is legacy behavior; ideally location would be
|
||||
# required.
|
||||
logger.warning("location is not provided. Defaulting to 'us-east4'.")
|
||||
location = "us-east4" # Default location if not provided
|
||||
|
||||
base_url = self._get_base_url(location, project_id)
|
||||
self._api_key = self._get_api_token(credentials, credentials_path)
|
||||
|
||||
super().__init__(
|
||||
api_key=self._api_key,
|
||||
base_url=base_url,
|
||||
model=model,
|
||||
params=params,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _get_base_url(params: InputParams) -> str:
|
||||
def _get_base_url(location: str, project_id: str) -> str:
|
||||
"""Construct the base URL for Vertex AI API."""
|
||||
# Determine the correct API host based on location
|
||||
if params.location == "global":
|
||||
if location == "global":
|
||||
api_host = "aiplatform.googleapis.com"
|
||||
else:
|
||||
api_host = f"{params.location}-aiplatform.googleapis.com"
|
||||
return (
|
||||
f"https://{api_host}/v1/"
|
||||
f"projects/{params.project_id}/locations/{params.location}/endpoints/openapi"
|
||||
)
|
||||
api_host = f"{location}-aiplatform.googleapis.com"
|
||||
return f"https://{api_host}/v1/projects/{project_id}/locations/{location}/endpoints/openapi"
|
||||
|
||||
@staticmethod
|
||||
def _get_api_token(credentials: Optional[str], credentials_path: Optional[str]) -> str:
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user