Compare commits
152 Commits
hush/reset
...
hush/usage
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cb6e86e69f | ||
|
|
fce6f55ddb | ||
|
|
d9580f72a9 | ||
|
|
0588c82bbf | ||
|
|
16e9093d5a | ||
|
|
91a5d580fd | ||
|
|
0473556992 | ||
|
|
fdaa4e476e | ||
|
|
502e7e42a7 | ||
|
|
2ab3d4fb42 | ||
|
|
55014bdd77 | ||
|
|
334796bd65 | ||
|
|
1c25b6fb72 | ||
|
|
91b29de7ca | ||
|
|
21d610cd30 | ||
|
|
f7fe673ad1 | ||
|
|
4b415721e2 | ||
|
|
8d2a98e0e7 | ||
|
|
523e890c8c | ||
|
|
3c748fe772 | ||
|
|
d293cee372 | ||
|
|
8b62a96878 | ||
|
|
0c102ce70b | ||
|
|
3894d2a4b9 | ||
|
|
1f6b61c0db | ||
|
|
8ee28b37cd | ||
|
|
e85e7e4d84 | ||
|
|
1b3afb5511 | ||
|
|
7cec013666 | ||
|
|
86127167fb | ||
|
|
9935a68018 | ||
|
|
5679dde70f | ||
|
|
d81b0f6368 | ||
|
|
9698b008da | ||
|
|
7b05c9283b | ||
|
|
303dd2ec35 | ||
|
|
aa6e81648a | ||
|
|
1a87870ef3 | ||
|
|
aac4ce2d12 | ||
|
|
2a79b2c853 | ||
|
|
15bf5b1533 | ||
|
|
cdc86db8ce | ||
|
|
9d2ad750b5 | ||
|
|
19ceb1a48f | ||
|
|
59217eae38 | ||
|
|
bea0aee835 | ||
|
|
aeace9b9be | ||
|
|
2994640f47 | ||
|
|
10069719e4 | ||
|
|
046b76df60 | ||
|
|
f2d9063984 | ||
|
|
99f008e927 | ||
|
|
2699f0c2a6 | ||
|
|
0b6dd98000 | ||
|
|
a14fb20d15 | ||
|
|
728361a6a7 | ||
|
|
106db69e8e | ||
|
|
cf90071926 | ||
|
|
deaeb75a1f | ||
|
|
a666327d70 | ||
|
|
13a0522546 | ||
|
|
7da37a0d1f | ||
|
|
7efb22a323 | ||
|
|
8084e2f909 | ||
|
|
86127c6a6e | ||
|
|
402e019ae2 | ||
|
|
f09e4e238b | ||
|
|
2921162b3b | ||
|
|
ac1582c906 | ||
|
|
e4b01a5844 | ||
|
|
fa663abbbc | ||
|
|
d19e6111c3 | ||
|
|
8a6d504a7e | ||
|
|
43915937f2 | ||
|
|
48e92a22fe | ||
|
|
566af6b0b8 | ||
|
|
12e7613d5f | ||
|
|
04a68f2c57 | ||
|
|
9b4ca12f49 | ||
|
|
453ce715a6 | ||
|
|
d87b6189ba | ||
|
|
8293347b77 | ||
|
|
c85a3f0b94 | ||
|
|
233fb25e6c | ||
|
|
080978daa6 | ||
|
|
62b7c3d3b2 | ||
|
|
4b2379cba8 | ||
|
|
92087bdfa8 | ||
|
|
617919ac09 | ||
|
|
0669daec3d | ||
|
|
7c15a8c800 | ||
|
|
066b77fba0 | ||
|
|
d9aef5f916 | ||
|
|
91ae3f8a9b | ||
|
|
36da623352 | ||
|
|
31b9087ea6 | ||
|
|
1851fed22e | ||
|
|
eddce460da | ||
|
|
da4f30cb6d | ||
|
|
250cf2d8f1 | ||
|
|
7bbdb4f991 | ||
|
|
051c4782fb | ||
|
|
b1ccec74b2 | ||
|
|
92bf0d9eda | ||
|
|
f985550441 | ||
|
|
de8ee96927 | ||
|
|
2576d0f340 | ||
|
|
f38f4711ac | ||
|
|
c2f3ddd329 | ||
|
|
73ffe96228 | ||
|
|
bd13a80da7 | ||
|
|
312959f97e | ||
|
|
fe168e3c68 | ||
|
|
28929a47f7 | ||
|
|
03f5defbc3 | ||
|
|
b216648315 | ||
|
|
084b133a01 | ||
|
|
e589876176 | ||
|
|
a826313bf9 | ||
|
|
49f44aa7c8 | ||
|
|
64ceef9cf0 | ||
|
|
cd6567c1f1 | ||
|
|
ac67ca1555 | ||
|
|
8d38994756 | ||
|
|
607e3040d4 | ||
|
|
60604a9449 | ||
|
|
4abe4a6253 | ||
|
|
4c054af17b | ||
|
|
dcba940d42 | ||
|
|
ad2adb0c58 | ||
|
|
76923010b5 | ||
|
|
1b511557b2 | ||
|
|
fdadb12933 | ||
|
|
f1bbb7ba22 | ||
|
|
c1492c5275 | ||
|
|
4ffdabcfde | ||
|
|
b489de2fc3 | ||
|
|
d9656cbb1a | ||
|
|
05fb223985 | ||
|
|
62a5f07ad2 | ||
|
|
b669e3a481 | ||
|
|
99f1041a47 | ||
|
|
37b1345bfa | ||
|
|
8994ac17eb | ||
|
|
63bc825008 | ||
|
|
e7ffde1c4c | ||
|
|
1c88565725 | ||
|
|
07a6c2fb0e | ||
|
|
e99f3bf75a | ||
|
|
f09d780413 | ||
|
|
e370d23374 | ||
|
|
b68ec14146 |
22
.github/workflows/publish.yaml
vendored
22
.github/workflows/publish.yaml
vendored
@@ -5,25 +5,25 @@ on:
|
||||
inputs:
|
||||
gitref:
|
||||
type: string
|
||||
description: "what git tag to build (e.g. v0.0.74)"
|
||||
description: 'what git tag to build (e.g. v0.0.74)'
|
||||
required: true
|
||||
|
||||
jobs:
|
||||
build:
|
||||
name: "Build and upload wheels"
|
||||
name: 'Build and upload wheels'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repo
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event.inputs.gitref }}
|
||||
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v3
|
||||
with:
|
||||
version: "latest"
|
||||
version: 'latest'
|
||||
- name: Set up Python
|
||||
run: uv python install 3.10
|
||||
run: uv python install 3.12
|
||||
- name: Install development dependencies
|
||||
run: uv sync --group dev
|
||||
- name: Build project
|
||||
@@ -35,9 +35,9 @@ jobs:
|
||||
path: ./dist
|
||||
|
||||
publish-to-pypi:
|
||||
name: "Publish to PyPI"
|
||||
name: 'Publish to PyPI'
|
||||
runs-on: ubuntu-latest
|
||||
needs: [ build ]
|
||||
needs: [build]
|
||||
environment:
|
||||
name: pypi
|
||||
url: https://pypi.org/p/pipecat-ai
|
||||
@@ -56,12 +56,12 @@ jobs:
|
||||
print-hash: true
|
||||
|
||||
publish-to-test-pypi:
|
||||
name: "Publish to Test PyPI"
|
||||
name: 'Publish to Test PyPI'
|
||||
runs-on: ubuntu-latest
|
||||
needs: [ build ]
|
||||
needs: [build]
|
||||
environment:
|
||||
name: testpypi
|
||||
url: https://pypi.org/p/pipecat-ai
|
||||
url: https://test.pypi.org/p/pipecat-ai
|
||||
permissions:
|
||||
id-token: write
|
||||
steps:
|
||||
@@ -70,7 +70,7 @@ jobs:
|
||||
with:
|
||||
name: wheels
|
||||
path: ./dist
|
||||
- name: Publish to PyPI
|
||||
- name: Publish to Test PyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
verbose: true
|
||||
|
||||
12
.github/workflows/publish_test.yaml
vendored
12
.github/workflows/publish_test.yaml
vendored
@@ -4,7 +4,7 @@ on: workflow_dispatch
|
||||
|
||||
jobs:
|
||||
build:
|
||||
name: "Build and upload wheels"
|
||||
name: 'Build and upload wheels'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repo
|
||||
@@ -15,9 +15,9 @@ jobs:
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v3
|
||||
with:
|
||||
version: "latest"
|
||||
version: 'latest'
|
||||
- name: Set up Python
|
||||
run: uv python install 3.10
|
||||
run: uv python install 3.12
|
||||
- name: Install development dependencies
|
||||
run: uv sync --group dev
|
||||
- name: Build project
|
||||
@@ -29,12 +29,12 @@ jobs:
|
||||
path: ./dist
|
||||
|
||||
publish-to-test-pypi:
|
||||
name: "Publish to Test PyPI"
|
||||
name: 'Publish to Test PyPI'
|
||||
runs-on: ubuntu-latest
|
||||
needs: [build]
|
||||
environment:
|
||||
name: testpypi
|
||||
url: https://pypi.org/p/pipecat-ai
|
||||
url: https://test.pypi.org/p/pipecat-ai
|
||||
permissions:
|
||||
id-token: write
|
||||
steps:
|
||||
@@ -43,7 +43,7 @@ jobs:
|
||||
with:
|
||||
name: wheels
|
||||
path: ./dist
|
||||
- name: Publish to PyPI
|
||||
- name: Publish to Test PyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
verbose: true
|
||||
|
||||
163
CHANGELOG.md
163
CHANGELOG.md
@@ -9,6 +9,127 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
### Added
|
||||
|
||||
- The runner `--folder` argument now supports downloading files from
|
||||
subdirectories.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue where `RimeHttpTTSService` and `PiperTTSService` could generate
|
||||
incorrectly 16-bit aligned audio frames, potentially leading to internal
|
||||
errors or static audio.
|
||||
|
||||
## [0.0.90] - 2025-10-10
|
||||
|
||||
### Added
|
||||
|
||||
- Added audio filter `KrispVivaFilter` using the Krisp VIVA SDK.
|
||||
|
||||
- Added `--folder` argument to the runner, allowing files saved in that folder
|
||||
to be downloaded from `http://HOST:PORT/file/FILE`.
|
||||
|
||||
- Added `GeminiLiveVertexLLMService`, for accessing Gemini Live via Google
|
||||
Vertex AI.
|
||||
|
||||
- Added some new configuration options to `GeminiLiveLLMService`:
|
||||
|
||||
- `thinking`
|
||||
- `enable_affective_dialog`
|
||||
- `proactivity`
|
||||
|
||||
Note that these new configuration options require using a newer model than
|
||||
the default, like "gemini-2.5-flash-native-audio-preview-09-2025". The last
|
||||
two require specifying `http_options=HttpOptions(api_version="v1alpha")`.
|
||||
|
||||
- Added `on_pipeline_error` event to `PipelineTask`. This event will get fired
|
||||
when an `ErrorFrame` is pushed (use `FrameProcessor.push_error()`).
|
||||
|
||||
```python
|
||||
@task.event_handler("on_pipeline_error")
|
||||
async def on_pipeline_error(task: PipelineTask, frame: ErrorFrame):
|
||||
...
|
||||
```
|
||||
|
||||
- Added a `service_tier` `InputParam` to the `BaseOpenAILLMService`. This
|
||||
parameter can influence the latency of the response. For example `"priority"`
|
||||
will result in faster completions, but in exchange for a higher price.
|
||||
|
||||
### Changed
|
||||
|
||||
- Updated `GeminiLiveLLMService` to use the `google-genai` library rather than
|
||||
use WebSockets directly.
|
||||
|
||||
### Deprecated
|
||||
|
||||
- `LivekitFrameSerializer` is now deprecated. Use `LiveKitTransport` instead.
|
||||
|
||||
- `pipecat.service.openai_realtime` is now deprecated, use
|
||||
`pipecat.services.openai.realtime` instead or
|
||||
`pipecat.services.azure.realtime` for Azure Realtime.
|
||||
|
||||
- `pipecat.service.aws_nova_sonic` is now deprecated, use
|
||||
`pipecat.services.aws.nova_sonic` instead.
|
||||
|
||||
- `GeminiMultimodalLiveLLMService` is now deprecated, use
|
||||
`GeminiLiveLLMService`.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed a `GoogleVertexLLMService` issue that would generate an error if no
|
||||
token information was returned.
|
||||
|
||||
- `GeminiLiveLLMService` will now end gracefully (i.e. after the bot has
|
||||
finished) upon receiving an `EndFrame`.
|
||||
|
||||
- `GeminiLiveLLMService` will try to seamlessly reconnect when it loses its
|
||||
connection.
|
||||
|
||||
## [0.0.89] - 2025-10-07
|
||||
|
||||
### Fixed
|
||||
|
||||
- Reverted a change introduced in 0.0.88 that was causing pipelines to be frozen
|
||||
when using interruption strategies and processors that block interruption
|
||||
frames (e.g. `STTMuteFilter`).
|
||||
|
||||
## [0.0.88] - 2025-10-07
|
||||
|
||||
### Added
|
||||
|
||||
- Added support for Nano Banana models to `GoogleLLMService`. For example, you
|
||||
can now use the `gemini-2.5-flash-image` model to generate images.
|
||||
|
||||
- Added `HumeTTSService` for text-to-speech synthesis using Hume AI's expressive
|
||||
voice models. Provides high-quality, emotionally expressive speech synthesis
|
||||
with support for various voice models. Includes example in
|
||||
`examples/foundational/07ad-interruptible-hume.py`. Use with:
|
||||
`uv pip install pipecat-ai[hume]`.
|
||||
|
||||
### Changed
|
||||
|
||||
- Updated default `GoogleLLMService` model to `gemini-2.5-flash`.
|
||||
|
||||
### Deprecated
|
||||
|
||||
- PlayHT is shutting down their API on December 31st, 2025. As a result,
|
||||
`PlayHTTTSService` and `PlayHTHttpTTSService` are deprecated and will be
|
||||
removed in a future version.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue with `AWSNovaSonicLLMService` where the client wouldn't
|
||||
connect due to a breaking change in the AWS dependency chain.
|
||||
|
||||
- `PermissionError` is now caught if NLTK's `punkt_tab` can't be downloaded.
|
||||
|
||||
- Fixed an issue that would cause wrong user/assistant context ordering when
|
||||
using interruption strategies.
|
||||
|
||||
- Fixed RTVI incoming message handling, broken in 0.0.87.
|
||||
|
||||
## [0.0.87] - 2025-10-02
|
||||
|
||||
### Added
|
||||
|
||||
- Added `WebsocketSTTService` base class for websocket-based STT services.
|
||||
Combines STT functionality with websocket connectivity, providing automatic
|
||||
error handling and reconnection capabilities with exponential backoff.
|
||||
@@ -28,23 +149,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
### Deprecated
|
||||
|
||||
- `DailyUpdateRemoteParticipantsFrame` is deprecated and will be removed in a
|
||||
future version. Instead, create your own custom frame and handle it in the
|
||||
`@transport.output().event_handler("on_after_push_frame")` event handler or a
|
||||
custom processor.
|
||||
|
||||
## Fixed
|
||||
|
||||
- Fixed an issue in `AWSBedrockLLMService` where timeout exceptions weren't
|
||||
being detected.
|
||||
|
||||
- Fixed a `PipelineTask` issue that could prevent the application to exit if
|
||||
`task.cancel()` was called when the task was already finished.
|
||||
|
||||
- Fixed an issue where local SmartTurn was not being ran in a separate thread.
|
||||
|
||||
### Deprecated
|
||||
|
||||
- `DailyTransportMessageFrame` and `DailyTransportMessageUrgentFrame` are
|
||||
deprecated, use `DailyOutputTransportMessageFrame` and
|
||||
`DailyOutputTransportMessageUrgentFrame` respectively instead.
|
||||
@@ -60,6 +164,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
- `InputTransportMessageUrgentFrame` is deprecated, use
|
||||
`InputTransportMessageFrame` instead.
|
||||
|
||||
- `DailyUpdateRemoteParticipantsFrame` is deprecated and will be removed in a
|
||||
future version. Instead, create your own custom frame and handle it in the
|
||||
`@transport.output().event_handler("on_after_push_frame")` event handler or a
|
||||
custom processor.
|
||||
|
||||
## Fixed
|
||||
|
||||
- Fixed an issue in `AWSBedrockLLMService` where timeout exceptions weren't
|
||||
being detected.
|
||||
|
||||
- Fixed a `PipelineTask` issue that could prevent the application to exit if
|
||||
`task.cancel()` was called when the task was already finished.
|
||||
|
||||
- Fixed an issue where local SmartTurn was not being ran in a separate thread.
|
||||
|
||||
## [0.0.86] - 2025-09-24
|
||||
|
||||
### Added
|
||||
@@ -1381,7 +1500,7 @@ quality and critical bugs impacting `ParallelPipelines` functionality.**
|
||||
- Added `session_token` parameter to `AWSNovaSonicLLMService`.
|
||||
|
||||
- Added Gemini Multimodal Live File API for uploading, fetching, listing, and
|
||||
deleting files. See `26f-gemini-multimodal-live-files-api.py` for example usage.
|
||||
deleting files. See `26f-gemini-live-files-api.py` for example usage.
|
||||
|
||||
### Changed
|
||||
|
||||
@@ -3387,7 +3506,7 @@ stt = DeepgramSTTService(..., live_options=LiveOptions(model="nova-2-general"))
|
||||
- Added the new modalities option and helper function to set Gemini output
|
||||
modalities.
|
||||
|
||||
- Added `examples/foundational/26d-gemini-multimodal-live-text.py` which is
|
||||
- Added `examples/foundational/26d-gemini-live-text.py` which is
|
||||
using Gemini as TEXT modality and using another TTS provider for TTS process.
|
||||
|
||||
### Changed
|
||||
@@ -3574,9 +3693,9 @@ stt = DeepgramSTTService(..., live_options=LiveOptions(model="nova-2-general"))
|
||||
- Added new foundational examples for `GeminiMultimodalLiveLLMService`:
|
||||
|
||||
- `26-gemini-multimodal-live.py`
|
||||
- `26a-gemini-multimodal-live-transcription.py`
|
||||
- `26b-gemini-multimodal-live-video.py`
|
||||
- `26c-gemini-multimodal-live-video.py`
|
||||
- `26a-gemini-live-transcription.py`
|
||||
- `26b-gemini-live-video.py`
|
||||
- `26c-gemini-live-video.py`
|
||||
|
||||
- Added `SimliVideoService`. This is an integration for Simli AI avatars.
|
||||
(see https://www.simli.com)
|
||||
|
||||
336
COMMUNITY_INTEGRATIONS.md
Normal file
336
COMMUNITY_INTEGRATIONS.md
Normal file
@@ -0,0 +1,336 @@
|
||||
# Community Integrations Guide
|
||||
|
||||
Pipecat welcomes community-maintained integrations! As our ecosystem grows, we've established a process for any developer to create and maintain their own service integrations while ensuring discoverability for the Pipecat community.
|
||||
|
||||
## Overview
|
||||
|
||||
**What we support:** Community-maintained integrations that live in separate repositories and are maintained by their authors.
|
||||
|
||||
**What we don't do:** The Pipecat team does not code review, test, or maintain community integrations. We provide guidance and list approved integrations for discoverability.
|
||||
|
||||
**Why this approach:** This allows the community to move quickly while keeping the Pipecat core team focused on maintaining the framework itself.
|
||||
|
||||
## Submitting your Integration
|
||||
|
||||
To be listed as an official community integration, follow these steps:
|
||||
|
||||
### Step 1: Build Your Integration
|
||||
|
||||
Create your integration following the patterns and examples shown in the "Integration Patterns and Examples" section below.
|
||||
|
||||
### Step 2: Set Up Your Repository
|
||||
|
||||
Your repository must contain these components:
|
||||
|
||||
- **Source code** - Complete implementation following Pipecat patterns
|
||||
- **Foundational example** - Single file example showing basic usage (see [Pipecat examples](https://github.com/pipecat-ai/pipecat/tree/main/examples/foundational))
|
||||
- **README.md** - Must include:
|
||||
|
||||
- Introduction and explanation of your integration
|
||||
- Installation instructions
|
||||
- Usage instructions with Pipecat Pipeline
|
||||
- How to run your example
|
||||
- Pipecat version compatibility (e.g., "Tested with Pipecat v0.0.86")
|
||||
- Company attribution: If you work for the company providing the service, please mention this in your README. This helps build confidence that the integration will be actively maintained.
|
||||
|
||||
- **LICENSE** - Permissive license (BSD-2 like Pipecat, or equivalent open source terms)
|
||||
- **Code documentation** - Source code with docstrings (we recommend following [Pipecat's docstring conventions](https://github.com/pipecat-ai/pipecat/blob/main/CONTRIBUTING.md#docstring-conventions))
|
||||
- **Changelog** - Maintain a changelog for version updates
|
||||
|
||||
### Step 3: Join Discord
|
||||
|
||||
Join our Discord: https://discord.gg/pipecat
|
||||
|
||||
### Step 4: Submit for Listing
|
||||
|
||||
Submit a pull request to add your integration to our [Community Integrations documentation page](https://docs.pipecat.ai/server/services/community-integrations).
|
||||
|
||||
**To submit:**
|
||||
|
||||
1. Fork the [Pipecat docs repository](https://github.com/pipecat-ai/docs)
|
||||
2. Edit the file `server/services/community-integrations.mdx`
|
||||
3. Add your integration to the appropriate service category table with:
|
||||
- Service name
|
||||
- Link to your repository
|
||||
- Maintainer GitHub username(s)
|
||||
4. Include a link to your demo video (approx 30-60 seconds) in your PR description showing:
|
||||
- Core functionality of your integration
|
||||
- Handling of an interruption (if applicable to service type)
|
||||
5. Submit your pull request
|
||||
|
||||
Once your PR is submitted, post in the `#community-integrations` Discord channel to let us know.
|
||||
|
||||
## Integration Patterns and Examples
|
||||
|
||||
### STT (Speech-to-Text) Services
|
||||
|
||||
#### Websocket-based Services
|
||||
|
||||
**Base class:** `STTService`
|
||||
|
||||
**Examples:**
|
||||
|
||||
- [DeepgramSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/deepgram/stt.py)
|
||||
- [SpeechmaticsSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/speechmatics/stt.py)
|
||||
|
||||
#### File-based Services
|
||||
|
||||
**Base class:** `SegmentedSTTService`
|
||||
|
||||
**Examples:**
|
||||
|
||||
- [RivaSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/riva/stt.py)
|
||||
- [FalSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/fal/stt.py)
|
||||
|
||||
#### Key requirements:
|
||||
|
||||
- STT services should push `InterimTranscriptionFrames` and `TranscriptionFrames`
|
||||
- If confidence values are available, filter for values >50% confidence
|
||||
|
||||
### LLM (Large Language Model) Services
|
||||
|
||||
#### OpenAI-Compatible Services
|
||||
|
||||
**Base class:** `OpenAILLMService`
|
||||
|
||||
**Examples:**
|
||||
|
||||
- [AzureLLMService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/azure/llm.py)
|
||||
- [GrokLLMService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/grok/llm.py) - Shows overriding the base class where needed
|
||||
|
||||
#### Non-OpenAI Compatible Services
|
||||
|
||||
**Requires:** Full implementation
|
||||
|
||||
**Examples:**
|
||||
|
||||
- [AnthropicLLMService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/anthropic/llm.py)
|
||||
- [GoogleLLMService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/google/llm.py)
|
||||
|
||||
#### Key requirements:
|
||||
|
||||
- **Frame sequence:** Output must follow this frame sequence pattern:
|
||||
|
||||
- `LLMFullResponseStartFrame` - Signals the start of an LLM response
|
||||
- `LLMTextFrame` - Contains LLM content, typically streamed as tokens
|
||||
- `LLMFullResponseEndFrame` - Signals the end of an LLM response
|
||||
|
||||
- **Context aggregation:** Implement context aggregation to collect user and assistant content:
|
||||
- Aggregators come in pairs with a `user()` instance and `assistant()` instance
|
||||
- Context must adhere to the `LLMContext` universal format
|
||||
- Aggregators should handle adding messages, function calls, and images to the context
|
||||
|
||||
### TTS (Text-to-Speech) Services
|
||||
|
||||
#### AudioContextWordTTSService
|
||||
|
||||
**Use for:** Websocket-based services supporting word/timestamp alignment
|
||||
|
||||
**Example:**
|
||||
|
||||
- [CartesiaTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/cartesia/tts.py)
|
||||
|
||||
#### InterruptibleTTSService
|
||||
|
||||
**Use for:** Websocket-based services without word/timestamp alignment, requiring disconnection on interruption
|
||||
|
||||
**Example:**
|
||||
|
||||
- [SarvamTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/sarvam/tts.py)
|
||||
|
||||
#### WordTTSService
|
||||
|
||||
**Use for:** HTTP-based services supporting word/timestamp alignment
|
||||
|
||||
**Example:**
|
||||
|
||||
- [ElevenLabsHttpTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/elevenlabs/tts.py)
|
||||
|
||||
#### TTSService
|
||||
|
||||
**Use for:** HTTP-based services without word/timestamp alignment
|
||||
|
||||
**Example:**
|
||||
|
||||
- [GoogleHttpTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/google/tts.py)
|
||||
|
||||
#### Key requirements:
|
||||
|
||||
- For websocket services, use asyncio WebSocket implementation (required for v13+ support)
|
||||
- Handle idle service timeouts with keepalives
|
||||
- TTSServices push both audio (`TTSRawAudioFrame`) and text (`TTSTextFrame`) frames
|
||||
|
||||
### Telephony Serializers
|
||||
|
||||
Pipecat supports telephony provider integration using websocket connections to exchange MediaStreams. These services use a FrameSerializer to serialize and deserialize inputs from the FastAPIWebsocketTransport.
|
||||
|
||||
**Examples:**
|
||||
|
||||
- [Twilio](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/serializers/twilio.py)
|
||||
- [Telnyx](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/serializers/telnyx.py)
|
||||
|
||||
#### Key requirements:
|
||||
|
||||
- Include hang-up functionality using the provider's native API, ideally using `aiohttp`
|
||||
- Support DTMF (dual-tone multi-frequency) events if the provider supports them:
|
||||
- Deserialize DTMF events from the provider's protocol to `InputDTMFFrame`
|
||||
- Use `KeypadEntry` enum for valid keypad entries (0-9, \*, #, A-D)
|
||||
- Handle invalid DTMF digits gracefully by returning `None`
|
||||
|
||||
### Image Generation Services
|
||||
|
||||
**Base class:** `ImageGenService`
|
||||
|
||||
**Examples:**
|
||||
|
||||
- [FalImageGenService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/fal/image.py)
|
||||
- [GoogleImageGenService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/google/image.py)
|
||||
|
||||
#### Key requirements:
|
||||
|
||||
- Must implement `run_image_gen` method returning an `AsyncGenerator`
|
||||
|
||||
### Vision Services
|
||||
|
||||
Vision services process images and provide analysis such as descriptions, object detection, or visual question answering.
|
||||
|
||||
**Base class:** `VisionService`
|
||||
|
||||
**Example:**
|
||||
|
||||
- [MoondreamVisionService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/moondream/vision.py)
|
||||
|
||||
#### Key requirements:
|
||||
|
||||
- Must implement `run_vision` method that takes an `LLMContext` and returns an `AsyncGenerator[Frame, None]`
|
||||
- The method processes the latest image in the context and yields frames with analysis results
|
||||
- Typically yields `TextFrame` objects containing descriptions or answers
|
||||
|
||||
## Implementation Guidelines
|
||||
|
||||
### Naming Conventions
|
||||
|
||||
- **STT:** `VendorSTTService`
|
||||
- **LLM:** `VendorLLMService`
|
||||
- **TTS:**
|
||||
- Websocket: `VendorTTSService`
|
||||
- HTTP: `VendorHttpTTSService`
|
||||
- **Image:** `VendorImageGenService`
|
||||
- **Vision:** `VendorVisionService`
|
||||
- **Telephony:** `VendorFrameSerializer`
|
||||
|
||||
### Metrics Support
|
||||
|
||||
Enable metrics in your service:
|
||||
|
||||
```python
|
||||
def can_generate_metrics(self) -> bool:
|
||||
"""Check if this service can generate processing metrics.
|
||||
|
||||
Returns:
|
||||
True, as this service supports metrics.
|
||||
"""
|
||||
return True
|
||||
```
|
||||
|
||||
### Dynamic Settings Updates
|
||||
|
||||
STT, LLM, and TTS services support `ServiceUpdateSettingsFrame` for dynamic configuration changes. The base STTService has an `_update_settings()` method that handles settings, and the private `_settings` `Dict` is used to store settings and provide access to the subclass.
|
||||
|
||||
```python
|
||||
async def set_language(self, language: Language):
|
||||
"""Set the recognition language and reconnect.
|
||||
|
||||
Args:
|
||||
language: The language to use for speech recognition.
|
||||
"""
|
||||
logger.info(f"Switching STT language to: [{language}]")
|
||||
self._settings["language"] = language
|
||||
await self._disconnect()
|
||||
await self._connect()
|
||||
```
|
||||
|
||||
Note that, in this example, Deepgram requires the websocket connection be disconnected and reconnected to reinitialize the service with the new value. Consider if your service requires reconnection.
|
||||
|
||||
### Sample Rate Handling
|
||||
|
||||
Sample rates are set via PipelineParams and passed to each frame processor at initialization. The pattern is to _not_ set the sample rate value in the constructor of a given service. Instead, use the `start()` method to initialize sample rates from the frame:
|
||||
|
||||
```python
|
||||
async def start(self, frame: StartFrame):
|
||||
"""Start the service."""
|
||||
await super().start(frame)
|
||||
self._settings["output_format"]["sample_rate"] = self.sample_rate
|
||||
await self._connect()
|
||||
```
|
||||
|
||||
Note that `self.sample_rate` is a `@property` set in the TTSService base class, which provides access to the private sample rate value obtained from the StartFrame.
|
||||
|
||||
### Tracing Decorators
|
||||
|
||||
Use Pipecat's tracing decorators:
|
||||
|
||||
- **STT:** `@traced_stt` - decorate a function that handles `transcript`, `is_final`, `language` as args
|
||||
- **LLM:** `@traced_llm` - decorate the `_process_context()` method
|
||||
- **TTS:** `@traced_tts` - decorate the `run_tts()` method
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Packaging and Distribution
|
||||
|
||||
- Use [uv](https://docs.astral.sh/uv/) for packaging (encouraged)
|
||||
- Consider releasing to PyPI for easier installation
|
||||
- Follow semantic versioning principles
|
||||
- Maintain a changelog
|
||||
|
||||
### HTTP Communication
|
||||
|
||||
For REST-based communication, use aiohttp. Pipecat includes this as a required dependency, so using it prevents adding an additional dependency to your integration.
|
||||
|
||||
### Error Handling
|
||||
|
||||
- Wrap API calls in appropriate try/catch blocks
|
||||
- Handle rate limits and network failures gracefully
|
||||
- Provide meaningful error messages
|
||||
- When errors occur, raise exceptions AND push `ErrorFrame`s to notify the pipeline:
|
||||
|
||||
```python
|
||||
from pipecat.frames.frames import ErrorFrame
|
||||
|
||||
try:
|
||||
# Your API call
|
||||
result = await self._make_api_call()
|
||||
except Exception as e:
|
||||
# Push error frame to pipeline
|
||||
await self.push_error(ErrorFrame(error=f"{self} error: {e}"))
|
||||
# Raise or handle as appropriate
|
||||
raise
|
||||
```
|
||||
|
||||
### Testing
|
||||
|
||||
- Your foundational example serves as a valuable integration-level test
|
||||
- Unit tests are nice to have. As the Pipecat teams provides better guidance, we will encourage unit testing more
|
||||
|
||||
## Disclaimer
|
||||
|
||||
Community integrations are community-maintained and not officially supported by the Pipecat team. Users should evaluate these integrations independently. The Pipecat team reserves the right to remove listings that become unmaintained or problematic.
|
||||
|
||||
## Staying Up to Date
|
||||
|
||||
Pipecat evolves rapidly to support the latest AI technologies and patterns. While we strive to minimize breaking changes, they do occur as the framework matures.
|
||||
|
||||
**We strongly recommend:**
|
||||
|
||||
- Join our Discord at https://discord.gg/pipecat and monitor the `#announcements` channel for release notifications
|
||||
- Follow our changelog: https://github.com/pipecat-ai/pipecat/blob/main/CHANGELOG.md
|
||||
- Test your integration against new Pipecat releases promptly
|
||||
- Update your README with the last tested Pipecat version
|
||||
|
||||
This helps ensure your integration remains compatible and your users have clear expectations about version support.
|
||||
|
||||
## Questions?
|
||||
|
||||
Join our Discord community at https://discord.gg/pipecat and post in the `#community-integrations` channel for guidance and support.
|
||||
|
||||
For additional questions, you can also reach out to us at pipecat-ai@daily.co.
|
||||
@@ -1,5 +1,9 @@
|
||||
## Contributing to Pipecat
|
||||
|
||||
**Want to add a new service integration?**
|
||||
We encourage community-maintained integrations! Please see our [Community Integration Guide](COMMUNITY_INTEGRATIONS.md) for the process and requirements.
|
||||
|
||||
**Want to contribute to Pipecat core?**
|
||||
We welcome contributions of all kinds! Your help is appreciated. Follow these steps to get involved:
|
||||
|
||||
1. **Fork this repository**: Start by forking the Pipecat Documentation repository to your GitHub account.
|
||||
|
||||
113
README.md
113
README.md
@@ -3,6 +3,7 @@
|
||||
</div></h1>
|
||||
|
||||
[](https://pypi.org/project/pipecat-ai)  [](https://codecov.io/gh/pipecat-ai/pipecat) [](https://docs.pipecat.ai) [](https://discord.gg/pipecat) [](https://deepwiki.com/pipecat-ai/pipecat)
|
||||
[](https://getmanta.ai/pipecat)
|
||||
|
||||
# 🎙️ Pipecat: Real-Time Voice & Multimodal AI Agents
|
||||
|
||||
@@ -19,10 +20,6 @@
|
||||
- **Business Agents** – customer intake, support bots, guided flows
|
||||
- **Complex Dialog Systems** – design logic with structured conversations
|
||||
|
||||
🧭 Looking to build structured conversations? Check out [Pipecat Flows](https://github.com/pipecat-ai/pipecat-flows) for managing complex conversational states and transitions.
|
||||
|
||||
🔍 Looking for help debugging your pipeline and processors? Check out [Whisker](https://github.com/pipecat-ai/whisker), a real-time Pipecat debugger.
|
||||
|
||||
## 🧠 Why Pipecat?
|
||||
|
||||
- **Voice-first**: Integrates speech recognition, text-to-speech, and conversation handling
|
||||
@@ -30,40 +27,34 @@
|
||||
- **Composable Pipelines**: Build complex behavior from modular components
|
||||
- **Real-Time**: Ultra-low latency interaction with different transports (e.g. WebSockets or WebRTC)
|
||||
|
||||
## 📱 Client SDKs
|
||||
## 🌐 Pipecat Ecosystem
|
||||
|
||||
You can connect to Pipecat from any platform using our official SDKs:
|
||||
### 📱 Client SDKs
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<td>
|
||||
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/javascript/javascript-original.svg" width="40" height="40" alt="JavaScript"/>
|
||||
<a href="https://docs.pipecat.ai/client/js/introduction">JavaScript</a>
|
||||
</td>
|
||||
<td>
|
||||
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/react/react-original.svg" width="40" height="40" alt="React"/>
|
||||
<a href="https://docs.pipecat.ai/client/react/introduction">React</a>
|
||||
</td>
|
||||
<td>
|
||||
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/react/react-original.svg" width="40" height="40" alt="React Native"/>
|
||||
<a href="https://docs.pipecat.ai/client/react-native/introduction">React Native</a>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/swift/swift-original.svg" width="40" height="40" alt="Swift"/>
|
||||
<a href="https://docs.pipecat.ai/client/ios/introduction">Swift</a>
|
||||
</td>
|
||||
<td>
|
||||
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/kotlin/kotlin-original.svg" width="40" height="40" alt="Kotlin"/>
|
||||
<a href="https://docs.pipecat.ai/client/android/introduction">Kotlin</a>
|
||||
</td>
|
||||
<td>
|
||||
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/cplusplus/cplusplus-original.svg" width="40" height="40" alt="JavaScript"/>
|
||||
<a href="https://docs.pipecat.ai/client/c++/introduction">C++</a>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
Building client applications? You can connect to Pipecat from any platform using our official SDKs:
|
||||
|
||||
<a href="https://docs.pipecat.ai/client/js/introduction">JavaScript</a> | <a href="https://docs.pipecat.ai/client/react/introduction">React</a> | <a href="https://docs.pipecat.ai/client/react-native/introduction">React Native</a> |
|
||||
<a href="https://docs.pipecat.ai/client/ios/introduction">Swift</a> | <a href="https://docs.pipecat.ai/client/android/introduction">Kotlin</a> | <a href="https://docs.pipecat.ai/client/c++/introduction">C++</a> | <a href="https://github.com/pipecat-ai/pipecat-esp32">ESP32</a>
|
||||
|
||||
### 🧭 Structured conversations
|
||||
|
||||
Looking to build structured conversations? Check out [Pipecat Flows](https://github.com/pipecat-ai/pipecat-flows) for managing complex conversational states and transitions.
|
||||
|
||||
### 🪄 Beautiful UIs
|
||||
|
||||
Want to build beautiful and engaging experiences? Checkout the [Voice UI Kit](https://github.com/pipecat-ai/voice-ui-kit), a collection of components, hooks and templates for building voice AI applications quickly.
|
||||
|
||||
### 🔍 Debugging
|
||||
|
||||
Looking for help debugging your pipeline and processors? Check out [Whisker](https://github.com/pipecat-ai/whisker), a real-time Pipecat debugger.
|
||||
|
||||
### 🖥️ Terminal
|
||||
|
||||
Love terminal applications? Check out [Tail](https://github.com/pipecat-ai/tail), a terminal dashboard for Pipecat.
|
||||
|
||||
### 📺️ Pipecat TV Channel
|
||||
|
||||
Catch new features, interviews, and how-tos on our [Pipecat TV](https://www.youtube.com/playlist?list=PLzU2zoMTQIHjqC3v4q2XVSR3hGSzwKFwH) channel.
|
||||
|
||||
## 🎬 See it in action
|
||||
|
||||
@@ -81,7 +72,7 @@ You can connect to Pipecat from any platform using our official SDKs:
|
||||
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
|
||||
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) |
|
||||
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
|
||||
| Serializers | [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx) |
|
||||
@@ -184,54 +175,6 @@ Run a specific test suite:
|
||||
uv run pytest tests/test_name.py
|
||||
```
|
||||
|
||||
### Setting up your editor
|
||||
|
||||
This project uses strict [PEP 8](https://peps.python.org/pep-0008/) formatting via [Ruff](https://github.com/astral-sh/ruff).
|
||||
|
||||
#### Emacs
|
||||
|
||||
You can use [use-package](https://github.com/jwiegley/use-package) to install [emacs-lazy-ruff](https://github.com/christophermadsen/emacs-lazy-ruff) package and configure `ruff` arguments:
|
||||
|
||||
```elisp
|
||||
(use-package lazy-ruff
|
||||
:ensure t
|
||||
:hook ((python-mode . lazy-ruff-mode))
|
||||
:config
|
||||
(setq lazy-ruff-format-command "ruff format")
|
||||
(setq lazy-ruff-check-command "ruff check --select I"))
|
||||
```
|
||||
|
||||
`ruff` was installed in the `venv` environment described before, so you should be able to use [pyvenv-auto](https://github.com/ryotaro612/pyvenv-auto) to automatically load that environment inside Emacs.
|
||||
|
||||
```elisp
|
||||
(use-package pyvenv-auto
|
||||
:ensure t
|
||||
:defer t
|
||||
:hook ((python-mode . pyvenv-auto-run)))
|
||||
```
|
||||
|
||||
#### Visual Studio Code
|
||||
|
||||
Install the
|
||||
[Ruff](https://marketplace.visualstudio.com/items?itemName=charliermarsh.ruff) extension. Then edit the user settings (_Ctrl-Shift-P_ `Open User Settings (JSON)`) and set it as the default Python formatter, and enable formatting on save:
|
||||
|
||||
```json
|
||||
"[python]": {
|
||||
"editor.defaultFormatter": "charliermarsh.ruff",
|
||||
"editor.formatOnSave": true
|
||||
}
|
||||
```
|
||||
|
||||
#### PyCharm
|
||||
|
||||
`ruff` was installed in the `venv` environment described before, now to enable autoformatting on save, go to `File` -> `Settings` -> `Tools` -> `File Watchers` and add a new watcher with the following settings:
|
||||
|
||||
1. **Name**: `Ruff formatter`
|
||||
2. **File type**: `Python`
|
||||
3. **Working directory**: `$ContentRoot$`
|
||||
4. **Arguments**: `format $FilePath$`
|
||||
5. **Program**: `$PyInterpreterDirectory$/ruff`
|
||||
|
||||
## 🤝 Contributing
|
||||
|
||||
We welcome contributions from the community! Whether you're fixing bugs, improving documentation, or adding new features, here's how you can help:
|
||||
|
||||
5
SECURITY.md
Normal file
5
SECURITY.md
Normal file
@@ -0,0 +1,5 @@
|
||||
# Security Policy
|
||||
|
||||
## Reporting a Vulnerability
|
||||
|
||||
Please email `disclosures@daily.co`.
|
||||
@@ -50,6 +50,7 @@ autodoc_mock_imports = [
|
||||
# Krisp - has build issues on some platforms
|
||||
"pipecat_ai_krisp",
|
||||
"krisp",
|
||||
"krisp_audio",
|
||||
# System-specific GUI libraries
|
||||
"_tkinter",
|
||||
"tkinter",
|
||||
|
||||
12
env.example
12
env.example
@@ -58,6 +58,9 @@ GOOGLE_CLOUD_PROJECT_ID=...
|
||||
GOOGLE_TEST_CREDENTIALS=...
|
||||
GOOGLE_VERTEX_TEST_CREDENTIALS=...
|
||||
|
||||
# Hume
|
||||
HUME_API_KEY=...
|
||||
|
||||
# LMNT
|
||||
LMNT_API_KEY=...
|
||||
LMNT_VOICE_ID=...
|
||||
@@ -87,6 +90,9 @@ SIMLI_FACE_ID=...
|
||||
# Krisp
|
||||
KRISP_MODEL_PATH=...
|
||||
|
||||
# Krisp Viva
|
||||
KRISP_VIVA_MODEL_PATH=...
|
||||
|
||||
# DeepSeek
|
||||
DEEPSEEK_API_KEY=...
|
||||
|
||||
@@ -155,3 +161,9 @@ NVIDIA_API_KEY=...
|
||||
|
||||
# Qwen
|
||||
QWEN_API_KEY=...
|
||||
|
||||
# WhatsApp
|
||||
WHATSAPP_TOKEN=
|
||||
WHATSAPP_WEBHOOK_VERIFICATION_TOKEN=
|
||||
WHATSAPP_PHONE_NUMBER_ID=
|
||||
WHATSAPP_APP_SECRET=
|
||||
138
examples/foundational/07ae-interruptible-hume.py
Normal file
138
examples/foundational/07ae-interruptible-hume.py
Normal file
@@ -0,0 +1,138 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frameworks.rtvi import RTVIConfig, RTVIObserver, RTVIProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.hume.tts import HUME_SAMPLE_RATE, HumeTTSService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = HumeTTSService(
|
||||
api_key=os.getenv("HUME_API_KEY"),
|
||||
# Replace with your Hume voice ID
|
||||
voice_id="f898a92e-685f-43fa-985b-a46920f0650b",
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
rtvi = RTVIProcessor(config=RTVIConfig(config=[]))
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
rtvi,
|
||||
stt,
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
audio_out_sample_rate=HUME_SAMPLE_RATE,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
observers=[RTVIObserver(rtvi)],
|
||||
)
|
||||
|
||||
@rtvi.event_handler("on_client_ready")
|
||||
async def on_client_ready(rtvi):
|
||||
await rtvi.set_bot_ready()
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -23,7 +23,6 @@ from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.elevenlabs.stt import ElevenLabsSTTService
|
||||
from pipecat.services.elevenlabs.tts import ElevenLabsHttpTTSService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
|
||||
151
examples/foundational/07n-interruptible-gemini-image.py
Normal file
151
examples/foundational/07n-interruptible-gemini-image.py
Normal file
@@ -0,0 +1,151 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""
|
||||
A conversational AI bot using Gemini for both LLM, STT and TTS.
|
||||
|
||||
This example demonstrates how to use Gemini's image generation capabilities.
|
||||
|
||||
Features showcased:
|
||||
- Gemini LLM for conversation and image generation
|
||||
- Google TTS and STT
|
||||
|
||||
Run with:
|
||||
python examples/foundational/07n-interruptible-gemini-image.py
|
||||
|
||||
Make sure to set your environment variables:
|
||||
export GOOGLE_API_KEY=your_api_key_here
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.google.llm import GoogleLLMService
|
||||
from pipecat.services.google.stt import GoogleSTTService
|
||||
from pipecat.services.google.tts import GoogleTTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_out_enabled=True,
|
||||
video_out_width=1024,
|
||||
video_out_height=1024,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_out_enabled=True,
|
||||
video_out_width=1024,
|
||||
video_out_height=1024,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = GoogleSTTService(
|
||||
params=GoogleSTTService.InputParams(languages=Language.EN_US),
|
||||
credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
|
||||
)
|
||||
|
||||
tts = GoogleTTSService(
|
||||
voice_id="en-US-Chirp3-HD-Charon",
|
||||
params=GoogleTTSService.InputParams(language=Language.EN_US),
|
||||
credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
|
||||
)
|
||||
|
||||
llm = GoogleLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
model="gemini-2.5-flash-image",
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # Gemini TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation with a styled introduction
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
129
examples/foundational/07p-interruptible-krisp-viva.py
Normal file
129
examples/foundational/07p-interruptible-krisp-viva.py
Normal file
@@ -0,0 +1,129 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.filters.krisp_viva_filter import KrispVivaFilter
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.deepgram.tts import DeepgramTTSService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
audio_in_filter=KrispVivaFilter(),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
audio_in_filter=KrispVivaFilter(),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
audio_in_filter=KrispVivaFilter(),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-helios-en")
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -76,9 +76,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
llm = GoogleVertexLLMService(
|
||||
credentials=os.getenv("GOOGLE_VERTEX_TEST_CREDENTIALS"),
|
||||
params=GoogleVertexLLMService.InputParams(
|
||||
project_id=os.getenv("GOOGLE_CLOUD_PROJECT_ID"),
|
||||
),
|
||||
project_id=os.getenv("GOOGLE_CLOUD_PROJECT_ID"),
|
||||
location=os.getenv("GOOGLE_CLOUD_LOCATION"),
|
||||
)
|
||||
# You can aslo register a function_name of None to get all functions
|
||||
# sent to the same callback with an additional function_name parameter.
|
||||
|
||||
156
examples/foundational/18-openai-realtime-usage.py
Normal file
156
examples/foundational/18-openai-realtime-usage.py
Normal file
@@ -0,0 +1,156 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Example: Print OpenAI Realtime API Token Usage Statistics
|
||||
|
||||
This example demonstrates how to access and print token usage statistics
|
||||
from the OpenAI Realtime API, including detailed breakdowns of input/output
|
||||
tokens, cached tokens, and audio/text token usage.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
# We store functions so objects don't get instantiated until the desired
|
||||
# transport gets selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
"""Main function demonstrating usage statistics tracking."""
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
# Initialize the OpenAI Realtime service
|
||||
llm = OpenAIRealtimeLLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY") or "",
|
||||
model="gpt-4o-realtime-preview-2024-12-17",
|
||||
)
|
||||
|
||||
# To access usage statistics, we wrap the internal response handler
|
||||
# This is the cleanest way to intercept usage data from the realtime API
|
||||
original_handler = llm._handle_evt_response_done
|
||||
|
||||
async def custom_response_done_handler(evt):
|
||||
"""Custom handler that prints usage stats before calling original handler."""
|
||||
# Print usage statistics if available
|
||||
if evt.response.usage:
|
||||
usage = evt.response.usage
|
||||
|
||||
logger.info("\n" + "=" * 50)
|
||||
logger.info("📊 TOKEN USAGE STATISTICS")
|
||||
logger.info("=" * 50)
|
||||
logger.info(f"Total tokens: {usage.total_tokens}")
|
||||
logger.info(f"Input tokens: {usage.input_tokens}")
|
||||
logger.info(f"Output tokens: {usage.output_tokens}")
|
||||
|
||||
# Input token details
|
||||
if usage.input_token_details:
|
||||
logger.info(f"\n📥 Input token breakdown:")
|
||||
logger.info(f" • Cached tokens: {usage.input_token_details.cached_tokens}")
|
||||
logger.info(f" • Text tokens: {usage.input_token_details.text_tokens}")
|
||||
logger.info(f" • Audio tokens: {usage.input_token_details.audio_tokens}")
|
||||
|
||||
# Cached token details if available
|
||||
if usage.input_token_details.cached_tokens_details:
|
||||
logger.info(
|
||||
f" • Cached text tokens: {usage.input_token_details.cached_tokens_details.text_tokens}"
|
||||
)
|
||||
logger.info(
|
||||
f" • Cached audio tokens: {usage.input_token_details.cached_tokens_details.audio_tokens}"
|
||||
)
|
||||
|
||||
# Output token details
|
||||
if usage.output_token_details:
|
||||
logger.info(f"\n📤 Output token breakdown:")
|
||||
logger.info(f" • Text tokens: {usage.output_token_details.text_tokens}")
|
||||
logger.info(f" • Audio tokens: {usage.output_token_details.audio_tokens}")
|
||||
|
||||
logger.info("=" * 50 + "\n")
|
||||
|
||||
# Call the original handler to maintain normal functionality
|
||||
await original_handler(evt)
|
||||
|
||||
# Replace the handler with our custom one
|
||||
llm._handle_evt_response_done = custom_response_done_handler
|
||||
|
||||
# Create pipeline
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
llm,
|
||||
transport.output(),
|
||||
]
|
||||
)
|
||||
|
||||
# Create task
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
allow_interruptions=True,
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info("Client connected")
|
||||
logger.info("🎤 Speak into your microphone to interact with the assistant")
|
||||
logger.info("📊 Usage statistics will be printed after each response")
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info("Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -24,14 +24,15 @@ from pipecat.processors.transcript_processor import TranscriptProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.openai_realtime import (
|
||||
from pipecat.services.openai.realtime.events import (
|
||||
AudioConfiguration,
|
||||
AudioInput,
|
||||
InputAudioNoiseReduction,
|
||||
InputAudioTranscription,
|
||||
OpenAIRealtimeLLMService,
|
||||
SemanticTurnDetection,
|
||||
SessionProperties,
|
||||
)
|
||||
from pipecat.services.openai_realtime.events import AudioConfiguration, AudioInput
|
||||
from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
@@ -21,13 +21,14 @@ from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.azure.realtime.llm import AzureRealtimeLLMService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.openai_realtime import (
|
||||
AzureRealtimeLLMService,
|
||||
from pipecat.services.openai.realtime.events import (
|
||||
AudioConfiguration,
|
||||
AudioInput,
|
||||
InputAudioTranscription,
|
||||
SessionProperties,
|
||||
)
|
||||
from pipecat.services.openai_realtime.events import AudioConfiguration, AudioInput
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
@@ -22,16 +22,17 @@ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.transcript_processor import TranscriptProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.openai_realtime import (
|
||||
from pipecat.services.openai.realtime.events import (
|
||||
AudioConfiguration,
|
||||
AudioInput,
|
||||
InputAudioNoiseReduction,
|
||||
InputAudioTranscription,
|
||||
OpenAIRealtimeLLMService,
|
||||
SemanticTurnDetection,
|
||||
SessionProperties,
|
||||
)
|
||||
from pipecat.services.openai_realtime.events import AudioConfiguration, AudioInput
|
||||
from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
@@ -25,13 +25,14 @@ from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.openai_realtime import (
|
||||
from pipecat.services.openai.realtime.events import (
|
||||
AudioConfiguration,
|
||||
AudioInput,
|
||||
InputAudioTranscription,
|
||||
OpenAIRealtimeLLMService,
|
||||
SessionProperties,
|
||||
TurnDetection,
|
||||
)
|
||||
from pipecat.services.openai_realtime.events import AudioConfiguration, AudioInput
|
||||
from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
@@ -23,7 +23,7 @@ from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.aws_nova_sonic.aws import AWSNovaSonicLLMService
|
||||
from pipecat.services.aws.nova_sonic.llm import AWSNovaSonicLLMService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
|
||||
@@ -17,7 +17,7 @@ from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
@@ -65,7 +65,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
Respond to what the user said in a creative and helpful way.
|
||||
"""
|
||||
|
||||
llm = GeminiMultimodalLiveLLMService(
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
system_instruction=system_instruction,
|
||||
voice_id="Puck", # Aoede, Charon, Fenrir, Kore, Puck
|
||||
@@ -20,7 +20,7 @@ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.transcript_processor import TranscriptProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
@@ -65,7 +65,7 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
llm = GeminiMultimodalLiveLLMService(
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
voice_id="Aoede", # Puck, Charon, Kore, Fenrir, Aoede
|
||||
# system_instruction="Talk like a pirate."
|
||||
@@ -22,7 +22,7 @@ from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
@@ -122,12 +122,15 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
required=["location"],
|
||||
)
|
||||
search_tool = {"google_search": {}}
|
||||
# KNOWN ISSUE: If using GeminiVertexLiveLLMService, it appears
|
||||
# you cannot use the "google_search" tool alongside other tools.
|
||||
# See https://github.com/googleapis/python-genai/issues/941.
|
||||
tools = ToolsSchema(
|
||||
standard_tools=[weather_function, restaurant_function],
|
||||
custom_tools={AdapterType.GEMINI: [search_tool]},
|
||||
)
|
||||
|
||||
llm = GeminiMultimodalLiveLLMService(
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
system_instruction=system_instruction,
|
||||
tools=tools,
|
||||
@@ -24,7 +24,7 @@ from pipecat.runner.utils import (
|
||||
maybe_capture_participant_camera,
|
||||
maybe_capture_participant_screen,
|
||||
)
|
||||
from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
|
||||
@@ -58,7 +58,7 @@ transport_params = {
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
llm = GeminiMultimodalLiveLLMService(
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
voice_id="Aoede", # Puck, Charon, Kore, Fenrir, Aoede
|
||||
# system_instruction="Talk like a pirate."
|
||||
@@ -20,9 +20,9 @@ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.gemini_multimodal_live.gemini import (
|
||||
GeminiMultimodalLiveLLMService,
|
||||
GeminiMultimodalModalities,
|
||||
from pipecat.services.google.gemini_live.llm import (
|
||||
GeminiLiveLLMService,
|
||||
GeminiModalities,
|
||||
InputParams,
|
||||
)
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
@@ -80,11 +80,15 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
llm = GeminiMultimodalLiveLLMService(
|
||||
# KNOWN ISSUE: If using GeminiLiveVertexLLMService, you cannot specify a
|
||||
# modality other than AUDIO (at least not if using the service's default
|
||||
# model, which is a native audio model:
|
||||
# https://cloud.google.com/vertex-ai/generative-ai/docs/live-api/tools#native-audio).
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
system_instruction=SYSTEM_INSTRUCTION,
|
||||
tools=[{"google_search": {}}, {"code_execution": {}}],
|
||||
params=InputParams(modalities=GeminiMultimodalModalities.TEXT),
|
||||
params=InputParams(modalities=GeminiModalities.TEXT),
|
||||
)
|
||||
|
||||
# Optionally, you can set the response modalities via a function
|
||||
@@ -19,7 +19,7 @@ from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
@@ -83,7 +83,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
# Initialize the Gemini Multimodal Live model
|
||||
llm = GeminiMultimodalLiveLLMService(
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
voice_id="Puck", # Aoede, Charon, Fenrir, Kore, Puck
|
||||
system_instruction=system_instruction,
|
||||
@@ -19,9 +19,7 @@ from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.gemini_multimodal_live.gemini import (
|
||||
GeminiMultimodalLiveLLMService,
|
||||
)
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
@@ -110,7 +108,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
"""
|
||||
|
||||
# Initialize Gemini service with File API support
|
||||
llm = GeminiMultimodalLiveLLMService(
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
system_instruction=system_instruction,
|
||||
voice_id="Charon", # Aoede, Charon, Fenrir, Kore, Puck
|
||||
@@ -9,13 +9,13 @@ from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import Frame, LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
|
||||
from pipecat.services.google.frames import LLMSearchResponseFrame
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
@@ -105,7 +105,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
custom_tools={AdapterType.GEMINI: [{"google_search": {}}, {"code_execution": {}}]},
|
||||
)
|
||||
|
||||
llm = GeminiMultimodalLiveLLMService(
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
system_instruction=SYSTEM_INSTRUCTION,
|
||||
voice_id="Charon", # Aoede, Charon, Fenrir, Kore, Puck
|
||||
191
examples/foundational/26h-gemini-live-vertex-function-calling.py
Normal file
191
examples/foundational/26h-gemini-live-vertex-function-calling.py
Normal file
@@ -0,0 +1,191 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from google.genai.types import HttpOptions
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import AdapterType, ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from pipecat.services.google.gemini_live.llm_vertex import GeminiLiveVertexLLMService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
async def fetch_weather_from_api(params: FunctionCallParams):
|
||||
temperature = 75 if params.arguments["format"] == "fahrenheit" else 24
|
||||
await params.result_callback(
|
||||
{
|
||||
"conditions": "nice",
|
||||
"temperature": temperature,
|
||||
"format": params.arguments["format"],
|
||||
"timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
async def fetch_restaurant_recommendation(params: FunctionCallParams):
|
||||
await params.result_callback({"name": "The Golden Dragon"})
|
||||
|
||||
|
||||
system_instruction = """
|
||||
You are a helpful assistant who can answer questions and use tools.
|
||||
|
||||
You have three tools available to you:
|
||||
1. get_current_weather: Use this tool to get the current weather in a specific location.
|
||||
2. get_restaurant_recommendation: Use this tool to get a restaurant recommendation in a specific location.
|
||||
"""
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
# set stop_secs to something roughly similar to the internal setting
|
||||
# of the Multimodal Live api, just to align events. This doesn't really
|
||||
# matter because we can only use the Multimodal Live API's phrase
|
||||
# endpointing, for now.
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
# set stop_secs to something roughly similar to the internal setting
|
||||
# of the Multimodal Live api, just to align events. This doesn't really
|
||||
# matter because we can only use the Multimodal Live API's phrase
|
||||
# endpointing, for now.
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
# set stop_secs to something roughly similar to the internal setting
|
||||
# of the Multimodal Live api, just to align events. This doesn't really
|
||||
# matter because we can only use the Multimodal Live API's phrase
|
||||
# endpointing, for now.
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
weather_function = FunctionSchema(
|
||||
name="get_current_weather",
|
||||
description="Get the current weather",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
"description": "The temperature unit to use. Infer this from the user's location.",
|
||||
},
|
||||
},
|
||||
required=["location", "format"],
|
||||
)
|
||||
restaurant_function = FunctionSchema(
|
||||
name="get_restaurant_recommendation",
|
||||
description="Get a restaurant recommendation",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
},
|
||||
required=["location"],
|
||||
)
|
||||
# KNOWN ISSUE: If using GeminiVertexLiveLLMService, it appears
|
||||
# you cannot use the "google_search" tool alongside other tools.
|
||||
# See https://github.com/googleapis/python-genai/issues/941.
|
||||
tools = ToolsSchema(standard_tools=[weather_function, restaurant_function])
|
||||
|
||||
llm = GeminiLiveVertexLLMService(
|
||||
credentials=os.getenv("GOOGLE_VERTEX_TEST_CREDENTIALS"),
|
||||
project_id=os.getenv("GOOGLE_CLOUD_PROJECT_ID"),
|
||||
location=os.getenv("GOOGLE_CLOUD_LOCATION"),
|
||||
system_instruction=system_instruction,
|
||||
voice_id="Puck", # Aoede, Charon, Fenrir, Kore, Puck
|
||||
tools=tools,
|
||||
)
|
||||
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
|
||||
|
||||
context = OpenAILLMContext(
|
||||
[{"role": "user", "content": "Say hello."}],
|
||||
)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
context_aggregator.user(),
|
||||
llm,
|
||||
transport.output(),
|
||||
context_aggregator.assistant(),
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
204
examples/foundational/26i-gemini-live-graceful-end.py
Normal file
204
examples/foundational/26i-gemini-live-graceful-end.py
Normal file
@@ -0,0 +1,204 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import AdapterType, ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import EndTaskFrame, LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
async def fetch_weather_from_api(params: FunctionCallParams):
|
||||
temperature = 75 if params.arguments["format"] == "fahrenheit" else 24
|
||||
await params.result_callback(
|
||||
{
|
||||
"conditions": "nice",
|
||||
"temperature": temperature,
|
||||
"format": params.arguments["format"],
|
||||
"timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
async def fetch_restaurant_recommendation(params: FunctionCallParams):
|
||||
await params.result_callback({"name": "The Golden Dragon"})
|
||||
|
||||
|
||||
async def end_conversation(params: FunctionCallParams):
|
||||
await params.result_callback({"success": True})
|
||||
await params.llm.push_frame(EndTaskFrame(), FrameDirection.UPSTREAM)
|
||||
|
||||
|
||||
system_instruction = """
|
||||
You are a helpful assistant who can answer questions and use tools.
|
||||
|
||||
You have three tools available to you:
|
||||
1. get_current_weather: Use this tool to get the current weather in a specific location.
|
||||
2. get_restaurant_recommendation: Use this tool to get a restaurant recommendation in a specific location.
|
||||
3. end_conversation: Use this tool to gracefully end the conversation.
|
||||
|
||||
After you've responded to the user three times, do two things, in order:
|
||||
1. Politely let them know that that's all the time you have today and say goodbye.
|
||||
2. Call the end_conversation tool to gracefully end the conversation.
|
||||
"""
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
# set stop_secs to something roughly similar to the internal setting
|
||||
# of the Multimodal Live api, just to align events. This doesn't really
|
||||
# matter because we can only use the Multimodal Live API's phrase
|
||||
# endpointing, for now.
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
# set stop_secs to something roughly similar to the internal setting
|
||||
# of the Multimodal Live api, just to align events. This doesn't really
|
||||
# matter because we can only use the Multimodal Live API's phrase
|
||||
# endpointing, for now.
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
# set stop_secs to something roughly similar to the internal setting
|
||||
# of the Multimodal Live api, just to align events. This doesn't really
|
||||
# matter because we can only use the Multimodal Live API's phrase
|
||||
# endpointing, for now.
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
weather_function = FunctionSchema(
|
||||
name="get_current_weather",
|
||||
description="Get the current weather",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
"description": "The temperature unit to use. Infer this from the user's location.",
|
||||
},
|
||||
},
|
||||
required=["location", "format"],
|
||||
)
|
||||
restaurant_function = FunctionSchema(
|
||||
name="get_restaurant_recommendation",
|
||||
description="Get a restaurant recommendation",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
},
|
||||
required=["location"],
|
||||
)
|
||||
end_conversation_function = FunctionSchema(
|
||||
name="end_conversation",
|
||||
description="Gracefully end the conversation",
|
||||
properties={},
|
||||
required=[],
|
||||
)
|
||||
search_tool = {"google_search": {}}
|
||||
tools = ToolsSchema(
|
||||
standard_tools=[weather_function, restaurant_function, end_conversation_function],
|
||||
custom_tools={AdapterType.GEMINI: [search_tool]},
|
||||
)
|
||||
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
system_instruction=system_instruction,
|
||||
tools=tools,
|
||||
)
|
||||
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
|
||||
llm.register_function("end_conversation", end_conversation)
|
||||
|
||||
context = OpenAILLMContext(
|
||||
[{"role": "user", "content": "Say hello."}],
|
||||
)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
context_aggregator.user(),
|
||||
llm,
|
||||
transport.output(),
|
||||
context_aggregator.assistant(),
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -21,7 +21,7 @@ from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.aws_nova_sonic import AWSNovaSonicLLMService
|
||||
from pipecat.services.aws.nova_sonic.llm import AWSNovaSonicLLMService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
|
||||
@@ -20,7 +20,7 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.processors.frameworks.rtvi import RTVIObserver, RTVIProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.gemini_multimodal_live import GeminiMultimodalLiveLLMService
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from pipecat.transports.base_transport import TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams, DailyTransport
|
||||
|
||||
@@ -94,7 +94,7 @@ Respond to what the user said in a creative and helpful way. Keep your responses
|
||||
|
||||
|
||||
async def run_bot(pipecat_transport):
|
||||
llm = GeminiMultimodalLiveLLMService(
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
voice_id="Puck", # Aoede, Charon, Fenrir, Kore, Puck
|
||||
transcribe_user_audio=True,
|
||||
|
||||
@@ -105,7 +105,7 @@ uv run 07-interruptible.py -t twilio -x NGROK_HOST_NAME
|
||||
### Vision & Multimodal
|
||||
|
||||
- **[12a-describe-video-gemini-flash.py](./12a-describe-video-gemini-flash.py)**: Bot describes user's video (Video input, Multimodal LLMs)
|
||||
- **[26c-gemini-multimodal-live-video.py](./26c-gemini-multimodal-live-video.py)**: Gemini with video input (Streaming video, Function calls)
|
||||
- **[26c-gemini-live-video.py](./26c-gemini-live-video.py)**: Gemini with video input (Streaming video, Function calls)
|
||||
|
||||
### Voice & Language
|
||||
|
||||
|
||||
@@ -50,7 +50,7 @@ anthropic = [ "anthropic~=0.49.0" ]
|
||||
assemblyai = [ "pipecat-ai[websockets-base]" ]
|
||||
asyncai = [ "pipecat-ai[websockets-base]" ]
|
||||
aws = [ "aioboto3~=15.0.0", "pipecat-ai[websockets-base]" ]
|
||||
aws-nova-sonic = [ "aws_sdk_bedrock_runtime~=0.0.2; python_version>='3.12'" ]
|
||||
aws-nova-sonic = [ "aws_sdk_bedrock_runtime~=0.1.0; python_version>='3.12'" ]
|
||||
azure = [ "azure-cognitiveservices-speech~=1.42.0"]
|
||||
cartesia = [ "cartesia~=2.0.3", "pipecat-ai[websockets-base]" ]
|
||||
cerebras = []
|
||||
@@ -62,11 +62,12 @@ fal = [ "fal-client~=0.5.9" ]
|
||||
fireworks = []
|
||||
fish = [ "ormsgpack~=1.7.0", "pipecat-ai[websockets-base]" ]
|
||||
gladia = [ "pipecat-ai[websockets-base]" ]
|
||||
google = [ "google-cloud-speech~=2.32.0", "google-cloud-texttospeech~=2.26.0", "google-genai~=1.24.0", "pipecat-ai[websockets-base]" ]
|
||||
google = [ "google-cloud-speech>=2.33.0,<3", "google-cloud-texttospeech>=2.31.0,<3", "google-genai>=1.41.0,<2", "pipecat-ai[websockets-base]" ]
|
||||
grok = []
|
||||
groq = [ "groq~=0.23.0" ]
|
||||
gstreamer = [ "pygobject~=3.50.0" ]
|
||||
heygen = [ "livekit>=1.0.13", "pipecat-ai[websockets-base]" ]
|
||||
hume = [ "hume>=0.11.2" ]
|
||||
inworld = []
|
||||
krisp = [ "pipecat-ai-krisp~=0.4.0" ]
|
||||
koala = [ "pvkoala~=2.0.3" ]
|
||||
|
||||
@@ -75,8 +75,6 @@ TESTS_07 = [
|
||||
EVAL_SIMPLE_MATH,
|
||||
BOT_SPEAKS_FIRST,
|
||||
),
|
||||
("07e-interruptible-playht.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07e-interruptible-playht-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07f-interruptible-azure.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07g-interruptible-openai.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07h-interruptible-openpipe.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
@@ -103,6 +101,7 @@ TESTS_07 = [
|
||||
("07w-interruptible-fal.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07y-interruptible-minimax.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07z-interruptible-sarvam.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07ae-interruptible-hume.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
# Needs a local XTTS docker instance running.
|
||||
# ("07i-interruptible-xtts.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
# Needs a Krisp license.
|
||||
@@ -148,7 +147,10 @@ TESTS_15 = [
|
||||
]
|
||||
|
||||
TESTS_19 = [
|
||||
("19-openai-realtime.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("19-openai-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
# OpenAI Realtime not released on Azure yet
|
||||
# ("19a-azure-realtime.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("19a-azure-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("19b-openai-realtime-text.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("19b-openai-realtime-beta-text.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
@@ -161,18 +163,18 @@ TESTS_21 = [
|
||||
TESTS_26 = [
|
||||
("26-gemini-multimodal-live.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
(
|
||||
"26a-gemini-multimodal-live-transcription.py",
|
||||
"26a-gemini-live-transcription.py",
|
||||
PROMPT_SIMPLE_MATH,
|
||||
EVAL_SIMPLE_MATH,
|
||||
BOT_SPEAKS_FIRST,
|
||||
),
|
||||
(
|
||||
"26b-gemini-multimodal-live-function-calling.py",
|
||||
"26b-gemini-live-function-calling.py",
|
||||
PROMPT_WEATHER,
|
||||
EVAL_WEATHER,
|
||||
BOT_SPEAKS_FIRST,
|
||||
),
|
||||
("26c-gemini-multimodal-live-video.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("26c-gemini-live-video.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
(
|
||||
"26e-gemini-multimodal-google-search.py",
|
||||
PROMPT_ONLINE_SEARCH,
|
||||
@@ -180,7 +182,13 @@ TESTS_26 = [
|
||||
BOT_SPEAKS_FIRST,
|
||||
),
|
||||
# Currently not working.
|
||||
# ("26d-gemini-multimodal-live-text.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
# ("26d-gemini-live-text.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
(
|
||||
"26h-gemini-live-vertex-function-calling.py",
|
||||
PROMPT_WEATHER,
|
||||
EVAL_WEATHER,
|
||||
BOT_SPEAKS_FIRST,
|
||||
),
|
||||
]
|
||||
|
||||
TESTS_27 = [
|
||||
|
||||
@@ -87,9 +87,11 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
Includes both converted standard tools and any custom Gemini-specific tools.
|
||||
"""
|
||||
functions_schema = tools_schema.standard_tools
|
||||
formatted_standard_tools = [
|
||||
{"function_declarations": [func.to_default_dict() for func in functions_schema]}
|
||||
]
|
||||
formatted_standard_tools = (
|
||||
[{"function_declarations": [func.to_default_dict() for func in functions_schema]}]
|
||||
if functions_schema
|
||||
else []
|
||||
)
|
||||
custom_gemini_tools = []
|
||||
if tools_schema.custom_tools:
|
||||
custom_gemini_tools = tools_schema.custom_tools.get(AdapterType.GEMINI, [])
|
||||
|
||||
193
src/pipecat/audio/filters/krisp_viva_filter.py
Normal file
193
src/pipecat/audio/filters/krisp_viva_filter.py
Normal file
@@ -0,0 +1,193 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Krisp noise reduction audio filter for Pipecat.
|
||||
|
||||
This module provides an audio filter implementation using Krisp VIVA SDK.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.filters.base_audio_filter import BaseAudioFilter
|
||||
from pipecat.frames.frames import FilterControlFrame, FilterEnableFrame
|
||||
|
||||
try:
|
||||
import krisp_audio
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error("In order to use the Krisp filter, you need to install krisp_audio.")
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
def _log_callback(log_message, log_level):
|
||||
logger.info(f"[{log_level}] {log_message}")
|
||||
|
||||
|
||||
class KrispVivaFilter(BaseAudioFilter):
|
||||
"""Audio filter using the Krisp VIVA SDK.
|
||||
|
||||
Provides real-time noise reduction for audio streams using Krisp's
|
||||
proprietary noise suppression algorithms. This filter requires a
|
||||
valid Krisp model file to operate.
|
||||
|
||||
Supported sample rates:
|
||||
- 8000 Hz
|
||||
- 16000 Hz
|
||||
- 24000 Hz
|
||||
- 32000 Hz
|
||||
- 44100 Hz
|
||||
- 48000 Hz
|
||||
"""
|
||||
|
||||
# Initialize Krisp Audio SDK globally
|
||||
krisp_audio.globalInit("", _log_callback, krisp_audio.LogLevel.Off)
|
||||
SDK_VERSION = krisp_audio.getVersion()
|
||||
logger.debug(
|
||||
f"Krisp Audio Python SDK Version: {SDK_VERSION.major}."
|
||||
f"{SDK_VERSION.minor}.{SDK_VERSION.patch}"
|
||||
)
|
||||
|
||||
SAMPLE_RATES = {
|
||||
8000: krisp_audio.SamplingRate.Sr8000Hz,
|
||||
16000: krisp_audio.SamplingRate.Sr16000Hz,
|
||||
24000: krisp_audio.SamplingRate.Sr24000Hz,
|
||||
32000: krisp_audio.SamplingRate.Sr32000Hz,
|
||||
44100: krisp_audio.SamplingRate.Sr44100Hz,
|
||||
48000: krisp_audio.SamplingRate.Sr48000Hz,
|
||||
}
|
||||
|
||||
FRAME_SIZE_MS = 10 # Krisp requires audio frames of 10ms duration for processing.
|
||||
|
||||
def __init__(self, model_path: str = None, noise_suppression_level: int = 100) -> None:
|
||||
"""Initialize the Krisp noise reduction filter.
|
||||
|
||||
Args:
|
||||
model_path: Path to the Krisp model file (.kef extension).
|
||||
If None, uses KRISP_VIVA_MODEL_PATH environment variable.
|
||||
noise_suppression_level: Noise suppression level.
|
||||
|
||||
Raises:
|
||||
ValueError: If model_path is not provided and KRISP_VIVA_MODEL_PATH is not set.
|
||||
Exception: If model file doesn't have .kef extension.
|
||||
FileNotFoundError: If model file doesn't exist.
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
# Set model path, checking environment if not specified
|
||||
self._model_path = model_path or os.getenv("KRISP_VIVA_MODEL_PATH")
|
||||
if not self._model_path:
|
||||
logger.error("Model path is not provided and KRISP_VIVA_MODEL_PATH is not set.")
|
||||
raise ValueError("Model path for KrispAudioProcessor must be provided.")
|
||||
|
||||
if not self._model_path.endswith(".kef"):
|
||||
raise Exception("Model is expected with .kef extension")
|
||||
|
||||
if not os.path.isfile(self._model_path):
|
||||
raise FileNotFoundError(f"Model file not found: {self._model_path}")
|
||||
|
||||
self._filtering = True
|
||||
self._session = None
|
||||
self._samples_per_frame = None
|
||||
self._noise_suppression_level = noise_suppression_level
|
||||
|
||||
# Audio buffer to accumulate samples for complete frames
|
||||
self._audio_buffer = bytearray()
|
||||
|
||||
def _int_to_sample_rate(self, sample_rate):
|
||||
"""Convert integer sample rate to krisp_audio SamplingRate enum.
|
||||
|
||||
Args:
|
||||
sample_rate: Sample rate as integer
|
||||
|
||||
Returns:
|
||||
krisp_audio.SamplingRate enum value
|
||||
|
||||
Raises:
|
||||
ValueError: If sample rate is not supported
|
||||
"""
|
||||
if sample_rate not in self.SAMPLE_RATES:
|
||||
raise ValueError("Unsupported sample rate")
|
||||
return self.SAMPLE_RATES[sample_rate]
|
||||
|
||||
async def start(self, sample_rate: int):
|
||||
"""Initialize the Krisp processor with the transport's sample rate.
|
||||
|
||||
Args:
|
||||
sample_rate: The sample rate of the input transport in Hz.
|
||||
"""
|
||||
model_info = krisp_audio.ModelInfo()
|
||||
model_info.path = self._model_path
|
||||
|
||||
nc_cfg = krisp_audio.NcSessionConfig()
|
||||
nc_cfg.inputSampleRate = self._int_to_sample_rate(sample_rate)
|
||||
nc_cfg.inputFrameDuration = krisp_audio.FrameDuration.Fd10ms
|
||||
nc_cfg.outputSampleRate = nc_cfg.inputSampleRate
|
||||
nc_cfg.modelInfo = model_info
|
||||
|
||||
self._samples_per_frame = int((sample_rate * self.FRAME_SIZE_MS) / 1000)
|
||||
self._session = krisp_audio.NcInt16.create(nc_cfg)
|
||||
|
||||
async def stop(self):
|
||||
"""Clean up the Krisp processor when stopping."""
|
||||
self._session = None
|
||||
|
||||
async def process_frame(self, frame: FilterControlFrame):
|
||||
"""Process control frames to enable/disable filtering.
|
||||
|
||||
Args:
|
||||
frame: The control frame containing filter commands.
|
||||
"""
|
||||
if isinstance(frame, FilterEnableFrame):
|
||||
self._filtering = frame.enable
|
||||
|
||||
async def filter(self, audio: bytes) -> bytes:
|
||||
"""Apply Krisp noise reduction to audio data.
|
||||
|
||||
Args:
|
||||
audio: Raw audio data as bytes to be filtered.
|
||||
|
||||
Returns:
|
||||
Noise-reduced audio data as bytes.
|
||||
"""
|
||||
if not self._filtering:
|
||||
return audio
|
||||
|
||||
# Add incoming audio to our buffer
|
||||
self._audio_buffer.extend(audio)
|
||||
|
||||
# Calculate how many complete frames we can process
|
||||
total_samples = len(self._audio_buffer) // 2 # 2 bytes per int16 sample
|
||||
num_complete_frames = total_samples // self._samples_per_frame
|
||||
|
||||
if num_complete_frames == 0:
|
||||
# Not enough samples for a complete frame yet, return empty
|
||||
return b""
|
||||
|
||||
# Calculate how many bytes we need for complete frames
|
||||
complete_samples_count = num_complete_frames * self._samples_per_frame
|
||||
bytes_to_process = complete_samples_count * 2 # 2 bytes per sample
|
||||
|
||||
# Extract the bytes we can process
|
||||
audio_to_process = bytes(self._audio_buffer[:bytes_to_process])
|
||||
|
||||
# Remove processed bytes from buffer, keep the remainder
|
||||
self._audio_buffer = self._audio_buffer[bytes_to_process:]
|
||||
|
||||
# Process the complete frames
|
||||
samples = np.frombuffer(audio_to_process, dtype=np.int16)
|
||||
frames = samples.reshape(-1, self._samples_per_frame)
|
||||
processed_samples = np.empty_like(samples)
|
||||
|
||||
for i, frame in enumerate(frames):
|
||||
cleaned_frame = self._session.process(frame, self._noise_suppression_level)
|
||||
processed_samples[i * self._samples_per_frame : (i + 1) * self._samples_per_frame] = (
|
||||
cleaned_frame
|
||||
)
|
||||
|
||||
return processed_samples.tobytes()
|
||||
@@ -130,12 +130,16 @@ class PipelineTask(BasePipelineTask):
|
||||
|
||||
- on_pipeline_finished: Called after the pipeline has reached any terminal state.
|
||||
This includes:
|
||||
|
||||
- StopFrame: pipeline was stopped (processors keep connections open)
|
||||
- EndFrame: pipeline ended normally
|
||||
- CancelFrame: pipeline was cancelled
|
||||
|
||||
Use this event for cleanup, logging, or post-processing tasks. Users can inspect
|
||||
the frame if they need to handle specific cases.
|
||||
|
||||
- on_pipeline_error: Called when an error occurs with ErrorFrame
|
||||
|
||||
Example::
|
||||
|
||||
@task.event_handler("on_frame_reached_upstream")
|
||||
@@ -146,9 +150,17 @@ class PipelineTask(BasePipelineTask):
|
||||
async def on_pipeline_idle_timeout(task):
|
||||
...
|
||||
|
||||
@task.event_handler("on_pipeline_started")
|
||||
async def on_pipeline_started(task, frame):
|
||||
...
|
||||
|
||||
@task.event_handler("on_pipeline_finished")
|
||||
async def on_pipeline_finished(task, frame):
|
||||
...
|
||||
|
||||
@task.event_handler("on_pipeline_error")
|
||||
async def on_pipeline_error(task, frame):
|
||||
...
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -286,6 +298,7 @@ class PipelineTask(BasePipelineTask):
|
||||
self._register_event_handler("on_pipeline_ended")
|
||||
self._register_event_handler("on_pipeline_cancelled")
|
||||
self._register_event_handler("on_pipeline_finished")
|
||||
self._register_event_handler("on_pipeline_error")
|
||||
|
||||
@property
|
||||
def params(self) -> PipelineParams:
|
||||
@@ -692,12 +705,11 @@ class PipelineTask(BasePipelineTask):
|
||||
logger.debug(f"{self}: received interruption task frame {frame}")
|
||||
await self._pipeline.queue_frame(InterruptionFrame())
|
||||
elif isinstance(frame, ErrorFrame):
|
||||
await self._call_event_handler("on_pipeline_error", frame)
|
||||
if frame.fatal:
|
||||
logger.error(f"A fatal error occurred: {frame}")
|
||||
# Cancel all tasks downstream.
|
||||
await self.queue_frame(CancelFrame())
|
||||
# Tell the task we should stop.
|
||||
await self.queue_frame(StopTaskFrame())
|
||||
else:
|
||||
logger.warning(f"{self}: Something went wrong: {frame}")
|
||||
|
||||
|
||||
@@ -877,6 +877,8 @@ class FrameProcessor(BaseObject):
|
||||
|
||||
"""
|
||||
while True:
|
||||
(frame, direction, callback) = await self.__input_queue.get()
|
||||
|
||||
if self.__should_block_system_frames and self.__input_event:
|
||||
logger.trace(f"{self}: system frame processing paused")
|
||||
await self.__input_event.wait()
|
||||
@@ -884,8 +886,6 @@ class FrameProcessor(BaseObject):
|
||||
self.__should_block_system_frames = False
|
||||
logger.trace(f"{self}: system frame processing resumed")
|
||||
|
||||
(frame, direction, callback) = await self.__input_queue.get()
|
||||
|
||||
if isinstance(frame, SystemFrame):
|
||||
await self.__process_frame(frame, direction, callback)
|
||||
elif self.__process_queue:
|
||||
@@ -900,6 +900,8 @@ class FrameProcessor(BaseObject):
|
||||
async def __process_frame_task_handler(self):
|
||||
"""Handle non-system frames from the process queue."""
|
||||
while True:
|
||||
(frame, direction, callback) = await self.__process_queue.get()
|
||||
|
||||
if self.__should_block_frames and self.__process_event:
|
||||
logger.trace(f"{self}: frame processing paused")
|
||||
await self.__process_event.wait()
|
||||
@@ -907,8 +909,6 @@ class FrameProcessor(BaseObject):
|
||||
self.__should_block_frames = False
|
||||
logger.trace(f"{self}: frame processing resumed")
|
||||
|
||||
(frame, direction, callback) = await self.__process_queue.get()
|
||||
|
||||
await self.__process_frame(frame, direction, callback)
|
||||
|
||||
self.__process_queue.task_done()
|
||||
|
||||
@@ -42,7 +42,7 @@ from pipecat.frames.frames import (
|
||||
Frame,
|
||||
FunctionCallResultFrame,
|
||||
InputAudioRawFrame,
|
||||
InputTransportMessageUrgentFrame,
|
||||
InputTransportMessageFrame,
|
||||
InterimTranscriptionFrame,
|
||||
LLMConfigureOutputFrame,
|
||||
LLMContextFrame,
|
||||
@@ -1421,7 +1421,7 @@ class RTVIProcessor(FrameProcessor):
|
||||
elif isinstance(frame, ErrorFrame):
|
||||
await self._send_error_frame(frame)
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, InputTransportMessageUrgentFrame):
|
||||
elif isinstance(frame, InputTransportMessageFrame):
|
||||
await self._handle_transport_message(frame)
|
||||
# All other system frames
|
||||
elif isinstance(frame, SystemFrame):
|
||||
@@ -1484,7 +1484,7 @@ class RTVIProcessor(FrameProcessor):
|
||||
await self._handle_message(message)
|
||||
self._message_queue.task_done()
|
||||
|
||||
async def _handle_transport_message(self, frame: InputTransportMessageUrgentFrame):
|
||||
async def _handle_transport_message(self, frame: InputTransportMessageFrame):
|
||||
"""Handle an incoming transport message frame."""
|
||||
try:
|
||||
transport_message = frame.message
|
||||
|
||||
@@ -67,12 +67,15 @@ To run locally:
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import mimetypes
|
||||
import os
|
||||
import sys
|
||||
from contextlib import asynccontextmanager
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import aiohttp
|
||||
from fastapi.responses import FileResponse
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.runner.types import (
|
||||
@@ -84,7 +87,7 @@ from pipecat.runner.types import (
|
||||
try:
|
||||
import uvicorn
|
||||
from dotenv import load_dotenv
|
||||
from fastapi import BackgroundTasks, FastAPI, HTTPException, Request, WebSocket
|
||||
from fastapi import BackgroundTasks, FastAPI, Header, HTTPException, Request, WebSocket
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import HTMLResponse, RedirectResponse
|
||||
except ImportError as e:
|
||||
@@ -98,6 +101,12 @@ except ImportError as e:
|
||||
load_dotenv(override=True)
|
||||
os.environ["ENV"] = "local"
|
||||
|
||||
TELEPHONY_TRANSPORTS = ["twilio", "telnyx", "plivo", "exotel"]
|
||||
|
||||
RUNNER_DOWNLOADS_FOLDER: Optional[str] = None
|
||||
RUNNER_HOST: str = "localhost"
|
||||
RUNNER_PORT: int = 7860
|
||||
|
||||
|
||||
def _get_bot_module():
|
||||
"""Get the bot module from the calling script."""
|
||||
@@ -152,7 +161,12 @@ async def _run_telephony_bot(websocket: WebSocket):
|
||||
|
||||
|
||||
def _create_server_app(
|
||||
transport_type: str, host: str = "localhost", proxy: str = None, esp32_mode: bool = False
|
||||
*,
|
||||
transport_type: str,
|
||||
host: str = "localhost",
|
||||
proxy: str,
|
||||
esp32_mode: bool = False,
|
||||
folder: Optional[str] = None,
|
||||
):
|
||||
"""Create FastAPI app with transport-specific routes."""
|
||||
app = FastAPI()
|
||||
@@ -167,19 +181,21 @@ def _create_server_app(
|
||||
|
||||
# Set up transport-specific routes
|
||||
if transport_type == "webrtc":
|
||||
_setup_webrtc_routes(app, esp32_mode=esp32_mode, host=host)
|
||||
_setup_webrtc_routes(app, esp32_mode=esp32_mode, host=host, folder=folder)
|
||||
_setup_whatsapp_routes(app)
|
||||
elif transport_type == "daily":
|
||||
_setup_daily_routes(app)
|
||||
elif transport_type in ["twilio", "telnyx", "plivo", "exotel"]:
|
||||
_setup_telephony_routes(app, transport_type, proxy)
|
||||
elif transport_type in TELEPHONY_TRANSPORTS:
|
||||
_setup_telephony_routes(app, transport_type=transport_type, proxy=proxy)
|
||||
else:
|
||||
logger.warning(f"Unknown transport type: {transport_type}")
|
||||
|
||||
return app
|
||||
|
||||
|
||||
def _setup_webrtc_routes(app: FastAPI, esp32_mode: bool = False, host: str = "localhost"):
|
||||
def _setup_webrtc_routes(
|
||||
app: FastAPI, *, esp32_mode: bool = False, host: str = "localhost", folder: Optional[str] = None
|
||||
):
|
||||
"""Set up WebRTC-specific routes."""
|
||||
try:
|
||||
from pipecat_ai_small_webrtc_prebuilt.frontend import SmallWebRTCPrebuiltUI
|
||||
@@ -201,6 +217,21 @@ def _setup_webrtc_routes(app: FastAPI, esp32_mode: bool = False, host: str = "lo
|
||||
"""Redirect root requests to client interface."""
|
||||
return RedirectResponse(url="/client/")
|
||||
|
||||
@app.get("/files/{filename:path}")
|
||||
async def download_file(filename: str):
|
||||
"""Handle file downloads."""
|
||||
if not folder:
|
||||
logger.warning(f"Attempting to dowload {filename}, but downloads folder not setup.")
|
||||
return
|
||||
|
||||
file_path = Path(folder) / filename
|
||||
if not os.path.exists(file_path):
|
||||
raise HTTPException(404)
|
||||
|
||||
media_type, _ = mimetypes.guess_type(file_path)
|
||||
|
||||
return FileResponse(path=file_path, media_type=media_type, filename=filename)
|
||||
|
||||
# Initialize the SmallWebRTC request handler
|
||||
small_webrtc_handler: SmallWebRTCRequestHandler = SmallWebRTCRequestHandler(
|
||||
esp32_mode=esp32_mode, host=host
|
||||
@@ -275,6 +306,7 @@ def _setup_whatsapp_routes(app: FastAPI):
|
||||
WHATSAPP_TOKEN = os.getenv("WHATSAPP_TOKEN")
|
||||
WHATSAPP_PHONE_NUMBER_ID = os.getenv("WHATSAPP_PHONE_NUMBER_ID")
|
||||
WHATSAPP_WEBHOOK_VERIFICATION_TOKEN = os.getenv("WHATSAPP_WEBHOOK_VERIFICATION_TOKEN")
|
||||
WHATSAPP_APP_SECRET = os.getenv("WHATSAPP_APP_SECRET")
|
||||
|
||||
if not all(
|
||||
[
|
||||
@@ -325,7 +357,12 @@ def _setup_whatsapp_routes(app: FastAPI):
|
||||
summary="Handle WhatsApp webhook events",
|
||||
description="Processes incoming WhatsApp messages and call events",
|
||||
)
|
||||
async def whatsapp_webhook(body: WhatsAppWebhookRequest, background_tasks: BackgroundTasks):
|
||||
async def whatsapp_webhook(
|
||||
body: WhatsAppWebhookRequest,
|
||||
background_tasks: BackgroundTasks,
|
||||
request: Request,
|
||||
x_hub_signature_256: str = Header(None),
|
||||
):
|
||||
"""Handle incoming WhatsApp webhook events.
|
||||
|
||||
For call events, establishes WebRTC connections and spawns bot instances
|
||||
@@ -357,7 +394,10 @@ def _setup_whatsapp_routes(app: FastAPI):
|
||||
|
||||
try:
|
||||
# Process the webhook request
|
||||
result = await whatsapp_client.handle_webhook_request(body, connection_callback)
|
||||
raw_body = await request.body()
|
||||
result = await whatsapp_client.handle_webhook_request(
|
||||
body, connection_callback, sha256_signature=x_hub_signature_256, raw_body=raw_body
|
||||
)
|
||||
logger.debug(f"Webhook processed successfully: {result}")
|
||||
return {"status": "success", "message": "Webhook processed successfully"}
|
||||
except ValueError as ve:
|
||||
@@ -376,6 +416,7 @@ def _setup_whatsapp_routes(app: FastAPI):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
whatsapp_client = WhatsAppClient(
|
||||
whatsapp_token=WHATSAPP_TOKEN,
|
||||
whatsapp_secret=WHATSAPP_APP_SECRET,
|
||||
phone_number_id=WHATSAPP_PHONE_NUMBER_ID,
|
||||
session=session,
|
||||
)
|
||||
@@ -479,7 +520,7 @@ def _setup_daily_routes(app: FastAPI):
|
||||
return await _handle_rtvi_request(request)
|
||||
|
||||
|
||||
def _setup_telephony_routes(app: FastAPI, transport_type: str, proxy: str):
|
||||
def _setup_telephony_routes(app: FastAPI, *, transport_type: str, proxy: str):
|
||||
"""Set up telephony-specific routes."""
|
||||
# XML response templates (Exotel doesn't use XML webhooks)
|
||||
XML_TEMPLATES = {
|
||||
@@ -582,6 +623,21 @@ def _validate_and_clean_proxy(proxy: str) -> str:
|
||||
return proxy
|
||||
|
||||
|
||||
def runner_downloads_folder() -> Optional[str]:
|
||||
"""Returns the folder where files are stored for later download."""
|
||||
return RUNNER_DOWNLOADS_FOLDER
|
||||
|
||||
|
||||
def runner_host() -> str:
|
||||
"""Returns the host name of this runner."""
|
||||
return RUNNER_HOST
|
||||
|
||||
|
||||
def runner_port() -> int:
|
||||
"""Returns the port of this runner."""
|
||||
return RUNNER_PORT
|
||||
|
||||
|
||||
def main():
|
||||
"""Start the Pipecat development runner.
|
||||
|
||||
@@ -602,14 +658,16 @@ def main():
|
||||
|
||||
The bot file must contain a `bot(runner_args)` function as the entry point.
|
||||
"""
|
||||
global RUNNER_DOWNLOADS_FOLDER, RUNNER_HOST, RUNNER_PORT
|
||||
|
||||
parser = argparse.ArgumentParser(description="Pipecat Development Runner")
|
||||
parser.add_argument("--host", type=str, default="localhost", help="Host address")
|
||||
parser.add_argument("--port", type=int, default=7860, help="Port number")
|
||||
parser.add_argument("--host", type=str, default=RUNNER_HOST, help="Host address")
|
||||
parser.add_argument("--port", type=int, default=RUNNER_PORT, help="Port number")
|
||||
parser.add_argument(
|
||||
"-t",
|
||||
"--transport",
|
||||
type=str,
|
||||
choices=["daily", "webrtc", "twilio", "telnyx", "plivo", "exotel"],
|
||||
choices=["daily", "webrtc", *TELEPHONY_TRANSPORTS],
|
||||
default="webrtc",
|
||||
help="Transport type",
|
||||
)
|
||||
@@ -627,6 +685,7 @@ def main():
|
||||
default=False,
|
||||
help="Connect directly to Daily room (automatically sets transport to daily)",
|
||||
)
|
||||
parser.add_argument("-f", "--folder", type=str, help="Path to downloads folder")
|
||||
parser.add_argument(
|
||||
"--verbose", "-v", action="count", default=0, help="Increase logging verbosity"
|
||||
)
|
||||
@@ -649,6 +708,10 @@ def main():
|
||||
logger.error("For ESP32, you need to specify `--host IP` so we can do SDP munging.")
|
||||
return
|
||||
|
||||
if args.transport in TELEPHONY_TRANSPORTS and not args.proxy:
|
||||
logger.error(f"For telephony transports, you need to specify `--proxy PROXY`.")
|
||||
return
|
||||
|
||||
# Log level
|
||||
logger.remove()
|
||||
logger.add(sys.stderr, level="TRACE" if args.verbose else "DEBUG")
|
||||
@@ -679,8 +742,18 @@ def main():
|
||||
print(f" → Open http://{args.host}:{args.port} in your browser to start a session")
|
||||
print()
|
||||
|
||||
RUNNER_DOWNLOADS_FOLDER = args.folder
|
||||
RUNNER_HOST = args.host
|
||||
RUNNER_PORT = args.port
|
||||
|
||||
# Create the app with transport-specific setup
|
||||
app = _create_server_app(args.transport, args.host, args.proxy, args.esp32)
|
||||
app = _create_server_app(
|
||||
transport_type=args.transport,
|
||||
host=args.host,
|
||||
proxy=args.proxy,
|
||||
esp32_mode=args.esp32,
|
||||
folder=args.folder,
|
||||
)
|
||||
|
||||
# Run the server
|
||||
uvicorn.run(app, host=args.host, port=args.port)
|
||||
|
||||
@@ -99,29 +99,41 @@ async def parse_telephony_websocket(websocket: WebSocket):
|
||||
tuple: (transport_type: str, call_data: dict)
|
||||
|
||||
call_data contains provider-specific fields:
|
||||
- Twilio: {
|
||||
"stream_id": str,
|
||||
"call_id": str,
|
||||
"body": dict
|
||||
}
|
||||
- Telnyx: {
|
||||
"stream_id": str,
|
||||
"call_control_id": str,
|
||||
"outbound_encoding": str,
|
||||
"from": str,
|
||||
"to": str,
|
||||
}
|
||||
- Plivo: {
|
||||
"stream_id": str,
|
||||
"call_id": str,
|
||||
}
|
||||
- Exotel: {
|
||||
"stream_id": str,
|
||||
"call_id": str,
|
||||
"account_sid": str,
|
||||
"from": str,
|
||||
"to": str,
|
||||
}
|
||||
|
||||
- Twilio::
|
||||
|
||||
{
|
||||
"stream_id": str,
|
||||
"call_id": str,
|
||||
"body": dict
|
||||
}
|
||||
|
||||
- Telnyx::
|
||||
|
||||
{
|
||||
"stream_id": str,
|
||||
"call_control_id": str,
|
||||
"outbound_encoding": str,
|
||||
"from": str,
|
||||
"to": str,
|
||||
}
|
||||
|
||||
- Plivo::
|
||||
|
||||
{
|
||||
"stream_id": str,
|
||||
"call_id": str,
|
||||
}
|
||||
|
||||
- Exotel::
|
||||
|
||||
{
|
||||
"stream_id": str,
|
||||
"call_id": str,
|
||||
"account_sid": str,
|
||||
"from": str,
|
||||
"to": str,
|
||||
}
|
||||
|
||||
Example usage::
|
||||
|
||||
|
||||
@@ -25,11 +25,31 @@ except ModuleNotFoundError as e:
|
||||
class LivekitFrameSerializer(FrameSerializer):
|
||||
"""Serializer for converting between Pipecat frames and LiveKit audio frames.
|
||||
|
||||
.. deprecated:: 0.0.90
|
||||
|
||||
This class is deprecated and will be removed in a future version.
|
||||
Please use LiveKitTransport instead, which handles audio streaming
|
||||
and frame conversion natively.
|
||||
|
||||
This serializer handles the conversion of Pipecat's OutputAudioRawFrame objects
|
||||
to LiveKit AudioFrame objects for transmission, and the reverse conversion
|
||||
for received audio data.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the LiveKit frame serializer."""
|
||||
super().__init__()
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"LivekitFrameSerializer is deprecated and will be removed in a future version. "
|
||||
"Please use LiveKitTransport instead, which handles audio streaming natively.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
@property
|
||||
def type(self) -> FrameSerializerType:
|
||||
"""Get the serializer type.
|
||||
|
||||
@@ -97,9 +97,7 @@ class AIService(FrameProcessor):
|
||||
pass
|
||||
|
||||
async def _update_settings(self, settings: Mapping[str, Any]):
|
||||
from pipecat.services.openai_realtime_beta.events import (
|
||||
SessionProperties,
|
||||
)
|
||||
from pipecat.services.openai.realtime.events import SessionProperties
|
||||
|
||||
for key, value in settings.items():
|
||||
logger.debug("Update request for:", key, value)
|
||||
@@ -111,9 +109,7 @@ class AIService(FrameProcessor):
|
||||
logger.debug("Attempting to update", key, value)
|
||||
|
||||
try:
|
||||
from pipecat.services.openai_realtime_beta.events import (
|
||||
TurnDetection,
|
||||
)
|
||||
from pipecat.services.openai.realtime.events import TurnDetection
|
||||
|
||||
if isinstance(self._session_properties, SessionProperties):
|
||||
current_properties = self._session_properties
|
||||
|
||||
@@ -9,6 +9,7 @@ import sys
|
||||
from pipecat.services import DeprecatedModuleProxy
|
||||
|
||||
from .llm import *
|
||||
from .nova_sonic import *
|
||||
from .stt import *
|
||||
from .tts import *
|
||||
|
||||
|
||||
0
src/pipecat/services/aws/nova_sonic/__init__.py
Normal file
0
src/pipecat/services/aws/nova_sonic/__init__.py
Normal file
367
src/pipecat/services/aws/nova_sonic/context.py
Normal file
367
src/pipecat/services/aws/nova_sonic/context.py
Normal file
@@ -0,0 +1,367 @@
|
||||
#
|
||||
# Copyright (c) 2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Context management for AWS Nova Sonic LLM service.
|
||||
|
||||
This module provides specialized context aggregators and message handling for AWS Nova Sonic,
|
||||
including conversation history management and role-specific message processing.
|
||||
"""
|
||||
|
||||
import copy
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
BotStoppedSpeakingFrame,
|
||||
DataFrame,
|
||||
Frame,
|
||||
FunctionCallResultFrame,
|
||||
InterruptionFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMMessagesAppendFrame,
|
||||
LLMMessagesUpdateFrame,
|
||||
LLMSetToolChoiceFrame,
|
||||
LLMSetToolsFrame,
|
||||
TextFrame,
|
||||
UserImageRawFrame,
|
||||
)
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.aws.nova_sonic.frames import AWSNovaSonicFunctionCallResultFrame
|
||||
from pipecat.services.openai.llm import (
|
||||
OpenAIAssistantContextAggregator,
|
||||
OpenAIUserContextAggregator,
|
||||
)
|
||||
|
||||
|
||||
class Role(Enum):
|
||||
"""Roles supported in AWS Nova Sonic conversations.
|
||||
|
||||
Parameters:
|
||||
SYSTEM: System-level messages (not used in conversation history).
|
||||
USER: Messages sent by the user.
|
||||
ASSISTANT: Messages sent by the assistant.
|
||||
TOOL: Messages sent by tools (not used in conversation history).
|
||||
"""
|
||||
|
||||
SYSTEM = "SYSTEM"
|
||||
USER = "USER"
|
||||
ASSISTANT = "ASSISTANT"
|
||||
TOOL = "TOOL"
|
||||
|
||||
|
||||
@dataclass
|
||||
class AWSNovaSonicConversationHistoryMessage:
|
||||
"""A single message in AWS Nova Sonic conversation history.
|
||||
|
||||
Parameters:
|
||||
role: The role of the message sender (USER or ASSISTANT only).
|
||||
text: The text content of the message.
|
||||
"""
|
||||
|
||||
role: Role # only USER and ASSISTANT
|
||||
text: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class AWSNovaSonicConversationHistory:
|
||||
"""Complete conversation history for AWS Nova Sonic initialization.
|
||||
|
||||
Parameters:
|
||||
system_instruction: System-level instruction for the conversation.
|
||||
messages: List of conversation messages between user and assistant.
|
||||
"""
|
||||
|
||||
system_instruction: str = None
|
||||
messages: list[AWSNovaSonicConversationHistoryMessage] = field(default_factory=list)
|
||||
|
||||
|
||||
class AWSNovaSonicLLMContext(OpenAILLMContext):
|
||||
"""Specialized LLM context for AWS Nova Sonic service.
|
||||
|
||||
Extends OpenAI context with Nova Sonic-specific message handling,
|
||||
conversation history management, and text buffering capabilities.
|
||||
"""
|
||||
|
||||
def __init__(self, messages=None, tools=None, **kwargs):
|
||||
"""Initialize AWS Nova Sonic LLM context.
|
||||
|
||||
Args:
|
||||
messages: Initial messages for the context.
|
||||
tools: Available tools for the context.
|
||||
**kwargs: Additional arguments passed to parent class.
|
||||
"""
|
||||
super().__init__(messages=messages, tools=tools, **kwargs)
|
||||
self.__setup_local()
|
||||
|
||||
def __setup_local(self, system_instruction: str = ""):
|
||||
self._assistant_text = ""
|
||||
self._user_text = ""
|
||||
self._system_instruction = system_instruction
|
||||
|
||||
@staticmethod
|
||||
def upgrade_to_nova_sonic(
|
||||
obj: OpenAILLMContext, system_instruction: str
|
||||
) -> "AWSNovaSonicLLMContext":
|
||||
"""Upgrade an OpenAI context to AWS Nova Sonic context.
|
||||
|
||||
Args:
|
||||
obj: The OpenAI context to upgrade.
|
||||
system_instruction: System instruction for the context.
|
||||
|
||||
Returns:
|
||||
The upgraded AWS Nova Sonic context.
|
||||
"""
|
||||
if isinstance(obj, OpenAILLMContext) and not isinstance(obj, AWSNovaSonicLLMContext):
|
||||
obj.__class__ = AWSNovaSonicLLMContext
|
||||
obj.__setup_local(system_instruction)
|
||||
return obj
|
||||
|
||||
# NOTE: this method has the side-effect of updating _system_instruction from messages
|
||||
def get_messages_for_initializing_history(self) -> AWSNovaSonicConversationHistory:
|
||||
"""Get conversation history for initializing AWS Nova Sonic session.
|
||||
|
||||
Processes stored messages and extracts system instruction and conversation
|
||||
history in the format expected by AWS Nova Sonic.
|
||||
|
||||
Returns:
|
||||
Formatted conversation history with system instruction and messages.
|
||||
"""
|
||||
history = AWSNovaSonicConversationHistory(system_instruction=self._system_instruction)
|
||||
|
||||
# Bail if there are no messages
|
||||
if not self.messages:
|
||||
return history
|
||||
|
||||
messages = copy.deepcopy(self.messages)
|
||||
|
||||
# If we have a "system" message as our first message, let's pull that out into "instruction"
|
||||
if messages[0].get("role") == "system":
|
||||
system = messages.pop(0)
|
||||
content = system.get("content")
|
||||
if isinstance(content, str):
|
||||
history.system_instruction = content
|
||||
elif isinstance(content, list):
|
||||
history.system_instruction = content[0].get("text")
|
||||
if history.system_instruction:
|
||||
self._system_instruction = history.system_instruction
|
||||
|
||||
# Process remaining messages to fill out conversation history.
|
||||
# Nova Sonic supports "user" and "assistant" messages in history.
|
||||
for message in messages:
|
||||
history_message = self.from_standard_message(message)
|
||||
if history_message:
|
||||
history.messages.append(history_message)
|
||||
|
||||
return history
|
||||
|
||||
def get_messages_for_persistent_storage(self):
|
||||
"""Get messages formatted for persistent storage.
|
||||
|
||||
Returns:
|
||||
List of messages including system instruction if present.
|
||||
"""
|
||||
messages = super().get_messages_for_persistent_storage()
|
||||
# If we have a system instruction and messages doesn't already contain it, add it
|
||||
if self._system_instruction and not (messages and messages[0].get("role") == "system"):
|
||||
messages.insert(0, {"role": "system", "content": self._system_instruction})
|
||||
return messages
|
||||
|
||||
def from_standard_message(self, message) -> AWSNovaSonicConversationHistoryMessage:
|
||||
"""Convert standard message format to Nova Sonic format.
|
||||
|
||||
Args:
|
||||
message: Standard message dictionary to convert.
|
||||
|
||||
Returns:
|
||||
Nova Sonic conversation history message, or None if not convertible.
|
||||
"""
|
||||
role = message.get("role")
|
||||
if message.get("role") == "user" or message.get("role") == "assistant":
|
||||
content = message.get("content")
|
||||
if isinstance(message.get("content"), list):
|
||||
content = ""
|
||||
for c in message.get("content"):
|
||||
if c.get("type") == "text":
|
||||
content += " " + c.get("text")
|
||||
else:
|
||||
logger.error(
|
||||
f"Unhandled content type in context message: {c.get('type')} - {message}"
|
||||
)
|
||||
# There won't be content if this is an assistant tool call entry.
|
||||
# We're ignoring those since they can't be loaded into AWS Nova Sonic conversation
|
||||
# history
|
||||
if content:
|
||||
return AWSNovaSonicConversationHistoryMessage(role=Role[role.upper()], text=content)
|
||||
# NOTE: we're ignoring messages with role "tool" since they can't be loaded into AWS Nova
|
||||
# Sonic conversation history
|
||||
|
||||
def buffer_user_text(self, text):
|
||||
"""Buffer user text for later flushing to context.
|
||||
|
||||
Args:
|
||||
text: User text to buffer.
|
||||
"""
|
||||
self._user_text += f" {text}" if self._user_text else text
|
||||
# logger.debug(f"User text buffered: {self._user_text}")
|
||||
|
||||
def flush_aggregated_user_text(self) -> str:
|
||||
"""Flush buffered user text to context as a complete message.
|
||||
|
||||
Returns:
|
||||
The flushed user text, or empty string if no text was buffered.
|
||||
"""
|
||||
if not self._user_text:
|
||||
return ""
|
||||
user_text = self._user_text
|
||||
message = {
|
||||
"role": "user",
|
||||
"content": [{"type": "text", "text": user_text}],
|
||||
}
|
||||
self._user_text = ""
|
||||
self.add_message(message)
|
||||
# logger.debug(f"Context updated (user): {self.get_messages_for_logging()}")
|
||||
return user_text
|
||||
|
||||
def buffer_assistant_text(self, text):
|
||||
"""Buffer assistant text for later flushing to context.
|
||||
|
||||
Args:
|
||||
text: Assistant text to buffer.
|
||||
"""
|
||||
self._assistant_text += text
|
||||
# logger.debug(f"Assistant text buffered: {self._assistant_text}")
|
||||
|
||||
def flush_aggregated_assistant_text(self):
|
||||
"""Flush buffered assistant text to context as a complete message."""
|
||||
if not self._assistant_text:
|
||||
return
|
||||
message = {
|
||||
"role": "assistant",
|
||||
"content": [{"type": "text", "text": self._assistant_text}],
|
||||
}
|
||||
self._assistant_text = ""
|
||||
self.add_message(message)
|
||||
# logger.debug(f"Context updated (assistant): {self.get_messages_for_logging()}")
|
||||
|
||||
|
||||
@dataclass
|
||||
class AWSNovaSonicMessagesUpdateFrame(DataFrame):
|
||||
"""Frame containing updated AWS Nova Sonic context.
|
||||
|
||||
Parameters:
|
||||
context: The updated AWS Nova Sonic LLM context.
|
||||
"""
|
||||
|
||||
context: AWSNovaSonicLLMContext
|
||||
|
||||
|
||||
class AWSNovaSonicUserContextAggregator(OpenAIUserContextAggregator):
|
||||
"""Context aggregator for user messages in AWS Nova Sonic conversations.
|
||||
|
||||
Extends the OpenAI user context aggregator to emit Nova Sonic-specific
|
||||
context update frames.
|
||||
"""
|
||||
|
||||
async def process_frame(
|
||||
self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM
|
||||
):
|
||||
"""Process frames and emit Nova Sonic-specific context updates.
|
||||
|
||||
Args:
|
||||
frame: The frame to process.
|
||||
direction: The direction the frame is traveling.
|
||||
"""
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
# Parent does not push LLMMessagesUpdateFrame
|
||||
if isinstance(frame, LLMMessagesUpdateFrame):
|
||||
await self.push_frame(AWSNovaSonicMessagesUpdateFrame(context=self._context))
|
||||
|
||||
|
||||
class AWSNovaSonicAssistantContextAggregator(OpenAIAssistantContextAggregator):
|
||||
"""Context aggregator for assistant messages in AWS Nova Sonic conversations.
|
||||
|
||||
Provides specialized handling for assistant responses and function calls
|
||||
in AWS Nova Sonic context, with custom frame processing logic.
|
||||
"""
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process frames with Nova Sonic-specific logic.
|
||||
|
||||
Args:
|
||||
frame: The frame to process.
|
||||
direction: The direction the frame is traveling.
|
||||
"""
|
||||
# HACK: For now, disable the context aggregator by making it just pass through all frames
|
||||
# that the parent handles (except the function call stuff, which we still need).
|
||||
# For an explanation of this hack, see
|
||||
# AWSNovaSonicLLMService._report_assistant_response_text_added.
|
||||
if isinstance(
|
||||
frame,
|
||||
(
|
||||
InterruptionFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
TextFrame,
|
||||
LLMMessagesAppendFrame,
|
||||
LLMMessagesUpdateFrame,
|
||||
LLMSetToolsFrame,
|
||||
LLMSetToolChoiceFrame,
|
||||
UserImageRawFrame,
|
||||
BotStoppedSpeakingFrame,
|
||||
),
|
||||
):
|
||||
await self.push_frame(frame, direction)
|
||||
else:
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
async def handle_function_call_result(self, frame: FunctionCallResultFrame):
|
||||
"""Handle function call results for AWS Nova Sonic.
|
||||
|
||||
Args:
|
||||
frame: The function call result frame to handle.
|
||||
"""
|
||||
await super().handle_function_call_result(frame)
|
||||
|
||||
# The standard function callback code path pushes the FunctionCallResultFrame from the LLM
|
||||
# itself, so we didn't have a chance to add the result to the AWS Nova Sonic server-side
|
||||
# context. Let's push a special frame to do that.
|
||||
await self.push_frame(
|
||||
AWSNovaSonicFunctionCallResultFrame(result_frame=frame), FrameDirection.UPSTREAM
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class AWSNovaSonicContextAggregatorPair:
|
||||
"""Pair of user and assistant context aggregators for AWS Nova Sonic.
|
||||
|
||||
Parameters:
|
||||
_user: The user context aggregator.
|
||||
_assistant: The assistant context aggregator.
|
||||
"""
|
||||
|
||||
_user: AWSNovaSonicUserContextAggregator
|
||||
_assistant: AWSNovaSonicAssistantContextAggregator
|
||||
|
||||
def user(self) -> AWSNovaSonicUserContextAggregator:
|
||||
"""Get the user context aggregator.
|
||||
|
||||
Returns:
|
||||
The user context aggregator instance.
|
||||
"""
|
||||
return self._user
|
||||
|
||||
def assistant(self) -> AWSNovaSonicAssistantContextAggregator:
|
||||
"""Get the assistant context aggregator.
|
||||
|
||||
Returns:
|
||||
The assistant context aggregator instance.
|
||||
"""
|
||||
return self._assistant
|
||||
25
src/pipecat/services/aws/nova_sonic/frames.py
Normal file
25
src/pipecat/services/aws/nova_sonic/frames.py
Normal file
@@ -0,0 +1,25 @@
|
||||
#
|
||||
# Copyright (c) 2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Custom frames for AWS Nova Sonic LLM service."""
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
from pipecat.frames.frames import DataFrame, FunctionCallResultFrame
|
||||
|
||||
|
||||
@dataclass
|
||||
class AWSNovaSonicFunctionCallResultFrame(DataFrame):
|
||||
"""Frame containing function call result for AWS Nova Sonic processing.
|
||||
|
||||
This frame wraps a standard function call result frame to enable
|
||||
AWS Nova Sonic-specific handling and context updates.
|
||||
|
||||
Parameters:
|
||||
result_frame: The underlying function call result frame.
|
||||
"""
|
||||
|
||||
result_frame: FunctionCallResultFrame
|
||||
1155
src/pipecat/services/aws/nova_sonic/llm.py
Normal file
1155
src/pipecat/services/aws/nova_sonic/llm.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1 +1,19 @@
|
||||
from .aws import AWSNovaSonicLLMService, Params
|
||||
#
|
||||
# Copyright (c) 2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import warnings
|
||||
|
||||
from pipecat.services.aws.nova_sonic.llm import AWSNovaSonicLLMService, Params
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"Types in pipecat.services.aws_nova_sonic are deprecated. "
|
||||
"Please use the equivalent types from "
|
||||
"pipecat.services.aws.nova_sonic.llm instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -10,358 +10,16 @@ This module provides specialized context aggregators and message handling for AW
|
||||
including conversation history management and role-specific message processing.
|
||||
"""
|
||||
|
||||
import copy
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
BotStoppedSpeakingFrame,
|
||||
DataFrame,
|
||||
Frame,
|
||||
FunctionCallResultFrame,
|
||||
InterruptionFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMMessagesAppendFrame,
|
||||
LLMMessagesUpdateFrame,
|
||||
LLMSetToolChoiceFrame,
|
||||
LLMSetToolsFrame,
|
||||
TextFrame,
|
||||
UserImageRawFrame,
|
||||
)
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.aws_nova_sonic.frames import AWSNovaSonicFunctionCallResultFrame
|
||||
from pipecat.services.openai.llm import (
|
||||
OpenAIAssistantContextAggregator,
|
||||
OpenAIUserContextAggregator,
|
||||
)
|
||||
|
||||
|
||||
class Role(Enum):
|
||||
"""Roles supported in AWS Nova Sonic conversations.
|
||||
|
||||
Parameters:
|
||||
SYSTEM: System-level messages (not used in conversation history).
|
||||
USER: Messages sent by the user.
|
||||
ASSISTANT: Messages sent by the assistant.
|
||||
TOOL: Messages sent by tools (not used in conversation history).
|
||||
"""
|
||||
|
||||
SYSTEM = "SYSTEM"
|
||||
USER = "USER"
|
||||
ASSISTANT = "ASSISTANT"
|
||||
TOOL = "TOOL"
|
||||
|
||||
|
||||
@dataclass
|
||||
class AWSNovaSonicConversationHistoryMessage:
|
||||
"""A single message in AWS Nova Sonic conversation history.
|
||||
|
||||
Parameters:
|
||||
role: The role of the message sender (USER or ASSISTANT only).
|
||||
text: The text content of the message.
|
||||
"""
|
||||
|
||||
role: Role # only USER and ASSISTANT
|
||||
text: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class AWSNovaSonicConversationHistory:
|
||||
"""Complete conversation history for AWS Nova Sonic initialization.
|
||||
|
||||
Parameters:
|
||||
system_instruction: System-level instruction for the conversation.
|
||||
messages: List of conversation messages between user and assistant.
|
||||
"""
|
||||
|
||||
system_instruction: str = None
|
||||
messages: list[AWSNovaSonicConversationHistoryMessage] = field(default_factory=list)
|
||||
|
||||
|
||||
class AWSNovaSonicLLMContext(OpenAILLMContext):
|
||||
"""Specialized LLM context for AWS Nova Sonic service.
|
||||
|
||||
Extends OpenAI context with Nova Sonic-specific message handling,
|
||||
conversation history management, and text buffering capabilities.
|
||||
"""
|
||||
|
||||
def __init__(self, messages=None, tools=None, **kwargs):
|
||||
"""Initialize AWS Nova Sonic LLM context.
|
||||
|
||||
Args:
|
||||
messages: Initial messages for the context.
|
||||
tools: Available tools for the context.
|
||||
**kwargs: Additional arguments passed to parent class.
|
||||
"""
|
||||
super().__init__(messages=messages, tools=tools, **kwargs)
|
||||
self.__setup_local()
|
||||
|
||||
def __setup_local(self, system_instruction: str = ""):
|
||||
self._assistant_text = ""
|
||||
self._user_text = ""
|
||||
self._system_instruction = system_instruction
|
||||
|
||||
@staticmethod
|
||||
def upgrade_to_nova_sonic(
|
||||
obj: OpenAILLMContext, system_instruction: str
|
||||
) -> "AWSNovaSonicLLMContext":
|
||||
"""Upgrade an OpenAI context to AWS Nova Sonic context.
|
||||
|
||||
Args:
|
||||
obj: The OpenAI context to upgrade.
|
||||
system_instruction: System instruction for the context.
|
||||
|
||||
Returns:
|
||||
The upgraded AWS Nova Sonic context.
|
||||
"""
|
||||
if isinstance(obj, OpenAILLMContext) and not isinstance(obj, AWSNovaSonicLLMContext):
|
||||
obj.__class__ = AWSNovaSonicLLMContext
|
||||
obj.__setup_local(system_instruction)
|
||||
return obj
|
||||
|
||||
# NOTE: this method has the side-effect of updating _system_instruction from messages
|
||||
def get_messages_for_initializing_history(self) -> AWSNovaSonicConversationHistory:
|
||||
"""Get conversation history for initializing AWS Nova Sonic session.
|
||||
|
||||
Processes stored messages and extracts system instruction and conversation
|
||||
history in the format expected by AWS Nova Sonic.
|
||||
|
||||
Returns:
|
||||
Formatted conversation history with system instruction and messages.
|
||||
"""
|
||||
history = AWSNovaSonicConversationHistory(system_instruction=self._system_instruction)
|
||||
|
||||
# Bail if there are no messages
|
||||
if not self.messages:
|
||||
return history
|
||||
|
||||
messages = copy.deepcopy(self.messages)
|
||||
|
||||
# If we have a "system" message as our first message, let's pull that out into "instruction"
|
||||
if messages[0].get("role") == "system":
|
||||
system = messages.pop(0)
|
||||
content = system.get("content")
|
||||
if isinstance(content, str):
|
||||
history.system_instruction = content
|
||||
elif isinstance(content, list):
|
||||
history.system_instruction = content[0].get("text")
|
||||
if history.system_instruction:
|
||||
self._system_instruction = history.system_instruction
|
||||
|
||||
# Process remaining messages to fill out conversation history.
|
||||
# Nova Sonic supports "user" and "assistant" messages in history.
|
||||
for message in messages:
|
||||
history_message = self.from_standard_message(message)
|
||||
if history_message:
|
||||
history.messages.append(history_message)
|
||||
|
||||
return history
|
||||
|
||||
def get_messages_for_persistent_storage(self):
|
||||
"""Get messages formatted for persistent storage.
|
||||
|
||||
Returns:
|
||||
List of messages including system instruction if present.
|
||||
"""
|
||||
messages = super().get_messages_for_persistent_storage()
|
||||
# If we have a system instruction and messages doesn't already contain it, add it
|
||||
if self._system_instruction and not (messages and messages[0].get("role") == "system"):
|
||||
messages.insert(0, {"role": "system", "content": self._system_instruction})
|
||||
return messages
|
||||
|
||||
def from_standard_message(self, message) -> AWSNovaSonicConversationHistoryMessage:
|
||||
"""Convert standard message format to Nova Sonic format.
|
||||
|
||||
Args:
|
||||
message: Standard message dictionary to convert.
|
||||
|
||||
Returns:
|
||||
Nova Sonic conversation history message, or None if not convertible.
|
||||
"""
|
||||
role = message.get("role")
|
||||
if message.get("role") == "user" or message.get("role") == "assistant":
|
||||
content = message.get("content")
|
||||
if isinstance(message.get("content"), list):
|
||||
content = ""
|
||||
for c in message.get("content"):
|
||||
if c.get("type") == "text":
|
||||
content += " " + c.get("text")
|
||||
else:
|
||||
logger.error(
|
||||
f"Unhandled content type in context message: {c.get('type')} - {message}"
|
||||
)
|
||||
# There won't be content if this is an assistant tool call entry.
|
||||
# We're ignoring those since they can't be loaded into AWS Nova Sonic conversation
|
||||
# history
|
||||
if content:
|
||||
return AWSNovaSonicConversationHistoryMessage(role=Role[role.upper()], text=content)
|
||||
# NOTE: we're ignoring messages with role "tool" since they can't be loaded into AWS Nova
|
||||
# Sonic conversation history
|
||||
|
||||
def buffer_user_text(self, text):
|
||||
"""Buffer user text for later flushing to context.
|
||||
|
||||
Args:
|
||||
text: User text to buffer.
|
||||
"""
|
||||
self._user_text += f" {text}" if self._user_text else text
|
||||
# logger.debug(f"User text buffered: {self._user_text}")
|
||||
|
||||
def flush_aggregated_user_text(self) -> str:
|
||||
"""Flush buffered user text to context as a complete message.
|
||||
|
||||
Returns:
|
||||
The flushed user text, or empty string if no text was buffered.
|
||||
"""
|
||||
if not self._user_text:
|
||||
return ""
|
||||
user_text = self._user_text
|
||||
message = {
|
||||
"role": "user",
|
||||
"content": [{"type": "text", "text": user_text}],
|
||||
}
|
||||
self._user_text = ""
|
||||
self.add_message(message)
|
||||
# logger.debug(f"Context updated (user): {self.get_messages_for_logging()}")
|
||||
return user_text
|
||||
|
||||
def buffer_assistant_text(self, text):
|
||||
"""Buffer assistant text for later flushing to context.
|
||||
|
||||
Args:
|
||||
text: Assistant text to buffer.
|
||||
"""
|
||||
self._assistant_text += text
|
||||
# logger.debug(f"Assistant text buffered: {self._assistant_text}")
|
||||
|
||||
def flush_aggregated_assistant_text(self):
|
||||
"""Flush buffered assistant text to context as a complete message."""
|
||||
if not self._assistant_text:
|
||||
return
|
||||
message = {
|
||||
"role": "assistant",
|
||||
"content": [{"type": "text", "text": self._assistant_text}],
|
||||
}
|
||||
self._assistant_text = ""
|
||||
self.add_message(message)
|
||||
# logger.debug(f"Context updated (assistant): {self.get_messages_for_logging()}")
|
||||
|
||||
|
||||
@dataclass
|
||||
class AWSNovaSonicMessagesUpdateFrame(DataFrame):
|
||||
"""Frame containing updated AWS Nova Sonic context.
|
||||
|
||||
Parameters:
|
||||
context: The updated AWS Nova Sonic LLM context.
|
||||
"""
|
||||
|
||||
context: AWSNovaSonicLLMContext
|
||||
|
||||
|
||||
class AWSNovaSonicUserContextAggregator(OpenAIUserContextAggregator):
|
||||
"""Context aggregator for user messages in AWS Nova Sonic conversations.
|
||||
|
||||
Extends the OpenAI user context aggregator to emit Nova Sonic-specific
|
||||
context update frames.
|
||||
"""
|
||||
|
||||
async def process_frame(
|
||||
self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM
|
||||
):
|
||||
"""Process frames and emit Nova Sonic-specific context updates.
|
||||
|
||||
Args:
|
||||
frame: The frame to process.
|
||||
direction: The direction the frame is traveling.
|
||||
"""
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
# Parent does not push LLMMessagesUpdateFrame
|
||||
if isinstance(frame, LLMMessagesUpdateFrame):
|
||||
await self.push_frame(AWSNovaSonicMessagesUpdateFrame(context=self._context))
|
||||
|
||||
|
||||
class AWSNovaSonicAssistantContextAggregator(OpenAIAssistantContextAggregator):
|
||||
"""Context aggregator for assistant messages in AWS Nova Sonic conversations.
|
||||
|
||||
Provides specialized handling for assistant responses and function calls
|
||||
in AWS Nova Sonic context, with custom frame processing logic.
|
||||
"""
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process frames with Nova Sonic-specific logic.
|
||||
|
||||
Args:
|
||||
frame: The frame to process.
|
||||
direction: The direction the frame is traveling.
|
||||
"""
|
||||
# HACK: For now, disable the context aggregator by making it just pass through all frames
|
||||
# that the parent handles (except the function call stuff, which we still need).
|
||||
# For an explanation of this hack, see
|
||||
# AWSNovaSonicLLMService._report_assistant_response_text_added.
|
||||
if isinstance(
|
||||
frame,
|
||||
(
|
||||
InterruptionFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
TextFrame,
|
||||
LLMMessagesAppendFrame,
|
||||
LLMMessagesUpdateFrame,
|
||||
LLMSetToolsFrame,
|
||||
LLMSetToolChoiceFrame,
|
||||
UserImageRawFrame,
|
||||
BotStoppedSpeakingFrame,
|
||||
),
|
||||
):
|
||||
await self.push_frame(frame, direction)
|
||||
else:
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
async def handle_function_call_result(self, frame: FunctionCallResultFrame):
|
||||
"""Handle function call results for AWS Nova Sonic.
|
||||
|
||||
Args:
|
||||
frame: The function call result frame to handle.
|
||||
"""
|
||||
await super().handle_function_call_result(frame)
|
||||
|
||||
# The standard function callback code path pushes the FunctionCallResultFrame from the LLM
|
||||
# itself, so we didn't have a chance to add the result to the AWS Nova Sonic server-side
|
||||
# context. Let's push a special frame to do that.
|
||||
await self.push_frame(
|
||||
AWSNovaSonicFunctionCallResultFrame(result_frame=frame), FrameDirection.UPSTREAM
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class AWSNovaSonicContextAggregatorPair:
|
||||
"""Pair of user and assistant context aggregators for AWS Nova Sonic.
|
||||
|
||||
Parameters:
|
||||
_user: The user context aggregator.
|
||||
_assistant: The assistant context aggregator.
|
||||
"""
|
||||
|
||||
_user: AWSNovaSonicUserContextAggregator
|
||||
_assistant: AWSNovaSonicAssistantContextAggregator
|
||||
|
||||
def user(self) -> AWSNovaSonicUserContextAggregator:
|
||||
"""Get the user context aggregator.
|
||||
|
||||
Returns:
|
||||
The user context aggregator instance.
|
||||
"""
|
||||
return self._user
|
||||
|
||||
def assistant(self) -> AWSNovaSonicAssistantContextAggregator:
|
||||
"""Get the assistant context aggregator.
|
||||
|
||||
Returns:
|
||||
The assistant context aggregator instance.
|
||||
"""
|
||||
return self._assistant
|
||||
import warnings
|
||||
|
||||
from pipecat.services.aws.nova_sonic.context import *
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"Types in pipecat.services.aws_nova_sonic.context are deprecated. "
|
||||
"Please use the equivalent types from "
|
||||
"pipecat.services.aws.nova_sonic.context instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
@@ -6,20 +6,16 @@
|
||||
|
||||
"""Custom frames for AWS Nova Sonic LLM service."""
|
||||
|
||||
from dataclasses import dataclass
|
||||
import warnings
|
||||
|
||||
from pipecat.frames.frames import DataFrame, FunctionCallResultFrame
|
||||
from pipecat.services.aws.nova_sonic.frames import *
|
||||
|
||||
|
||||
@dataclass
|
||||
class AWSNovaSonicFunctionCallResultFrame(DataFrame):
|
||||
"""Frame containing function call result for AWS Nova Sonic processing.
|
||||
|
||||
This frame wraps a standard function call result frame to enable
|
||||
AWS Nova Sonic-specific handling and context updates.
|
||||
|
||||
Parameters:
|
||||
result_frame: The underlying function call result frame.
|
||||
"""
|
||||
|
||||
result_frame: FunctionCallResultFrame
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"Types in pipecat.services.aws_nova_sonic.frames are deprecated. "
|
||||
"Please use the equivalent types from "
|
||||
"pipecat.services.aws.nova_sonic.frames instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
0
src/pipecat/services/azure/realtime/__init__.py
Normal file
0
src/pipecat/services/azure/realtime/__init__.py
Normal file
65
src/pipecat/services/azure/realtime/llm.py
Normal file
65
src/pipecat/services/azure/realtime/llm.py
Normal file
@@ -0,0 +1,65 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Azure OpenAI Realtime LLM service implementation."""
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService
|
||||
|
||||
try:
|
||||
from websockets.asyncio.client import connect as websocket_connect
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error("In order to use Azure Realtime, you need to `pip install pipecat-ai[openai]`.")
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
class AzureRealtimeLLMService(OpenAIRealtimeLLMService):
|
||||
"""Azure OpenAI Realtime LLM service with Azure-specific authentication.
|
||||
|
||||
Extends the OpenAI Realtime service to work with Azure OpenAI endpoints,
|
||||
using Azure's authentication headers and endpoint format. Provides the same
|
||||
real-time audio and text communication capabilities as the base OpenAI service.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
api_key: str,
|
||||
base_url: str,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize Azure Realtime LLM service.
|
||||
|
||||
Args:
|
||||
api_key: The API key for the Azure OpenAI service.
|
||||
base_url: The full Azure WebSocket endpoint URL including api-version and deployment.
|
||||
Example: "wss://my-project.openai.azure.com/openai/realtime?api-version=2024-10-01-preview&deployment=my-realtime-deployment"
|
||||
**kwargs: Additional arguments passed to parent OpenAIRealtimeLLMService.
|
||||
"""
|
||||
super().__init__(base_url=base_url, api_key=api_key, **kwargs)
|
||||
self.api_key = api_key
|
||||
self.base_url = base_url
|
||||
|
||||
async def _connect(self):
|
||||
try:
|
||||
if self._websocket:
|
||||
# Here we assume that if we have a websocket, we are connected. We
|
||||
# handle disconnections in the send/recv code paths.
|
||||
return
|
||||
|
||||
logger.info(f"Connecting to {self.base_url}, api key: {self.api_key}")
|
||||
self._websocket = await websocket_connect(
|
||||
uri=self.base_url,
|
||||
additional_headers={
|
||||
"api-key": self.api_key,
|
||||
},
|
||||
)
|
||||
self._receive_task = self.create_task(self._receive_task_handler())
|
||||
except Exception as e:
|
||||
logger.error(f"{self} initialization error: {e}")
|
||||
self._websocket = None
|
||||
@@ -8,6 +8,7 @@ import sys
|
||||
|
||||
from pipecat.services import DeprecatedModuleProxy
|
||||
|
||||
from .flux import *
|
||||
from .stt import *
|
||||
from .tts import *
|
||||
|
||||
|
||||
0
src/pipecat/services/deepgram/flux/__init__.py
Normal file
0
src/pipecat/services/deepgram/flux/__init__.py
Normal file
@@ -79,7 +79,7 @@ class DeepgramFluxSTTService(WebsocketSTTService):
|
||||
This class defines all available connection parameters for the Deepgram Flux API
|
||||
based on the official documentation.
|
||||
|
||||
Attributes:
|
||||
Parameters:
|
||||
eager_eot_threshold: Optional. EagerEndOfTurn/TurnResumed are off by default.
|
||||
You can turn them on by setting eager_eot_threshold to a valid value.
|
||||
Lower values = more aggressive EagerEndOfTurning (faster response, more LLM calls).
|
||||
@@ -126,24 +126,24 @@ class DeepgramFluxSTTService(WebsocketSTTService):
|
||||
If None, default parameters will be used.
|
||||
**kwargs: Additional arguments passed to the parent WebsocketSTTService class.
|
||||
|
||||
Example:
|
||||
```python
|
||||
# Basic usage with default parameters
|
||||
stt = DeepgramFluxSTTService(api_key="your-api-key")
|
||||
Examples:
|
||||
Basic usage with default parameters::
|
||||
|
||||
# Advanced usage with custom parameters
|
||||
params = DeepgramFluxSTTService.InputParams(
|
||||
eager_eot_threshold=0.5,
|
||||
eot_threshold=0.8,
|
||||
keyterm=["AI", "machine learning", "neural network"],
|
||||
tag=["production", "voice-agent"]
|
||||
)
|
||||
stt = DeepgramFluxSTTService(
|
||||
api_key="your-api-key",
|
||||
model="flux-general-en",
|
||||
params=params
|
||||
)
|
||||
```
|
||||
stt = DeepgramFluxSTTService(api_key="your-api-key")
|
||||
|
||||
Advanced usage with custom parameters::
|
||||
|
||||
params = DeepgramFluxSTTService.InputParams(
|
||||
eager_eot_threshold=0.5,
|
||||
eot_threshold=0.8,
|
||||
keyterm=["AI", "machine learning", "neural network"],
|
||||
tag=["production", "voice-agent"]
|
||||
)
|
||||
stt = DeepgramFluxSTTService(
|
||||
api_key="your-api-key",
|
||||
model="flux-general-en",
|
||||
params=params
|
||||
)
|
||||
"""
|
||||
super().__init__(sample_rate=sample_rate, **kwargs)
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@ import sys
|
||||
|
||||
from pipecat.services import DeprecatedModuleProxy
|
||||
|
||||
from .stt import *
|
||||
from .tts import *
|
||||
|
||||
sys.modules[__name__] = DeprecatedModuleProxy(globals(), "elevenlabs", "elevenlabs.tts")
|
||||
sys.modules[__name__] = DeprecatedModuleProxy(globals(), "elevenlabs", "elevenlabs.[stt,tts]")
|
||||
|
||||
@@ -4,527 +4,41 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Event models and utilities for Google Gemini Multimodal Live API."""
|
||||
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
from enum import Enum
|
||||
from typing import List, Literal, Optional
|
||||
|
||||
from PIL import Image
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from pipecat.frames.frames import ImageRawFrame
|
||||
|
||||
#
|
||||
# Client events
|
||||
#
|
||||
|
||||
|
||||
class MediaChunk(BaseModel):
|
||||
"""Represents a chunk of media data for transmission.
|
||||
|
||||
Parameters:
|
||||
mimeType: MIME type of the media content.
|
||||
data: Base64-encoded media data.
|
||||
"""
|
||||
|
||||
mimeType: str
|
||||
data: str
|
||||
|
||||
|
||||
class ContentPart(BaseModel):
|
||||
"""Represents a part of content that can contain text or media.
|
||||
|
||||
Parameters:
|
||||
text: Text content. Defaults to None.
|
||||
inlineData: Inline media data. Defaults to None.
|
||||
"""
|
||||
|
||||
text: Optional[str] = Field(default=None, validate_default=False)
|
||||
inlineData: Optional[MediaChunk] = Field(default=None, validate_default=False)
|
||||
fileData: Optional["FileData"] = Field(default=None, validate_default=False)
|
||||
|
||||
|
||||
class FileData(BaseModel):
|
||||
"""Represents a file reference in the Gemini File API."""
|
||||
|
||||
mimeType: str
|
||||
fileUri: str
|
||||
|
||||
|
||||
ContentPart.model_rebuild() # Rebuild model to resolve forward reference
|
||||
|
||||
|
||||
class Turn(BaseModel):
|
||||
"""Represents a conversational turn in the dialogue.
|
||||
|
||||
Parameters:
|
||||
role: The role of the speaker, either "user" or "model". Defaults to "user".
|
||||
parts: List of content parts that make up the turn.
|
||||
"""
|
||||
|
||||
role: Literal["user", "model"] = "user"
|
||||
parts: List[ContentPart]
|
||||
|
||||
|
||||
class StartSensitivity(str, Enum):
|
||||
"""Determines how start of speech is detected."""
|
||||
|
||||
UNSPECIFIED = "START_SENSITIVITY_UNSPECIFIED" # Default is HIGH
|
||||
HIGH = "START_SENSITIVITY_HIGH" # Detect start of speech more often
|
||||
LOW = "START_SENSITIVITY_LOW" # Detect start of speech less often
|
||||
|
||||
|
||||
class EndSensitivity(str, Enum):
|
||||
"""Determines how end of speech is detected."""
|
||||
|
||||
UNSPECIFIED = "END_SENSITIVITY_UNSPECIFIED" # Default is HIGH
|
||||
HIGH = "END_SENSITIVITY_HIGH" # End speech more often
|
||||
LOW = "END_SENSITIVITY_LOW" # End speech less often
|
||||
|
||||
|
||||
class AutomaticActivityDetection(BaseModel):
|
||||
"""Configures automatic detection of voice activity.
|
||||
|
||||
Parameters:
|
||||
disabled: Whether automatic activity detection is disabled. Defaults to None.
|
||||
start_of_speech_sensitivity: Sensitivity for detecting speech start. Defaults to None.
|
||||
prefix_padding_ms: Padding before speech start in milliseconds. Defaults to None.
|
||||
end_of_speech_sensitivity: Sensitivity for detecting speech end. Defaults to None.
|
||||
silence_duration_ms: Duration of silence to detect speech end. Defaults to None.
|
||||
"""
|
||||
|
||||
disabled: Optional[bool] = None
|
||||
start_of_speech_sensitivity: Optional[StartSensitivity] = None
|
||||
prefix_padding_ms: Optional[int] = None
|
||||
end_of_speech_sensitivity: Optional[EndSensitivity] = None
|
||||
silence_duration_ms: Optional[int] = None
|
||||
|
||||
|
||||
class RealtimeInputConfig(BaseModel):
|
||||
"""Configures the realtime input behavior.
|
||||
|
||||
Parameters:
|
||||
automatic_activity_detection: Voice activity detection configuration. Defaults to None.
|
||||
"""
|
||||
|
||||
automatic_activity_detection: Optional[AutomaticActivityDetection] = None
|
||||
|
||||
|
||||
class RealtimeInput(BaseModel):
|
||||
"""Contains realtime input media chunks and text.
|
||||
|
||||
Parameters:
|
||||
mediaChunks: List of media chunks for realtime processing.
|
||||
text: Text for realtime processing.
|
||||
"""
|
||||
|
||||
mediaChunks: Optional[List[MediaChunk]] = None
|
||||
text: Optional[str] = None
|
||||
|
||||
|
||||
class ClientContent(BaseModel):
|
||||
"""Content sent from client to the Gemini Live API.
|
||||
|
||||
Parameters:
|
||||
turns: List of conversation turns. Defaults to None.
|
||||
turnComplete: Whether the client's turn is complete. Defaults to False.
|
||||
"""
|
||||
|
||||
turns: Optional[List[Turn]] = None
|
||||
turnComplete: bool = False
|
||||
|
||||
|
||||
class AudioInputMessage(BaseModel):
|
||||
"""Message containing audio input data.
|
||||
|
||||
Parameters:
|
||||
realtimeInput: Realtime input containing audio chunks.
|
||||
"""
|
||||
|
||||
realtimeInput: RealtimeInput
|
||||
|
||||
@classmethod
|
||||
def from_raw_audio(cls, raw_audio: bytes, sample_rate: int) -> "AudioInputMessage":
|
||||
"""Create an audio input message from raw audio data.
|
||||
|
||||
Args:
|
||||
raw_audio: Raw audio bytes.
|
||||
sample_rate: Audio sample rate in Hz.
|
||||
|
||||
Returns:
|
||||
AudioInputMessage instance with encoded audio data.
|
||||
"""
|
||||
data = base64.b64encode(raw_audio).decode("utf-8")
|
||||
return cls(
|
||||
realtimeInput=RealtimeInput(
|
||||
mediaChunks=[MediaChunk(mimeType=f"audio/pcm;rate={sample_rate}", data=data)]
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
class VideoInputMessage(BaseModel):
|
||||
"""Message containing video/image input data.
|
||||
|
||||
Parameters:
|
||||
realtimeInput: Realtime input containing video/image chunks.
|
||||
"""
|
||||
|
||||
realtimeInput: RealtimeInput
|
||||
|
||||
@classmethod
|
||||
def from_image_frame(cls, frame: ImageRawFrame) -> "VideoInputMessage":
|
||||
"""Create a video input message from an image frame.
|
||||
|
||||
Args:
|
||||
frame: Image frame to encode.
|
||||
|
||||
Returns:
|
||||
VideoInputMessage instance with encoded image data.
|
||||
"""
|
||||
buffer = io.BytesIO()
|
||||
Image.frombytes(frame.format, frame.size, frame.image).save(buffer, format="JPEG")
|
||||
data = base64.b64encode(buffer.getvalue()).decode("utf-8")
|
||||
return cls(
|
||||
realtimeInput=RealtimeInput(mediaChunks=[MediaChunk(mimeType=f"image/jpeg", data=data)])
|
||||
)
|
||||
|
||||
|
||||
class TextInputMessage(BaseModel):
|
||||
"""Message containing text input data."""
|
||||
|
||||
realtimeInput: RealtimeInput
|
||||
|
||||
@classmethod
|
||||
def from_text(cls, text: str) -> "TextInputMessage":
|
||||
"""Create a text input message from a string.
|
||||
|
||||
Args:
|
||||
text: The text to send.
|
||||
|
||||
Returns:
|
||||
A TextInputMessage instance.
|
||||
"""
|
||||
return cls(realtimeInput=RealtimeInput(text=text))
|
||||
|
||||
|
||||
class ClientContentMessage(BaseModel):
|
||||
"""Message containing client content for the API.
|
||||
|
||||
Parameters:
|
||||
clientContent: The client content to send.
|
||||
"""
|
||||
|
||||
clientContent: ClientContent
|
||||
|
||||
|
||||
class SystemInstruction(BaseModel):
|
||||
"""System instruction for the model.
|
||||
|
||||
Parameters:
|
||||
parts: List of content parts that make up the system instruction.
|
||||
"""
|
||||
|
||||
parts: List[ContentPart]
|
||||
|
||||
|
||||
class AudioTranscriptionConfig(BaseModel):
|
||||
"""Configuration for audio transcription."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class Setup(BaseModel):
|
||||
"""Setup configuration for the Gemini Live session.
|
||||
|
||||
Parameters:
|
||||
model: Model identifier to use.
|
||||
system_instruction: System instruction for the model. Defaults to None.
|
||||
tools: List of available tools/functions. Defaults to None.
|
||||
generation_config: Generation configuration parameters. Defaults to None.
|
||||
input_audio_transcription: Input audio transcription config. Defaults to None.
|
||||
output_audio_transcription: Output audio transcription config. Defaults to None.
|
||||
realtime_input_config: Realtime input configuration. Defaults to None.
|
||||
"""
|
||||
|
||||
model: str
|
||||
system_instruction: Optional[SystemInstruction] = None
|
||||
tools: Optional[List[dict]] = None
|
||||
generation_config: Optional[dict] = None
|
||||
input_audio_transcription: Optional[AudioTranscriptionConfig] = None
|
||||
output_audio_transcription: Optional[AudioTranscriptionConfig] = None
|
||||
realtime_input_config: Optional[RealtimeInputConfig] = None
|
||||
|
||||
|
||||
class Config(BaseModel):
|
||||
"""Configuration message for session setup.
|
||||
|
||||
Parameters:
|
||||
setup: Setup configuration for the session.
|
||||
"""
|
||||
|
||||
setup: Setup
|
||||
|
||||
|
||||
#
|
||||
# Grounding metadata models
|
||||
#
|
||||
|
||||
|
||||
class SearchEntryPoint(BaseModel):
|
||||
"""Represents the search entry point with rendered content for search suggestions."""
|
||||
|
||||
renderedContent: Optional[str] = None
|
||||
|
||||
|
||||
class WebSource(BaseModel):
|
||||
"""Represents a web source from grounding chunks."""
|
||||
|
||||
uri: Optional[str] = None
|
||||
title: Optional[str] = None
|
||||
|
||||
|
||||
class GroundingChunk(BaseModel):
|
||||
"""Represents a grounding chunk containing web source information."""
|
||||
|
||||
web: Optional[WebSource] = None
|
||||
|
||||
|
||||
class GroundingSegment(BaseModel):
|
||||
"""Represents a segment of text that is grounded."""
|
||||
|
||||
startIndex: Optional[int] = None
|
||||
endIndex: Optional[int] = None
|
||||
text: Optional[str] = None
|
||||
|
||||
|
||||
class GroundingSupport(BaseModel):
|
||||
"""Represents support information for grounded text segments."""
|
||||
|
||||
segment: Optional[GroundingSegment] = None
|
||||
groundingChunkIndices: Optional[List[int]] = None
|
||||
confidenceScores: Optional[List[float]] = None
|
||||
|
||||
|
||||
class GroundingMetadata(BaseModel):
|
||||
"""Represents grounding metadata from Google Search."""
|
||||
|
||||
searchEntryPoint: Optional[SearchEntryPoint] = None
|
||||
groundingChunks: Optional[List[GroundingChunk]] = None
|
||||
groundingSupports: Optional[List[GroundingSupport]] = None
|
||||
webSearchQueries: Optional[List[str]] = None
|
||||
|
||||
|
||||
#
|
||||
# Server events
|
||||
#
|
||||
|
||||
|
||||
class SetupComplete(BaseModel):
|
||||
"""Indicates that session setup is complete."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class InlineData(BaseModel):
|
||||
"""Inline data embedded in server responses.
|
||||
|
||||
Parameters:
|
||||
mimeType: MIME type of the data.
|
||||
data: Base64-encoded data content.
|
||||
"""
|
||||
|
||||
mimeType: str
|
||||
data: str
|
||||
|
||||
|
||||
class Part(BaseModel):
|
||||
"""Part of a server response containing data or text.
|
||||
|
||||
Parameters:
|
||||
inlineData: Inline binary data. Defaults to None.
|
||||
text: Text content. Defaults to None.
|
||||
"""
|
||||
|
||||
inlineData: Optional[InlineData] = None
|
||||
text: Optional[str] = None
|
||||
|
||||
|
||||
class ModelTurn(BaseModel):
|
||||
"""Represents a turn from the model in the conversation.
|
||||
|
||||
Parameters:
|
||||
parts: List of content parts in the model's response.
|
||||
"""
|
||||
|
||||
parts: List[Part]
|
||||
|
||||
|
||||
class ServerContentInterrupted(BaseModel):
|
||||
"""Indicates server content was interrupted.
|
||||
|
||||
Parameters:
|
||||
interrupted: Whether the content was interrupted.
|
||||
"""
|
||||
|
||||
interrupted: bool
|
||||
|
||||
|
||||
class ServerContentTurnComplete(BaseModel):
|
||||
"""Indicates the server's turn is complete.
|
||||
|
||||
Parameters:
|
||||
turnComplete: Whether the turn is complete.
|
||||
"""
|
||||
|
||||
turnComplete: bool
|
||||
|
||||
|
||||
class BidiGenerateContentTranscription(BaseModel):
|
||||
"""Transcription data from bidirectional content generation.
|
||||
|
||||
Parameters:
|
||||
text: The transcribed text content.
|
||||
"""
|
||||
|
||||
text: str
|
||||
|
||||
|
||||
class ServerContent(BaseModel):
|
||||
"""Content sent from server to client.
|
||||
|
||||
Parameters:
|
||||
modelTurn: Model's conversational turn. Defaults to None.
|
||||
interrupted: Whether content was interrupted. Defaults to None.
|
||||
turnComplete: Whether the turn is complete. Defaults to None.
|
||||
inputTranscription: Transcription of input audio. Defaults to None.
|
||||
outputTranscription: Transcription of output audio. Defaults to None.
|
||||
"""
|
||||
|
||||
modelTurn: Optional[ModelTurn] = None
|
||||
interrupted: Optional[bool] = None
|
||||
turnComplete: Optional[bool] = None
|
||||
inputTranscription: Optional[BidiGenerateContentTranscription] = None
|
||||
outputTranscription: Optional[BidiGenerateContentTranscription] = None
|
||||
groundingMetadata: Optional[GroundingMetadata] = None
|
||||
|
||||
|
||||
class FunctionCall(BaseModel):
|
||||
"""Represents a function call from the model.
|
||||
|
||||
Parameters:
|
||||
id: Unique identifier for the function call.
|
||||
name: Name of the function to call.
|
||||
args: Arguments to pass to the function.
|
||||
"""
|
||||
|
||||
id: str
|
||||
name: str
|
||||
args: dict
|
||||
|
||||
|
||||
class ToolCall(BaseModel):
|
||||
"""Contains one or more function calls.
|
||||
|
||||
Parameters:
|
||||
functionCalls: List of function calls to execute.
|
||||
"""
|
||||
|
||||
functionCalls: List[FunctionCall]
|
||||
|
||||
|
||||
class Modality(str, Enum):
|
||||
"""Modality types in token counts."""
|
||||
|
||||
UNSPECIFIED = "MODALITY_UNSPECIFIED"
|
||||
TEXT = "TEXT"
|
||||
IMAGE = "IMAGE"
|
||||
AUDIO = "AUDIO"
|
||||
VIDEO = "VIDEO"
|
||||
|
||||
|
||||
class ModalityTokenCount(BaseModel):
|
||||
"""Token count for a specific modality.
|
||||
|
||||
Parameters:
|
||||
modality: The modality type.
|
||||
tokenCount: Number of tokens for this modality.
|
||||
"""
|
||||
|
||||
modality: Modality
|
||||
tokenCount: int
|
||||
|
||||
|
||||
class UsageMetadata(BaseModel):
|
||||
"""Usage metadata about the API response.
|
||||
|
||||
Parameters:
|
||||
promptTokenCount: Number of tokens in the prompt. Defaults to None.
|
||||
cachedContentTokenCount: Number of cached content tokens. Defaults to None.
|
||||
responseTokenCount: Number of tokens in the response. Defaults to None.
|
||||
toolUsePromptTokenCount: Number of tokens for tool use prompts. Defaults to None.
|
||||
thoughtsTokenCount: Number of tokens for model thoughts. Defaults to None.
|
||||
totalTokenCount: Total number of tokens used. Defaults to None.
|
||||
promptTokensDetails: Detailed breakdown of prompt tokens by modality. Defaults to None.
|
||||
cacheTokensDetails: Detailed breakdown of cache tokens by modality. Defaults to None.
|
||||
responseTokensDetails: Detailed breakdown of response tokens by modality. Defaults to None.
|
||||
toolUsePromptTokensDetails: Detailed breakdown of tool use tokens by modality. Defaults to None.
|
||||
"""
|
||||
|
||||
promptTokenCount: Optional[int] = None
|
||||
cachedContentTokenCount: Optional[int] = None
|
||||
responseTokenCount: Optional[int] = None
|
||||
toolUsePromptTokenCount: Optional[int] = None
|
||||
thoughtsTokenCount: Optional[int] = None
|
||||
totalTokenCount: Optional[int] = None
|
||||
promptTokensDetails: Optional[List[ModalityTokenCount]] = None
|
||||
cacheTokensDetails: Optional[List[ModalityTokenCount]] = None
|
||||
responseTokensDetails: Optional[List[ModalityTokenCount]] = None
|
||||
toolUsePromptTokensDetails: Optional[List[ModalityTokenCount]] = None
|
||||
|
||||
|
||||
class ServerEvent(BaseModel):
|
||||
"""Server event received from the Gemini Live API.
|
||||
|
||||
Parameters:
|
||||
setupComplete: Setup completion notification. Defaults to None.
|
||||
serverContent: Content from the server. Defaults to None.
|
||||
toolCall: Tool/function call request. Defaults to None.
|
||||
usageMetadata: Token usage metadata. Defaults to None.
|
||||
"""
|
||||
|
||||
setupComplete: Optional[SetupComplete] = None
|
||||
serverContent: Optional[ServerContent] = None
|
||||
toolCall: Optional[ToolCall] = None
|
||||
usageMetadata: Optional[UsageMetadata] = None
|
||||
|
||||
|
||||
def parse_server_event(str):
|
||||
"""Parse a server event from JSON string.
|
||||
|
||||
Args:
|
||||
str: JSON string containing the server event.
|
||||
|
||||
Returns:
|
||||
ServerEvent instance if parsing succeeds, None otherwise.
|
||||
"""
|
||||
try:
|
||||
evt = json.loads(str)
|
||||
return ServerEvent.model_validate(evt)
|
||||
except Exception as e:
|
||||
print(f"Error parsing server event: {e}")
|
||||
return None
|
||||
|
||||
|
||||
class ContextWindowCompressionConfig(BaseModel):
|
||||
"""Configuration for context window compression.
|
||||
|
||||
Parameters:
|
||||
sliding_window: Whether to use sliding window compression. Defaults to True.
|
||||
trigger_tokens: Token count threshold to trigger compression. Defaults to None.
|
||||
"""
|
||||
|
||||
sliding_window: Optional[bool] = Field(default=True)
|
||||
trigger_tokens: Optional[int] = Field(default=None)
|
||||
"""Event models and utilities for Google Gemini Multimodal Live API.
|
||||
|
||||
.. deprecated:: 0.0.90
|
||||
Importing StartSensitivity and EndSensitivity from this module is deprecated.
|
||||
Import them directly from google.genai.types instead.
|
||||
"""
|
||||
|
||||
import warnings
|
||||
|
||||
from loguru import logger
|
||||
|
||||
try:
|
||||
from google.genai.types import (
|
||||
EndSensitivity as _EndSensitivity,
|
||||
)
|
||||
from google.genai.types import (
|
||||
StartSensitivity as _StartSensitivity,
|
||||
)
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error("In order to use Google AI, you need to `pip install pipecat-ai[google]`.")
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
# These aliases are just here for backward compatibility, since we used to
|
||||
# define public-facing StartSensitivity and EndSensitivity enums in this
|
||||
# module.
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"Importing StartSensitivity and EndSensitivity from "
|
||||
"pipecat.services.gemini_multimodal_live.events is deprecated. "
|
||||
"Please import them directly from google.genai.types instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
StartSensitivity = _StartSensitivity
|
||||
EndSensitivity = _EndSensitivity
|
||||
|
||||
@@ -9,181 +9,31 @@
|
||||
This module provides a client for Google's Gemini File API, enabling file
|
||||
uploads, metadata retrieval, listing, and deletion. Files uploaded through
|
||||
this API can be referenced in Gemini generative model calls.
|
||||
|
||||
.. deprecated:: 0.0.90
|
||||
Importing GeminiFileAPI from this module is deprecated.
|
||||
Import it from pipecat.services.google.gemini_live.file_api instead.
|
||||
"""
|
||||
|
||||
import mimetypes
|
||||
from typing import Any, Dict, Optional
|
||||
import warnings
|
||||
|
||||
import aiohttp
|
||||
from loguru import logger
|
||||
|
||||
try:
|
||||
from pipecat.services.google.gemini_live.file_api import GeminiFileAPI as _GeminiFileAPI
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error("In order to use Google AI, you need to `pip install pipecat-ai[google]`.")
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
class GeminiFileAPI:
|
||||
"""Client for the Gemini File API.
|
||||
|
||||
This class provides methods for uploading, fetching, listing, and deleting files
|
||||
through Google's Gemini File API.
|
||||
|
||||
Files uploaded through this API remain available for 48 hours and can be referenced
|
||||
in calls to the Gemini generative models. Maximum file size is 2GB, with total
|
||||
project storage limited to 20GB.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, api_key: str, base_url: str = "https://generativelanguage.googleapis.com/v1beta/files"
|
||||
):
|
||||
"""Initialize the Gemini File API client.
|
||||
|
||||
Args:
|
||||
api_key: Google AI API key
|
||||
base_url: Base URL for the Gemini File API (default is the v1beta endpoint)
|
||||
"""
|
||||
self._api_key = api_key
|
||||
self._base_url = base_url
|
||||
# Upload URL uses the /upload/ path
|
||||
self.upload_base_url = "https://generativelanguage.googleapis.com/upload/v1beta/files"
|
||||
|
||||
async def upload_file(
|
||||
self, file_path: str, display_name: Optional[str] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""Upload a file to the Gemini File API using the correct resumable upload protocol.
|
||||
|
||||
Args:
|
||||
file_path: Path to the file to upload
|
||||
display_name: Optional display name for the file
|
||||
|
||||
Returns:
|
||||
File metadata including uri, name, and display_name
|
||||
"""
|
||||
logger.info(f"Uploading file: {file_path}")
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
# Determine the file's MIME type
|
||||
mime_type, _ = mimetypes.guess_type(file_path)
|
||||
if not mime_type:
|
||||
mime_type = "application/octet-stream"
|
||||
|
||||
# Read the file
|
||||
with open(file_path, "rb") as f:
|
||||
file_data = f.read()
|
||||
|
||||
# Create the metadata payload
|
||||
metadata = {}
|
||||
if display_name:
|
||||
metadata = {"file": {"display_name": display_name}}
|
||||
|
||||
# Step 1: Initial resumable request to get upload URL
|
||||
headers = {
|
||||
"X-Goog-Upload-Protocol": "resumable",
|
||||
"X-Goog-Upload-Command": "start",
|
||||
"X-Goog-Upload-Header-Content-Length": str(len(file_data)),
|
||||
"X-Goog-Upload-Header-Content-Type": mime_type,
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
|
||||
logger.debug(f"Step 1: Getting upload URL from {self.upload_base_url}")
|
||||
async with session.post(
|
||||
f"{self.upload_base_url}?key={self._api_key}", headers=headers, json=metadata
|
||||
) as response:
|
||||
if response.status != 200:
|
||||
error_text = await response.text()
|
||||
logger.error(f"Error initiating file upload: {error_text}")
|
||||
raise Exception(f"Failed to initiate upload: {response.status} - {error_text}")
|
||||
|
||||
# Get the upload URL from the response header
|
||||
upload_url = response.headers.get("X-Goog-Upload-URL")
|
||||
if not upload_url:
|
||||
logger.error(f"Response headers: {dict(response.headers)}")
|
||||
raise Exception("No upload URL in response headers")
|
||||
|
||||
logger.debug(f"Got upload URL: {upload_url}")
|
||||
|
||||
# Step 2: Upload the actual file data
|
||||
upload_headers = {
|
||||
"Content-Length": str(len(file_data)),
|
||||
"X-Goog-Upload-Offset": "0",
|
||||
"X-Goog-Upload-Command": "upload, finalize",
|
||||
}
|
||||
|
||||
logger.debug(f"Step 2: Uploading file data to {upload_url}")
|
||||
async with session.post(upload_url, headers=upload_headers, data=file_data) as response:
|
||||
if response.status != 200:
|
||||
error_text = await response.text()
|
||||
logger.error(f"Error uploading file data: {error_text}")
|
||||
raise Exception(f"Failed to upload file: {response.status} - {error_text}")
|
||||
|
||||
file_info = await response.json()
|
||||
logger.info(f"File uploaded successfully: {file_info.get('file', {}).get('name')}")
|
||||
return file_info
|
||||
|
||||
async def get_file(self, name: str) -> Dict[str, Any]:
|
||||
"""Get metadata for a file.
|
||||
|
||||
Args:
|
||||
name: File name (or full path)
|
||||
|
||||
Returns:
|
||||
File metadata
|
||||
"""
|
||||
# Extract just the name part if a full path is provided
|
||||
if "/" in name:
|
||||
name = name.split("/")[-1]
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(f"{self._base_url}/{name}?key={self._api_key}") as response:
|
||||
if response.status != 200:
|
||||
error_text = await response.text()
|
||||
logger.error(f"Error getting file metadata: {error_text}")
|
||||
raise Exception(f"Failed to get file metadata: {response.status}")
|
||||
|
||||
file_info = await response.json()
|
||||
return file_info
|
||||
|
||||
async def list_files(
|
||||
self, page_size: int = 10, page_token: Optional[str] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""List uploaded files.
|
||||
|
||||
Args:
|
||||
page_size: Number of files to return per page
|
||||
page_token: Token for pagination
|
||||
|
||||
Returns:
|
||||
List of files and next page token if available
|
||||
"""
|
||||
params = {"key": self._api_key, "pageSize": page_size}
|
||||
|
||||
if page_token:
|
||||
params["pageToken"] = page_token
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(self._base_url, params=params) as response:
|
||||
if response.status != 200:
|
||||
error_text = await response.text()
|
||||
logger.error(f"Error listing files: {error_text}")
|
||||
raise Exception(f"Failed to list files: {response.status}")
|
||||
|
||||
result = await response.json()
|
||||
return result
|
||||
|
||||
async def delete_file(self, name: str) -> bool:
|
||||
"""Delete a file.
|
||||
|
||||
Args:
|
||||
name: File name (or full path)
|
||||
|
||||
Returns:
|
||||
True if deleted successfully
|
||||
"""
|
||||
# Extract just the name part if a full path is provided
|
||||
if "/" in name:
|
||||
name = name.split("/")[-1]
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.delete(f"{self._base_url}/{name}?key={self._api_key}") as response:
|
||||
if response.status != 200:
|
||||
error_text = await response.text()
|
||||
logger.error(f"Error deleting file: {error_text}")
|
||||
raise Exception(f"Failed to delete file: {response.status}")
|
||||
|
||||
return True
|
||||
# These aliases are just here for backward compatibility, since we used to
|
||||
# define public-facing StartSensitivity and EndSensitivity enums in this
|
||||
# module.
|
||||
warnings.warn(
|
||||
"Importing GeminiFileAPI from "
|
||||
"pipecat.services.gemini_multimodal_live.file_api is deprecated. "
|
||||
"Please import it from pipecat.services.google.gemini_live.file_api instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
GeminiFileAPI = _GeminiFileAPI
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -9,6 +9,7 @@ import sys
|
||||
from pipecat.services import DeprecatedModuleProxy
|
||||
|
||||
from .frames import *
|
||||
from .gemini_live import *
|
||||
from .image import *
|
||||
from .llm import *
|
||||
from .llm_openai import *
|
||||
|
||||
3
src/pipecat/services/google/gemini_live/__init__.py
Normal file
3
src/pipecat/services/google/gemini_live/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from .file_api import GeminiFileAPI
|
||||
from .llm import GeminiLiveLLMService
|
||||
from .llm_vertex import GeminiLiveVertexLLMService
|
||||
189
src/pipecat/services/google/gemini_live/file_api.py
Normal file
189
src/pipecat/services/google/gemini_live/file_api.py
Normal file
@@ -0,0 +1,189 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Gemini File API client for uploading and managing files.
|
||||
|
||||
This module provides a client for Google's Gemini File API, enabling file
|
||||
uploads, metadata retrieval, listing, and deletion. Files uploaded through
|
||||
this API can be referenced in Gemini generative model calls.
|
||||
"""
|
||||
|
||||
import mimetypes
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import aiohttp
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class GeminiFileAPI:
|
||||
"""Client for the Gemini File API.
|
||||
|
||||
This class provides methods for uploading, fetching, listing, and deleting files
|
||||
through Google's Gemini File API.
|
||||
|
||||
Files uploaded through this API remain available for 48 hours and can be referenced
|
||||
in calls to the Gemini generative models. Maximum file size is 2GB, with total
|
||||
project storage limited to 20GB.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, api_key: str, base_url: str = "https://generativelanguage.googleapis.com/v1beta/files"
|
||||
):
|
||||
"""Initialize the Gemini File API client.
|
||||
|
||||
Args:
|
||||
api_key: Google AI API key
|
||||
base_url: Base URL for the Gemini File API (default is the v1beta endpoint)
|
||||
"""
|
||||
self._api_key = api_key
|
||||
self._base_url = base_url
|
||||
# Upload URL uses the /upload/ path
|
||||
self.upload_base_url = "https://generativelanguage.googleapis.com/upload/v1beta/files"
|
||||
|
||||
async def upload_file(
|
||||
self, file_path: str, display_name: Optional[str] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""Upload a file to the Gemini File API using the correct resumable upload protocol.
|
||||
|
||||
Args:
|
||||
file_path: Path to the file to upload
|
||||
display_name: Optional display name for the file
|
||||
|
||||
Returns:
|
||||
File metadata including uri, name, and display_name
|
||||
"""
|
||||
logger.info(f"Uploading file: {file_path}")
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
# Determine the file's MIME type
|
||||
mime_type, _ = mimetypes.guess_type(file_path)
|
||||
if not mime_type:
|
||||
mime_type = "application/octet-stream"
|
||||
|
||||
# Read the file
|
||||
with open(file_path, "rb") as f:
|
||||
file_data = f.read()
|
||||
|
||||
# Create the metadata payload
|
||||
metadata = {}
|
||||
if display_name:
|
||||
metadata = {"file": {"display_name": display_name}}
|
||||
|
||||
# Step 1: Initial resumable request to get upload URL
|
||||
headers = {
|
||||
"X-Goog-Upload-Protocol": "resumable",
|
||||
"X-Goog-Upload-Command": "start",
|
||||
"X-Goog-Upload-Header-Content-Length": str(len(file_data)),
|
||||
"X-Goog-Upload-Header-Content-Type": mime_type,
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
|
||||
logger.debug(f"Step 1: Getting upload URL from {self.upload_base_url}")
|
||||
async with session.post(
|
||||
f"{self.upload_base_url}?key={self._api_key}", headers=headers, json=metadata
|
||||
) as response:
|
||||
if response.status != 200:
|
||||
error_text = await response.text()
|
||||
logger.error(f"Error initiating file upload: {error_text}")
|
||||
raise Exception(f"Failed to initiate upload: {response.status} - {error_text}")
|
||||
|
||||
# Get the upload URL from the response header
|
||||
upload_url = response.headers.get("X-Goog-Upload-URL")
|
||||
if not upload_url:
|
||||
logger.error(f"Response headers: {dict(response.headers)}")
|
||||
raise Exception("No upload URL in response headers")
|
||||
|
||||
logger.debug(f"Got upload URL: {upload_url}")
|
||||
|
||||
# Step 2: Upload the actual file data
|
||||
upload_headers = {
|
||||
"Content-Length": str(len(file_data)),
|
||||
"X-Goog-Upload-Offset": "0",
|
||||
"X-Goog-Upload-Command": "upload, finalize",
|
||||
}
|
||||
|
||||
logger.debug(f"Step 2: Uploading file data to {upload_url}")
|
||||
async with session.post(upload_url, headers=upload_headers, data=file_data) as response:
|
||||
if response.status != 200:
|
||||
error_text = await response.text()
|
||||
logger.error(f"Error uploading file data: {error_text}")
|
||||
raise Exception(f"Failed to upload file: {response.status} - {error_text}")
|
||||
|
||||
file_info = await response.json()
|
||||
logger.info(f"File uploaded successfully: {file_info.get('file', {}).get('name')}")
|
||||
return file_info
|
||||
|
||||
async def get_file(self, name: str) -> Dict[str, Any]:
|
||||
"""Get metadata for a file.
|
||||
|
||||
Args:
|
||||
name: File name (or full path)
|
||||
|
||||
Returns:
|
||||
File metadata
|
||||
"""
|
||||
# Extract just the name part if a full path is provided
|
||||
if "/" in name:
|
||||
name = name.split("/")[-1]
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(f"{self._base_url}/{name}?key={self._api_key}") as response:
|
||||
if response.status != 200:
|
||||
error_text = await response.text()
|
||||
logger.error(f"Error getting file metadata: {error_text}")
|
||||
raise Exception(f"Failed to get file metadata: {response.status}")
|
||||
|
||||
file_info = await response.json()
|
||||
return file_info
|
||||
|
||||
async def list_files(
|
||||
self, page_size: int = 10, page_token: Optional[str] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""List uploaded files.
|
||||
|
||||
Args:
|
||||
page_size: Number of files to return per page
|
||||
page_token: Token for pagination
|
||||
|
||||
Returns:
|
||||
List of files and next page token if available
|
||||
"""
|
||||
params = {"key": self._api_key, "pageSize": page_size}
|
||||
|
||||
if page_token:
|
||||
params["pageToken"] = page_token
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(self._base_url, params=params) as response:
|
||||
if response.status != 200:
|
||||
error_text = await response.text()
|
||||
logger.error(f"Error listing files: {error_text}")
|
||||
raise Exception(f"Failed to list files: {response.status}")
|
||||
|
||||
result = await response.json()
|
||||
return result
|
||||
|
||||
async def delete_file(self, name: str) -> bool:
|
||||
"""Delete a file.
|
||||
|
||||
Args:
|
||||
name: File name (or full path)
|
||||
|
||||
Returns:
|
||||
True if deleted successfully
|
||||
"""
|
||||
# Extract just the name part if a full path is provided
|
||||
if "/" in name:
|
||||
name = name.split("/")[-1]
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.delete(f"{self._base_url}/{name}?key={self._api_key}") as response:
|
||||
if response.status != 200:
|
||||
error_text = await response.text()
|
||||
logger.error(f"Error deleting file: {error_text}")
|
||||
raise Exception(f"Failed to delete file: {response.status}")
|
||||
|
||||
return True
|
||||
1582
src/pipecat/services/google/gemini_live/llm.py
Normal file
1582
src/pipecat/services/google/gemini_live/llm.py
Normal file
File diff suppressed because it is too large
Load Diff
184
src/pipecat/services/google/gemini_live/llm_vertex.py
Normal file
184
src/pipecat/services/google/gemini_live/llm_vertex.py
Normal file
@@ -0,0 +1,184 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Service for accessing Gemini Live via Google Vertex AI.
|
||||
|
||||
This module provides integration with Google's Gemini Live model via
|
||||
Vertex AI, supporting both text and audio modalities with voice transcription,
|
||||
streaming responses, and tool usage.
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.services.google.gemini_live.llm import (
|
||||
GeminiLiveLLMService,
|
||||
HttpOptions,
|
||||
InputParams,
|
||||
)
|
||||
|
||||
try:
|
||||
from google.auth import default
|
||||
from google.auth.exceptions import GoogleAuthError
|
||||
from google.auth.transport.requests import Request
|
||||
from google.genai import Client
|
||||
from google.oauth2 import service_account
|
||||
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error("In order to use Google Vertex AI, you need to `pip install pipecat-ai[google]`.")
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
class GeminiLiveVertexLLMService(GeminiLiveLLMService):
|
||||
"""Provides access to Google's Gemini Live model via Vertex AI.
|
||||
|
||||
This service enables real-time conversations with Gemini, supporting both
|
||||
text and audio modalities. It handles voice transcription, streaming audio
|
||||
responses, and tool usage.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
credentials: Optional[str] = None,
|
||||
credentials_path: Optional[str] = None,
|
||||
location: str,
|
||||
project_id: str,
|
||||
model="google/gemini-2.0-flash-live-preview-04-09",
|
||||
voice_id: str = "Charon",
|
||||
start_audio_paused: bool = False,
|
||||
start_video_paused: bool = False,
|
||||
system_instruction: Optional[str] = None,
|
||||
tools: Optional[Union[List[dict], ToolsSchema]] = None,
|
||||
params: Optional[InputParams] = None,
|
||||
inference_on_context_initialization: bool = True,
|
||||
file_api_base_url: str = "https://generativelanguage.googleapis.com/v1beta/files",
|
||||
http_options: Optional[HttpOptions] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the service for accessing Gemini Live via Google Vertex AI.
|
||||
|
||||
Args:
|
||||
credentials: JSON string of service account credentials.
|
||||
credentials_path: Path to the service account JSON file.
|
||||
location: GCP region for Vertex AI endpoint (e.g., "us-east4").
|
||||
project_id: Google Cloud project ID.
|
||||
model: Model identifier to use. Defaults to "models/gemini-2.0-flash-live-preview-04-09".
|
||||
voice_id: TTS voice identifier. Defaults to "Charon".
|
||||
start_audio_paused: Whether to start with audio input paused. Defaults to False.
|
||||
start_video_paused: Whether to start with video input paused. Defaults to False.
|
||||
system_instruction: System prompt for the model. Defaults to None.
|
||||
tools: Tools/functions available to the model. Defaults to None.
|
||||
params: Configuration parameters for the model along with Vertex AI
|
||||
location and project ID.
|
||||
inference_on_context_initialization: Whether to generate a response when context
|
||||
is first set. Defaults to True.
|
||||
file_api_base_url: Base URL for the Gemini File API. Defaults to the official endpoint.
|
||||
http_options: HTTP options for the client.
|
||||
**kwargs: Additional arguments passed to parent GeminiLiveLLMService.
|
||||
"""
|
||||
# Check if user incorrectly passed api_key, which is used by parent
|
||||
# class but not here.
|
||||
if "api_key" in kwargs:
|
||||
logger.error(
|
||||
"GeminiLiveVertexLLMService does not accept 'api_key' parameter. "
|
||||
"Use 'credentials' or 'credentials_path' instead for Vertex AI authentication."
|
||||
)
|
||||
raise ValueError(
|
||||
"Invalid parameter 'api_key'. Use 'credentials' or 'credentials_path' for Vertex AI authentication."
|
||||
)
|
||||
|
||||
# These need to be set before calling super().__init__() because
|
||||
# super().__init__() invokes create_client(), which needs these.
|
||||
self._credentials = self._get_credentials(credentials, credentials_path)
|
||||
self._project_id = project_id
|
||||
self._location = location
|
||||
|
||||
# Call parent constructor with the obtained API key
|
||||
super().__init__(
|
||||
# api_key is required by parent class, but actually not used with
|
||||
# Vertex
|
||||
api_key="dummy",
|
||||
model=model,
|
||||
voice_id=voice_id,
|
||||
start_audio_paused=start_audio_paused,
|
||||
start_video_paused=start_video_paused,
|
||||
system_instruction=system_instruction,
|
||||
tools=tools,
|
||||
params=params,
|
||||
inference_on_context_initialization=inference_on_context_initialization,
|
||||
file_api_base_url=file_api_base_url,
|
||||
http_options=http_options,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def create_client(self):
|
||||
"""Create the Gemini client instance."""
|
||||
self._client = Client(
|
||||
vertexai=True,
|
||||
credentials=self._credentials,
|
||||
project=self._project_id,
|
||||
location=self._location,
|
||||
)
|
||||
|
||||
@property
|
||||
def file_api(self):
|
||||
"""Gemini File API is not supported with Vertex AI."""
|
||||
raise NotImplementedError(
|
||||
"When using Vertex AI, the recommended approach is to use Google Cloud Storage for file handling. The Gemini File API is not directly supported in this context."
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _get_credentials(credentials: Optional[str], credentials_path: Optional[str]) -> str:
|
||||
"""Retrieve Credentials using Google service account credentials JSON.
|
||||
|
||||
Supports multiple authentication methods:
|
||||
1. Direct JSON credentials string
|
||||
2. Path to service account JSON file
|
||||
3. Default application credentials (ADC)
|
||||
|
||||
Args:
|
||||
credentials: JSON string of service account credentials.
|
||||
credentials_path: Path to the service account JSON file.
|
||||
|
||||
Returns:
|
||||
OAuth token for API authentication.
|
||||
|
||||
Raises:
|
||||
ValueError: If no valid credentials are provided or found.
|
||||
"""
|
||||
creds: Optional[service_account.Credentials] = None
|
||||
|
||||
if credentials:
|
||||
# Parse and load credentials from JSON string
|
||||
creds = service_account.Credentials.from_service_account_info(
|
||||
json.loads(credentials),
|
||||
scopes=["https://www.googleapis.com/auth/cloud-platform"],
|
||||
)
|
||||
elif credentials_path:
|
||||
# Load credentials from JSON file
|
||||
creds = service_account.Credentials.from_service_account_file(
|
||||
credentials_path,
|
||||
scopes=["https://www.googleapis.com/auth/cloud-platform"],
|
||||
)
|
||||
else:
|
||||
try:
|
||||
creds, project_id = default(
|
||||
scopes=["https://www.googleapis.com/auth/cloud-platform"]
|
||||
)
|
||||
except GoogleAuthError:
|
||||
pass
|
||||
|
||||
if not creds:
|
||||
raise ValueError("No valid credentials provided.")
|
||||
|
||||
creds.refresh(Request()) # Ensure token is up-to-date, lifetime is 1 hour.
|
||||
|
||||
return creds
|
||||
@@ -35,6 +35,7 @@ from pipecat.frames.frames import (
|
||||
LLMMessagesFrame,
|
||||
LLMTextFrame,
|
||||
LLMUpdateSettingsFrame,
|
||||
OutputImageRawFrame,
|
||||
UserImageRawFrame,
|
||||
)
|
||||
from pipecat.metrics.metrics import LLMTokenUsage
|
||||
@@ -72,6 +73,9 @@ try:
|
||||
HttpOptions,
|
||||
Part,
|
||||
)
|
||||
|
||||
# Temporary hack to be able to process Nano Banana returned images.
|
||||
genai._api_client.READ_BUFFER_SIZE = 5 * 1024 * 1024
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error("In order to use Google AI, you need to `pip install pipecat-ai[google]`.")
|
||||
@@ -682,7 +686,7 @@ class GoogleLLMService(LLMService):
|
||||
self,
|
||||
*,
|
||||
api_key: str,
|
||||
model: str = "gemini-2.0-flash",
|
||||
model: str = "gemini-2.5-flash",
|
||||
params: Optional[InputParams] = None,
|
||||
system_instruction: Optional[str] = None,
|
||||
tools: Optional[List[Dict[str, Any]]] = None,
|
||||
@@ -710,6 +714,7 @@ class GoogleLLMService(LLMService):
|
||||
self._api_key = api_key
|
||||
self._system_instruction = system_instruction
|
||||
self._http_options = http_options
|
||||
|
||||
self._create_client(api_key, http_options)
|
||||
self._settings = {
|
||||
"max_tokens": params.max_tokens,
|
||||
@@ -788,6 +793,9 @@ class GoogleLLMService(LLMService):
|
||||
# and can be configured to turn it off.
|
||||
if not self._model_name.startswith("gemini-2.5-flash"):
|
||||
return
|
||||
# If we have an image model, we don't use a budget either.
|
||||
if "image" in self._model_name:
|
||||
return
|
||||
# If thinking_config is already set, don't override it.
|
||||
if "thinking_config" in generation_params:
|
||||
return
|
||||
@@ -927,6 +935,12 @@ class GoogleLLMService(LLMService):
|
||||
arguments=function_call.args or {},
|
||||
)
|
||||
)
|
||||
elif part.inline_data and part.inline_data.data:
|
||||
image = Image.open(io.BytesIO(part.inline_data.data))
|
||||
frame = OutputImageRawFrame(
|
||||
image=image.tobytes(), size=image.size, format="RGB"
|
||||
)
|
||||
await self.push_frame(frame)
|
||||
|
||||
if (
|
||||
candidate.grounding_metadata
|
||||
|
||||
@@ -94,9 +94,9 @@ class GoogleLLMOpenAIBetaService(OpenAILLMService):
|
||||
async for chunk in chunk_stream:
|
||||
if chunk.usage:
|
||||
tokens = LLMTokenUsage(
|
||||
prompt_tokens=chunk.usage.prompt_tokens,
|
||||
completion_tokens=chunk.usage.completion_tokens,
|
||||
total_tokens=chunk.usage.total_tokens,
|
||||
prompt_tokens=chunk.usage.prompt_tokens or 0,
|
||||
completion_tokens=chunk.usage.completion_tokens or 0,
|
||||
total_tokens=chunk.usage.total_tokens or 0,
|
||||
)
|
||||
await self.start_llm_usage_metrics(tokens)
|
||||
|
||||
|
||||
@@ -53,12 +53,44 @@ class GoogleVertexLLMService(OpenAILLMService):
|
||||
|
||||
Parameters:
|
||||
location: GCP region for Vertex AI endpoint (e.g., "us-east4").
|
||||
|
||||
.. deprecated:: 0.0.90
|
||||
Use `location` as a direct argument to
|
||||
`GoogleVertexLLMService.__init__()` instead.
|
||||
|
||||
project_id: Google Cloud project ID.
|
||||
|
||||
.. deprecated:: 0.0.90
|
||||
Use `project_id` as a direct argument to
|
||||
`GoogleVertexLLMService.__init__()` instead.
|
||||
"""
|
||||
|
||||
# https://cloud.google.com/vertex-ai/generative-ai/docs/learn/locations
|
||||
location: str = "us-east4"
|
||||
project_id: str
|
||||
location: Optional[str] = None
|
||||
project_id: Optional[str] = None
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
"""Initializes the InputParams."""
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
if "location" in kwargs and kwargs["location"] is not None:
|
||||
warnings.warn(
|
||||
"GoogleVertexLLMService.InputParams.location is deprecated. "
|
||||
"Please provide 'location' as a direct argument to GoogleVertexLLMService.__init__() instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
if "project_id" in kwargs and kwargs["project_id"] is not None:
|
||||
warnings.warn(
|
||||
"GoogleVertexLLMService.InputParams.project_id is deprecated. "
|
||||
"Please provide 'project_id' as a direct argument to GoogleVertexLLMService.__init__() instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -66,7 +98,8 @@ class GoogleVertexLLMService(OpenAILLMService):
|
||||
credentials: Optional[str] = None,
|
||||
credentials_path: Optional[str] = None,
|
||||
model: str = "google/gemini-2.0-flash-001",
|
||||
params: Optional[InputParams] = None,
|
||||
location: Optional[str] = None,
|
||||
project_id: Optional[str] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initializes the VertexLLMService.
|
||||
@@ -75,33 +108,60 @@ class GoogleVertexLLMService(OpenAILLMService):
|
||||
credentials: JSON string of service account credentials.
|
||||
credentials_path: Path to the service account JSON file.
|
||||
model: Model identifier (e.g., "google/gemini-2.0-flash-001").
|
||||
params: Vertex AI input parameters including location and project.
|
||||
location: GCP region for Vertex AI endpoint (e.g., "us-east4").
|
||||
project_id: Google Cloud project ID.
|
||||
**kwargs: Additional arguments passed to OpenAILLMService.
|
||||
"""
|
||||
params = params or OpenAILLMService.InputParams()
|
||||
base_url = self._get_base_url(params)
|
||||
# Handle deprecated InputParams fields
|
||||
if "params" in kwargs and isinstance(kwargs["params"], GoogleVertexLLMService.InputParams):
|
||||
params = kwargs["params"]
|
||||
# Extract location and project_id from params if not provided
|
||||
# directly, for backward compatibility
|
||||
if project_id is None:
|
||||
project_id = params.project_id
|
||||
if location is None:
|
||||
location = params.location
|
||||
# Convert to base InputParams
|
||||
params = OpenAILLMService.InputParams(
|
||||
**params.model_dump(exclude={"location", "project_id"}, exclude_unset=True)
|
||||
)
|
||||
kwargs["params"] = params
|
||||
|
||||
# Validate project_id and location parameters
|
||||
# NOTE: once we remove Vertex-spcific InputParams class, we can update
|
||||
# __init__() signature as follows:
|
||||
# - location: str = "us-east4",
|
||||
# - project_id: str,
|
||||
# But for now, we need them as-is to maintain proper backward
|
||||
# compatibility.
|
||||
if project_id is None:
|
||||
raise ValueError("project_id is required")
|
||||
if location is None:
|
||||
# If location is not provided, default to "us-east4".
|
||||
# Note: this is legacy behavior; ideally location would be
|
||||
# required.
|
||||
logger.warning("location is not provided. Defaulting to 'us-east4'.")
|
||||
location = "us-east4" # Default location if not provided
|
||||
|
||||
base_url = self._get_base_url(location, project_id)
|
||||
self._api_key = self._get_api_token(credentials, credentials_path)
|
||||
|
||||
super().__init__(
|
||||
api_key=self._api_key,
|
||||
base_url=base_url,
|
||||
model=model,
|
||||
params=params,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _get_base_url(params: InputParams) -> str:
|
||||
def _get_base_url(location: str, project_id: str) -> str:
|
||||
"""Construct the base URL for Vertex AI API."""
|
||||
# Determine the correct API host based on location
|
||||
if params.location == "global":
|
||||
if location == "global":
|
||||
api_host = "aiplatform.googleapis.com"
|
||||
else:
|
||||
api_host = f"{params.location}-aiplatform.googleapis.com"
|
||||
return (
|
||||
f"https://{api_host}/v1/"
|
||||
f"projects/{params.project_id}/locations/{params.location}/endpoints/openapi"
|
||||
)
|
||||
api_host = f"{location}-aiplatform.googleapis.com"
|
||||
return f"https://{api_host}/v1/projects/{project_id}/locations/{location}/endpoints/openapi"
|
||||
|
||||
@staticmethod
|
||||
def _get_api_token(credentials: Optional[str], credentials_path: Optional[str]) -> str:
|
||||
|
||||
5
src/pipecat/services/hume/__init__.py
Normal file
5
src/pipecat/services/hume/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
220
src/pipecat/services/hume/tts.py
Normal file
220
src/pipecat/services/hume/tts.py
Normal file
@@ -0,0 +1,220 @@
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
|
||||
"""Hume Text-to-Speech service implementation."""
|
||||
|
||||
import base64
|
||||
import os
|
||||
from typing import Any, AsyncGenerator, Optional
|
||||
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
StartFrame,
|
||||
TTSAudioRawFrame,
|
||||
TTSStartedFrame,
|
||||
TTSStoppedFrame,
|
||||
)
|
||||
from pipecat.services.tts_service import TTSService
|
||||
from pipecat.utils.tracing.service_decorators import traced_tts
|
||||
|
||||
try:
|
||||
from hume import AsyncHumeClient
|
||||
from hume.tts import (
|
||||
FormatPcm,
|
||||
PostedUtterance,
|
||||
PostedUtteranceVoiceWithId,
|
||||
)
|
||||
except ModuleNotFoundError as e: # pragma: no cover - import-time guidance
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error("In order to use Hume, you need to `pip install pipecat-ai[hume]`.")
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
HUME_SAMPLE_RATE = 48_000 # Hume TTS streams at 48 kHz
|
||||
|
||||
|
||||
class HumeTTSService(TTSService):
|
||||
"""Hume Octave Text-to-Speech service.
|
||||
|
||||
Streams PCM audio via Hume's HTTP output streaming (JSON chunks) endpoint
|
||||
using the Python SDK and emits ``TTSAudioRawFrame`` frames suitable for Pipecat transports.
|
||||
|
||||
Supported features:
|
||||
|
||||
- Generates speech from text using Hume TTS.
|
||||
- Streams PCM audio.
|
||||
- Supports dynamic updates of voice and synthesis parameters at runtime.
|
||||
- Provides metrics for Time To First Byte (TTFB) and TTS usage.
|
||||
"""
|
||||
|
||||
class InputParams(BaseModel):
|
||||
"""Optional synthesis parameters for Hume TTS.
|
||||
|
||||
Parameters:
|
||||
description: Natural-language acting directions (up to 100 characters).
|
||||
speed: Speaking-rate multiplier (0.5-2.0).
|
||||
trailing_silence: Seconds of silence to append at the end (0-5).
|
||||
"""
|
||||
|
||||
description: Optional[str] = None
|
||||
speed: Optional[float] = None
|
||||
trailing_silence: Optional[float] = None
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
api_key: Optional[str] = None,
|
||||
voice_id: str,
|
||||
params: Optional[InputParams] = None,
|
||||
sample_rate: Optional[int] = HUME_SAMPLE_RATE,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
"""Initialize the HumeTTSService.
|
||||
|
||||
Args:
|
||||
api_key: Hume API key. If omitted, reads the ``HUME_API_KEY`` environment variable.
|
||||
voice_id: ID of the voice to use. Only voice IDs are supported; voice names are not.
|
||||
params: Optional synthesis controls (acting instructions, speed, trailing silence).
|
||||
sample_rate: Output sample rate for emitted PCM frames. Defaults to 48_000 (Hume).
|
||||
**kwargs: Additional arguments passed to the parent class.
|
||||
"""
|
||||
api_key = api_key or os.getenv("HUME_API_KEY")
|
||||
if not api_key:
|
||||
raise ValueError("HumeTTSService requires an API key (env HUME_API_KEY or api_key=)")
|
||||
|
||||
if sample_rate != HUME_SAMPLE_RATE:
|
||||
logger.warning(
|
||||
f"Hume TTS streams at {HUME_SAMPLE_RATE} Hz; configured sample_rate={sample_rate}"
|
||||
)
|
||||
|
||||
super().__init__(sample_rate=sample_rate, **kwargs)
|
||||
|
||||
self._client = AsyncHumeClient(api_key=api_key)
|
||||
self._params = params or HumeTTSService.InputParams()
|
||||
|
||||
# Store voice in the base class (mirrors other services)
|
||||
self.set_voice(voice_id)
|
||||
|
||||
self._audio_bytes = b""
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
"""Can generate metrics.
|
||||
|
||||
Returns:
|
||||
True if metrics can be generated, False otherwise.
|
||||
"""
|
||||
return True
|
||||
|
||||
async def start(self, frame: StartFrame) -> None:
|
||||
"""Start the service.
|
||||
|
||||
Args:
|
||||
frame: The start frame.
|
||||
"""
|
||||
await super().start(frame)
|
||||
|
||||
async def update_setting(self, key: str, value: Any) -> None:
|
||||
"""Runtime updates via `TTSUpdateSettingsFrame`.
|
||||
|
||||
Args:
|
||||
key: The name of the setting to update. Recognized keys are:
|
||||
- "voice_id"
|
||||
- "description"
|
||||
- "speed"
|
||||
- "trailing_silence"
|
||||
value: The new value for the setting.
|
||||
"""
|
||||
key_l = (key or "").lower()
|
||||
|
||||
if key_l == "voice_id":
|
||||
self.set_voice(str(value))
|
||||
logger.info(f"HumeTTSService voice_id set to: {self.voice}")
|
||||
elif key_l == "description":
|
||||
self._params.description = None if value is None else str(value)
|
||||
elif key_l == "speed":
|
||||
self._params.speed = None if value is None else float(value)
|
||||
elif key_l == "trailing_silence":
|
||||
self._params.trailing_silence = None if value is None else float(value)
|
||||
else:
|
||||
# Defer unknown keys to the base class
|
||||
await super().update_setting(key, value)
|
||||
|
||||
@traced_tts
|
||||
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
||||
"""Generate speech from text using Hume TTS.
|
||||
|
||||
Args:
|
||||
text: The text to be synthesized.
|
||||
|
||||
Returns:
|
||||
An async generator that yields `Frame` objects, including
|
||||
`TTSStartedFrame`, `TTSAudioRawFrame`, `ErrorFrame`, and
|
||||
`TTSStoppedFrame`.
|
||||
"""
|
||||
logger.debug(f"{self}: Generating Hume TTS: [{text}]")
|
||||
|
||||
# Build the request payload
|
||||
utterance_kwargs: dict[str, Any] = {
|
||||
"text": text,
|
||||
"voice": PostedUtteranceVoiceWithId(id=self._voice_id),
|
||||
}
|
||||
if self._params.description is not None:
|
||||
utterance_kwargs["description"] = self._params.description
|
||||
if self._params.speed is not None:
|
||||
utterance_kwargs["speed"] = self._params.speed
|
||||
if self._params.trailing_silence is not None:
|
||||
utterance_kwargs["trailing_silence"] = self._params.trailing_silence
|
||||
|
||||
utterance = PostedUtterance(**utterance_kwargs)
|
||||
|
||||
# Request raw PCM chunks in the streaming JSON
|
||||
pcm_fmt = FormatPcm(type="pcm")
|
||||
|
||||
await self.start_ttfb_metrics()
|
||||
await self.start_tts_usage_metrics(text)
|
||||
yield TTSStartedFrame()
|
||||
|
||||
try:
|
||||
# Instant mode is always enabled here (not user-configurable)
|
||||
# Hume emits mono PCM at 48 kHz; downstream can resample if needed.
|
||||
# We buffer audio bytes before sending to prevent glitches.
|
||||
self._audio_bytes = b""
|
||||
async for chunk in self._client.tts.synthesize_json_streaming(
|
||||
utterances=[utterance],
|
||||
format=pcm_fmt,
|
||||
instant_mode=True,
|
||||
version="2",
|
||||
):
|
||||
audio_b64 = getattr(chunk, "audio", None)
|
||||
if not audio_b64:
|
||||
continue
|
||||
|
||||
pcm_bytes = base64.b64decode(audio_b64)
|
||||
self._audio_bytes += pcm_bytes
|
||||
|
||||
# Buffer audio until we have enough to avoid glitches
|
||||
if len(self._audio_bytes) < self.chunk_size:
|
||||
continue
|
||||
|
||||
frame = TTSAudioRawFrame(
|
||||
audio=self._audio_bytes,
|
||||
sample_rate=self.sample_rate,
|
||||
num_channels=1,
|
||||
)
|
||||
|
||||
yield frame
|
||||
|
||||
self._audio_bytes = b""
|
||||
|
||||
except Exception as e:
|
||||
logger.exception(f"{self} error generating TTS: {e}")
|
||||
await self.push_error(ErrorFrame(f"Error generating TTS: {e}"))
|
||||
finally:
|
||||
# Ensure TTFB timer is stopped even on early failures
|
||||
await self.stop_ttfb_metrics()
|
||||
yield TTSStoppedFrame()
|
||||
@@ -10,6 +10,7 @@ from pipecat.services import DeprecatedModuleProxy
|
||||
|
||||
from .image import *
|
||||
from .llm import *
|
||||
from .realtime import *
|
||||
from .stt import *
|
||||
from .tts import *
|
||||
|
||||
|
||||
@@ -66,6 +66,7 @@ class BaseOpenAILLMService(LLMService):
|
||||
top_p: Top-p (nucleus) sampling parameter (0.0 to 1.0).
|
||||
max_tokens: Maximum tokens in response (deprecated, use max_completion_tokens).
|
||||
max_completion_tokens: Maximum completion tokens to generate.
|
||||
service_tier: Service tier to use (e.g., "auto", "flex", "priority").
|
||||
extra: Additional model-specific parameters.
|
||||
"""
|
||||
|
||||
@@ -83,6 +84,7 @@ class BaseOpenAILLMService(LLMService):
|
||||
top_p: Optional[float] = Field(default_factory=lambda: NOT_GIVEN, ge=0.0, le=1.0)
|
||||
max_tokens: Optional[int] = Field(default_factory=lambda: NOT_GIVEN, ge=1)
|
||||
max_completion_tokens: Optional[int] = Field(default_factory=lambda: NOT_GIVEN, ge=1)
|
||||
service_tier: Optional[str] = Field(default_factory=lambda: NOT_GIVEN)
|
||||
extra: Optional[Dict[str, Any]] = Field(default_factory=dict)
|
||||
|
||||
def __init__(
|
||||
@@ -125,6 +127,7 @@ class BaseOpenAILLMService(LLMService):
|
||||
"top_p": params.top_p,
|
||||
"max_tokens": params.max_tokens,
|
||||
"max_completion_tokens": params.max_completion_tokens,
|
||||
"service_tier": params.service_tier,
|
||||
"extra": params.extra if isinstance(params.extra, dict) else {},
|
||||
}
|
||||
self._retry_timeout_secs = retry_timeout_secs
|
||||
@@ -236,6 +239,7 @@ class BaseOpenAILLMService(LLMService):
|
||||
"top_p": self._settings["top_p"],
|
||||
"max_tokens": self._settings["max_tokens"],
|
||||
"max_completion_tokens": self._settings["max_completion_tokens"],
|
||||
"service_tier": self._settings["service_tier"],
|
||||
}
|
||||
|
||||
# Messages, tools, tool_choice
|
||||
|
||||
0
src/pipecat/services/openai/realtime/__init__.py
Normal file
0
src/pipecat/services/openai/realtime/__init__.py
Normal file
272
src/pipecat/services/openai/realtime/context.py
Normal file
272
src/pipecat/services/openai/realtime/context.py
Normal file
@@ -0,0 +1,272 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""OpenAI Realtime LLM context and aggregator implementations."""
|
||||
|
||||
import copy
|
||||
import json
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
FunctionCallResultFrame,
|
||||
InterimTranscriptionFrame,
|
||||
LLMMessagesUpdateFrame,
|
||||
LLMSetToolsFrame,
|
||||
LLMTextFrame,
|
||||
TranscriptionFrame,
|
||||
)
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.openai.llm import (
|
||||
OpenAIAssistantContextAggregator,
|
||||
OpenAIUserContextAggregator,
|
||||
)
|
||||
|
||||
from . import events
|
||||
from .frames import RealtimeFunctionCallResultFrame, RealtimeMessagesUpdateFrame
|
||||
|
||||
|
||||
class OpenAIRealtimeLLMContext(OpenAILLMContext):
|
||||
"""OpenAI Realtime LLM context with session management and message conversion.
|
||||
|
||||
Extends the standard OpenAI LLM context to support real-time session properties,
|
||||
instruction management, and conversion between standard message formats and
|
||||
realtime conversation items.
|
||||
"""
|
||||
|
||||
def __init__(self, messages=None, tools=None, **kwargs):
|
||||
"""Initialize the OpenAIRealtimeLLMContext.
|
||||
|
||||
Args:
|
||||
messages: Initial conversation messages. Defaults to None.
|
||||
tools: Available function tools. Defaults to None.
|
||||
**kwargs: Additional arguments passed to parent OpenAILLMContext.
|
||||
"""
|
||||
super().__init__(messages=messages, tools=tools, **kwargs)
|
||||
self.__setup_local()
|
||||
|
||||
def __setup_local(self):
|
||||
self.llm_needs_settings_update = True
|
||||
self.llm_needs_initial_messages = True
|
||||
self._session_instructions = ""
|
||||
|
||||
return
|
||||
|
||||
@staticmethod
|
||||
def upgrade_to_realtime(obj: OpenAILLMContext) -> "OpenAIRealtimeLLMContext":
|
||||
"""Upgrade a standard OpenAI LLM context to a realtime context.
|
||||
|
||||
Args:
|
||||
obj: The OpenAILLMContext instance to upgrade.
|
||||
|
||||
Returns:
|
||||
The upgraded OpenAIRealtimeLLMContext instance.
|
||||
"""
|
||||
if isinstance(obj, OpenAILLMContext) and not isinstance(obj, OpenAIRealtimeLLMContext):
|
||||
obj.__class__ = OpenAIRealtimeLLMContext
|
||||
obj.__setup_local()
|
||||
return obj
|
||||
|
||||
# todo
|
||||
# - finish implementing all frames
|
||||
|
||||
def from_standard_message(self, message):
|
||||
"""Convert a standard message format to a realtime conversation item.
|
||||
|
||||
Args:
|
||||
message: The standard message dictionary to convert.
|
||||
|
||||
Returns:
|
||||
A ConversationItem instance for the realtime API.
|
||||
"""
|
||||
if message.get("role") == "user":
|
||||
content = message.get("content")
|
||||
if isinstance(message.get("content"), list):
|
||||
content = ""
|
||||
for c in message.get("content"):
|
||||
if c.get("type") == "text":
|
||||
content += " " + c.get("text")
|
||||
else:
|
||||
logger.error(
|
||||
f"Unhandled content type in context message: {c.get('type')} - {message}"
|
||||
)
|
||||
return events.ConversationItem(
|
||||
role="user",
|
||||
type="message",
|
||||
content=[events.ItemContent(type="input_text", text=content)],
|
||||
)
|
||||
if message.get("role") == "assistant" and message.get("tool_calls"):
|
||||
tc = message.get("tool_calls")[0]
|
||||
return events.ConversationItem(
|
||||
type="function_call",
|
||||
call_id=tc["id"],
|
||||
name=tc["function"]["name"],
|
||||
arguments=tc["function"]["arguments"],
|
||||
)
|
||||
logger.error(f"Unhandled message type in from_standard_message: {message}")
|
||||
|
||||
def get_messages_for_initializing_history(self):
|
||||
"""Get conversation items for initializing the realtime session history.
|
||||
|
||||
Converts the context's messages to a format suitable for the realtime API,
|
||||
handling system instructions and conversation history packaging.
|
||||
|
||||
Returns:
|
||||
List of conversation items for session initialization.
|
||||
"""
|
||||
# We can't load a long conversation history into the openai realtime api yet. (The API/model
|
||||
# forgets that it can do audio, if you do a series of `conversation.item.create` calls.) So
|
||||
# our general strategy until this is fixed is just to put everything into a first "user"
|
||||
# message as a single input.
|
||||
if not self.messages:
|
||||
return []
|
||||
|
||||
messages = copy.deepcopy(self.messages)
|
||||
|
||||
# If we have a "system" message as our first message, let's pull that out into session
|
||||
# "instructions"
|
||||
if messages[0].get("role") == "system":
|
||||
self.llm_needs_settings_update = True
|
||||
system = messages.pop(0)
|
||||
content = system.get("content")
|
||||
if isinstance(content, str):
|
||||
self._session_instructions = content
|
||||
elif isinstance(content, list):
|
||||
self._session_instructions = content[0].get("text")
|
||||
if not messages:
|
||||
return []
|
||||
|
||||
# If we have just a single "user" item, we can just send it normally
|
||||
if len(messages) == 1 and messages[0].get("role") == "user":
|
||||
return [self.from_standard_message(messages[0])]
|
||||
|
||||
# Otherwise, let's pack everything into a single "user" message with a bit of
|
||||
# explanation for the LLM
|
||||
intro_text = """
|
||||
This is a previously saved conversation. Please treat this conversation history as a
|
||||
starting point for the current conversation."""
|
||||
|
||||
trailing_text = """
|
||||
This is the end of the previously saved conversation. Please continue the conversation
|
||||
from here. If the last message is a user instruction or question, act on that instruction
|
||||
or answer the question. If the last message is an assistant response, simple say that you
|
||||
are ready to continue the conversation."""
|
||||
|
||||
return [
|
||||
{
|
||||
"role": "user",
|
||||
"type": "message",
|
||||
"content": [
|
||||
{
|
||||
"type": "input_text",
|
||||
"text": "\n\n".join(
|
||||
[intro_text, json.dumps(messages, indent=2), trailing_text]
|
||||
),
|
||||
}
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
def add_user_content_item_as_message(self, item):
|
||||
"""Add a user content item as a standard message to the context.
|
||||
|
||||
Args:
|
||||
item: The conversation item to add as a user message.
|
||||
"""
|
||||
message = {
|
||||
"role": "user",
|
||||
"content": [{"type": "text", "text": item.content[0].transcript}],
|
||||
}
|
||||
self.add_message(message)
|
||||
|
||||
|
||||
class OpenAIRealtimeUserContextAggregator(OpenAIUserContextAggregator):
|
||||
"""User context aggregator for OpenAI Realtime API.
|
||||
|
||||
Handles user input frames and generates appropriate context updates
|
||||
for the realtime conversation, including message updates and tool settings.
|
||||
|
||||
Args:
|
||||
context: The OpenAI realtime LLM context.
|
||||
**kwargs: Additional arguments passed to parent aggregator.
|
||||
"""
|
||||
|
||||
async def process_frame(
|
||||
self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM
|
||||
):
|
||||
"""Process incoming frames and handle realtime-specific frame types.
|
||||
|
||||
Args:
|
||||
frame: The frame to process.
|
||||
direction: The direction of frame flow in the pipeline.
|
||||
"""
|
||||
await super().process_frame(frame, direction)
|
||||
# Parent does not push LLMMessagesUpdateFrame. This ensures that in a typical pipeline,
|
||||
# messages are only processed by the user context aggregator, which is generally what we want. But
|
||||
# we also need to send new messages over the websocket, so the openai realtime API has them
|
||||
# in its context.
|
||||
if isinstance(frame, LLMMessagesUpdateFrame):
|
||||
await self.push_frame(RealtimeMessagesUpdateFrame(context=self._context))
|
||||
|
||||
# Parent also doesn't push the LLMSetToolsFrame.
|
||||
if isinstance(frame, LLMSetToolsFrame):
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
async def push_aggregation(self):
|
||||
"""Push user input aggregation.
|
||||
|
||||
Currently ignores all user input coming into the pipeline as realtime
|
||||
audio input is handled directly by the service.
|
||||
"""
|
||||
# for the moment, ignore all user input coming into the pipeline.
|
||||
# todo: think about whether/how to fix this to allow for text input from
|
||||
# upstream (transport/transcription, or other sources)
|
||||
pass
|
||||
|
||||
|
||||
class OpenAIRealtimeAssistantContextAggregator(OpenAIAssistantContextAggregator):
|
||||
"""Assistant context aggregator for OpenAI Realtime API.
|
||||
|
||||
Handles assistant output frames from the realtime service, filtering
|
||||
out duplicate text frames and managing function call results.
|
||||
|
||||
Args:
|
||||
context: The OpenAI realtime LLM context.
|
||||
**kwargs: Additional arguments passed to parent aggregator.
|
||||
"""
|
||||
|
||||
# The LLMAssistantContextAggregator uses TextFrames to aggregate the LLM output,
|
||||
# but the OpenAIRealtimeLLMService pushes LLMTextFrames and TTSTextFrames. We
|
||||
# need to override this proces_frame for LLMTextFrame, so that only the TTSTextFrames
|
||||
# are process. This ensures that the context gets only one set of messages.
|
||||
# OpenAIRealtimeLLMService also pushes TranscriptionFrames and InterimTranscriptionFrames,
|
||||
# so we need to ignore pushing those as well, as they're also TextFrames.
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process assistant frames, filtering out duplicate text content.
|
||||
|
||||
Args:
|
||||
frame: The frame to process.
|
||||
direction: The direction of frame flow in the pipeline.
|
||||
"""
|
||||
if not isinstance(frame, (LLMTextFrame, TranscriptionFrame, InterimTranscriptionFrame)):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
async def handle_function_call_result(self, frame: FunctionCallResultFrame):
|
||||
"""Handle function call result and notify the realtime service.
|
||||
|
||||
Args:
|
||||
frame: The function call result frame to handle.
|
||||
"""
|
||||
await super().handle_function_call_result(frame)
|
||||
|
||||
# The standard function callback code path pushes the FunctionCallResultFrame from the llm itself,
|
||||
# so we didn't have a chance to add the result to the openai realtime api context. Let's push a
|
||||
# special frame to do that.
|
||||
await self.push_frame(
|
||||
RealtimeFunctionCallResultFrame(result_frame=frame), FrameDirection.UPSTREAM
|
||||
)
|
||||
1106
src/pipecat/services/openai/realtime/events.py
Normal file
1106
src/pipecat/services/openai/realtime/events.py
Normal file
File diff suppressed because it is too large
Load Diff
37
src/pipecat/services/openai/realtime/frames.py
Normal file
37
src/pipecat/services/openai/realtime/frames.py
Normal file
@@ -0,0 +1,37 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Custom frame types for OpenAI Realtime API integration."""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from pipecat.frames.frames import DataFrame, FunctionCallResultFrame
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pipecat.services.openai.realtime.context import OpenAIRealtimeLLMContext
|
||||
|
||||
|
||||
@dataclass
|
||||
class RealtimeMessagesUpdateFrame(DataFrame):
|
||||
"""Frame indicating that the realtime context messages have been updated.
|
||||
|
||||
Parameters:
|
||||
context: The updated OpenAI realtime LLM context.
|
||||
"""
|
||||
|
||||
context: "OpenAIRealtimeLLMContext"
|
||||
|
||||
|
||||
@dataclass
|
||||
class RealtimeFunctionCallResultFrame(DataFrame):
|
||||
"""Frame containing function call results for the realtime service.
|
||||
|
||||
Parameters:
|
||||
result_frame: The function call result frame to send to the realtime API.
|
||||
"""
|
||||
|
||||
result_frame: FunctionCallResultFrame
|
||||
@@ -1,9 +1,27 @@
|
||||
from .azure import AzureRealtimeLLMService
|
||||
from .events import (
|
||||
#
|
||||
# Copyright (c) 2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import warnings
|
||||
|
||||
from pipecat.services.azure.realtime.llm import AzureRealtimeLLMService
|
||||
from pipecat.services.openai.realtime.events import (
|
||||
InputAudioNoiseReduction,
|
||||
InputAudioTranscription,
|
||||
SemanticTurnDetection,
|
||||
SessionProperties,
|
||||
TurnDetection,
|
||||
)
|
||||
from .openai import OpenAIRealtimeLLMService
|
||||
from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"Types in pipecat.services.openai_realtime are deprecated. "
|
||||
"Please use the equivalent types from "
|
||||
"pipecat.services.openai.realtime instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
@@ -1,67 +1,21 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
# Copyright (c) 2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Azure OpenAI Realtime LLM service implementation."""
|
||||
|
||||
from loguru import logger
|
||||
import warnings
|
||||
|
||||
from .openai import OpenAIRealtimeLLMService
|
||||
from pipecat.services.azure.realtime.llm import *
|
||||
|
||||
try:
|
||||
from websockets.asyncio.client import connect as websocket_connect
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error(
|
||||
"In order to use OpenAI, you need to `pip install pipecat-ai[openai]`. Also, set `OPENAI_API_KEY` environment variable."
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"Types in pipecat.services.openai_realtime.azure are deprecated. "
|
||||
"Please use the equivalent types from "
|
||||
"pipecat.services.azure.realtime.llm instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
class AzureRealtimeLLMService(OpenAIRealtimeLLMService):
|
||||
"""Azure OpenAI Realtime LLM service with Azure-specific authentication.
|
||||
|
||||
Extends the OpenAI Realtime service to work with Azure OpenAI endpoints,
|
||||
using Azure's authentication headers and endpoint format. Provides the same
|
||||
real-time audio and text communication capabilities as the base OpenAI service.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
api_key: str,
|
||||
base_url: str,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize Azure Realtime LLM service.
|
||||
|
||||
Args:
|
||||
api_key: The API key for the Azure OpenAI service.
|
||||
base_url: The full Azure WebSocket endpoint URL including api-version and deployment.
|
||||
Example: "wss://my-project.openai.azure.com/openai/realtime?api-version=2024-10-01-preview&deployment=my-realtime-deployment"
|
||||
**kwargs: Additional arguments passed to parent OpenAIRealtimeLLMService.
|
||||
"""
|
||||
super().__init__(base_url=base_url, api_key=api_key, **kwargs)
|
||||
self.api_key = api_key
|
||||
self.base_url = base_url
|
||||
|
||||
async def _connect(self):
|
||||
try:
|
||||
if self._websocket:
|
||||
# Here we assume that if we have a websocket, we are connected. We
|
||||
# handle disconnections in the send/recv code paths.
|
||||
return
|
||||
|
||||
logger.info(f"Connecting to {self.base_url}, api key: {self.api_key}")
|
||||
self._websocket = await websocket_connect(
|
||||
uri=self.base_url,
|
||||
additional_headers={
|
||||
"api-key": self.api_key,
|
||||
},
|
||||
)
|
||||
self._receive_task = self.create_task(self._receive_task_handler())
|
||||
except Exception as e:
|
||||
logger.error(f"{self} initialization error: {e}")
|
||||
self._websocket = None
|
||||
|
||||
@@ -1,272 +1,21 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
# Copyright (c) 2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""OpenAI Realtime LLM context and aggregator implementations."""
|
||||
|
||||
import copy
|
||||
import json
|
||||
import warnings
|
||||
|
||||
from loguru import logger
|
||||
from pipecat.services.openai.realtime.context import *
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
FunctionCallResultFrame,
|
||||
InterimTranscriptionFrame,
|
||||
LLMMessagesUpdateFrame,
|
||||
LLMSetToolsFrame,
|
||||
LLMTextFrame,
|
||||
TranscriptionFrame,
|
||||
)
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.openai.llm import (
|
||||
OpenAIAssistantContextAggregator,
|
||||
OpenAIUserContextAggregator,
|
||||
)
|
||||
|
||||
from . import events
|
||||
from .frames import RealtimeFunctionCallResultFrame, RealtimeMessagesUpdateFrame
|
||||
|
||||
|
||||
class OpenAIRealtimeLLMContext(OpenAILLMContext):
|
||||
"""OpenAI Realtime LLM context with session management and message conversion.
|
||||
|
||||
Extends the standard OpenAI LLM context to support real-time session properties,
|
||||
instruction management, and conversion between standard message formats and
|
||||
realtime conversation items.
|
||||
"""
|
||||
|
||||
def __init__(self, messages=None, tools=None, **kwargs):
|
||||
"""Initialize the OpenAIRealtimeLLMContext.
|
||||
|
||||
Args:
|
||||
messages: Initial conversation messages. Defaults to None.
|
||||
tools: Available function tools. Defaults to None.
|
||||
**kwargs: Additional arguments passed to parent OpenAILLMContext.
|
||||
"""
|
||||
super().__init__(messages=messages, tools=tools, **kwargs)
|
||||
self.__setup_local()
|
||||
|
||||
def __setup_local(self):
|
||||
self.llm_needs_settings_update = True
|
||||
self.llm_needs_initial_messages = True
|
||||
self._session_instructions = ""
|
||||
|
||||
return
|
||||
|
||||
@staticmethod
|
||||
def upgrade_to_realtime(obj: OpenAILLMContext) -> "OpenAIRealtimeLLMContext":
|
||||
"""Upgrade a standard OpenAI LLM context to a realtime context.
|
||||
|
||||
Args:
|
||||
obj: The OpenAILLMContext instance to upgrade.
|
||||
|
||||
Returns:
|
||||
The upgraded OpenAIRealtimeLLMContext instance.
|
||||
"""
|
||||
if isinstance(obj, OpenAILLMContext) and not isinstance(obj, OpenAIRealtimeLLMContext):
|
||||
obj.__class__ = OpenAIRealtimeLLMContext
|
||||
obj.__setup_local()
|
||||
return obj
|
||||
|
||||
# todo
|
||||
# - finish implementing all frames
|
||||
|
||||
def from_standard_message(self, message):
|
||||
"""Convert a standard message format to a realtime conversation item.
|
||||
|
||||
Args:
|
||||
message: The standard message dictionary to convert.
|
||||
|
||||
Returns:
|
||||
A ConversationItem instance for the realtime API.
|
||||
"""
|
||||
if message.get("role") == "user":
|
||||
content = message.get("content")
|
||||
if isinstance(message.get("content"), list):
|
||||
content = ""
|
||||
for c in message.get("content"):
|
||||
if c.get("type") == "text":
|
||||
content += " " + c.get("text")
|
||||
else:
|
||||
logger.error(
|
||||
f"Unhandled content type in context message: {c.get('type')} - {message}"
|
||||
)
|
||||
return events.ConversationItem(
|
||||
role="user",
|
||||
type="message",
|
||||
content=[events.ItemContent(type="input_text", text=content)],
|
||||
)
|
||||
if message.get("role") == "assistant" and message.get("tool_calls"):
|
||||
tc = message.get("tool_calls")[0]
|
||||
return events.ConversationItem(
|
||||
type="function_call",
|
||||
call_id=tc["id"],
|
||||
name=tc["function"]["name"],
|
||||
arguments=tc["function"]["arguments"],
|
||||
)
|
||||
logger.error(f"Unhandled message type in from_standard_message: {message}")
|
||||
|
||||
def get_messages_for_initializing_history(self):
|
||||
"""Get conversation items for initializing the realtime session history.
|
||||
|
||||
Converts the context's messages to a format suitable for the realtime API,
|
||||
handling system instructions and conversation history packaging.
|
||||
|
||||
Returns:
|
||||
List of conversation items for session initialization.
|
||||
"""
|
||||
# We can't load a long conversation history into the openai realtime api yet. (The API/model
|
||||
# forgets that it can do audio, if you do a series of `conversation.item.create` calls.) So
|
||||
# our general strategy until this is fixed is just to put everything into a first "user"
|
||||
# message as a single input.
|
||||
if not self.messages:
|
||||
return []
|
||||
|
||||
messages = copy.deepcopy(self.messages)
|
||||
|
||||
# If we have a "system" message as our first message, let's pull that out into session
|
||||
# "instructions"
|
||||
if messages[0].get("role") == "system":
|
||||
self.llm_needs_settings_update = True
|
||||
system = messages.pop(0)
|
||||
content = system.get("content")
|
||||
if isinstance(content, str):
|
||||
self._session_instructions = content
|
||||
elif isinstance(content, list):
|
||||
self._session_instructions = content[0].get("text")
|
||||
if not messages:
|
||||
return []
|
||||
|
||||
# If we have just a single "user" item, we can just send it normally
|
||||
if len(messages) == 1 and messages[0].get("role") == "user":
|
||||
return [self.from_standard_message(messages[0])]
|
||||
|
||||
# Otherwise, let's pack everything into a single "user" message with a bit of
|
||||
# explanation for the LLM
|
||||
intro_text = """
|
||||
This is a previously saved conversation. Please treat this conversation history as a
|
||||
starting point for the current conversation."""
|
||||
|
||||
trailing_text = """
|
||||
This is the end of the previously saved conversation. Please continue the conversation
|
||||
from here. If the last message is a user instruction or question, act on that instruction
|
||||
or answer the question. If the last message is an assistant response, simple say that you
|
||||
are ready to continue the conversation."""
|
||||
|
||||
return [
|
||||
{
|
||||
"role": "user",
|
||||
"type": "message",
|
||||
"content": [
|
||||
{
|
||||
"type": "input_text",
|
||||
"text": "\n\n".join(
|
||||
[intro_text, json.dumps(messages, indent=2), trailing_text]
|
||||
),
|
||||
}
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
def add_user_content_item_as_message(self, item):
|
||||
"""Add a user content item as a standard message to the context.
|
||||
|
||||
Args:
|
||||
item: The conversation item to add as a user message.
|
||||
"""
|
||||
message = {
|
||||
"role": "user",
|
||||
"content": [{"type": "text", "text": item.content[0].transcript}],
|
||||
}
|
||||
self.add_message(message)
|
||||
|
||||
|
||||
class OpenAIRealtimeUserContextAggregator(OpenAIUserContextAggregator):
|
||||
"""User context aggregator for OpenAI Realtime API.
|
||||
|
||||
Handles user input frames and generates appropriate context updates
|
||||
for the realtime conversation, including message updates and tool settings.
|
||||
|
||||
Args:
|
||||
context: The OpenAI realtime LLM context.
|
||||
**kwargs: Additional arguments passed to parent aggregator.
|
||||
"""
|
||||
|
||||
async def process_frame(
|
||||
self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM
|
||||
):
|
||||
"""Process incoming frames and handle realtime-specific frame types.
|
||||
|
||||
Args:
|
||||
frame: The frame to process.
|
||||
direction: The direction of frame flow in the pipeline.
|
||||
"""
|
||||
await super().process_frame(frame, direction)
|
||||
# Parent does not push LLMMessagesUpdateFrame. This ensures that in a typical pipeline,
|
||||
# messages are only processed by the user context aggregator, which is generally what we want. But
|
||||
# we also need to send new messages over the websocket, so the openai realtime API has them
|
||||
# in its context.
|
||||
if isinstance(frame, LLMMessagesUpdateFrame):
|
||||
await self.push_frame(RealtimeMessagesUpdateFrame(context=self._context))
|
||||
|
||||
# Parent also doesn't push the LLMSetToolsFrame.
|
||||
if isinstance(frame, LLMSetToolsFrame):
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
async def push_aggregation(self):
|
||||
"""Push user input aggregation.
|
||||
|
||||
Currently ignores all user input coming into the pipeline as realtime
|
||||
audio input is handled directly by the service.
|
||||
"""
|
||||
# for the moment, ignore all user input coming into the pipeline.
|
||||
# todo: think about whether/how to fix this to allow for text input from
|
||||
# upstream (transport/transcription, or other sources)
|
||||
pass
|
||||
|
||||
|
||||
class OpenAIRealtimeAssistantContextAggregator(OpenAIAssistantContextAggregator):
|
||||
"""Assistant context aggregator for OpenAI Realtime API.
|
||||
|
||||
Handles assistant output frames from the realtime service, filtering
|
||||
out duplicate text frames and managing function call results.
|
||||
|
||||
Args:
|
||||
context: The OpenAI realtime LLM context.
|
||||
**kwargs: Additional arguments passed to parent aggregator.
|
||||
"""
|
||||
|
||||
# The LLMAssistantContextAggregator uses TextFrames to aggregate the LLM output,
|
||||
# but the OpenAIRealtimeLLMService pushes LLMTextFrames and TTSTextFrames. We
|
||||
# need to override this proces_frame for LLMTextFrame, so that only the TTSTextFrames
|
||||
# are process. This ensures that the context gets only one set of messages.
|
||||
# OpenAIRealtimeLLMService also pushes TranscriptionFrames and InterimTranscriptionFrames,
|
||||
# so we need to ignore pushing those as well, as they're also TextFrames.
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process assistant frames, filtering out duplicate text content.
|
||||
|
||||
Args:
|
||||
frame: The frame to process.
|
||||
direction: The direction of frame flow in the pipeline.
|
||||
"""
|
||||
if not isinstance(frame, (LLMTextFrame, TranscriptionFrame, InterimTranscriptionFrame)):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
async def handle_function_call_result(self, frame: FunctionCallResultFrame):
|
||||
"""Handle function call result and notify the realtime service.
|
||||
|
||||
Args:
|
||||
frame: The function call result frame to handle.
|
||||
"""
|
||||
await super().handle_function_call_result(frame)
|
||||
|
||||
# The standard function callback code path pushes the FunctionCallResultFrame from the llm itself,
|
||||
# so we didn't have a chance to add the result to the openai realtime api context. Let's push a
|
||||
# special frame to do that.
|
||||
await self.push_frame(
|
||||
RealtimeFunctionCallResultFrame(result_frame=frame), FrameDirection.UPSTREAM
|
||||
)
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"Types in pipecat.services.openai_realtime.context are deprecated. "
|
||||
"Please use the equivalent types from "
|
||||
"pipecat.services.openai.realtime.context instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,37 +1,21 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
# Copyright (c) 2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Custom frame types for OpenAI Realtime API integration."""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING
|
||||
import warnings
|
||||
|
||||
from pipecat.frames.frames import DataFrame, FunctionCallResultFrame
|
||||
from pipecat.services.openai.realtime.frames import *
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pipecat.services.openai_realtime_beta.context import OpenAIRealtimeLLMContext
|
||||
|
||||
|
||||
@dataclass
|
||||
class RealtimeMessagesUpdateFrame(DataFrame):
|
||||
"""Frame indicating that the realtime context messages have been updated.
|
||||
|
||||
Parameters:
|
||||
context: The updated OpenAI realtime LLM context.
|
||||
"""
|
||||
|
||||
context: "OpenAIRealtimeLLMContext"
|
||||
|
||||
|
||||
@dataclass
|
||||
class RealtimeFunctionCallResultFrame(DataFrame):
|
||||
"""Frame containing function call results for the realtime service.
|
||||
|
||||
Parameters:
|
||||
result_frame: The function call result frame to send to the realtime API.
|
||||
"""
|
||||
|
||||
result_frame: FunctionCallResultFrame
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"Types in pipecat.services.openai_realtime.frames are deprecated. "
|
||||
"Please use the equivalent types from "
|
||||
"pipecat.services.openai.realtime.frames instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
@@ -14,7 +14,6 @@ from loguru import logger
|
||||
from pipecat.frames.frames import (
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
TTSAudioRawFrame,
|
||||
TTSStartedFrame,
|
||||
TTSStoppedFrame,
|
||||
)
|
||||
@@ -99,16 +98,15 @@ class PiperTTSService(TTSService):
|
||||
|
||||
await self.start_tts_usage_metrics(text)
|
||||
|
||||
yield TTSStartedFrame()
|
||||
|
||||
CHUNK_SIZE = self.chunk_size
|
||||
|
||||
yield TTSStartedFrame()
|
||||
async for chunk in response.content.iter_chunked(CHUNK_SIZE):
|
||||
# remove wav header if present
|
||||
if chunk.startswith(b"RIFF"):
|
||||
chunk = chunk[44:]
|
||||
if len(chunk) > 0:
|
||||
await self.stop_ttfb_metrics()
|
||||
yield TTSAudioRawFrame(chunk, self.sample_rate, 1)
|
||||
async for frame in self._stream_audio_frames_from_iterator(
|
||||
response.content.iter_chunked(CHUNK_SIZE), strip_wav_header=True
|
||||
):
|
||||
await self.stop_ttfb_metrics()
|
||||
yield frame
|
||||
except Exception as e:
|
||||
logger.error(f"Error in run_tts: {e}")
|
||||
yield ErrorFrame(error=str(e))
|
||||
|
||||
@@ -14,6 +14,7 @@ import io
|
||||
import json
|
||||
import struct
|
||||
import uuid
|
||||
import warnings
|
||||
from typing import AsyncGenerator, Optional
|
||||
|
||||
import aiohttp
|
||||
@@ -110,6 +111,11 @@ def language_to_playht_language(language: Language) -> Optional[str]:
|
||||
class PlayHTTTSService(InterruptibleTTSService):
|
||||
"""PlayHT WebSocket-based text-to-speech service.
|
||||
|
||||
.. deprecated:: 0.0.88
|
||||
|
||||
This class is deprecated and will be removed in a future version.
|
||||
PlayHT is shutting down their API on December 31st, 2025.
|
||||
|
||||
Provides real-time text-to-speech synthesis using PlayHT's WebSocket API.
|
||||
Supports streaming audio generation with configurable voice engines and
|
||||
language settings.
|
||||
@@ -158,6 +164,15 @@ class PlayHTTTSService(InterruptibleTTSService):
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"PlayHT is shutting down their API on December 31st, 2025. "
|
||||
"'PlayHTTTSService' is deprecated and will be removed in a future version.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
params = params or PlayHTTTSService.InputParams()
|
||||
|
||||
self._api_key = api_key
|
||||
@@ -401,6 +416,11 @@ class PlayHTTTSService(InterruptibleTTSService):
|
||||
class PlayHTHttpTTSService(TTSService):
|
||||
"""PlayHT HTTP-based text-to-speech service.
|
||||
|
||||
.. deprecated:: 0.0.88
|
||||
|
||||
This class is deprecated and will be removed in a future version.
|
||||
PlayHT is shutting down their API on December 31st, 2025.
|
||||
|
||||
Provides text-to-speech synthesis using PlayHT's HTTP API for simpler,
|
||||
non-streaming synthesis. Suitable for use cases where streaming is not
|
||||
required and simpler integration is preferred.
|
||||
@@ -454,8 +474,6 @@ class PlayHTHttpTTSService(TTSService):
|
||||
|
||||
# Warn about deprecated protocol parameter if explicitly provided
|
||||
if protocol:
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
@@ -464,6 +482,15 @@ class PlayHTHttpTTSService(TTSService):
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"PlayHT is shutting down their API on December 31st, 2025. "
|
||||
"'PlayHTHttpTTSService' is deprecated and will be removed in a future version.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
params = params or PlayHTHttpTTSService.InputParams()
|
||||
|
||||
self._user_id = user_id
|
||||
|
||||
@@ -553,15 +553,13 @@ class RimeHttpTTSService(TTSService):
|
||||
|
||||
CHUNK_SIZE = self.chunk_size
|
||||
|
||||
async for chunk in response.content.iter_chunked(CHUNK_SIZE):
|
||||
if need_to_strip_wav_header and chunk.startswith(b"RIFF"):
|
||||
chunk = chunk[44:]
|
||||
need_to_strip_wav_header = False
|
||||
async for frame in self._stream_audio_frames_from_iterator(
|
||||
response.content.iter_chunked(CHUNK_SIZE),
|
||||
strip_wav_header=need_to_strip_wav_header,
|
||||
):
|
||||
await self.stop_ttfb_metrics()
|
||||
yield frame
|
||||
|
||||
if len(chunk) > 0:
|
||||
await self.stop_ttfb_metrics()
|
||||
frame = TTSAudioRawFrame(chunk, self.sample_rate, 1)
|
||||
yield frame
|
||||
except Exception as e:
|
||||
logger.exception(f"Error generating TTS: {e}")
|
||||
yield ErrorFrame(error=f"Rime TTS error: {str(e)}")
|
||||
|
||||
@@ -8,7 +8,17 @@
|
||||
|
||||
import asyncio
|
||||
from abc import abstractmethod
|
||||
from typing import Any, AsyncGenerator, Dict, List, Mapping, Optional, Sequence, Tuple
|
||||
from typing import (
|
||||
Any,
|
||||
AsyncGenerator,
|
||||
AsyncIterator,
|
||||
Dict,
|
||||
List,
|
||||
Mapping,
|
||||
Optional,
|
||||
Sequence,
|
||||
Tuple,
|
||||
)
|
||||
|
||||
from loguru import logger
|
||||
|
||||
@@ -374,6 +384,36 @@ class TTSService(AIService):
|
||||
):
|
||||
await self._stop_frame_queue.put(frame)
|
||||
|
||||
async def _stream_audio_frames_from_iterator(
|
||||
self, iterator: AsyncIterator[bytes], *, strip_wav_header: bool
|
||||
) -> AsyncGenerator[Frame, None]:
|
||||
buffer = bytearray()
|
||||
need_to_strip_wav_header = strip_wav_header
|
||||
async for chunk in iterator:
|
||||
if need_to_strip_wav_header and chunk.startswith(b"RIFF"):
|
||||
chunk = chunk[44:]
|
||||
need_to_strip_wav_header = False
|
||||
|
||||
# Append to current buffer.
|
||||
buffer.extend(chunk)
|
||||
|
||||
# Round to nearest even number.
|
||||
aligned_length = len(buffer) & ~1 # 111111111...11110
|
||||
if aligned_length > 0:
|
||||
aligned_chunk = buffer[:aligned_length]
|
||||
buffer = buffer[aligned_length:] # keep any leftover byte
|
||||
|
||||
if len(aligned_chunk) > 0:
|
||||
frame = TTSAudioRawFrame(bytes(aligned_chunk), self.sample_rate, 1)
|
||||
yield frame
|
||||
|
||||
if len(buffer) > 0:
|
||||
# Make sure we don't need an extra padding byte.
|
||||
if len(buffer) % 2 == 1:
|
||||
buffer.extend(b"\x00")
|
||||
frame = TTSAudioRawFrame(bytes(buffer), self.sample_rate, 1)
|
||||
yield frame
|
||||
|
||||
async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
|
||||
self._processing_text = False
|
||||
await self._text_aggregator.handle_interruption()
|
||||
|
||||
@@ -354,10 +354,10 @@ class LiveKitTransportClient:
|
||||
logger.error(f"Error sending data: {e}")
|
||||
|
||||
async def send_dtmf(self, digit: str):
|
||||
"""Send DTMF tone to the room.
|
||||
r"""Send DTMF tone to the room.
|
||||
|
||||
Args:
|
||||
digit: The DTMF digit to send (0-9, *, #).
|
||||
digit: The DTMF digit to send (0-9, \*, #).
|
||||
"""
|
||||
if not self._connected:
|
||||
return
|
||||
|
||||
@@ -116,6 +116,10 @@ class SmallWebRTCRequestHandler:
|
||||
detail="Cannot create new connection with existing connection active",
|
||||
)
|
||||
|
||||
def update_ice_servers(self, ice_servers: Optional[List[IceServer]] = None):
|
||||
"""Update the list of ICE servers used for WebRTC connections."""
|
||||
self._ice_servers = ice_servers
|
||||
|
||||
async def handle_web_request(
|
||||
self,
|
||||
request: SmallWebRTCRequest,
|
||||
|
||||
@@ -241,6 +241,14 @@ class WhatsAppApi:
|
||||
self._whatsapp_url = f"{self.BASE_URL}{phone_number_id}/calls"
|
||||
self._whatsapp_token = whatsapp_token
|
||||
|
||||
def update_whatsapp_token(self, whatsapp_token: str):
|
||||
"""Update the WhatsApp access token for authentication."""
|
||||
self._whatsapp_token = whatsapp_token
|
||||
|
||||
def update_whatsapp_phone_number_id(self, phone_number_id: str):
|
||||
"""Update the WhatsApp phone number ID for authentication."""
|
||||
self._phone_number_id = phone_number_id
|
||||
|
||||
async def answer_call_to_whatsapp(self, call_id: str, action: str, sdp: str, from_: str):
|
||||
"""Answer an incoming WhatsApp call.
|
||||
|
||||
|
||||
@@ -12,6 +12,8 @@ WhatsApp call events.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import hashlib
|
||||
import hmac
|
||||
from typing import Awaitable, Callable, Dict, List, Optional
|
||||
|
||||
import aiohttp
|
||||
@@ -47,6 +49,7 @@ class WhatsAppClient:
|
||||
phone_number_id: str,
|
||||
session: aiohttp.ClientSession,
|
||||
ice_servers: Optional[List[IceServer]] = None,
|
||||
whatsapp_secret: Optional[str] = None,
|
||||
) -> None:
|
||||
"""Initialize the WhatsApp client.
|
||||
|
||||
@@ -56,10 +59,12 @@ class WhatsAppClient:
|
||||
session: aiohttp session for making HTTP requests
|
||||
ice_servers: List of ICE servers for WebRTC connections. If None,
|
||||
defaults to Google's public STUN server
|
||||
whatsapp_secret: WhatsApp APP secret for validating that the webhook request came from WhatsApp.
|
||||
"""
|
||||
self._whatsapp_api = WhatsAppApi(
|
||||
whatsapp_token=whatsapp_token, phone_number_id=phone_number_id, session=session
|
||||
)
|
||||
self._whatsapp_secret = whatsapp_secret
|
||||
self._ongoing_calls_map: Dict[str, SmallWebRTCConnection] = {}
|
||||
|
||||
# Set default ICE servers if none provided
|
||||
@@ -68,6 +73,22 @@ class WhatsAppClient:
|
||||
else:
|
||||
self._ice_servers = ice_servers
|
||||
|
||||
def update_ice_servers(self, ice_servers: Optional[List[IceServer]] = None):
|
||||
"""Update the list of ICE servers used for WebRTC connections."""
|
||||
self._ice_servers = ice_servers
|
||||
|
||||
def update_whatsapp_secret(self, whatsapp_secret: Optional[str] = None):
|
||||
"""Update the WhatsApp APP secret for validating that the webhook request came from WhatsApp."""
|
||||
self._whatsapp_secret = whatsapp_secret
|
||||
|
||||
def update_whatsapp_token(self, whatsapp_token: str):
|
||||
"""Update the WhatsApp API access token."""
|
||||
self._whatsapp_api.update_whatsapp_token(whatsapp_token)
|
||||
|
||||
def update_whatsapp_phone_number_id(self, phone_number_id: str):
|
||||
"""Update the WhatsApp phone number ID for authentication."""
|
||||
self._whatsapp_api.update_whatsapp_phone_number_id(phone_number_id)
|
||||
|
||||
async def terminate_all_calls(self) -> None:
|
||||
"""Terminate all ongoing WhatsApp calls.
|
||||
|
||||
@@ -133,10 +154,32 @@ class WhatsAppClient:
|
||||
|
||||
return int(challenge)
|
||||
|
||||
async def _validate_whatsapp_webhook_request(self, raw_body: bytes, sha256_signature: str):
|
||||
"""Common handler for both /start and /connect endpoints."""
|
||||
# Compute HMAC SHA256 using your App Secret
|
||||
expected_signature = hmac.new(
|
||||
key=self._whatsapp_secret.encode("utf-8"),
|
||||
msg=raw_body,
|
||||
digestmod=hashlib.sha256,
|
||||
).hexdigest()
|
||||
|
||||
# Extract signature from header (strip 'sha256=' prefix)
|
||||
if not sha256_signature:
|
||||
raise Exception("Missing X-Hub-Signature-256 header")
|
||||
received_signature = sha256_signature.split("sha256=")[-1]
|
||||
|
||||
# Compare signatures securely
|
||||
if not hmac.compare_digest(expected_signature, received_signature):
|
||||
raise Exception("Invalid webhook signature")
|
||||
|
||||
logger.debug(f"Webhook signature verified!")
|
||||
|
||||
async def handle_webhook_request(
|
||||
self,
|
||||
request: WhatsAppWebhookRequest,
|
||||
connection_callback: Optional[Callable[[SmallWebRTCConnection], Awaitable[None]]] = None,
|
||||
raw_body: Optional[bytes] = None,
|
||||
sha256_signature: Optional[str] = None,
|
||||
) -> bool:
|
||||
"""Handle a webhook request from WhatsApp.
|
||||
|
||||
@@ -150,6 +193,8 @@ class WhatsAppClient:
|
||||
connection_callback: Optional callback function to invoke when a new
|
||||
WebRTC connection is established. The callback
|
||||
receives the SmallWebRTCConnection instance.
|
||||
raw_body: Optional bytes containing the raw request body.
|
||||
sha256_signature: Optional X-Hub-Signature-256 header value from the request.
|
||||
|
||||
Returns:
|
||||
bool: True if the webhook request was handled successfully, False otherwise
|
||||
@@ -159,6 +204,8 @@ class WhatsAppClient:
|
||||
Exception: If connection establishment or API calls fail
|
||||
"""
|
||||
try:
|
||||
if self._whatsapp_secret:
|
||||
await self._validate_whatsapp_webhook_request(raw_body, sha256_signature)
|
||||
for entry in request.entry:
|
||||
for change in entry.changes:
|
||||
# Handle connect events
|
||||
|
||||
@@ -29,7 +29,7 @@ class EventHandler:
|
||||
This data class stores the event name, a list of handlers to run for this
|
||||
event, and whether these handlers will be executed in a task.
|
||||
|
||||
Attributes:
|
||||
Parameters:
|
||||
name (str): The name of the event handler.
|
||||
handlers (List[Any]): A list of functions to be called when this event is triggered.
|
||||
is_sync (bool): Indicates whether the functions are executed in a task.
|
||||
|
||||
@@ -21,13 +21,24 @@ import re
|
||||
from typing import FrozenSet, Optional, Sequence, Tuple
|
||||
|
||||
import nltk
|
||||
from loguru import logger
|
||||
from nltk.tokenize import sent_tokenize
|
||||
|
||||
# Ensure punkt_tab tokenizer data is available
|
||||
try:
|
||||
nltk.data.find("tokenizers/punkt_tab")
|
||||
except LookupError:
|
||||
nltk.download("punkt_tab", quiet=True)
|
||||
try:
|
||||
nltk.download("punkt_tab", quiet=True)
|
||||
except (OSError, PermissionError) as e:
|
||||
logger.error(
|
||||
f"Failed to download NLTK 'punkt_tab' tokenizer data: {e}. "
|
||||
"This data is required for sentence tokenization features. "
|
||||
"The download failed due to filesystem permissions. "
|
||||
"To resolve: pre-install the data in a location with appropriate read permissions, "
|
||||
"or set the NLTK_DATA environment variable to point to a writable directory. "
|
||||
"See https://www.nltk.org/data.html for more information."
|
||||
)
|
||||
|
||||
SENTENCE_ENDING_PUNCTUATION: FrozenSet[str] = frozenset(
|
||||
{
|
||||
|
||||
@@ -651,9 +651,9 @@ def traced_gemini_live(operation: str) -> Callable:
|
||||
|
||||
elif operation == "llm_tool_call" and args:
|
||||
# Extract tool call information
|
||||
evt = args[0] if args else None
|
||||
if evt and hasattr(evt, "toolCall") and evt.toolCall.functionCalls:
|
||||
function_calls = evt.toolCall.functionCalls
|
||||
msg = args[0] if args else None
|
||||
if msg and hasattr(msg, "tool_call") and msg.tool_call.function_calls:
|
||||
function_calls = msg.tool_call.function_calls
|
||||
if function_calls:
|
||||
# Add information about the first function call
|
||||
call = function_calls[0]
|
||||
@@ -722,19 +722,19 @@ def traced_gemini_live(operation: str) -> Callable:
|
||||
|
||||
elif operation == "llm_response" and args:
|
||||
# Extract usage and response metadata from turn complete event
|
||||
evt = args[0] if args else None
|
||||
if evt and hasattr(evt, "usageMetadata") and evt.usageMetadata:
|
||||
usage = evt.usageMetadata
|
||||
msg = args[0] if args else None
|
||||
if msg and hasattr(msg, "usage_metadata") and msg.usage_metadata:
|
||||
usage = msg.usage_metadata
|
||||
|
||||
# Token usage - basic attributes for span visibility
|
||||
if hasattr(usage, "promptTokenCount"):
|
||||
operation_attrs["tokens.prompt"] = usage.promptTokenCount or 0
|
||||
if hasattr(usage, "responseTokenCount"):
|
||||
if hasattr(usage, "prompt_token_count"):
|
||||
operation_attrs["tokens.prompt"] = usage.prompt_token_count or 0
|
||||
if hasattr(usage, "response_token_count"):
|
||||
operation_attrs["tokens.completion"] = (
|
||||
usage.responseTokenCount or 0
|
||||
usage.response_token_count or 0
|
||||
)
|
||||
if hasattr(usage, "totalTokenCount"):
|
||||
operation_attrs["tokens.total"] = usage.totalTokenCount or 0
|
||||
if hasattr(usage, "total_token_count"):
|
||||
operation_attrs["tokens.total"] = usage.total_token_count or 0
|
||||
|
||||
# Get output text and modality from service state
|
||||
text = getattr(self, "_bot_text_buffer", "")
|
||||
@@ -751,9 +751,9 @@ def traced_gemini_live(operation: str) -> Callable:
|
||||
|
||||
# Add turn completion status
|
||||
if (
|
||||
evt
|
||||
and hasattr(evt, "serverContent")
|
||||
and evt.serverContent.turnComplete
|
||||
msg
|
||||
and hasattr(msg, "server_content")
|
||||
and msg.server_content.turn_complete
|
||||
):
|
||||
operation_attrs["turn_complete"] = True
|
||||
|
||||
@@ -772,16 +772,16 @@ def traced_gemini_live(operation: str) -> Callable:
|
||||
|
||||
# For llm_response operation, also handle token usage metrics
|
||||
if operation == "llm_response" and hasattr(self, "start_llm_usage_metrics"):
|
||||
evt = args[0] if args else None
|
||||
if evt and hasattr(evt, "usageMetadata") and evt.usageMetadata:
|
||||
usage = evt.usageMetadata
|
||||
msg = args[0] if args else None
|
||||
if msg and hasattr(msg, "usage_metadata") and msg.usage_metadata:
|
||||
usage = msg.usage_metadata
|
||||
# Create LLMTokenUsage object
|
||||
from pipecat.metrics.metrics import LLMTokenUsage
|
||||
|
||||
tokens = LLMTokenUsage(
|
||||
prompt_tokens=usage.promptTokenCount or 0,
|
||||
completion_tokens=usage.responseTokenCount or 0,
|
||||
total_tokens=usage.totalTokenCount or 0,
|
||||
prompt_tokens=usage.prompt_token_count or 0,
|
||||
completion_tokens=usage.response_token_count or 0,
|
||||
total_tokens=usage.total_token_count or 0,
|
||||
)
|
||||
_add_token_usage_to_span(current_span, tokens)
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@ import unittest
|
||||
from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
EndFrame,
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
HeartbeatFrame,
|
||||
InputAudioRawFrame,
|
||||
@@ -450,3 +451,34 @@ class TestPipelineTask(unittest.IsolatedAsyncioTestCase):
|
||||
await task.run(PipelineTaskParams(loop=asyncio.get_event_loop()))
|
||||
except asyncio.CancelledError:
|
||||
assert cancelled
|
||||
|
||||
async def test_task_error(self):
|
||||
class ErrorProcessor(FrameProcessor):
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, TextFrame):
|
||||
await self.push_error(ErrorFrame("Boo!"))
|
||||
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
error_received = False
|
||||
|
||||
pipeline = Pipeline([ErrorProcessor()])
|
||||
task = PipelineTask(pipeline)
|
||||
|
||||
@task.event_handler("on_pipeline_error")
|
||||
async def on_pipeline_error(task: PipelineTask, frame: ErrorFrame):
|
||||
nonlocal error_received
|
||||
error_received = True
|
||||
await task.cancel()
|
||||
|
||||
await task.queue_frame(TextFrame(text="Hello from Pipecat!"))
|
||||
|
||||
try:
|
||||
await task.run(PipelineTaskParams(loop=asyncio.get_event_loop()))
|
||||
except asyncio.CancelledError:
|
||||
assert error_received
|
||||
|
||||
Reference in New Issue
Block a user