Compare commits

..

4 Commits

Author SHA1 Message Date
James Hush
a39f8b4882 Remove extra code 2025-10-01 14:48:12 +08:00
James Hush
76fc36f621 Pre recorded message example 2025-10-01 14:43:36 +08:00
James Hush
c0878c5e09 Save progress 2025-10-01 13:58:32 +08:00
James Hush
c6a1013051 Pre-recorded message example 2025-10-01 13:52:58 +08:00
146 changed files with 6878 additions and 11305 deletions

View File

@@ -21,21 +21,20 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0 # Fetch all history for setuptools_scm
- name: Install uv
uses: astral-sh/setup-uv@v3
with:
version: "latest"
- name: Set up Python
run: uv python install 3.10
- name: Install development dependencies
run: uv sync --group dev
- name: Build project
run: uv build
- name: Install project in editable mode
run: uv pip install --editable .
run: uv pip install --editable .

View File

@@ -5,25 +5,25 @@ on:
inputs:
gitref:
type: string
description: 'what git tag to build (e.g. v0.0.74)'
description: "what git tag to build (e.g. v0.0.74)"
required: true
jobs:
build:
name: 'Build and upload wheels'
name: "Build and upload wheels"
runs-on: ubuntu-latest
steps:
- name: Checkout repo
uses: actions/checkout@v4
with:
ref: ${{ github.event.inputs.gitref }}
- name: Install uv
uses: astral-sh/setup-uv@v3
with:
version: 'latest'
version: "latest"
- name: Set up Python
run: uv python install 3.12
run: uv python install 3.10
- name: Install development dependencies
run: uv sync --group dev
- name: Build project
@@ -35,9 +35,9 @@ jobs:
path: ./dist
publish-to-pypi:
name: 'Publish to PyPI'
name: "Publish to PyPI"
runs-on: ubuntu-latest
needs: [build]
needs: [ build ]
environment:
name: pypi
url: https://pypi.org/p/pipecat-ai
@@ -56,12 +56,12 @@ jobs:
print-hash: true
publish-to-test-pypi:
name: 'Publish to Test PyPI'
name: "Publish to Test PyPI"
runs-on: ubuntu-latest
needs: [build]
needs: [ build ]
environment:
name: testpypi
url: https://test.pypi.org/p/pipecat-ai
url: https://pypi.org/p/pipecat-ai
permissions:
id-token: write
steps:
@@ -70,7 +70,7 @@ jobs:
with:
name: wheels
path: ./dist
- name: Publish to Test PyPI
- name: Publish to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
verbose: true

View File

@@ -4,7 +4,7 @@ on: workflow_dispatch
jobs:
build:
name: 'Build and upload wheels'
name: "Build and upload wheels"
runs-on: ubuntu-latest
steps:
- name: Checkout repo
@@ -15,9 +15,9 @@ jobs:
- name: Install uv
uses: astral-sh/setup-uv@v3
with:
version: 'latest'
version: "latest"
- name: Set up Python
run: uv python install 3.12
run: uv python install 3.10
- name: Install development dependencies
run: uv sync --group dev
- name: Build project
@@ -29,12 +29,12 @@ jobs:
path: ./dist
publish-to-test-pypi:
name: 'Publish to Test PyPI'
name: "Publish to Test PyPI"
runs-on: ubuntu-latest
needs: [build]
environment:
name: testpypi
url: https://test.pypi.org/p/pipecat-ai
url: https://pypi.org/p/pipecat-ai
permissions:
id-token: write
steps:
@@ -43,7 +43,7 @@ jobs:
with:
name: wheels
path: ./dist
- name: Publish to Test PyPI
- name: Publish to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
verbose: true

View File

@@ -9,265 +9,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- Added the [Pipecat CLI](https://github.com/pipecat-ai/pipecat-cli) to the
required dependencies, enabling you to scaffold a new project directly from
`pipecat-ai`. Get started with:
```bash
uv run pipecat init
```
- Expanded support for universal `LLMContext` to `AWSNovaSonicLLMService`.
As a reminder, the context-setup pattern when using `LLMContext` is:
```python
context = LLMContext(messages, tools)
context_aggregator = LLMContextAggregatorPair(context)
```
(Note that even though `AWSNovaSonicLLMService` now supports the universal
`LLMContext`, it is not meant to be swapped out for another LLM service at
runtime.)
Worth noting: whether or not you use the new context-setup pattern with
`AWSNovaSonicLLMService`, some types have changed under the hood:
```python
## BEFORE:
# Context aggregator type
context_aggregator: AWSNovaSonicContextAggregatorPair
# Context frame type
frame: OpenAILLMContextFrame
# Context type
context: AWSNovaSonicLLMContext
# or
context: OpenAILLMContext
# Reading messages from context
messages = context.messages
## AFTER:
# Context aggregator type
context_aggregator: LLMContextAggregatorPair
# Context frame type
frame: LLMContextFrame
# Context type
context: LLMContext
# Reading messages from context
messages = context.get_messages()
```
- Added support for `bulbul:v3` model in `SarvamTTSService` and
`SarvamHttpTTSService`.
- Added `keyterms_prompt` parameter to `AssemblyAIConnectionParams`.
- Added `speech_model` parameter to `AssemblyAIConnectionParams` to access the
multilingual model.
- Added support for trickle ICE to the `SmallWebRTCTransport`.
- Added support for updating `OpenAITTSService` settings (`instructions` and
`speed`) at runtime via `TTSUpdateSettingsFrame`.
- Added `--whatsapp` flag to runner to better surface WhatsApp transport logs.
- Added `on_connected` and `on_disconnected` events to TTS and STT
websocket-based services.
- Added an `aggregate_sentences` arg in `ElevenLabsHttpTTSService`, where the
default value is True.
- Added a `room_properties` arg to the Daily runner's `configure()` method,
allowing `DailyRoomProperties` to be provided.
- The runner `--folder` argument now supports downloading files from
subdirectories.
### Changed
- `CartesiaSTTService` now inherits from `WebsocketSTTService`.
- Package upgrades:
- `daily-python` upgraded to 0.20.0.
- `openai` upgraded to support up to 2.x.x.
- `openpipe` upgraded to support up to 5.x.x.
- `SpeechmaticsSTTService` updated dependencies for `speechmatics-rt>=0.5.0`.
### Deprecated
- The `send_transcription_frames` argument to `AWSNovaSonicLLMService` is
deprecated. Transcription frames are now always sent. They go upstream, to be
handled by the user context aggregator. See "Added" section for details.
- Types in `pipecat.services.aws.nova_sonic.context` have been deprecated due
to changes to support `LLMContext`. See "Changed" section for details.
### Fixed
- Fixed an issue in `RivaSegmentedSTTService` where a runtime error occurred due
to a mismatch in the \_handle_transcription method's signature.
- Fixed multiple pipeline task cancellation issues. `asyncio.CancelledError` is
now handled properly in `PipelineTask` making it possible to cancel an asyncio
task that it's executing a `PipelineRunner` cleanly. Also,
`PipelineTask.cancel()` does not block anymore waiting for the `CancelFrame`
to reach the end of the pipeline (going back to the behavior in < 0.0.83).
- Fixed an issue in `ElevenLabsTTSService` and `ElevenLabsHttpTTSService` where
the Flash models would split words, resulting in a space being inserted
between words.
- Fixed an issue where audio filters' `stop()` would not be called when using
`CancelFrame`.
- Fixed an issue in `ElevenLabsHttpTTSService`, where
`apply_text_normalization` was incorrectly set as a query parameter. It's now
being added as a request parameter.
- Fixed an issue where `RimeHttpTTSService` and `PiperTTSService` could generate
incorrectly 16-bit aligned audio frames, potentially leading to internal
errors or static audio.
- Fixed an issue in `SpeechmaticsSTTService` where `AdditionalVocabEntry` items
needed to have `sounds_like` for the session to start.
### Other
- Added foundational example `47-sentry-metrics.py`, demonstrating how to use the
`SentryMetrics` processor.
- Added foundational example `14x-function-calling-openpipe.py`.
## [0.0.90] - 2025-10-10
### Added
- Added audio filter `KrispVivaFilter` using the Krisp VIVA SDK.
- Added `--folder` argument to the runner, allowing files saved in that folder
to be downloaded from `http://HOST:PORT/file/FILE`.
- Added `GeminiLiveVertexLLMService`, for accessing Gemini Live via Google
Vertex AI.
- Added some new configuration options to `GeminiLiveLLMService`:
- `thinking`
- `enable_affective_dialog`
- `proactivity`
Note that these new configuration options require using a newer model than
the default, like "gemini-2.5-flash-native-audio-preview-09-2025". The last
two require specifying `http_options=HttpOptions(api_version="v1alpha")`.
- Added `on_pipeline_error` event to `PipelineTask`. This event will get fired
when an `ErrorFrame` is pushed (use `FrameProcessor.push_error()`).
```python
@task.event_handler("on_pipeline_error")
async def on_pipeline_error(task: PipelineTask, frame: ErrorFrame):
...
```
- Added a `service_tier` `InputParam` to the `BaseOpenAILLMService`. This
parameter can influence the latency of the response. For example `"priority"`
will result in faster completions, but in exchange for a higher price.
### Changed
- Updated `GeminiLiveLLMService` to use the `google-genai` library rather than
use WebSockets directly.
### Deprecated
- `LivekitFrameSerializer` is now deprecated. Use `LiveKitTransport` instead.
- `pipecat.service.openai_realtime` is now deprecated, use
`pipecat.services.openai.realtime` instead or
`pipecat.services.azure.realtime` for Azure Realtime.
- `pipecat.service.aws_nova_sonic` is now deprecated, use
`pipecat.services.aws.nova_sonic` instead.
- `GeminiMultimodalLiveLLMService` is now deprecated, use
`GeminiLiveLLMService`.
### Fixed
- Fixed a `GoogleVertexLLMService` issue that would generate an error if no
token information was returned.
- `GeminiLiveLLMService` will now end gracefully (i.e. after the bot has
finished) upon receiving an `EndFrame`.
- `GeminiLiveLLMService` will try to seamlessly reconnect when it loses its
connection.
## [0.0.89] - 2025-10-07
### Fixed
- Reverted a change introduced in 0.0.88 that was causing pipelines to be frozen
when using interruption strategies and processors that block interruption
frames (e.g. `STTMuteFilter`).
## [0.0.88] - 2025-10-07
### Added
- Added support for Nano Banana models to `GoogleLLMService`. For example, you
can now use the `gemini-2.5-flash-image` model to generate images.
- Added `HumeTTSService` for text-to-speech synthesis using Hume AI's expressive
voice models. Provides high-quality, emotionally expressive speech synthesis
with support for various voice models. Includes example in
`examples/foundational/07ad-interruptible-hume.py`. Use with:
`uv pip install pipecat-ai[hume]`.
### Changed
- Updated default `GoogleLLMService` model to `gemini-2.5-flash`.
### Deprecated
- PlayHT is shutting down their API on December 31st, 2025. As a result,
`PlayHTTTSService` and `PlayHTHttpTTSService` are deprecated and will be
removed in a future version.
### Fixed
- Fixed an issue with `AWSNovaSonicLLMService` where the client wouldn't
connect due to a breaking change in the AWS dependency chain.
- `PermissionError` is now caught if NLTK's `punkt_tab` can't be downloaded.
- Fixed an issue that would cause wrong user/assistant context ordering when
using interruption strategies.
- Fixed RTVI incoming message handling, broken in 0.0.87.
## [0.0.87] - 2025-10-02
### Added
- Added `WebsocketSTTService` base class for websocket-based STT services.
Combines STT functionality with websocket connectivity, providing automatic
error handling and reconnection capabilities with exponential backoff.
- Added `DeepgramFluxSTTService` for real-time speech recognition using
Deepgram's Flux WebSocket API. Flux understands conversational flow and
automatically handles turn-taking.
- Added RTVI messages for user/bot audio levels and system logs.
- Include OpenAI-based LLM services cached tokens to `MetricsFrame`.
@@ -279,21 +20,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Deprecated
- `DailyTransportMessageFrame` and `DailyTransportMessageUrgentFrame` are
deprecated, use `DailyOutputTransportMessageFrame` and
`DailyOutputTransportMessageUrgentFrame` respectively instead.
- `LiveKitTransportMessageFrame` and `LiveKitTransportMessageUrgentFrame` are
deprecated, use `LiveKitOutputTransportMessageFrame` and
`LiveKitOutputTransportMessageUrgentFrame` respectively instead.
- `TransportMessageFrame` and `TransportMessageUrgentFrame` are deprecated, use
`OutputTransportMessageFrame` and `OutputTransportMessageUrgentFrame`
respectively instead.
- `InputTransportMessageUrgentFrame` is deprecated, use
`InputTransportMessageFrame` instead.
- `DailyUpdateRemoteParticipantsFrame` is deprecated and will be removed in a
future version. Instead, create your own custom frame and handle it in the
`@transport.output().event_handler("on_after_push_frame")` event handler or a
@@ -301,9 +27,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## Fixed
- Fixed an issue in `AWSBedrockLLMService` where timeout exceptions weren't
being detected.
- Fixed a `PipelineTask` issue that could prevent the application to exit if
`task.cancel()` was called when the task was already finished.
@@ -1630,7 +1353,7 @@ quality and critical bugs impacting `ParallelPipelines` functionality.**
- Added `session_token` parameter to `AWSNovaSonicLLMService`.
- Added Gemini Multimodal Live File API for uploading, fetching, listing, and
deleting files. See `26f-gemini-live-files-api.py` for example usage.
deleting files. See `26f-gemini-multimodal-live-files-api.py` for example usage.
### Changed
@@ -3636,7 +3359,7 @@ stt = DeepgramSTTService(..., live_options=LiveOptions(model="nova-2-general"))
- Added the new modalities option and helper function to set Gemini output
modalities.
- Added `examples/foundational/26d-gemini-live-text.py` which is
- Added `examples/foundational/26d-gemini-multimodal-live-text.py` which is
using Gemini as TEXT modality and using another TTS provider for TTS process.
### Changed
@@ -3823,9 +3546,9 @@ stt = DeepgramSTTService(..., live_options=LiveOptions(model="nova-2-general"))
- Added new foundational examples for `GeminiMultimodalLiveLLMService`:
- `26-gemini-multimodal-live.py`
- `26a-gemini-live-transcription.py`
- `26b-gemini-live-video.py`
- `26c-gemini-live-video.py`
- `26a-gemini-multimodal-live-transcription.py`
- `26b-gemini-multimodal-live-video.py`
- `26c-gemini-multimodal-live-video.py`
- Added `SimliVideoService`. This is an integration for Simli AI avatars.
(see https://www.simli.com)

View File

@@ -1,336 +0,0 @@
# Community Integrations Guide
Pipecat welcomes community-maintained integrations! As our ecosystem grows, we've established a process for any developer to create and maintain their own service integrations while ensuring discoverability for the Pipecat community.
## Overview
**What we support:** Community-maintained integrations that live in separate repositories and are maintained by their authors.
**What we don't do:** The Pipecat team does not code review, test, or maintain community integrations. We provide guidance and list approved integrations for discoverability.
**Why this approach:** This allows the community to move quickly while keeping the Pipecat core team focused on maintaining the framework itself.
## Submitting your Integration
To be listed as an official community integration, follow these steps:
### Step 1: Build Your Integration
Create your integration following the patterns and examples shown in the "Integration Patterns and Examples" section below.
### Step 2: Set Up Your Repository
Your repository must contain these components:
- **Source code** - Complete implementation following Pipecat patterns
- **Foundational example** - Single file example showing basic usage (see [Pipecat examples](https://github.com/pipecat-ai/pipecat/tree/main/examples/foundational))
- **README.md** - Must include:
- Introduction and explanation of your integration
- Installation instructions
- Usage instructions with Pipecat Pipeline
- How to run your example
- Pipecat version compatibility (e.g., "Tested with Pipecat v0.0.86")
- Company attribution: If you work for the company providing the service, please mention this in your README. This helps build confidence that the integration will be actively maintained.
- **LICENSE** - Permissive license (BSD-2 like Pipecat, or equivalent open source terms)
- **Code documentation** - Source code with docstrings (we recommend following [Pipecat's docstring conventions](https://github.com/pipecat-ai/pipecat/blob/main/CONTRIBUTING.md#docstring-conventions))
- **Changelog** - Maintain a changelog for version updates
### Step 3: Join Discord
Join our Discord: https://discord.gg/pipecat
### Step 4: Submit for Listing
Submit a pull request to add your integration to our [Community Integrations documentation page](https://docs.pipecat.ai/server/services/community-integrations).
**To submit:**
1. Fork the [Pipecat docs repository](https://github.com/pipecat-ai/docs)
2. Edit the file `server/services/community-integrations.mdx`
3. Add your integration to the appropriate service category table with:
- Service name
- Link to your repository
- Maintainer GitHub username(s)
4. Include a link to your demo video (approx 30-60 seconds) in your PR description showing:
- Core functionality of your integration
- Handling of an interruption (if applicable to service type)
5. Submit your pull request
Once your PR is submitted, post in the `#community-integrations` Discord channel to let us know.
## Integration Patterns and Examples
### STT (Speech-to-Text) Services
#### Websocket-based Services
**Base class:** `STTService`
**Examples:**
- [DeepgramSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/deepgram/stt.py)
- [SpeechmaticsSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/speechmatics/stt.py)
#### File-based Services
**Base class:** `SegmentedSTTService`
**Examples:**
- [RivaSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/riva/stt.py)
- [FalSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/fal/stt.py)
#### Key requirements:
- STT services should push `InterimTranscriptionFrames` and `TranscriptionFrames`
- If confidence values are available, filter for values >50% confidence
### LLM (Large Language Model) Services
#### OpenAI-Compatible Services
**Base class:** `OpenAILLMService`
**Examples:**
- [AzureLLMService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/azure/llm.py)
- [GrokLLMService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/grok/llm.py) - Shows overriding the base class where needed
#### Non-OpenAI Compatible Services
**Requires:** Full implementation
**Examples:**
- [AnthropicLLMService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/anthropic/llm.py)
- [GoogleLLMService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/google/llm.py)
#### Key requirements:
- **Frame sequence:** Output must follow this frame sequence pattern:
- `LLMFullResponseStartFrame` - Signals the start of an LLM response
- `LLMTextFrame` - Contains LLM content, typically streamed as tokens
- `LLMFullResponseEndFrame` - Signals the end of an LLM response
- **Context aggregation:** Implement context aggregation to collect user and assistant content:
- Aggregators come in pairs with a `user()` instance and `assistant()` instance
- Context must adhere to the `LLMContext` universal format
- Aggregators should handle adding messages, function calls, and images to the context
### TTS (Text-to-Speech) Services
#### AudioContextWordTTSService
**Use for:** Websocket-based services supporting word/timestamp alignment
**Example:**
- [CartesiaTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/cartesia/tts.py)
#### InterruptibleTTSService
**Use for:** Websocket-based services without word/timestamp alignment, requiring disconnection on interruption
**Example:**
- [SarvamTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/sarvam/tts.py)
#### WordTTSService
**Use for:** HTTP-based services supporting word/timestamp alignment
**Example:**
- [ElevenLabsHttpTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/elevenlabs/tts.py)
#### TTSService
**Use for:** HTTP-based services without word/timestamp alignment
**Example:**
- [GoogleHttpTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/google/tts.py)
#### Key requirements:
- For websocket services, use asyncio WebSocket implementation (required for v13+ support)
- Handle idle service timeouts with keepalives
- TTSServices push both audio (`TTSRawAudioFrame`) and text (`TTSTextFrame`) frames
### Telephony Serializers
Pipecat supports telephony provider integration using websocket connections to exchange MediaStreams. These services use a FrameSerializer to serialize and deserialize inputs from the FastAPIWebsocketTransport.
**Examples:**
- [Twilio](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/serializers/twilio.py)
- [Telnyx](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/serializers/telnyx.py)
#### Key requirements:
- Include hang-up functionality using the provider's native API, ideally using `aiohttp`
- Support DTMF (dual-tone multi-frequency) events if the provider supports them:
- Deserialize DTMF events from the provider's protocol to `InputDTMFFrame`
- Use `KeypadEntry` enum for valid keypad entries (0-9, \*, #, A-D)
- Handle invalid DTMF digits gracefully by returning `None`
### Image Generation Services
**Base class:** `ImageGenService`
**Examples:**
- [FalImageGenService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/fal/image.py)
- [GoogleImageGenService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/google/image.py)
#### Key requirements:
- Must implement `run_image_gen` method returning an `AsyncGenerator`
### Vision Services
Vision services process images and provide analysis such as descriptions, object detection, or visual question answering.
**Base class:** `VisionService`
**Example:**
- [MoondreamVisionService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/moondream/vision.py)
#### Key requirements:
- Must implement `run_vision` method that takes an `LLMContext` and returns an `AsyncGenerator[Frame, None]`
- The method processes the latest image in the context and yields frames with analysis results
- Typically yields `TextFrame` objects containing descriptions or answers
## Implementation Guidelines
### Naming Conventions
- **STT:** `VendorSTTService`
- **LLM:** `VendorLLMService`
- **TTS:**
- Websocket: `VendorTTSService`
- HTTP: `VendorHttpTTSService`
- **Image:** `VendorImageGenService`
- **Vision:** `VendorVisionService`
- **Telephony:** `VendorFrameSerializer`
### Metrics Support
Enable metrics in your service:
```python
def can_generate_metrics(self) -> bool:
"""Check if this service can generate processing metrics.
Returns:
True, as this service supports metrics.
"""
return True
```
### Dynamic Settings Updates
STT, LLM, and TTS services support `ServiceUpdateSettingsFrame` for dynamic configuration changes. The base STTService has an `_update_settings()` method that handles settings, and the private `_settings` `Dict` is used to store settings and provide access to the subclass.
```python
async def set_language(self, language: Language):
"""Set the recognition language and reconnect.
Args:
language: The language to use for speech recognition.
"""
logger.info(f"Switching STT language to: [{language}]")
self._settings["language"] = language
await self._disconnect()
await self._connect()
```
Note that, in this example, Deepgram requires the websocket connection be disconnected and reconnected to reinitialize the service with the new value. Consider if your service requires reconnection.
### Sample Rate Handling
Sample rates are set via PipelineParams and passed to each frame processor at initialization. The pattern is to _not_ set the sample rate value in the constructor of a given service. Instead, use the `start()` method to initialize sample rates from the frame:
```python
async def start(self, frame: StartFrame):
"""Start the service."""
await super().start(frame)
self._settings["output_format"]["sample_rate"] = self.sample_rate
await self._connect()
```
Note that `self.sample_rate` is a `@property` set in the TTSService base class, which provides access to the private sample rate value obtained from the StartFrame.
### Tracing Decorators
Use Pipecat's tracing decorators:
- **STT:** `@traced_stt` - decorate a function that handles `transcript`, `is_final`, `language` as args
- **LLM:** `@traced_llm` - decorate the `_process_context()` method
- **TTS:** `@traced_tts` - decorate the `run_tts()` method
## Best Practices
### Packaging and Distribution
- Use [uv](https://docs.astral.sh/uv/) for packaging (encouraged)
- Consider releasing to PyPI for easier installation
- Follow semantic versioning principles
- Maintain a changelog
### HTTP Communication
For REST-based communication, use aiohttp. Pipecat includes this as a required dependency, so using it prevents adding an additional dependency to your integration.
### Error Handling
- Wrap API calls in appropriate try/catch blocks
- Handle rate limits and network failures gracefully
- Provide meaningful error messages
- When errors occur, raise exceptions AND push `ErrorFrame`s to notify the pipeline:
```python
from pipecat.frames.frames import ErrorFrame
try:
# Your API call
result = await self._make_api_call()
except Exception as e:
# Push error frame to pipeline
await self.push_error(ErrorFrame(error=f"{self} error: {e}"))
# Raise or handle as appropriate
raise
```
### Testing
- Your foundational example serves as a valuable integration-level test
- Unit tests are nice to have. As the Pipecat teams provides better guidance, we will encourage unit testing more
## Disclaimer
Community integrations are community-maintained and not officially supported by the Pipecat team. Users should evaluate these integrations independently. The Pipecat team reserves the right to remove listings that become unmaintained or problematic.
## Staying Up to Date
Pipecat evolves rapidly to support the latest AI technologies and patterns. While we strive to minimize breaking changes, they do occur as the framework matures.
**We strongly recommend:**
- Join our Discord at https://discord.gg/pipecat and monitor the `#announcements` channel for release notifications
- Follow our changelog: https://github.com/pipecat-ai/pipecat/blob/main/CHANGELOG.md
- Test your integration against new Pipecat releases promptly
- Update your README with the last tested Pipecat version
This helps ensure your integration remains compatible and your users have clear expectations about version support.
## Questions?
Join our Discord community at https://discord.gg/pipecat and post in the `#community-integrations` channel for guidance and support.
For additional questions, you can also reach out to us at pipecat-ai@daily.co.

View File

@@ -1,9 +1,5 @@
## Contributing to Pipecat
**Want to add a new service integration?**
We encourage community-maintained integrations! Please see our [Community Integration Guide](COMMUNITY_INTEGRATIONS.md) for the process and requirements.
**Want to contribute to Pipecat core?**
We welcome contributions of all kinds! Your help is appreciated. Follow these steps to get involved:
1. **Fork this repository**: Start by forking the Pipecat Documentation repository to your GitHub account.

143
README.md
View File

@@ -3,7 +3,6 @@
</div></h1>
[![PyPI](https://img.shields.io/pypi/v/pipecat-ai)](https://pypi.org/project/pipecat-ai) ![Tests](https://github.com/pipecat-ai/pipecat/actions/workflows/tests.yaml/badge.svg) [![codecov](https://codecov.io/gh/pipecat-ai/pipecat/graph/badge.svg?token=LNVUIVO4Y9)](https://codecov.io/gh/pipecat-ai/pipecat) [![Docs](https://img.shields.io/badge/Documentation-blue)](https://docs.pipecat.ai) [![Discord](https://img.shields.io/discord/1239284677165056021)](https://discord.gg/pipecat) [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/pipecat-ai/pipecat)
[![](https://getmanta.ai/api/badges?text=Manta%20Graph&link=manta)](https://getmanta.ai/pipecat)
# 🎙️ Pipecat: Real-Time Voice & Multimodal AI Agents
@@ -20,6 +19,10 @@
- **Business Agents** customer intake, support bots, guided flows
- **Complex Dialog Systems** design logic with structured conversations
🧭 Looking to build structured conversations? Check out [Pipecat Flows](https://github.com/pipecat-ai/pipecat-flows) for managing complex conversational states and transitions.
🔍 Looking for help debugging your pipeline and processors? Check out [Whisker](https://github.com/pipecat-ai/whisker), a real-time Pipecat debugger.
## 🧠 Why Pipecat?
- **Voice-first**: Integrates speech recognition, text-to-speech, and conversation handling
@@ -27,38 +30,40 @@
- **Composable Pipelines**: Build complex behavior from modular components
- **Real-Time**: Ultra-low latency interaction with different transports (e.g. WebSockets or WebRTC)
## 🌐 Pipecat Ecosystem
## 📱 Client SDKs
### 📱 Client SDKs
You can connect to Pipecat from any platform using our official SDKs:
Building client applications? You can connect to Pipecat from any platform using our official SDKs:
<a href="https://docs.pipecat.ai/client/js/introduction">JavaScript</a> | <a href="https://docs.pipecat.ai/client/react/introduction">React</a> | <a href="https://docs.pipecat.ai/client/react-native/introduction">React Native</a> |
<a href="https://docs.pipecat.ai/client/ios/introduction">Swift</a> | <a href="https://docs.pipecat.ai/client/android/introduction">Kotlin</a> | <a href="https://docs.pipecat.ai/client/c++/introduction">C++</a> | <a href="https://github.com/pipecat-ai/pipecat-esp32">ESP32</a>
### 🧭 Structured conversations
Looking to build structured conversations? Check out [Pipecat Flows](https://github.com/pipecat-ai/pipecat-flows) for managing complex conversational states and transitions.
### 🪄 Beautiful UIs
Want to build beautiful and engaging experiences? Checkout the [Voice UI Kit](https://github.com/pipecat-ai/voice-ui-kit), a collection of components, hooks and templates for building voice AI applications quickly.
### 🛠️ CLI
Create a new project in under a minute with the [Pipecat CLI](https://github.com/pipecat-ai/pipecat-cli). Then use the CLI to monitor and deploy your agent to production.
### 🔍 Debugging
Looking for help debugging your pipeline and processors? Check out [Whisker](https://github.com/pipecat-ai/whisker), a real-time Pipecat debugger.
### 🖥️ Terminal
Love terminal applications? Check out [Tail](https://github.com/pipecat-ai/tail), a terminal dashboard for Pipecat.
### 📺️ Pipecat TV Channel
Catch new features, interviews, and how-tos on our [Pipecat TV](https://www.youtube.com/playlist?list=PLzU2zoMTQIHjqC3v4q2XVSR3hGSzwKFwH) channel.
<table>
<tr>
<td>
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/javascript/javascript-original.svg" width="40" height="40" alt="JavaScript"/>
<a href="https://docs.pipecat.ai/client/js/introduction">JavaScript</a>
</td>
<td>
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/react/react-original.svg" width="40" height="40" alt="React"/>
<a href="https://docs.pipecat.ai/client/react/introduction">React</a>
</td>
<td>
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/react/react-original.svg" width="40" height="40" alt="React Native"/>
<a href="https://docs.pipecat.ai/client/react-native/introduction">React Native</a>
</td>
</tr>
<tr>
<td>
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/swift/swift-original.svg" width="40" height="40" alt="Swift"/>
<a href="https://docs.pipecat.ai/client/ios/introduction">Swift</a>
</td>
<td>
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/kotlin/kotlin-original.svg" width="40" height="40" alt="Kotlin"/>
<a href="https://docs.pipecat.ai/client/android/introduction">Kotlin</a>
</td>
<td>
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/cplusplus/cplusplus-original.svg" width="40" height="40" alt="JavaScript"/>
<a href="https://docs.pipecat.ai/client/c++/introduction">C++</a>
</td>
</tr>
</table>
## 🎬 See it in action
@@ -67,24 +72,24 @@ Catch new features, interviews, and how-tos on our [Pipecat TV](https://www.yout
<a href="https://github.com/pipecat-ai/pipecat-examples/tree/main/storytelling-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat-examples/main/storytelling-chatbot/image.png" width="400" /></a>
<br/>
<a href="https://github.com/pipecat-ai/pipecat-examples/tree/main/translation-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat-examples/main/translation-chatbot/image.png" width="400" /></a>&nbsp;
<a href="https://github.com/pipecat-ai/pipecat/blob/main/examples/foundational/12-describe-video.py"><img src="https://github.com/pipecat-ai/pipecat/blob/main/examples/foundational/assets/moondream.png" width="400" /></a>
<a href="https://github.com/pipecat-ai/pipecat-examples/tree/main/moondream-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat-examples/main/moondream-chatbot/image.png" width="400" /></a>
</p>
## 🧩 Available services
| Category | Services |
| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) |
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
| Serializers | [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx) |
| Video | [HeyGen](https://docs.pipecat.ai/server/services/video/heygen), [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) |
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) |
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/fal), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) |
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [ai-coustics](https://docs.pipecat.ai/server/utilities/audio/aic-filter) |
| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) |
| Category | Services |
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) |
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
| Serializers | [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx) |
| Video | [HeyGen](https://docs.pipecat.ai/server/services/video/heygen), [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) |
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) |
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/fal), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) |
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [ai-coustics](https://docs.pipecat.ai/server/utilities/audio/aic-filter) |
| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) |
📚 [View full services documentation →](https://docs.pipecat.ai/server/services/supported-services)
@@ -179,6 +184,54 @@ Run a specific test suite:
uv run pytest tests/test_name.py
```
### Setting up your editor
This project uses strict [PEP 8](https://peps.python.org/pep-0008/) formatting via [Ruff](https://github.com/astral-sh/ruff).
#### Emacs
You can use [use-package](https://github.com/jwiegley/use-package) to install [emacs-lazy-ruff](https://github.com/christophermadsen/emacs-lazy-ruff) package and configure `ruff` arguments:
```elisp
(use-package lazy-ruff
:ensure t
:hook ((python-mode . lazy-ruff-mode))
:config
(setq lazy-ruff-format-command "ruff format")
(setq lazy-ruff-check-command "ruff check --select I"))
```
`ruff` was installed in the `venv` environment described before, so you should be able to use [pyvenv-auto](https://github.com/ryotaro612/pyvenv-auto) to automatically load that environment inside Emacs.
```elisp
(use-package pyvenv-auto
:ensure t
:defer t
:hook ((python-mode . pyvenv-auto-run)))
```
#### Visual Studio Code
Install the
[Ruff](https://marketplace.visualstudio.com/items?itemName=charliermarsh.ruff) extension. Then edit the user settings (_Ctrl-Shift-P_ `Open User Settings (JSON)`) and set it as the default Python formatter, and enable formatting on save:
```json
"[python]": {
"editor.defaultFormatter": "charliermarsh.ruff",
"editor.formatOnSave": true
}
```
#### PyCharm
`ruff` was installed in the `venv` environment described before, now to enable autoformatting on save, go to `File` -> `Settings` -> `Tools` -> `File Watchers` and add a new watcher with the following settings:
1. **Name**: `Ruff formatter`
2. **File type**: `Python`
3. **Working directory**: `$ContentRoot$`
4. **Arguments**: `format $FilePath$`
5. **Program**: `$PyInterpreterDirectory$/ruff`
## 🤝 Contributing
We welcome contributions from the community! Whether you're fixing bugs, improving documentation, or adding new features, here's how you can help:

View File

@@ -1,5 +0,0 @@
# Security Policy
## Reporting a Vulnerability
Please email `disclosures@daily.co`.

View File

@@ -50,7 +50,6 @@ autodoc_mock_imports = [
# Krisp - has build issues on some platforms
"pipecat_ai_krisp",
"krisp",
"krisp_audio",
# System-specific GUI libraries
"_tkinter",
"tkinter",

View File

@@ -58,9 +58,6 @@ GOOGLE_CLOUD_PROJECT_ID=...
GOOGLE_TEST_CREDENTIALS=...
GOOGLE_VERTEX_TEST_CREDENTIALS=...
# Hume
HUME_API_KEY=...
# LMNT
LMNT_API_KEY=...
LMNT_VOICE_ID=...
@@ -90,9 +87,6 @@ SIMLI_FACE_ID=...
# Krisp
KRISP_MODEL_PATH=...
# Krisp Viva
KRISP_VIVA_MODEL_PATH=...
# DeepSeek
DEEPSEEK_API_KEY=...
@@ -161,9 +155,3 @@ NVIDIA_API_KEY=...
# Qwen
QWEN_API_KEY=...
# WhatsApp
WHATSAPP_TOKEN=
WHATSAPP_WEBHOOK_VERIFICATION_TOKEN=
WHATSAPP_PHONE_NUMBER_ID=
WHATSAPP_APP_SECRET=

View File

@@ -25,7 +25,7 @@ from pipecat.processors.aggregators.llm_response_universal import LLMContextAggr
from pipecat.runner.daily import configure
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.daily.transport import DailyParams, DailyTransport
from pipecat.transports.daily.transport import DailyLogLevel, DailyParams, DailyTransport
load_dotenv(override=True)
@@ -49,6 +49,7 @@ async def main():
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
),
)
transport.set_log_level(DailyLogLevel.Info)
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),

View File

@@ -5,6 +5,7 @@
#
import os
import wave
from dotenv import load_dotenv
from loguru import logger
@@ -13,7 +14,14 @@ from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.frames.frames import LLMRunFrame
from pipecat.frames.frames import (
LLMFullResponseEndFrame,
LLMFullResponseStartFrame,
LLMRunFrame,
LLMTextFrame,
OutputAudioRawFrame,
TextFrame,
)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -21,8 +29,8 @@ from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.cartesia.stt import CartesiaSTTService
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
@@ -58,7 +66,7 @@ transport_params = {
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")
stt = CartesiaSTTService(api_key=os.getenv("CARTESIA_API_KEY"))
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
@@ -103,7 +111,27 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Client connected")
# Kick off the conversation.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMRunFrame()])
audio_file_path = os.path.join(os.path.dirname(__file__), "assets", "pre-recorded.wav")
with wave.open(audio_file_path, "rb") as wav_file:
llm_text_frame = TextFrame(text="This is a pre-recorded message.")
llm_text_frame.skip_tts = True
audio_data = wav_file.readframes(wav_file.getnframes())
output_audio_raw_frame = OutputAudioRawFrame(
audio=audio_data, sample_rate=44100, num_channels=1
)
await task.queue_frames(
[
LLMRunFrame(),
LLMFullResponseStartFrame(),
llm_text_frame,
output_audio_raw_frame,
LLMFullResponseEndFrame(),
]
)
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):

View File

@@ -1,138 +0,0 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import os
from dotenv import load_dotenv
from loguru import logger
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.frames.frames import LLMRunFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
from pipecat.processors.frameworks.rtvi import RTVIConfig, RTVIObserver, RTVIProcessor
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.hume.tts import HUME_SAMPLE_RATE, HumeTTSService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
load_dotenv(override=True)
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
# instantiated. The function will be called when the desired transport gets
# selected.
transport_params = {
"daily": lambda: DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
),
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
),
"webrtc": lambda: TransportParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
),
}
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
tts = HumeTTSService(
api_key=os.getenv("HUME_API_KEY"),
# Replace with your Hume voice ID
voice_id="f898a92e-685f-43fa-985b-a46920f0650b",
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
context = LLMContext(messages)
context_aggregator = LLMContextAggregatorPair(context)
rtvi = RTVIProcessor(config=RTVIConfig(config=[]))
pipeline = Pipeline(
[
transport.input(), # Transport user input
rtvi,
stt,
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(
pipeline,
params=PipelineParams(
enable_metrics=True,
enable_usage_metrics=True,
audio_out_sample_rate=HUME_SAMPLE_RATE,
),
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
observers=[RTVIObserver(rtvi)],
)
@rtvi.event_handler("on_client_ready")
async def on_client_ready(rtvi):
await rtvi.set_bot_ready()
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMRunFrame()])
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Client disconnected")
await task.cancel()
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
await runner.run(task)
async def bot(runner_args: RunnerArguments):
"""Main bot entry point compatible with Pipecat Cloud."""
transport = await create_transport(runner_args, transport_params)
await run_bot(transport, runner_args)
if __name__ == "__main__":
from pipecat.runner.run import main
main()

View File

@@ -1,118 +0,0 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import os
from dotenv import load_dotenv
from loguru import logger
from pipecat.frames.frames import LLMRunFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_response_universal import (
LLMContext,
LLMContextAggregatorPair,
)
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.deepgram.flux.stt import DeepgramFluxSTTService
from pipecat.services.deepgram.tts import DeepgramTTSService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
load_dotenv(override=True)
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
# instantiated. The function will be called when the desired transport gets
# selected.
transport_params = {
"daily": lambda: DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
),
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
),
"webrtc": lambda: TransportParams(
audio_in_enabled=True,
audio_out_enabled=True,
),
}
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")
stt = DeepgramFluxSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-2-andromeda-en")
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
context = LLMContext(messages)
context_aggregator = LLMContextAggregatorPair(context)
pipeline = Pipeline(
[
transport.input(), # Transport user input
stt, # STT
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(
pipeline,
params=PipelineParams(
enable_metrics=True,
enable_usage_metrics=True,
),
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
)
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMRunFrame()])
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Client disconnected")
await task.cancel()
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
await runner.run(task)
async def bot(runner_args: RunnerArguments):
"""Main bot entry point compatible with Pipecat Cloud."""
transport = await create_transport(runner_args, transport_params)
await run_bot(transport, runner_args)
if __name__ == "__main__":
from pipecat.runner.run import main
main()

View File

@@ -23,6 +23,7 @@ from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.elevenlabs.stt import ElevenLabsSTTService
from pipecat.services.elevenlabs.tts import ElevenLabsHttpTTSService
from pipecat.services.openai.llm import OpenAILLMService

View File

@@ -1,151 +0,0 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
"""
A conversational AI bot using Gemini for both LLM, STT and TTS.
This example demonstrates how to use Gemini's image generation capabilities.
Features showcased:
- Gemini LLM for conversation and image generation
- Google TTS and STT
Run with:
python examples/foundational/07n-interruptible-gemini-image.py
Make sure to set your environment variables:
export GOOGLE_API_KEY=your_api_key_here
"""
import os
from dotenv import load_dotenv
from loguru import logger
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.frames.frames import LLMRunFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.google.llm import GoogleLLMService
from pipecat.services.google.stt import GoogleSTTService
from pipecat.services.google.tts import GoogleTTSService
from pipecat.transcriptions.language import Language
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
load_dotenv(override=True)
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
# instantiated. The function will be called when the desired transport gets
# selected.
transport_params = {
"daily": lambda: DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
video_out_enabled=True,
video_out_width=1024,
video_out_height=1024,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
),
"webrtc": lambda: TransportParams(
audio_in_enabled=True,
audio_out_enabled=True,
video_out_enabled=True,
video_out_width=1024,
video_out_height=1024,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
),
}
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")
stt = GoogleSTTService(
params=GoogleSTTService.InputParams(languages=Language.EN_US),
credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
)
tts = GoogleTTSService(
voice_id="en-US-Chirp3-HD-Charon",
params=GoogleTTSService.InputParams(language=Language.EN_US),
credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
)
llm = GoogleLLMService(
api_key=os.getenv("GOOGLE_API_KEY"),
model="gemini-2.5-flash-image",
)
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
context = LLMContext(messages)
context_aggregator = LLMContextAggregatorPair(context)
pipeline = Pipeline(
[
transport.input(), # Transport user input
stt, # STT
context_aggregator.user(), # User responses
llm, # LLM
tts, # Gemini TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(
pipeline,
params=PipelineParams(
enable_metrics=True,
enable_usage_metrics=True,
),
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
)
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation with a styled introduction
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMRunFrame()])
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Client disconnected")
await task.cancel()
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
await runner.run(task)
async def bot(runner_args: RunnerArguments):
"""Main bot entry point compatible with Pipecat Cloud."""
transport = await create_transport(runner_args, transport_params)
await run_bot(transport, runner_args)
if __name__ == "__main__":
from pipecat.runner.run import main
main()

View File

@@ -1,129 +0,0 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import os
from dotenv import load_dotenv
from loguru import logger
from pipecat.audio.filters.krisp_viva_filter import KrispVivaFilter
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.frames.frames import LLMRunFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.deepgram.tts import DeepgramTTSService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
load_dotenv(override=True)
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
# instantiated. The function will be called when the desired transport gets
# selected.
transport_params = {
"daily": lambda: DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
audio_in_filter=KrispVivaFilter(),
),
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
audio_in_filter=KrispVivaFilter(),
),
"webrtc": lambda: TransportParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
audio_in_filter=KrispVivaFilter(),
),
}
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-helios-en")
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
context = LLMContext(messages)
context_aggregator = LLMContextAggregatorPair(context)
pipeline = Pipeline(
[
transport.input(), # Transport user input
stt, # STT
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(
pipeline,
params=PipelineParams(
enable_metrics=True,
enable_usage_metrics=True,
),
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
)
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMRunFrame()])
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Client disconnected")
await task.cancel()
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
await runner.run(task)
async def bot(runner_args: RunnerArguments):
"""Main bot entry point compatible with Pipecat Cloud."""
transport = await create_transport(runner_args, transport_params)
await run_bot(transport, runner_args)
if __name__ == "__main__":
from pipecat.runner.run import main
main()

View File

@@ -48,7 +48,10 @@ transport_params = {
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")
stt = CartesiaSTTService(api_key=os.getenv("CARTESIA_API_KEY"))
stt = CartesiaSTTService(
api_key=os.getenv("CARTESIA_API_KEY"),
base_url=os.getenv("CARTESIA_BASE_URL"),
)
tl = TranscriptionLogger()

View File

@@ -76,8 +76,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
llm = GoogleVertexLLMService(
credentials=os.getenv("GOOGLE_VERTEX_TEST_CREDENTIALS"),
project_id=os.getenv("GOOGLE_CLOUD_PROJECT_ID"),
location=os.getenv("GOOGLE_CLOUD_LOCATION"),
params=GoogleVertexLLMService.InputParams(
project_id=os.getenv("GOOGLE_CLOUD_PROJECT_ID"),
),
)
# You can aslo register a function_name of None to get all functions
# sent to the same callback with an additional function_name parameter.

View File

@@ -1,182 +0,0 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import os
import time
from dotenv import load_dotenv
from loguru import logger
from pipecat.adapters.schemas.function_schema import FunctionSchema
from pipecat.adapters.schemas.tools_schema import ToolsSchema
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.frames.frames import LLMRunFrame, TTSSpeakFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.llm_service import FunctionCallParams
from pipecat.services.openpipe.llm import OpenPipeLLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
load_dotenv(override=True)
async def fetch_weather_from_api(params: FunctionCallParams):
await params.result_callback({"conditions": "nice", "temperature": "75"})
async def fetch_restaurant_recommendation(params: FunctionCallParams):
await params.result_callback({"name": "The Golden Dragon"})
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
# instantiated. The function will be called when the desired transport gets
# selected.
transport_params = {
"daily": lambda: DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
),
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
),
"webrtc": lambda: TransportParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
),
}
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
)
timestamp = int(time.time())
llm = OpenPipeLLMService(
api_key=os.getenv("OPENAI_API_KEY"),
openpipe_api_key=os.getenv("OPENPIPE_API_KEY"),
tags={"conversation_id": f"pipecat-{timestamp}"},
)
# You can also register a function_name of None to get all functions
# sent to the same callback with an additional function_name parameter.
llm.register_function("get_current_weather", fetch_weather_from_api)
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
@llm.event_handler("on_function_calls_started")
async def on_function_calls_started(service, function_calls):
await tts.queue_frame(TTSSpeakFrame("Let me check on that."))
weather_function = FunctionSchema(
name="get_current_weather",
description="Get the current weather",
properties={
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"format": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"description": "The temperature unit to use. Infer this from the user's location.",
},
},
required=["location", "format"],
)
restaurant_function = FunctionSchema(
name="get_restaurant_recommendation",
description="Get a restaurant recommendation",
properties={
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
},
required=["location"],
)
tools = ToolsSchema(standard_tools=[weather_function, restaurant_function])
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
context = LLMContext(messages, tools)
context_aggregator = LLMContextAggregatorPair(context)
pipeline = Pipeline(
[
transport.input(),
stt,
context_aggregator.user(),
llm,
tts,
transport.output(),
context_aggregator.assistant(),
]
)
task = PipelineTask(
pipeline,
params=PipelineParams(
enable_metrics=True,
enable_usage_metrics=True,
),
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
)
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation.
await task.queue_frames([LLMRunFrame()])
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Client disconnected")
await task.cancel()
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
await runner.run(task)
async def bot(runner_args: RunnerArguments):
"""Main bot entry point compatible with Pipecat Cloud."""
transport = await create_transport(runner_args, transport_params)
await run_bot(transport, runner_args)
if __name__ == "__main__":
from pipecat.runner.run import main
main()

View File

@@ -26,11 +26,7 @@ from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.deepgram.tts import DeepgramTTSService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import (
DailyOutputTransportMessageFrame,
DailyOutputTransportMessageUrgentFrame,
DailyParams,
)
from pipecat.transports.daily.transport import DailyParams, DailyTransportMessageFrame
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
load_dotenv(override=True)
@@ -132,14 +128,14 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.debug(f"Received latency ping app message: {message}")
ts = message["latency-ping"]["ts"]
# Send immediately
await task.queue_frame(
DailyOutputTransportMessageUrgentFrame(
transport.output().send_message(
DailyTransportMessageFrame(
message={"latency-pong-msg-handler": {"ts": ts}}, participant_id=sender
)
)
# And push to the pipeline for the Daily transport.output to send
await task.queue_frame(
DailyOutputTransportMessageFrame(
DailyTransportMessageFrame(
message={"latency-pong-pipeline-delivery": {"ts": ts}},
participant_id=sender,
)

View File

@@ -24,15 +24,14 @@ from pipecat.processors.transcript_processor import TranscriptProcessor
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.llm_service import FunctionCallParams
from pipecat.services.openai.realtime.events import (
AudioConfiguration,
AudioInput,
from pipecat.services.openai_realtime import (
InputAudioNoiseReduction,
InputAudioTranscription,
OpenAIRealtimeLLMService,
SemanticTurnDetection,
SessionProperties,
)
from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService
from pipecat.services.openai_realtime.events import AudioConfiguration, AudioInput
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams

View File

@@ -21,14 +21,13 @@ from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.azure.realtime.llm import AzureRealtimeLLMService
from pipecat.services.llm_service import FunctionCallParams
from pipecat.services.openai.realtime.events import (
AudioConfiguration,
AudioInput,
from pipecat.services.openai_realtime import (
AzureRealtimeLLMService,
InputAudioTranscription,
SessionProperties,
)
from pipecat.services.openai_realtime.events import AudioConfiguration, AudioInput
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams

View File

@@ -22,17 +22,16 @@ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.transcript_processor import TranscriptProcessor
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.llm_service import FunctionCallParams
from pipecat.services.openai.realtime.events import (
AudioConfiguration,
AudioInput,
from pipecat.services.openai_realtime import (
InputAudioNoiseReduction,
InputAudioTranscription,
OpenAIRealtimeLLMService,
SemanticTurnDetection,
SessionProperties,
)
from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService
from pipecat.services.openai_realtime.events import AudioConfiguration, AudioInput
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams

View File

@@ -25,14 +25,13 @@ from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.llm_service import FunctionCallParams
from pipecat.services.openai.realtime.events import (
AudioConfiguration,
AudioInput,
from pipecat.services.openai_realtime import (
InputAudioTranscription,
OpenAIRealtimeLLMService,
SessionProperties,
TurnDetection,
)
from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService
from pipecat.services.openai_realtime.events import AudioConfiguration, AudioInput
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams

View File

@@ -72,6 +72,7 @@ async def save_conversation(params: FunctionCallParams):
)
try:
with open(filename, "w") as file:
# todo: extract 'system' into the first message in the list
messages = params.context.get_messages()
# remove the last message, which is the instruction we just gave to save the conversation
messages.pop()

View File

@@ -90,6 +90,7 @@ async def save_conversation(params: FunctionCallParams):
)
try:
with open(filename, "w") as file:
# todo: extract 'system' into the first message in the list
messages = params.context.get_messages()
# remove the last message (the instruction to save the context)
messages.pop()

View File

@@ -20,12 +20,10 @@ from pipecat.frames.frames import LLMRunFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.aws.nova_sonic.llm import AWSNovaSonicLLMService
from pipecat.services.aws_nova_sonic.aws import AWSNovaSonicLLMService
from pipecat.services.llm_service import FunctionCallParams
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
@@ -77,7 +75,7 @@ async def save_conversation(params: FunctionCallParams):
filename = f"{BASE_FILENAME}{timestamp}.json"
try:
with open(filename, "w") as file:
messages = params.context.get_messages()
messages = params.context.get_messages_for_persistent_storage()
# remove the last few messages. in reverse order, they are:
# - the in progress save tool call
# - the invocation of the save tool call
@@ -225,13 +223,13 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
llm.register_function("get_saved_conversation_filenames", get_saved_conversation_filenames)
llm.register_function("load_conversation", load_conversation)
context = LLMContext(
context = OpenAILLMContext(
messages=[
{"role": "system", "content": f"{system_instruction}"},
],
tools=tools,
)
context_aggregator = LLMContextAggregatorPair(context)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[

View File

@@ -17,7 +17,7 @@ from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
@@ -65,7 +65,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
Respond to what the user said in a creative and helpful way.
"""
llm = GeminiLiveLLMService(
llm = GeminiMultimodalLiveLLMService(
api_key=os.getenv("GOOGLE_API_KEY"),
system_instruction=system_instruction,
voice_id="Puck", # Aoede, Charon, Fenrir, Kore, Puck

View File

@@ -20,7 +20,7 @@ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.transcript_processor import TranscriptProcessor
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
@@ -65,7 +65,7 @@ transport_params = {
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")
llm = GeminiLiveLLMService(
llm = GeminiMultimodalLiveLLMService(
api_key=os.getenv("GOOGLE_API_KEY"),
voice_id="Aoede", # Puck, Charon, Kore, Fenrir, Aoede
# system_instruction="Talk like a pirate."

View File

@@ -22,7 +22,7 @@ from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
from pipecat.services.llm_service import FunctionCallParams
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
@@ -122,15 +122,12 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
required=["location"],
)
search_tool = {"google_search": {}}
# KNOWN ISSUE: If using GeminiVertexLiveLLMService, it appears
# you cannot use the "google_search" tool alongside other tools.
# See https://github.com/googleapis/python-genai/issues/941.
tools = ToolsSchema(
standard_tools=[weather_function, restaurant_function],
custom_tools={AdapterType.GEMINI: [search_tool]},
)
llm = GeminiLiveLLMService(
llm = GeminiMultimodalLiveLLMService(
api_key=os.getenv("GOOGLE_API_KEY"),
system_instruction=system_instruction,
tools=tools,

View File

@@ -24,7 +24,7 @@ from pipecat.runner.utils import (
maybe_capture_participant_camera,
maybe_capture_participant_screen,
)
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
@@ -58,7 +58,7 @@ transport_params = {
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
llm = GeminiLiveLLMService(
llm = GeminiMultimodalLiveLLMService(
api_key=os.getenv("GOOGLE_API_KEY"),
voice_id="Aoede", # Puck, Charon, Kore, Fenrir, Aoede
# system_instruction="Talk like a pirate."

View File

@@ -20,9 +20,9 @@ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.google.gemini_live.llm import (
GeminiLiveLLMService,
GeminiModalities,
from pipecat.services.gemini_multimodal_live.gemini import (
GeminiMultimodalLiveLLMService,
GeminiMultimodalModalities,
InputParams,
)
from pipecat.transports.base_transport import BaseTransport, TransportParams
@@ -80,15 +80,11 @@ transport_params = {
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")
# KNOWN ISSUE: If using GeminiLiveVertexLLMService, you cannot specify a
# modality other than AUDIO (at least not if using the service's default
# model, which is a native audio model:
# https://cloud.google.com/vertex-ai/generative-ai/docs/live-api/tools#native-audio).
llm = GeminiLiveLLMService(
llm = GeminiMultimodalLiveLLMService(
api_key=os.getenv("GOOGLE_API_KEY"),
system_instruction=SYSTEM_INSTRUCTION,
tools=[{"google_search": {}}, {"code_execution": {}}],
params=InputParams(modalities=GeminiModalities.TEXT),
params=InputParams(modalities=GeminiMultimodalModalities.TEXT),
)
# Optionally, you can set the response modalities via a function

View File

@@ -19,7 +19,7 @@ from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
@@ -83,7 +83,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")
# Initialize the Gemini Multimodal Live model
llm = GeminiLiveLLMService(
llm = GeminiMultimodalLiveLLMService(
api_key=os.getenv("GOOGLE_API_KEY"),
voice_id="Puck", # Aoede, Charon, Fenrir, Kore, Puck
system_instruction=system_instruction,

View File

@@ -19,7 +19,9 @@ from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
from pipecat.services.gemini_multimodal_live.gemini import (
GeminiMultimodalLiveLLMService,
)
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
@@ -108,7 +110,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
"""
# Initialize Gemini service with File API support
llm = GeminiLiveLLMService(
llm = GeminiMultimodalLiveLLMService(
api_key=os.getenv("GOOGLE_API_KEY"),
system_instruction=system_instruction,
voice_id="Charon", # Aoede, Charon, Fenrir, Kore, Puck

View File

@@ -9,13 +9,13 @@ from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.frames.frames import Frame, LLMRunFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
from pipecat.services.google.frames import LLMSearchResponseFrame
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
@@ -105,7 +105,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
custom_tools={AdapterType.GEMINI: [{"google_search": {}}, {"code_execution": {}}]},
)
llm = GeminiLiveLLMService(
llm = GeminiMultimodalLiveLLMService(
api_key=os.getenv("GOOGLE_API_KEY"),
system_instruction=SYSTEM_INSTRUCTION,
voice_id="Charon", # Aoede, Charon, Fenrir, Kore, Puck

View File

@@ -1,191 +0,0 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import os
from datetime import datetime
from dotenv import load_dotenv
from google.genai.types import HttpOptions
from loguru import logger
from pipecat.adapters.schemas.function_schema import FunctionSchema
from pipecat.adapters.schemas.tools_schema import AdapterType, ToolsSchema
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.frames.frames import LLMRunFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
from pipecat.services.google.gemini_live.llm_vertex import GeminiLiveVertexLLMService
from pipecat.services.llm_service import FunctionCallParams
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
load_dotenv(override=True)
async def fetch_weather_from_api(params: FunctionCallParams):
temperature = 75 if params.arguments["format"] == "fahrenheit" else 24
await params.result_callback(
{
"conditions": "nice",
"temperature": temperature,
"format": params.arguments["format"],
"timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
}
)
async def fetch_restaurant_recommendation(params: FunctionCallParams):
await params.result_callback({"name": "The Golden Dragon"})
system_instruction = """
You are a helpful assistant who can answer questions and use tools.
You have three tools available to you:
1. get_current_weather: Use this tool to get the current weather in a specific location.
2. get_restaurant_recommendation: Use this tool to get a restaurant recommendation in a specific location.
"""
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
# instantiated. The function will be called when the desired transport gets
# selected.
transport_params = {
"daily": lambda: DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
# set stop_secs to something roughly similar to the internal setting
# of the Multimodal Live api, just to align events. This doesn't really
# matter because we can only use the Multimodal Live API's phrase
# endpointing, for now.
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
),
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
# set stop_secs to something roughly similar to the internal setting
# of the Multimodal Live api, just to align events. This doesn't really
# matter because we can only use the Multimodal Live API's phrase
# endpointing, for now.
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
),
"webrtc": lambda: TransportParams(
audio_in_enabled=True,
audio_out_enabled=True,
# set stop_secs to something roughly similar to the internal setting
# of the Multimodal Live api, just to align events. This doesn't really
# matter because we can only use the Multimodal Live API's phrase
# endpointing, for now.
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
),
}
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")
weather_function = FunctionSchema(
name="get_current_weather",
description="Get the current weather",
properties={
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"format": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"description": "The temperature unit to use. Infer this from the user's location.",
},
},
required=["location", "format"],
)
restaurant_function = FunctionSchema(
name="get_restaurant_recommendation",
description="Get a restaurant recommendation",
properties={
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
},
required=["location"],
)
# KNOWN ISSUE: If using GeminiVertexLiveLLMService, it appears
# you cannot use the "google_search" tool alongside other tools.
# See https://github.com/googleapis/python-genai/issues/941.
tools = ToolsSchema(standard_tools=[weather_function, restaurant_function])
llm = GeminiLiveVertexLLMService(
credentials=os.getenv("GOOGLE_VERTEX_TEST_CREDENTIALS"),
project_id=os.getenv("GOOGLE_CLOUD_PROJECT_ID"),
location=os.getenv("GOOGLE_CLOUD_LOCATION"),
system_instruction=system_instruction,
voice_id="Puck", # Aoede, Charon, Fenrir, Kore, Puck
tools=tools,
)
llm.register_function("get_current_weather", fetch_weather_from_api)
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
context = OpenAILLMContext(
[{"role": "user", "content": "Say hello."}],
)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(),
context_aggregator.user(),
llm,
transport.output(),
context_aggregator.assistant(),
]
)
task = PipelineTask(
pipeline,
params=PipelineParams(
enable_metrics=True,
enable_usage_metrics=True,
),
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
)
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation.
await task.queue_frames([LLMRunFrame()])
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Client disconnected")
await task.cancel()
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
await runner.run(task)
async def bot(runner_args: RunnerArguments):
"""Main bot entry point compatible with Pipecat Cloud."""
transport = await create_transport(runner_args, transport_params)
await run_bot(transport, runner_args)
if __name__ == "__main__":
from pipecat.runner.run import main
main()

View File

@@ -1,204 +0,0 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import os
from datetime import datetime
from dotenv import load_dotenv
from loguru import logger
from pipecat.adapters.schemas.function_schema import FunctionSchema
from pipecat.adapters.schemas.tools_schema import AdapterType, ToolsSchema
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.frames.frames import EndTaskFrame, LLMRunFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.frame_processor import FrameDirection
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
from pipecat.services.llm_service import FunctionCallParams
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
load_dotenv(override=True)
async def fetch_weather_from_api(params: FunctionCallParams):
temperature = 75 if params.arguments["format"] == "fahrenheit" else 24
await params.result_callback(
{
"conditions": "nice",
"temperature": temperature,
"format": params.arguments["format"],
"timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
}
)
async def fetch_restaurant_recommendation(params: FunctionCallParams):
await params.result_callback({"name": "The Golden Dragon"})
async def end_conversation(params: FunctionCallParams):
await params.result_callback({"success": True})
await params.llm.push_frame(EndTaskFrame(), FrameDirection.UPSTREAM)
system_instruction = """
You are a helpful assistant who can answer questions and use tools.
You have three tools available to you:
1. get_current_weather: Use this tool to get the current weather in a specific location.
2. get_restaurant_recommendation: Use this tool to get a restaurant recommendation in a specific location.
3. end_conversation: Use this tool to gracefully end the conversation.
After you've responded to the user three times, do two things, in order:
1. Politely let them know that that's all the time you have today and say goodbye.
2. Call the end_conversation tool to gracefully end the conversation.
"""
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
# instantiated. The function will be called when the desired transport gets
# selected.
transport_params = {
"daily": lambda: DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
# set stop_secs to something roughly similar to the internal setting
# of the Multimodal Live api, just to align events. This doesn't really
# matter because we can only use the Multimodal Live API's phrase
# endpointing, for now.
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
),
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
# set stop_secs to something roughly similar to the internal setting
# of the Multimodal Live api, just to align events. This doesn't really
# matter because we can only use the Multimodal Live API's phrase
# endpointing, for now.
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
),
"webrtc": lambda: TransportParams(
audio_in_enabled=True,
audio_out_enabled=True,
# set stop_secs to something roughly similar to the internal setting
# of the Multimodal Live api, just to align events. This doesn't really
# matter because we can only use the Multimodal Live API's phrase
# endpointing, for now.
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
),
}
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")
weather_function = FunctionSchema(
name="get_current_weather",
description="Get the current weather",
properties={
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"format": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"description": "The temperature unit to use. Infer this from the user's location.",
},
},
required=["location", "format"],
)
restaurant_function = FunctionSchema(
name="get_restaurant_recommendation",
description="Get a restaurant recommendation",
properties={
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
},
required=["location"],
)
end_conversation_function = FunctionSchema(
name="end_conversation",
description="Gracefully end the conversation",
properties={},
required=[],
)
search_tool = {"google_search": {}}
tools = ToolsSchema(
standard_tools=[weather_function, restaurant_function, end_conversation_function],
custom_tools={AdapterType.GEMINI: [search_tool]},
)
llm = GeminiLiveLLMService(
api_key=os.getenv("GOOGLE_API_KEY"),
system_instruction=system_instruction,
tools=tools,
)
llm.register_function("get_current_weather", fetch_weather_from_api)
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
llm.register_function("end_conversation", end_conversation)
context = OpenAILLMContext(
[{"role": "user", "content": "Say hello."}],
)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(),
context_aggregator.user(),
llm,
transport.output(),
context_aggregator.assistant(),
]
)
task = PipelineTask(
pipeline,
params=PipelineParams(
enable_metrics=True,
enable_usage_metrics=True,
),
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
)
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation.
await task.queue_frames([LLMRunFrame()])
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Client disconnected")
await task.cancel()
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
await runner.run(task)
async def bot(runner_args: RunnerArguments):
"""Main bot entry point compatible with Pipecat Cloud."""
transport = await create_transport(runner_args, transport_params)
await run_bot(transport, runner_args)
if __name__ == "__main__":
from pipecat.runner.run import main
main()

View File

@@ -18,11 +18,10 @@ from pipecat.frames.frames import LLMRunFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.aws.nova_sonic.llm import AWSNovaSonicLLMService
from pipecat.services.aws_nova_sonic import AWSNovaSonicLLMService
from pipecat.services.llm_service import FunctionCallParams
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
@@ -120,7 +119,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
llm.register_function("get_current_weather", fetch_weather_from_api)
# Set up context and context management.
context = LLMContext(
# AWSNovaSonicService will adapt OpenAI LLM context objects with standard message format to
# what's expected by Nova Sonic.
context = OpenAILLMContext(
messages=[
{"role": "system", "content": f"{system_instruction}"},
{
@@ -130,7 +131,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
],
tools=tools,
)
context_aggregator = LLMContextAggregatorPair(context)
context_aggregator = llm.create_context_aggregator(context)
# Build the pipeline
pipeline = Pipeline(

View File

@@ -20,7 +20,7 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.processors.frameworks.rtvi import RTVIObserver, RTVIProcessor
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
from pipecat.services.gemini_multimodal_live import GeminiMultimodalLiveLLMService
from pipecat.transports.base_transport import TransportParams
from pipecat.transports.daily.transport import DailyParams, DailyTransport
@@ -94,7 +94,7 @@ Respond to what the user said in a creative and helpful way. Keep your responses
async def run_bot(pipecat_transport):
llm = GeminiLiveLLMService(
llm = GeminiMultimodalLiveLLMService(
api_key=os.getenv("GOOGLE_API_KEY"),
voice_id="Puck", # Aoede, Charon, Fenrir, Kore, Puck
transcribe_user_audio=True,

View File

@@ -1,142 +0,0 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import os
import sentry_sdk
from dotenv import load_dotenv
from loguru import logger
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.frames.frames import LLMRunFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
from pipecat.processors.metrics.sentry import SentryMetrics
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
load_dotenv(override=True)
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
# instantiated. The function will be called when the desired transport gets
# selected.
transport_params = {
"daily": lambda: DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
),
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
),
"webrtc": lambda: TransportParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
),
}
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")
# Initialize Sentry
sentry_sdk.init(
dsn=os.getenv("SENTRY_DSN"),
traces_sample_rate=1.0,
)
stt = DeepgramSTTService(
api_key=os.getenv("DEEPGRAM_API_KEY"),
metrics=SentryMetrics(),
)
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
metrics=SentryMetrics(),
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
metrics=SentryMetrics(),
)
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
context = LLMContext(messages)
context_aggregator = LLMContextAggregatorPair(context)
pipeline = Pipeline(
[
transport.input(), # Transport user input
stt,
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(
pipeline,
params=PipelineParams(
enable_metrics=True,
enable_usage_metrics=True,
),
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
)
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMRunFrame()])
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Client disconnected")
await task.cancel()
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
await runner.run(task)
async def bot(runner_args: RunnerArguments):
"""Main bot entry point compatible with Pipecat Cloud."""
transport = await create_transport(runner_args, transport_params)
await run_bot(transport, runner_args)
if __name__ == "__main__":
from pipecat.runner.run import main
main()

View File

@@ -105,7 +105,7 @@ uv run 07-interruptible.py -t twilio -x NGROK_HOST_NAME
### Vision & Multimodal
- **[12a-describe-video-gemini-flash.py](./12a-describe-video-gemini-flash.py)**: Bot describes user's video (Video input, Multimodal LLMs)
- **[26c-gemini-live-video.py](./26c-gemini-live-video.py)**: Gemini with video input (Streaming video, Function calls)
- **[26c-gemini-multimodal-live-video.py](./26c-gemini-multimodal-live-video.py)**: Gemini with video input (Streaming video, Function calls)
### Voice & Language

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.1 MiB

Binary file not shown.

View File

@@ -34,11 +34,10 @@ dependencies = [
"pyloudnorm~=0.1.1",
"resampy~=0.4.3",
"soxr~=0.5.0",
"openai>=1.74.0,<3",
"openai>=1.74.0,<=1.99.1",
# Pinning numba to resolve package dependencies
"numba==0.61.2",
"wait_for2>=0.4.1; python_version<'3.12'",
"pipecat-ai-cli"
]
[project.urls]
@@ -51,24 +50,23 @@ anthropic = [ "anthropic~=0.49.0" ]
assemblyai = [ "pipecat-ai[websockets-base]" ]
asyncai = [ "pipecat-ai[websockets-base]" ]
aws = [ "aioboto3~=15.0.0", "pipecat-ai[websockets-base]" ]
aws-nova-sonic = [ "aws_sdk_bedrock_runtime~=0.1.0; python_version>='3.12'" ]
aws-nova-sonic = [ "aws_sdk_bedrock_runtime~=0.0.2; python_version>='3.12'" ]
azure = [ "azure-cognitiveservices-speech~=1.42.0"]
cartesia = [ "cartesia~=2.0.3", "pipecat-ai[websockets-base]" ]
cerebras = []
deepseek = []
daily = [ "daily-python~=0.20.0" ]
daily = [ "daily-python~=0.19.9" ]
deepgram = [ "deepgram-sdk~=4.7.0" ]
elevenlabs = [ "pipecat-ai[websockets-base]" ]
fal = [ "fal-client~=0.5.9" ]
fireworks = []
fish = [ "ormsgpack~=1.7.0", "pipecat-ai[websockets-base]" ]
gladia = [ "pipecat-ai[websockets-base]" ]
google = [ "google-cloud-speech>=2.33.0,<3", "google-cloud-texttospeech>=2.31.0,<3", "google-genai>=1.41.0,<2", "pipecat-ai[websockets-base]" ]
google = [ "google-cloud-speech~=2.32.0", "google-cloud-texttospeech~=2.26.0", "google-genai~=1.24.0", "pipecat-ai[websockets-base]" ]
grok = []
groq = [ "groq~=0.23.0" ]
gstreamer = [ "pygobject~=3.50.0" ]
heygen = [ "livekit>=1.0.13", "pipecat-ai[websockets-base]" ]
hume = [ "hume>=0.11.2" ]
inworld = []
krisp = [ "pipecat-ai-krisp~=0.4.0" ]
koala = [ "pvkoala~=2.0.3" ]
@@ -85,7 +83,7 @@ nim = []
neuphonic = [ "pipecat-ai[websockets-base]" ]
noisereduce = [ "noisereduce~=3.0.3" ]
openai = [ "pipecat-ai[websockets-base]" ]
openpipe = [ "openpipe>=4.50.0,<6" ]
openpipe = [ "openpipe~=4.50.0" ]
openrouter = []
perplexity = []
playht = [ "pipecat-ai[websockets-base]" ]
@@ -103,7 +101,7 @@ silero = [ "onnxruntime>=1.20.1,<2" ]
simli = [ "simli-ai~=0.1.10"]
soniox = [ "pipecat-ai[websockets-base]" ]
soundfile = [ "soundfile~=0.13.0" ]
speechmatics = [ "speechmatics-rt>=0.5.0" ]
speechmatics = [ "speechmatics-rt>=0.4.0" ]
strands = [ "strands-agents>=1.9.1,<2" ]
tavus=[]
together = []

View File

@@ -67,7 +67,6 @@ TESTS_07 = [
("07ac-interruptible-asyncai-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07b-interruptible-langchain.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07c-interruptible-deepgram.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07c-interruptible-deepgram-flux.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07d-interruptible-elevenlabs.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
(
"07d-interruptible-elevenlabs-http.py",
@@ -75,6 +74,8 @@ TESTS_07 = [
EVAL_SIMPLE_MATH,
BOT_SPEAKS_FIRST,
),
("07e-interruptible-playht.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07e-interruptible-playht-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07f-interruptible-azure.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07g-interruptible-openai.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07h-interruptible-openpipe.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
@@ -101,7 +102,6 @@ TESTS_07 = [
("07w-interruptible-fal.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07y-interruptible-minimax.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07z-interruptible-sarvam.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07ae-interruptible-hume.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
# Needs a local XTTS docker instance running.
# ("07i-interruptible-xtts.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
# Needs a Krisp license.
@@ -136,7 +136,6 @@ TESTS_14 = [
("14r-function-calling-aws.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
("14v-function-calling-openai.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
("14w-function-calling-mistral.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
("14x-function-calling-openpipe.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
# Currently not working.
# ("14c-function-calling-together.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
# ("14l-function-calling-deepseek.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
@@ -148,10 +147,7 @@ TESTS_15 = [
]
TESTS_19 = [
("19-openai-realtime.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
("19-openai-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
# OpenAI Realtime not released on Azure yet
# ("19a-azure-realtime.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
("19a-azure-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
("19b-openai-realtime-text.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
("19b-openai-realtime-beta-text.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
@@ -164,18 +160,18 @@ TESTS_21 = [
TESTS_26 = [
("26-gemini-multimodal-live.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
(
"26a-gemini-live-transcription.py",
"26a-gemini-multimodal-live-transcription.py",
PROMPT_SIMPLE_MATH,
EVAL_SIMPLE_MATH,
BOT_SPEAKS_FIRST,
),
(
"26b-gemini-live-function-calling.py",
"26b-gemini-multimodal-live-function-calling.py",
PROMPT_WEATHER,
EVAL_WEATHER,
BOT_SPEAKS_FIRST,
),
("26c-gemini-live-video.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("26c-gemini-multimodal-live-video.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
(
"26e-gemini-multimodal-google-search.py",
PROMPT_ONLINE_SEARCH,
@@ -183,13 +179,7 @@ TESTS_26 = [
BOT_SPEAKS_FIRST,
),
# Currently not working.
# ("26d-gemini-live-text.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
(
"26h-gemini-live-vertex-function-calling.py",
PROMPT_WEATHER,
EVAL_WEATHER,
BOT_SPEAKS_FIRST,
),
# ("26d-gemini-multimodal-live-text.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
]
TESTS_27 = [

View File

@@ -6,47 +6,13 @@
"""AWS Nova Sonic LLM adapter for Pipecat."""
import copy
import json
from dataclasses import dataclass
from enum import Enum
from typing import Any, Dict, List, Optional, TypedDict
from loguru import logger
from typing import Any, Dict, List, TypedDict
from pipecat.adapters.base_llm_adapter import BaseLLMAdapter
from pipecat.adapters.schemas.function_schema import FunctionSchema
from pipecat.adapters.schemas.tools_schema import ToolsSchema
from pipecat.processors.aggregators.llm_context import LLMContext, LLMContextMessage
class Role(Enum):
"""Roles supported in AWS Nova Sonic conversations.
Parameters:
SYSTEM: System-level messages (not used in conversation history).
USER: Messages sent by the user.
ASSISTANT: Messages sent by the assistant.
TOOL: Messages sent by tools (not used in conversation history).
"""
SYSTEM = "SYSTEM"
USER = "USER"
ASSISTANT = "ASSISTANT"
TOOL = "TOOL"
@dataclass
class AWSNovaSonicConversationHistoryMessage:
"""A single message in AWS Nova Sonic conversation history.
Parameters:
role: The role of the message sender (USER or ASSISTANT only).
text: The text content of the message.
"""
role: Role # only USER and ASSISTANT
text: str
from pipecat.processors.aggregators.llm_context import LLMContext
class AWSNovaSonicLLMInvocationParams(TypedDict):
@@ -55,9 +21,7 @@ class AWSNovaSonicLLMInvocationParams(TypedDict):
This is a placeholder until support for universal LLMContext machinery is added for AWS Nova Sonic.
"""
system_instruction: Optional[str]
messages: List[AWSNovaSonicConversationHistoryMessage]
tools: List[Dict[str, Any]]
pass
class AWSNovaSonicLLMAdapter(BaseLLMAdapter[AWSNovaSonicLLMInvocationParams]):
@@ -70,7 +34,7 @@ class AWSNovaSonicLLMAdapter(BaseLLMAdapter[AWSNovaSonicLLMInvocationParams]):
@property
def id_for_llm_specific_messages(self) -> str:
"""Get the identifier used in LLMSpecificMessage instances for AWS Nova Sonic."""
return "aws-nova-sonic"
raise NotImplementedError("Universal LLMContext is not yet supported for AWS Nova Sonic.")
def get_llm_invocation_params(self, context: LLMContext) -> AWSNovaSonicLLMInvocationParams:
"""Get AWS Nova Sonic-specific LLM invocation parameters from a universal LLM context.
@@ -83,13 +47,7 @@ class AWSNovaSonicLLMAdapter(BaseLLMAdapter[AWSNovaSonicLLMInvocationParams]):
Returns:
Dictionary of parameters for invoking AWS Nova Sonic's LLM API.
"""
messages = self._from_universal_context_messages(self.get_messages(context))
return {
"system_instruction": messages.system_instruction,
"messages": messages.messages,
# NOTE: LLMContext's tools are guaranteed to be a ToolsSchema (or NOT_GIVEN)
"tools": self.from_standard_tools(context.tools) or [],
}
raise NotImplementedError("Universal LLMContext is not yet supported for AWS Nova Sonic.")
def get_messages_for_logging(self, context) -> List[Dict[str, Any]]:
"""Get messages from a universal LLM context in a format ready for logging about AWS Nova Sonic.
@@ -104,75 +62,7 @@ class AWSNovaSonicLLMAdapter(BaseLLMAdapter[AWSNovaSonicLLMInvocationParams]):
Returns:
List of messages in a format ready for logging about AWS Nova Sonic.
"""
return self._from_universal_context_messages(self.get_messages(context)).messages
@dataclass
class ConvertedMessages:
"""Container for Google-formatted messages converted from universal context."""
messages: List[AWSNovaSonicConversationHistoryMessage]
system_instruction: Optional[str] = None
def _from_universal_context_messages(
self, universal_context_messages: List[LLMContextMessage]
) -> ConvertedMessages:
system_instruction = None
messages = []
# Bail if there are no messages
if not universal_context_messages:
return self.ConvertedMessages()
universal_context_messages = copy.deepcopy(universal_context_messages)
# If we have a "system" message as our first message, let's pull that out into "instruction"
if universal_context_messages[0].get("role") == "system":
system = universal_context_messages.pop(0)
content = system.get("content")
if isinstance(content, str):
system_instruction = content
elif isinstance(content, list):
system_instruction = content[0].get("text")
if system_instruction:
self._system_instruction = system_instruction
# Process remaining messages to fill out conversation history.
# Nova Sonic supports "user" and "assistant" messages in history.
for universal_context_message in universal_context_messages:
message = self._from_universal_context_message(universal_context_message)
if message:
messages.append(message)
return self.ConvertedMessages(messages=messages, system_instruction=system_instruction)
def _from_universal_context_message(self, message) -> AWSNovaSonicConversationHistoryMessage:
"""Convert standard message format to Nova Sonic format.
Args:
message: Standard message dictionary to convert.
Returns:
Nova Sonic conversation history message, or None if not convertible.
"""
role = message.get("role")
if message.get("role") == "user" or message.get("role") == "assistant":
content = message.get("content")
if isinstance(message.get("content"), list):
content = ""
for c in message.get("content"):
if c.get("type") == "text":
content += " " + c.get("text")
else:
logger.error(
f"Unhandled content type in context message: {c.get('type')} - {message}"
)
# There won't be content if this is an assistant tool call entry.
# We're ignoring those since they can't be loaded into AWS Nova Sonic conversation
# history
if content:
return AWSNovaSonicConversationHistoryMessage(role=Role[role.upper()], text=content)
# NOTE: we're ignoring messages with role "tool" since they can't be loaded into AWS Nova
# Sonic conversation history
raise NotImplementedError("Universal LLMContext is not yet supported for AWS Nova Sonic.")
@staticmethod
def _to_aws_nova_sonic_function_format(function: FunctionSchema) -> Dict[str, Any]:

View File

@@ -87,11 +87,9 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
Includes both converted standard tools and any custom Gemini-specific tools.
"""
functions_schema = tools_schema.standard_tools
formatted_standard_tools = (
[{"function_declarations": [func.to_default_dict() for func in functions_schema]}]
if functions_schema
else []
)
formatted_standard_tools = [
{"function_declarations": [func.to_default_dict() for func in functions_schema]}
]
custom_gemini_tools = []
if tools_schema.custom_tools:
custom_gemini_tools = tools_schema.custom_tools.get(AdapterType.GEMINI, [])

View File

@@ -1,193 +0,0 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
"""Krisp noise reduction audio filter for Pipecat.
This module provides an audio filter implementation using Krisp VIVA SDK.
"""
import os
import numpy as np
from loguru import logger
from pipecat.audio.filters.base_audio_filter import BaseAudioFilter
from pipecat.frames.frames import FilterControlFrame, FilterEnableFrame
try:
import krisp_audio
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error("In order to use the Krisp filter, you need to install krisp_audio.")
raise Exception(f"Missing module: {e}")
def _log_callback(log_message, log_level):
logger.info(f"[{log_level}] {log_message}")
class KrispVivaFilter(BaseAudioFilter):
"""Audio filter using the Krisp VIVA SDK.
Provides real-time noise reduction for audio streams using Krisp's
proprietary noise suppression algorithms. This filter requires a
valid Krisp model file to operate.
Supported sample rates:
- 8000 Hz
- 16000 Hz
- 24000 Hz
- 32000 Hz
- 44100 Hz
- 48000 Hz
"""
# Initialize Krisp Audio SDK globally
krisp_audio.globalInit("", _log_callback, krisp_audio.LogLevel.Off)
SDK_VERSION = krisp_audio.getVersion()
logger.debug(
f"Krisp Audio Python SDK Version: {SDK_VERSION.major}."
f"{SDK_VERSION.minor}.{SDK_VERSION.patch}"
)
SAMPLE_RATES = {
8000: krisp_audio.SamplingRate.Sr8000Hz,
16000: krisp_audio.SamplingRate.Sr16000Hz,
24000: krisp_audio.SamplingRate.Sr24000Hz,
32000: krisp_audio.SamplingRate.Sr32000Hz,
44100: krisp_audio.SamplingRate.Sr44100Hz,
48000: krisp_audio.SamplingRate.Sr48000Hz,
}
FRAME_SIZE_MS = 10 # Krisp requires audio frames of 10ms duration for processing.
def __init__(self, model_path: str = None, noise_suppression_level: int = 100) -> None:
"""Initialize the Krisp noise reduction filter.
Args:
model_path: Path to the Krisp model file (.kef extension).
If None, uses KRISP_VIVA_MODEL_PATH environment variable.
noise_suppression_level: Noise suppression level.
Raises:
ValueError: If model_path is not provided and KRISP_VIVA_MODEL_PATH is not set.
Exception: If model file doesn't have .kef extension.
FileNotFoundError: If model file doesn't exist.
"""
super().__init__()
# Set model path, checking environment if not specified
self._model_path = model_path or os.getenv("KRISP_VIVA_MODEL_PATH")
if not self._model_path:
logger.error("Model path is not provided and KRISP_VIVA_MODEL_PATH is not set.")
raise ValueError("Model path for KrispAudioProcessor must be provided.")
if not self._model_path.endswith(".kef"):
raise Exception("Model is expected with .kef extension")
if not os.path.isfile(self._model_path):
raise FileNotFoundError(f"Model file not found: {self._model_path}")
self._filtering = True
self._session = None
self._samples_per_frame = None
self._noise_suppression_level = noise_suppression_level
# Audio buffer to accumulate samples for complete frames
self._audio_buffer = bytearray()
def _int_to_sample_rate(self, sample_rate):
"""Convert integer sample rate to krisp_audio SamplingRate enum.
Args:
sample_rate: Sample rate as integer
Returns:
krisp_audio.SamplingRate enum value
Raises:
ValueError: If sample rate is not supported
"""
if sample_rate not in self.SAMPLE_RATES:
raise ValueError("Unsupported sample rate")
return self.SAMPLE_RATES[sample_rate]
async def start(self, sample_rate: int):
"""Initialize the Krisp processor with the transport's sample rate.
Args:
sample_rate: The sample rate of the input transport in Hz.
"""
model_info = krisp_audio.ModelInfo()
model_info.path = self._model_path
nc_cfg = krisp_audio.NcSessionConfig()
nc_cfg.inputSampleRate = self._int_to_sample_rate(sample_rate)
nc_cfg.inputFrameDuration = krisp_audio.FrameDuration.Fd10ms
nc_cfg.outputSampleRate = nc_cfg.inputSampleRate
nc_cfg.modelInfo = model_info
self._samples_per_frame = int((sample_rate * self.FRAME_SIZE_MS) / 1000)
self._session = krisp_audio.NcInt16.create(nc_cfg)
async def stop(self):
"""Clean up the Krisp processor when stopping."""
self._session = None
async def process_frame(self, frame: FilterControlFrame):
"""Process control frames to enable/disable filtering.
Args:
frame: The control frame containing filter commands.
"""
if isinstance(frame, FilterEnableFrame):
self._filtering = frame.enable
async def filter(self, audio: bytes) -> bytes:
"""Apply Krisp noise reduction to audio data.
Args:
audio: Raw audio data as bytes to be filtered.
Returns:
Noise-reduced audio data as bytes.
"""
if not self._filtering:
return audio
# Add incoming audio to our buffer
self._audio_buffer.extend(audio)
# Calculate how many complete frames we can process
total_samples = len(self._audio_buffer) // 2 # 2 bytes per int16 sample
num_complete_frames = total_samples // self._samples_per_frame
if num_complete_frames == 0:
# Not enough samples for a complete frame yet, return empty
return b""
# Calculate how many bytes we need for complete frames
complete_samples_count = num_complete_frames * self._samples_per_frame
bytes_to_process = complete_samples_count * 2 # 2 bytes per sample
# Extract the bytes we can process
audio_to_process = bytes(self._audio_buffer[:bytes_to_process])
# Remove processed bytes from buffer, keep the remainder
self._audio_buffer = self._audio_buffer[bytes_to_process:]
# Process the complete frames
samples = np.frombuffer(audio_to_process, dtype=np.int16)
frames = samples.reshape(-1, self._samples_per_frame)
processed_samples = np.empty_like(samples)
for i, frame in enumerate(frames):
cleaned_frame = self._session.process(frame, self._noise_suppression_level)
processed_samples[i * self._samples_per_frame : (i + 1) * self._samples_per_frame] = (
cleaned_frame
)
return processed_samples.tobytes()

View File

@@ -672,7 +672,7 @@ class TTSSpeakFrame(DataFrame):
@dataclass
class OutputTransportMessageFrame(DataFrame):
class TransportMessageFrame(DataFrame):
"""Frame containing transport-specific message data.
Parameters:
@@ -685,32 +685,6 @@ class OutputTransportMessageFrame(DataFrame):
return f"{self.name}(message: {self.message})"
@dataclass
class TransportMessageFrame(OutputTransportMessageFrame):
"""Frame containing transport-specific message data.
.. deprecated:: 0.0.87
This frame is deprecated and will be removed in a future version.
Instead, use `OutputTransportMessageFrame`.
Parameters:
message: The transport message payload.
"""
def __post_init__(self):
super().__post_init__()
import warnings
with warnings.catch_warnings():
warnings.simplefilter("always")
warnings.warn(
"TransportMessageFrame is deprecated and will be removed in a future version. "
"Instead, use OutputTransportMessageFrame.",
DeprecationWarning,
stacklevel=2,
)
@dataclass
class DTMFFrame:
"""Base class for DTMF (Dual-Tone Multi-Frequency) keypad frames.
@@ -1118,8 +1092,8 @@ class STTMuteFrame(SystemFrame):
@dataclass
class InputTransportMessageFrame(SystemFrame):
"""Frame for transport messages received from external sources.
class TransportMessageUrgentFrame(SystemFrame):
"""Frame for urgent transport messages that need immediate processing.
Parameters:
message: The urgent transport message payload.
@@ -1132,69 +1106,20 @@ class InputTransportMessageFrame(SystemFrame):
@dataclass
class InputTransportMessageUrgentFrame(InputTransportMessageFrame):
class InputTransportMessageUrgentFrame(TransportMessageUrgentFrame):
"""Frame for transport messages received from external sources.
.. deprecated:: 0.0.87
This frame is deprecated and will be removed in a future version.
Instead, use `InputTransportMessageFrame`.
This frame wraps incoming transport messages to distinguish them from outgoing
urgent transport messages (TransportMessageUrgentFrame), preventing infinite
message loops in the transport layer. It inherits the message payload from
TransportMessageFrame while marking the message as having been received
rather than generated locally.
Parameters:
message: The urgent transport message payload.
Used by transport implementations to properly handle bidirectional message
flow without creating feedback loops.
"""
def __post_init__(self):
super().__post_init__()
import warnings
with warnings.catch_warnings():
warnings.simplefilter("always")
warnings.warn(
"InputTransportMessageUrgentFrame is deprecated and will be removed in a future version. "
"Instead, use InputTransportMessageFrame.",
DeprecationWarning,
stacklevel=2,
)
@dataclass
class OutputTransportMessageUrgentFrame(SystemFrame):
"""Frame for urgent transport messages that need to be sent immediately.
Parameters:
message: The urgent transport message payload.
"""
message: Any
def __str__(self):
return f"{self.name}(message: {self.message})"
@dataclass
class TransportMessageUrgentFrame(OutputTransportMessageUrgentFrame):
"""Frame for urgent transport messages that need to be sent immediately.
.. deprecated:: 0.0.87
This frame is deprecated and will be removed in a future version.
Instead, use `OutputTransportMessageUrgentFrame`.
Parameters:
message: The urgent transport message payload.
"""
def __post_init__(self):
super().__post_init__()
import warnings
with warnings.catch_warnings():
warnings.simplefilter("always")
warnings.warn(
"TransportMessageUrgentFrame is deprecated and will be removed in a future version. "
"Instead, use OutputTransportMessageFrame.",
DeprecationWarning,
stacklevel=2,
)
pass
@dataclass

View File

@@ -70,15 +70,11 @@ class PipelineRunner(BaseObject):
"""
logger.debug(f"Runner {self} started running {task}")
self._tasks[task.name] = task
# PipelineTask handles asyncio.CancelledError to shutdown the pipeline
# properly and re-raises it in case there's more cleanup to do.
params = PipelineTaskParams(loop=self._loop)
try:
params = PipelineTaskParams(loop=self._loop)
await task.run(params)
except asyncio.CancelledError:
pass
await self._cancel()
del self._tasks[task.name]
# Cleanup base object.

View File

@@ -130,16 +130,12 @@ class PipelineTask(BasePipelineTask):
- on_pipeline_finished: Called after the pipeline has reached any terminal state.
This includes:
- StopFrame: pipeline was stopped (processors keep connections open)
- EndFrame: pipeline ended normally
- CancelFrame: pipeline was cancelled
Use this event for cleanup, logging, or post-processing tasks. Users can inspect
the frame if they need to handle specific cases.
- on_pipeline_error: Called when an error occurs with ErrorFrame
Example::
@task.event_handler("on_frame_reached_upstream")
@@ -150,17 +146,9 @@ class PipelineTask(BasePipelineTask):
async def on_pipeline_idle_timeout(task):
...
@task.event_handler("on_pipeline_started")
async def on_pipeline_started(task, frame):
...
@task.event_handler("on_pipeline_finished")
async def on_pipeline_finished(task, frame):
...
@task.event_handler("on_pipeline_error")
async def on_pipeline_error(task, frame):
...
"""
def __init__(
@@ -269,9 +257,6 @@ class PipelineTask(BasePipelineTask):
# StopFrame) has been received at the end of the pipeline.
self._pipeline_end_event = asyncio.Event()
# This event is set when the pipeline truly finishes.
self._pipeline_finished_event = asyncio.Event()
# This is the final pipeline. It is composed of a source processor,
# followed by the user pipeline, and ending with a sink processor. The
# source allows us to receive and react to upstream frames, and the sink
@@ -301,7 +286,6 @@ class PipelineTask(BasePipelineTask):
self._register_event_handler("on_pipeline_ended")
self._register_event_handler("on_pipeline_cancelled")
self._register_event_handler("on_pipeline_finished")
self._register_event_handler("on_pipeline_error")
@property
def params(self) -> PipelineParams:
@@ -404,7 +388,11 @@ class PipelineTask(BasePipelineTask):
await self.queue_frame(EndFrame())
async def cancel(self):
"""Request the running pipeline to cancel."""
"""Immediately stop the running pipeline.
Cancels all running tasks and stops frame processing without
waiting for completion.
"""
if not self._finished:
await self._cancel()
@@ -416,38 +404,51 @@ class PipelineTask(BasePipelineTask):
"""
if self.has_finished():
return
# Setup processors.
await self._setup(params)
# Create all main tasks and wait for the main push task. This is the
# task that pushes frames to the very beginning of our pipeline (i.e. to
# our controlled source processor).
await self._create_tasks()
cleanup_pipeline = True
try:
# Wait for pipeline to finish.
await self._wait_for_pipeline_finished()
# Setup processors.
await self._setup(params)
# Create all main tasks and wait of the main push task. This is the
# task that pushes frames to the very beginning of our pipeline (our
# controlled source processor).
push_task = await self._create_tasks()
await push_task
# We have already cleaned up the pipeline inside the task.
cleanup_pipeline = False
# Pipeline has finished nicely.
self._finished = True
except asyncio.CancelledError:
logger.debug(f"Pipeline task {self} got cancelled from outside...")
# We have been cancelled from outside, let's just cancel everything.
await self._cancel()
# Wait again for pipeline to finish. This time we have really
# cancelled, so it should really finish.
await self._wait_for_pipeline_finished()
# Re-raise in case there's more cleanup to do.
# Raise exception back to the pipeline runner so it can cancel this
# task properly.
raise
finally:
# We can reach this point for different reasons:
#
# 1. The pipeline task has finished (try case).
# 2. By an asyncio task cancellation (except case).
logger.debug(f"Pipeline task {self} is finishing...")
await self._cancel_tasks()
if self._check_dangling_tasks:
self._print_dangling_tasks()
self._finished = True
logger.debug(f"Pipeline task {self} has finished")
# 1. The task has finished properly (e.g. `EndFrame`).
# 2. By calling `PipelineTask.cancel()`.
# 3. By asyncio task cancellation.
#
# Case (1) will execute the code below without issues because
# `self._finished` is true.
#
# Case (2) will execute the code below without issues because
# `self._cancelled` is true.
#
# Case (3) will raise the exception above (because we are cancelling
# the asyncio task). This will be then captured by the
# `PipelineRunner` which will call `PipelineTask.cancel()` and
# therefore becoming case (2).
if self._finished or self._cancelled:
logger.debug(f"Pipeline task {self} is finishing cleanup...")
await self._cancel_tasks()
await self._cleanup(cleanup_pipeline)
if self._check_dangling_tasks:
self._print_dangling_tasks()
self._finished = True
logger.debug(f"Pipeline task {self} has finished")
async def queue_frame(self, frame: Frame):
"""Queue a single frame to be pushed down the pipeline.
@@ -475,7 +476,19 @@ class PipelineTask(BasePipelineTask):
if not self._cancelled:
logger.debug(f"Cancelling pipeline task {self}")
self._cancelled = True
await self.queue_frame(CancelFrame())
cancel_frame = CancelFrame()
# Make sure everything is cleaned up downstream. This is sent
# out-of-band from the main streaming task which is what we want since
# we want to cancel right away.
await self._pipeline.queue_frame(cancel_frame)
# Wait for CancelFrame to make it through the pipeline.
await self._wait_for_pipeline_end(cancel_frame)
# Only cancel the push task, we don't want to be able to process any
# other frame after cancel. Everything else will be cancelled in
# run().
if self._process_push_task:
await self._task_manager.cancel_task(self._process_push_task)
self._process_push_task = None
async def _create_tasks(self):
"""Create and start all pipeline processing tasks."""
@@ -577,17 +590,6 @@ class PipelineTask(BasePipelineTask):
self._pipeline_end_event.clear()
# We are really done.
self._pipeline_finished_event.set()
async def _wait_for_pipeline_finished(self):
await self._pipeline_finished_event.wait()
self._pipeline_finished_event.clear()
# Make sure we wait for the main task to complete.
if self._process_push_task:
await self._process_push_task
self._process_push_task = None
async def _setup(self, params: PipelineTaskParams):
"""Set up the pipeline task and all processors."""
mgr_params = TaskManagerParams(loop=params.loop)
@@ -690,11 +692,12 @@ class PipelineTask(BasePipelineTask):
logger.debug(f"{self}: received interruption task frame {frame}")
await self._pipeline.queue_frame(InterruptionFrame())
elif isinstance(frame, ErrorFrame):
await self._call_event_handler("on_pipeline_error", frame)
if frame.fatal:
logger.error(f"A fatal error occurred: {frame}")
# Cancel all tasks downstream.
await self.queue_frame(CancelFrame())
# Tell the task we should stop.
await self.queue_frame(StopTaskFrame())
else:
logger.warning(f"{self}: Something went wrong: {frame}")

View File

@@ -15,10 +15,9 @@ service-specific adapter.
"""
import base64
import copy
import io
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, List, Optional, TypeAlias, Union
from typing import Any, List, Optional, TypeAlias, Union
from loguru import logger
from openai._types import NOT_GIVEN as OPEN_AI_NOT_GIVEN
@@ -32,9 +31,6 @@ from PIL import Image
from pipecat.adapters.schemas.tools_schema import ToolsSchema
from pipecat.frames.frames import AudioRawFrame
if TYPE_CHECKING:
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
# "Re-export" types from OpenAI that we're using as universal context types.
# NOTE: if universal message types need to someday diverge from OpenAI's, we
# should consider managing our own definitions. But we should do so carefully,
@@ -69,26 +65,6 @@ class LLMContext:
and content formatting.
"""
@staticmethod
def from_openai_context(openai_context: "OpenAILLMContext") -> "LLMContext":
"""Create a universal LLM context from an OpenAI-specific context.
NOTE: this should only be used internally, for facilitating migration
from OpenAILLMContext to LLMContext. New user code should use
LLMContext directly.
Args:
openai_context: The OpenAI LLM context to convert.
Returns:
New LLMContext instance with converted messages and settings.
"""
return LLMContext(
messages=openai_context.get_messages(),
tools=openai_context.tools,
tool_choice=openai_context.tool_choice,
)
def __init__(
self,
messages: Optional[List[LLMContextMessage]] = None,

View File

@@ -877,8 +877,6 @@ class FrameProcessor(BaseObject):
"""
while True:
(frame, direction, callback) = await self.__input_queue.get()
if self.__should_block_system_frames and self.__input_event:
logger.trace(f"{self}: system frame processing paused")
await self.__input_event.wait()
@@ -886,6 +884,8 @@ class FrameProcessor(BaseObject):
self.__should_block_system_frames = False
logger.trace(f"{self}: system frame processing resumed")
(frame, direction, callback) = await self.__input_queue.get()
if isinstance(frame, SystemFrame):
await self.__process_frame(frame, direction, callback)
elif self.__process_queue:
@@ -900,8 +900,6 @@ class FrameProcessor(BaseObject):
async def __process_frame_task_handler(self):
"""Handle non-system frames from the process queue."""
while True:
(frame, direction, callback) = await self.__process_queue.get()
if self.__should_block_frames and self.__process_event:
logger.trace(f"{self}: frame processing paused")
await self.__process_event.wait()
@@ -909,6 +907,8 @@ class FrameProcessor(BaseObject):
self.__should_block_frames = False
logger.trace(f"{self}: frame processing resumed")
(frame, direction, callback) = await self.__process_queue.get()
await self.__process_frame(frame, direction, callback)
self.__process_queue.task_done()

View File

@@ -42,7 +42,6 @@ from pipecat.frames.frames import (
Frame,
FunctionCallResultFrame,
InputAudioRawFrame,
InputTransportMessageFrame,
InterimTranscriptionFrame,
LLMConfigureOutputFrame,
LLMContextFrame,
@@ -51,10 +50,10 @@ from pipecat.frames.frames import (
LLMMessagesAppendFrame,
LLMTextFrame,
MetricsFrame,
OutputTransportMessageUrgentFrame,
StartFrame,
SystemFrame,
TranscriptionFrame,
TransportMessageUrgentFrame,
TTSAudioRawFrame,
TTSStartedFrame,
TTSStoppedFrame,
@@ -920,7 +919,7 @@ class RTVIObserverParams:
user_audio_level_enabled: bool = False
metrics_enabled: bool = True
system_logs_enabled: bool = False
errors_enabled: Optional[bool] = None
errors_enabled: bool = True
audio_level_period_secs: float = 0.15
@@ -963,7 +962,7 @@ class RTVIObserver(BaseObserver):
if self._params.system_logs_enabled:
self._system_logger_id = logger.add(self._logger_sink)
if self._params.errors_enabled is not None:
if self._params.errors_enabled:
import warnings
with warnings.catch_warnings():
@@ -1210,10 +1209,11 @@ class RTVIObserver(BaseObserver):
async def _send_error_response(self, frame: RTVIServerResponseFrame):
"""Send a response to the client for a specific request."""
message = RTVIErrorResponse(
id=str(frame.client_msg.msg_id), data=RTVIErrorResponseData(error=frame.error)
)
await self.send_rtvi_message(message)
if self._params.errors_enabled:
message = RTVIErrorResponse(
id=str(frame.client_msg.msg_id), data=RTVIErrorResponseData(error=frame.error)
)
await self.send_rtvi_message(message)
class RTVIProcessor(FrameProcessor):
@@ -1346,9 +1346,7 @@ class RTVIProcessor(FrameProcessor):
async def push_transport_message(self, model: BaseModel, exclude_none: bool = True):
"""Push a transport message frame."""
frame = OutputTransportMessageUrgentFrame(
message=model.model_dump(exclude_none=exclude_none)
)
frame = TransportMessageUrgentFrame(message=model.model_dump(exclude_none=exclude_none))
await self.push_frame(frame)
async def handle_message(self, message: RTVIMessage):
@@ -1421,7 +1419,7 @@ class RTVIProcessor(FrameProcessor):
elif isinstance(frame, ErrorFrame):
await self._send_error_frame(frame)
await self.push_frame(frame, direction)
elif isinstance(frame, InputTransportMessageFrame):
elif isinstance(frame, TransportMessageUrgentFrame):
await self._handle_transport_message(frame)
# All other system frames
elif isinstance(frame, SystemFrame):
@@ -1484,7 +1482,7 @@ class RTVIProcessor(FrameProcessor):
await self._handle_message(message)
self._message_queue.task_done()
async def _handle_transport_message(self, frame: InputTransportMessageFrame):
async def _handle_transport_message(self, frame: TransportMessageUrgentFrame):
"""Handle an incoming transport message frame."""
try:
transport_message = frame.message

View File

@@ -15,7 +15,7 @@ from pipecat.frames.frames import (
Frame,
InputAudioRawFrame,
OutputAudioRawFrame,
UserSpeakingFrame,
TransportMessageFrame,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
@@ -36,9 +36,9 @@ class FrameLogger(FrameProcessor):
color: Optional[str] = None,
ignored_frame_types: Tuple[Type[Frame], ...] = (
BotSpeakingFrame,
UserSpeakingFrame,
InputAudioRawFrame,
OutputAudioRawFrame,
TransportMessageFrame,
),
):
"""Initialize the frame logger.

View File

@@ -82,7 +82,6 @@ async def configure(
sip_enable_video: Optional[bool] = False,
sip_num_endpoints: Optional[int] = 1,
sip_codecs: Optional[Dict[str, List[str]]] = None,
room_properties: Optional[DailyRoomProperties] = None,
) -> DailyRoomConfig:
"""Configure Daily room URL and token with optional SIP capabilities.
@@ -100,10 +99,6 @@ async def configure(
sip_num_endpoints: Number of allowed SIP endpoints.
sip_codecs: Codecs to support for audio and video. If None, uses Daily defaults.
Example: {"audio": ["OPUS"], "video": ["H264"]}
room_properties: Optional DailyRoomProperties to use instead of building from
individual parameters. When provided, this overrides room_exp_duration and
SIP-related parameters. If not provided, properties are built from the
individual parameters as before.
Returns:
DailyRoomConfig: Object with room_url, token, and optional sip_endpoint.
@@ -120,13 +115,6 @@ async def configure(
# SIP-enabled room
sip_config = await configure(session, sip_caller_phone="+15551234567")
print(f"SIP endpoint: {sip_config.sip_endpoint}")
# Custom room properties with recording enabled
custom_props = DailyRoomProperties(
enable_recording="cloud",
max_participants=2,
)
config = await configure(session, room_properties=custom_props)
"""
# Check for required API key
api_key = os.getenv("DAILY_API_KEY")
@@ -136,32 +124,9 @@ async def configure(
"Get your API key from https://dashboard.daily.co/developers"
)
# Warn if both room_properties and individual parameters are provided
if room_properties is not None:
individual_params_provided = any(
[
room_exp_duration != 2.0,
token_exp_duration != 2.0,
sip_caller_phone is not None,
sip_enable_video is not False,
sip_num_endpoints != 1,
sip_codecs is not None,
]
)
if individual_params_provided:
logger.warning(
"Both room_properties and individual parameters (room_exp_duration, token_exp_duration, "
"sip_*) were provided. The room_properties will be used and individual parameters "
"will be ignored."
)
# Determine if SIP mode is enabled
sip_enabled = sip_caller_phone is not None
# If room_properties is provided, check if it has SIP configuration
if room_properties and room_properties.sip:
sip_enabled = True
daily_rest_helper = DailyRESTHelper(
daily_api_key=api_key,
daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"),
@@ -185,29 +150,27 @@ async def configure(
room_name = f"{room_prefix}-{uuid.uuid4().hex[:8]}"
logger.info(f"Creating new Daily room: {room_name}")
# Use provided room_properties or build from parameters
if room_properties is None:
# Calculate expiration time
expiration_time = time.time() + (room_exp_duration * 60 * 60)
# Calculate expiration time
expiration_time = time.time() + (room_exp_duration * 60 * 60)
# Create room properties
room_properties = DailyRoomProperties(
exp=expiration_time,
eject_at_room_exp=True,
# Create room properties
room_properties = DailyRoomProperties(
exp=expiration_time,
eject_at_room_exp=True,
)
# Add SIP configuration if enabled
if sip_enabled:
sip_params = DailyRoomSipParams(
display_name=sip_caller_phone,
video=sip_enable_video,
sip_mode="dial-in",
num_endpoints=sip_num_endpoints,
codecs=sip_codecs,
)
# Add SIP configuration if enabled
if sip_enabled:
sip_params = DailyRoomSipParams(
display_name=sip_caller_phone,
video=sip_enable_video,
sip_mode="dial-in",
num_endpoints=sip_num_endpoints,
codecs=sip_codecs,
)
room_properties.sip = sip_params
room_properties.enable_dialout = True # Enable outbound calls if needed
room_properties.start_video_off = not sip_enable_video # Voice-only by default
room_properties.sip = sip_params
room_properties.enable_dialout = True # Enable outbound calls if needed
room_properties.start_video_off = not sip_enable_video # Voice-only by default
# Create room parameters
room_params = DailyRoomParams(name=room_name, properties=room_properties)

View File

@@ -67,17 +67,10 @@ To run locally:
import argparse
import asyncio
import mimetypes
import os
import sys
import uuid
from contextlib import asynccontextmanager
from http import HTTPMethod
from pathlib import Path
from typing import Any, Dict, List, Optional, TypedDict
import aiohttp
from fastapi.responses import FileResponse, Response
from loguru import logger
from pipecat.runner.types import (
@@ -89,7 +82,7 @@ from pipecat.runner.types import (
try:
import uvicorn
from dotenv import load_dotenv
from fastapi import BackgroundTasks, FastAPI, Header, HTTPException, Request, WebSocket
from fastapi import BackgroundTasks, FastAPI, Request, WebSocket
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse, RedirectResponse
except ImportError as e:
@@ -103,12 +96,6 @@ except ImportError as e:
load_dotenv(override=True)
os.environ["ENV"] = "local"
TELEPHONY_TRANSPORTS = ["twilio", "telnyx", "plivo", "exotel"]
RUNNER_DOWNLOADS_FOLDER: Optional[str] = None
RUNNER_HOST: str = "localhost"
RUNNER_PORT: int = 7860
def _get_bot_module():
"""Get the bot module from the calling script."""
@@ -163,13 +150,7 @@ async def _run_telephony_bot(websocket: WebSocket):
def _create_server_app(
*,
transport_type: str,
host: str = "localhost",
proxy: str,
esp32_mode: bool = False,
whatsapp_enabled: bool = False,
folder: Optional[str] = None,
transport_type: str, host: str = "localhost", proxy: str = None, esp32_mode: bool = False
):
"""Create FastAPI app with transport-specific routes."""
app = FastAPI()
@@ -184,30 +165,24 @@ def _create_server_app(
# Set up transport-specific routes
if transport_type == "webrtc":
_setup_webrtc_routes(app, esp32_mode=esp32_mode, host=host, folder=folder)
if whatsapp_enabled:
_setup_whatsapp_routes(app)
_setup_webrtc_routes(app, esp32_mode=esp32_mode, host=host)
elif transport_type == "daily":
_setup_daily_routes(app)
elif transport_type in TELEPHONY_TRANSPORTS:
_setup_telephony_routes(app, transport_type=transport_type, proxy=proxy)
elif transport_type in ["twilio", "telnyx", "plivo", "exotel"]:
_setup_telephony_routes(app, transport_type, proxy)
else:
logger.warning(f"Unknown transport type: {transport_type}")
return app
def _setup_webrtc_routes(
app: FastAPI, *, esp32_mode: bool = False, host: str = "localhost", folder: Optional[str] = None
):
def _setup_webrtc_routes(app: FastAPI, esp32_mode: bool = False, host: str = "localhost"):
"""Set up WebRTC-specific routes."""
try:
from pipecat_ai_small_webrtc_prebuilt.frontend import SmallWebRTCPrebuiltUI
from pipecat.transports.smallwebrtc.connection import IceServer, SmallWebRTCConnection
from pipecat.transports.smallwebrtc.connection import SmallWebRTCConnection
from pipecat.transports.smallwebrtc.request_handler import (
IceCandidate,
SmallWebRTCPatchRequest,
SmallWebRTCRequest,
SmallWebRTCRequestHandler,
)
@@ -215,16 +190,6 @@ def _setup_webrtc_routes(
logger.error(f"WebRTC transport dependencies not installed: {e}")
return
class IceConfig(TypedDict):
iceServers: List[IceServer]
class StartBotResult(TypedDict, total=False):
sessionId: str
iceConfig: Optional[IceConfig]
# In-memory store of active sessions: session_id -> session info
active_sessions: Dict[str, Dict[str, Any]] = {}
# Mount the frontend
app.mount("/client", SmallWebRTCPrebuiltUI)
@@ -233,21 +198,6 @@ def _setup_webrtc_routes(
"""Redirect root requests to client interface."""
return RedirectResponse(url="/client/")
@app.get("/files/{filename:path}")
async def download_file(filename: str):
"""Handle file downloads."""
if not folder:
logger.warning(f"Attempting to dowload {filename}, but downloads folder not setup.")
return
file_path = Path(folder) / filename
if not os.path.exists(file_path):
raise HTTPException(404)
media_type, _ = mimetypes.guess_type(file_path)
return FileResponse(path=file_path, media_type=media_type, filename=filename)
# Initialize the SmallWebRTC request handler
small_webrtc_handler: SmallWebRTCRequestHandler = SmallWebRTCRequestHandler(
esp32_mode=esp32_mode, host=host
@@ -270,259 +220,13 @@ def _setup_webrtc_routes(
)
return answer
@app.patch("/api/offer")
async def ice_candidate(request: SmallWebRTCPatchRequest):
"""Handle WebRTC new ice candidate requests."""
logger.debug(f"Received patch request: {request}")
await small_webrtc_handler.handle_patch_request(request)
return {"status": "success"}
@app.post("/start")
async def rtvi_start(request: Request):
"""Mimic Pipecat Cloud's /start endpoint."""
# Parse the request body
try:
request_data = await request.json()
logger.debug(f"Received request: {request_data}")
except Exception as e:
logger.error(f"Failed to parse request body: {e}")
request_data = {}
# Store session info immediately in memory, replicate the behavior expected on Pipecat Cloud
session_id = str(uuid.uuid4())
active_sessions[session_id] = request_data
result: StartBotResult = {"sessionId": session_id}
if request_data.get("enableDefaultIceServers"):
result["iceConfig"] = IceConfig(
iceServers=[IceServer(urls="stun:stun.l.google.com:19302")]
)
return result
@app.api_route(
"/sessions/{session_id}/{path:path}",
methods=["GET", "POST", "PUT", "PATCH", "DELETE"],
)
async def proxy_request(
session_id: str, path: str, request: Request, background_tasks: BackgroundTasks
):
"""Mimic Pipecat Cloud's proxy."""
active_session = active_sessions.get(session_id)
if not active_session:
return Response(content="Invalid or not-yet-ready session_id", status_code=404)
if path.endswith("api/offer"):
# Parse the request body and convert to SmallWebRTCRequest
try:
request_data = await request.json()
if request.method == HTTPMethod.POST.value:
webrtc_request = SmallWebRTCRequest(
sdp=request_data["sdp"],
type=request_data["type"],
pc_id=request_data.get("pc_id"),
restart_pc=request_data.get("restart_pc"),
request_data=request_data,
)
return await offer(webrtc_request, background_tasks)
elif request.method == HTTPMethod.PATCH.value:
patch_request = SmallWebRTCPatchRequest(
pc_id=request_data["pc_id"],
candidates=[IceCandidate(**c) for c in request_data.get("candidates", [])],
)
return await ice_candidate(patch_request)
except Exception as e:
logger.error(f"Failed to parse WebRTC request: {e}")
return Response(content="Invalid WebRTC request", status_code=400)
logger.info(f"Received request for path: {path}")
return Response(status_code=200)
@asynccontextmanager
async def smallwebrtc_lifespan(app: FastAPI):
async def lifespan(app: FastAPI):
"""Manage FastAPI application lifecycle and cleanup connections."""
yield
await small_webrtc_handler.close()
# Add the SmallWebRTC lifespan to the app
_add_lifespan_to_app(app, smallwebrtc_lifespan)
def _add_lifespan_to_app(app: FastAPI, new_lifespan):
"""Add a new lifespan context manager to the app, combining with existing if present.
Args:
app: The FastAPI application instance
new_lifespan: The new lifespan context manager to add
"""
if hasattr(app.router, "lifespan_context") and app.router.lifespan_context is not None:
# If there's already a lifespan context, combine them
existing_lifespan = app.router.lifespan_context
@asynccontextmanager
async def combined_lifespan(app: FastAPI):
async with existing_lifespan(app):
async with new_lifespan(app):
yield
app.router.lifespan_context = combined_lifespan
else:
# No existing lifespan, use the new one
app.router.lifespan_context = new_lifespan
def _setup_whatsapp_routes(app: FastAPI):
"""Set up WebRTC-specific routes."""
WHATSAPP_APP_SECRET = os.getenv("WHATSAPP_APP_SECRET")
WHATSAPP_PHONE_NUMBER_ID = os.getenv("WHATSAPP_PHONE_NUMBER_ID")
WHATSAPP_TOKEN = os.getenv("WHATSAPP_TOKEN")
WHATSAPP_WEBHOOK_VERIFICATION_TOKEN = os.getenv("WHATSAPP_WEBHOOK_VERIFICATION_TOKEN")
if not all(
[
WHATSAPP_APP_SECRET,
WHATSAPP_PHONE_NUMBER_ID,
WHATSAPP_TOKEN,
WHATSAPP_WEBHOOK_VERIFICATION_TOKEN,
]
):
logger.error(
"""Missing required environment variables for WhatsApp transport:
WHATSAPP_APP_SECRET
WHATSAPP_PHONE_NUMBER_ID
WHATSAPP_TOKEN
WHATSAPP_WEBHOOK_VERIFICATION_TOKEN
"""
)
return
try:
from pipecat_ai_small_webrtc_prebuilt.frontend import SmallWebRTCPrebuiltUI
from pipecat.transports.smallwebrtc.connection import SmallWebRTCConnection
from pipecat.transports.smallwebrtc.request_handler import (
SmallWebRTCRequest,
SmallWebRTCRequestHandler,
)
from pipecat.transports.whatsapp.api import WhatsAppWebhookRequest
from pipecat.transports.whatsapp.client import WhatsAppClient
except ImportError as e:
logger.error(f"WhatsApp transport dependencies not installed: {e}")
return
# Global WhatsApp client instance
whatsapp_client: Optional[WhatsAppClient] = None
@app.get(
"/whatsapp",
summary="Verify WhatsApp webhook",
description="Handles WhatsApp webhook verification requests from Meta",
)
async def verify_webhook(request: Request):
"""Verify WhatsApp webhook endpoint.
This endpoint is called by Meta's WhatsApp Business API to verify
the webhook URL during setup. It validates the verification token
and returns the challenge parameter if successful.
"""
if whatsapp_client is None:
logger.error("WhatsApp client is not initialized")
raise HTTPException(status_code=503, detail="Service unavailable")
params = dict(request.query_params)
logger.debug(f"Webhook verification request received with params: {list(params.keys())}")
try:
result = await whatsapp_client.handle_verify_webhook_request(
params=params, expected_verification_token=WHATSAPP_WEBHOOK_VERIFICATION_TOKEN
)
logger.info("Webhook verification successful")
return result
except ValueError as e:
logger.warning(f"Webhook verification failed: {e}")
raise HTTPException(status_code=403, detail="Verification failed")
@app.post(
"/whatsapp",
summary="Handle WhatsApp webhook events",
description="Processes incoming WhatsApp messages and call events",
)
async def whatsapp_webhook(
body: WhatsAppWebhookRequest,
background_tasks: BackgroundTasks,
request: Request,
x_hub_signature_256: str = Header(None),
):
"""Handle incoming WhatsApp webhook events.
For call events, establishes WebRTC connections and spawns bot instances
in the background to handle real-time communication.
"""
if whatsapp_client is None:
logger.error("WhatsApp client is not initialized")
raise HTTPException(status_code=503, detail="Service unavailable")
# Validate webhook object type
if body.object != "whatsapp_business_account":
logger.warning(f"Invalid webhook object type: {body.object}")
raise HTTPException(status_code=400, detail="Invalid object type")
logger.debug(f"Processing WhatsApp webhook: {body.model_dump()}")
async def connection_callback(connection: SmallWebRTCConnection):
"""Handle new WebRTC connections from WhatsApp calls.
Called when a WebRTC connection is established for a WhatsApp call.
Spawns a bot instance to handle the conversation.
Args:
connection: The established WebRTC connection
"""
bot_module = _get_bot_module()
runner_args = SmallWebRTCRunnerArguments(webrtc_connection=connection)
background_tasks.add_task(bot_module.bot, runner_args)
try:
# Process the webhook request
raw_body = await request.body()
result = await whatsapp_client.handle_webhook_request(
body, connection_callback, sha256_signature=x_hub_signature_256, raw_body=raw_body
)
logger.debug(f"Webhook processed successfully: {result}")
return {"status": "success", "message": "Webhook processed successfully"}
except ValueError as ve:
logger.warning(f"Invalid webhook request format: {ve}")
raise HTTPException(status_code=400, detail=f"Invalid request: {str(ve)}")
except Exception as e:
logger.error(f"Internal error processing webhook: {e}")
raise HTTPException(status_code=500, detail="Internal server error processing webhook")
@asynccontextmanager
async def whatsapp_lifespan(app: FastAPI):
"""Manage WhatsApp client lifecycle and cleanup connections."""
nonlocal whatsapp_client
# Initialize WhatsApp client with persistent HTTP session
async with aiohttp.ClientSession() as session:
whatsapp_client = WhatsAppClient(
whatsapp_token=WHATSAPP_TOKEN,
whatsapp_secret=WHATSAPP_APP_SECRET,
phone_number_id=WHATSAPP_PHONE_NUMBER_ID,
session=session,
)
logger.info("WhatsApp client initialized successfully")
try:
yield # Run the application
finally:
# Cleanup all active calls on shutdown
logger.info("Cleaning up WhatsApp client resources...")
if whatsapp_client:
await whatsapp_client.terminate_all_calls()
logger.info("WhatsApp cleanup completed")
# Add the WhatsApp lifespan to the app
_add_lifespan_to_app(app, whatsapp_lifespan)
app.router.lifespan_context = lifespan
def _setup_daily_routes(app: FastAPI):
@@ -577,6 +281,8 @@ def _setup_daily_routes(app: FastAPI):
else:
logger.debug("No body data provided in request")
import aiohttp
from pipecat.runner.daily import configure
async with aiohttp.ClientSession() as session:
@@ -608,7 +314,7 @@ def _setup_daily_routes(app: FastAPI):
return await _handle_rtvi_request(request)
def _setup_telephony_routes(app: FastAPI, *, transport_type: str, proxy: str):
def _setup_telephony_routes(app: FastAPI, transport_type: str, proxy: str):
"""Set up telephony-specific routes."""
# XML response templates (Exotel doesn't use XML webhooks)
XML_TEMPLATES = {
@@ -664,6 +370,8 @@ def _setup_telephony_routes(app: FastAPI, *, transport_type: str, proxy: str):
async def _run_daily_direct():
"""Run Daily bot with direct connection (no FastAPI server)."""
try:
import aiohttp
from pipecat.runner.daily import configure
except ImportError as e:
logger.error("Daily transport dependencies not installed.")
@@ -709,21 +417,6 @@ def _validate_and_clean_proxy(proxy: str) -> str:
return proxy
def runner_downloads_folder() -> Optional[str]:
"""Returns the folder where files are stored for later download."""
return RUNNER_DOWNLOADS_FOLDER
def runner_host() -> str:
"""Returns the host name of this runner."""
return RUNNER_HOST
def runner_port() -> int:
"""Returns the port of this runner."""
return RUNNER_PORT
def main():
"""Start the Pipecat development runner.
@@ -744,16 +437,14 @@ def main():
The bot file must contain a `bot(runner_args)` function as the entry point.
"""
global RUNNER_DOWNLOADS_FOLDER, RUNNER_HOST, RUNNER_PORT
parser = argparse.ArgumentParser(description="Pipecat Development Runner")
parser.add_argument("--host", type=str, default=RUNNER_HOST, help="Host address")
parser.add_argument("--port", type=int, default=RUNNER_PORT, help="Port number")
parser.add_argument("--host", type=str, default="localhost", help="Host address")
parser.add_argument("--port", type=int, default=7860, help="Port number")
parser.add_argument(
"-t",
"--transport",
type=str,
choices=["daily", "webrtc", *TELEPHONY_TRANSPORTS],
choices=["daily", "webrtc", "twilio", "telnyx", "plivo", "exotel"],
default="webrtc",
help="Transport type",
)
@@ -771,16 +462,9 @@ def main():
default=False,
help="Connect directly to Daily room (automatically sets transport to daily)",
)
parser.add_argument("-f", "--folder", type=str, help="Path to downloads folder")
parser.add_argument(
"--verbose", "-v", action="count", default=0, help="Increase logging verbosity"
)
parser.add_argument(
"--whatsapp",
action="store_true",
default=False,
help="Ensure requried WhatsApp environment variables are present",
)
args = parser.parse_args()
@@ -800,10 +484,6 @@ def main():
logger.error("For ESP32, you need to specify `--host IP` so we can do SDP munging.")
return
if args.transport in TELEPHONY_TRANSPORTS and not args.proxy:
logger.error(f"For telephony transports, you need to specify `--proxy PROXY`.")
return
# Log level
logger.remove()
logger.add(sys.stderr, level="TRACE" if args.verbose else "DEBUG")
@@ -823,11 +503,10 @@ def main():
print()
if args.esp32:
print(f"🚀 Bot ready! (ESP32 mode)")
elif args.whatsapp:
print(f"🚀 Bot ready! (WhatsApp)")
print(f" → Open http://{args.host}:{args.port}/client in your browser")
else:
print(f"🚀 Bot ready!")
print(f" → Open http://{args.host}:{args.port}/client in your browser")
print(f" → Open http://{args.host}:{args.port}/client in your browser")
print()
elif args.transport == "daily":
print()
@@ -835,19 +514,8 @@ def main():
print(f" → Open http://{args.host}:{args.port} in your browser to start a session")
print()
RUNNER_DOWNLOADS_FOLDER = args.folder
RUNNER_HOST = args.host
RUNNER_PORT = args.port
# Create the app with transport-specific setup
app = _create_server_app(
transport_type=args.transport,
host=args.host,
proxy=args.proxy,
esp32_mode=args.esp32,
whatsapp_enabled=args.whatsapp,
folder=args.folder,
)
app = _create_server_app(args.transport, args.host, args.proxy, args.esp32)
# Run the server
uvicorn.run(app, host=args.host, port=args.port)

View File

@@ -99,41 +99,29 @@ async def parse_telephony_websocket(websocket: WebSocket):
tuple: (transport_type: str, call_data: dict)
call_data contains provider-specific fields:
- Twilio::
{
"stream_id": str,
"call_id": str,
"body": dict
}
- Telnyx::
{
"stream_id": str,
"call_control_id": str,
"outbound_encoding": str,
"from": str,
"to": str,
}
- Plivo::
{
"stream_id": str,
"call_id": str,
}
- Exotel::
{
"stream_id": str,
"call_id": str,
"account_sid": str,
"from": str,
"to": str,
}
- Twilio: {
"stream_id": str,
"call_id": str,
"body": dict
}
- Telnyx: {
"stream_id": str,
"call_control_id": str,
"outbound_encoding": str,
"from": str,
"to": str,
}
- Plivo: {
"stream_id": str,
"call_id": str,
}
- Exotel: {
"stream_id": str,
"call_id": str,
"account_sid": str,
"from": str,
"to": str,
}
Example usage::
@@ -313,7 +301,6 @@ def _smallwebrtc_sdp_cleanup_ice_candidates(text: str, pattern: str) -> str:
Returns:
Cleaned SDP text with filtered ICE candidates.
"""
logger.debug("Removing unsupported ICE candidates from SDP")
result = []
lines = text.splitlines()
for line in lines:
@@ -322,7 +309,7 @@ def _smallwebrtc_sdp_cleanup_ice_candidates(text: str, pattern: str) -> str:
result.append(line)
else:
result.append(line)
return "\r\n".join(result) + "\r\n"
return "\r\n".join(result)
def _smallwebrtc_sdp_cleanup_fingerprints(text: str) -> str:
@@ -334,16 +321,15 @@ def _smallwebrtc_sdp_cleanup_fingerprints(text: str) -> str:
Returns:
SDP text with sha-384 and sha-512 fingerprints removed.
"""
logger.debug("Removing unsupported fingerprints from SDP")
result = []
lines = text.splitlines()
for line in lines:
if not re.search("sha-384", line) and not re.search("sha-512", line):
result.append(line)
return "\r\n".join(result) + "\r\n"
return "\r\n".join(result)
def smallwebrtc_sdp_munging(sdp: str, host: Optional[str]) -> str:
def smallwebrtc_sdp_munging(sdp: str, host: str) -> str:
"""Apply SDP modifications for SmallWebRTC compatibility.
Args:
@@ -354,8 +340,7 @@ def smallwebrtc_sdp_munging(sdp: str, host: Optional[str]) -> str:
Modified SDP string with fingerprint and ICE candidate cleanup.
"""
sdp = _smallwebrtc_sdp_cleanup_fingerprints(sdp)
if host:
sdp = _smallwebrtc_sdp_cleanup_ice_candidates(sdp, host)
sdp = _smallwebrtc_sdp_cleanup_ice_candidates(sdp, host)
return sdp

View File

@@ -21,9 +21,9 @@ from pipecat.frames.frames import (
InputAudioRawFrame,
InputDTMFFrame,
InterruptionFrame,
OutputTransportMessageFrame,
OutputTransportMessageUrgentFrame,
StartFrame,
TransportMessageFrame,
TransportMessageUrgentFrame,
)
from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializerType
@@ -121,7 +121,7 @@ class ExotelFrameSerializer(FrameSerializer):
}
return json.dumps(answer)
elif isinstance(frame, (OutputTransportMessageFrame, OutputTransportMessageUrgentFrame)):
elif isinstance(frame, (TransportMessageFrame, TransportMessageUrgentFrame)):
return json.dumps(frame.message)
return None

View File

@@ -25,31 +25,11 @@ except ModuleNotFoundError as e:
class LivekitFrameSerializer(FrameSerializer):
"""Serializer for converting between Pipecat frames and LiveKit audio frames.
.. deprecated:: 0.0.90
This class is deprecated and will be removed in a future version.
Please use LiveKitTransport instead, which handles audio streaming
and frame conversion natively.
This serializer handles the conversion of Pipecat's OutputAudioRawFrame objects
to LiveKit AudioFrame objects for transmission, and the reverse conversion
for received audio data.
"""
def __init__(self):
"""Initialize the LiveKit frame serializer."""
super().__init__()
import warnings
with warnings.catch_warnings():
warnings.simplefilter("always")
warnings.warn(
"LivekitFrameSerializer is deprecated and will be removed in a future version. "
"Please use LiveKitTransport instead, which handles audio streaming natively.",
DeprecationWarning,
stacklevel=2,
)
@property
def type(self) -> FrameSerializerType:
"""Get the serializer type.

View File

@@ -23,9 +23,9 @@ from pipecat.frames.frames import (
InputAudioRawFrame,
InputDTMFFrame,
InterruptionFrame,
OutputTransportMessageFrame,
OutputTransportMessageUrgentFrame,
StartFrame,
TransportMessageFrame,
TransportMessageUrgentFrame,
)
from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializerType
@@ -148,7 +148,7 @@ class PlivoFrameSerializer(FrameSerializer):
}
return json.dumps(answer)
elif isinstance(frame, (OutputTransportMessageFrame, OutputTransportMessageUrgentFrame)):
elif isinstance(frame, (TransportMessageFrame, TransportMessageUrgentFrame)):
return json.dumps(frame.message)
# Return None for unhandled frames

View File

@@ -15,12 +15,11 @@ import pipecat.frames.protobufs.frames_pb2 as frame_protos
from pipecat.frames.frames import (
Frame,
InputAudioRawFrame,
InputTransportMessageFrame,
OutputAudioRawFrame,
OutputTransportMessageFrame,
OutputTransportMessageUrgentFrame,
TextFrame,
TranscriptionFrame,
TransportMessageFrame,
TransportMessageUrgentFrame,
)
from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializerType
@@ -83,7 +82,7 @@ class ProtobufFrameSerializer(FrameSerializer):
Serialized frame as bytes, or None if frame type is not serializable.
"""
# Wrapping this messages as a JSONFrame to send
if isinstance(frame, (OutputTransportMessageFrame, OutputTransportMessageUrgentFrame)):
if isinstance(frame, (TransportMessageFrame, TransportMessageUrgentFrame)):
frame = MessageFrame(
data=json.dumps(frame.message),
)
@@ -135,11 +134,11 @@ class ProtobufFrameSerializer(FrameSerializer):
if "pts" in args_dict:
del args_dict["pts"]
# Special handling for MessageFrame -> OutputTransportMessageUrgentFrame
# Special handling for MessageFrame -> TransportMessageUrgentFrame
if class_name == MessageFrame:
try:
msg = json.loads(args_dict["data"])
instance = InputTransportMessageFrame(message=msg)
instance = TransportMessageUrgentFrame(message=msg)
logger.debug(f"ProtobufFrameSerializer: Transport message {instance}")
except Exception as e:
logger.error(f"Error parsing MessageFrame data: {e}")

View File

@@ -23,9 +23,9 @@ from pipecat.frames.frames import (
InputAudioRawFrame,
InputDTMFFrame,
InterruptionFrame,
OutputTransportMessageFrame,
OutputTransportMessageUrgentFrame,
StartFrame,
TransportMessageFrame,
TransportMessageUrgentFrame,
)
from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializerType
@@ -175,7 +175,7 @@ class TwilioFrameSerializer(FrameSerializer):
}
return json.dumps(answer)
elif isinstance(frame, (OutputTransportMessageFrame, OutputTransportMessageUrgentFrame)):
elif isinstance(frame, (TransportMessageFrame, TransportMessageUrgentFrame)):
return json.dumps(frame.message)
# Return None for unhandled frames

View File

@@ -97,7 +97,9 @@ class AIService(FrameProcessor):
pass
async def _update_settings(self, settings: Mapping[str, Any]):
from pipecat.services.openai.realtime.events import SessionProperties
from pipecat.services.openai_realtime_beta.events import (
SessionProperties,
)
for key, value in settings.items():
logger.debug("Update request for:", key, value)
@@ -109,7 +111,9 @@ class AIService(FrameProcessor):
logger.debug("Attempting to update", key, value)
try:
from pipecat.services.openai.realtime.events import TurnDetection
from pipecat.services.openai_realtime_beta.events import (
TurnDetection,
)
if isinstance(self._session_properties, SessionProperties):
current_properties = self._session_properties

View File

@@ -108,8 +108,6 @@ class AssemblyAIConnectionParams(BaseModel):
end_of_turn_confidence_threshold: Confidence threshold for end-of-turn detection.
min_end_of_turn_silence_when_confident: Minimum silence duration when confident about end-of-turn.
max_turn_silence: Maximum silence duration before forcing end-of-turn.
keyterms_prompt: List of key terms to guide transcription. Will be JSON serialized before sending.
speech_model: Select between English and multilingual models. Defaults to "universal-streaming-english".
"""
sample_rate: int = 16000
@@ -119,7 +117,3 @@ class AssemblyAIConnectionParams(BaseModel):
end_of_turn_confidence_threshold: Optional[float] = None
min_end_of_turn_silence_when_confident: Optional[int] = None
max_turn_silence: Optional[int] = None
keyterms_prompt: Optional[List[str]] = None
speech_model: Literal["universal-streaming-english", "universal-streaming-multilingual"] = (
"universal-streaming-english"
)

View File

@@ -174,16 +174,11 @@ class AssemblyAISTTService(STTService):
def _build_ws_url(self) -> str:
"""Build WebSocket URL with query parameters using urllib.parse.urlencode."""
params = {}
for k, v in self._connection_params.model_dump().items():
if v is not None:
if k == "keyterms_prompt":
params[k] = json.dumps(v)
elif isinstance(v, bool):
params[k] = str(v).lower()
else:
params[k] = v
params = {
k: str(v).lower() if isinstance(v, bool) else v
for k, v in self._connection_params.model_dump().items()
if v is not None
}
if params:
query_string = urlencode(params)
return f"{self._api_endpoint_base_url}?{query_string}"
@@ -202,8 +197,6 @@ class AssemblyAISTTService(STTService):
)
self._connected = True
self._receive_task = self.create_task(self._receive_task_handler())
await self._call_event_handler("on_connected")
except Exception as e:
logger.error(f"Failed to connect to AssemblyAI: {e}")
self._connected = False
@@ -245,7 +238,6 @@ class AssemblyAISTTService(STTService):
self._websocket = None
self._connected = False
self._receive_task = None
await self._call_event_handler("on_disconnected")
async def _receive_task_handler(self):
"""Handle incoming WebSocket messages."""

View File

@@ -235,8 +235,6 @@ class AsyncAITTSService(InterruptibleTTSService):
}
await self._get_websocket().send(json.dumps(init_msg))
await self._call_event_handler("on_connected")
except Exception as e:
logger.error(f"{self} initialization error: {e}")
self._websocket = None
@@ -254,7 +252,6 @@ class AsyncAITTSService(InterruptibleTTSService):
finally:
self._websocket = None
self._started = False
await self._call_event_handler("on_disconnected")
def _get_websocket(self):
if self._websocket:

View File

@@ -9,7 +9,6 @@ import sys
from pipecat.services import DeprecatedModuleProxy
from .llm import *
from .nova_sonic import *
from .stt import *
from .tts import *

View File

@@ -61,6 +61,7 @@ from pipecat.utils.tracing.service_decorators import traced_llm
try:
import aioboto3
import httpx
from botocore.config import Config
from botocore.exceptions import ReadTimeoutError
except ModuleNotFoundError as e:
@@ -1116,7 +1117,7 @@ class AWSBedrockLLMService(LLMService):
# also get cancelled.
use_completion_tokens_estimate = True
raise
except (ReadTimeoutError, asyncio.TimeoutError):
except httpx.TimeoutException:
await self._call_event_handler("on_completion_timeout")
except Exception as e:
logger.exception(f"{self} exception: {e}")

View File

@@ -1,87 +0,0 @@
#
# Copyright (c) 2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
"""Context management for AWS Nova Sonic LLM service.
This module provides specialized context aggregators and message handling for AWS Nova Sonic,
including conversation history management and role-specific message processing.
.. deprecated:: 0.0.91
AWS Nova Sonic now supports `LLMContext` and `LLMContextAggregatorPair`.
Using the new patterns should allow you to not need types from this module.
BEFORE:
```
# Setup
context = OpenAILLMContext(messages, tools)
context_aggregator = llm.create_context_aggregator(context)
# Context frame type
frame: OpenAILLMContextFrame
# Context type
context: AWSNovaSonicLLMContext
# or
context: OpenAILLMContext
# Reading messages from context
messages = context.messages
```
AFTER:
```
# Setup
context = LLMContext(messages, tools)
context_aggregator = LLMContextAggregatorPair(context)
# Context frame type
frame: LLMContextFrame
# Context type
context: LLMContext
# Reading messages from context
messages = context.get_messages()
```
"""
import warnings
with warnings.catch_warnings():
warnings.simplefilter("always")
warnings.warn(
"Types in pipecat.services.aws.nova_sonic.context are deprecated. \n"
"AWS Nova Sonic now supports `LLMContext` and `LLMContextAggregatorPair`. \n"
"Using the new patterns should allow you to not need types from this module.\n\n"
"BEFORE:\n"
"```\n"
"# Setup\n"
"context = OpenAILLMContext(messages, tools)\n"
"context_aggregator = llm.create_context_aggregator(context)\n\n"
"# Context frame type\n"
"frame: OpenAILLMContextFrame\n\n"
"# Context type\n"
"context: AWSNovaSonicLLMContext\n"
"# or\n"
"context: OpenAILLMContext\n\n"
"# Reading messages from context\n"
"messages = context.messages\n"
"```\n\n"
"AFTER:\n"
"```\n"
"# Setup\n"
"context = LLMContext(messages, tools)\n"
"context_aggregator = LLMContextAggregatorPair(context)\n\n"
"# Context frame type\n"
"frame: LLMContextFrame\n\n"
"# Context type\n"
"context: LLMContext\n\n"
"# Reading messages from context\n"
"messages = context.messages\n"
"```",
DeprecationWarning,
stacklevel=2,
)

View File

@@ -1,25 +0,0 @@
#
# Copyright (c) 2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
"""Custom frames for AWS Nova Sonic LLM service."""
from dataclasses import dataclass
from pipecat.frames.frames import DataFrame, FunctionCallResultFrame
@dataclass
class AWSNovaSonicFunctionCallResultFrame(DataFrame):
"""Frame containing function call result for AWS Nova Sonic processing.
This frame wraps a standard function call result frame to enable
AWS Nova Sonic-specific handling and context updates.
Parameters:
result_frame: The underlying function call result frame.
"""
result_frame: FunctionCallResultFrame

File diff suppressed because it is too large Load Diff

View File

@@ -286,7 +286,6 @@ class AWSTranscribeSTTService(STTService):
logger.info(f"{self} Successfully connected to AWS Transcribe")
await self._call_event_handler("on_connected")
except Exception as e:
logger.error(f"{self} Failed to connect to AWS Transcribe: {e}")
await self._disconnect()
@@ -311,7 +310,6 @@ class AWSTranscribeSTTService(STTService):
logger.warning(f"{self} Error closing WebSocket connection: {e}")
finally:
self._ws_client = None
await self._call_event_handler("on_disconnected")
def language_to_service_language(self, language: Language) -> str | None:
"""Convert internal language enum to AWS Transcribe language code.

View File

@@ -1,19 +1 @@
#
# Copyright (c) 2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import warnings
from pipecat.services.aws.nova_sonic.llm import AWSNovaSonicLLMService, Params
with warnings.catch_warnings():
warnings.simplefilter("always")
warnings.warn(
"Types in pipecat.services.aws_nova_sonic are deprecated. "
"Please use the equivalent types from "
"pipecat.services.aws.nova_sonic.llm instead.",
DeprecationWarning,
stacklevel=2,
)
from .aws import AWSNovaSonicLLMService, Params

File diff suppressed because it is too large Load Diff

View File

@@ -8,80 +8,360 @@
This module provides specialized context aggregators and message handling for AWS Nova Sonic,
including conversation history management and role-specific message processing.
.. deprecated:: 0.0.91
AWS Nova Sonic now supports `LLMContext` and `LLMContextAggregatorPair`.
Using the new patterns should allow you to not need types from this module.
BEFORE:
```
# Setup
context = OpenAILLMContext(messages, tools)
context_aggregator = llm.create_context_aggregator(context)
# Context frame type
frame: OpenAILLMContextFrame
# Context type
context: AWSNovaSonicLLMContext
# or
context: OpenAILLMContext
# Reading messages from context
messages = context.messages
```
AFTER:
```
# Setup
context = LLMContext(messages, tools)
context_aggregator = LLMContextAggregatorPair(context)
# Context frame type
frame: LLMContextFrame
# Context type
context: LLMContext
# Reading messages from context
messages = context.get_messages()
```
"""
import warnings
import copy
from dataclasses import dataclass, field
from enum import Enum
with warnings.catch_warnings():
warnings.simplefilter("always")
warnings.warn(
"Types in pipecat.services.aws_nova_sonic.context are deprecated. \n"
"AWS Nova Sonic now supports `LLMContext` and `LLMContextAggregatorPair`. \n"
"Using the new patterns should allow you to not need types from this module.\n\n"
"BEFORE:\n"
"```\n"
"# Setup\n"
"context = OpenAILLMContext(messages, tools)\n"
"context_aggregator = llm.create_context_aggregator(context)\n\n"
"# Context frame type\n"
"frame: OpenAILLMContextFrame\n\n"
"# Context type\n"
"context: AWSNovaSonicLLMContext\n"
"# or\n"
"context: OpenAILLMContext\n\n"
"# Reading messages from context\n"
"messages = context.messages\n"
"```\n\n"
"AFTER:\n"
"```\n"
"# Setup\n"
"context = LLMContext(messages, tools)\n"
"context_aggregator = LLMContextAggregatorPair(context)\n\n"
"# Context frame type\n"
"frame: LLMContextFrame\n\n"
"# Context type\n"
"context: LLMContext\n\n"
"# Reading messages from context\n"
"messages = context.messages\n"
"```",
DeprecationWarning,
stacklevel=2,
)
from loguru import logger
from pipecat.frames.frames import (
BotStoppedSpeakingFrame,
DataFrame,
Frame,
FunctionCallResultFrame,
InterruptionFrame,
LLMFullResponseEndFrame,
LLMFullResponseStartFrame,
LLMMessagesAppendFrame,
LLMMessagesUpdateFrame,
LLMSetToolChoiceFrame,
LLMSetToolsFrame,
TextFrame,
UserImageRawFrame,
)
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.aws_nova_sonic.frames import AWSNovaSonicFunctionCallResultFrame
from pipecat.services.openai.llm import (
OpenAIAssistantContextAggregator,
OpenAIUserContextAggregator,
)
class Role(Enum):
"""Roles supported in AWS Nova Sonic conversations.
Parameters:
SYSTEM: System-level messages (not used in conversation history).
USER: Messages sent by the user.
ASSISTANT: Messages sent by the assistant.
TOOL: Messages sent by tools (not used in conversation history).
"""
SYSTEM = "SYSTEM"
USER = "USER"
ASSISTANT = "ASSISTANT"
TOOL = "TOOL"
@dataclass
class AWSNovaSonicConversationHistoryMessage:
"""A single message in AWS Nova Sonic conversation history.
Parameters:
role: The role of the message sender (USER or ASSISTANT only).
text: The text content of the message.
"""
role: Role # only USER and ASSISTANT
text: str
@dataclass
class AWSNovaSonicConversationHistory:
"""Complete conversation history for AWS Nova Sonic initialization.
Parameters:
system_instruction: System-level instruction for the conversation.
messages: List of conversation messages between user and assistant.
"""
system_instruction: str = None
messages: list[AWSNovaSonicConversationHistoryMessage] = field(default_factory=list)
class AWSNovaSonicLLMContext(OpenAILLMContext):
"""Specialized LLM context for AWS Nova Sonic service.
Extends OpenAI context with Nova Sonic-specific message handling,
conversation history management, and text buffering capabilities.
"""
def __init__(self, messages=None, tools=None, **kwargs):
"""Initialize AWS Nova Sonic LLM context.
Args:
messages: Initial messages for the context.
tools: Available tools for the context.
**kwargs: Additional arguments passed to parent class.
"""
super().__init__(messages=messages, tools=tools, **kwargs)
self.__setup_local()
def __setup_local(self, system_instruction: str = ""):
self._assistant_text = ""
self._user_text = ""
self._system_instruction = system_instruction
@staticmethod
def upgrade_to_nova_sonic(
obj: OpenAILLMContext, system_instruction: str
) -> "AWSNovaSonicLLMContext":
"""Upgrade an OpenAI context to AWS Nova Sonic context.
Args:
obj: The OpenAI context to upgrade.
system_instruction: System instruction for the context.
Returns:
The upgraded AWS Nova Sonic context.
"""
if isinstance(obj, OpenAILLMContext) and not isinstance(obj, AWSNovaSonicLLMContext):
obj.__class__ = AWSNovaSonicLLMContext
obj.__setup_local(system_instruction)
return obj
# NOTE: this method has the side-effect of updating _system_instruction from messages
def get_messages_for_initializing_history(self) -> AWSNovaSonicConversationHistory:
"""Get conversation history for initializing AWS Nova Sonic session.
Processes stored messages and extracts system instruction and conversation
history in the format expected by AWS Nova Sonic.
Returns:
Formatted conversation history with system instruction and messages.
"""
history = AWSNovaSonicConversationHistory(system_instruction=self._system_instruction)
# Bail if there are no messages
if not self.messages:
return history
messages = copy.deepcopy(self.messages)
# If we have a "system" message as our first message, let's pull that out into "instruction"
if messages[0].get("role") == "system":
system = messages.pop(0)
content = system.get("content")
if isinstance(content, str):
history.system_instruction = content
elif isinstance(content, list):
history.system_instruction = content[0].get("text")
if history.system_instruction:
self._system_instruction = history.system_instruction
# Process remaining messages to fill out conversation history.
# Nova Sonic supports "user" and "assistant" messages in history.
for message in messages:
history_message = self.from_standard_message(message)
if history_message:
history.messages.append(history_message)
return history
def get_messages_for_persistent_storage(self):
"""Get messages formatted for persistent storage.
Returns:
List of messages including system instruction if present.
"""
messages = super().get_messages_for_persistent_storage()
# If we have a system instruction and messages doesn't already contain it, add it
if self._system_instruction and not (messages and messages[0].get("role") == "system"):
messages.insert(0, {"role": "system", "content": self._system_instruction})
return messages
def from_standard_message(self, message) -> AWSNovaSonicConversationHistoryMessage:
"""Convert standard message format to Nova Sonic format.
Args:
message: Standard message dictionary to convert.
Returns:
Nova Sonic conversation history message, or None if not convertible.
"""
role = message.get("role")
if message.get("role") == "user" or message.get("role") == "assistant":
content = message.get("content")
if isinstance(message.get("content"), list):
content = ""
for c in message.get("content"):
if c.get("type") == "text":
content += " " + c.get("text")
else:
logger.error(
f"Unhandled content type in context message: {c.get('type')} - {message}"
)
# There won't be content if this is an assistant tool call entry.
# We're ignoring those since they can't be loaded into AWS Nova Sonic conversation
# history
if content:
return AWSNovaSonicConversationHistoryMessage(role=Role[role.upper()], text=content)
# NOTE: we're ignoring messages with role "tool" since they can't be loaded into AWS Nova
# Sonic conversation history
def buffer_user_text(self, text):
"""Buffer user text for later flushing to context.
Args:
text: User text to buffer.
"""
self._user_text += f" {text}" if self._user_text else text
# logger.debug(f"User text buffered: {self._user_text}")
def flush_aggregated_user_text(self) -> str:
"""Flush buffered user text to context as a complete message.
Returns:
The flushed user text, or empty string if no text was buffered.
"""
if not self._user_text:
return ""
user_text = self._user_text
message = {
"role": "user",
"content": [{"type": "text", "text": user_text}],
}
self._user_text = ""
self.add_message(message)
# logger.debug(f"Context updated (user): {self.get_messages_for_logging()}")
return user_text
def buffer_assistant_text(self, text):
"""Buffer assistant text for later flushing to context.
Args:
text: Assistant text to buffer.
"""
self._assistant_text += text
# logger.debug(f"Assistant text buffered: {self._assistant_text}")
def flush_aggregated_assistant_text(self):
"""Flush buffered assistant text to context as a complete message."""
if not self._assistant_text:
return
message = {
"role": "assistant",
"content": [{"type": "text", "text": self._assistant_text}],
}
self._assistant_text = ""
self.add_message(message)
# logger.debug(f"Context updated (assistant): {self.get_messages_for_logging()}")
@dataclass
class AWSNovaSonicMessagesUpdateFrame(DataFrame):
"""Frame containing updated AWS Nova Sonic context.
Parameters:
context: The updated AWS Nova Sonic LLM context.
"""
context: AWSNovaSonicLLMContext
class AWSNovaSonicUserContextAggregator(OpenAIUserContextAggregator):
"""Context aggregator for user messages in AWS Nova Sonic conversations.
Extends the OpenAI user context aggregator to emit Nova Sonic-specific
context update frames.
"""
async def process_frame(
self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM
):
"""Process frames and emit Nova Sonic-specific context updates.
Args:
frame: The frame to process.
direction: The direction the frame is traveling.
"""
await super().process_frame(frame, direction)
# Parent does not push LLMMessagesUpdateFrame
if isinstance(frame, LLMMessagesUpdateFrame):
await self.push_frame(AWSNovaSonicMessagesUpdateFrame(context=self._context))
class AWSNovaSonicAssistantContextAggregator(OpenAIAssistantContextAggregator):
"""Context aggregator for assistant messages in AWS Nova Sonic conversations.
Provides specialized handling for assistant responses and function calls
in AWS Nova Sonic context, with custom frame processing logic.
"""
async def process_frame(self, frame: Frame, direction: FrameDirection):
"""Process frames with Nova Sonic-specific logic.
Args:
frame: The frame to process.
direction: The direction the frame is traveling.
"""
# HACK: For now, disable the context aggregator by making it just pass through all frames
# that the parent handles (except the function call stuff, which we still need).
# For an explanation of this hack, see
# AWSNovaSonicLLMService._report_assistant_response_text_added.
if isinstance(
frame,
(
InterruptionFrame,
LLMFullResponseStartFrame,
LLMFullResponseEndFrame,
TextFrame,
LLMMessagesAppendFrame,
LLMMessagesUpdateFrame,
LLMSetToolsFrame,
LLMSetToolChoiceFrame,
UserImageRawFrame,
BotStoppedSpeakingFrame,
),
):
await self.push_frame(frame, direction)
else:
await super().process_frame(frame, direction)
async def handle_function_call_result(self, frame: FunctionCallResultFrame):
"""Handle function call results for AWS Nova Sonic.
Args:
frame: The function call result frame to handle.
"""
await super().handle_function_call_result(frame)
# The standard function callback code path pushes the FunctionCallResultFrame from the LLM
# itself, so we didn't have a chance to add the result to the AWS Nova Sonic server-side
# context. Let's push a special frame to do that.
await self.push_frame(
AWSNovaSonicFunctionCallResultFrame(result_frame=frame), FrameDirection.UPSTREAM
)
@dataclass
class AWSNovaSonicContextAggregatorPair:
"""Pair of user and assistant context aggregators for AWS Nova Sonic.
Parameters:
_user: The user context aggregator.
_assistant: The assistant context aggregator.
"""
_user: AWSNovaSonicUserContextAggregator
_assistant: AWSNovaSonicAssistantContextAggregator
def user(self) -> AWSNovaSonicUserContextAggregator:
"""Get the user context aggregator.
Returns:
The user context aggregator instance.
"""
return self._user
def assistant(self) -> AWSNovaSonicAssistantContextAggregator:
"""Get the assistant context aggregator.
Returns:
The assistant context aggregator instance.
"""
return self._assistant

View File

@@ -6,16 +6,20 @@
"""Custom frames for AWS Nova Sonic LLM service."""
import warnings
from dataclasses import dataclass
from pipecat.services.aws.nova_sonic.frames import *
from pipecat.frames.frames import DataFrame, FunctionCallResultFrame
with warnings.catch_warnings():
warnings.simplefilter("always")
warnings.warn(
"Types in pipecat.services.aws_nova_sonic.frames are deprecated. "
"Please use the equivalent types from "
"pipecat.services.aws.nova_sonic.frames instead.",
DeprecationWarning,
stacklevel=2,
)
@dataclass
class AWSNovaSonicFunctionCallResultFrame(DataFrame):
"""Frame containing function call result for AWS Nova Sonic processing.
This frame wraps a standard function call result frame to enable
AWS Nova Sonic-specific handling and context updates.
Parameters:
result_frame: The underlying function call result frame.
"""
result_frame: FunctionCallResultFrame

View File

@@ -1,65 +0,0 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
"""Azure OpenAI Realtime LLM service implementation."""
from loguru import logger
from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService
try:
from websockets.asyncio.client import connect as websocket_connect
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error("In order to use Azure Realtime, you need to `pip install pipecat-ai[openai]`.")
raise Exception(f"Missing module: {e}")
class AzureRealtimeLLMService(OpenAIRealtimeLLMService):
"""Azure OpenAI Realtime LLM service with Azure-specific authentication.
Extends the OpenAI Realtime service to work with Azure OpenAI endpoints,
using Azure's authentication headers and endpoint format. Provides the same
real-time audio and text communication capabilities as the base OpenAI service.
"""
def __init__(
self,
*,
api_key: str,
base_url: str,
**kwargs,
):
"""Initialize Azure Realtime LLM service.
Args:
api_key: The API key for the Azure OpenAI service.
base_url: The full Azure WebSocket endpoint URL including api-version and deployment.
Example: "wss://my-project.openai.azure.com/openai/realtime?api-version=2024-10-01-preview&deployment=my-realtime-deployment"
**kwargs: Additional arguments passed to parent OpenAIRealtimeLLMService.
"""
super().__init__(base_url=base_url, api_key=api_key, **kwargs)
self.api_key = api_key
self.base_url = base_url
async def _connect(self):
try:
if self._websocket:
# Here we assume that if we have a websocket, we are connected. We
# handle disconnections in the send/recv code paths.
return
logger.info(f"Connecting to {self.base_url}, api key: {self.api_key}")
self._websocket = await websocket_connect(
uri=self.base_url,
additional_headers={
"api-key": self.api_key,
},
)
self._receive_task = self.create_task(self._receive_task_handler())
except Exception as e:
logger.error(f"{self} initialization error: {e}")
self._websocket = None

View File

@@ -28,12 +28,13 @@ from pipecat.frames.frames import (
UserStoppedSpeakingFrame,
)
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.stt_service import WebsocketSTTService
from pipecat.services.stt_service import STTService
from pipecat.transcriptions.language import Language
from pipecat.utils.time import time_now_iso8601
from pipecat.utils.tracing.service_decorators import traced_stt
try:
import websockets
from websockets.asyncio.client import connect as websocket_connect
from websockets.protocol import State
except ModuleNotFoundError as e:
@@ -123,7 +124,7 @@ class CartesiaLiveOptions:
return cls(**json.loads(json_str))
class CartesiaSTTService(WebsocketSTTService):
class CartesiaSTTService(STTService):
"""Speech-to-text service using Cartesia Live API.
Provides real-time speech transcription through WebSocket connection
@@ -175,7 +176,8 @@ class CartesiaSTTService(WebsocketSTTService):
self.set_model_name(merged_options.model)
self._api_key = api_key
self._base_url = base_url or "api.cartesia.ai"
self._receive_task = None
self._connection = None
self._receiver_task = None
def can_generate_metrics(self) -> bool:
"""Check if the service can generate processing metrics.
@@ -212,27 +214,6 @@ class CartesiaSTTService(WebsocketSTTService):
await super().cancel(frame)
await self._disconnect()
async def start_metrics(self):
"""Start performance metrics collection for transcription processing."""
await self.start_ttfb_metrics()
await self.start_processing_metrics()
async def process_frame(self, frame: Frame, direction: FrameDirection):
"""Process incoming frames and handle speech events.
Args:
frame: The frame to process.
direction: Direction of frame flow in the pipeline.
"""
await super().process_frame(frame, direction)
if isinstance(frame, UserStartedSpeakingFrame):
await self.start_metrics()
elif isinstance(frame, UserStoppedSpeakingFrame):
# Send finalize command to flush the transcription session
if self._websocket and self._websocket.state is State.OPEN:
await self._websocket.send("finalize")
async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
"""Process audio data for speech-to-text transcription.
@@ -243,71 +224,45 @@ class CartesiaSTTService(WebsocketSTTService):
None - transcription results are handled via WebSocket responses.
"""
# If the connection is closed, due to timeout, we need to reconnect when the user starts speaking again
if not self._websocket or self._websocket.state is State.CLOSED:
if not self._connection or self._connection.state is State.CLOSED:
await self._connect()
await self._websocket.send(audio)
await self._connection.send(audio)
yield None
async def _connect(self):
await self._connect_websocket()
params = self._settings.to_dict()
ws_url = f"wss://{self._base_url}/stt/websocket?{urllib.parse.urlencode(params)}"
logger.debug(f"Connecting to Cartesia: {ws_url}")
headers = {"Cartesia-Version": "2025-04-16", "X-API-Key": self._api_key}
if self._websocket and not self._receive_task:
self._receive_task = asyncio.create_task(self._receive_task_handler(self._report_error))
async def _disconnect(self):
if self._receive_task:
await self.cancel_task(self._receive_task)
self._receive_task = None
await self._disconnect_websocket()
async def _connect_websocket(self):
try:
if self._websocket and self._websocket.state is State.OPEN:
return
logger.debug("Connecting to Cartesia STT")
params = self._settings.to_dict()
ws_url = f"wss://{self._base_url}/stt/websocket?{urllib.parse.urlencode(params)}"
headers = {"Cartesia-Version": "2025-04-16", "X-API-Key": self._api_key}
self._websocket = await websocket_connect(ws_url, additional_headers=headers)
await self._call_event_handler("on_connected")
self._connection = await websocket_connect(ws_url, additional_headers=headers)
# Setup the receiver task to handle the incoming messages from the Cartesia server
if self._receiver_task is None or self._receiver_task.done():
self._receiver_task = asyncio.create_task(self._receive_messages())
logger.debug(f"Connected to Cartesia")
except Exception as e:
logger.error(f"{self}: unable to connect to Cartesia: {e}")
async def _disconnect_websocket(self):
try:
if self._websocket and self._websocket.state is State.OPEN:
logger.debug("Disconnecting from Cartesia STT")
await self._websocket.close()
except Exception as e:
logger.error(f"{self} error closing websocket: {e}")
finally:
self._websocket = None
await self._call_event_handler("on_disconnected")
def _get_websocket(self):
if self._websocket:
return self._websocket
raise Exception("Websocket not connected")
async def _process_messages(self):
async for message in self._get_websocket():
try:
data = json.loads(message)
await self._process_response(data)
except json.JSONDecodeError:
logger.warning(f"Received non-JSON message: {message}")
async def _receive_messages(self):
while True:
await self._process_messages()
# Cartesia times out after 5 minutes of innactivity (no keepalive
# mechanism is available). So, we try to reconnect.
logger.debug(f"{self} Cartesia connection was disconnected (timeout?), reconnecting")
await self._connect_websocket()
try:
while True:
if not self._connection or self._connection.state is State.CLOSED:
break
message = await self._connection.recv()
try:
data = json.loads(message)
await self._process_response(data)
except json.JSONDecodeError:
logger.warning(f"Received non-JSON message: {message}")
except asyncio.CancelledError:
pass
except websockets.exceptions.ConnectionClosed as e:
logger.debug(f"WebSocket connection closed: {e}")
except Exception as e:
logger.error(f"Error in message receiver: {e}")
async def _process_response(self, data):
if "type" in data:
@@ -361,3 +316,41 @@ class CartesiaSTTService(WebsocketSTTService):
language,
)
)
async def _disconnect(self):
if self._receiver_task:
self._receiver_task.cancel()
try:
await self._receiver_task
except asyncio.CancelledError:
pass
except Exception as e:
logger.exception(f"Unexpected exception while cancelling task: {e}")
self._receiver_task = None
if self._connection and self._connection.state is State.OPEN:
logger.debug("Disconnecting from Cartesia")
await self._connection.close()
self._connection = None
async def start_metrics(self):
"""Start performance metrics collection for transcription processing."""
await self.start_ttfb_metrics()
await self.start_processing_metrics()
async def process_frame(self, frame: Frame, direction: FrameDirection):
"""Process incoming frames and handle speech events.
Args:
frame: The frame to process.
direction: Direction of frame flow in the pipeline.
"""
await super().process_frame(frame, direction)
if isinstance(frame, UserStartedSpeakingFrame):
await self.start_metrics()
elif isinstance(frame, UserStoppedSpeakingFrame):
# Send finalize command to flush the transcription session
if self._connection and self._connection.state is State.OPEN:
await self._connection.send("finalize")

View File

@@ -344,11 +344,10 @@ class CartesiaTTSService(AudioContextWordTTSService):
try:
if self._websocket and self._websocket.state is State.OPEN:
return
logger.debug("Connecting to Cartesia TTS")
logger.debug("Connecting to Cartesia")
self._websocket = await websocket_connect(
f"{self._url}?api_key={self._api_key}&cartesia_version={self._cartesia_version}"
)
await self._call_event_handler("on_connected")
except Exception as e:
logger.error(f"{self} initialization error: {e}")
self._websocket = None
@@ -366,7 +365,6 @@ class CartesiaTTSService(AudioContextWordTTSService):
finally:
self._context_id = None
self._websocket = None
await self._call_event_handler("on_disconnected")
def _get_websocket(self):
if self._websocket:

View File

@@ -8,7 +8,6 @@ import sys
from pipecat.services import DeprecatedModuleProxy
from .flux import *
from .stt import *
from .tts import *

View File

@@ -1,640 +0,0 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
"""Deepgram Flux speech-to-text service implementation."""
import json
from enum import Enum
from typing import Any, AsyncGenerator, Dict, Optional
from loguru import logger
from pydantic import BaseModel
from pipecat.frames.frames import (
CancelFrame,
EndFrame,
ErrorFrame,
Frame,
InterimTranscriptionFrame,
StartFrame,
TranscriptionFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame,
)
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.stt_service import WebsocketSTTService
from pipecat.transcriptions.language import Language
from pipecat.utils.time import time_now_iso8601
from pipecat.utils.tracing.service_decorators import traced_stt
try:
from websockets.asyncio.client import connect as websocket_connect
from websockets.protocol import State
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error("In order to use Deepgram Flux, you need to `pip install pipecat-ai[deepgram]`.")
raise Exception(f"Missing module: {e}")
class FluxMessageType(str, Enum):
"""Deepgram Flux WebSocket message types.
These are the top-level message types that can be received from the
Deepgram Flux WebSocket connection.
"""
RECEIVE_CONNECTED = "Connected"
RECEIVE_FATAL_ERROR = "Error"
TURN_INFO = "TurnInfo"
class FluxEventType(str, Enum):
"""Deepgram Flux TurnInfo event types.
These events are contained within TurnInfo messages and indicate
different stages of speech processing and turn detection.
"""
START_OF_TURN = "StartOfTurn"
TURN_RESUMED = "TurnResumed"
END_OF_TURN = "EndOfTurn"
EAGER_END_OF_TURN = "EagerEndOfTurn"
UPDATE = "Update"
class DeepgramFluxSTTService(WebsocketSTTService):
"""Deepgram Flux speech-to-text service.
Provides real-time speech recognition using Deepgram's WebSocket API with Flux capabilities.
Supports configurable models, VAD events, and various audio processing options
including advanced turn detection and EagerEndOfTurn events for improved conversational AI performance.
"""
class InputParams(BaseModel):
"""Configuration parameters for Deepgram Flux API.
This class defines all available connection parameters for the Deepgram Flux API
based on the official documentation.
Parameters:
eager_eot_threshold: Optional. EagerEndOfTurn/TurnResumed are off by default.
You can turn them on by setting eager_eot_threshold to a valid value.
Lower values = more aggressive EagerEndOfTurning (faster response, more LLM calls).
Higher values = more conservative EagerEndOfTurning (slower response, fewer LLM calls).
eot_threshold: Optional. End-of-turn confidence required to finish a turn (default 0.7).
Lower values = turns end sooner (more interruptions, faster responses).
Higher values = turns end later (fewer interruptions, more complete utterances).
eot_timeout_ms: Optional. Time in milliseconds after speech to finish a turn
regardless of EOT confidence (default 5000).
keyterm: List of keyterms to boost recognition accuracy for specialized terminology.
mip_opt_out: Optional. Opts out requests from the Deepgram Model Improvement Program
(default False).
tag: List of tags to label requests for identification during usage reporting.
"""
eager_eot_threshold: Optional[float] = None
eot_threshold: Optional[float] = None
eot_timeout_ms: Optional[int] = None
keyterm: list = []
mip_opt_out: Optional[bool] = None
tag: list = []
def __init__(
self,
*,
api_key: str,
url: str = "wss://api.deepgram.com/v2/listen",
sample_rate: Optional[int] = None,
model: str = "flux-general-en",
flux_encoding: str = "linear16",
params: Optional[InputParams] = None,
**kwargs,
):
"""Initialize the Deepgram Flux STT service.
Args:
api_key: Deepgram API key for authentication. Required for API access.
url: WebSocket URL for the Deepgram Flux API. Defaults to the preview endpoint.
sample_rate: Audio sample rate in Hz. If None, uses the rate from params or 16000.
model: Deepgram Flux model to use for transcription. Currently only supports "flux-general-en".
flux_encoding: Audio encoding format required by Flux API. Must be "linear16".
Raw signed little-endian 16-bit PCM encoding.
params: InputParams instance containing detailed API configuration options.
If None, default parameters will be used.
**kwargs: Additional arguments passed to the parent WebsocketSTTService class.
Examples:
Basic usage with default parameters::
stt = DeepgramFluxSTTService(api_key="your-api-key")
Advanced usage with custom parameters::
params = DeepgramFluxSTTService.InputParams(
eager_eot_threshold=0.5,
eot_threshold=0.8,
keyterm=["AI", "machine learning", "neural network"],
tag=["production", "voice-agent"]
)
stt = DeepgramFluxSTTService(
api_key="your-api-key",
model="flux-general-en",
params=params
)
"""
super().__init__(sample_rate=sample_rate, **kwargs)
self._api_key = api_key
self._url = url
self._model = model
self._params = params or DeepgramFluxSTTService.InputParams()
self._flux_encoding = flux_encoding
# This is the currently only supported language
self._language = Language.EN
self._websocket_url = None
self._receive_task = None
async def _connect(self):
"""Connect to WebSocket and start background tasks.
Establishes the WebSocket connection to the Deepgram Flux API and starts
the background task for receiving transcription results.
"""
await self._connect_websocket()
if self._websocket and not self._receive_task:
self._receive_task = self.create_task(self._receive_task_handler(self._report_error))
async def _disconnect(self):
"""Disconnect from WebSocket and clean up tasks.
Gracefully disconnects from the Deepgram Flux API, cancels background tasks,
and cleans up resources to prevent memory leaks.
"""
try:
# Cancel background tasks BEFORE closing websocket
if self._receive_task:
await self.cancel_task(self._receive_task, timeout=2.0)
self._receive_task = None
# Now close the websocket
await self._disconnect_websocket()
except Exception as e:
logger.error(f"Error during disconnect: {e}")
finally:
# Reset state only after everything is cleaned up
self._websocket = None
async def _connect_websocket(self):
"""Establish WebSocket connection to API.
Creates a WebSocket connection to the Deepgram Flux API using the configured
URL and authentication headers. Handles connection errors and reports them
through the event handler system.
"""
try:
if self._websocket and self._websocket.state is State.OPEN:
return
self._websocket = await websocket_connect(
self._websocket_url,
additional_headers={"Authorization": f"Token {self._api_key}"},
)
logger.debug("Connected to Deepgram Flux Websocket")
await self._call_event_handler("on_connected")
except Exception as e:
logger.error(f"{self} initialization error: {e}")
self._websocket = None
await self._call_event_handler("on_connection_error", f"{e}")
async def _disconnect_websocket(self):
"""Close WebSocket connection and clean up state.
Closes the WebSocket connection to the Deepgram Flux API and stops all
metrics collection. Handles disconnection errors gracefully.
"""
try:
await self.stop_all_metrics()
if self._websocket:
await self._send_close_stream()
logger.debug("Disconnecting from Deepgram Flux Websocket")
await self._websocket.close()
except Exception as e:
logger.error(f"{self} error closing websocket: {e}")
finally:
self._websocket = None
await self._call_event_handler("on_disconnected")
async def _send_close_stream(self) -> None:
"""Sends a CloseStream control message to the Deepgram Flux WebSocket API.
This signals to the server that no more audio data will be sent.
"""
if self._websocket:
logger.debug("Sending CloseStream message to Deepgram Flux")
message = {"type": "CloseStream"}
await self._websocket.send(json.dumps(message))
def can_generate_metrics(self) -> bool:
"""Check if this service can generate processing metrics.
Returns:
True, as Deepgram service supports metrics generation.
"""
return True
async def start(self, frame: StartFrame):
"""Start the Deepgram Flux STT service.
Initializes the service by constructing the WebSocket URL with all configured
parameters and establishing the connection to begin transcription processing.
Args:
frame: The start frame containing initialization parameters and metadata.
"""
await super().start(frame)
url_params = [
f"model={self._model}",
f"sample_rate={self.sample_rate}",
f"encoding={self._flux_encoding}",
]
if self._params.eager_eot_threshold is not None:
url_params.append(f"eager_eot_threshold={self._params.eager_eot_threshold}")
if self._params.eot_threshold is not None:
url_params.append(f"eot_threshold={self._params.eot_threshold}")
if self._params.eot_timeout_ms is not None:
url_params.append(f"eot_timeout_ms={self._params.eot_timeout_ms}")
if self._params.mip_opt_out is not None:
url_params.append(f"mip_opt_out={str(self._params.mip_opt_out).lower()}")
# Add keyterm parameters (can have multiple)
for keyterm in self._params.keyterm:
url_params.append(f"keyterm={keyterm}")
# Add tag parameters (can have multiple)
for tag_value in self._params.tag:
url_params.append(f"tag={tag_value}")
self._websocket_url = f"{self._url}?{'&'.join(url_params)}"
await self._connect()
async def stop(self, frame: EndFrame):
"""Stop the Deepgram Flux STT service.
Args:
frame: The end frame.
"""
await super().stop(frame)
await self._disconnect()
async def cancel(self, frame: CancelFrame):
"""Cancel the Deepgram Flux STT service.
Args:
frame: The cancel frame.
"""
await super().cancel(frame)
await self._disconnect()
async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
"""Send audio data to Deepgram Flux for transcription.
Transmits raw audio bytes to the Deepgram Flux API for real-time speech
recognition. Transcription results are received asynchronously through
WebSocket callbacks and processed in the background.
Args:
audio: Raw audio bytes in linear16 format (signed little-endian 16-bit PCM).
Yields:
Frame: None (transcription results are delivered via WebSocket callbacks
rather than as return values from this method).
Raises:
Exception: If the WebSocket connection is not established or if there
are issues sending the audio data.
"""
if not self._websocket:
logger.error("Not connected to Deepgram Flux.")
yield ErrorFrame("Not connected to Deepgram Flux.", fatal=True)
return
try:
await self._websocket.send(audio)
except Exception as e:
logger.error(f"Failed to send audio to Flux: {e}")
yield ErrorFrame(f"Failed to send audio to Flux: {e}")
return
yield None
async def start_metrics(self):
"""Start TTFB and processing metrics collection."""
# TTFB (Time To First Byte) metrics are currently disabled for Deepgram Flux.
# Ideally, TTFB should measure the time from when a user starts speaking
# until we receive the first transcript. However, Deepgram Flux delivers
# both the "user started speaking" event and the first transcript simultaneously,
# making this timing measurement meaningless in this context.
# await self.start_ttfb_metrics()
await self.start_processing_metrics()
@traced_stt
async def _handle_transcription(
self, transcript: str, is_final: bool, language: Optional[Language] = None
):
"""Handle a transcription result with tracing."""
pass
def _get_websocket(self):
"""Get the current WebSocket connection.
Returns the active WebSocket connection instance, raising an exception
if no connection is currently established.
Returns:
The active WebSocket connection instance.
Raises:
Exception: If no WebSocket connection is currently active.
"""
if self._websocket:
return self._websocket
raise Exception("Websocket not connected")
def _validate_message(self, data: Dict[str, Any]) -> bool:
"""Validate basic message structure from Deepgram Flux.
Ensures the received message has the expected structure before processing.
Args:
data: The parsed JSON message data to validate.
Returns:
True if the message structure is valid, False otherwise.
"""
if not isinstance(data, dict):
logger.warning("Message is not a dictionary")
return False
if "type" not in data:
logger.warning("Message missing 'type' field")
return False
return True
async def _receive_messages(self):
"""Receive and process messages from WebSocket.
Continuously receives messages from the Deepgram Flux WebSocket connection
and processes various message types including connection status, transcription
results, turn information, and error conditions. Handles different event types
such as StartOfTurn, EndOfTurn, EagerEndOfTurn, and Update events.
"""
async for message in self._get_websocket():
if isinstance(message, str):
try:
data = json.loads(message)
await self._handle_message(data)
except json.JSONDecodeError as e:
logger.error(f"Failed to decode JSON message: {e}")
# Skip malformed messages
continue
except Exception as e:
logger.error(f"Error processing message: {e}")
# Error will be handled inside WebsocketService->_receive_task_handler
raise
else:
logger.warning(f"Received non-string message: {type(message)}")
async def _handle_message(self, data: Dict[str, Any]):
"""Handle a parsed WebSocket message from Deepgram Flux.
Routes messages to appropriate handlers based on their type. Validates
message structure before processing.
Args:
data: The parsed JSON message data from the WebSocket.
"""
if not self._validate_message(data):
return
message_type = data.get("type")
try:
flux_message_type = FluxMessageType(message_type)
except ValueError:
logger.debug(f"Unhandled message type: {message_type or 'unknown'}")
return
match flux_message_type:
case FluxMessageType.RECEIVE_CONNECTED:
await self._handle_connection_established()
case FluxMessageType.RECEIVE_FATAL_ERROR:
await self._handle_fatal_error(data)
case FluxMessageType.TURN_INFO:
await self._handle_turn_info(data)
async def _handle_connection_established(self):
"""Handle successful connection establishment to Deepgram Flux.
This event is fired when the WebSocket connection to Deepgram Flux
is successfully established and ready to receive audio data for
transcription processing.
"""
logger.info("Connected to Flux - ready to stream audio")
async def _handle_fatal_error(self, data: Dict[str, Any]):
"""Handle fatal error messages from Deepgram Flux.
Fatal errors indicate unrecoverable issues with the connection or
configuration that require intervention. These errors will cause
the connection to be terminated.
Args:
data: The error message data containing error details.
Raises:
Exception: Always raises to trigger error handling in the parent service.
"""
error_msg = data.get("error", "Unknown error")
deepgram_error = f"Fatal error: {error_msg}"
logger.error(deepgram_error)
# Error will be handled inside WebsocketService->_receive_task_handler
raise Exception(deepgram_error)
async def _handle_turn_info(self, data: Dict[str, Any]):
"""Handle TurnInfo events from Deepgram Flux.
TurnInfo messages contain various turn-based events that indicate
the state of speech processing, including turn boundaries, interim
results, and turn finalization events.
Args:
data: The TurnInfo message data containing event type, transcript and some extra metadata.
"""
event = data.get("event")
transcript = data.get("transcript", "")
try:
flux_event_type = FluxEventType(event)
except ValueError:
logger.debug(f"Unhandled TurnInfo event: {event}")
return
match flux_event_type:
case FluxEventType.START_OF_TURN:
await self._handle_start_of_turn(transcript)
case FluxEventType.TURN_RESUMED:
await self._handle_turn_resumed(event)
case FluxEventType.END_OF_TURN:
await self._handle_end_of_turn(transcript, data)
case FluxEventType.EAGER_END_OF_TURN:
await self._handle_eager_end_of_turn(transcript, data)
case FluxEventType.UPDATE:
await self._handle_update(transcript)
async def _handle_start_of_turn(self, transcript: str):
"""Handle StartOfTurn events from Deepgram Flux.
StartOfTurn events are fired when Deepgram Flux detects the beginning
of a new speaking turn. This triggers bot interruption to stop any
ongoing speech synthesis and signals the start of user speech detection.
The service will:
- Send a BotInterruptionFrame upstream to stop bot speech
- Send a UserStartedSpeakingFrame downstream to notify other components
- Start metrics collection for measuring response times
Args:
transcript: maybe the first few words of the turn.
"""
logger.debug("User started speaking")
await self.push_interruption_task_frame_and_wait()
await self.push_frame(UserStartedSpeakingFrame(), FrameDirection.DOWNSTREAM)
await self.push_frame(UserStartedSpeakingFrame(), FrameDirection.UPSTREAM)
await self.start_metrics()
if transcript:
logger.trace(f"Start of turn transcript: {transcript}")
async def _handle_turn_resumed(self, event: str):
"""Handle TurnResumed events from Deepgram Flux.
TurnResumed events indicate that speech has resumed after a brief pause
within the same turn. This is primarily used for logging and debugging
purposes and doesn't trigger any significant processing changes.
Args:
event: The event type string for logging purposes.
"""
logger.trace(f"Received event TurnResumed: {event}")
async def _handle_end_of_turn(self, transcript: str, data: Dict[str, Any]):
"""Handle EndOfTurn events from Deepgram Flux.
EndOfTurn events are fired when Deepgram Flux determines that a speaking
turn has concluded, either due to sufficient silence or end-of-turn
confidence thresholds being met. This provides the final transcript
for the completed turn.
The service will:
- Create and send a final TranscriptionFrame with the complete transcript
- Trigger transcription handling with tracing for metrics
- Stop processing metrics collection
- Send a UserStoppedSpeakingFrame to signal turn completion
Args:
transcript: The final transcript text for the completed turn.
data: The TurnInfo message data containing event type, transcript and some extra metadata.
"""
logger.debug("User stopped speaking")
await self.push_frame(
TranscriptionFrame(
transcript,
self._user_id,
time_now_iso8601(),
self._language,
result=data,
)
)
await self._handle_transcription(transcript, True, self._language)
await self.stop_processing_metrics()
await self.push_frame(UserStoppedSpeakingFrame(), FrameDirection.DOWNSTREAM)
await self.push_frame(UserStoppedSpeakingFrame(), FrameDirection.UPSTREAM)
async def _handle_eager_end_of_turn(self, transcript: str, data: Dict[str, Any]):
"""Handle EagerEndOfTurn events from Deepgram Flux.
EagerEndOfTurn events are fired when the end-of-turn confidence reaches the
EagerEndOfTurn threshold but hasn't yet reached the full end-of-turn threshold.
These provide interim transcripts that can be used for faster response
generation while still allowing the user to continue speaking.
EagerEndOfTurn events enable more responsive conversational AI by allowing
the LLM to start processing likely final transcripts before the turn
is definitively ended.
Args:
transcript: The interim transcript text that triggered the EagerEndOfTurn event.
data: The TurnInfo message data containing event type, transcript and some extra metadata.
"""
logger.trace(f"EagerEndOfTurn - {transcript}")
# Deepgram's EagerEndOfTurn feature enables lower-latency voice agents by sending
# medium-confidence transcripts before EndOfTurn certainty, allowing LLM processing to
# begin early.
#
# However, if speech resumes or the transcripts differ from the final EndOfTurn, the
# EagerEndOfTurn response should be cancelled to avoid incorrect or partial responses.
#
# Pipecat doesn't yet provide built-in Gate/control mechanisms to:
# 1. Start LLM/TTS processing early on EagerEndOfTurn events
# 2. Cancel in-flight processing when TurnResumed occurs
#
# By pushing EagerEndOfTurn transcripts as InterimTranscriptionFrame, we enable
# developers to implement custom EagerEndOfTurn handling in their applications while
# maintaining compatibility with existing interim transcription workflows.
#
# TODO: Implement proper EagerEndOfTurn support with cancellable processing pipeline
# that can start response generation on EagerEndOfTurn and cancel or confirm it.
await self.push_frame(
InterimTranscriptionFrame(
transcript,
self._user_id,
time_now_iso8601(),
self._language,
result=data,
)
)
async def _handle_update(self, transcript: str):
"""Handle Update events from Deepgram Flux.
Update events provide incremental transcript updates during an ongoing
turn. These events allow for real-time display of transcription progress
and can be used to provide visual feedback to users about what's being
recognized.
The service stops TTFB (Time To First Byte) metrics when the first
substantial update is received, indicating successful processing start.
Args:
transcript: The current partial transcript text for the ongoing turn.
"""
if transcript:
logger.trace(f"Update event: {transcript}")
# TTFB (Time To First Byte) metrics are currently disabled for Deepgram Flux.
# Ideally, TTFB should measure the time from when a user starts speaking
# until we receive the first transcript. However, Deepgram Flux delivers
# both the "user started speaking" event and the first transcript simultaneously,
# making this timing measurement meaningless in this context.
# await self.stop_ttfb_metrics()

View File

@@ -8,7 +8,6 @@ import sys
from pipecat.services import DeprecatedModuleProxy
from .stt import *
from .tts import *
sys.modules[__name__] = DeprecatedModuleProxy(globals(), "elevenlabs", "elevenlabs.[stt,tts]")
sys.modules[__name__] = DeprecatedModuleProxy(globals(), "elevenlabs", "elevenlabs.tts")

View File

@@ -168,24 +168,16 @@ def build_elevenlabs_voice_settings(
def calculate_word_times(
alignment_info: Mapping[str, Any],
cumulative_time: float,
partial_word: str = "",
partial_word_start_time: float = 0.0,
) -> tuple[List[Tuple[str, float]], str, float]:
alignment_info: Mapping[str, Any], cumulative_time: float
) -> List[Tuple[str, float]]:
"""Calculate word timestamps from character alignment information.
Args:
alignment_info: Character alignment data from ElevenLabs API.
cumulative_time: Base time offset for this chunk.
partial_word: Partial word carried over from previous chunk.
partial_word_start_time: Start time of the partial word.
Returns:
Tuple of (word_times, new_partial_word, new_partial_word_start_time):
- word_times: List of (word, timestamp) tuples for complete words
- new_partial_word: Incomplete word at end of chunk (empty if chunk ends with space)
- new_partial_word_start_time: Start time of the incomplete word
List of (word, timestamp) tuples.
"""
chars = alignment_info["chars"]
char_start_times_ms = alignment_info["charStartTimesMs"]
@@ -194,37 +186,41 @@ def calculate_word_times(
logger.error(
f"calculate_word_times: length mismatch - chars={len(chars)}, times={len(char_start_times_ms)}"
)
return ([], partial_word, partial_word_start_time)
return []
# Build words and track their start positions
words = []
word_start_times = []
current_word = partial_word # Start with any partial word from previous chunk
word_start_time = partial_word_start_time if partial_word else None
word_start_indices = []
current_word = ""
word_start_index = None
for i, char in enumerate(chars):
if char == " ":
# End of current word
if current_word: # Only add non-empty words
words.append(current_word)
word_start_times.append(word_start_time)
word_start_indices.append(word_start_index)
current_word = ""
word_start_time = None
word_start_index = None
else:
# Building a word
if word_start_time is None: # First character of new word
# Convert from milliseconds to seconds and add cumulative offset
word_start_time = cumulative_time + (char_start_times_ms[i] / 1000.0)
if word_start_index is None: # First character of new word
word_start_index = i
current_word += char
# Build result for complete words
word_times = list(zip(words, word_start_times))
# Handle the last word if there's no trailing space
if current_word and word_start_index is not None:
words.append(current_word)
word_start_indices.append(word_start_index)
# Return any incomplete word at the end of this chunk
new_partial_word = current_word if current_word else ""
new_partial_word_start_time = word_start_time if word_start_time is not None else 0.0
# Calculate timestamps for each word
word_times = []
for word, start_idx in zip(words, word_start_indices):
# Convert from milliseconds to seconds and add cumulative offset
start_time_seconds = cumulative_time + (char_start_times_ms[start_idx] / 1000.0)
word_times.append((word, start_time_seconds))
return (word_times, new_partial_word, new_partial_word_start_time)
return word_times
class ElevenLabsTTSService(AudioContextWordTTSService):
@@ -336,9 +332,6 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
# there's an interruption or TTSStoppedFrame.
self._started = False
self._cumulative_time = 0
# Track partial words that span across alignment chunks
self._partial_word = ""
self._partial_word_start_time = 0.0
# Context management for v1 multi API
self._context_id = None
@@ -528,7 +521,6 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
url, max_size=16 * 1024 * 1024, additional_headers={"xi-api-key": self._api_key}
)
await self._call_event_handler("on_connected")
except Exception as e:
logger.error(f"{self} initialization error: {e}")
self._websocket = None
@@ -551,7 +543,6 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
self._started = False
self._context_id = None
self._websocket = None
await self._call_event_handler("on_disconnected")
def _get_websocket(self):
if self._websocket:
@@ -579,8 +570,6 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
logger.error(f"Error closing context on interruption: {e}")
self._context_id = None
self._started = False
self._partial_word = ""
self._partial_word_start_time = 0.0
async def _receive_messages(self):
"""Handle incoming WebSocket messages from ElevenLabs."""
@@ -620,14 +609,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
if msg.get("alignment"):
alignment = msg["alignment"]
word_times, self._partial_word, self._partial_word_start_time = (
calculate_word_times(
alignment,
self._cumulative_time,
self._partial_word,
self._partial_word_start_time,
)
)
word_times = calculate_word_times(alignment, self._cumulative_time)
if word_times:
await self.add_word_timestamps(word_times)
@@ -701,8 +683,6 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
yield TTSStartedFrame()
self._started = True
self._cumulative_time = 0
self._partial_word = ""
self._partial_word_start_time = 0.0
# If a context ID does not exist, create a new one and
# register it. If an ID exists, that means the Pipeline is
# configured for allow_interruptions=False, so continue
@@ -776,7 +756,6 @@ class ElevenLabsHttpTTSService(WordTTSService):
base_url: str = "https://api.elevenlabs.io",
sample_rate: Optional[int] = None,
params: Optional[InputParams] = None,
aggregate_sentences: Optional[bool] = True,
**kwargs,
):
"""Initialize the ElevenLabs HTTP TTS service.
@@ -789,11 +768,10 @@ class ElevenLabsHttpTTSService(WordTTSService):
base_url: Base URL for ElevenLabs HTTP API.
sample_rate: Audio sample rate. If None, uses default.
params: Additional input parameters for voice customization.
aggregate_sentences: Whether to aggregate sentences within the TTSService.
**kwargs: Additional arguments passed to the parent service.
"""
super().__init__(
aggregate_sentences=aggregate_sentences,
aggregate_sentences=True,
push_text_frames=False,
push_stop_frames=True,
sample_rate=sample_rate,
@@ -831,10 +809,6 @@ class ElevenLabsHttpTTSService(WordTTSService):
# Store previous text for context within a turn
self._previous_text = ""
# Track partial words that span across alignment chunks
self._partial_word = ""
self._partial_word_start_time = 0.0
def language_to_service_language(self, language: Language) -> Optional[str]:
"""Convert pipecat Language to ElevenLabs language code.
@@ -862,8 +836,6 @@ class ElevenLabsHttpTTSService(WordTTSService):
self._cumulative_time = 0
self._started = False
self._previous_text = ""
self._partial_word = ""
self._partial_word_start_time = 0.0
logger.debug(f"{self}: Reset internal state")
async def start(self, frame: StartFrame):
@@ -898,13 +870,11 @@ class ElevenLabsHttpTTSService(WordTTSService):
def calculate_word_times(self, alignment_info: Mapping[str, Any]) -> List[Tuple[str, float]]:
"""Calculate word timing from character alignment data.
This method handles partial words that may span across multiple alignment chunks.
Args:
alignment_info: Character timing data from ElevenLabs.
Returns:
List of (word, timestamp) pairs for complete words in this chunk.
List of (word, timestamp) pairs.
Example input data::
@@ -930,28 +900,30 @@ class ElevenLabsHttpTTSService(WordTTSService):
# Build the words and find their start times
words = []
word_start_times = []
# Start with any partial word from previous chunk
current_word = self._partial_word
word_start_time = self._partial_word_start_time if self._partial_word else None
current_word = ""
first_char_idx = -1
for i, char in enumerate(chars):
if char == " ":
if current_word: # Only add non-empty words
words.append(current_word)
word_start_times.append(word_start_time)
current_word = ""
word_start_time = None
else:
if word_start_time is None: # First character of a new word
# Use time of the first character of the word, offset by cumulative time
word_start_time = self._cumulative_time + char_start_times[i]
word_start_times.append(
self._cumulative_time + char_start_times[first_char_idx]
)
current_word = ""
first_char_idx = -1
else:
if not current_word: # This is the first character of a new word
first_char_idx = i
current_word += char
# Store any incomplete word at the end of this chunk
self._partial_word = current_word if current_word else ""
self._partial_word_start_time = word_start_time if word_start_time is not None else 0.0
# Don't forget the last word if there's no trailing space
if current_word and first_char_idx >= 0:
words.append(current_word)
word_start_times.append(self._cumulative_time + char_start_times[first_char_idx])
# Create word-time pairs for complete words only
# Create word-time pairs
word_times = list(zip(words, word_start_times))
return word_times
@@ -987,9 +959,6 @@ class ElevenLabsHttpTTSService(WordTTSService):
if self._voice_settings:
payload["voice_settings"] = self._voice_settings
if self._settings["apply_text_normalization"] is not None:
payload["apply_text_normalization"] = self._settings["apply_text_normalization"]
language = self._settings["language"]
if self._model_name in ELEVENLABS_MULTILINGUAL_MODELS and language:
payload["language_code"] = language
@@ -1010,6 +979,8 @@ class ElevenLabsHttpTTSService(WordTTSService):
}
if self._settings["optimize_streaming_latency"] is not None:
params["optimize_streaming_latency"] = self._settings["optimize_streaming_latency"]
if self._settings["apply_text_normalization"] is not None:
params["apply_text_normalization"] = self._settings["apply_text_normalization"]
try:
await self.start_ttfb_metrics()
@@ -1070,14 +1041,6 @@ class ElevenLabsHttpTTSService(WordTTSService):
logger.error(f"Error processing response: {e}", exc_info=True)
continue
# After processing all chunks, emit any remaining partial word
# since this is the end of the utterance
if self._partial_word:
final_word_time = [(self._partial_word, self._partial_word_start_time)]
await self.add_word_timestamps(final_word_time)
self._partial_word = ""
self._partial_word_start_time = 0.0
# After processing all chunks, add the total utterance duration
# to the cumulative time to ensure next utterance starts after this one
if utterance_duration > 0:

View File

@@ -225,8 +225,6 @@ class FishAudioTTSService(InterruptibleTTSService):
start_message = {"event": "start", "request": {"text": "", **self._settings}}
await self._websocket.send(ormsgpack.packb(start_message))
logger.debug("Sent start message to Fish Audio")
await self._call_event_handler("on_connected")
except Exception as e:
logger.error(f"Fish Audio initialization error: {e}")
self._websocket = None
@@ -247,7 +245,6 @@ class FishAudioTTSService(InterruptibleTTSService):
self._request_id = None
self._started = False
self._websocket = None
await self._call_event_handler("on_disconnected")
async def flush_audio(self):
"""Flush any buffered audio by sending a flush event to Fish Audio."""

View File

@@ -4,41 +4,527 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
"""Event models and utilities for Google Gemini Multimodal Live API.
"""Event models and utilities for Google Gemini Multimodal Live API."""
.. deprecated:: 0.0.90
Importing StartSensitivity and EndSensitivity from this module is deprecated.
Import them directly from google.genai.types instead.
"""
import base64
import io
import json
from enum import Enum
from typing import List, Literal, Optional
import warnings
from PIL import Image
from pydantic import BaseModel, Field
from loguru import logger
from pipecat.frames.frames import ImageRawFrame
try:
from google.genai.types import (
EndSensitivity as _EndSensitivity,
)
from google.genai.types import (
StartSensitivity as _StartSensitivity,
)
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error("In order to use Google AI, you need to `pip install pipecat-ai[google]`.")
raise Exception(f"Missing module: {e}")
#
# Client events
#
# These aliases are just here for backward compatibility, since we used to
# define public-facing StartSensitivity and EndSensitivity enums in this
# module.
with warnings.catch_warnings():
warnings.simplefilter("always")
warnings.warn(
"Importing StartSensitivity and EndSensitivity from "
"pipecat.services.gemini_multimodal_live.events is deprecated. "
"Please import them directly from google.genai.types instead.",
DeprecationWarning,
stacklevel=2,
)
StartSensitivity = _StartSensitivity
EndSensitivity = _EndSensitivity
class MediaChunk(BaseModel):
"""Represents a chunk of media data for transmission.
Parameters:
mimeType: MIME type of the media content.
data: Base64-encoded media data.
"""
mimeType: str
data: str
class ContentPart(BaseModel):
"""Represents a part of content that can contain text or media.
Parameters:
text: Text content. Defaults to None.
inlineData: Inline media data. Defaults to None.
"""
text: Optional[str] = Field(default=None, validate_default=False)
inlineData: Optional[MediaChunk] = Field(default=None, validate_default=False)
fileData: Optional["FileData"] = Field(default=None, validate_default=False)
class FileData(BaseModel):
"""Represents a file reference in the Gemini File API."""
mimeType: str
fileUri: str
ContentPart.model_rebuild() # Rebuild model to resolve forward reference
class Turn(BaseModel):
"""Represents a conversational turn in the dialogue.
Parameters:
role: The role of the speaker, either "user" or "model". Defaults to "user".
parts: List of content parts that make up the turn.
"""
role: Literal["user", "model"] = "user"
parts: List[ContentPart]
class StartSensitivity(str, Enum):
"""Determines how start of speech is detected."""
UNSPECIFIED = "START_SENSITIVITY_UNSPECIFIED" # Default is HIGH
HIGH = "START_SENSITIVITY_HIGH" # Detect start of speech more often
LOW = "START_SENSITIVITY_LOW" # Detect start of speech less often
class EndSensitivity(str, Enum):
"""Determines how end of speech is detected."""
UNSPECIFIED = "END_SENSITIVITY_UNSPECIFIED" # Default is HIGH
HIGH = "END_SENSITIVITY_HIGH" # End speech more often
LOW = "END_SENSITIVITY_LOW" # End speech less often
class AutomaticActivityDetection(BaseModel):
"""Configures automatic detection of voice activity.
Parameters:
disabled: Whether automatic activity detection is disabled. Defaults to None.
start_of_speech_sensitivity: Sensitivity for detecting speech start. Defaults to None.
prefix_padding_ms: Padding before speech start in milliseconds. Defaults to None.
end_of_speech_sensitivity: Sensitivity for detecting speech end. Defaults to None.
silence_duration_ms: Duration of silence to detect speech end. Defaults to None.
"""
disabled: Optional[bool] = None
start_of_speech_sensitivity: Optional[StartSensitivity] = None
prefix_padding_ms: Optional[int] = None
end_of_speech_sensitivity: Optional[EndSensitivity] = None
silence_duration_ms: Optional[int] = None
class RealtimeInputConfig(BaseModel):
"""Configures the realtime input behavior.
Parameters:
automatic_activity_detection: Voice activity detection configuration. Defaults to None.
"""
automatic_activity_detection: Optional[AutomaticActivityDetection] = None
class RealtimeInput(BaseModel):
"""Contains realtime input media chunks and text.
Parameters:
mediaChunks: List of media chunks for realtime processing.
text: Text for realtime processing.
"""
mediaChunks: Optional[List[MediaChunk]] = None
text: Optional[str] = None
class ClientContent(BaseModel):
"""Content sent from client to the Gemini Live API.
Parameters:
turns: List of conversation turns. Defaults to None.
turnComplete: Whether the client's turn is complete. Defaults to False.
"""
turns: Optional[List[Turn]] = None
turnComplete: bool = False
class AudioInputMessage(BaseModel):
"""Message containing audio input data.
Parameters:
realtimeInput: Realtime input containing audio chunks.
"""
realtimeInput: RealtimeInput
@classmethod
def from_raw_audio(cls, raw_audio: bytes, sample_rate: int) -> "AudioInputMessage":
"""Create an audio input message from raw audio data.
Args:
raw_audio: Raw audio bytes.
sample_rate: Audio sample rate in Hz.
Returns:
AudioInputMessage instance with encoded audio data.
"""
data = base64.b64encode(raw_audio).decode("utf-8")
return cls(
realtimeInput=RealtimeInput(
mediaChunks=[MediaChunk(mimeType=f"audio/pcm;rate={sample_rate}", data=data)]
)
)
class VideoInputMessage(BaseModel):
"""Message containing video/image input data.
Parameters:
realtimeInput: Realtime input containing video/image chunks.
"""
realtimeInput: RealtimeInput
@classmethod
def from_image_frame(cls, frame: ImageRawFrame) -> "VideoInputMessage":
"""Create a video input message from an image frame.
Args:
frame: Image frame to encode.
Returns:
VideoInputMessage instance with encoded image data.
"""
buffer = io.BytesIO()
Image.frombytes(frame.format, frame.size, frame.image).save(buffer, format="JPEG")
data = base64.b64encode(buffer.getvalue()).decode("utf-8")
return cls(
realtimeInput=RealtimeInput(mediaChunks=[MediaChunk(mimeType=f"image/jpeg", data=data)])
)
class TextInputMessage(BaseModel):
"""Message containing text input data."""
realtimeInput: RealtimeInput
@classmethod
def from_text(cls, text: str) -> "TextInputMessage":
"""Create a text input message from a string.
Args:
text: The text to send.
Returns:
A TextInputMessage instance.
"""
return cls(realtimeInput=RealtimeInput(text=text))
class ClientContentMessage(BaseModel):
"""Message containing client content for the API.
Parameters:
clientContent: The client content to send.
"""
clientContent: ClientContent
class SystemInstruction(BaseModel):
"""System instruction for the model.
Parameters:
parts: List of content parts that make up the system instruction.
"""
parts: List[ContentPart]
class AudioTranscriptionConfig(BaseModel):
"""Configuration for audio transcription."""
pass
class Setup(BaseModel):
"""Setup configuration for the Gemini Live session.
Parameters:
model: Model identifier to use.
system_instruction: System instruction for the model. Defaults to None.
tools: List of available tools/functions. Defaults to None.
generation_config: Generation configuration parameters. Defaults to None.
input_audio_transcription: Input audio transcription config. Defaults to None.
output_audio_transcription: Output audio transcription config. Defaults to None.
realtime_input_config: Realtime input configuration. Defaults to None.
"""
model: str
system_instruction: Optional[SystemInstruction] = None
tools: Optional[List[dict]] = None
generation_config: Optional[dict] = None
input_audio_transcription: Optional[AudioTranscriptionConfig] = None
output_audio_transcription: Optional[AudioTranscriptionConfig] = None
realtime_input_config: Optional[RealtimeInputConfig] = None
class Config(BaseModel):
"""Configuration message for session setup.
Parameters:
setup: Setup configuration for the session.
"""
setup: Setup
#
# Grounding metadata models
#
class SearchEntryPoint(BaseModel):
"""Represents the search entry point with rendered content for search suggestions."""
renderedContent: Optional[str] = None
class WebSource(BaseModel):
"""Represents a web source from grounding chunks."""
uri: Optional[str] = None
title: Optional[str] = None
class GroundingChunk(BaseModel):
"""Represents a grounding chunk containing web source information."""
web: Optional[WebSource] = None
class GroundingSegment(BaseModel):
"""Represents a segment of text that is grounded."""
startIndex: Optional[int] = None
endIndex: Optional[int] = None
text: Optional[str] = None
class GroundingSupport(BaseModel):
"""Represents support information for grounded text segments."""
segment: Optional[GroundingSegment] = None
groundingChunkIndices: Optional[List[int]] = None
confidenceScores: Optional[List[float]] = None
class GroundingMetadata(BaseModel):
"""Represents grounding metadata from Google Search."""
searchEntryPoint: Optional[SearchEntryPoint] = None
groundingChunks: Optional[List[GroundingChunk]] = None
groundingSupports: Optional[List[GroundingSupport]] = None
webSearchQueries: Optional[List[str]] = None
#
# Server events
#
class SetupComplete(BaseModel):
"""Indicates that session setup is complete."""
pass
class InlineData(BaseModel):
"""Inline data embedded in server responses.
Parameters:
mimeType: MIME type of the data.
data: Base64-encoded data content.
"""
mimeType: str
data: str
class Part(BaseModel):
"""Part of a server response containing data or text.
Parameters:
inlineData: Inline binary data. Defaults to None.
text: Text content. Defaults to None.
"""
inlineData: Optional[InlineData] = None
text: Optional[str] = None
class ModelTurn(BaseModel):
"""Represents a turn from the model in the conversation.
Parameters:
parts: List of content parts in the model's response.
"""
parts: List[Part]
class ServerContentInterrupted(BaseModel):
"""Indicates server content was interrupted.
Parameters:
interrupted: Whether the content was interrupted.
"""
interrupted: bool
class ServerContentTurnComplete(BaseModel):
"""Indicates the server's turn is complete.
Parameters:
turnComplete: Whether the turn is complete.
"""
turnComplete: bool
class BidiGenerateContentTranscription(BaseModel):
"""Transcription data from bidirectional content generation.
Parameters:
text: The transcribed text content.
"""
text: str
class ServerContent(BaseModel):
"""Content sent from server to client.
Parameters:
modelTurn: Model's conversational turn. Defaults to None.
interrupted: Whether content was interrupted. Defaults to None.
turnComplete: Whether the turn is complete. Defaults to None.
inputTranscription: Transcription of input audio. Defaults to None.
outputTranscription: Transcription of output audio. Defaults to None.
"""
modelTurn: Optional[ModelTurn] = None
interrupted: Optional[bool] = None
turnComplete: Optional[bool] = None
inputTranscription: Optional[BidiGenerateContentTranscription] = None
outputTranscription: Optional[BidiGenerateContentTranscription] = None
groundingMetadata: Optional[GroundingMetadata] = None
class FunctionCall(BaseModel):
"""Represents a function call from the model.
Parameters:
id: Unique identifier for the function call.
name: Name of the function to call.
args: Arguments to pass to the function.
"""
id: str
name: str
args: dict
class ToolCall(BaseModel):
"""Contains one or more function calls.
Parameters:
functionCalls: List of function calls to execute.
"""
functionCalls: List[FunctionCall]
class Modality(str, Enum):
"""Modality types in token counts."""
UNSPECIFIED = "MODALITY_UNSPECIFIED"
TEXT = "TEXT"
IMAGE = "IMAGE"
AUDIO = "AUDIO"
VIDEO = "VIDEO"
class ModalityTokenCount(BaseModel):
"""Token count for a specific modality.
Parameters:
modality: The modality type.
tokenCount: Number of tokens for this modality.
"""
modality: Modality
tokenCount: int
class UsageMetadata(BaseModel):
"""Usage metadata about the API response.
Parameters:
promptTokenCount: Number of tokens in the prompt. Defaults to None.
cachedContentTokenCount: Number of cached content tokens. Defaults to None.
responseTokenCount: Number of tokens in the response. Defaults to None.
toolUsePromptTokenCount: Number of tokens for tool use prompts. Defaults to None.
thoughtsTokenCount: Number of tokens for model thoughts. Defaults to None.
totalTokenCount: Total number of tokens used. Defaults to None.
promptTokensDetails: Detailed breakdown of prompt tokens by modality. Defaults to None.
cacheTokensDetails: Detailed breakdown of cache tokens by modality. Defaults to None.
responseTokensDetails: Detailed breakdown of response tokens by modality. Defaults to None.
toolUsePromptTokensDetails: Detailed breakdown of tool use tokens by modality. Defaults to None.
"""
promptTokenCount: Optional[int] = None
cachedContentTokenCount: Optional[int] = None
responseTokenCount: Optional[int] = None
toolUsePromptTokenCount: Optional[int] = None
thoughtsTokenCount: Optional[int] = None
totalTokenCount: Optional[int] = None
promptTokensDetails: Optional[List[ModalityTokenCount]] = None
cacheTokensDetails: Optional[List[ModalityTokenCount]] = None
responseTokensDetails: Optional[List[ModalityTokenCount]] = None
toolUsePromptTokensDetails: Optional[List[ModalityTokenCount]] = None
class ServerEvent(BaseModel):
"""Server event received from the Gemini Live API.
Parameters:
setupComplete: Setup completion notification. Defaults to None.
serverContent: Content from the server. Defaults to None.
toolCall: Tool/function call request. Defaults to None.
usageMetadata: Token usage metadata. Defaults to None.
"""
setupComplete: Optional[SetupComplete] = None
serverContent: Optional[ServerContent] = None
toolCall: Optional[ToolCall] = None
usageMetadata: Optional[UsageMetadata] = None
def parse_server_event(str):
"""Parse a server event from JSON string.
Args:
str: JSON string containing the server event.
Returns:
ServerEvent instance if parsing succeeds, None otherwise.
"""
try:
evt = json.loads(str)
return ServerEvent.model_validate(evt)
except Exception as e:
print(f"Error parsing server event: {e}")
return None
class ContextWindowCompressionConfig(BaseModel):
"""Configuration for context window compression.
Parameters:
sliding_window: Whether to use sliding window compression. Defaults to True.
trigger_tokens: Token count threshold to trigger compression. Defaults to None.
"""
sliding_window: Optional[bool] = Field(default=True)
trigger_tokens: Optional[int] = Field(default=None)

View File

@@ -9,31 +9,181 @@
This module provides a client for Google's Gemini File API, enabling file
uploads, metadata retrieval, listing, and deletion. Files uploaded through
this API can be referenced in Gemini generative model calls.
.. deprecated:: 0.0.90
Importing GeminiFileAPI from this module is deprecated.
Import it from pipecat.services.google.gemini_live.file_api instead.
"""
import warnings
import mimetypes
from typing import Any, Dict, Optional
import aiohttp
from loguru import logger
try:
from pipecat.services.google.gemini_live.file_api import GeminiFileAPI as _GeminiFileAPI
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error("In order to use Google AI, you need to `pip install pipecat-ai[google]`.")
raise Exception(f"Missing module: {e}")
# These aliases are just here for backward compatibility, since we used to
# define public-facing StartSensitivity and EndSensitivity enums in this
# module.
warnings.warn(
"Importing GeminiFileAPI from "
"pipecat.services.gemini_multimodal_live.file_api is deprecated. "
"Please import it from pipecat.services.google.gemini_live.file_api instead.",
DeprecationWarning,
stacklevel=2,
)
GeminiFileAPI = _GeminiFileAPI
class GeminiFileAPI:
"""Client for the Gemini File API.
This class provides methods for uploading, fetching, listing, and deleting files
through Google's Gemini File API.
Files uploaded through this API remain available for 48 hours and can be referenced
in calls to the Gemini generative models. Maximum file size is 2GB, with total
project storage limited to 20GB.
"""
def __init__(
self, api_key: str, base_url: str = "https://generativelanguage.googleapis.com/v1beta/files"
):
"""Initialize the Gemini File API client.
Args:
api_key: Google AI API key
base_url: Base URL for the Gemini File API (default is the v1beta endpoint)
"""
self._api_key = api_key
self._base_url = base_url
# Upload URL uses the /upload/ path
self.upload_base_url = "https://generativelanguage.googleapis.com/upload/v1beta/files"
async def upload_file(
self, file_path: str, display_name: Optional[str] = None
) -> Dict[str, Any]:
"""Upload a file to the Gemini File API using the correct resumable upload protocol.
Args:
file_path: Path to the file to upload
display_name: Optional display name for the file
Returns:
File metadata including uri, name, and display_name
"""
logger.info(f"Uploading file: {file_path}")
async with aiohttp.ClientSession() as session:
# Determine the file's MIME type
mime_type, _ = mimetypes.guess_type(file_path)
if not mime_type:
mime_type = "application/octet-stream"
# Read the file
with open(file_path, "rb") as f:
file_data = f.read()
# Create the metadata payload
metadata = {}
if display_name:
metadata = {"file": {"display_name": display_name}}
# Step 1: Initial resumable request to get upload URL
headers = {
"X-Goog-Upload-Protocol": "resumable",
"X-Goog-Upload-Command": "start",
"X-Goog-Upload-Header-Content-Length": str(len(file_data)),
"X-Goog-Upload-Header-Content-Type": mime_type,
"Content-Type": "application/json",
}
logger.debug(f"Step 1: Getting upload URL from {self.upload_base_url}")
async with session.post(
f"{self.upload_base_url}?key={self._api_key}", headers=headers, json=metadata
) as response:
if response.status != 200:
error_text = await response.text()
logger.error(f"Error initiating file upload: {error_text}")
raise Exception(f"Failed to initiate upload: {response.status} - {error_text}")
# Get the upload URL from the response header
upload_url = response.headers.get("X-Goog-Upload-URL")
if not upload_url:
logger.error(f"Response headers: {dict(response.headers)}")
raise Exception("No upload URL in response headers")
logger.debug(f"Got upload URL: {upload_url}")
# Step 2: Upload the actual file data
upload_headers = {
"Content-Length": str(len(file_data)),
"X-Goog-Upload-Offset": "0",
"X-Goog-Upload-Command": "upload, finalize",
}
logger.debug(f"Step 2: Uploading file data to {upload_url}")
async with session.post(upload_url, headers=upload_headers, data=file_data) as response:
if response.status != 200:
error_text = await response.text()
logger.error(f"Error uploading file data: {error_text}")
raise Exception(f"Failed to upload file: {response.status} - {error_text}")
file_info = await response.json()
logger.info(f"File uploaded successfully: {file_info.get('file', {}).get('name')}")
return file_info
async def get_file(self, name: str) -> Dict[str, Any]:
"""Get metadata for a file.
Args:
name: File name (or full path)
Returns:
File metadata
"""
# Extract just the name part if a full path is provided
if "/" in name:
name = name.split("/")[-1]
async with aiohttp.ClientSession() as session:
async with session.get(f"{self._base_url}/{name}?key={self._api_key}") as response:
if response.status != 200:
error_text = await response.text()
logger.error(f"Error getting file metadata: {error_text}")
raise Exception(f"Failed to get file metadata: {response.status}")
file_info = await response.json()
return file_info
async def list_files(
self, page_size: int = 10, page_token: Optional[str] = None
) -> Dict[str, Any]:
"""List uploaded files.
Args:
page_size: Number of files to return per page
page_token: Token for pagination
Returns:
List of files and next page token if available
"""
params = {"key": self._api_key, "pageSize": page_size}
if page_token:
params["pageToken"] = page_token
async with aiohttp.ClientSession() as session:
async with session.get(self._base_url, params=params) as response:
if response.status != 200:
error_text = await response.text()
logger.error(f"Error listing files: {error_text}")
raise Exception(f"Failed to list files: {response.status}")
result = await response.json()
return result
async def delete_file(self, name: str) -> bool:
"""Delete a file.
Args:
name: File name (or full path)
Returns:
True if deleted successfully
"""
# Extract just the name part if a full path is provided
if "/" in name:
name = name.split("/")[-1]
async with aiohttp.ClientSession() as session:
async with session.delete(f"{self._base_url}/{name}?key={self._api_key}") as response:
if response.status != 200:
error_text = await response.text()
logger.error(f"Error deleting file: {error_text}")
raise Exception(f"Failed to delete file: {response.status}")
return True

File diff suppressed because it is too large Load Diff

View File

@@ -9,7 +9,6 @@ import sys
from pipecat.services import DeprecatedModuleProxy
from .frames import *
from .gemini_live import *
from .image import *
from .llm import *
from .llm_openai import *

View File

@@ -1,3 +0,0 @@
from .file_api import GeminiFileAPI
from .llm import GeminiLiveLLMService
from .llm_vertex import GeminiLiveVertexLLMService

View File

@@ -1,189 +0,0 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
"""Gemini File API client for uploading and managing files.
This module provides a client for Google's Gemini File API, enabling file
uploads, metadata retrieval, listing, and deletion. Files uploaded through
this API can be referenced in Gemini generative model calls.
"""
import mimetypes
from typing import Any, Dict, Optional
import aiohttp
from loguru import logger
class GeminiFileAPI:
"""Client for the Gemini File API.
This class provides methods for uploading, fetching, listing, and deleting files
through Google's Gemini File API.
Files uploaded through this API remain available for 48 hours and can be referenced
in calls to the Gemini generative models. Maximum file size is 2GB, with total
project storage limited to 20GB.
"""
def __init__(
self, api_key: str, base_url: str = "https://generativelanguage.googleapis.com/v1beta/files"
):
"""Initialize the Gemini File API client.
Args:
api_key: Google AI API key
base_url: Base URL for the Gemini File API (default is the v1beta endpoint)
"""
self._api_key = api_key
self._base_url = base_url
# Upload URL uses the /upload/ path
self.upload_base_url = "https://generativelanguage.googleapis.com/upload/v1beta/files"
async def upload_file(
self, file_path: str, display_name: Optional[str] = None
) -> Dict[str, Any]:
"""Upload a file to the Gemini File API using the correct resumable upload protocol.
Args:
file_path: Path to the file to upload
display_name: Optional display name for the file
Returns:
File metadata including uri, name, and display_name
"""
logger.info(f"Uploading file: {file_path}")
async with aiohttp.ClientSession() as session:
# Determine the file's MIME type
mime_type, _ = mimetypes.guess_type(file_path)
if not mime_type:
mime_type = "application/octet-stream"
# Read the file
with open(file_path, "rb") as f:
file_data = f.read()
# Create the metadata payload
metadata = {}
if display_name:
metadata = {"file": {"display_name": display_name}}
# Step 1: Initial resumable request to get upload URL
headers = {
"X-Goog-Upload-Protocol": "resumable",
"X-Goog-Upload-Command": "start",
"X-Goog-Upload-Header-Content-Length": str(len(file_data)),
"X-Goog-Upload-Header-Content-Type": mime_type,
"Content-Type": "application/json",
}
logger.debug(f"Step 1: Getting upload URL from {self.upload_base_url}")
async with session.post(
f"{self.upload_base_url}?key={self._api_key}", headers=headers, json=metadata
) as response:
if response.status != 200:
error_text = await response.text()
logger.error(f"Error initiating file upload: {error_text}")
raise Exception(f"Failed to initiate upload: {response.status} - {error_text}")
# Get the upload URL from the response header
upload_url = response.headers.get("X-Goog-Upload-URL")
if not upload_url:
logger.error(f"Response headers: {dict(response.headers)}")
raise Exception("No upload URL in response headers")
logger.debug(f"Got upload URL: {upload_url}")
# Step 2: Upload the actual file data
upload_headers = {
"Content-Length": str(len(file_data)),
"X-Goog-Upload-Offset": "0",
"X-Goog-Upload-Command": "upload, finalize",
}
logger.debug(f"Step 2: Uploading file data to {upload_url}")
async with session.post(upload_url, headers=upload_headers, data=file_data) as response:
if response.status != 200:
error_text = await response.text()
logger.error(f"Error uploading file data: {error_text}")
raise Exception(f"Failed to upload file: {response.status} - {error_text}")
file_info = await response.json()
logger.info(f"File uploaded successfully: {file_info.get('file', {}).get('name')}")
return file_info
async def get_file(self, name: str) -> Dict[str, Any]:
"""Get metadata for a file.
Args:
name: File name (or full path)
Returns:
File metadata
"""
# Extract just the name part if a full path is provided
if "/" in name:
name = name.split("/")[-1]
async with aiohttp.ClientSession() as session:
async with session.get(f"{self._base_url}/{name}?key={self._api_key}") as response:
if response.status != 200:
error_text = await response.text()
logger.error(f"Error getting file metadata: {error_text}")
raise Exception(f"Failed to get file metadata: {response.status}")
file_info = await response.json()
return file_info
async def list_files(
self, page_size: int = 10, page_token: Optional[str] = None
) -> Dict[str, Any]:
"""List uploaded files.
Args:
page_size: Number of files to return per page
page_token: Token for pagination
Returns:
List of files and next page token if available
"""
params = {"key": self._api_key, "pageSize": page_size}
if page_token:
params["pageToken"] = page_token
async with aiohttp.ClientSession() as session:
async with session.get(self._base_url, params=params) as response:
if response.status != 200:
error_text = await response.text()
logger.error(f"Error listing files: {error_text}")
raise Exception(f"Failed to list files: {response.status}")
result = await response.json()
return result
async def delete_file(self, name: str) -> bool:
"""Delete a file.
Args:
name: File name (or full path)
Returns:
True if deleted successfully
"""
# Extract just the name part if a full path is provided
if "/" in name:
name = name.split("/")[-1]
async with aiohttp.ClientSession() as session:
async with session.delete(f"{self._base_url}/{name}?key={self._api_key}") as response:
if response.status != 200:
error_text = await response.text()
logger.error(f"Error deleting file: {error_text}")
raise Exception(f"Failed to delete file: {response.status}")
return True

File diff suppressed because it is too large Load Diff

View File

@@ -1,184 +0,0 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
"""Service for accessing Gemini Live via Google Vertex AI.
This module provides integration with Google's Gemini Live model via
Vertex AI, supporting both text and audio modalities with voice transcription,
streaming responses, and tool usage.
"""
import json
from typing import List, Optional, Union
from loguru import logger
from pipecat.adapters.schemas.tools_schema import ToolsSchema
from pipecat.services.google.gemini_live.llm import (
GeminiLiveLLMService,
HttpOptions,
InputParams,
)
try:
from google.auth import default
from google.auth.exceptions import GoogleAuthError
from google.auth.transport.requests import Request
from google.genai import Client
from google.oauth2 import service_account
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error("In order to use Google Vertex AI, you need to `pip install pipecat-ai[google]`.")
raise Exception(f"Missing module: {e}")
class GeminiLiveVertexLLMService(GeminiLiveLLMService):
"""Provides access to Google's Gemini Live model via Vertex AI.
This service enables real-time conversations with Gemini, supporting both
text and audio modalities. It handles voice transcription, streaming audio
responses, and tool usage.
"""
def __init__(
self,
*,
credentials: Optional[str] = None,
credentials_path: Optional[str] = None,
location: str,
project_id: str,
model="google/gemini-2.0-flash-live-preview-04-09",
voice_id: str = "Charon",
start_audio_paused: bool = False,
start_video_paused: bool = False,
system_instruction: Optional[str] = None,
tools: Optional[Union[List[dict], ToolsSchema]] = None,
params: Optional[InputParams] = None,
inference_on_context_initialization: bool = True,
file_api_base_url: str = "https://generativelanguage.googleapis.com/v1beta/files",
http_options: Optional[HttpOptions] = None,
**kwargs,
):
"""Initialize the service for accessing Gemini Live via Google Vertex AI.
Args:
credentials: JSON string of service account credentials.
credentials_path: Path to the service account JSON file.
location: GCP region for Vertex AI endpoint (e.g., "us-east4").
project_id: Google Cloud project ID.
model: Model identifier to use. Defaults to "models/gemini-2.0-flash-live-preview-04-09".
voice_id: TTS voice identifier. Defaults to "Charon".
start_audio_paused: Whether to start with audio input paused. Defaults to False.
start_video_paused: Whether to start with video input paused. Defaults to False.
system_instruction: System prompt for the model. Defaults to None.
tools: Tools/functions available to the model. Defaults to None.
params: Configuration parameters for the model along with Vertex AI
location and project ID.
inference_on_context_initialization: Whether to generate a response when context
is first set. Defaults to True.
file_api_base_url: Base URL for the Gemini File API. Defaults to the official endpoint.
http_options: HTTP options for the client.
**kwargs: Additional arguments passed to parent GeminiLiveLLMService.
"""
# Check if user incorrectly passed api_key, which is used by parent
# class but not here.
if "api_key" in kwargs:
logger.error(
"GeminiLiveVertexLLMService does not accept 'api_key' parameter. "
"Use 'credentials' or 'credentials_path' instead for Vertex AI authentication."
)
raise ValueError(
"Invalid parameter 'api_key'. Use 'credentials' or 'credentials_path' for Vertex AI authentication."
)
# These need to be set before calling super().__init__() because
# super().__init__() invokes create_client(), which needs these.
self._credentials = self._get_credentials(credentials, credentials_path)
self._project_id = project_id
self._location = location
# Call parent constructor with the obtained API key
super().__init__(
# api_key is required by parent class, but actually not used with
# Vertex
api_key="dummy",
model=model,
voice_id=voice_id,
start_audio_paused=start_audio_paused,
start_video_paused=start_video_paused,
system_instruction=system_instruction,
tools=tools,
params=params,
inference_on_context_initialization=inference_on_context_initialization,
file_api_base_url=file_api_base_url,
http_options=http_options,
**kwargs,
)
def create_client(self):
"""Create the Gemini client instance."""
self._client = Client(
vertexai=True,
credentials=self._credentials,
project=self._project_id,
location=self._location,
)
@property
def file_api(self):
"""Gemini File API is not supported with Vertex AI."""
raise NotImplementedError(
"When using Vertex AI, the recommended approach is to use Google Cloud Storage for file handling. The Gemini File API is not directly supported in this context."
)
@staticmethod
def _get_credentials(credentials: Optional[str], credentials_path: Optional[str]) -> str:
"""Retrieve Credentials using Google service account credentials JSON.
Supports multiple authentication methods:
1. Direct JSON credentials string
2. Path to service account JSON file
3. Default application credentials (ADC)
Args:
credentials: JSON string of service account credentials.
credentials_path: Path to the service account JSON file.
Returns:
OAuth token for API authentication.
Raises:
ValueError: If no valid credentials are provided or found.
"""
creds: Optional[service_account.Credentials] = None
if credentials:
# Parse and load credentials from JSON string
creds = service_account.Credentials.from_service_account_info(
json.loads(credentials),
scopes=["https://www.googleapis.com/auth/cloud-platform"],
)
elif credentials_path:
# Load credentials from JSON file
creds = service_account.Credentials.from_service_account_file(
credentials_path,
scopes=["https://www.googleapis.com/auth/cloud-platform"],
)
else:
try:
creds, project_id = default(
scopes=["https://www.googleapis.com/auth/cloud-platform"]
)
except GoogleAuthError:
pass
if not creds:
raise ValueError("No valid credentials provided.")
creds.refresh(Request()) # Ensure token is up-to-date, lifetime is 1 hour.
return creds

View File

@@ -35,7 +35,6 @@ from pipecat.frames.frames import (
LLMMessagesFrame,
LLMTextFrame,
LLMUpdateSettingsFrame,
OutputImageRawFrame,
UserImageRawFrame,
)
from pipecat.metrics.metrics import LLMTokenUsage
@@ -73,9 +72,6 @@ try:
HttpOptions,
Part,
)
# Temporary hack to be able to process Nano Banana returned images.
genai._api_client.READ_BUFFER_SIZE = 5 * 1024 * 1024
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error("In order to use Google AI, you need to `pip install pipecat-ai[google]`.")
@@ -686,7 +682,7 @@ class GoogleLLMService(LLMService):
self,
*,
api_key: str,
model: str = "gemini-2.5-flash",
model: str = "gemini-2.0-flash",
params: Optional[InputParams] = None,
system_instruction: Optional[str] = None,
tools: Optional[List[Dict[str, Any]]] = None,
@@ -714,7 +710,6 @@ class GoogleLLMService(LLMService):
self._api_key = api_key
self._system_instruction = system_instruction
self._http_options = http_options
self._create_client(api_key, http_options)
self._settings = {
"max_tokens": params.max_tokens,
@@ -793,9 +788,6 @@ class GoogleLLMService(LLMService):
# and can be configured to turn it off.
if not self._model_name.startswith("gemini-2.5-flash"):
return
# If we have an image model, we don't use a budget either.
if "image" in self._model_name:
return
# If thinking_config is already set, don't override it.
if "thinking_config" in generation_params:
return
@@ -935,12 +927,6 @@ class GoogleLLMService(LLMService):
arguments=function_call.args or {},
)
)
elif part.inline_data and part.inline_data.data:
image = Image.open(io.BytesIO(part.inline_data.data))
frame = OutputImageRawFrame(
image=image.tobytes(), size=image.size, format="RGB"
)
await self.push_frame(frame)
if (
candidate.grounding_metadata

View File

@@ -94,9 +94,9 @@ class GoogleLLMOpenAIBetaService(OpenAILLMService):
async for chunk in chunk_stream:
if chunk.usage:
tokens = LLMTokenUsage(
prompt_tokens=chunk.usage.prompt_tokens or 0,
completion_tokens=chunk.usage.completion_tokens or 0,
total_tokens=chunk.usage.total_tokens or 0,
prompt_tokens=chunk.usage.prompt_tokens,
completion_tokens=chunk.usage.completion_tokens,
total_tokens=chunk.usage.total_tokens,
)
await self.start_llm_usage_metrics(tokens)

Some files were not shown because too many files have changed in this diff Show More