Compare commits
261 Commits
jh/aws-aut
...
mb/openrou
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f179364fde | ||
|
|
d3c978e8ca | ||
|
|
6b42aaead8 | ||
|
|
4b98c2b7f1 | ||
|
|
c51a817efa | ||
|
|
d85eda6da8 | ||
|
|
71feb42711 | ||
|
|
6b93ca0cb6 | ||
|
|
b6ecce754b | ||
|
|
d39e6bf921 | ||
|
|
63064860ef | ||
|
|
f5158d51e7 | ||
|
|
94dbd2fa68 | ||
|
|
c6ea6c6522 | ||
|
|
58a22aeeb1 | ||
|
|
5403aa56e4 | ||
|
|
0e0d76d020 | ||
|
|
b493ed8d3a | ||
|
|
c3338667b1 | ||
|
|
ea296babe9 | ||
|
|
b13af2b053 | ||
|
|
7b6d878f07 | ||
|
|
8e405f15aa | ||
|
|
44a40e8eb2 | ||
|
|
ea97cb1a78 | ||
|
|
22650b1b56 | ||
|
|
b76831e677 | ||
|
|
b57111743f | ||
|
|
dcbb0070c9 | ||
|
|
73278d3309 | ||
|
|
c8efe319b3 | ||
|
|
49bda11ae8 | ||
|
|
07640582ce | ||
|
|
078af6969a | ||
|
|
9f40ba21c2 | ||
|
|
82f0896d6a | ||
|
|
7e4cd23de4 | ||
|
|
97f50c8aa2 | ||
|
|
08680732f6 | ||
|
|
064b68aa01 | ||
|
|
b0f8ea7e28 | ||
|
|
ad50c8d5d5 | ||
|
|
5fef239b68 | ||
|
|
9148e307cc | ||
|
|
703d23b658 | ||
|
|
227ba288da | ||
|
|
39e7f9e354 | ||
|
|
7cc7968abb | ||
|
|
52d8008783 | ||
|
|
a3ce963b54 | ||
|
|
e70ee603b2 | ||
|
|
111e59a7b1 | ||
|
|
079282d140 | ||
|
|
0ccdd808e6 | ||
|
|
3e8c5c08f4 | ||
|
|
644030584f | ||
|
|
0740021ff4 | ||
|
|
68f265fa62 | ||
|
|
b9f052079d | ||
|
|
130bb7371c | ||
|
|
5d61763987 | ||
|
|
7984556692 | ||
|
|
bea9e4b3ba | ||
|
|
19df443500 | ||
|
|
07f241143b | ||
|
|
2fdb9bbf42 | ||
|
|
0146947b68 | ||
|
|
863a1bf177 | ||
|
|
58333b2705 | ||
|
|
ecaff1d1eb | ||
|
|
e2bfa6352f | ||
|
|
abd28e2ac1 | ||
|
|
88deebbf5f | ||
|
|
9b55d4ddd4 | ||
|
|
c2bdc1aada | ||
|
|
fc0589e8f1 | ||
|
|
67f8d34e9f | ||
|
|
d3b8710720 | ||
|
|
86e2aa85d3 | ||
|
|
b89500256d | ||
|
|
a52bdef32b | ||
|
|
afd9fc5fdf | ||
|
|
7f98dba925 | ||
|
|
6a27ed35b1 | ||
|
|
a34864d643 | ||
|
|
007fa3a3a8 | ||
|
|
5dd7413c00 | ||
|
|
8e0a338d96 | ||
|
|
d6655e7a5e | ||
|
|
33b73df6ec | ||
|
|
d65aee9181 | ||
|
|
1755016679 | ||
|
|
b7f6298601 | ||
|
|
396873ac7e | ||
|
|
5b33964a1b | ||
|
|
8b37cd1d3a | ||
|
|
7a2b667fa1 | ||
|
|
ee8c607315 | ||
|
|
71578e7151 | ||
|
|
77058b01c4 | ||
|
|
4f85e7c089 | ||
|
|
15531c8112 | ||
|
|
b9e8f13105 | ||
|
|
784667bad2 | ||
|
|
33db71ec32 | ||
|
|
dc035df0aa | ||
|
|
df1b071a13 | ||
|
|
95bcebe774 | ||
|
|
5509377344 | ||
|
|
e21180b962 | ||
|
|
53922819ed | ||
|
|
6faeffb884 | ||
|
|
9086a46900 | ||
|
|
1a4a6f4edf | ||
|
|
ff80cde44e | ||
|
|
fb74f7714c | ||
|
|
4864eddbc7 | ||
|
|
d831930bd0 | ||
|
|
2c65713c99 | ||
|
|
b14a03d01f | ||
|
|
ad0f0a1294 | ||
|
|
72d0fb418a | ||
|
|
c9f0172e9f | ||
|
|
2638885c62 | ||
|
|
94a94ee28c | ||
|
|
c46ede8335 | ||
|
|
457a68ce64 | ||
|
|
b78cecf7b2 | ||
|
|
952dddca8b | ||
|
|
e3e90d38aa | ||
|
|
d1c8162b0c | ||
|
|
1fa0310ea8 | ||
|
|
2281cd8359 | ||
|
|
480eca42f5 | ||
|
|
1073510574 | ||
|
|
47c05f3f30 | ||
|
|
24904b89f5 | ||
|
|
c78977e4c7 | ||
|
|
f78b5f9240 | ||
|
|
406f8b730b | ||
|
|
7a2cec2e45 | ||
|
|
edfcd6948b | ||
|
|
991ee9e0e6 | ||
|
|
cb426cbb14 | ||
|
|
d39beff817 | ||
|
|
1eade184f1 | ||
|
|
a696729343 | ||
|
|
ba705e9501 | ||
|
|
98c370457b | ||
|
|
3fa193b983 | ||
|
|
6189e920e1 | ||
|
|
73625a273a | ||
|
|
f91a55c97c | ||
|
|
5f256e241c | ||
|
|
954f63dc7b | ||
|
|
6cc66a3df1 | ||
|
|
a445399337 | ||
|
|
5ed2057599 | ||
|
|
cacde00e26 | ||
|
|
b1b598f65e | ||
|
|
c48ee93892 | ||
|
|
cf22dac171 | ||
|
|
36f6e22aee | ||
|
|
921a7a46cb | ||
|
|
fda18a9afa | ||
|
|
d146a7f8e0 | ||
|
|
90f0f7cd27 | ||
|
|
37376b3506 | ||
|
|
729418c2b7 | ||
|
|
4512038a17 | ||
|
|
a23baf9de6 | ||
|
|
d18fe7c39c | ||
|
|
41124dc494 | ||
|
|
95db08646c | ||
|
|
03e5ebb266 | ||
|
|
5daf267c11 | ||
|
|
1cb77b422a | ||
|
|
0c779b4c3d | ||
|
|
138991418a | ||
|
|
94e136a6b7 | ||
|
|
9598e262b5 | ||
|
|
8c3521f2e4 | ||
|
|
eda98fb13f | ||
|
|
3722ee223c | ||
|
|
2620e76dab | ||
|
|
2447db766e | ||
|
|
61a81ed87b | ||
|
|
735cd09c7e | ||
|
|
2616076bec | ||
|
|
40667e50fc | ||
|
|
e06e0c0282 | ||
|
|
84eefba4df | ||
|
|
fe3af5d9f7 | ||
|
|
7729eecfe4 | ||
|
|
fa31a2fd63 | ||
|
|
678d40e102 | ||
|
|
8becafee38 | ||
|
|
83190d38e9 | ||
|
|
7519c26ac5 | ||
|
|
b2b7e9ee6f | ||
|
|
e864d5778a | ||
|
|
89f10dd9a1 | ||
|
|
f67e3ef0b2 | ||
|
|
5b087d6aeb | ||
|
|
e780f759d0 | ||
|
|
35153de28e | ||
|
|
9886d72f5e | ||
|
|
90e6b51acd | ||
|
|
61acdba3ae | ||
|
|
f1a3ee97de | ||
|
|
b363b91d12 | ||
|
|
43abca0b06 | ||
|
|
30efd11e15 | ||
|
|
a745e8d318 | ||
|
|
2730e47e61 | ||
|
|
4703df8686 | ||
|
|
26a40e2e62 | ||
|
|
31ff07916f | ||
|
|
814f00ce41 | ||
|
|
96756bc1f6 | ||
|
|
5e24027fd5 | ||
|
|
ef226c8a8e | ||
|
|
2a731336be | ||
|
|
bec407ce3a | ||
|
|
1cd73b1ef8 | ||
|
|
c4f5f1ebbb | ||
|
|
49068ff557 | ||
|
|
d23bdaaacd | ||
|
|
53ce57b7fa | ||
|
|
dabca70744 | ||
|
|
191bdc733f | ||
|
|
5e1bb4cbe5 | ||
|
|
9ee123bf33 | ||
|
|
66f43baf8f | ||
|
|
252bb493af | ||
|
|
c517b67bad | ||
|
|
70aeb5c7c2 | ||
|
|
440738f727 | ||
|
|
7da94436f5 | ||
|
|
492c9702ee | ||
|
|
f1eef9ba0a | ||
|
|
132b9b1002 | ||
|
|
eb4e56d2d9 | ||
|
|
13643b192b | ||
|
|
6d66bbceeb | ||
|
|
6cab2ce3f7 | ||
|
|
a27d9fc30b | ||
|
|
2a8f4734e0 | ||
|
|
48ac68e3c8 | ||
|
|
c3ef199efa | ||
|
|
1b5c4cfa2a | ||
|
|
6e9dd1dbcc | ||
|
|
6487f895b3 | ||
|
|
351105a975 | ||
|
|
8ea963852d | ||
|
|
6f4458f21d | ||
|
|
fb42a7dcf3 | ||
|
|
21547c8680 | ||
|
|
3e5aabc5f2 | ||
|
|
e508642b0a | ||
|
|
e546541e20 |
1
.agents/skills/changelog
Symbolic link
1
.agents/skills/changelog
Symbolic link
@@ -0,0 +1 @@
|
||||
../../.claude/skills/changelog
|
||||
1
.agents/skills/cleanup
Symbolic link
1
.agents/skills/cleanup
Symbolic link
@@ -0,0 +1 @@
|
||||
../../.claude/skills/cleanup
|
||||
1
.agents/skills/code-review
Symbolic link
1
.agents/skills/code-review
Symbolic link
@@ -0,0 +1 @@
|
||||
../../.claude/skills/code-review
|
||||
1
.agents/skills/docstring
Symbolic link
1
.agents/skills/docstring
Symbolic link
@@ -0,0 +1 @@
|
||||
../../.claude/skills/docstring
|
||||
1
.agents/skills/pr-description
Symbolic link
1
.agents/skills/pr-description
Symbolic link
@@ -0,0 +1 @@
|
||||
../../.claude/skills/pr-description
|
||||
1
.agents/skills/pr-submit
Symbolic link
1
.agents/skills/pr-submit
Symbolic link
@@ -0,0 +1 @@
|
||||
../../.claude/skills/pr-submit
|
||||
1
.agents/skills/update-docs
Symbolic link
1
.agents/skills/update-docs
Symbolic link
@@ -0,0 +1 @@
|
||||
../../.claude/skills/update-docs
|
||||
@@ -1,3 +1,8 @@
|
||||
---
|
||||
name: cleanup
|
||||
description: Review, refactor, document, and validate code changes in the current branch
|
||||
---
|
||||
|
||||
# Code Cleanup Skill
|
||||
|
||||
The **Code Cleanup Skill** reviews, refactors, and documents code changes in your current branch, ensuring alignment with **Pipecat's architecture, coding standards, and example patterns**.
|
||||
|
||||
1
.github/workflows/coverage.yaml
vendored
1
.github/workflows/coverage.yaml
vendored
@@ -42,6 +42,7 @@ jobs:
|
||||
--extra langchain \
|
||||
--extra livekit \
|
||||
--extra piper \
|
||||
--extra runner \
|
||||
--extra sagemaker \
|
||||
--extra tracing \
|
||||
--extra websocket
|
||||
|
||||
4
.github/workflows/format.yaml
vendored
4
.github/workflows/format.yaml
vendored
@@ -32,7 +32,9 @@ jobs:
|
||||
run: uv python install 3.12
|
||||
|
||||
- name: Install development dependencies
|
||||
run: uv sync --group dev --extra daily --extra tracing
|
||||
# `--all-extras` (matching the dev setup in README.md) so pyright can
|
||||
# resolve types from various optional dependencies.
|
||||
run: uv sync --group dev --all-extras --no-extra gstreamer --no-extra local
|
||||
|
||||
- name: Ruff formatter
|
||||
id: ruff-format
|
||||
|
||||
1
.github/workflows/tests.yaml
vendored
1
.github/workflows/tests.yaml
vendored
@@ -46,6 +46,7 @@ jobs:
|
||||
--extra langchain \
|
||||
--extra livekit \
|
||||
--extra piper \
|
||||
--extra runner \
|
||||
--extra sagemaker \
|
||||
--extra tracing \
|
||||
--extra websocket
|
||||
|
||||
174
AGENTS.md
Normal file
174
AGENTS.md
Normal file
@@ -0,0 +1,174 @@
|
||||
# AGENTS.md
|
||||
|
||||
This file provides guidance to AI coding agents when working with code in this repository.
|
||||
|
||||
## Project Overview
|
||||
|
||||
Pipecat is an open-source Python framework for building real-time voice and multimodal conversational AI agents. It orchestrates audio/video, AI services, transports, and conversation pipelines using a frame-based architecture.
|
||||
|
||||
## Common Commands
|
||||
|
||||
```bash
|
||||
# Setup development environment
|
||||
uv sync --group dev --all-extras --no-extra gstreamer --no-extra local
|
||||
|
||||
# Install pre-commit hooks
|
||||
uv run pre-commit install
|
||||
|
||||
# Run all tests
|
||||
uv run pytest
|
||||
|
||||
# Run a single test file
|
||||
uv run pytest tests/test_name.py
|
||||
|
||||
# Run a specific test
|
||||
uv run pytest tests/test_name.py::test_function_name
|
||||
|
||||
# Preview changelog
|
||||
uv run towncrier build --draft --version Unreleased
|
||||
|
||||
# Lint and format check
|
||||
uv run ruff check
|
||||
uv run ruff format --check
|
||||
|
||||
# Update dependencies (after editing pyproject.toml)
|
||||
uv lock && uv sync
|
||||
```
|
||||
|
||||
## Architecture
|
||||
|
||||
### Frame-Based Pipeline Processing
|
||||
|
||||
All data flows as **Frame** objects through a pipeline of **FrameProcessors**:
|
||||
|
||||
```
|
||||
[Processor1] → [Processor2] → ... → [ProcessorN]
|
||||
```
|
||||
|
||||
**Key components:**
|
||||
|
||||
- **Frames** (`src/pipecat/frames/frames.py`): Data units (audio, text, video) and control signals. Flow DOWNSTREAM (input→output) or UPSTREAM (acknowledgments/errors).
|
||||
|
||||
- **FrameProcessor** (`src/pipecat/processors/frame_processor.py`): Base processing unit. Each processor receives frames, processes them, and pushes results downstream.
|
||||
|
||||
- **Pipeline** (`src/pipecat/pipeline/pipeline.py`): Chains processors together.
|
||||
|
||||
- **ParallelPipeline** (`src/pipecat/pipeline/parallel_pipeline.py`): Runs multiple pipelines in parallel.
|
||||
|
||||
- **Transports** (`src/pipecat/transports/`): Transports are frame processors that serve as the external I/O layer (Daily WebRTC, LiveKit WebRTC, WebSocket, Local). Abstract interface via `BaseTransport`, `BaseInputTransport` and `BaseOutputTransport`.
|
||||
|
||||
- **Pipeline Task** (`src/pipecat/pipeline/task.py`): Runs and manages a pipeline. Pipeline tasks send the first frame, `StartFrame`, to the pipeline so that processors know they can start processing and pushing frames. Pipeline tasks internally create a pipeline with two additional processors, a source processor before the user-defined pipeline and a sink processor at the end. Those are used for multiple things: error handling, pipeline task level events, heartbeat monitoring, etc.
|
||||
|
||||
- **Pipeline Runner** (`src/pipecat/pipeline/runner.py`): High-level entry point for executing pipeline tasks. Handles signal management (SIGINT/SIGTERM) for graceful shutdown and optional garbage collection. Run a single pipeline task with `await runner.run(task)` or multiple concurrently with `await asyncio.gather(runner.run(task1), runner.run(task2))`.
|
||||
|
||||
- **Services** (`src/pipecat/services/`): 60+ AI provider integrations (STT, TTS, LLM, etc.). Extend base classes: `AIService`, `LLMService`, `STTService`, `TTSService`, `VisionService`.
|
||||
|
||||
- **Serializers** (`src/pipecat/serializers/`): Convert frames to/from wire formats for WebSocket transports. `FrameSerializer` base class defines `serialize()` and `deserialize()`. Telephony serializers (Twilio, Plivo, Vonage, Telnyx, Exotel, Genesys) handle provider-specific protocols and audio encoding (e.g., μ-law).
|
||||
|
||||
- **RTVI** (`src/pipecat/processors/frameworks/rtvi.py`): Real-Time Voice Interface protocol bridging clients and the pipeline. `RTVIProcessor` handles incoming client messages (text input, audio, function call results). `RTVIObserver` converts pipeline frames to outgoing messages: user/bot speaking events, transcriptions, LLM/TTS lifecycle, function calls, metrics, and audio levels.
|
||||
|
||||
- **Observers** (`src/pipecat/observers/`): Monitor frame flow without modifying the pipeline. Passed to `PipelineTask` via the `observers` parameter. Implement `on_process_frame()` and `on_push_frame()` callbacks.
|
||||
|
||||
### Important Patterns
|
||||
|
||||
- **Context Aggregation**: `LLMContext` accumulates messages for LLM calls; `UserResponse` aggregates user input
|
||||
|
||||
- **Turn Management**: Turn management is done through `LLMUserAggregator` and
|
||||
`LLMAssistantAggregator`, created with `LLMContextAggregatorPair`
|
||||
|
||||
- **User turn strategies**: Detection of when the user starts and stops speaking is done via user turn start/stop strategies. They push `UserStartedSpeakingFrame` and `UserStoppedSpeakingFrame` respectively.
|
||||
|
||||
- **Interruptions**: Interruptions are usually triggered by a user turn start strategy (e.g. `VADUserTurnStartStrategy`) but they can be triggered by other processors as well, in which case the user turn start strategies don't need to trigger them. An `InterruptionFrame` carries an optional `asyncio.Event` that is set when the frame reaches the pipeline sink. If a processor stops an `InterruptionFrame` from propagating downstream (i.e., doesn't push it), it **must** call `frame.complete()` to avoid stalling `push_interruption_task_frame_and_wait()` callers.
|
||||
|
||||
- **Uninterruptible Frames**: These are frames that will not be removed from internal queues even if there's an interruption. For example, `EndFrame` and `StopFrame`.
|
||||
|
||||
- **Events**: Most classes in Pipecat have `BaseObject` as the very base class. `BaseObject` has support for events. Events can run in the background in an async task (default) or synchronously (`sync=True`) if we want immediate action. Synchronous event handlers need to execute fast.
|
||||
|
||||
- **Async Task Management**: Always use `self.create_task(coroutine, name)` instead of raw `asyncio.create_task()`. The `TaskManager` automatically tracks tasks and cleans them up on processor shutdown. Use `await self.cancel_task(task, timeout)` for cancellation.
|
||||
|
||||
- **Error Handling**: Use `await self.push_error(msg, exception, fatal)` to push errors upstream. Services should use `fatal=False` (the default) so application code can handle errors and take action (e.g. switch to another service).
|
||||
|
||||
### Key Directories
|
||||
|
||||
| Directory | Purpose |
|
||||
| -------------------------- | -------------------------------------------------- |
|
||||
| `src/pipecat/frames/` | Frame definitions (100+ types) |
|
||||
| `src/pipecat/processors/` | FrameProcessor base + aggregators, filters, audio |
|
||||
| `src/pipecat/pipeline/` | Pipeline orchestration |
|
||||
| `src/pipecat/services/` | AI service integrations (60+ providers) |
|
||||
| `src/pipecat/transports/` | Transport layer (Daily, LiveKit, WebSocket, Local) |
|
||||
| `src/pipecat/serializers/` | Frame serialization for WebSocket protocols |
|
||||
| `src/pipecat/observers/` | Pipeline observers for monitoring frame flow |
|
||||
| `src/pipecat/audio/` | VAD, filters, mixers, turn detection, DTMF |
|
||||
| `src/pipecat/turns/` | User turn management |
|
||||
|
||||
## Code Style
|
||||
|
||||
- **Docstrings**: Google-style. Classes describe purpose; `__init__` has `Args:` section; dataclasses use `Parameters:` section.
|
||||
- **Deprecations**: Use the `.. deprecated:: <version>` Sphinx directive in docstrings (never inline tags like `[DEPRECATED]`), and pair it with a runtime `warnings.warn(..., DeprecationWarning)` at the call site. See `CONTRIBUTING.md` for full conventions.
|
||||
- **Linting**: Ruff (line length 100). Pre-commit hooks enforce formatting.
|
||||
- **Type hints**: Required for complex async code.
|
||||
- **Dataclass vs Pydantic**: Use `@dataclass` for frames and internal pipeline data (high-frequency, no validation needed). Use Pydantic `BaseModel` for configuration, parameters, metrics, and external API data (benefits from validation and serialization). Specifically:
|
||||
- `@dataclass`: Frame types, context aggregator pairs, internal data containers
|
||||
- `BaseModel`: Service `InputParams`, transport/VAD/turn params, metrics data, API request/response models, serializer params
|
||||
|
||||
### Docstring Example
|
||||
|
||||
```python
|
||||
class MyService(LLMService):
|
||||
"""Description of what the service does.
|
||||
|
||||
More detailed description.
|
||||
|
||||
Event handlers available:
|
||||
|
||||
- on_connected: Called when we are connected
|
||||
|
||||
Example::
|
||||
|
||||
@service.event_handler("on_connected")
|
||||
async def on_connected(service, frame):
|
||||
...
|
||||
"""
|
||||
|
||||
def __init__(self, param1: str, **kwargs):
|
||||
"""Initialize the service.
|
||||
|
||||
Args:
|
||||
param1: Description of param1.
|
||||
**kwargs: Additional arguments passed to parent.
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
|
||||
|
||||
# Pydantic params class with a deprecated field
|
||||
class MyParams(BaseModel):
|
||||
"""Configuration parameters for MyService.
|
||||
|
||||
Parameters:
|
||||
new_setting: Replacement for ``old_setting``.
|
||||
old_setting: Legacy setting, no longer used.
|
||||
|
||||
.. deprecated:: 1.2.0
|
||||
Use ``new_setting`` instead. Will be removed in 2.0.0.
|
||||
"""
|
||||
|
||||
new_setting: str = "default"
|
||||
old_setting: str | None = None
|
||||
```
|
||||
|
||||
## Service Implementation
|
||||
|
||||
When adding a new service:
|
||||
|
||||
1. Extend the appropriate base class (`STTService`, `TTSService`, `LLMService`, etc.)
|
||||
2. Implement required abstract methods
|
||||
3. Handle necessary frames
|
||||
4. By default, all frames should be pushed onward in the same direction they were traveling
|
||||
5. Push `ErrorFrame` on failures
|
||||
6. Add metrics tracking via `MetricsData` if relevant
|
||||
7. Follow the pattern of existing services in `src/pipecat/services/`
|
||||
|
||||
## Testing
|
||||
|
||||
Test utilities live in `src/pipecat/tests/utils.py`. Use `run_test()` to send frames through a pipeline and assert expected output frames in each direction. Use `SleepFrame(sleep=N)` to add delays between frames.
|
||||
509
CHANGELOG.md
509
CHANGELOG.md
@@ -7,6 +7,515 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
<!-- towncrier release notes start -->
|
||||
|
||||
## [1.2.1] - 2026-05-15
|
||||
|
||||
### Changed
|
||||
|
||||
- Changed the default WebSocket endpoints for `GradiumSTTService` and
|
||||
`GradiumTTSService` to the region-neutral
|
||||
`wss://api.gradium.ai/api/speech/asr` and
|
||||
`wss://api.gradium.ai/api/speech/tts`. Gradium now automatically routes
|
||||
traffic to the nearest endpoint. Override the url to pin to a specific
|
||||
region.
|
||||
(PR [#4500](https://github.com/pipecat-ai/pipecat/pull/4500))
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed bot hangs when `filter_incomplete_user_turns` was enabled and the LLM
|
||||
responded by calling a tool. The user turn never finalized, so the assistant
|
||||
aggregator gated the tool-result context push and the LLM continuation never
|
||||
ran. Tool calls now finalize the turn the moment they start, before the
|
||||
function dispatches.
|
||||
(PR [#4501](https://github.com/pipecat-ai/pipecat/pull/4501))
|
||||
|
||||
## [1.2.0] - 2026-05-14
|
||||
|
||||
### Added
|
||||
|
||||
- Added a `session_id` field to `RunnerArguments` so bots can log or trace a
|
||||
per-session identifier in local development the same way they can in Pipecat
|
||||
Cloud. The development runner now mints a UUID at every construction site,
|
||||
and paths that already returned a `sessionId` to the caller (Daily `/start`,
|
||||
dial-in webhook) share that same UUID with the runner args instead of
|
||||
generating two. The SmallWebRTC `/api/offer` endpoint also accepts an
|
||||
optional `session_id` query parameter so the `/sessions/{session_id}/...`
|
||||
proxy can thread it through.
|
||||
(PR [#4385](https://github.com/pipecat-ai/pipecat/pull/4385))
|
||||
|
||||
- Added a `max_buffer_delay_ms` constructor argument to `CartesiaTTSService`
|
||||
for controlling Cartesia's server-side text buffering. When unset, Pipecat
|
||||
picks a sensible default based on `text_aggregation_mode`: `0` in `SENTENCE`
|
||||
mode (custom buffering — avoids stacking client-side aggregation on top of
|
||||
Cartesia's default 3000ms server buffer) and unset in `TOKEN` mode
|
||||
(Cartesia's managed buffering applies). Pass an explicit value (0–5000ms) to
|
||||
override.
|
||||
(PR [#4390](https://github.com/pipecat-ai/pipecat/pull/4390))
|
||||
|
||||
- Added a `mip_opt_out` constructor argument to `DeepgramTTSService` and
|
||||
`DeepgramHttpTTSService` so callers can opt out of the Deepgram Model
|
||||
Improvement Program. When set, the value is forwarded to Deepgram as a query
|
||||
parameter on the speak request. Defaults to `None`, which preserves the
|
||||
existing behavior. See https://dpgr.am/deepgram-mip for pricing implications
|
||||
before enabling.
|
||||
(PR [#4400](https://github.com/pipecat-ai/pipecat/pull/4400))
|
||||
|
||||
- Added an opt-in `add_tool_change_messages` flag to the LLM aggregators (set
|
||||
via `LLMContextAggregatorPair(..., add_tool_change_messages=True)`) that
|
||||
appends a developer-role message to the context whenever `LLMSetToolsFrame`
|
||||
changes the set of advertised standard tools. Helps the LLM stay coherent
|
||||
across mid-conversation tool changes, mitigating several flavors of
|
||||
tool-call-related hallucination: calling tools that have been removed,
|
||||
avoiding tools that have been re-added, and hallucinating output (made-up
|
||||
answers or tool-call-shaped non-tool-calls) when tools are unavailable.
|
||||
(PR [#4404](https://github.com/pipecat-ai/pipecat/pull/4404))
|
||||
|
||||
- Added `deferred(strategy)` and `DeferredUserTurnStopStrategy` in
|
||||
`pipecat.turns.user_stop`. Wraps a stop strategy so it fires only the
|
||||
inference-triggered event and suppresses `on_user_turn_stopped`, leaving
|
||||
finalization to another strategy in the chain such as
|
||||
`LLMTurnCompletionUserTurnStopStrategy`.
|
||||
(PR [#4405](https://github.com/pipecat-ai/pipecat/pull/4405))
|
||||
|
||||
- Added `ExternalUserTurnCompletionStopStrategy` in `pipecat.turns.user_stop` —
|
||||
a generic stop strategy that finalizes the user turn whenever a
|
||||
`UserTurnInferenceCompletedFrame` arrives, regardless of which component
|
||||
produced it. `LLMTurnCompletionUserTurnStopStrategy` now extends this base;
|
||||
future producers (Flux, custom end-of-turn classifiers, etc.) can use the
|
||||
base directly or subclass it to add producer-specific setup.
|
||||
(PR [#4405](https://github.com/pipecat-ai/pipecat/pull/4405))
|
||||
|
||||
- Added `on_user_turn_inference_triggered`, a new event on the user turn
|
||||
controller, processor, aggregator and stop strategies that fires when a
|
||||
strategy has enough signal to start LLM inference. By default it fires
|
||||
together with `on_user_turn_stopped`; a gating strategy can fire only the
|
||||
inference-triggered event and defer finalization to a peer.
|
||||
(PR [#4405](https://github.com/pipecat-ai/pipecat/pull/4405))
|
||||
|
||||
- Added `FilterIncompleteUserTurnStrategies` in
|
||||
`pipecat.turns.user_turn_strategies` — a `UserTurnStrategies` specialization
|
||||
that wraps the detector chain with `deferred(...)` and appends
|
||||
`LLMTurnCompletionUserTurnStopStrategy` as the finalizer. Common case:
|
||||
`user_turn_strategies=FilterIncompleteUserTurnStrategies()`. Pass
|
||||
`config=UserTurnCompletionConfig(...)` to customize timeouts and prompts.
|
||||
(PR [#4405](https://github.com/pipecat-ai/pipecat/pull/4405))
|
||||
|
||||
- Added `LLMTurnCompletionUserTurnStopStrategy` in `pipecat.turns.user_stop`.
|
||||
When installed, the strategy gates `on_user_turn_stopped` on a
|
||||
`UserTurnInferenceCompletedFrame` (a new fieldless system frame emitted by
|
||||
any component that can judge turn completeness — e.g. the
|
||||
`UserTurnCompletionLLMServiceMixin` on `✓`). A `finalization_timeout`
|
||||
provides a safety net if no completion frame ever arrives.
|
||||
(PR [#4405](https://github.com/pipecat-ai/pipecat/pull/4405))
|
||||
|
||||
- Added first-class RTVI support for the UI Agent Protocol:
|
||||
- Adds `ui-event`, `ui-snapshot`, and `ui-cancel-task` client-to-server
|
||||
messages, plus `ui-command` and `ui-task` server-to-client messages, with
|
||||
paired `*Data` / `*Message` pydantic models.
|
||||
- Adds built-in command payload models for `Toast`, `Navigate`, `ScrollTo`,
|
||||
`Highlight`, `Focus`, `Click`, `SetInputValue`, and `SelectText`; matching
|
||||
default handlers live in `@pipecat-ai/client-react`.
|
||||
- Adds `RTVIProcessor.on_ui_message` for inbound `ui-event`, `ui-snapshot`,
|
||||
and `ui-cancel-task` messages.
|
||||
- Adds five UI pipeline frames, mirroring the `client-message`
|
||||
frame-and-event pattern: downstream code pushes `RTVIUICommandFrame` /
|
||||
`RTVIUITaskFrame` for the observer to wrap into outbound `UICommandMessage` /
|
||||
`UITaskMessage` envelopes, while the processor pushes inbound
|
||||
`RTVIUIEventFrame`, `RTVIUISnapshotFrame`, and `RTVIUICancelTaskFrame`
|
||||
alongside `on_ui_message`.
|
||||
- Bumps the RTVI `PROTOCOL_VERSION` from `1.2.0` to `1.3.0`.
|
||||
(PR [#4407](https://github.com/pipecat-ai/pipecat/pull/4407))
|
||||
|
||||
- AWS Transcribe STT, Polly TTS, Bedrock LLM, and the Bedrock AgentCore
|
||||
processor now resolve credentials via the standard boto3 provider chain (EC2
|
||||
instance profiles, EKS pod roles / IRSA, ECS task roles, SSO,
|
||||
`~/.aws/credentials`) when explicit credentials and `AWS_*` environment
|
||||
variables are absent. Services running with IAM roles no longer need to
|
||||
export static credentials.
|
||||
(PR [#4416](https://github.com/pipecat-ai/pipecat/pull/4416))
|
||||
|
||||
- Added `keyterms` support to ElevenLabs STT services so Scribe V2 callers can
|
||||
bias transcription for both file-based and realtime transcription.
|
||||
(PR [#4426](https://github.com/pipecat-ai/pipecat/pull/4426))
|
||||
|
||||
- Added `watchdog_min_timeout` parameter to `DeepgramFluxSTT` and
|
||||
`DeepgramFluxSageMakerSTT` (default `0.5` seconds) to control the minimum
|
||||
silence duration before the watchdog sends a silence packet to prevent
|
||||
dangling turns. The actual threshold is `max(chunk_duration * 2,
|
||||
watchdog_min_timeout)`, so it also adapts automatically to the audio chunk
|
||||
size in use.
|
||||
(PR [#4430](https://github.com/pipecat-ai/pipecat/pull/4430))
|
||||
|
||||
- Added `cancel_on_interruption=False` support for `GeminiLiveLLMService` on
|
||||
models that support Gemini's NON_BLOCKING tool mechanism (currently Gemini
|
||||
2.x); the conversation now continues while the tool runs. On models that
|
||||
don't yet support NON_BLOCKING (Gemini 3.x), the service surfaces a one-time
|
||||
warning explaining the limitation. (Note: an intermittent 1008 error can
|
||||
occasionally fire on Gemini 2.5 during long-running tool calls; we
|
||||
auto-reconnect.)
|
||||
(PR [#4448](https://github.com/pipecat-ai/pipecat/pull/4448))
|
||||
|
||||
- Added `NvidiaSageMakerWebsocketSTTService` for streaming speech recognition
|
||||
using NVIDIA Nemotron ASR via an AWS SageMaker bidirectional-stream endpoint.
|
||||
Produces `InterimTranscriptionFrame` and `TranscriptionFrame` frames, is
|
||||
VAD-aware, and automatically reconnects on error.
|
||||
(PR [#4464](https://github.com/pipecat-ai/pipecat/pull/4464))
|
||||
|
||||
- Added NVIDIA Magpie TTS services via AWS SageMaker:
|
||||
`NvidiaSageMakerHTTPTTSService` (single HTTP invocation, streams raw PCM
|
||||
back) and `NvidiaSageMakerWebsocketTTSService` (persistent HTTP/2 bidi-stream
|
||||
with full interruption support via `InterruptibleTTSService`).
|
||||
(PR [#4464](https://github.com/pipecat-ai/pipecat/pull/4464))
|
||||
|
||||
- Added support for `reasoning` configuration on `OpenAIRealtimeLLMService`,
|
||||
for use with reasoning-capable Realtime models such as `gpt-realtime-2`.
|
||||
(PR [#4470](https://github.com/pipecat-ai/pipecat/pull/4470))
|
||||
|
||||
- Inworld TTS updates:
|
||||
- Added `delivery_mode` setting (`STABLE`/`BALANCED`/`CREATIVE`) to
|
||||
`InworldTTSService` and `InworldHttpTTSService`, enabling the
|
||||
stability-vs-creativity tradeoff in `inworld-tts-2`.
|
||||
- Added language support to `InworldTTSService` and
|
||||
`InworldHttpTTSService`. The `language` setting is now forwarded to the API,
|
||||
and a new `language_to_inworld_language()` helper normalizes Pipecat
|
||||
`Language` enums to Inworld's BCP-47 locale tags.
|
||||
(PR [#4473](https://github.com/pipecat-ai/pipecat/pull/4473))
|
||||
|
||||
### Changed
|
||||
|
||||
- Updated the default `SonioxTTSService` model from `tts-rt-v1-preview` to the
|
||||
generally available `tts-rt-v1`.
|
||||
(PR [#4386](https://github.com/pipecat-ai/pipecat/pull/4386))
|
||||
|
||||
- Default `cartesia_version` for `CartesiaTTSService` bumped from `2025-04-16`
|
||||
to `2026-03-01`, matching `CartesiaHttpTTSService` and unlocking the
|
||||
`use_normalized_timestamps` and `max_buffer_delay_ms` fields.
|
||||
(PR [#4390](https://github.com/pipecat-ai/pipecat/pull/4390))
|
||||
|
||||
- ⚠️ `CartesiaTTSService` now sends `use_normalized_timestamps: true` instead
|
||||
of the deprecated `use_original_timestamps` field. Word timestamps now
|
||||
reflect what was actually spoken (post text-normalization and
|
||||
pronunciation-dictionary substitution), matching the convention Pipecat uses
|
||||
for ElevenLabs. This is a behavior change for `sonic-3` users, who were
|
||||
previously receiving timestamps tied to the input transcript.
|
||||
(PR [#4390](https://github.com/pipecat-ai/pipecat/pull/4390))
|
||||
|
||||
- Broadened `tool_resources` to `app_resources` for easy access not just in
|
||||
tool handlers but in other places like custom `FrameProcessor`s. Three
|
||||
changes: a rename (`tool_resources` → `app_resources`), a new `app_resources`
|
||||
property on `PipelineTask`, and a new `pipeline_task` property on
|
||||
`FrameProcessor`. Tool handlers now read `params.app_resources`; custom
|
||||
processors read `self.pipeline_task.app_resources`. The previous
|
||||
`tool_resources` aliases (on `PipelineTask`, `FunctionCallParams`, and
|
||||
`FrameProcessorSetup`) keep working but are deprecated as of 1.2.0 and emit
|
||||
`DeprecationWarning`s.
|
||||
(PR [#4395](https://github.com/pipecat-ai/pipecat/pull/4395))
|
||||
|
||||
- Lowered the per-message log in
|
||||
`SmallWebRTCInputTransport._handle_app_message` from `debug` to `trace`. App
|
||||
messages can be high-frequency and were noisy at debug level; set the loguru
|
||||
level to `TRACE` to see them again.
|
||||
(PR [#4397](https://github.com/pipecat-ai/pipecat/pull/4397))
|
||||
|
||||
- Changed the default model for `GrokRealtimeLLMService` to
|
||||
`grok-voice-think-fast-1.0`, xAI's recommended Voice Agent model. The
|
||||
previous default of `grok-voice-fast-1.0` has been deprecated by xAI and is
|
||||
being removed.
|
||||
(PR [#4401](https://github.com/pipecat-ai/pipecat/pull/4401))
|
||||
|
||||
- Changed the default Inworld TTS model from `inworld-tts-1.5-max` to
|
||||
`inworld-tts-2` (Realtime TTS-2) across `InworldHttpTTSService`,
|
||||
`InworldTTSService`, and the `InworldRealtimeLLMService` cascade. Existing
|
||||
users can pin the prior model explicitly via the `model`/`tts_model`
|
||||
argument; both `inworld-tts-1.5-max` and `inworld-tts-1.5-mini` remain valid
|
||||
model IDs.
|
||||
(PR [#4422](https://github.com/pipecat-ai/pipecat/pull/4422))
|
||||
|
||||
- Changed the default model for `GrokLLMService` from `grok-3` to
|
||||
`grok-4.20-non-reasoning`. xAI is retiring `grok-3` on May 15, 2026.
|
||||
(PR [#4429](https://github.com/pipecat-ai/pipecat/pull/4429))
|
||||
|
||||
- `DeepgramFluxSTT` watchdog silence threshold is now dynamic:
|
||||
`max(chunk_duration * 2, watchdog_min_timeout)` instead of a fixed 500 ms.
|
||||
This prevents false silence injections when large audio chunks are sent at
|
||||
lower frequency.
|
||||
(PR [#4430](https://github.com/pipecat-ai/pipecat/pull/4430))
|
||||
|
||||
- `ElevenLabsTTSService` now sends `close_context` to the server as soon as the
|
||||
turn is complete (on `on_turn_context_completed`) rather than waiting until
|
||||
all audio has finished playing back. The `isFinal` message from ElevenLabs is
|
||||
now used to signal `TTSStoppedFrame` and clean up the audio context,
|
||||
improving turn transition timing.
|
||||
(PR [#4433](https://github.com/pipecat-ai/pipecat/pull/4433))
|
||||
|
||||
- Updated `InworldHttpTTSService` and `InworldTTSService` to use PCM audio
|
||||
encoding by default, which returns audio bytes without headers.
|
||||
(PR [#4446](https://github.com/pipecat-ai/pipecat/pull/4446))
|
||||
|
||||
- Moved `create_task`, `cancel_task`, the `task_manager` property, and
|
||||
`setup(task_manager)` up from `FrameProcessor` to `BaseObject`. Custom
|
||||
`BaseObject` subclasses (turn strategies, controllers, etc.) now inherit
|
||||
these methods directly instead of reimplementing the task manager wiring.
|
||||
Owners propagate the task manager to their child `BaseObject`s via `await
|
||||
child.setup(task_manager)`.
|
||||
(PR [#4449](https://github.com/pipecat-ai/pipecat/pull/4449))
|
||||
|
||||
- Changed the default OpenAI Realtime input audio transcription model from
|
||||
`gpt-4o-transcribe` to `gpt-realtime-whisper` for both
|
||||
`OpenAIRealtimeSTTService` and `OpenAIRealtimeLLMService`. The new model does
|
||||
not accept the `prompt` parameter; if a prompt is supplied alongside
|
||||
`gpt-realtime-whisper`, it is dropped automatically and a warning is logged.
|
||||
To keep using prompt hints, explicitly pin `model="gpt-4o-transcribe"` (or
|
||||
`"gpt-4o-mini-transcribe"`).
|
||||
(PR [#4450](https://github.com/pipecat-ai/pipecat/pull/4450))
|
||||
|
||||
- Updated the default model for `CartesiaTTSService` and
|
||||
`CartesiaHttpTTSService` from `sonic-3` to `sonic-3.5`.
|
||||
(PR [#4462](https://github.com/pipecat-ai/pipecat/pull/4462))
|
||||
|
||||
- Changed the default model for `OpenAIRealtimeLLMService` from
|
||||
`gpt-realtime-1.5` to `gpt-realtime-2`.
|
||||
(PR [#4472](https://github.com/pipecat-ai/pipecat/pull/4472))
|
||||
|
||||
### Deprecated
|
||||
|
||||
- Deprecated `LLMUserAggregatorParams.filter_incomplete_user_turns`. Use
|
||||
`user_turn_strategies=FilterIncompleteUserTurnStrategies()` (or add
|
||||
`LLMTurnCompletionUserTurnStopStrategy` to a custom
|
||||
`user_turn_strategies.stop`) instead. Setting the legacy flag still works for
|
||||
one release: the aggregator emits a `DeprecationWarning` and rewires the
|
||||
strategies as if you had passed `FilterIncompleteUserTurnStrategies`
|
||||
directly.
|
||||
(PR [#4405](https://github.com/pipecat-ai/pipecat/pull/4405))
|
||||
|
||||
- Deprecated `ResampyResampler` in favor of `SOXRAudioResampler` (or the
|
||||
`create_file_resampler()` / `create_stream_resampler()` factories).
|
||||
Instantiating `ResampyResampler` now emits a `DeprecationWarning`. The class
|
||||
will be removed in Pipecat 2.0 along with the default `resampy` and `numba`
|
||||
dependencies.
|
||||
(PR [#4428](https://github.com/pipecat-ai/pipecat/pull/4428))
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed `CartesiaTTSService` surfacing `flush_done` messages from Cartesia as
|
||||
`ErrorFrame`s. The latest API emits a `flush_done` per transcript when
|
||||
server-side buffering is disabled; Pipecat now consumes them silently since
|
||||
each turn already has its own `context_id`.
|
||||
(PR [#4390](https://github.com/pipecat-ai/pipecat/pull/4390))
|
||||
|
||||
- Fixed Cartesia tag helpers (`SPELL`, `EMOTION_TAG`, `PAUSE_TAG`,
|
||||
`VOLUME_TAG`, `SPEED_TAG`) raising `TypeError` when called on an instance
|
||||
(e.g. `tts.SPELL("hi")`). They're now `@staticmethod` and callable from both
|
||||
the class and an instance.
|
||||
(PR [#4390](https://github.com/pipecat-ai/pipecat/pull/4390))
|
||||
|
||||
- Fixed `CartesiaHttpTTSService` pushing two `ErrorFrame`s on a non-200
|
||||
response — one with the API's error text and a second, less informative
|
||||
"Unknown error" frame from the outer exception handler. It now pushes a
|
||||
single frame that includes the HTTP status code and returns cleanly.
|
||||
(PR [#4390](https://github.com/pipecat-ai/pipecat/pull/4390))
|
||||
|
||||
- Fixed an issue where `LocalSmartTurnAnalyzerV3` was imported unconditionally
|
||||
for user turn stop strategies. It is now only imported when
|
||||
`default_user_turn_stop_strategies()` is called. This improves startup time
|
||||
and removes the `transformers` "PyTorch/TensorFlow/Flax not found" warning
|
||||
when the default stop strategies are not used.
|
||||
(PR [#4393](https://github.com/pipecat-ai/pipecat/pull/4393))
|
||||
|
||||
- Fixed `GrokRealtimeLLMService` ignoring the configured model. The model was
|
||||
stored in `Settings` but never sent to xAI, so every session silently fell
|
||||
back to xAI's server-side default. The model is now passed via the `?model=`
|
||||
query parameter on the WebSocket URL as xAI's Voice Agent API requires.
|
||||
(PR [#4401](https://github.com/pipecat-ai/pipecat/pull/4401))
|
||||
|
||||
- Fixed `on_user_turn_stopped` firing prematurely when
|
||||
`filter_incomplete_user_turns` was enabled. The event now fires only after
|
||||
the LLM confirms the user turn is complete (`✓`); previously the smart-turn
|
||||
detector's tentative stop was bubbling up before the LLM had a chance to veto
|
||||
it, causing observers, transcript appenders and UI indicators to receive an
|
||||
early — and sometimes duplicated — signal.
|
||||
(PR [#4405](https://github.com/pipecat-ai/pipecat/pull/4405))
|
||||
|
||||
- Fixed `TTSSpeakFrame(append_to_context=True)` greetings sometimes splitting
|
||||
across two assistant messages in the LLM context and not surfacing in
|
||||
`on_assistant_turn_stopped`. The `LLMAssistantPushAggregationFrame` emitted
|
||||
at the end of a TTS context now carries a PTS just past the last word so it
|
||||
can't overtake clock-queued `TTSTextFrame`s in the transport's output, and
|
||||
`LLMAssistantAggregator` now triggers
|
||||
`on_assistant_turn_started`/`on_assistant_turn_stopped` when it receives the
|
||||
frame outside an LLM response cycle (restoring v0.0.104 behavior for greeting
|
||||
transcripts).
|
||||
(PR [#4414](https://github.com/pipecat-ai/pipecat/pull/4414))
|
||||
|
||||
- Fixed `ElevenLabsTTSService` and `ElevenLabsHttpTTSService` producing merged
|
||||
words (e.g. `bookLook`) when using Flash models. Flash often splits sentences
|
||||
mid-stream into alignment chunks that begin with a real inter-word space, but
|
||||
the previous fix unconditionally stripped that space from every chunk.
|
||||
Leading spaces are now stripped only on the first alignment chunk of an
|
||||
utterance, so subsequent chunks correctly flush partial words across
|
||||
boundaries.
|
||||
(PR [#4415](https://github.com/pipecat-ai/pipecat/pull/4415))
|
||||
|
||||
- Fixed AWS Polly TTS, Bedrock LLM, and the Bedrock AgentCore processor
|
||||
erroring out when only one of `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY`
|
||||
was set in the environment. The half-populated kwargs are no longer forwarded
|
||||
to aioboto3; partial env-var configurations now fall through to the boto3
|
||||
credential chain like fully-unset configurations do.
|
||||
(PR [#4416](https://github.com/pipecat-ai/pipecat/pull/4416))
|
||||
|
||||
- Fixed `ElevenLabsTTSService` and `ElevenLabsHttpTTSService` writing
|
||||
romanized/normalized text to the LLM context. With non-Latin input (e.g.,
|
||||
Chinese), the assistant transcript was getting populated with pinyin (`Ni Hao
|
||||
!` instead of `你好!`), which then degraded subsequent LLM turns. The services
|
||||
now consume `alignment` by default and only switch to `normalizedAlignment` /
|
||||
`normalized_alignment` when `pronunciation_dictionary_locators` is configured
|
||||
(where `alignment` has overlapping restarts that produce duplicated/garbled
|
||||
words, per #4316). Both fields are read with preferred-with-fallback
|
||||
semantics since each is nullable per the API schema.
|
||||
(PR [#4424](https://github.com/pipecat-ai/pipecat/pull/4424))
|
||||
|
||||
- Fixed a deadlock in `TTSService` that could permanently stall pipeline
|
||||
processing when all three conditions occurred together:
|
||||
`pause_frame_processing=True`, an interruption arrived before any TTS audio
|
||||
was played, and an `UninterruptibleFrame` (e.g. `TTSUpdateSettingsFrame`,
|
||||
`FunctionCallResultFrame`) was in the processing queue at that moment. The
|
||||
process task would block on `__process_event.wait()` indefinitely because
|
||||
`BotStoppedSpeakingFrame` never arrives (no audio was played) and the
|
||||
interruption handler did not resume processing. Affects services using
|
||||
`pause_frame_processing=True` such as ElevenLabs, Rime, AsyncAI, Gradium, and
|
||||
ResembleAI.
|
||||
(PR [#4431](https://github.com/pipecat-ai/pipecat/pull/4431))
|
||||
|
||||
- Fixed interruptions being delayed when a slow interruptible frame was
|
||||
processing and an uninterruptible frame was waiting in the queue. The bot
|
||||
would stall until the slow frame finished instead of cancelling it
|
||||
immediately on interruption.
|
||||
(PR [#4434](https://github.com/pipecat-ai/pipecat/pull/4434))
|
||||
|
||||
- Fixed `TTSService` dropping uninterruptible frames (e.g.
|
||||
`FunctionCallResultFrame`) from its internal serialization queue when an
|
||||
interruption occurs. Previously, the queue was recreated on every
|
||||
interruption, silently discarding any queued frames. The queue is now reset
|
||||
instead of recreated, preserving uninterruptible frames so they are always
|
||||
delivered downstream.
|
||||
(PR [#4435](https://github.com/pipecat-ai/pipecat/pull/4435))
|
||||
|
||||
- Fixed a race condition in the Daily transport that caused `AttributeError:
|
||||
'NoneType' object has no attribute 'send_app_message'` when tearing down a
|
||||
pipeline. Both `DailyInputTransport` and `DailyOutputTransport` share the
|
||||
same `DailyTransportClient` and both call `cleanup()`, which was releasing
|
||||
the underlying `CallClient` on the first call — leaving the second caller
|
||||
with a `None` client.
|
||||
(PR [#4440](https://github.com/pipecat-ai/pipecat/pull/4440))
|
||||
|
||||
- Restored `cancel_on_interruption=False` support for `AWSNovaSonicLLMService`
|
||||
and `OpenAIRealtimeLLMService`. These services previously honored the flag by
|
||||
simply not cancelling in-flight function calls on interruption; the
|
||||
introduction of the new async-tool mechanism (which threads
|
||||
started/intermediate/final messages through the LLM context) broke that path
|
||||
because the realtime services didn't know how to interpret those messages.
|
||||
Note that new-style streamed intermediate results
|
||||
(`FunctionCallResultProperties(is_final=False)`) are not supported on these
|
||||
realtime services. Similar fixes for other impacted realtime services are
|
||||
forthcoming.
|
||||
(PR [#4441](https://github.com/pipecat-ai/pipecat/pull/4441))
|
||||
|
||||
- Fixed two misspelled Gemini TTS voice names in
|
||||
`GeminiTTSService.AVAILABLE_VOICES`.
|
||||
(PR [#4443](https://github.com/pipecat-ai/pipecat/pull/4443))
|
||||
|
||||
- Extended the `cancel_on_interruption=False` regression fix to
|
||||
`GrokRealtimeLLMService`, `AzureRealtimeLLMService`, and
|
||||
`UltravoxRealtimeLLMService`. Grok and Azure use the same approach as in
|
||||
#4441 (each service detects async-tool messages in the LLM context and routes
|
||||
the final result to its formal tool-result channel; Azure inherits
|
||||
transitively from `OpenAIRealtimeLLMService`). Ultravox needed a different
|
||||
approach because its API freezes the conversation between
|
||||
`client_tool_invocation` and the matching `client_tool_result` — for
|
||||
async-registered functions it now ships a placeholder `client_tool_result`
|
||||
immediately when the function is invoked (to unfreeze the conversation), then
|
||||
injects the real result as user-side text once the tool finishes. Streamed
|
||||
intermediate results (`FunctionCallResultProperties(is_final=False)`) are
|
||||
still not supported on any of these realtime services. `GeminiLiveLLMService`
|
||||
and `InworldRealtimeLLMService` are excluded for now: Gemini Live's
|
||||
async-tool path needs deeper investigation, and Inworld tool calling needs to
|
||||
be sorted out first.
|
||||
(PR [#4447](https://github.com/pipecat-ai/pipecat/pull/4447))
|
||||
|
||||
- Fixed `OpenAIRealtimeLLMService` handling of multi-output-item responses
|
||||
(observed with `gpt-realtime-2`). A single response can now contain more than
|
||||
one audio item, and the first item's `audio.done` may arrive after the second
|
||||
item's deltas have started. Deltas still arrive strictly in playback order,
|
||||
so we continue to forward them as received (matching OpenAI's reference
|
||||
implementation). The fix removes spurious warnings, ensures truncation always
|
||||
targets the latest audio item, and emits a single bracketing
|
||||
`TTSStartedFrame`/`TTSStoppedFrame` pair per assistant turn (the `TTSStoppedFrame` is
|
||||
now pushed on `response.done`).
|
||||
(PR [#4465](https://github.com/pipecat-ai/pipecat/pull/4465))
|
||||
|
||||
- Fixed missing `output` attribute on LLM OpenTelemetry spans when the LLM call
|
||||
is interrupted mid-stream.
|
||||
(PR [#4467](https://github.com/pipecat-ai/pipecat/pull/4467))
|
||||
|
||||
- Fixed incorrect `metrics.ttfb` on STT OpenTelemetry spans, and parented them
|
||||
to the current turn span.
|
||||
(PR [#4467](https://github.com/pipecat-ai/pipecat/pull/4467))
|
||||
|
||||
- Fixed incorrect `metrics.ttfb` on TTS OpenTelemetry spans for streaming
|
||||
services.
|
||||
(PR [#4467](https://github.com/pipecat-ai/pipecat/pull/4467))
|
||||
|
||||
- Extended the `cancel_on_interruption=False` regression fix to
|
||||
`InworldRealtimeLLMService`. Uses the same approach as in #4441 (the service
|
||||
detects async-tool messages in the LLM context and routes the final result to
|
||||
its formal tool-result channel). Note: as of this writing, Inworld Realtime
|
||||
doesn't appear to handle the resulting delayed tool result reliably — the
|
||||
routing is best-effort and the service surfaces a one-time warning when
|
||||
async-tool messages are seen. Streamed intermediate results
|
||||
(`FunctionCallResultProperties(is_final=False)`) are still not supported on
|
||||
this realtime service. (Inworld was excluded from #4447 pending resolution of
|
||||
an unrelated tool-calling issue, which turned out to be an account-level
|
||||
matter.)
|
||||
(PR [#4474](https://github.com/pipecat-ai/pipecat/pull/4474))
|
||||
|
||||
- Fixed Cartesia TTS Korean word timestamps to use normal spacing rules,
|
||||
preserving word boundaries and per-word timestamp alignment during downstream
|
||||
aggregation.
|
||||
(PR [#4475](https://github.com/pipecat-ai/pipecat/pull/4475))
|
||||
|
||||
- Fixed Cartesia TTS Chinese and Japanese timestamp grouping to preserve
|
||||
provider text spacing, avoiding artificial spaces when timestamp groups are
|
||||
reassembled downstream.
|
||||
(PR [#4475](https://github.com/pipecat-ai/pipecat/pull/4475))
|
||||
|
||||
- Fixed `SonioxSTTService` final transcription frames missing detected language
|
||||
metadata when Soniox returns token-level language annotations.
|
||||
(PR [#4482](https://github.com/pipecat-ai/pipecat/pull/4482))
|
||||
|
||||
- Fixed Soniox final transcription language detection to use the most common
|
||||
recognized token language, avoiding mislabeling an utterance when the last
|
||||
token is tagged with a different language.
|
||||
(PR [#4495](https://github.com/pipecat-ai/pipecat/pull/4495))
|
||||
|
||||
- Fixed dropped audio in streaming TTS services whose wire protocol doesn't
|
||||
echo `context_id` back on incoming audio (Sarvam, Smallest, Soniox, Inworld,
|
||||
and others). Previously, audio that arrived between contexts or at the very
|
||||
start of a turn was tagged with `context_id=None` and silently dropped with
|
||||
an "unable to append audio to context: no context ID provided" debug log.
|
||||
`TTSService.get_active_audio_context_id()` now falls back to the
|
||||
synthesis-side `_turn_context_id` when the playback cursor isn't set yet.
|
||||
(PR [#4497](https://github.com/pipecat-ai/pipecat/pull/4497))
|
||||
|
||||
### Security
|
||||
|
||||
- Fixed a path traversal issue in the development runner's
|
||||
`/files/{filename:path}` download endpoint. Previously, when the runner was
|
||||
started with `--folder`, a request like `/files/..%2F..%2Fetc%2Fpasswd` could
|
||||
escape the configured folder because `%2F`-encoded separators bypassed
|
||||
Starlette's path normalisation. The endpoint now resolves the joined path and
|
||||
rejects any filename that escapes the allowed base with a 403, and also
|
||||
returns 404 (instead of an implicit `null` 200) when `--folder` is unset.
|
||||
(PR [#4417](https://github.com/pipecat-ai/pipecat/pull/4417))
|
||||
|
||||
## [1.1.0] - 2026-04-27
|
||||
|
||||
### Added
|
||||
|
||||
158
CLAUDE.md
158
CLAUDE.md
@@ -1,157 +1 @@
|
||||
# CLAUDE.md
|
||||
|
||||
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
||||
|
||||
## Project Overview
|
||||
|
||||
Pipecat is an open-source Python framework for building real-time voice and multimodal conversational AI agents. It orchestrates audio/video, AI services, transports, and conversation pipelines using a frame-based architecture.
|
||||
|
||||
## Common Commands
|
||||
|
||||
```bash
|
||||
# Setup development environment
|
||||
uv sync --group dev --all-extras --no-extra gstreamer
|
||||
|
||||
# Install pre-commit hooks
|
||||
uv run pre-commit install
|
||||
|
||||
# Run all tests
|
||||
uv run pytest
|
||||
|
||||
# Run a single test file
|
||||
uv run pytest tests/test_name.py
|
||||
|
||||
# Run a specific test
|
||||
uv run pytest tests/test_name.py::test_function_name
|
||||
|
||||
# Preview changelog
|
||||
uv run towncrier build --draft --version Unreleased
|
||||
|
||||
# Lint and format check
|
||||
uv run ruff check
|
||||
uv run ruff format --check
|
||||
|
||||
# Update dependencies (after editing pyproject.toml)
|
||||
uv lock && uv sync
|
||||
```
|
||||
|
||||
## Architecture
|
||||
|
||||
### Frame-Based Pipeline Processing
|
||||
|
||||
All data flows as **Frame** objects through a pipeline of **FrameProcessors**:
|
||||
|
||||
```
|
||||
[Processor1] → [Processor2] → ... → [ProcessorN]
|
||||
```
|
||||
|
||||
**Key components:**
|
||||
|
||||
- **Frames** (`src/pipecat/frames/frames.py`): Data units (audio, text, video) and control signals. Flow DOWNSTREAM (input→output) or UPSTREAM (acknowledgments/errors).
|
||||
|
||||
- **FrameProcessor** (`src/pipecat/processors/frame_processor.py`): Base processing unit. Each processor receives frames, processes them, and pushes results downstream.
|
||||
|
||||
- **Pipeline** (`src/pipecat/pipeline/pipeline.py`): Chains processors together.
|
||||
|
||||
- **ParallelPipeline** (`src/pipecat/pipeline/parallel_pipeline.py`): Runs multiple pipelines in parallel.
|
||||
|
||||
- **Transports** (`src/pipecat/transports/`): Transports are frame processors that form the external I/O layer (Daily WebRTC, LiveKit WebRTC, WebSocket, Local). Abstract interface via `BaseTransport`, `BaseInputTransport` and `BaseOutputTransport`.
|
||||
|
||||
- **Pipeline Task (`src/pipecat/pipeline/task.py`)**: Runs and manages a pipeline. Pipeline tasks send the first frame, `StartFrame`, to the pipeline in order for processors to know they can start processing and pushing frames. Pipeline tasks internally create a pipeline with two additional processors, a source processor before the user-defined pipeline and a sink processor at the end. Those are used for multiple things: error handling, pipeline task level events, heartbeat monitoring, etc.
|
||||
|
||||
- **Pipeline Runner (`src/pipecat/pipeline/runner.py`)**: High-level entry point for executing pipeline tasks. Handles signal management (SIGINT/SIGTERM) for graceful shutdown and optional garbage collection. Run a single pipeline task with `await runner.run(task)` or multiple concurrently with `await asyncio.gather(runner.run(task1), runner.run(task2))`.
|
||||
|
||||
- **Services** (`src/pipecat/services/`): 60+ AI provider integrations (STT, TTS, LLM, etc.). Extend base classes: `AIService`, `LLMService`, `STTService`, `TTSService`, `VisionService`.
|
||||
|
||||
- **Serializers** (`src/pipecat/serializers/`): Convert frames to/from wire formats for WebSocket transports. `FrameSerializer` base class defines `serialize()` and `deserialize()`. Telephony serializers (Twilio, Plivo, Vonage, Telnyx, Exotel, Genesys) handle provider-specific protocols and audio encoding (e.g., μ-law).
|
||||
|
||||
- **RTVI** (`src/pipecat/processors/frameworks/rtvi.py`): Real-Time Voice Interface protocol bridging clients and the pipeline. `RTVIProcessor` handles incoming client messages (text input, audio, function call results). `RTVIObserver` converts pipeline frames to outgoing messages: user/bot speaking events, transcriptions, LLM/TTS lifecycle, function calls, metrics, and audio levels.
|
||||
|
||||
- **Observers** (`src/pipecat/observers/`): Monitor frame flow without modifying the pipeline. Passed to `PipelineTask` via the `observers` parameter. Implement `on_process_frame()` and `on_push_frame()` callbacks.
|
||||
|
||||
### Important Patterns
|
||||
|
||||
- **Context Aggregation**: `LLMContext` accumulates messages for LLM calls; `UserResponse` aggregates user input
|
||||
|
||||
- **Turn Management**: Turn management is done through `LLMUserAggregator` and
|
||||
`LLMAssistantAggregator`, created with `LLMContextAggregatorPair`
|
||||
|
||||
- **User turn strategies**: Detection of when the user starts and stops speaking is done via user turn start/stop strategies. They push `UserStartedSpeakingFrame` and `UserStoppedSpeakingFrame` respectively.
|
||||
|
||||
- **Interruptions**: Interruptions are usually triggered by a user turn start strategy (e.g. `VADUserTurnStartStrategy`) but they can be triggered by other processors as well, in which case the user turn start strategies don't need to trigger them. An `InterruptionFrame` carries an optional `asyncio.Event` that is set when the frame reaches the pipeline sink. If a processor stops an `InterruptionFrame` from propagating downstream (i.e., doesn't push it), it **must** call `frame.complete()` to avoid stalling `push_interruption_task_frame_and_wait()` callers.
|
||||
|
||||
- **Uninterruptible Frames**: These are frames that will not be removed from internal queues even if there's an interruption, for example `EndFrame` and `StopFrame`.
|
||||
|
||||
- **Events**: Most classes in Pipecat have `BaseObject` as the very base class. `BaseObject` has support for events. Events can run in the background in an async task (default) or synchronously (`sync=True`) if we want immediate action. Synchronous event handlers need to execute fast.
|
||||
|
||||
- **Async Task Management**: Always use `self.create_task(coroutine, name)` instead of raw `asyncio.create_task()`. The `TaskManager` automatically tracks tasks and cleans them up on processor shutdown. Use `await self.cancel_task(task, timeout)` for cancellation.
|
||||
|
||||
- **Error Handling**: Use `await self.push_error(msg, exception, fatal)` to push errors upstream. Services should use `fatal=False` (the default) so application code can handle errors and take action (e.g. switch to another service).
|
||||
|
||||
### Key Directories
|
||||
|
||||
| Directory | Purpose |
|
||||
| -------------------------- | -------------------------------------------------- |
|
||||
| `src/pipecat/frames/` | Frame definitions (100+ types) |
|
||||
| `src/pipecat/processors/` | FrameProcessor base + aggregators, filters, audio |
|
||||
| `src/pipecat/pipeline/` | Pipeline orchestration |
|
||||
| `src/pipecat/services/` | AI service integrations (60+ providers) |
|
||||
| `src/pipecat/transports/` | Transport layer (Daily, LiveKit, WebSocket, Local) |
|
||||
| `src/pipecat/serializers/` | Frame serialization for WebSocket protocols |
|
||||
| `src/pipecat/observers/` | Pipeline observers for monitoring frame flow |
|
||||
| `src/pipecat/audio/` | VAD, filters, mixers, turn detection, DTMF |
|
||||
| `src/pipecat/turns/` | User turn management |
|
||||
|
||||
## Code Style
|
||||
|
||||
- **Docstrings**: Google-style. Classes describe purpose; `__init__` has `Args:` section; dataclasses use `Parameters:` section.
|
||||
- **Linting**: Ruff (line length 100). Pre-commit hooks enforce formatting.
|
||||
- **Type hints**: Required for complex async code.
|
||||
- **Dataclass vs Pydantic**: Use `@dataclass` for frames and internal pipeline data (high-frequency, no validation needed). Use Pydantic `BaseModel` for configuration, parameters, metrics, and external API data (benefits from validation and serialization). Specifically:
|
||||
- `@dataclass`: Frame types, context aggregator pairs, internal data containers
|
||||
- `BaseModel`: Service `InputParams`, transport/VAD/turn params, metrics data, API request/response models, serializer params
|
||||
|
||||
### Docstring Example
|
||||
|
||||
```python
|
||||
class MyService(LLMService):
|
||||
"""Description of what the service does.
|
||||
|
||||
More detailed description.
|
||||
|
||||
Event handlers available:
|
||||
|
||||
- on_connected: Called when we are connected
|
||||
|
||||
Example::
|
||||
|
||||
@service.event_handler("on_connected")
|
||||
async def on_connected(service, frame):
|
||||
...
|
||||
"""
|
||||
|
||||
def __init__(self, param1: str, **kwargs):
|
||||
"""Initialize the service.
|
||||
|
||||
Args:
|
||||
param1: Description of param1.
|
||||
**kwargs: Additional arguments passed to parent.
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
```
|
||||
|
||||
## Service Implementation
|
||||
|
||||
When adding a new service:
|
||||
|
||||
1. Extend the appropriate base class (`STTService`, `TTSService`, `LLMService`, etc.)
|
||||
2. Implement required abstract methods
|
||||
3. Handle necessary frames
|
||||
4. By default, all frames should be pushed onward in the same direction they were travelling
|
||||
5. Push `ErrorFrame` on failures
|
||||
6. Add metrics tracking via `MetricsData` if relevant
|
||||
7. Follow the pattern of existing services in `src/pipecat/services/`
|
||||
|
||||
## Testing
|
||||
|
||||
Test utilities live in `src/pipecat/tests/utils.py`. Use `run_test()` to send frames through a pipeline and assert expected output frames in each direction. Use `SleepFrame(sleep=N)` to add delays between frames.
|
||||
@AGENTS.md
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
- Added a `session_id` field to `RunnerArguments` so bots can log or trace a per-session identifier in local development the same way they can in Pipecat Cloud. The development runner now mints a UUID at every construction site, and paths that already returned a `sessionId` to the caller (Daily `/start`, dial-in webhook) share that same UUID with the runner args instead of generating two. The SmallWebRTC `/api/offer` endpoint also accepts an optional `session_id` query parameter so the `/sessions/{session_id}/...` proxy can thread it through.
|
||||
@@ -1 +0,0 @@
|
||||
- Updated the default `SonioxTTSService` model from `tts-rt-v1-preview` to the generally available `tts-rt-v1`.
|
||||
1
changelog/4442.added.2.md
Normal file
1
changelog/4442.added.2.md
Normal file
@@ -0,0 +1 @@
|
||||
- Added `GET /status` endpoint to the development runner that reports which transports the running instance accepts (all by default, or the single transport passed via `-t`).
|
||||
1
changelog/4442.added.md
Normal file
1
changelog/4442.added.md
Normal file
@@ -0,0 +1 @@
|
||||
- Added plain WebSocket transport support to the development runner. Bots can now accept connections from non-telephony WebSocket clients (e.g., browser apps using protobuf framing) via the `/ws-client` endpoint alongside other transports.
|
||||
1
changelog/4442.changed.md
Normal file
1
changelog/4442.changed.md
Normal file
@@ -0,0 +1 @@
|
||||
- ⚠️ The development runner now supports all transports (WebRTC, Daily, telephony, plain WebSocket) simultaneously from a single server. The `/start` endpoint accepts a `"transport"` field to select the transport per-request; omitting `-t` at startup enables all transports instead of defaulting to WebRTC. The Daily browser-redirect route moved from `GET /` to `GET /daily`.
|
||||
1
changelog/4507.fixed.md
Normal file
1
changelog/4507.fixed.md
Normal file
@@ -0,0 +1 @@
|
||||
- Fixed `ElevenLabsSTTService` crashing when `language` was passed as `None`. When `language` is not set, the service now lets ElevenLabs auto-detect the audio language.
|
||||
1
changelog/4513.changed.2.md
Normal file
1
changelog/4513.changed.2.md
Normal file
@@ -0,0 +1 @@
|
||||
- OpenRouter LLM requests now convert `developer` messages to `user` messages by default for broader model compatibility. Override this by subclassing `OpenRouterLLMService` or setting `llm.supports_developer_role = True` for models that support the `developer` role.
|
||||
1
changelog/4513.changed.md
Normal file
1
changelog/4513.changed.md
Normal file
@@ -0,0 +1 @@
|
||||
- OpenRouter LLM service now defaults to `openai/gpt-4.1`.
|
||||
@@ -132,6 +132,10 @@ NOVITA_API_KEY=...
|
||||
|
||||
# NVIDIA
|
||||
NVIDIA_API_KEY=...
|
||||
# For a full example of how to deploy to SageMaker, see:
|
||||
# https://github.com/pipecat-ai/pipecat-examples/tree/main/nvidia_sagemaker_example/deployment/aws-sagemaker-nvidia
|
||||
SAGEMAKER_ASR_ENDPOINT_NAME=...
|
||||
SAGEMAKER_MAGPIE_ENDPOINT_NAME=...
|
||||
|
||||
# OpenAI
|
||||
OPENAI_API_KEY=...
|
||||
|
||||
232
examples/features/features-add-tool-change-messages.py
Normal file
232
examples/features/features-add-tool-change-messages.py
Normal file
@@ -0,0 +1,232 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Manual validation harness for the ``add_tool_change_messages`` feature.
|
||||
|
||||
When tools change mid-conversation, LLMs can produce a few different
|
||||
flavors of tool-call-related hallucination:
|
||||
|
||||
- **Forward hallucination** — calling a tool that has been removed.
|
||||
- **Negative hallucination** — refusing to call a tool that has been
|
||||
re-added (because recent context is full of "I can't" responses).
|
||||
- **Hallucinated output when tools are unavailable** — making up an
|
||||
answer rather than declining gracefully, or producing JSON that
|
||||
*looks* like a tool call but is actually just an assistant text
|
||||
response.
|
||||
|
||||
The ``add_tool_change_messages`` feature mitigates these by appending a
|
||||
developer-role message to the conversation whenever ``LLMSetToolsFrame``
|
||||
changes the set of advertised tools, so the LLM stays in sync with what's
|
||||
actually available.
|
||||
|
||||
This harness exercises all of those flavors by flipping the advertised
|
||||
tool set on a turn counter:
|
||||
|
||||
Phase 0 (turns 1–4): weather tool ACTIVE — confirm baseline.
|
||||
Phase 1 (turns 5–8): tool REMOVED — keep asking for weather.
|
||||
Phase 2 (turn 9+): tool RE-ADDED — does the LLM call it again?
|
||||
|
||||
Set ``ADD_TOOL_CHANGE_MESSAGES=0`` to disable the mitigation and see the
|
||||
unmitigated behavior. The default is ON so a fresh run shows the feature
|
||||
working.
|
||||
|
||||
Defaults to Llama 3.1 8B Instruct via a locally-running Ollama —
|
||||
anecdotally one of the more hallucination-prone of the easily accessible
|
||||
models. Pull the model once with ``ollama pull llama3.1:8b`` and make
|
||||
sure ``ollama serve`` is running. Swap the LLM service to validate other
|
||||
providers.
|
||||
|
||||
Run with::
|
||||
|
||||
uv run examples/features/features-add-tool-change-messages.py
|
||||
ADD_TOOL_CHANGE_MESSAGES=0 uv run examples/features/features-add-tool-change-messages.py
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMRunFrame, LLMSetToolsFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import NOT_GIVEN, LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.ollama.llm import OLLamaLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
# Default ON so a fresh run shows the feature working. Set to "0" to A/B
# against the unmitigated behavior. Only explicit "off" spellings (or an
# empty value) disable the feature, so a value such as "true" or "yes" keeps
# it enabled instead of silently switching it off.
ADD_TOOL_CHANGE_MESSAGES = os.environ.get("ADD_TOOL_CHANGE_MESSAGES", "1").strip().lower() not in {
    "",
    "0",
    "false",
    "no",
    "off",
}
|
||||
|
||||
|
||||
async def fetch_weather_from_api(params: FunctionCallParams):
    """Answer the ``get_current_weather`` tool call with a fixed report.

    No real weather API is contacted; the same canned payload is handed to
    ``params.result_callback`` on every call.

    Args:
        params: Function call parameters carrying the result callback.
    """
    canned_report = {"conditions": "nice", "temperature": "75"}
    await params.result_callback(canned_report)
|
||||
|
||||
|
||||
# Schema advertised to the LLM for the weather tool. Both arguments are
# required, and ``format`` is restricted to the two listed units.
weather_function = FunctionSchema(
    name="get_current_weather",
    description="Get the current weather",
    properties={
        "location": {
            "type": "string",
            "description": "The city and state, e.g. San Francisco, CA",
        },
        "format": {
            "type": "string",
            "enum": ["celsius", "fahrenheit"],
            "description": "The temperature unit to use. Infer this from the user's location.",
        },
    },
    required=["location", "format"],
)

# Tool set containing only the weather function.
weather_tools = ToolsSchema(standard_tools=[weather_function])
|
||||
|
||||
|
||||
# Transport parameter factories keyed by transport name. Each value is a
# zero-argument callable, so no params object is built until one is called.
# Every transport enables audio in both directions.
transport_params = {
    "daily": lambda: DailyParams(audio_in_enabled=True, audio_out_enabled=True),
    "twilio": lambda: FastAPIWebsocketParams(audio_in_enabled=True, audio_out_enabled=True),
    "webrtc": lambda: TransportParams(audio_in_enabled=True, audio_out_enabled=True),
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    """Run the tool-change validation bot on the given transport.

    Builds an STT -> LLM -> TTS pipeline with the weather tool advertised,
    then removes the tool and later re-adds it as user turns complete.

    Args:
        transport: Transport supplying the pipeline's input and output.
        runner_args: Runner arguments; supplies the pipeline idle timeout and
            the SIGINT handling flag.
    """
    logger.info(
        "Starting add_tool_change_messages demo bot "
        f"(ADD_TOOL_CHANGE_MESSAGES={ADD_TOOL_CHANGE_MESSAGES})"
    )

    transcriber = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])

    cartesia_key = os.environ["CARTESIA_API_KEY"]
    voice_settings = CartesiaTTSService.Settings(
        voice="71a7ad14-091c-4e8e-a314-022ece01c121",  # British Reading Lady
    )
    speaker = CartesiaTTSService(api_key=cartesia_key, settings=voice_settings)

    system_prompt = (
        "You are a helpful assistant in a voice conversation. Your responses "
        "will be spoken aloud, so avoid emojis, bullet points, or other "
        "formatting that can't be spoken. Respond briefly and naturally. "
        "If the user asks for the current weather, use the `get_current_weather` "
        "function if it's available. IMPORTANT: if you do not have access to the function, "
        "say something along the lines of 'Sorry, I can't check the weather right now.'."
    )
    # Llama 3.1 8B Instruct is anecdotally one of the more hallucination-prone
    # of the easily accessible models — exactly what we want for this
    # validation harness. Pull it with ``ollama pull llama3.1:8b`` and make
    # sure ``ollama serve`` is running.
    llm_settings = OLLamaLLMService.Settings(
        model="llama3.1:8b",
        system_instruction=system_prompt,
    )
    llm = OLLamaLLMService(settings=llm_settings)
    llm.register_function("get_current_weather", fetch_weather_from_api)

    context = LLMContext(tools=weather_tools)
    aggregator_pair = LLMContextAggregatorPair(
        context,
        user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()),
        add_tool_change_messages=ADD_TOOL_CHANGE_MESSAGES,
    )
    user_aggregator, assistant_aggregator = aggregator_pair

    stages = [
        transport.input(),
        transcriber,
        user_aggregator,
        llm,
        speaker,
        transport.output(),
        assistant_aggregator,
    ]
    pipeline = Pipeline(stages)

    task = PipelineTask(
        pipeline,
        params=PipelineParams(enable_metrics=True, enable_usage_metrics=True),
        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
    )

    # Phase controller: roughly 4 turns per phase.
    completed_turns = 0
    remove_at_turn = 5  # tool gone for turn N onward
    readd_at_turn = 9  # tool back for turn N onward

    @user_aggregator.event_handler("on_user_turn_stopped")
    async def on_user_turn_stopped(aggregator, strategy, message):
        nonlocal completed_turns
        completed_turns += 1
        logger.info(f"=== User turn {completed_turns} complete ===")

        # The tool set is switched when the turn *before* the threshold ends,
        # so the change is queued ahead of the threshold turn.
        if completed_turns == remove_at_turn - 1:
            logger.info(
                "=== Phase 1: weather tool REMOVED. Keep asking about the weather "
                "to exercise hallucination scenarios. ==="
            )
            await task.queue_frame(LLMSetToolsFrame(tools=NOT_GIVEN))
            return

        if completed_turns == readd_at_turn - 1:
            logger.info(
                "=== Phase 2: weather tool RE-ADDED. Ask for the weather again — "
                "does the LLM call it, or keep refusing? (THIS IS THE TEST.) ==="
            )
            await task.queue_frame(LLMSetToolsFrame(tools=weather_tools))

    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        logger.info("Client connected")
        logger.info(
            "=== Phase 0: weather tool ACTIVE. Ask for the weather a few times "
            "to confirm it's working. ==="
        )
        # Seed the context with an instruction, then trigger the first LLM run.
        greeting_request = {
            "role": "developer",
            "content": (
                "Please introduce yourself briefly to the user, then invite them "
                "to ask about the weather."
            ),
        }
        context.add_message(greeting_request)
        await task.queue_frames([LLMRunFrame()])

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(transport, client):
        logger.info("Client disconnected")
        await task.cancel()

    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
    await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
    """Main bot entry point compatible with Pipecat Cloud.

    Creates the transport described by ``runner_args`` from the module's
    ``transport_params`` table and hands it to ``run_bot``.

    Args:
        runner_args: Runner arguments describing the session to serve.
    """
    selected_transport = await create_transport(runner_args, transport_params)
    await run_bot(selected_transport, runner_args)


if __name__ == "__main__":
    # Imported only when run as a script.
    from pipecat.runner.run import main

    main()
|
||||
@@ -4,23 +4,33 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Example demonstrating ``PipelineTask(tool_resources=...)``.
|
||||
"""Example demonstrating ``PipelineTask(app_resources=...)``.
|
||||
|
||||
``tool_resources`` is an application-defined bag of anything you want every
|
||||
tool handler in a session to share by reference: database handles, HTTP
|
||||
clients, feature flags, per-user state, observability clients, in-memory
|
||||
caches — whatever fits your app. Pipecat passes it through untouched as
|
||||
``FunctionCallParams.tool_resources``.
|
||||
``app_resources`` is an application-defined bag of anything your
|
||||
application code may want to share across a session: database handles,
|
||||
HTTP clients, feature flags, per-user state, observability clients,
|
||||
in-memory caches — whatever fits your app. Pipecat passes it through
|
||||
untouched and exposes it as ``task.app_resources``, so any code with a
|
||||
handle on the task can read or mutate it.
|
||||
|
||||
This example uses a small ``ToolCallLogger`` as a stand-in for that "shared
|
||||
thing". A real app might just as easily pass a Postgres pool, a Redis
|
||||
client, a Stripe SDK instance, or any combination thereof. The mechanics
|
||||
shown here — construct once, hand to the task, read it from each handler,
|
||||
inspect it after the session — are the same regardless of what you put in.
|
||||
Two of the convenience aliases exercised below:
|
||||
|
||||
We bundle resources in a typed ``SessionResources`` dataclass and cast back
|
||||
to it at the top of each handler. Pipecat doesn't care what type you pass
|
||||
(a plain dict works too), but a typed container gives you autocomplete and
|
||||
- Tool handlers read it from ``FunctionCallParams.app_resources``.
|
||||
- Custom ``FrameProcessor`` subclasses read it from
|
||||
``self.pipeline_task.app_resources``.
|
||||
|
||||
This example uses two small loggers as stand-ins for that "shared thing":
|
||||
``ToolCallLogger`` (written from tool handlers) and
|
||||
``TranscriptionLogger`` (written from a custom ``FrameProcessor`` that
|
||||
sits in the pipeline). A real app might just as easily pass a Postgres
|
||||
pool, a Redis client, a Stripe SDK instance, or any combination thereof.
|
||||
The mechanics shown here — construct once, hand to the task, read it
|
||||
from each site, inspect it after the session — are the same regardless
|
||||
of what you put in.
|
||||
|
||||
We bundle resources in a typed ``AppResources`` dataclass and cast back
|
||||
to it at each read site. Pipecat doesn't care what type you pass (a
|
||||
plain dict works too), but a typed container gives you autocomplete and
|
||||
refactor safety instead of dict-by-string-key lookups.
|
||||
"""
|
||||
|
||||
@@ -28,7 +38,7 @@ import json
|
||||
import os
|
||||
from collections.abc import Mapping
|
||||
from dataclasses import dataclass
|
||||
from datetime import UTC, datetime, timezone
|
||||
from datetime import UTC, datetime
|
||||
from typing import Any, cast
|
||||
|
||||
from dotenv import load_dotenv
|
||||
@@ -37,7 +47,7 @@ from loguru import logger
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMRunFrame, TTSSpeakFrame
|
||||
from pipecat.frames.frames import Frame, LLMRunFrame, TranscriptionFrame, TTSSpeakFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
@@ -46,6 +56,7 @@ from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
@@ -86,30 +97,80 @@ class ToolCallLogger:
|
||||
return json.dumps(self._calls, indent=2)
|
||||
|
||||
|
||||
class TranscriptionLogger:
|
||||
"""Records final user transcriptions — written from a custom FrameProcessor."""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the logger with an empty list of recorded transcriptions."""
|
||||
self._entries: list[dict[str, Any]] = []
|
||||
|
||||
def log_transcription(self, text: str) -> None:
|
||||
"""Record a transcription.
|
||||
|
||||
Args:
|
||||
text: The transcribed user utterance.
|
||||
"""
|
||||
entry = {
|
||||
"timestamp": datetime.now(UTC).isoformat(),
|
||||
"text": text,
|
||||
}
|
||||
self._entries.append(entry)
|
||||
logger.info(f"[TranscriptionLogger] {text!r}")
|
||||
|
||||
def dump(self) -> str:
|
||||
"""Return all recorded transcriptions as a JSON string."""
|
||||
return json.dumps(self._entries, indent=2)
|
||||
|
||||
|
||||
@dataclass
|
||||
class SessionResources:
|
||||
"""Typed container for everything the tool handlers in this session share.
|
||||
class AppResources:
|
||||
"""Typed container for everything the app shares across this session.
|
||||
|
||||
Add fields here as the app grows (e.g. ``db: AsyncConnection``,
|
||||
``http: httpx.AsyncClient``). Handlers ``cast()`` ``params.tool_resources``
|
||||
to this type to get autocomplete and refactor safety.
|
||||
``http: httpx.AsyncClient``). Read sites ``cast()`` to this type to
|
||||
get autocomplete and refactor safety:
|
||||
|
||||
- In tools: ``cast(AppResources, params.app_resources)``.
|
||||
- In custom processors: ``cast(AppResources, self.pipeline_task.app_resources)``.
|
||||
"""
|
||||
|
||||
tool_call_logger: ToolCallLogger
|
||||
transcription_logger: TranscriptionLogger
|
||||
|
||||
|
||||
async def fetch_weather_from_api(params: FunctionCallParams):
|
||||
resources = cast(SessionResources, params.tool_resources)
|
||||
resources = cast(AppResources, params.app_resources)
|
||||
resources.tool_call_logger.log_tool_call(params.function_name, params.arguments)
|
||||
await params.result_callback({"conditions": "nice", "temperature": "75"})
|
||||
|
||||
|
||||
async def fetch_restaurant_recommendation(params: FunctionCallParams):
|
||||
resources = cast(SessionResources, params.tool_resources)
|
||||
resources = cast(AppResources, params.app_resources)
|
||||
resources.tool_call_logger.log_tool_call(params.function_name, params.arguments)
|
||||
await params.result_callback({"name": "The Golden Dragon"})
|
||||
|
||||
|
||||
class TranscriptionLoggingProcessor(FrameProcessor):
    """Records each final user transcription in the shared app resources.

    This is the second read site for ``app_resources``: a custom
    ``FrameProcessor`` reaches the same bag the tool handlers see through
    ``self.pipeline_task.app_resources``. ``pipeline_task`` is ``None`` until
    the task sets the processor up, so that case is skipped.
    """

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Forward all frames; log final user transcriptions on the way through.

        Args:
            frame: The frame travelling through the pipeline.
            direction: The direction the frame is travelling in.
        """
        await super().process_frame(frame, direction)

        if isinstance(frame, TranscriptionFrame):
            owning_task = self.pipeline_task
            if owning_task is not None:
                shared = cast(AppResources, owning_task.app_resources)
                shared.transcription_logger.log_transcription(frame.text)

        # Every frame is passed along unchanged, logged or not.
        await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
# We use lambdas to defer transport parameter creation until the transport
|
||||
# type is selected at runtime.
|
||||
transport_params = {
|
||||
@@ -203,6 +264,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
TranscriptionLoggingProcessor(),
|
||||
user_aggregator,
|
||||
llm,
|
||||
tts,
|
||||
@@ -211,10 +273,14 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
]
|
||||
)
|
||||
|
||||
# Keep a local handle so we can read collected state after the session
|
||||
# Keep local handles so we can read collected state after the session
|
||||
# ends; Pipecat never copies or clears the object.
|
||||
tool_call_logger = ToolCallLogger()
|
||||
resources = SessionResources(tool_call_logger=tool_call_logger)
|
||||
transcription_logger = TranscriptionLogger()
|
||||
resources = AppResources(
|
||||
tool_call_logger=tool_call_logger,
|
||||
transcription_logger=transcription_logger,
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
@@ -223,7 +289,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
tool_resources=resources,
|
||||
app_resources=resources,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
@@ -246,6 +312,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
# The session has ended; read whatever state the handlers built up.
|
||||
logger.info(f"Tool calls logged during session:\n{tool_call_logger.dump()}")
|
||||
logger.info(f"Transcriptions logged during session:\n{transcription_logger.dump()}")
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
@@ -133,6 +133,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -142,6 +142,9 @@ Start by asking me for my location. Then, use 'get_weather_current' to give me a
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -143,6 +143,9 @@ Start by asking me for my location. Then, use 'get_weather_current' to give me a
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -134,6 +134,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -131,6 +131,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
187
examples/function-calling/function-calling-missing-handler.py
Normal file
187
examples/function-calling/function-calling-missing-handler.py
Normal file
@@ -0,0 +1,187 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Manual demonstration of the missing-handler (developer-error) recovery path.
|
||||
|
||||
When a tool is advertised to the LLM via ``tools``/``LLMContext`` but
|
||||
the developer forgets to call ``llm.register_function(...)`` to wire up
|
||||
its handler, the LLM happily emits a tool call and then... nothing
|
||||
happens on the Pipecat side, leaving the conversation stuck.
|
||||
|
||||
Pipecat's recovery path (``LLMService._missing_function_call_handler``)
|
||||
catches this case:
|
||||
|
||||
- Logs a ``logger.error`` distinguishing **developer error** (tool advertised
|
||||
but no handler registered) from a hallucination (tool not advertised),
|
||||
pointing at the missing ``register_function`` call.
|
||||
- Returns a neutral terminal tool result
|
||||
(``LLMService.MISSING_FUNCTION_CALL_MESSAGE_TEMPLATE``: "The function
|
||||
`X` is not currently available.") so the call still terminates with a
|
||||
normal tool result instead of leaving the conversation stuck.
|
||||
|
||||
This example is **deliberately broken**: the weather schema is in
|
||||
``tools`` but ``register_function`` is *not* called. Ask the bot about
|
||||
the weather and observe:
|
||||
|
||||
1. The LLM emits a tool call for ``get_current_weather``.
|
||||
2. ``logger.error`` fires with "advertised … but has no registered handler
|
||||
— did you forget to call register_function()?"
|
||||
3. The terminal tool result is fed back to the LLM.
|
||||
4. The LLM responds in voice based on that result (typically something
|
||||
like "the weather function isn't available right now").
|
||||
|
||||
Uses the OpenAI LLM service with defaults. Swap to another provider to
|
||||
validate this behavior elsewhere.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# Schema advertised to the LLM for the weather tool. Both arguments are
# required, and ``format`` is restricted to the two listed units.
weather_function = FunctionSchema(
    name="get_current_weather",
    description="Get the current weather",
    properties={
        "location": {
            "type": "string",
            "description": "The city and state, e.g. San Francisco, CA",
        },
        "format": {
            "type": "string",
            "enum": ["celsius", "fahrenheit"],
            "description": "The temperature unit to use. Infer this from the user's location.",
        },
    },
    required=["location", "format"],
)

# Tool set containing only the weather function.
weather_tools = ToolsSchema(standard_tools=[weather_function])
|
||||
|
||||
|
||||
# Transport parameter factories keyed by transport name. Each value is a
# zero-argument callable, so no params object is built until one is called.
# Every transport enables audio in both directions.
transport_params = {
    "daily": lambda: DailyParams(audio_in_enabled=True, audio_out_enabled=True),
    "twilio": lambda: FastAPIWebsocketParams(audio_in_enabled=True, audio_out_enabled=True),
    "webrtc": lambda: TransportParams(audio_in_enabled=True, audio_out_enabled=True),
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    """Run the deliberately broken missing-handler demo bot.

    Builds an STT -> LLM -> TTS pipeline that advertises the weather tool but
    never registers a handler for it.

    Args:
        transport: Transport supplying the pipeline's input and output.
        runner_args: Runner arguments; supplies the pipeline idle timeout and
            the SIGINT handling flag.
    """
    logger.info("Starting missing-handler demo bot (no handler is registered on purpose)")

    transcriber = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])

    cartesia_key = os.environ["CARTESIA_API_KEY"]
    voice_settings = CartesiaTTSService.Settings(
        voice="71a7ad14-091c-4e8e-a314-022ece01c121",  # British Reading Lady
    )
    speaker = CartesiaTTSService(api_key=cartesia_key, settings=voice_settings)

    openai_key = os.environ["OPENAI_API_KEY"]
    system_prompt = (
        "You are a helpful assistant in a voice conversation. Your responses "
        "will be spoken aloud, so avoid emojis, bullet points, or other "
        "formatting that can't be spoken. Respond briefly and naturally. "
        "Always use the get_current_weather function to answer questions "
        "about the current weather."
    )
    llm = OpenAILLMService(
        api_key=openai_key,
        settings=OpenAILLMService.Settings(system_instruction=system_prompt),
    )

    # *** DELIBERATELY OMITTED ***
    # The whole point of this example is to demonstrate the missing-handler
    # recovery path. Re-add this line to wire the tool up correctly:
    #
    #     llm.register_function("get_current_weather", fetch_weather_from_api)

    context = LLMContext(tools=weather_tools)
    aggregator_pair = LLMContextAggregatorPair(
        context,
        user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()),
    )
    user_aggregator, assistant_aggregator = aggregator_pair

    stages = [
        transport.input(),
        transcriber,
        user_aggregator,
        llm,
        speaker,
        transport.output(),
        assistant_aggregator,
    ]
    pipeline = Pipeline(stages)

    task = PipelineTask(
        pipeline,
        params=PipelineParams(enable_metrics=True, enable_usage_metrics=True),
        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
    )

    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        logger.info("Client connected")
        logger.info(
            "=== Ask for the weather. Watch for a logger.error about the missing "
            "handler, and listen for the LLM's response based on the recovery "
            "message. ==="
        )
        # Seed the context with an instruction, then trigger the first LLM run.
        greeting_request = {
            "role": "developer",
            "content": (
                "Please introduce yourself briefly to the user, then invite "
                "them to ask about the weather."
            ),
        }
        context.add_message(greeting_request)
        await task.queue_frames([LLMRunFrame()])

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(transport, client):
        logger.info("Client disconnected")
        await task.cancel()

    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
    await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
    """Main bot entry point compatible with Pipecat Cloud.

    Creates the transport described by ``runner_args`` from the module's
    ``transport_params`` table and hands it to ``run_bot``.

    Args:
        runner_args: Runner arguments describing the session to serve.
    """
    selected_transport = await create_transport(runner_args, transport_params)
    await run_bot(selected_transport, runner_args)


if __name__ == "__main__":
    # Imported only when run as a script.
    from pipecat.runner.run import main

    main()
|
||||
@@ -144,6 +144,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -149,6 +149,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -135,6 +135,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -149,6 +149,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -29,7 +29,7 @@ from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.services.openai.stt import OpenAISTTService
|
||||
from pipecat.services.openai.stt import OpenAIRealtimeSTTService
|
||||
from pipecat.services.openai.tts import OpenAITTSService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
@@ -69,13 +69,7 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = OpenAISTTService(
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAISTTService.Settings(
|
||||
model="gpt-4o-transcribe",
|
||||
prompt="Expect words related weather, such as temperature and conditions. And restaurant names.",
|
||||
),
|
||||
)
|
||||
stt = OpenAIRealtimeSTTService(api_key=os.environ["OPENAI_API_KEY"])
|
||||
|
||||
tts = OpenAITTSService(
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
|
||||
@@ -187,6 +187,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -25,7 +25,7 @@ from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.services.openai.stt import OpenAISTTService
|
||||
from pipecat.services.openai.stt import OpenAIRealtimeSTTService
|
||||
from pipecat.services.openai.tts import OpenAITTSService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
@@ -63,20 +63,14 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = OpenAISTTService(
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAISTTService.Settings(
|
||||
model="gpt-4o-transcribe",
|
||||
prompt="Expect words related weather, such as temperature and conditions. And restaurant names.",
|
||||
),
|
||||
)
|
||||
stt = OpenAIRealtimeSTTService(api_key=os.environ["OPENAI_API_KEY"])
|
||||
|
||||
tts = OpenAITTSService(
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAITTSService.Settings(
|
||||
instructions="Please speak clearly and at a moderate pace.",
|
||||
voice="ballad",
|
||||
),
|
||||
instructions="Please speak clearly and at a moderate pace.",
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
@@ -154,6 +148,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -76,7 +76,6 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
llm = OpenRouterLLMService(
|
||||
api_key=os.environ["OPENROUTER_API_KEY"],
|
||||
settings=OpenRouterLLMService.Settings(
|
||||
model="openai/gpt-4o-2024-11-20",
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
)
|
||||
@@ -136,6 +135,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -71,8 +71,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
llm = QwenLLMService(
|
||||
api_key=os.environ["QWEN_API_KEY"],
|
||||
model="qwen2.5-72b-instruct",
|
||||
settings=QwenLLMService.Settings(
|
||||
model="qwen2.5-72b-instruct",
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
)
|
||||
@@ -134,6 +134,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -133,6 +133,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
184
examples/realtime/realtime-aws-nova-sonic-async-tool.py
Normal file
184
examples/realtime/realtime-aws-nova-sonic-async-tool.py
Normal file
@@ -0,0 +1,184 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Example: async function call with the AWS Nova Sonic LLM service.
|
||||
|
||||
The ``get_current_weather`` tool is registered with
|
||||
``cancel_on_interruption=False`` and simulates a slow API call (10s sleep).
|
||||
While the call is in flight the conversation continues; the result arrives
|
||||
later via the async-tool mechanism and is forwarded to Nova Sonic via the
|
||||
formal toolResult channel so the model can integrate it naturally into its
|
||||
next turn.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
from datetime import datetime
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.aws.nova_sonic.llm import AWSNovaSonicLLMService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
async def fetch_weather_from_api(params: FunctionCallParams):
|
||||
# Simulate a long-running API call so we can demonstrate that the
|
||||
# conversation continues while the tool is in flight.
|
||||
await asyncio.sleep(10)
|
||||
temperature = (
|
||||
random.randint(60, 85)
|
||||
if params.arguments["format"] == "fahrenheit"
|
||||
else random.randint(15, 30)
|
||||
)
|
||||
await params.result_callback(
|
||||
{
|
||||
"conditions": "nice",
|
||||
"temperature": temperature,
|
||||
"location": params.arguments["location"],
|
||||
"format": params.arguments["format"],
|
||||
"timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
weather_function = FunctionSchema(
|
||||
name="get_current_weather",
|
||||
description="Get the current weather",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
"description": "The temperature unit to use. Infer this from the users location.",
|
||||
},
|
||||
},
|
||||
required=["location", "format"],
|
||||
)
|
||||
|
||||
tools = ToolsSchema(standard_tools=[weather_function])
|
||||
|
||||
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
system_instruction = (
|
||||
"You are a friendly assistant. The user and you will engage in a spoken "
|
||||
"dialog exchanging the transcripts of a natural real-time conversation. "
|
||||
"Keep your responses short, generally two or three sentences for chatty "
|
||||
"scenarios. When the user asks for the weather, call get_current_weather. "
|
||||
"While you wait for the result, keep chatting with the user. When the "
|
||||
"result arrives, share it with the user naturally."
|
||||
)
|
||||
|
||||
llm = AWSNovaSonicLLMService(
|
||||
secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
|
||||
access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
|
||||
region=os.environ["AWS_REGION"],
|
||||
session_token=os.getenv("AWS_SESSION_TOKEN"),
|
||||
settings=AWSNovaSonicLLMService.Settings(
|
||||
voice="tiffany",
|
||||
system_instruction=system_instruction,
|
||||
),
|
||||
)
|
||||
|
||||
llm.register_function(
|
||||
"get_current_weather",
|
||||
fetch_weather_from_api,
|
||||
cancel_on_interruption=False,
|
||||
)
|
||||
|
||||
context = LLMContext(tools=tools)
|
||||
user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()),
|
||||
)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
user_aggregator,
|
||||
llm,
|
||||
transport.output(),
|
||||
assistant_aggregator,
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -46,11 +46,6 @@ async def fetch_weather_from_api(params: FunctionCallParams):
|
||||
if params.arguments["format"] == "fahrenheit"
|
||||
else random.randint(15, 30)
|
||||
)
|
||||
# Simulate a long network delay.
|
||||
# You can continue chatting while waiting for this to complete.
|
||||
# With Nova 2 Sonic (the default model), the assistant will respond
|
||||
# appropriately once the function call is complete.
|
||||
await asyncio.sleep(5)
|
||||
await params.result_callback(
|
||||
{
|
||||
"conditions": "nice",
|
||||
@@ -150,9 +145,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
# Register function for function calls
|
||||
# you can either register a single function for all function calls, or specific functions
|
||||
# llm.register_function(None, fetch_weather_from_api)
|
||||
llm.register_function(
|
||||
"get_current_weather", fetch_weather_from_api, cancel_on_interruption=False
|
||||
)
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
|
||||
# Set up context and context management.
|
||||
context = LLMContext(tools=tools)
|
||||
|
||||
195
examples/realtime/realtime-azure-async-tool.py
Normal file
195
examples/realtime/realtime-azure-async-tool.py
Normal file
@@ -0,0 +1,195 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Example: async function call with the Azure Realtime LLM service.
|
||||
|
||||
The ``get_current_weather`` tool is registered with
|
||||
``cancel_on_interruption=False`` and simulates a slow API call (10s sleep).
|
||||
While the call is in flight the conversation continues; the result arrives
|
||||
later via the async-tool mechanism and is forwarded to Azure Realtime as a
|
||||
``function_call_output`` so the model can integrate it naturally into its
|
||||
next turn.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
from datetime import datetime
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.azure.realtime.llm import AzureRealtimeLLMService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.openai.realtime.events import (
|
||||
AudioConfiguration,
|
||||
AudioInput,
|
||||
InputAudioTranscription,
|
||||
SessionProperties,
|
||||
)
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
async def fetch_weather_from_api(params: FunctionCallParams):
|
||||
# Simulate a long-running API call so we can demonstrate that the
|
||||
# conversation continues while the tool is in flight.
|
||||
await asyncio.sleep(10)
|
||||
temperature = (
|
||||
random.randint(60, 85)
|
||||
if params.arguments["format"] == "fahrenheit"
|
||||
else random.randint(15, 30)
|
||||
)
|
||||
await params.result_callback(
|
||||
{
|
||||
"conditions": "nice",
|
||||
"temperature": temperature,
|
||||
"location": params.arguments["location"],
|
||||
"format": params.arguments["format"],
|
||||
"timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
weather_function = FunctionSchema(
|
||||
name="get_current_weather",
|
||||
description="Get the current weather",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
"description": "The temperature unit to use. Infer this from the users location.",
|
||||
},
|
||||
},
|
||||
required=["location", "format"],
|
||||
)
|
||||
|
||||
tools = ToolsSchema(standard_tools=[weather_function])
|
||||
|
||||
|
||||
system_instruction = (
|
||||
"You are a friendly assistant. The user and you will engage in a spoken "
|
||||
"dialog exchanging the transcripts of a natural real-time conversation. "
|
||||
"Keep your responses short, generally two or three sentences for chatty "
|
||||
"scenarios. When the user asks for the weather, call get_current_weather. "
|
||||
"While you wait for the result, keep chatting with the user. When the "
|
||||
"result arrives, share it with the user naturally."
|
||||
)
|
||||
|
||||
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
llm = AzureRealtimeLLMService(
|
||||
api_key=os.environ["AZURE_REALTIME_API_KEY"],
|
||||
base_url=os.environ["AZURE_REALTIME_BASE_URL"],
|
||||
settings=AzureRealtimeLLMService.Settings(
|
||||
system_instruction=system_instruction,
|
||||
session_properties=SessionProperties(
|
||||
audio=AudioConfiguration(
|
||||
input=AudioInput(
|
||||
transcription=InputAudioTranscription(model="whisper-1"),
|
||||
)
|
||||
),
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
llm.register_function(
|
||||
"get_current_weather",
|
||||
fetch_weather_from_api,
|
||||
cancel_on_interruption=False,
|
||||
)
|
||||
|
||||
context = LLMContext(tools=tools)
|
||||
user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()),
|
||||
)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
user_aggregator,
|
||||
llm,
|
||||
transport.output(),
|
||||
assistant_aggregator,
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -4,15 +4,25 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Example: async function call with the Gemini Live LLM service.
|
||||
|
||||
The ``get_current_weather`` tool is registered with
|
||||
``cancel_on_interruption=False`` and simulates a slow API call (10s sleep).
|
||||
While the call is in flight the conversation continues; the result arrives
|
||||
later via the async-tool mechanism and is forwarded to Gemini Live as a
|
||||
FunctionResponse so the model can integrate it naturally into its next turn.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
from datetime import datetime
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import AdapterType, ToolsSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -31,33 +41,55 @@ load_dotenv(override=True)
|
||||
|
||||
|
||||
async def fetch_weather_from_api(params: FunctionCallParams):
|
||||
temperature = 75 if params.arguments["format"] == "fahrenheit" else 24
|
||||
# Simulate a long-running API call so we can demonstrate that the
|
||||
# conversation continues while the tool is in flight.
|
||||
await asyncio.sleep(10)
|
||||
temperature = (
|
||||
random.randint(60, 85)
|
||||
if params.arguments["format"] == "fahrenheit"
|
||||
else random.randint(15, 30)
|
||||
)
|
||||
await params.result_callback(
|
||||
{
|
||||
"conditions": "nice",
|
||||
"temperature": temperature,
|
||||
"location": params.arguments["location"],
|
||||
"format": params.arguments["format"],
|
||||
"timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
async def fetch_restaurant_recommendation(params: FunctionCallParams):
|
||||
await params.result_callback({"name": "The Golden Dragon"})
|
||||
weather_function = FunctionSchema(
|
||||
name="get_current_weather",
|
||||
description="Get the current weather",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
"description": "The temperature unit to use. Infer this from the user's location.",
|
||||
},
|
||||
},
|
||||
required=["location", "format"],
|
||||
)
|
||||
|
||||
tools = ToolsSchema(standard_tools=[weather_function])
|
||||
|
||||
|
||||
system_instruction = """
|
||||
You are a helpful assistant who can answer questions and use tools.
|
||||
|
||||
You have three tools available to you:
|
||||
1. get_current_weather: Use this tool to get the current weather in a specific location.
|
||||
2. get_restaurant_recommendation: Use this tool to get a restaurant recommendation in a specific location.
|
||||
3. google_search: Use this tool to search the web for information.
|
||||
"""
|
||||
system_instruction = (
|
||||
"You are a friendly assistant. The user and you will engage in a spoken "
|
||||
"dialog exchanging the transcripts of a natural real-time conversation. "
|
||||
"Keep your responses short, generally two or three sentences for chatty "
|
||||
"scenarios. When the user asks for the weather, call get_current_weather. "
|
||||
"While you wait for the result, keep chatting with the user. When the "
|
||||
"result arrives, share it with the user naturally."
|
||||
)
|
||||
|
||||
|
||||
# We use lambdas to defer transport parameter creation until the transport
|
||||
# type is selected at runtime.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
@@ -77,42 +109,6 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
weather_function = FunctionSchema(
|
||||
name="get_current_weather",
|
||||
description="Get the current weather",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
"description": "The temperature unit to use. Infer this from the user's location.",
|
||||
},
|
||||
},
|
||||
required=["location", "format"],
|
||||
)
|
||||
restaurant_function = FunctionSchema(
|
||||
name="get_restaurant_recommendation",
|
||||
description="Get a restaurant recommendation",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
},
|
||||
required=["location"],
|
||||
)
|
||||
search_tool = {"google_search": {}}
|
||||
# KNOWN ISSUE: If using GeminiVertexLiveLLMService, it appears
|
||||
# you cannot use the "google_search" tool alongside other tools.
|
||||
# See https://github.com/googleapis/python-genai/issues/941.
|
||||
tools = ToolsSchema(
|
||||
standard_tools=[weather_function, restaurant_function],
|
||||
custom_tools={AdapterType.GEMINI: [search_tool]},
|
||||
)
|
||||
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.environ["GOOGLE_API_KEY"],
|
||||
settings=GeminiLiveLLMService.Settings(
|
||||
@@ -121,13 +117,12 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
tools=tools,
|
||||
)
|
||||
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
|
||||
llm.register_function(
|
||||
"get_current_weather",
|
||||
fetch_weather_from_api,
|
||||
cancel_on_interruption=False,
|
||||
)
|
||||
|
||||
# You can provide the system instructions and tools in the context rather
|
||||
# than as arguments to GeminiLiveLLMService, but note that doing so will
|
||||
# trigger a (fast) reconnection when the GeminiLiveLLMService first
|
||||
# receives the context (i.e. when we send the LLMRunFrame below).
|
||||
context = LLMContext()
|
||||
# Server-side VAD is enabled by default; no local VAD is added.
|
||||
user_aggregator, assistant_aggregator = LLMContextAggregatorPair(context)
|
||||
@@ -154,7 +149,6 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
@@ -166,7 +160,6 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
@@ -6,10 +6,13 @@
|
||||
|
||||
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import AdapterType, ToolsSchema
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -23,6 +26,7 @@ from pipecat.processors.aggregators.llm_response_universal import (
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
@@ -30,6 +34,32 @@ from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
async def fetch_weather_from_api(params: FunctionCallParams):
|
||||
temperature = 75 if params.arguments["format"] == "fahrenheit" else 24
|
||||
await params.result_callback(
|
||||
{
|
||||
"conditions": "nice",
|
||||
"temperature": temperature,
|
||||
"format": params.arguments["format"],
|
||||
"timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
async def fetch_restaurant_recommendation(params: FunctionCallParams):
|
||||
await params.result_callback({"name": "The Golden Dragon"})
|
||||
|
||||
|
||||
system_instruction = """
|
||||
You are a helpful assistant who can answer questions and use tools.
|
||||
|
||||
You have three tools available to you:
|
||||
1. get_current_weather: Use this tool to get the current weather in a specific location.
|
||||
2. get_restaurant_recommendation: Use this tool to get a restaurant recommendation in a specific location.
|
||||
3. google_search: Use this tool to search the web for information.
|
||||
"""
|
||||
|
||||
|
||||
# We use lambdas to defer transport parameter creation until the transport
|
||||
# type is selected at runtime.
|
||||
transport_params = {
|
||||
@@ -51,23 +81,55 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
weather_function = FunctionSchema(
|
||||
name="get_current_weather",
|
||||
description="Get the current weather",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
"description": "The temperature unit to use. Infer this from the user's location.",
|
||||
},
|
||||
},
|
||||
required=["location", "format"],
|
||||
)
|
||||
restaurant_function = FunctionSchema(
|
||||
name="get_restaurant_recommendation",
|
||||
description="Get a restaurant recommendation",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
},
|
||||
required=["location"],
|
||||
)
|
||||
search_tool = {"google_search": {}}
|
||||
# KNOWN ISSUE: If using GeminiVertexLiveLLMService, it appears
|
||||
# you cannot use the "google_search" tool alongside other tools.
|
||||
# See https://github.com/googleapis/python-genai/issues/941.
|
||||
tools = ToolsSchema(
|
||||
standard_tools=[weather_function, restaurant_function],
|
||||
custom_tools={AdapterType.GEMINI: [search_tool]},
|
||||
)
|
||||
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.environ["GOOGLE_API_KEY"],
|
||||
settings=GeminiLiveLLMService.Settings(
|
||||
system_instruction=system_instruction,
|
||||
voice="Aoede", # Puck, Charon, Kore, Fenrir, Aoede
|
||||
# system_instruction="Talk like a pirate."
|
||||
),
|
||||
# inference_on_context_initialization=False,
|
||||
tools=tools,
|
||||
)
|
||||
|
||||
context = LLMContext(
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Say hello. Then ask if I want to hear a joke.",
|
||||
},
|
||||
],
|
||||
)
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
|
||||
|
||||
context = LLMContext()
|
||||
# Server-side VAD is enabled by default; no local VAD is added.
|
||||
user_aggregator, assistant_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
@@ -94,6 +156,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
179
examples/realtime/realtime-grok-async-tool.py
Normal file
179
examples/realtime/realtime-grok-async-tool.py
Normal file
@@ -0,0 +1,179 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Example: async function call with the Grok Realtime LLM service.
|
||||
|
||||
The ``get_current_weather`` tool is registered with
|
||||
``cancel_on_interruption=False`` and simulates a slow API call (10s sleep).
|
||||
While the call is in flight the conversation continues; the result arrives
|
||||
later via the async-tool mechanism and is forwarded to Grok Realtime as a
|
||||
``function_call_output`` so the model can integrate it naturally into its
|
||||
next turn.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
from datetime import datetime
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.xai.realtime.events import SessionProperties
|
||||
from pipecat.services.xai.realtime.llm import GrokRealtimeLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
async def fetch_weather_from_api(params: FunctionCallParams):
    """Simulate a slow weather lookup and report it via ``params.result_callback``.

    Demo stub: no weather service is contacted. The handler sleeps for ten
    seconds, then returns a random temperature in a range that depends on the
    requested unit.

    Args:
        params: Function-call parameters; ``params.arguments`` must contain
            ``"location"`` and ``"format"``.
    """
    # Simulate a long-running API call so we can demonstrate that the
    # conversation continues while the tool is in flight.
    await asyncio.sleep(10)

    if params.arguments["format"] == "fahrenheit":
        temperature = random.randint(60, 85)
    else:
        temperature = random.randint(15, 30)

    await params.result_callback(
        {
            "conditions": "nice",
            "temperature": temperature,
            "location": params.arguments["location"],
            "format": params.arguments["format"],
            "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
        }
    )
|
||||
|
||||
|
||||
# Schema advertised to the model for the ``get_current_weather`` tool. Both
# parameters are required; "format" is restricted to the two listed units.
weather_function = FunctionSchema(
    name="get_current_weather",
    description="Get the current weather",
    properties={
        "location": {
            "type": "string",
            "description": "The city and state, e.g. San Francisco, CA",
        },
        "format": {
            "type": "string",
            "enum": ["celsius", "fahrenheit"],
            "description": "The temperature unit to use. Infer this from the users location.",
        },
    },
    required=["location", "format"],
)

# Tool set handed to the LLM context; this example exposes only the weather tool.
tools = ToolsSchema(standard_tools=[weather_function])
|
||||
|
||||
|
||||
# System prompt: asks for short spoken-style replies and tells the model to
# keep talking while the weather tool call is pending.
system_instruction = (
    "You are a friendly assistant. The user and you will engage in a spoken "
    "dialog exchanging the transcripts of a natural real-time conversation. "
    "Keep your responses short, generally two or three sentences for chatty "
    "scenarios. When the user asks for the weather, call get_current_weather. "
    "While you wait for the result, keep chatting with the user. When the "
    "result arrives, share it with the user naturally."
)
|
||||
|
||||
|
||||
# Note: Grok has built-in server-side VAD, so we don't need local VAD.
# Each value is a zero-argument factory, so the parameter object is only
# built when its factory is called.
transport_params = {
    "daily": lambda: DailyParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
    ),
    "twilio": lambda: FastAPIWebsocketParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
    ),
    "webrtc": lambda: TransportParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
    ),
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    """Build the Grok Realtime pipeline on ``transport`` and run it to completion.

    Args:
        transport: Transport providing the pipeline's input and output.
        runner_args: Runner settings; ``pipeline_idle_timeout_secs`` and
            ``handle_sigint`` are read from it.
    """
    logger.info("Starting bot")

    llm = GrokRealtimeLLMService(
        api_key=os.environ["XAI_API_KEY"],
        settings=GrokRealtimeLLMService.Settings(
            system_instruction=system_instruction,
            session_properties=SessionProperties(
                voice="Ara",
            ),
        ),
    )

    # cancel_on_interruption=False is what makes this the "async tool"
    # example: per the module docstring, the conversation continues while
    # the call is in flight and the result arrives later.
    llm.register_function(
        "get_current_weather",
        fetch_weather_from_api,
        cancel_on_interruption=False,
    )

    context = LLMContext(tools=tools)
    user_aggregator, assistant_aggregator = LLMContextAggregatorPair(context)

    pipeline = Pipeline(
        [
            transport.input(),
            user_aggregator,
            llm,
            transport.output(),
            assistant_aggregator,
        ]
    )

    task = PipelineTask(
        pipeline,
        params=PipelineParams(
            enable_metrics=True,
            enable_usage_metrics=True,
        ),
        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
    )

    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        logger.info("Client connected")
        # Seed the context with an instruction and trigger an LLM run so the
        # bot speaks first.
        context.add_message(
            {"role": "developer", "content": "Please introduce yourself to the user."}
        )
        await task.queue_frames([LLMRunFrame()])

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(transport, client):
        logger.info("Client disconnected")
        await task.cancel()

    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
    await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
    """Main bot entry point compatible with Pipecat Cloud.

    Creates a transport with ``create_transport(runner_args, transport_params)``
    and passes it to ``run_bot``.
    """
    transport = await create_transport(runner_args, transport_params)
    await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Imported here so the runner module is only loaded when this file is
    # executed as a script.
    from pipecat.runner.run import main

    main()
|
||||
@@ -28,10 +28,14 @@ Usage:
|
||||
"""
|
||||
|
||||
import os
|
||||
import random
|
||||
from datetime import datetime
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.observers.loggers.transcription_log_observer import (
|
||||
TranscriptionLogObserver,
|
||||
@@ -48,6 +52,7 @@ from pipecat.processors.aggregators.llm_response_universal import (
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.inworld.realtime.llm import InworldRealtimeLLMService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
@@ -55,6 +60,43 @@ from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
async def fetch_weather_from_api(params: FunctionCallParams):
    """Answer a ``get_current_weather`` tool call with randomized demo data.

    Demo stub: no weather service is contacted. A random temperature is drawn
    from a range that depends on the requested unit and delivered through
    ``params.result_callback``.

    Args:
        params: Function-call parameters; ``params.arguments`` must contain
            ``"location"`` and ``"format"``.
    """
    if params.arguments["format"] == "fahrenheit":
        temperature = random.randint(60, 85)
    else:
        temperature = random.randint(15, 30)

    await params.result_callback(
        {
            "conditions": "nice",
            "temperature": temperature,
            "location": params.arguments["location"],
            "format": params.arguments["format"],
            "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
        }
    )
|
||||
|
||||
|
||||
# Schema advertised to the model for the ``get_current_weather`` tool. Both
# parameters are required; "format" is restricted to the two listed units.
weather_function = FunctionSchema(
    name="get_current_weather",
    description="Get the current weather",
    properties={
        "location": {
            "type": "string",
            "description": "The city and state, e.g. San Francisco, CA",
        },
        "format": {
            "type": "string",
            "enum": ["celsius", "fahrenheit"],
            "description": "The temperature unit to use. Infer this from the users location.",
        },
    },
    required=["location", "format"],
)

# Tool set handed to the LLM context; this example exposes only the weather tool.
tools = ToolsSchema(standard_tools=[weather_function])
|
||||
|
||||
|
||||
# --- Transport Configuration ---
|
||||
|
||||
# No local VAD needed — Inworld's server-side semantic VAD handles turn detection.
|
||||
@@ -85,7 +127,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
# See: https://docs.inworld.ai/router/introduction
|
||||
llm = InworldRealtimeLLMService(
|
||||
api_key=os.environ["INWORLD_API_KEY"],
|
||||
llm_model="xai/grok-4-1-fast-non-reasoning",
|
||||
llm_model="openai/gpt-4.1-mini",
|
||||
voice="Sarah",
|
||||
settings=InworldRealtimeLLMService.Settings(
|
||||
system_instruction="""You are a helpful and friendly AI assistant powered by Inworld.
|
||||
@@ -97,9 +139,14 @@ Always be helpful and proactive in offering assistance.""",
|
||||
),
|
||||
)
|
||||
|
||||
# Create context with initial message
|
||||
# Note: function calling requires a paid Inworld account and a
|
||||
# function-calling-capable model
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
|
||||
# Create context with initial message + tools
|
||||
context = LLMContext(
|
||||
[{"role": "developer", "content": "Say hello and introduce yourself!"}],
|
||||
tools,
|
||||
)
|
||||
|
||||
user_aggregator, assistant_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
198
examples/realtime/realtime-openai-async-tool.py
Normal file
198
examples/realtime/realtime-openai-async-tool.py
Normal file
@@ -0,0 +1,198 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Example: async function call with the OpenAI Realtime LLM service.
|
||||
|
||||
The ``get_current_weather`` tool is registered with
|
||||
``cancel_on_interruption=False`` and simulates a slow API call (10s sleep).
|
||||
While the call is in flight the conversation continues; the result arrives
|
||||
later via the async-tool mechanism and is forwarded to OpenAI Realtime as a
|
||||
``function_call_output`` so the model can integrate it naturally into its
|
||||
next turn.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
from datetime import datetime
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.openai.realtime.events import (
|
||||
AudioConfiguration,
|
||||
AudioInput,
|
||||
InputAudioNoiseReduction,
|
||||
InputAudioTranscription,
|
||||
SemanticTurnDetection,
|
||||
SessionProperties,
|
||||
)
|
||||
from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
async def fetch_weather_from_api(params: FunctionCallParams):
    """Simulate a slow weather lookup and report it via ``params.result_callback``.

    Demo stub: no weather service is contacted. The handler sleeps for ten
    seconds, then returns a random temperature in a range that depends on the
    requested unit.

    Args:
        params: Function-call parameters; ``params.arguments`` must contain
            ``"location"`` and ``"format"``.
    """
    # Simulate a long-running API call so we can demonstrate that the
    # conversation continues while the tool is in flight.
    await asyncio.sleep(10)

    if params.arguments["format"] == "fahrenheit":
        temperature = random.randint(60, 85)
    else:
        temperature = random.randint(15, 30)

    await params.result_callback(
        {
            "conditions": "nice",
            "temperature": temperature,
            "location": params.arguments["location"],
            "format": params.arguments["format"],
            "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
        }
    )
|
||||
|
||||
|
||||
# Schema advertised to the model for the ``get_current_weather`` tool. Both
# parameters are required; "format" is restricted to the two listed units.
weather_function = FunctionSchema(
    name="get_current_weather",
    description="Get the current weather",
    properties={
        "location": {
            "type": "string",
            "description": "The city and state, e.g. San Francisco, CA",
        },
        "format": {
            "type": "string",
            "enum": ["celsius", "fahrenheit"],
            "description": "The temperature unit to use. Infer this from the users location.",
        },
    },
    required=["location", "format"],
)

# Tool set handed to the LLM context; this example exposes only the weather tool.
tools = ToolsSchema(standard_tools=[weather_function])
|
||||
|
||||
|
||||
# System prompt: asks for short spoken-style replies and tells the model to
# keep talking while the weather tool call is pending.
system_instruction = (
    "You are a friendly assistant. The user and you will engage in a spoken "
    "dialog exchanging the transcripts of a natural real-time conversation. "
    "Keep your responses short, generally two or three sentences for chatty "
    "scenarios. When the user asks for the weather, call get_current_weather. "
    "While you wait for the result, keep chatting with the user. When the "
    "result arrives, share it with the user naturally."
)
|
||||
|
||||
|
||||
# Transport parameter factories keyed by transport name. Each value is a
# zero-argument callable, so the parameter object is only built when its
# factory is called.
transport_params = {
    "daily": lambda: DailyParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
    ),
    "twilio": lambda: FastAPIWebsocketParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
    ),
    "webrtc": lambda: TransportParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
    ),
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    """Build the OpenAI Realtime pipeline on ``transport`` and run it to completion.

    Args:
        transport: Transport providing the pipeline's input and output.
        runner_args: Runner settings; ``pipeline_idle_timeout_secs`` and
            ``handle_sigint`` are read from it.
    """
    logger.info("Starting bot")

    llm = OpenAIRealtimeLLMService(
        api_key=os.environ["OPENAI_API_KEY"],
        settings=OpenAIRealtimeLLMService.Settings(
            system_instruction=system_instruction,
            session_properties=SessionProperties(
                audio=AudioConfiguration(
                    input=AudioInput(
                        transcription=InputAudioTranscription(),
                        turn_detection=SemanticTurnDetection(),
                        noise_reduction=InputAudioNoiseReduction(type="near_field"),
                    )
                ),
            ),
        ),
    )

    # cancel_on_interruption=False is what makes this the "async tool"
    # example: per the module docstring, the conversation continues while
    # the call is in flight and the result arrives later.
    llm.register_function(
        "get_current_weather",
        fetch_weather_from_api,
        cancel_on_interruption=False,
    )

    context = LLMContext(tools=tools)
    user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
        context,
        user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()),
    )

    pipeline = Pipeline(
        [
            transport.input(),
            user_aggregator,
            llm,
            transport.output(),
            assistant_aggregator,
        ]
    )

    task = PipelineTask(
        pipeline,
        params=PipelineParams(
            enable_metrics=True,
            enable_usage_metrics=True,
        ),
        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
    )

    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        logger.info("Client connected")
        # Seed the context with an instruction and trigger an LLM run so the
        # bot speaks first.
        context.add_message(
            {"role": "developer", "content": "Please introduce yourself to the user."}
        )
        await task.queue_frames([LLMRunFrame()])

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(transport, client):
        logger.info("Client disconnected")
        await task.cancel()

    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
    await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
    """Main bot entry point compatible with Pipecat Cloud.

    Creates a transport with ``create_transport(runner_args, transport_params)``
    and passes it to ``run_bot``.
    """
    transport = await create_transport(runner_args, transport_params)
    await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Imported here so the runner module is only loaded when this file is
    # executed as a script.
    from pipecat.runner.run import main

    main()
|
||||
@@ -232,6 +232,20 @@ Remember, your responses should be short. Just one or two sentences, usually. Re
|
||||
# [LLMUpdateSettingsFrame(settings=SessionProperties(tools=new_tools).model_dump())]
|
||||
# )
|
||||
|
||||
# Reasoning effort can be changed at runtime too. Only
|
||||
# reasoning-capable Realtime models (e.g. gpt-realtime-2) support this.
|
||||
# await task.queue_frames(
|
||||
# [
|
||||
# LLMUpdateSettingsFrame(
|
||||
# delta=OpenAIRealtimeLLMService.Settings(
|
||||
# session_properties=SessionProperties(
|
||||
# reasoning=Reasoning(effort="xhigh"),
|
||||
# ),
|
||||
# )
|
||||
# )
|
||||
# ]
|
||||
# )
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
|
||||
186
examples/realtime/realtime-ultravox-async-tool.py
Normal file
186
examples/realtime/realtime-ultravox-async-tool.py
Normal file
@@ -0,0 +1,186 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Example: async function call with the Ultravox Realtime LLM service.
|
||||
|
||||
The ``get_current_weather`` tool is registered with
|
||||
``cancel_on_interruption=False`` and simulates a slow API call (10s sleep).
|
||||
|
||||
Ultravox's API freezes the conversation between ``client_tool_invocation``
|
||||
and the matching ``client_tool_result``, so the service ships a placeholder
|
||||
``client_tool_result`` immediately when an async-registered function is
|
||||
invoked (to unfreeze the conversation). When the real tool finishes, the
|
||||
actual result is injected as user-side text so the model picks it up.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import datetime
|
||||
import os
|
||||
import random
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.ultravox.llm import OneShotInputParams, UltravoxRealtimeLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
from pipecat.turns.user_stop import SpeechTimeoutUserTurnStopStrategy
|
||||
from pipecat.turns.user_turn_strategies import UserTurnStrategies
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
async def fetch_weather_from_api(params: FunctionCallParams):
    """Simulate a slow weather lookup and report it via ``params.result_callback``.

    Demo stub: no weather service is contacted. The handler sleeps for ten
    seconds, then returns a random temperature in a range that depends on the
    requested unit.

    Args:
        params: Function-call parameters; ``params.arguments`` must contain
            ``"location"`` and ``"format"``.
    """
    # Simulate a long-running API call so we can demonstrate that the
    # conversation continues while the tool is in flight.
    await asyncio.sleep(10)

    if params.arguments["format"] == "fahrenheit":
        temperature = random.randint(60, 85)
    else:
        temperature = random.randint(15, 30)

    await params.result_callback(
        {
            "conditions": "nice",
            "temperature": temperature,
            "location": params.arguments["location"],
            "format": params.arguments["format"],
            # This file imports the ``datetime`` module, not the class.
            "timestamp": datetime.datetime.now().strftime("%Y%m%d_%H%M%S"),
        }
    )
|
||||
|
||||
|
||||
# Schema advertised to the model for the ``get_current_weather`` tool. Both
# parameters are required; "format" is restricted to the two listed units.
weather_function = FunctionSchema(
    name="get_current_weather",
    description="Get the current weather",
    properties={
        "location": {
            "type": "string",
            "description": "The city and state, e.g. San Francisco, CA",
        },
        "format": {
            "type": "string",
            "enum": ["celsius", "fahrenheit"],
            "description": "The temperature unit to use. Infer this from the users location.",
        },
    },
    required=["location", "format"],
)
|
||||
|
||||
|
||||
# System prompt: asks for short spoken-style replies and tells the model to
# keep talking while the weather tool call is pending.
system_prompt = (
    "You are a friendly assistant. The user and you will engage in a spoken "
    "dialog exchanging the transcripts of a natural real-time conversation. "
    "Keep your responses short, generally two or three sentences for chatty "
    "scenarios. When the user asks for the weather, call get_current_weather. "
    "While you wait for the result, keep chatting with the user. When the "
    "result arrives, share it with the user naturally."
)
|
||||
|
||||
|
||||
# Transport parameter factories keyed by transport name. Each value is a
# zero-argument callable, so the parameter object is only built when its
# factory is called.
transport_params = {
    "daily": lambda: DailyParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
    ),
    "twilio": lambda: FastAPIWebsocketParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
    ),
    "webrtc": lambda: TransportParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
    ),
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    """Build the Ultravox Realtime pipeline on ``transport`` and run it to completion.

    Args:
        transport: Transport providing the pipeline's input and output.
        runner_args: Runner settings; ``pipeline_idle_timeout_secs`` and
            ``handle_sigint`` are read from it.
    """
    logger.info("Starting bot")

    llm = UltravoxRealtimeLLMService(
        params=OneShotInputParams(
            api_key=os.environ["ULTRAVOX_API_KEY"],
            system_prompt=system_prompt,
            temperature=0.3,
            max_duration=datetime.timedelta(minutes=3),
        ),
        # Tools are given to the service here rather than through the context.
        one_shot_selected_tools=ToolsSchema(standard_tools=[weather_function]),
    )

    # cancel_on_interruption=False marks the tool as async; the module
    # docstring describes how the service handles the placeholder and the
    # late result.
    llm.register_function(
        "get_current_weather",
        fetch_weather_from_api,
        cancel_on_interruption=False,
    )

    context = LLMContext([])
    user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
        context,
        user_params=LLMUserAggregatorParams(
            user_turn_strategies=UserTurnStrategies(
                stop=[SpeechTimeoutUserTurnStopStrategy()],
            ),
            vad_analyzer=SileroVADAnalyzer(),
        ),
    )

    pipeline = Pipeline(
        [
            transport.input(),
            user_aggregator,
            llm,
            transport.output(),
            assistant_aggregator,
        ]
    )

    task = PipelineTask(
        pipeline,
        params=PipelineParams(
            enable_metrics=True,
            enable_usage_metrics=True,
        ),
        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
    )

    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        # This handler only logs; it queues no frames.
        logger.info("Client connected")

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(transport, client):
        logger.info("Client disconnected")
        await task.cancel()

    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
    await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
    """Main bot entry point compatible with Pipecat Cloud.

    Creates a transport with ``create_transport(runner_args, transport_params)``
    and passes it to ``run_bot``.
    """
    transport = await create_transport(runner_args, transport_params)
    await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Imported here so the runner module is only loaded when this file is
    # executed as a script.
    from pipecat.runner.run import main

    main()
|
||||
@@ -51,7 +51,6 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
stt = GradiumSTTService(
|
||||
api_key=os.environ["GRADIUM_API_KEY"],
|
||||
api_endpoint_base_url="wss://us.api.gradium.ai/api/speech/asr",
|
||||
settings=GradiumSTTService.Settings(
|
||||
language=Language.EN,
|
||||
delay_in_frames=8,
|
||||
|
||||
@@ -49,13 +49,7 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = OpenAIRealtimeSTTService(
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAIRealtimeSTTService.Settings(
|
||||
model="gpt-4o-transcribe",
|
||||
prompt="Expect words related to dogs, such as breed names.",
|
||||
),
|
||||
)
|
||||
stt = OpenAIRealtimeSTTService(api_key=os.environ["OPENAI_API_KEY"])
|
||||
|
||||
tl = TranscriptionLogger()
|
||||
vad_processor = VADProcessor(vad_analyzer=SileroVADAnalyzer())
|
||||
|
||||
@@ -0,0 +1,201 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Example 22: Filter Incomplete Turns
|
||||
|
||||
Demonstrates LLM-based turn completion detection to suppress bot responses when
|
||||
the user was cut off mid-thought. The LLM outputs one of three markers:
|
||||
- ✓ (complete): User finished their thought, respond normally
|
||||
- ○ (incomplete short): User was cut off, wait ~5s then prompt
|
||||
- ◐ (incomplete long): User needs time to think, wait ~10s then prompt
|
||||
|
||||
When incomplete is detected, the bot's response is suppressed. After the timeout
|
||||
expires, the LLM is automatically prompted to re-engage the user.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
AssistantTurnStoppedMessage,
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
UserTurnStoppedMessage,
|
||||
)
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
from pipecat.turns.user_turn_strategies import FilterIncompleteUserTurnStrategies
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# We use lambdas to defer transport parameter creation until the transport
# type is selected at runtime.
transport_params = {
    "daily": lambda: DailyParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
    ),
    "twilio": lambda: FastAPIWebsocketParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
    ),
    "webrtc": lambda: TransportParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
    ),
}
|
||||
|
||||
|
||||
async def get_weather(params: FunctionCallParams, location: str):
    """Return the current weather for a location.

    A stub that always reports the same conditions — replace with a real
    weather API in production.

    Args:
        location (str): The city and state or country, e.g. "Paris, France".
    """
    # NOTE(review): this function is passed directly to
    # register_direct_function / ToolsSchema, so its signature and docstring
    # presumably define the tool schema shown to the model. Confirm before
    # rewording the docstring.
    await params.result_callback(
        {
            "location": location,
            "temperature_celsius": 22,
            "conditions": "partly cloudy",
        }
    )
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    """Assemble the voice pipeline for one session and run it.

    Builds Deepgram STT, an OpenAI LLM with the ``get_weather`` tool
    registered, and Cartesia TTS, places them between the transport's input
    and output together with the user/assistant context aggregators, registers
    connection and transcript event handlers, then awaits the pipeline runner.

    Args:
        transport: Transport whose ``input()``/``output()`` processors bracket
            the pipeline and whose client events are handled here.
        runner_args: Supplies ``pipeline_idle_timeout_secs`` for the task and
            ``handle_sigint`` for the runner.
    """
    logger.info("Starting bot")

    # Adjacent literals concatenate into a single prompt string.
    system_instruction = (
        "You are a helpful assistant in a voice conversation. "
        "Your responses will be spoken aloud, so avoid emojis, bullet points, "
        "or other formatting that can't be spoken. "
        "Respond to what the user said in a creative, helpful, and brief way. "
        "If the user asks about the weather, call the get_weather tool "
        "and speak the result back naturally."
    )

    stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])

    llm = OpenAILLMService(
        api_key=os.environ["OPENAI_API_KEY"],
        settings=OpenAILLMService.Settings(system_instruction=system_instruction),
    )
    llm.register_direct_function(get_weather)

    tts = CartesiaTTSService(
        api_key=os.environ["CARTESIA_API_KEY"],
        settings=CartesiaTTSService.Settings(
            voice="71a7ad14-091c-4e8e-a314-022ece01c121",  # British Reading Lady
        ),
    )

    # The same function object is both registered on the LLM (above) and
    # advertised to it through the context's tool schema.
    tools = ToolsSchema(standard_tools=[get_weather])
    context = LLMContext(tools=tools)

    # `FilterIncompleteUserTurnStrategies` pairs the default detector
    # chain with `LLMTurnCompletionUserTurnStopStrategy`: detectors
    # trigger LLM inference but the public `on_user_turn_stopped` event
    # fires only when the LLM confirms ✓. The LLM marks each response
    # with one of:
    #   ✓ = complete (respond normally)
    #   ○ = incomplete short (wait 5s, then prompt)
    #   ◐ = incomplete long (wait 10s, then prompt)
    vad_analyzer = SileroVADAnalyzer()
    turn_strategies = FilterIncompleteUserTurnStrategies(
        # Optional: customize turn completion behavior
        # config=UserTurnCompletionConfig(
        #     incomplete_short_timeout=5.0,
        #     incomplete_long_timeout=10.0,
        #     incomplete_short_prompt="Custom prompt...",
        #     incomplete_long_prompt="Custom prompt...",
        #     instructions="Custom turn completion instructions...",
        # ),
    )
    user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
        context,
        user_params=LLMUserAggregatorParams(
            vad_analyzer=vad_analyzer,
            user_turn_strategies=turn_strategies,
        ),
    )

    processors = [
        transport.input(),  # Transport user input
        stt,
        user_aggregator,  # User responses
        llm,  # LLM
        tts,  # TTS
        transport.output(),  # Transport bot output
        assistant_aggregator,  # Assistant spoken responses
    ]
    pipeline = Pipeline(processors)

    task = PipelineTask(
        pipeline,
        params=PipelineParams(enable_metrics=True, enable_usage_metrics=True),
        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
    )

    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        logger.info("Client connected")
        # Kick off the conversation.
        greeting_request = {
            "role": "developer",
            "content": "Please introduce yourself to the user.",
        }
        context.add_message(greeting_request)
        await task.queue_frames([LLMRunFrame()])

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(transport, client):
        logger.info("Client disconnected")
        await task.cancel()

    @user_aggregator.event_handler("on_user_turn_stopped")
    async def on_user_turn_stopped(aggregator, strategy, message: UserTurnStoppedMessage):
        # The timestamp prefix is omitted when the message carries none.
        prefix = f"[{message.timestamp}] " if message.timestamp else ""
        logger.info(f"Transcript: {prefix}user: {message.content}")

    @assistant_aggregator.event_handler("on_assistant_turn_stopped")
    async def on_assistant_turn_stopped(aggregator, message: AssistantTurnStoppedMessage):
        # The timestamp prefix is omitted when the message carries none.
        prefix = f"[{message.timestamp}] " if message.timestamp else ""
        logger.info(f"Transcript: {prefix}assistant: {message.content}")

    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
    await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
    """Entry point compatible with Pipecat Cloud: create the transport, then run the bot.

    Args:
        runner_args: Runner arguments passed to ``create_transport`` and on to ``run_bot``.
    """
    # The transport is created from the runner arguments and the
    # module-level `transport_params` factories.
    selected_transport = await create_transport(runner_args, transport_params)
    await run_bot(selected_transport, runner_args)


if __name__ == "__main__":
    # Imported here so the runner module is only loaded when run as a script.
    from pipecat.runner.run import main

    main()
|
||||
@@ -10,7 +10,7 @@ Demonstrates LLM-based turn completion detection to suppress bot responses when
|
||||
the user was cut off mid-thought. The LLM outputs one of three markers:
|
||||
- ✓ (complete): User finished their thought, respond normally
|
||||
- ○ (incomplete short): User was cut off, wait ~5s then prompt
|
||||
- ◐ (incomplete long): User needs time to think, wait ~15s then prompt
|
||||
- ◐ (incomplete long): User needs time to think, wait ~10s then prompt
|
||||
|
||||
When incomplete is detected, the bot's response is suppressed. After the timeout
|
||||
expires, the LLM is automatically prompted to re-engage the user.
|
||||
@@ -41,6 +41,7 @@ from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
from pipecat.turns.user_turn_strategies import FilterIncompleteUserTurnStrategies
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
@@ -83,23 +84,28 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
)
|
||||
|
||||
context = LLMContext()
|
||||
# `FilterIncompleteUserTurnStrategies` pairs the default detector
|
||||
# chain with `LLMTurnCompletionUserTurnStopStrategy`: detectors
|
||||
# trigger LLM inference but the public `on_user_turn_stopped` event
|
||||
# fires only when the LLM confirms ✓. The LLM marks each response
|
||||
# with one of:
|
||||
# ✓ = complete (respond normally)
|
||||
# ○ = incomplete short (wait 5s, then prompt)
|
||||
# ◐ = incomplete long (wait 10s, then prompt)
|
||||
user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
# Enable turn completion filtering - the LLM will output:
|
||||
# ✓ = complete (respond normally)
|
||||
# ○ = incomplete short (wait 5s, then prompt)
|
||||
# ◐ = incomplete long (wait 15s, then prompt)
|
||||
filter_incomplete_user_turns=True,
|
||||
# Optional: customize turn completion behavior
|
||||
# turn_completion_config=TurnCompletionConfig(
|
||||
# incomplete_short_timeout=5.0,
|
||||
# incomplete_long_timeout=15.0,
|
||||
# incomplete_short_prompt="Custom prompt...",
|
||||
# incomplete_long_prompt="Custom prompt...",
|
||||
# instructions="Custom turn completion instructions...",
|
||||
# ),
|
||||
user_turn_strategies=FilterIncompleteUserTurnStrategies(
|
||||
# Optional: customize turn completion behavior
|
||||
# config=UserTurnCompletionConfig(
|
||||
# incomplete_short_timeout=5.0,
|
||||
# incomplete_long_timeout=10.0,
|
||||
# incomplete_short_prompt="Custom prompt...",
|
||||
# incomplete_long_prompt="Custom prompt...",
|
||||
# instructions="Custom turn completion instructions...",
|
||||
# ),
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -50,10 +50,7 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = GradiumSTTService(
|
||||
api_key=os.environ["GRADIUM_API_KEY"],
|
||||
api_endpoint_base_url="wss://us.api.gradium.ai/api/speech/asr",
|
||||
)
|
||||
stt = GradiumSTTService(api_key=os.environ["GRADIUM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
|
||||
@@ -55,7 +55,6 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
tts = GradiumTTSService(
|
||||
api_key=os.environ["GRADIUM_API_KEY"],
|
||||
settings=GradiumTTSService.Settings(voice="YTpq7expH9539ERJ"),
|
||||
url="wss://us.api.gradium.ai/api/speech/tts",
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
|
||||
@@ -54,7 +54,6 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
stt = GradiumSTTService(
|
||||
api_key=os.environ["GRADIUM_API_KEY"],
|
||||
api_endpoint_base_url="wss://us.api.gradium.ai/api/speech/asr",
|
||||
settings=GradiumSTTService.Settings(
|
||||
language=Language.EN,
|
||||
),
|
||||
@@ -62,7 +61,6 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
tts = GradiumTTSService(
|
||||
api_key=os.environ["GRADIUM_API_KEY"],
|
||||
url="wss://us.api.gradium.ai/api/speech/tts",
|
||||
settings=GradiumTTSService.Settings(
|
||||
voice="YTpq7expH9539ERJ",
|
||||
),
|
||||
|
||||
129
examples/voice/voice-nvidia-sagemaker.py
Normal file
129
examples/voice/voice-nvidia-sagemaker.py
Normal file
@@ -0,0 +1,129 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
# For a full example of how to deploy to SageMaker, see:
|
||||
# https://github.com/pipecat-ai/pipecat-examples/tree/main/nvidia_sagemaker_example/deployment/aws-sagemaker-nvidia
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.nvidia.llm import NvidiaLLMService
|
||||
from pipecat.services.nvidia.sagemaker.stt import NvidiaSageMakerSTTService
|
||||
from pipecat.services.nvidia.sagemaker.tts import NvidiaSageMakerTTSService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
# We use lambdas to defer transport parameter creation until the transport
|
||||
# type is selected at runtime.
|
||||
# Maps each supported transport name to a zero-argument factory for its
# parameter object. Every transport here only needs audio in both directions.
transport_params = dict(
    daily=lambda: DailyParams(audio_in_enabled=True, audio_out_enabled=True),
    twilio=lambda: FastAPIWebsocketParams(audio_in_enabled=True, audio_out_enabled=True),
    webrtc=lambda: TransportParams(audio_in_enabled=True, audio_out_enabled=True),
)
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    """Assemble the NVIDIA/SageMaker voice pipeline for one session and run it.

    Builds SageMaker-hosted STT and TTS services plus an NVIDIA LLM, places
    them between the transport's input and output together with the
    user/assistant context aggregators, registers connection event handlers,
    then awaits the pipeline runner.

    Args:
        transport: Transport whose ``input()``/``output()`` processors bracket
            the pipeline and whose client events are handled here.
        runner_args: Supplies ``pipeline_idle_timeout_secs`` for the task and
            ``handle_sigint`` for the runner.
    """
    logger.info("Starting bot")

    # Adjacent literals concatenate into a single prompt string.
    system_instruction = (
        "You are a helpful assistant in a voice conversation. "
        "Your responses will be spoken aloud, so avoid emojis, bullet points, "
        "or other formatting that can't be spoken. "
        "Respond to what the user said in a creative, helpful, and brief way."
    )

    # Endpoint names are required; the region falls back to "us-west-2"
    # when AWS_REGION is unset.
    stt = NvidiaSageMakerSTTService(
        endpoint_name=os.environ["SAGEMAKER_ASR_ENDPOINT_NAME"],
        region=os.getenv("AWS_REGION", "us-west-2"),
    )

    llm = NvidiaLLMService(
        api_key=os.environ["NVIDIA_API_KEY"],
        settings=NvidiaLLMService.Settings(
            model="meta/llama-3.3-70b-instruct",
            system_instruction=system_instruction,
        ),
    )

    tts = NvidiaSageMakerTTSService(
        endpoint_name=os.environ["SAGEMAKER_MAGPIE_ENDPOINT_NAME"],
        region=os.getenv("AWS_REGION", "us-west-2"),
    )

    context = LLMContext()
    aggregator_params = LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer())
    user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
        context,
        user_params=aggregator_params,
    )

    processors = [
        transport.input(),  # Transport user input
        stt,  # STT
        user_aggregator,  # User responses
        llm,  # LLM
        tts,  # TTS
        transport.output(),  # Transport bot output
        assistant_aggregator,  # Assistant spoken responses
    ]
    pipeline = Pipeline(processors)

    task = PipelineTask(
        pipeline,
        params=PipelineParams(enable_metrics=True, enable_usage_metrics=True),
        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
    )

    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        logger.info("Client connected")
        # Kick off the conversation.
        greeting_request = {
            "role": "developer",
            "content": "Please introduce yourself to the user.",
        }
        context.add_message(greeting_request)
        await task.queue_frames([LLMRunFrame()])

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(transport, client):
        logger.info("Client disconnected")
        await task.cancel()

    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
    await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
    """Entry point compatible with Pipecat Cloud: create the transport, then run the bot.

    Args:
        runner_args: Runner arguments passed to ``create_transport`` and on to ``run_bot``.
    """
    # The transport is created from the runner arguments and the
    # module-level `transport_params` factories.
    selected_transport = await create_transport(runner_args, transport_params)
    await run_bot(selected_transport, runner_args)


if __name__ == "__main__":
    # Imported here so the runner module is only loaded when run as a script.
    from pipecat.runner.run import main

    main()
|
||||
@@ -25,7 +25,6 @@ from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.services.openai.stt import OpenAIRealtimeSTTService
|
||||
from pipecat.services.openai.tts import OpenAITTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
@@ -53,14 +52,7 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = OpenAIRealtimeSTTService(
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAIRealtimeSTTService.Settings(
|
||||
model="gpt-4o-transcribe",
|
||||
prompt="Expect words related to dogs, such as breed names.",
|
||||
language=Language.EN,
|
||||
),
|
||||
)
|
||||
stt = OpenAIRealtimeSTTService(api_key=os.environ["OPENAI_API_KEY"])
|
||||
|
||||
tts = OpenAITTSService(
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
@@ -72,7 +64,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are very knowledgable about dogs. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.",
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -58,6 +58,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
# Add strict mode to enforce the language hints
|
||||
language_hints=[Language.EN],
|
||||
language_hints_strict=True,
|
||||
enable_language_identification=True,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -103,7 +103,7 @@ piper = [ "piper-tts>=1.3.0,<2", "requests>=2.32.5,<3" ]
|
||||
qwen = []
|
||||
resembleai = [ "pipecat-ai[websockets-base]" ]
|
||||
rime = [ "pipecat-ai[websockets-base]" ]
|
||||
runner = [ "python-dotenv>=1.0.0,<2.0.0", "uvicorn>=0.32.0,<1.0.0", "fastapi>=0.115.6,<1", "pipecat-ai-small-webrtc-prebuilt>=2.5.0"]
|
||||
runner = [ "python-dotenv>=1.0.0,<2.0.0", "uvicorn>=0.32.0,<1.0.0", "fastapi>=0.115.6,<1", "pipecat-ai-prebuilt>=1.0.0"]
|
||||
sagemaker = ["aws_sdk_sagemaker_runtime_http2; python_version>='3.12'"]
|
||||
sambanova = []
|
||||
sarvam = [ "sarvamai==0.1.28", "pipecat-ai[websockets-base]" ]
|
||||
|
||||
@@ -6,116 +6,54 @@
|
||||
"exclude": ["**/*_pb2.py", "**/__pycache__"],
|
||||
"ignore": [
|
||||
"tests",
|
||||
"src/pipecat/adapters/services/anthropic_adapter.py",
|
||||
"src/pipecat/adapters/services/aws_nova_sonic_adapter.py",
|
||||
"src/pipecat/adapters/services/bedrock_adapter.py",
|
||||
"src/pipecat/adapters/services/gemini_adapter.py",
|
||||
"src/pipecat/adapters/services/grok_realtime_adapter.py",
|
||||
"src/pipecat/adapters/services/inworld_realtime_adapter.py",
|
||||
"src/pipecat/adapters/services/open_ai_adapter.py",
|
||||
"src/pipecat/adapters/services/open_ai_realtime_adapter.py",
|
||||
"src/pipecat/adapters/services/open_ai_responses_adapter.py",
|
||||
"src/pipecat/adapters/services/perplexity_adapter.py",
|
||||
"src/pipecat/audio/dtmf/utils.py",
|
||||
"src/pipecat/audio/filters/aic_filter.py",
|
||||
"src/pipecat/audio/filters/krisp_viva_filter.py",
|
||||
"src/pipecat/audio/filters/rnnoise_filter.py",
|
||||
"src/pipecat/audio/turn/smart_turn/local_smart_turn_v2.py",
|
||||
"src/pipecat/audio/turn/smart_turn/local_smart_turn_v3.py",
|
||||
"src/pipecat/audio/vad/silero.py",
|
||||
"src/pipecat/processors/aggregators/llm_context.py",
|
||||
"src/pipecat/processors/aggregators/llm_response_universal.py",
|
||||
"src/pipecat/processors/frame_processor.py",
|
||||
"src/pipecat/processors/frameworks/langchain.py",
|
||||
"src/pipecat/processors/frameworks/rtvi/observer.py",
|
||||
"src/pipecat/processors/frameworks/rtvi/processor.py",
|
||||
"src/pipecat/processors/frameworks/strands_agents.py",
|
||||
"src/pipecat/services/anthropic/llm.py",
|
||||
"src/pipecat/services/assemblyai/stt.py",
|
||||
"src/pipecat/services/aws/agent_core.py",
|
||||
"src/pipecat/services/aws/llm.py",
|
||||
"src/pipecat/services/aws/nova_sonic/llm.py",
|
||||
"src/pipecat/services/aws/sagemaker/bidi_client.py",
|
||||
"src/pipecat/services/aws/stt.py",
|
||||
"src/pipecat/services/aws/tts.py",
|
||||
"src/pipecat/services/aws/utils.py",
|
||||
"src/pipecat/services/azure/stt.py",
|
||||
"src/pipecat/services/azure/tts.py",
|
||||
"src/pipecat/services/cartesia/stt.py",
|
||||
"src/pipecat/services/cartesia/tts.py",
|
||||
"src/pipecat/services/deepgram/flux/base.py",
|
||||
"src/pipecat/services/deepgram/flux/sagemaker/stt.py",
|
||||
"src/pipecat/services/deepgram/flux/stt.py",
|
||||
"src/pipecat/services/deepgram/sagemaker/stt.py",
|
||||
"src/pipecat/services/deepgram/sagemaker/tts.py",
|
||||
"src/pipecat/services/deepgram/tts.py",
|
||||
"src/pipecat/services/elevenlabs/stt.py",
|
||||
"src/pipecat/services/elevenlabs/tts.py",
|
||||
"src/pipecat/services/fish/tts.py",
|
||||
"src/pipecat/services/gladia/stt.py",
|
||||
"src/pipecat/services/google/gemini_live/llm.py",
|
||||
"src/pipecat/services/google/gemini_live/vertex/llm.py",
|
||||
"src/pipecat/services/google/image.py",
|
||||
"src/pipecat/services/google/llm.py",
|
||||
"src/pipecat/services/google/stt.py",
|
||||
"src/pipecat/services/google/tts.py",
|
||||
"src/pipecat/services/gradium/stt.py",
|
||||
"src/pipecat/services/groq/tts.py",
|
||||
"src/pipecat/services/heygen/api_interactive_avatar.py",
|
||||
"src/pipecat/services/heygen/base_api.py",
|
||||
"src/pipecat/services/heygen/client.py",
|
||||
"src/pipecat/services/heygen/video.py",
|
||||
"src/pipecat/services/hume/tts.py",
|
||||
"src/pipecat/services/inworld/realtime/llm.py",
|
||||
"src/pipecat/services/inworld/tts.py",
|
||||
"src/pipecat/services/kokoro/tts.py",
|
||||
"src/pipecat/services/llm_service.py",
|
||||
"src/pipecat/services/lmnt/tts.py",
|
||||
"src/pipecat/services/mem0/memory.py",
|
||||
"src/pipecat/services/mistral/stt.py",
|
||||
"src/pipecat/services/mistral/tts.py",
|
||||
"src/pipecat/services/moondream/vision.py",
|
||||
"src/pipecat/services/neuphonic/tts.py",
|
||||
"src/pipecat/services/nvidia/stt.py",
|
||||
"src/pipecat/services/nvidia/tts.py",
|
||||
"src/pipecat/services/openai/base_llm.py",
|
||||
"src/pipecat/services/openai/image.py",
|
||||
"src/pipecat/services/openai/llm.py",
|
||||
"src/pipecat/services/openai/realtime/llm.py",
|
||||
"src/pipecat/services/openai/responses/llm.py",
|
||||
"src/pipecat/services/openai/stt.py",
|
||||
"src/pipecat/services/openai/tts.py",
|
||||
"src/pipecat/services/openrouter/llm.py",
|
||||
"src/pipecat/services/piper/tts.py",
|
||||
"src/pipecat/services/resembleai/tts.py",
|
||||
"src/pipecat/services/rime/tts.py",
|
||||
"src/pipecat/services/sambanova/llm.py",
|
||||
"src/pipecat/services/sarvam/stt.py",
|
||||
"src/pipecat/services/sarvam/tts.py",
|
||||
"src/pipecat/services/simli/video.py",
|
||||
"src/pipecat/services/smallest/tts.py",
|
||||
"src/pipecat/services/soniox/stt.py",
|
||||
"src/pipecat/services/speechmatics/stt.py",
|
||||
"src/pipecat/services/stt_service.py",
|
||||
"src/pipecat/services/tavus/video.py",
|
||||
"src/pipecat/services/tts_service.py",
|
||||
"src/pipecat/services/ultravox/llm.py",
|
||||
"src/pipecat/services/websocket_service.py",
|
||||
"src/pipecat/services/whisper/stt.py",
|
||||
"src/pipecat/services/xai/realtime/llm.py",
|
||||
"src/pipecat/services/xtts/tts.py",
|
||||
"src/pipecat/transports/base_output.py",
|
||||
"src/pipecat/transports/daily/transport.py",
|
||||
"src/pipecat/transports/heygen/transport.py",
|
||||
"src/pipecat/transports/lemonslice/transport.py",
|
||||
"src/pipecat/transports/livekit/transport.py",
|
||||
"src/pipecat/transports/smallwebrtc/connection.py",
|
||||
"src/pipecat/transports/smallwebrtc/request_handler.py",
|
||||
"src/pipecat/transports/smallwebrtc/transport.py",
|
||||
"src/pipecat/transports/tavus/transport.py",
|
||||
"src/pipecat/transports/websocket/client.py",
|
||||
"src/pipecat/transports/websocket/server.py",
|
||||
"src/pipecat/transports/whatsapp/client.py"
|
||||
"src/pipecat/transports/websocket/server.py"
|
||||
],
|
||||
"reportMissingImports": false
|
||||
}
|
||||
|
||||
@@ -223,12 +223,11 @@ TESTS_REALTIME = [
|
||||
# ("realtime/realtime-azure.py", EVAL_WEATHER),
|
||||
("realtime/realtime-openai-text.py", EVAL_WEATHER),
|
||||
("realtime/realtime-openai-live-video.py", EVAL_VISION_CAMERA),
|
||||
("realtime/realtime-gemini-live.py", EVAL_SIMPLE_MATH),
|
||||
("realtime/realtime-gemini-live.py", EVAL_WEATHER),
|
||||
("realtime/realtime-gemini-live-local-vad.py", EVAL_SIMPLE_MATH),
|
||||
("realtime/realtime-gemini-live-function-calling.py", EVAL_WEATHER),
|
||||
("realtime/realtime-gemini-live-video.py", EVAL_VISION_CAMERA),
|
||||
("realtime/realtime-gemini-live-google-search.py", EVAL_ONLINE_SEARCH),
|
||||
("realtime/realtime-gemini-live-vertex-function-calling.py", EVAL_WEATHER),
|
||||
("realtime/realtime-gemini-live-vertex.py", EVAL_WEATHER),
|
||||
("realtime/realtime-aws-nova-sonic.py", EVAL_SIMPLE_MATH),
|
||||
("realtime/realtime-ultravox.py", EVAL_ORDER),
|
||||
("realtime/realtime-grok.py", EVAL_WEATHER),
|
||||
@@ -243,6 +242,7 @@ TESTS_VIDEO_AVATAR = [
|
||||
|
||||
TESTS_TURN_MANAGEMENT = [
|
||||
("turn-management/turn-management-filter-incomplete-turns.py", EVAL_COMPLETE_TURN),
|
||||
("turn-management/turn-management-filter-incomplete-turns-function-calling.py", EVAL_WEATHER),
|
||||
]
|
||||
|
||||
TESTS_THINKING = [
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
import copy
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, TypedDict, TypeGuard, TypeVar
|
||||
from typing import Any, TypedDict, TypeGuard, TypeVar, cast
|
||||
|
||||
from anthropic import NOT_GIVEN, NotGiven
|
||||
from anthropic.types.message_param import MessageParam
|
||||
@@ -121,16 +121,20 @@ class AnthropicLLMAdapter(BaseLLMAdapter[AnthropicLLMInvocationParams]):
|
||||
messages = self._from_universal_context_messages(self.get_messages(context)).messages
|
||||
|
||||
# Sanitize messages for logging
|
||||
messages_for_logging = []
|
||||
messages_for_logging: list[dict[str, Any]] = []
|
||||
for message in messages:
|
||||
msg = copy.deepcopy(message)
|
||||
if "content" in msg:
|
||||
if isinstance(msg["content"], list):
|
||||
for item in msg["content"]:
|
||||
if item["type"] == "image":
|
||||
item["source"]["data"] = "..."
|
||||
if item["type"] == "thinking" and item.get("signature"):
|
||||
item["signature"] = "..."
|
||||
msg: dict[str, Any] = copy.deepcopy(dict(message))
|
||||
content = msg.get("content")
|
||||
if isinstance(content, list):
|
||||
for item in content:
|
||||
if not isinstance(item, dict):
|
||||
continue
|
||||
if item.get("type") == "image":
|
||||
source = item.get("source")
|
||||
if isinstance(source, dict):
|
||||
source["data"] = "..."
|
||||
if item.get("type") == "thinking" and item.get("signature"):
|
||||
item["signature"] = "..."
|
||||
messages_for_logging.append(msg)
|
||||
return messages_for_logging
|
||||
|
||||
@@ -185,8 +189,13 @@ class AnthropicLLMAdapter(BaseLLMAdapter[AnthropicLLMInvocationParams]):
|
||||
]
|
||||
if isinstance(next_message["content"], str):
|
||||
next_message["content"] = [{"type": "text", "text": next_message["content"]}]
|
||||
# Concatenate the content
|
||||
current_message["content"].extend(next_message["content"])
|
||||
# Concatenate the content. MessageParam types content as
|
||||
# `str | Iterable[...]`, but this codebase assumes it's
|
||||
# either a str or a list. The str case is handled above, so
|
||||
# we assume that both are lists here.
|
||||
cast(list[Any], current_message["content"]).extend(
|
||||
cast(list[Any], next_message["content"])
|
||||
)
|
||||
# Remove the next message from the list
|
||||
messages.pop(i + 1)
|
||||
else:
|
||||
@@ -239,7 +248,7 @@ class AnthropicLLMAdapter(BaseLLMAdapter[AnthropicLLMInvocationParams]):
|
||||
}
|
||||
|
||||
# Fall back to assuming that the message is already in Anthropic format
|
||||
return copy.deepcopy(message.message)
|
||||
return cast(MessageParam, copy.deepcopy(message.message))
|
||||
|
||||
def _from_standard_message(self, message: LLMStandardMessage) -> MessageParam:
|
||||
"""Convert standard universal context message to Anthropic format.
|
||||
@@ -280,20 +289,26 @@ class AnthropicLLMAdapter(BaseLLMAdapter[AnthropicLLMInvocationParams]):
|
||||
]
|
||||
}
|
||||
"""
|
||||
message = copy.deepcopy(message)
|
||||
if message["role"] == "tool":
|
||||
return {
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "tool_result",
|
||||
"tool_use_id": message["tool_call_id"],
|
||||
"content": message["content"],
|
||||
},
|
||||
],
|
||||
}
|
||||
if message.get("tool_calls"):
|
||||
tc = message["tool_calls"]
|
||||
# ChatCompletionMessageParam (input) and MessageParam (output) are
|
||||
# different TypedDicts — work with the message as a plain dict for the
|
||||
# transformations below and cast back to MessageParam at return sites.
|
||||
msg = cast(dict[str, Any], copy.deepcopy(message))
|
||||
if msg["role"] == "tool":
|
||||
return cast(
|
||||
MessageParam,
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "tool_result",
|
||||
"tool_use_id": msg["tool_call_id"],
|
||||
"content": msg["content"],
|
||||
},
|
||||
],
|
||||
},
|
||||
)
|
||||
if msg.get("tool_calls"):
|
||||
tc = msg["tool_calls"]
|
||||
ret = {"role": "assistant", "content": []}
|
||||
for tool_call in tc:
|
||||
function = tool_call["function"]
|
||||
@@ -305,8 +320,8 @@ class AnthropicLLMAdapter(BaseLLMAdapter[AnthropicLLMInvocationParams]):
|
||||
"input": arguments,
|
||||
}
|
||||
ret["content"].append(new_tool_use)
|
||||
return ret
|
||||
content = message.get("content")
|
||||
return cast(MessageParam, ret)
|
||||
content = msg.get("content")
|
||||
if isinstance(content, str):
|
||||
# fix empty text
|
||||
if content == "":
|
||||
@@ -354,7 +369,7 @@ class AnthropicLLMAdapter(BaseLLMAdapter[AnthropicLLMInvocationParams]):
|
||||
image_item = content.pop(img_idx)
|
||||
content.insert(first_txt_idx, image_item)
|
||||
|
||||
return message
|
||||
return cast(MessageParam, msg)
|
||||
|
||||
def _with_cache_control_markers(self, messages: list[MessageParam]) -> list[MessageParam]:
|
||||
"""Add cache control markers to messages for prompt caching.
|
||||
@@ -369,7 +384,16 @@ class AnthropicLLMAdapter(BaseLLMAdapter[AnthropicLLMInvocationParams]):
|
||||
def add_cache_control_marker(message: MessageParam):
|
||||
if isinstance(message["content"], str):
|
||||
message["content"] = [{"type": "text", "text": message["content"]}]
|
||||
message["content"][-1]["cache_control"] = {"type": "ephemeral"}
|
||||
# Assumptions on the next line:
|
||||
# - content is a list (str case handled above; this codebase only
|
||||
# ever constructs content as a str or a list)
|
||||
# - the list is non-empty (guaranteed by the empty-content
|
||||
# replacement in `_from_universal_context_messages`)
|
||||
# - the last item is a dict. The standard-message path enforces
|
||||
# this via TypedDicts (which are dicts at runtime); the
|
||||
# LLMSpecificMessage passthrough doesn't, but in practice
|
||||
# callers use dicts.
|
||||
cast(list[Any], message["content"])[-1]["cache_control"] = {"type": "ephemeral"}
|
||||
|
||||
try:
|
||||
# Add cache control markers to the most recent two user messages.
|
||||
|
||||
@@ -8,9 +8,9 @@
|
||||
|
||||
import copy
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from dataclasses import asdict, dataclass
|
||||
from enum import Enum
|
||||
from typing import Any, TypedDict
|
||||
from typing import Any, TypedDict, cast
|
||||
|
||||
from loguru import logger
|
||||
|
||||
@@ -110,7 +110,10 @@ class AWSNovaSonicLLMAdapter(BaseLLMAdapter[AWSNovaSonicLLMInvocationParams]):
|
||||
Returns:
|
||||
List of messages in a format ready for logging about AWS Nova Sonic.
|
||||
"""
|
||||
return self._from_universal_context_messages(self.get_messages(context)).messages
|
||||
return [
|
||||
asdict(m)
|
||||
for m in self._from_universal_context_messages(self.get_messages(context)).messages
|
||||
]
|
||||
|
||||
@dataclass
|
||||
class ConvertedMessages:
|
||||
@@ -123,18 +126,27 @@ class AWSNovaSonicLLMAdapter(BaseLLMAdapter[AWSNovaSonicLLMInvocationParams]):
|
||||
self, universal_context_messages: list[LLMContextMessage]
|
||||
) -> ConvertedMessages:
|
||||
system_instruction = None
|
||||
messages = []
|
||||
messages: list[AWSNovaSonicConversationHistoryMessage] = []
|
||||
|
||||
# Bail if there are no messages
|
||||
if not universal_context_messages:
|
||||
return self.ConvertedMessages()
|
||||
return self.ConvertedMessages(messages=[])
|
||||
|
||||
universal_context_messages = copy.deepcopy(universal_context_messages)
|
||||
# NOTE: This adapter does not yet handle ``LLMSpecificMessage`` —
|
||||
# those are filtered out below (the role-extraction and conversion
|
||||
# logic only applies to standard message dicts). If/when this
|
||||
# adapter grows a per-provider passthrough like the Anthropic
|
||||
# adapter has, LLMSpecific items can flow through.
|
||||
ucm: list[dict[str, Any]] = [
|
||||
cast(dict[str, Any], m)
|
||||
for m in copy.deepcopy(universal_context_messages)
|
||||
if isinstance(m, dict)
|
||||
]
|
||||
|
||||
# If we have a "system" message as our first message,
|
||||
# pull that out into "instruction"
|
||||
if universal_context_messages[0].get("role") == "system":
|
||||
system = universal_context_messages.pop(0)
|
||||
if ucm and ucm[0].get("role") == "system":
|
||||
system = ucm.pop(0)
|
||||
content = system.get("content")
|
||||
if isinstance(content, str):
|
||||
system_instruction = content
|
||||
@@ -145,19 +157,21 @@ class AWSNovaSonicLLMAdapter(BaseLLMAdapter[AWSNovaSonicLLMInvocationParams]):
|
||||
|
||||
# Convert any remaining "system"/"developer" messages to "user",
|
||||
# as Nova Sonic only supports "user" and "assistant" in history.
|
||||
for msg in universal_context_messages:
|
||||
for msg in ucm:
|
||||
if msg.get("role") in ("system", "developer"):
|
||||
msg["role"] = "user"
|
||||
|
||||
# Process remaining messages to fill out conversation history.
|
||||
for universal_context_message in universal_context_messages:
|
||||
for universal_context_message in ucm:
|
||||
message = self._from_universal_context_message(universal_context_message)
|
||||
if message:
|
||||
messages.append(message)
|
||||
|
||||
return self.ConvertedMessages(messages=messages, system_instruction=system_instruction)
|
||||
|
||||
def _from_universal_context_message(self, message) -> AWSNovaSonicConversationHistoryMessage:
|
||||
def _from_universal_context_message(
|
||||
self, message: dict[str, Any]
|
||||
) -> AWSNovaSonicConversationHistoryMessage | None:
|
||||
"""Convert standard message format to Nova Sonic format.
|
||||
|
||||
Args:
|
||||
@@ -167,17 +181,18 @@ class AWSNovaSonicLLMAdapter(BaseLLMAdapter[AWSNovaSonicLLMInvocationParams]):
|
||||
Nova Sonic conversation history message, or None if not convertible.
|
||||
"""
|
||||
role = message.get("role")
|
||||
if message.get("role") == "user" or message.get("role") == "assistant":
|
||||
if role == "user" or role == "assistant":
|
||||
content = message.get("content")
|
||||
if isinstance(message.get("content"), list):
|
||||
content = ""
|
||||
for c in message.get("content"):
|
||||
if isinstance(content, list):
|
||||
text_parts = []
|
||||
for c in content:
|
||||
if c.get("type") == "text":
|
||||
content += " " + c.get("text")
|
||||
text_parts.append(c.get("text"))
|
||||
else:
|
||||
logger.error(
|
||||
f"Unhandled content type in context message: {c.get('type')} - {message}"
|
||||
)
|
||||
content = " ".join(t for t in text_parts if t)
|
||||
# There won't be content if this is an assistant tool call entry.
|
||||
# We're ignoring those since they can't be loaded into AWS Nova Sonic conversation
|
||||
# history
|
||||
|
||||
@@ -10,7 +10,7 @@ import base64
|
||||
import copy
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, TypedDict
|
||||
from typing import Any, TypedDict, cast
|
||||
|
||||
from loguru import logger
|
||||
|
||||
@@ -68,16 +68,19 @@ class AWSBedrockLLMAdapter(BaseLLMAdapter[AWSBedrockLLMInvocationParams]):
|
||||
system_instruction,
|
||||
discard_context_system=True,
|
||||
)
|
||||
return {
|
||||
"system": [{"text": effective_system}] if effective_system else None,
|
||||
"messages": converted.messages,
|
||||
# NOTE: LLMContext's tools are guaranteed to be a ToolsSchema (or NOT_GIVEN)
|
||||
"tools": self.from_standard_tools(context.tools) or [],
|
||||
# To avoid refactoring in AWSBedrockLLMService, we just pass through tool_choice.
|
||||
# Eventually (when we don't have to maintain the non-LLMContext code path) we should do
|
||||
# the conversion to Bedrock's expected format here rather than in AWSBedrockLLMService.
|
||||
"tool_choice": context.tool_choice,
|
||||
}
|
||||
return cast(
|
||||
AWSBedrockLLMInvocationParams,
|
||||
{
|
||||
"system": [{"text": effective_system}] if effective_system else None,
|
||||
"messages": converted.messages,
|
||||
# NOTE: LLMContext's tools are guaranteed to be a ToolsSchema (or NOT_GIVEN)
|
||||
"tools": self.from_standard_tools(context.tools) or [],
|
||||
# To avoid refactoring in AWSBedrockLLMService, we just pass through tool_choice.
|
||||
# Eventually (when we don't have to maintain the non-LLMContext code path) we should do
|
||||
# the conversion to Bedrock's expected format here rather than in AWSBedrockLLMService.
|
||||
"tool_choice": context.tool_choice,
|
||||
},
|
||||
)
|
||||
|
||||
def get_messages_for_logging(self, context) -> list[dict[str, Any]]:
|
||||
"""Get messages from a universal LLM context in a format ready for logging about AWS Bedrock.
|
||||
@@ -213,35 +216,36 @@ class AWSBedrockLLMAdapter(BaseLLMAdapter[AWSBedrockLLMInvocationParams]):
|
||||
]
|
||||
}
|
||||
"""
|
||||
message = copy.deepcopy(message)
|
||||
if message["role"] == "tool":
|
||||
# ChatCompletionMessageParam (input) and the dict shape Bedrock expects
|
||||
# are different — work with the deepcopied message as a plain dict for
|
||||
# the transformations below.
|
||||
msg = cast(dict[str, Any], copy.deepcopy(message))
|
||||
if msg["role"] == "tool":
|
||||
# Try to parse the content as JSON if it looks like JSON
|
||||
try:
|
||||
if message["content"].strip().startswith("{") and message[
|
||||
"content"
|
||||
].strip().endswith("}"):
|
||||
content_json = json.loads(message["content"])
|
||||
if msg["content"].strip().startswith("{") and msg["content"].strip().endswith("}"):
|
||||
content_json = json.loads(msg["content"])
|
||||
tool_result_content = [{"json": content_json}]
|
||||
else:
|
||||
tool_result_content = [{"text": message["content"]}]
|
||||
tool_result_content = [{"text": msg["content"]}]
|
||||
except (json.JSONDecodeError, ValueError, AttributeError):
|
||||
tool_result_content = [{"text": message["content"]}]
|
||||
tool_result_content = [{"text": msg["content"]}]
|
||||
|
||||
return {
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"toolResult": {
|
||||
"toolUseId": message["tool_call_id"],
|
||||
"toolUseId": msg["tool_call_id"],
|
||||
"content": tool_result_content,
|
||||
},
|
||||
},
|
||||
],
|
||||
}
|
||||
|
||||
if message.get("tool_calls"):
|
||||
tc = message["tool_calls"]
|
||||
ret = {"role": "assistant", "content": []}
|
||||
if msg.get("tool_calls"):
|
||||
tc = msg["tool_calls"]
|
||||
ret: dict[str, Any] = {"role": "assistant", "content": []}
|
||||
for tool_call in tc:
|
||||
function = tool_call["function"]
|
||||
arguments = json.loads(function["arguments"])
|
||||
@@ -256,12 +260,12 @@ class AWSBedrockLLMAdapter(BaseLLMAdapter[AWSBedrockLLMInvocationParams]):
|
||||
return ret
|
||||
|
||||
# Handle text content
|
||||
content = message.get("content")
|
||||
content = msg.get("content")
|
||||
if isinstance(content, str):
|
||||
if content == "":
|
||||
return {"role": message["role"], "content": [{"text": "(empty)"}]}
|
||||
return {"role": msg["role"], "content": [{"text": "(empty)"}]}
|
||||
else:
|
||||
return {"role": message["role"], "content": [{"text": content}]}
|
||||
return {"role": msg["role"], "content": [{"text": content}]}
|
||||
elif isinstance(content, list):
|
||||
new_content = []
|
||||
for item in content:
|
||||
@@ -300,9 +304,9 @@ class AWSBedrockLLMAdapter(BaseLLMAdapter[AWSBedrockLLMInvocationParams]):
|
||||
# Move image before the first text
|
||||
image_item = new_content.pop(img_idx)
|
||||
new_content.insert(first_txt_idx, image_item)
|
||||
return {"role": message["role"], "content": new_content}
|
||||
return {"role": msg["role"], "content": new_content}
|
||||
|
||||
return message
|
||||
return msg
|
||||
|
||||
@staticmethod
|
||||
def _to_bedrock_function_format(function: FunctionSchema) -> dict[str, Any]:
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
import base64
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, TypedDict
|
||||
from typing import Any, TypedDict, cast
|
||||
|
||||
from loguru import logger
|
||||
from openai import NotGiven
|
||||
@@ -139,6 +139,36 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
|
||||
return formatted_standard_tools + custom_gemini_tools
|
||||
|
||||
@staticmethod
|
||||
def to_function_response_dict(content: Any) -> dict[str, Any]:
|
||||
"""Convert a tool-result content value to Gemini's FunctionResponse.response shape.
|
||||
|
||||
Gemini's ``FunctionResponse.response`` field requires a dict, so
|
||||
non-dict values (e.g. plain strings, JSON-encoded scalars, or
|
||||
sentinel strings like ``"COMPLETED"`` used when a function returned
|
||||
no value) are wrapped as ``{"value": <value>}``. JSON strings that
|
||||
decode to a dict are passed through as-is.
|
||||
|
||||
Args:
|
||||
content: The tool-result content. Typically the JSON-encoded
|
||||
return value of a function, but can also be a plain string
|
||||
(e.g. ``"COMPLETED"``) or already-parsed dict.
|
||||
|
||||
Returns:
|
||||
A dict suitable for ``FunctionResponse.response``.
|
||||
"""
|
||||
if isinstance(content, dict):
|
||||
return content
|
||||
if not isinstance(content, str):
|
||||
return {"value": content}
|
||||
try:
|
||||
decoded = json.loads(content)
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
return {"value": content}
|
||||
if isinstance(decoded, dict):
|
||||
return decoded
|
||||
return {"value": decoded}
|
||||
|
||||
def get_messages_for_logging(self, context: LLMContext) -> list[dict[str, Any]]:
|
||||
"""Get messages from a universal LLM context in a format ready for logging about Gemini.
|
||||
|
||||
@@ -154,9 +184,12 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
messages = self._from_universal_context_messages(self.get_messages(context)).messages
|
||||
|
||||
# Sanitize messages for logging
|
||||
messages_for_logging = []
|
||||
messages_for_logging: list[dict[str, Any]] = []
|
||||
for message in messages:
|
||||
obj = message.to_json_dict()
|
||||
# `to_json_dict()` returns `dict[str, object]`; treat as a plain
|
||||
# dict for the value indexing/mutation below. The broad `except`
|
||||
# below is the safety net if any item isn't shaped as expected.
|
||||
obj: dict[str, Any] = cast(dict[str, Any], message.to_json_dict())
|
||||
try:
|
||||
if "parts" in obj:
|
||||
for part in obj["parts"]:
|
||||
@@ -274,7 +307,8 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
# Check if we only have function-related messages (no regular text)
|
||||
effective_system = extracted_system or system_instruction
|
||||
has_regular_messages = any(
|
||||
len(msg.parts) == 1
|
||||
msg.parts is not None
|
||||
and len(msg.parts) == 1
|
||||
and getattr(msg.parts[0], "text", None)
|
||||
and not getattr(msg.parts[0], "function_call", None)
|
||||
and not getattr(msg.parts[0], "function_response", None)
|
||||
@@ -346,8 +380,11 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
parts=[Part(function_call=FunctionCall(name="search", args={"query": "test"}))]
|
||||
)
|
||||
"""
|
||||
role = message["role"]
|
||||
content = message.get("content", [])
|
||||
# ChatCompletionMessageParam (a union of TypedDicts) doesn't allow
|
||||
# the dict-style key access used below; treat it as a plain dict.
|
||||
msg = cast(dict[str, Any], message)
|
||||
role = msg["role"]
|
||||
content = msg.get("content", [])
|
||||
|
||||
# Convert non-initial system/developer messages to user role,
|
||||
# as Gemini doesn't support these as input messages.
|
||||
@@ -359,8 +396,8 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
parts = []
|
||||
tool_call_id_to_name_mapping = {}
|
||||
|
||||
if message.get("tool_calls"):
|
||||
for tc in message["tool_calls"]:
|
||||
if msg.get("tool_calls"):
|
||||
for tc in msg["tool_calls"]:
|
||||
id = tc["id"]
|
||||
name = tc["function"]["name"]
|
||||
tool_call_id_to_name_mapping[id] = name
|
||||
@@ -375,19 +412,10 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
)
|
||||
elif role == "tool":
|
||||
role = "user"
|
||||
try:
|
||||
response = json.loads(message["content"])
|
||||
if isinstance(response, dict):
|
||||
response_dict = response
|
||||
else:
|
||||
response_dict = {"value": response}
|
||||
except Exception as e:
|
||||
# Response might not be JSON-deserializable.
|
||||
# This occurs with a UserImageFrame, for example, where we get a plain "COMPLETED" string.
|
||||
response_dict = {"value": message["content"]}
|
||||
response_dict = self.to_function_response_dict(msg["content"])
|
||||
|
||||
# Get function name from mapping using tool_call_id, or fallback
|
||||
tool_call_id = message.get("tool_call_id")
|
||||
tool_call_id = msg.get("tool_call_id")
|
||||
function_name = "tool_call_result" # Default fallback
|
||||
if tool_call_id and tool_call_id in params.tool_call_id_to_name_mapping:
|
||||
function_name = params.tool_call_id_to_name_mapping[tool_call_id]
|
||||
@@ -491,7 +519,7 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
|
||||
def is_tool_call_message(msg: Content) -> bool:
|
||||
"""Check if message contains only function_call parts."""
|
||||
return (
|
||||
return bool(
|
||||
msg.role == "model"
|
||||
and msg.parts
|
||||
and all(getattr(part, "function_call", None) for part in msg.parts)
|
||||
@@ -499,6 +527,8 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
|
||||
def message_has_thought_signature(msg: Content) -> bool:
|
||||
"""Check if any part in the message has a thought_signature."""
|
||||
if msg.parts is None:
|
||||
return False
|
||||
return any(getattr(part, "thought_signature", None) for part in msg.parts)
|
||||
|
||||
merged_messages = []
|
||||
@@ -564,6 +594,8 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
logger.debug(f"Thought signatures to apply: {len(thought_signature_dicts)}")
|
||||
for ts in thought_signature_dicts:
|
||||
bookmark = ts.get("bookmark")
|
||||
if bookmark is None:
|
||||
continue
|
||||
if bookmark.get("function_call"):
|
||||
logger.trace(f" - To function call: {bookmark['function_call']}")
|
||||
elif bookmark.get("text"):
|
||||
@@ -665,6 +697,8 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
if (
|
||||
hasattr(part, "inline_data")
|
||||
and part.inline_data
|
||||
and part.inline_data.data is not None
|
||||
and bookmark_inline_data.data is not None
|
||||
# Comparing length should be good enough for matching inline data,
|
||||
# especially since we're already matching thought signatures in
|
||||
# strict message order. Comparing actual data is expensive.
|
||||
|
||||
@@ -13,7 +13,7 @@ Grok's Voice Agent API.
|
||||
import copy
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, TypedDict
|
||||
from typing import Any, TypedDict, cast
|
||||
|
||||
from loguru import logger
|
||||
|
||||
@@ -85,7 +85,10 @@ class GrokRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
Returns:
|
||||
List of messages with sensitive data redacted.
|
||||
"""
|
||||
return self.get_messages(context, truncate_large_values=True)
|
||||
return cast(
|
||||
list[dict[str, Any]],
|
||||
self.get_messages(context, truncate_large_values=True),
|
||||
)
|
||||
|
||||
@dataclass
|
||||
class ConvertedMessages:
|
||||
@@ -111,11 +114,20 @@ class GrokRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
if not universal_context_messages:
|
||||
return self.ConvertedMessages(messages=[])
|
||||
|
||||
messages = copy.deepcopy(universal_context_messages)
|
||||
# NOTE: This adapter does not yet handle ``LLMSpecificMessage`` —
|
||||
# those are filtered out below. Other adapters (e.g. Anthropic)
|
||||
# dispatch LLMSpecific items through a per-provider passthrough.
|
||||
# The pack-into-single-text-message strategy here doesn't compose
|
||||
# with opaque per-provider payloads.
|
||||
messages: list[dict[str, Any]] = [
|
||||
cast(dict[str, Any], m)
|
||||
for m in copy.deepcopy(universal_context_messages)
|
||||
if isinstance(m, dict)
|
||||
]
|
||||
system_instruction = None
|
||||
|
||||
# Extract system message as session instructions
|
||||
if messages[0].get("role") == "system":
|
||||
if messages and messages[0].get("role") == "system":
|
||||
system = messages.pop(0)
|
||||
content = system.get("content")
|
||||
if isinstance(content, str):
|
||||
@@ -133,7 +145,9 @@ class GrokRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
# Single user message can be sent normally
|
||||
if len(messages) == 1 and messages[0].get("role") == "user":
|
||||
return self.ConvertedMessages(
|
||||
messages=[self._from_universal_context_message(messages[0])],
|
||||
messages=[
|
||||
self._from_universal_context_message(cast(LLMContextMessage, messages[0]))
|
||||
],
|
||||
system_instruction=system_instruction,
|
||||
)
|
||||
|
||||
@@ -181,26 +195,29 @@ class GrokRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
Returns:
|
||||
ConversationItem formatted for Grok Realtime API.
|
||||
"""
|
||||
if message.get("role") == "user":
|
||||
content = message.get("content")
|
||||
# NOTE: ``LLMSpecificMessage`` is not yet handled here — see the
|
||||
# corresponding note in `_from_universal_context_messages`.
|
||||
msg = cast(dict[str, Any], message)
|
||||
if msg.get("role") == "user":
|
||||
content = msg.get("content")
|
||||
if isinstance(content, list):
|
||||
text_content = ""
|
||||
text_parts = []
|
||||
for c in content:
|
||||
if c.get("type") == "text":
|
||||
text_content += " " + c.get("text")
|
||||
text_parts.append(c.get("text"))
|
||||
else:
|
||||
logger.error(
|
||||
f"Unhandled content type in context message: {c.get('type')} - {message}"
|
||||
f"Unhandled content type in context message: {c.get('type')} - {msg}"
|
||||
)
|
||||
content = text_content.strip()
|
||||
content = " ".join(t for t in text_parts if t).strip()
|
||||
return events.ConversationItem(
|
||||
role="user",
|
||||
type="message",
|
||||
content=[events.ItemContent(type="input_text", text=content)],
|
||||
)
|
||||
|
||||
if message.get("role") == "assistant" and message.get("tool_calls"):
|
||||
tc = message.get("tool_calls")[0]
|
||||
if msg.get("role") == "assistant" and msg.get("tool_calls"):
|
||||
tc = msg["tool_calls"][0]
|
||||
return events.ConversationItem(
|
||||
type="function_call",
|
||||
call_id=tc["id"],
|
||||
@@ -208,7 +225,7 @@ class GrokRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
arguments=tc["function"]["arguments"],
|
||||
)
|
||||
|
||||
logger.error(f"Unhandled message type in _from_universal_context_message: {message}")
|
||||
raise ValueError(f"Unhandled message type in _from_universal_context_message: {msg}")
|
||||
|
||||
@staticmethod
|
||||
def _to_grok_function_format(function: FunctionSchema) -> dict[str, Any]:
|
||||
|
||||
@@ -13,7 +13,7 @@ Inworld's Realtime API.
|
||||
import copy
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, TypedDict
|
||||
from typing import Any, TypedDict, cast
|
||||
|
||||
from loguru import logger
|
||||
|
||||
@@ -85,7 +85,10 @@ class InworldRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
Returns:
|
||||
List of messages with sensitive data redacted.
|
||||
"""
|
||||
return self.get_messages(context, truncate_large_values=True)
|
||||
return cast(
|
||||
list[dict[str, Any]],
|
||||
self.get_messages(context, truncate_large_values=True),
|
||||
)
|
||||
|
||||
@dataclass
|
||||
class ConvertedMessages:
|
||||
@@ -111,11 +114,20 @@ class InworldRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
if not universal_context_messages:
|
||||
return self.ConvertedMessages(messages=[])
|
||||
|
||||
messages = copy.deepcopy(universal_context_messages)
|
||||
# NOTE: This adapter does not yet handle ``LLMSpecificMessage`` —
|
||||
# those are filtered out below. Other adapters (e.g. Anthropic)
|
||||
# dispatch LLMSpecific items through a per-provider passthrough.
|
||||
# The pack-into-single-text-message strategy here doesn't compose
|
||||
# with opaque per-provider payloads.
|
||||
messages: list[dict[str, Any]] = [
|
||||
cast(dict[str, Any], m)
|
||||
for m in copy.deepcopy(universal_context_messages)
|
||||
if isinstance(m, dict)
|
||||
]
|
||||
system_instruction = None
|
||||
|
||||
# Extract system message as session instructions
|
||||
if messages[0].get("role") == "system":
|
||||
if messages and messages[0].get("role") == "system":
|
||||
system = messages.pop(0)
|
||||
content = system.get("content")
|
||||
if isinstance(content, str):
|
||||
@@ -133,7 +145,9 @@ class InworldRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
# Single user message can be sent normally
|
||||
if len(messages) == 1 and messages[0].get("role") == "user":
|
||||
return self.ConvertedMessages(
|
||||
messages=[self._from_universal_context_message(messages[0])],
|
||||
messages=[
|
||||
self._from_universal_context_message(cast(LLMContextMessage, messages[0]))
|
||||
],
|
||||
system_instruction=system_instruction,
|
||||
)
|
||||
|
||||
@@ -181,26 +195,29 @@ class InworldRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
Returns:
|
||||
ConversationItem formatted for Inworld Realtime API.
|
||||
"""
|
||||
if message.get("role") == "user":
|
||||
content = message.get("content")
|
||||
# NOTE: ``LLMSpecificMessage`` is not yet handled here — see the
|
||||
# corresponding note in `_from_universal_context_messages`.
|
||||
msg = cast(dict[str, Any], message)
|
||||
if msg.get("role") == "user":
|
||||
content = msg.get("content")
|
||||
if isinstance(content, list):
|
||||
text_content = ""
|
||||
text_parts = []
|
||||
for c in content:
|
||||
if c.get("type") == "text":
|
||||
text_content += " " + c.get("text")
|
||||
text_parts.append(c.get("text"))
|
||||
else:
|
||||
logger.error(
|
||||
f"Unhandled content type in context message: {c.get('type')} - {message}"
|
||||
f"Unhandled content type in context message: {c.get('type')} - {msg}"
|
||||
)
|
||||
content = text_content.strip()
|
||||
content = " ".join(t for t in text_parts if t).strip()
|
||||
return events.ConversationItem(
|
||||
role="user",
|
||||
type="message",
|
||||
content=[events.ItemContent(type="input_text", text=content)],
|
||||
)
|
||||
|
||||
if message.get("role") == "assistant" and message.get("tool_calls"):
|
||||
tc = message.get("tool_calls")[0]
|
||||
if msg.get("role") == "assistant" and msg.get("tool_calls"):
|
||||
tc = msg["tool_calls"][0]
|
||||
return events.ConversationItem(
|
||||
type="function_call",
|
||||
call_id=tc["id"],
|
||||
@@ -208,7 +225,7 @@ class InworldRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
arguments=tc["function"]["arguments"],
|
||||
)
|
||||
|
||||
logger.error(f"Unhandled message type in _from_universal_context_message: {message}")
|
||||
raise ValueError(f"Unhandled message type in _from_universal_context_message: {msg}")
|
||||
|
||||
@staticmethod
|
||||
def _to_inworld_function_format(function: FunctionSchema) -> dict[str, Any]:
|
||||
|
||||
@@ -127,12 +127,15 @@ class OpenAILLMAdapter(BaseLLMAdapter[OpenAILLMInvocationParams]):
|
||||
)
|
||||
|
||||
if system_instruction:
|
||||
# Detect initial system message for warning purposes (don't extract)
|
||||
initial_content = (
|
||||
messages[0].get("content", "")
|
||||
if messages and messages[0].get("role") == "system"
|
||||
else None
|
||||
)
|
||||
# Detect initial system message for warning purposes (don't extract).
|
||||
# ChatCompletionMessageParam.content is `str | Iterable[...]`; we
|
||||
# only forward it for warning purposes, so coerce non-strings to
|
||||
# None — the resolver handles None.
|
||||
initial_content: str | None = None
|
||||
if messages and messages[0].get("role") == "system":
|
||||
raw_content = messages[0].get("content", "")
|
||||
if isinstance(raw_content, str):
|
||||
initial_content = raw_content
|
||||
self._resolve_system_instruction(
|
||||
initial_content,
|
||||
system_instruction,
|
||||
@@ -140,12 +143,15 @@ class OpenAILLMAdapter(BaseLLMAdapter[OpenAILLMInvocationParams]):
|
||||
)
|
||||
messages = [{"role": "system", "content": system_instruction}] + messages
|
||||
|
||||
return {
|
||||
"messages": messages,
|
||||
# NOTE; LLMContext's tools are guaranteed to be a ToolsSchema (or NOT_GIVEN)
|
||||
"tools": self.from_standard_tools(context.tools),
|
||||
"tool_choice": _openai_from_llm_context_tool_choice(context.tool_choice),
|
||||
}
|
||||
return cast(
|
||||
OpenAILLMInvocationParams,
|
||||
{
|
||||
"messages": messages,
|
||||
# NOTE; LLMContext's tools are guaranteed to be a ToolsSchema (or NOT_GIVEN)
|
||||
"tools": self.from_standard_tools(context.tools),
|
||||
"tool_choice": _openai_from_llm_context_tool_choice(context.tool_choice),
|
||||
},
|
||||
)
|
||||
|
||||
def to_provider_tools_format(self, tools_schema: ToolsSchema) -> list[ChatCompletionToolParam]:
|
||||
"""Convert function schemas to OpenAI's function-calling format.
|
||||
@@ -158,13 +164,19 @@ class OpenAILLMAdapter(BaseLLMAdapter[OpenAILLMInvocationParams]):
|
||||
with ChatCompletion API.
|
||||
"""
|
||||
functions_schema = tools_schema.standard_tools
|
||||
formatted_standard_tools = [
|
||||
ChatCompletionToolParam(type="function", function=func.to_default_dict())
|
||||
# `function=...` expects a `FunctionDefinition` TypedDict; the dict
|
||||
# produced by `to_default_dict()` is structurally compatible. Cast at
|
||||
# the boundary.
|
||||
formatted_standard_tools: list[ChatCompletionToolParam] = [
|
||||
ChatCompletionToolParam(type="function", function=cast(Any, func.to_default_dict()))
|
||||
for func in functions_schema
|
||||
]
|
||||
custom_openai_tools = []
|
||||
custom_openai_tools: list[ChatCompletionToolParam] = []
|
||||
if tools_schema.custom_tools:
|
||||
custom_openai_tools = tools_schema.custom_tools.get(AdapterType.OPENAI, [])
|
||||
custom_openai_tools = cast(
|
||||
list[ChatCompletionToolParam],
|
||||
tools_schema.custom_tools.get(AdapterType.OPENAI, []),
|
||||
)
|
||||
return formatted_standard_tools + custom_openai_tools
|
||||
|
||||
def get_messages_for_logging(self, context: LLMContext) -> list[dict[str, Any]]:
|
||||
@@ -178,7 +190,10 @@ class OpenAILLMAdapter(BaseLLMAdapter[OpenAILLMInvocationParams]):
|
||||
Returns:
|
||||
List of messages in a format ready for logging about OpenAI.
|
||||
"""
|
||||
return self.get_messages(context, truncate_large_values=True)
|
||||
return cast(
|
||||
list[dict[str, Any]],
|
||||
self.get_messages(context, truncate_large_values=True),
|
||||
)
|
||||
|
||||
def _from_universal_context_messages(
|
||||
self,
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
import copy
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, TypedDict
|
||||
from typing import Any, TypedDict, cast
|
||||
|
||||
from loguru import logger
|
||||
|
||||
@@ -81,7 +81,7 @@ class OpenAIRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
Returns:
|
||||
List of messages in a format ready for logging about OpenAI Realtime.
|
||||
"""
|
||||
return self.get_messages(context, truncate_large_values=True)
|
||||
return cast(list[dict[str, Any]], self.get_messages(context, truncate_large_values=True))
|
||||
|
||||
@dataclass
|
||||
class ConvertedMessages:
|
||||
@@ -101,12 +101,24 @@ class OpenAIRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
if not universal_context_messages:
|
||||
return self.ConvertedMessages(messages=[])
|
||||
|
||||
messages = copy.deepcopy(universal_context_messages)
|
||||
# NOTE: This adapter does not yet handle ``LLMSpecificMessage`` — those
|
||||
# are filtered out below. Other adapters (e.g. Anthropic) dispatch
|
||||
# LLMSpecific items through a per-provider passthrough. For OpenAI
|
||||
# Realtime, the strategy here packs a multi-message history into a
|
||||
# single text message (see comment further down), which doesn't
|
||||
# compose with opaque per-provider payloads. If/when this adapter
|
||||
# adopts the per-message strategy, LLMSpecific items can flow
|
||||
# through `_from_universal_context_message` like in other adapters.
|
||||
messages: list[dict[str, Any]] = [
|
||||
cast(dict[str, Any], m)
|
||||
for m in copy.deepcopy(universal_context_messages)
|
||||
if isinstance(m, dict)
|
||||
]
|
||||
system_instruction = None
|
||||
|
||||
# If we have a "system" message as our first message,
|
||||
# pull that out into session "instructions"
|
||||
if messages[0].get("role") == "system":
|
||||
if messages and messages[0].get("role") == "system":
|
||||
system = messages.pop(0)
|
||||
content = system.get("content")
|
||||
if isinstance(content, str):
|
||||
@@ -124,7 +136,9 @@ class OpenAIRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
# If we have just a single "user" item, we can just send it normally
|
||||
if len(messages) == 1 and messages[0].get("role") == "user":
|
||||
return self.ConvertedMessages(
|
||||
messages=[self._from_universal_context_message(messages[0])],
|
||||
messages=[
|
||||
self._from_universal_context_message(cast(LLMContextMessage, messages[0]))
|
||||
],
|
||||
system_instruction=system_instruction,
|
||||
)
|
||||
|
||||
@@ -142,18 +156,18 @@ class OpenAIRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
|
||||
return self.ConvertedMessages(
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"type": "message",
|
||||
"content": [
|
||||
{
|
||||
"type": "input_text",
|
||||
"text": "\n\n".join(
|
||||
events.ConversationItem(
|
||||
role="user",
|
||||
type="message",
|
||||
content=[
|
||||
events.ItemContent(
|
||||
type="input_text",
|
||||
text="\n\n".join(
|
||||
[intro_text, json.dumps(messages, indent=2), trailing_text]
|
||||
),
|
||||
}
|
||||
)
|
||||
],
|
||||
}
|
||||
)
|
||||
],
|
||||
system_instruction=system_instruction,
|
||||
)
|
||||
@@ -161,31 +175,34 @@ class OpenAIRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
def _from_universal_context_message(
|
||||
self, message: LLMContextMessage
|
||||
) -> events.ConversationItem:
|
||||
if message.get("role") == "user":
|
||||
content = message.get("content")
|
||||
if isinstance(message.get("content"), list):
|
||||
# NOTE: ``LLMSpecificMessage`` is not yet handled here — see the
|
||||
# corresponding note in `_from_universal_context_messages`.
|
||||
msg = cast(dict[str, Any], message)
|
||||
if msg.get("role") == "user":
|
||||
content = msg.get("content")
|
||||
if isinstance(content, list):
|
||||
content = ""
|
||||
for c in message.get("content"):
|
||||
for c in msg.get("content", []):
|
||||
if c.get("type") == "text":
|
||||
content += " " + c.get("text")
|
||||
else:
|
||||
logger.error(
|
||||
f"Unhandled content type in context message: {c.get('type')} - {message}"
|
||||
f"Unhandled content type in context message: {c.get('type')} - {msg}"
|
||||
)
|
||||
return events.ConversationItem(
|
||||
role="user",
|
||||
type="message",
|
||||
content=[events.ItemContent(type="input_text", text=content)],
|
||||
)
|
||||
if message.get("role") == "assistant" and message.get("tool_calls"):
|
||||
tc = message.get("tool_calls")[0]
|
||||
if msg.get("role") == "assistant" and msg.get("tool_calls"):
|
||||
tc = msg["tool_calls"][0]
|
||||
return events.ConversationItem(
|
||||
type="function_call",
|
||||
call_id=tc["id"],
|
||||
name=tc["function"]["name"],
|
||||
arguments=tc["function"]["arguments"],
|
||||
)
|
||||
logger.error(f"Unhandled message type in _from_universal_context_message: {message}")
|
||||
raise ValueError(f"Unhandled message type in _from_universal_context_message: {msg}")
|
||||
|
||||
@staticmethod
|
||||
def _to_openai_realtime_function_format(function: FunctionSchema) -> dict[str, Any]:
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
"""OpenAI Responses API adapter for Pipecat."""
|
||||
|
||||
from typing import Any, TypedDict
|
||||
from typing import Any, Required, TypedDict, cast
|
||||
|
||||
from openai._types import NotGiven as OpenAINotGiven
|
||||
from openai.types.responses import FunctionToolParam, ResponseInputItemParam, ToolParam
|
||||
@@ -23,8 +23,10 @@ from pipecat.processors.aggregators.llm_context import (
|
||||
class OpenAIResponsesLLMInvocationParams(TypedDict, total=False):
|
||||
"""Context-based parameters for invoking OpenAI Responses API."""
|
||||
|
||||
input: list[ResponseInputItemParam]
|
||||
tools: list[ToolParam] | OpenAINotGiven
|
||||
# `input` and `tools` are always populated by `get_llm_invocation_params`;
|
||||
# `instructions` is only set when a system instruction is present.
|
||||
input: Required[list[ResponseInputItemParam]]
|
||||
tools: Required[list[ToolParam] | OpenAINotGiven]
|
||||
instructions: str
|
||||
|
||||
|
||||
@@ -64,8 +66,11 @@ class OpenAIResponsesLLMAdapter(BaseLLMAdapter[OpenAIResponsesLLMInvocationParam
|
||||
if system_instruction and messages:
|
||||
first_msg = messages[0] if not isinstance(messages[0], LLMSpecificMessage) else None
|
||||
if first_msg and first_msg.get("role") == "system":
|
||||
# `content` is `str | Iterable[...]`; we only forward it for
|
||||
# warning purposes. Coerce non-strings to None.
|
||||
first_content = first_msg.get("content", "")
|
||||
self._resolve_system_instruction(
|
||||
first_msg.get("content", ""),
|
||||
first_content if isinstance(first_content, str) else None,
|
||||
system_instruction,
|
||||
discard_context_system=False,
|
||||
)
|
||||
@@ -143,7 +148,10 @@ class OpenAIResponsesLLMAdapter(BaseLLMAdapter[OpenAIResponsesLLMInvocationParam
|
||||
Returns:
|
||||
List of messages in a format ready for logging.
|
||||
"""
|
||||
return self.get_messages(context, truncate_large_values=True)
|
||||
return cast(
|
||||
list[dict[str, Any]],
|
||||
self.get_messages(context, truncate_large_values=True),
|
||||
)
|
||||
|
||||
def _convert_messages_to_input(
|
||||
self, messages: list[LLMContextMessage]
|
||||
@@ -169,13 +177,15 @@ class OpenAIResponsesLLMAdapter(BaseLLMAdapter[OpenAIResponsesLLMInvocationParam
|
||||
content = message.get("content", "")
|
||||
if isinstance(content, list):
|
||||
content = self._convert_multimodal_content(content)
|
||||
result.append({"role": "developer", "content": content})
|
||||
result.append(
|
||||
cast(ResponseInputItemParam, {"role": "developer", "content": content})
|
||||
)
|
||||
|
||||
elif role == "user":
|
||||
content = message.get("content", "")
|
||||
if isinstance(content, list):
|
||||
content = self._convert_multimodal_content(content)
|
||||
result.append({"role": "user", "content": content})
|
||||
result.append(cast(ResponseInputItemParam, {"role": "user", "content": content}))
|
||||
|
||||
elif role == "assistant":
|
||||
tool_calls = message.get("tool_calls")
|
||||
@@ -194,7 +204,9 @@ class OpenAIResponsesLLMAdapter(BaseLLMAdapter[OpenAIResponsesLLMInvocationParam
|
||||
content = message.get("content", "")
|
||||
if isinstance(content, list):
|
||||
content = self._convert_multimodal_content(content)
|
||||
result.append({"role": "assistant", "content": content})
|
||||
result.append(
|
||||
cast(ResponseInputItemParam, {"role": "assistant", "content": content})
|
||||
)
|
||||
|
||||
elif role == "tool":
|
||||
content = message.get("content", "")
|
||||
|
||||
@@ -28,6 +28,7 @@ the messages are sent to Perplexity's API.
|
||||
"""
|
||||
|
||||
import copy
|
||||
from typing import Any, cast
|
||||
|
||||
from openai.types.chat import ChatCompletionMessageParam
|
||||
|
||||
@@ -116,7 +117,11 @@ class PerplexityLLMAdapter(OpenAILLMAdapter):
|
||||
if not messages:
|
||||
return messages
|
||||
|
||||
messages = copy.deepcopy(messages)
|
||||
# ChatCompletionMessageParam is a union of TypedDicts; the
|
||||
# transformations below mutate by key/index in ways those TypedDicts
|
||||
# don't permit. Work against a plain-dict view for the duration of
|
||||
# the transformation and cast back at the return site.
|
||||
msgs: list[dict[str, Any]] = cast(list[dict[str, Any]], copy.deepcopy(messages))
|
||||
|
||||
# Note: "developer" → "user" conversion is handled by the parent adapter
|
||||
# via the convert_developer_to_user parameter.
|
||||
@@ -125,10 +130,10 @@ class PerplexityLLMAdapter(OpenAILLMAdapter):
|
||||
# Perplexity allows system messages at the start, but rejects them
|
||||
# after any non-system message.
|
||||
in_initial_system_block = True
|
||||
for i in range(len(messages)):
|
||||
if messages[i].get("role") == "system":
|
||||
for i in range(len(msgs)):
|
||||
if msgs[i].get("role") == "system":
|
||||
if not in_initial_system_block:
|
||||
messages[i]["role"] = "user"
|
||||
msgs[i]["role"] = "user"
|
||||
else:
|
||||
in_initial_system_block = False
|
||||
|
||||
@@ -137,9 +142,9 @@ class PerplexityLLMAdapter(OpenAILLMAdapter):
|
||||
# messages that violate Perplexity's strict alternation requirement.
|
||||
# Skip consecutive system messages at the start — Perplexity allows those.
|
||||
i = 0
|
||||
while i < len(messages) - 1:
|
||||
current = messages[i]
|
||||
next_msg = messages[i + 1]
|
||||
while i < len(msgs) - 1:
|
||||
current = msgs[i]
|
||||
next_msg = msgs[i + 1]
|
||||
if current["role"] == next_msg["role"] == "system":
|
||||
# Perplexity allows multiple initial system messages, don't merge
|
||||
i += 1
|
||||
@@ -154,7 +159,7 @@ class PerplexityLLMAdapter(OpenAILLMAdapter):
|
||||
next_msg.get("content"), list
|
||||
):
|
||||
current["content"].extend(next_msg["content"])
|
||||
messages.pop(i + 1)
|
||||
msgs.pop(i + 1)
|
||||
else:
|
||||
i += 1
|
||||
|
||||
@@ -162,7 +167,7 @@ class PerplexityLLMAdapter(OpenAILLMAdapter):
|
||||
# Perplexity requires the last message to be "user" or "tool".
|
||||
# OpenAI appears to silently ignore trailing assistant messages
|
||||
# server-side, so removing them preserves equivalent behavior.
|
||||
while messages and messages[-1].get("role") == "assistant":
|
||||
messages.pop()
|
||||
while msgs and msgs[-1].get("role") == "assistant":
|
||||
msgs.pop()
|
||||
|
||||
return messages
|
||||
return cast(list[ChatCompletionMessageParam], msgs)
|
||||
|
||||
@@ -14,7 +14,7 @@ in-memory after first load to improve performance on subsequent accesses.
|
||||
import asyncio
|
||||
import io
|
||||
import wave
|
||||
from importlib.resources import files
|
||||
from importlib.resources import as_file, files
|
||||
|
||||
import aiofiles
|
||||
|
||||
@@ -52,10 +52,12 @@ async def load_dtmf_audio(button: KeypadEntry, *, sample_rate: int = 8000) -> by
|
||||
__DTMF_RESAMPLER__ = create_file_resampler()
|
||||
|
||||
dtmf_file_name = __DTMF_FILE_NAME.get(button, f"dtmf-{button.value}.wav")
|
||||
dtmf_file_path = files("pipecat.audio.dtmf").joinpath(dtmf_file_name)
|
||||
|
||||
async with aiofiles.open(dtmf_file_path, "rb") as f:
|
||||
data = await f.read()
|
||||
# `as_file` materialises the resource as a real filesystem `Path`,
|
||||
# which aiofiles can open. (For installed packages this is just the
|
||||
# bundled file; for zipped distributions it would be extracted to a temporary file.)
|
||||
with as_file(files("pipecat.audio.dtmf").joinpath(dtmf_file_name)) as dtmf_file_path:
|
||||
async with aiofiles.open(dtmf_file_path, "rb") as f:
|
||||
data = await f.read()
|
||||
|
||||
with io.BytesIO(data) as buffer:
|
||||
with wave.open(buffer, "rb") as wf:
|
||||
|
||||
@@ -60,7 +60,12 @@ class RNNoiseFilter(BaseAudioFilter):
|
||||
self._sample_rate = sample_rate
|
||||
|
||||
try:
|
||||
# RNNoise always requires 48kHz
|
||||
# The module-level import sets `RNNoise` to `None` if pyrnnoise
|
||||
# isn't installed; raise instead of calling `None(...)` so the
|
||||
# except clause handles it cleanly.
|
||||
if RNNoise is None:
|
||||
raise ImportError("pyrnnoise is not installed")
|
||||
# RNNoise always requires 48kHz.
|
||||
self._rnnoise = RNNoise(sample_rate=48000)
|
||||
self._rnnoise_ready = True
|
||||
except Exception as e:
|
||||
@@ -107,7 +112,7 @@ class RNNoiseFilter(BaseAudioFilter):
|
||||
Returns:
|
||||
Noise-suppressed audio data as bytes.
|
||||
"""
|
||||
if not self._rnnoise_ready or not self._filtering:
|
||||
if not self._rnnoise_ready or not self._filtering or self._rnnoise is None:
|
||||
return audio
|
||||
|
||||
# Resample input if needed
|
||||
|
||||
@@ -10,6 +10,8 @@ This module provides an audio resampler that uses the resampy library
|
||||
for high-quality audio sample rate conversion.
|
||||
"""
|
||||
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import resampy
|
||||
|
||||
@@ -21,6 +23,11 @@ class ResampyResampler(BaseAudioResampler):
|
||||
|
||||
This resampler uses the resampy library's Kaiser windowing filter
|
||||
for high-quality audio resampling with good performance characteristics.
|
||||
|
||||
.. deprecated:: 1.2.0
|
||||
ResampyResampler is deprecated and will be removed in Pipecat 2.0.
|
||||
Use SOXRAudioResampler, create_file_resampler(), or create_stream_resampler()
|
||||
instead.
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
@@ -29,7 +36,15 @@ class ResampyResampler(BaseAudioResampler):
|
||||
Args:
|
||||
**kwargs: Additional keyword arguments (currently unused).
|
||||
"""
|
||||
pass
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"ResampyResampler is deprecated and will be removed in Pipecat 2.0. "
|
||||
"Use SOXRAudioResampler, create_file_resampler(), or "
|
||||
"create_stream_resampler() instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
async def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes:
|
||||
"""Resample audio data using resampy library.
|
||||
|
||||
@@ -339,6 +339,40 @@ class LLMTextFrame(TextFrame):
|
||||
self.includes_inter_frame_spaces = True
|
||||
|
||||
|
||||
@dataclass
|
||||
class LLMMarkerFrame(DataFrame):
|
||||
"""Sideband marker emitted by an LLM service.
|
||||
|
||||
A marker is short, structured assistant output that should be
|
||||
persisted in the conversation context but should not flow through
|
||||
the standard text path (TTS, transcript). The assistant aggregator
|
||||
writes the marker to the context so the LLM can self-condition on
|
||||
prior markers on subsequent turns.
|
||||
|
||||
The primary use today is the ``filter_incomplete_user_turns``
|
||||
protocol, where ``UserTurnCompletionLLMServiceMixin`` emits the
|
||||
turn-completion markers ✓ / ○ / ◐ on every response. The frame is
|
||||
intentionally generic so other components — STT services with
|
||||
built-in turn signals, end-of-turn classifiers, custom annotations,
|
||||
etc. — can use the same mechanism to inject sideband signals into
|
||||
the assistant context.
|
||||
|
||||
Parameters:
|
||||
marker: The marker payload (typically a short string such as a
|
||||
single character).
|
||||
append_to_context_immediately: If True, the marker is written
|
||||
to the context as its own standalone assistant message as
|
||||
soon as it's received. If False, the marker is appended to
|
||||
the running assistant aggregation and flushed to the
|
||||
context together with the following text as a single
|
||||
message (e.g. for the ✓ case the context message ends up
|
||||
as "✓ <response>").
|
||||
"""
|
||||
|
||||
marker: str
|
||||
append_to_context_immediately: bool = True
|
||||
|
||||
|
||||
@dataclass
|
||||
class AggregatedTextFrame(TextFrame):
|
||||
"""Text frame representing an aggregation of TextFrames.
|
||||
@@ -661,6 +695,11 @@ class FunctionCallResultProperties:
|
||||
is_final: Whether this is the final result for the function call. When
|
||||
``False`` the result is treated as an intermediate update. Defaults to ``True``.
|
||||
Only meaningful for async function calls (``cancel_on_interruption=False``).
|
||||
Note: realtime LLM services do not support streamed intermediate
|
||||
results; they deliver only the final result to the provider. An
|
||||
intermediate result reported to a realtime service is dropped
|
||||
and an error is raised. Use a non-realtime LLM service if your
|
||||
tool needs to stream intermediate results.
|
||||
"""
|
||||
|
||||
run_llm: bool | None = None
|
||||
@@ -970,6 +1009,24 @@ class UserSpeakingFrame(SystemFrame):
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class UserTurnInferenceCompletedFrame(SystemFrame):
|
||||
"""Frame indicating that the user turn is semantically complete.
|
||||
|
||||
Emitted by any component that can judge conversational turn
|
||||
completeness — for example an LLM with turn-completion markers, an
|
||||
STT service with built-in turn detection, or a dedicated
|
||||
end-of-turn classifier. Stop strategies that gate the
|
||||
user-turn-stop event on an external completeness signal (e.g.
|
||||
``LLMTurnCompletionUserTurnStopStrategy``) consume this frame to
|
||||
finalize the turn. Producers should emit this frame only when they
|
||||
judge the turn complete; an absence of this frame means the turn is
|
||||
not yet considered complete.
|
||||
"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class VADUserStartedSpeakingFrame(SystemFrame):
|
||||
"""Frame emitted when VAD definitively detects user started speaking.
|
||||
|
||||
@@ -14,6 +14,7 @@ including heartbeats, idle detection, and observer integration.
|
||||
import asyncio
|
||||
import importlib.util
|
||||
import os
|
||||
import warnings
|
||||
from collections.abc import AsyncIterable, Iterable
|
||||
from pathlib import Path
|
||||
from typing import Any, TypeVar
|
||||
@@ -193,6 +194,7 @@ class PipelineTask(BasePipelineTask):
|
||||
*,
|
||||
params: PipelineParams | None = None,
|
||||
additional_span_attributes: dict | None = None,
|
||||
app_resources: Any = None,
|
||||
cancel_on_idle_timeout: bool = True,
|
||||
cancel_timeout_secs: float = CANCEL_TIMEOUT_SECS,
|
||||
check_dangling_tasks: bool = True,
|
||||
@@ -216,6 +218,14 @@ class PipelineTask(BasePipelineTask):
|
||||
params: Configuration parameters for the pipeline.
|
||||
additional_span_attributes: Optional dictionary of attributes to propagate as
|
||||
OpenTelemetry conversation span attributes.
|
||||
app_resources: Optional application-defined bag of anything your
|
||||
application code may want to share across this session (DB
|
||||
handles, HTTP clients, etc.), passed by reference. Pipecat
|
||||
passes it through untouched and exposes it on the task itself
|
||||
as ``task.app_resources`` and passes it to tool handlers as
|
||||
``FunctionCallParams.app_resources``. The framework never
|
||||
copies or clears this object; the caller retains their handle
|
||||
and can read any mutations after the task finishes.
|
||||
cancel_on_idle_timeout: Whether the pipeline task should be cancelled if
|
||||
the idle timeout is reached.
|
||||
cancel_timeout_secs: Timeout (in seconds) to wait for cancellation to happen
|
||||
@@ -235,13 +245,24 @@ class PipelineTask(BasePipelineTask):
|
||||
rtvi_observer_params: The RTVI observer parameters to use if RTVI is enabled.
|
||||
rtvi_processor: The RTVI processor to add if RTVI is enabled.
|
||||
task_manager: Optional task manager for handling asyncio tasks.
|
||||
tool_resources: Optional application-defined bag of resources (DB handles,
|
||||
clients, state, etc.) passed by reference to every tool handler via
|
||||
``FunctionCallParams.tool_resources``. The framework never copies or
|
||||
clears this object; the caller retains their handle and can read any
|
||||
mutations after the task finishes.
|
||||
tool_resources: Deprecated alias for ``app_resources``.
|
||||
|
||||
.. deprecated:: 1.2.0
|
||||
Use ``app_resources`` instead. ``tool_resources`` will be
|
||||
removed in a future version.
|
||||
"""
|
||||
super().__init__()
|
||||
if tool_resources is not None:
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"`PipelineTask(tool_resources=...)` is deprecated since 1.2.0, "
|
||||
"use `app_resources` instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
if app_resources is None:
|
||||
app_resources = tool_resources
|
||||
self._params = params or PipelineParams()
|
||||
self._additional_span_attributes = additional_span_attributes or {}
|
||||
self._cancel_on_idle_timeout = cancel_on_idle_timeout
|
||||
@@ -252,7 +273,7 @@ class PipelineTask(BasePipelineTask):
|
||||
self._enable_tracing = enable_tracing and is_tracing_available()
|
||||
self._enable_turn_tracking = enable_turn_tracking
|
||||
self._idle_timeout_secs = idle_timeout_secs
|
||||
self._tool_resources = tool_resources
|
||||
self._app_resources = app_resources
|
||||
observers = observers or []
|
||||
self._turn_tracking_observer: TurnTrackingObserver | None = None
|
||||
self._user_bot_latency_observer: UserBotLatencyObserver | None = None
|
||||
@@ -282,7 +303,7 @@ class PipelineTask(BasePipelineTask):
|
||||
|
||||
# This task manager will handle all the asyncio tasks created by this
|
||||
# PipelineTask and its frame processors.
|
||||
self._task_manager = task_manager or TaskManager()
|
||||
self._pipeline_task_manager = task_manager or TaskManager()
|
||||
|
||||
# This queue is the queue used to push frames to the pipeline.
|
||||
self._push_queue = asyncio.Queue()
|
||||
@@ -365,7 +386,7 @@ class PipelineTask(BasePipelineTask):
|
||||
# The task observer acts as a proxy to the provided observers. This way,
|
||||
# we only need to pass a single observer (using the StartFrame) which
|
||||
# then just acts as a proxy.
|
||||
self._observer = TaskObserver(observers=observers, task_manager=self._task_manager)
|
||||
self._observer = TaskObserver(observers=observers)
|
||||
|
||||
# These events can be used to check which frames make it to the source
|
||||
# or sink processors. Instead of calling the event handlers for every
|
||||
@@ -391,6 +412,21 @@ class PipelineTask(BasePipelineTask):
|
||||
"""
|
||||
return self._params
|
||||
|
||||
@property
|
||||
def app_resources(self) -> Any:
|
||||
"""Get the application-defined resources passed to this task.
|
||||
|
||||
This is the same object passed to the constructor as
|
||||
``app_resources``. Tool handlers can also access it via
|
||||
``FunctionCallParams.app_resources``. The framework returns the
|
||||
original reference; mutations are visible to all callers.
|
||||
|
||||
Returns:
|
||||
The application-defined resources, or ``None`` if none were
|
||||
passed.
|
||||
"""
|
||||
return self._app_resources
|
||||
|
||||
@property
|
||||
def pipeline(self) -> BasePipeline:
|
||||
"""Get the full pipeline managed by this pipeline task.
|
||||
@@ -618,32 +654,24 @@ class PipelineTask(BasePipelineTask):
|
||||
|
||||
async def _create_tasks(self):
|
||||
"""Create and start all pipeline processing tasks."""
|
||||
self._process_push_task = self._task_manager.create_task(
|
||||
self._process_push_queue(), f"{self}::_process_push_queue"
|
||||
)
|
||||
self._process_push_task = self.create_task(self._process_push_queue())
|
||||
return self._process_push_task
|
||||
|
||||
def _maybe_start_heartbeat_tasks(self):
|
||||
"""Start heartbeat tasks if heartbeats are enabled and not already running."""
|
||||
if self._params.enable_heartbeats and self._heartbeat_push_task is None:
|
||||
self._heartbeat_push_task = self._task_manager.create_task(
|
||||
self._heartbeat_push_handler(), f"{self}::_heartbeat_push_handler"
|
||||
)
|
||||
self._heartbeat_monitor_task = self._task_manager.create_task(
|
||||
self._heartbeat_monitor_handler(), f"{self}::_heartbeat_monitor_handler"
|
||||
)
|
||||
self._heartbeat_push_task = self.create_task(self._heartbeat_push_handler())
|
||||
self._heartbeat_monitor_task = self.create_task(self._heartbeat_monitor_handler())
|
||||
|
||||
def _maybe_start_idle_task(self):
|
||||
"""Start idle monitoring task if idle timeout is configured."""
|
||||
if self._idle_timeout_secs:
|
||||
self._idle_monitor_task = self._task_manager.create_task(
|
||||
self._idle_monitor_handler(), f"{self}::_idle_monitor_handler"
|
||||
)
|
||||
self._idle_monitor_task = self.create_task(self._idle_monitor_handler())
|
||||
|
||||
async def _cancel_tasks(self):
|
||||
"""Cancel all running pipeline tasks."""
|
||||
if self._process_push_task:
|
||||
await self._task_manager.cancel_task(self._process_push_task)
|
||||
await self.cancel_task(self._process_push_task)
|
||||
self._process_push_task = None
|
||||
|
||||
await self._maybe_cancel_heartbeat_tasks()
|
||||
@@ -655,17 +683,17 @@ class PipelineTask(BasePipelineTask):
|
||||
return
|
||||
|
||||
if self._heartbeat_push_task:
|
||||
await self._task_manager.cancel_task(self._heartbeat_push_task)
|
||||
await self.cancel_task(self._heartbeat_push_task)
|
||||
self._heartbeat_push_task = None
|
||||
|
||||
if self._heartbeat_monitor_task:
|
||||
await self._task_manager.cancel_task(self._heartbeat_monitor_task)
|
||||
await self.cancel_task(self._heartbeat_monitor_task)
|
||||
self._heartbeat_monitor_task = None
|
||||
|
||||
async def _maybe_cancel_idle_task(self):
|
||||
"""Cancel idle monitoring task if it is running."""
|
||||
if self._idle_monitor_task:
|
||||
await self._task_manager.cancel_task(self._idle_monitor_task)
|
||||
await self.cancel_task(self._idle_monitor_task)
|
||||
self._idle_monitor_task = None
|
||||
|
||||
def _initial_metrics_frame(self) -> MetricsFrame:
|
||||
@@ -723,14 +751,22 @@ class PipelineTask(BasePipelineTask):
|
||||
|
||||
async def _setup(self, params: PipelineTaskParams):
|
||||
"""Set up the pipeline task and all processors."""
|
||||
await super().setup(self._pipeline_task_manager)
|
||||
|
||||
mgr_params = TaskManagerParams(loop=params.loop)
|
||||
self._task_manager.setup(mgr_params)
|
||||
self.task_manager.setup(mgr_params)
|
||||
|
||||
setup = FrameProcessorSetup(
|
||||
clock=self._clock,
|
||||
task_manager=self._task_manager,
|
||||
task_manager=self.task_manager,
|
||||
observer=self._observer,
|
||||
tool_resources=self._tool_resources,
|
||||
pipeline_task=self,
|
||||
# Populate the deprecated `tool_resources` field for backwards
|
||||
# compatibility with custom FrameProcessor subclasses whose
|
||||
# ``setup()`` overrides still read it. Reading the field emits a
|
||||
# DeprecationWarning; new code should read
|
||||
# ``setup.pipeline_task.app_resources`` instead.
|
||||
tool_resources=self._app_resources,
|
||||
)
|
||||
await self._pipeline.setup(setup)
|
||||
|
||||
@@ -738,6 +774,7 @@ class PipelineTask(BasePipelineTask):
|
||||
await self._load_setup_files()
|
||||
|
||||
# Start task observer.
|
||||
await self._observer.setup(self.task_manager)
|
||||
await self._observer.start()
|
||||
|
||||
async def _cleanup(self, cleanup_pipeline: bool):
|
||||
@@ -977,7 +1014,7 @@ class PipelineTask(BasePipelineTask):
|
||||
|
||||
def _print_dangling_tasks(self):
|
||||
"""Log any dangling tasks that haven't been properly cleaned up."""
|
||||
tasks = [t.get_name() for t in self._task_manager.current_tasks()]
|
||||
tasks = [t.get_name() for t in self.task_manager.current_tasks()]
|
||||
if tasks:
|
||||
logger.warning(f"{self} dangling tasks detected: {tasks}")
|
||||
|
||||
|
||||
@@ -17,7 +17,6 @@ from typing import Any
|
||||
from attr import dataclass
|
||||
|
||||
from pipecat.observers.base_observer import BaseObserver, FrameProcessed, FramePushed
|
||||
from pipecat.utils.asyncio.task_manager import BaseTaskManager
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -62,19 +61,16 @@ class TaskObserver(BaseObserver):
|
||||
self,
|
||||
*,
|
||||
observers: list[BaseObserver] | None = None,
|
||||
task_manager: BaseTaskManager,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the TaskObserver.
|
||||
|
||||
Args:
|
||||
observers: List of observers to manage. Defaults to empty list.
|
||||
task_manager: Task manager for creating and managing observer tasks.
|
||||
**kwargs: Additional arguments passed to the base observer.
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
self._observers = observers or []
|
||||
self._task_manager = task_manager
|
||||
self._proxies: dict[BaseObserver, Proxy] | None = (
|
||||
None # Becomes a dict after start() is called
|
||||
)
|
||||
@@ -106,7 +102,7 @@ class TaskObserver(BaseObserver):
|
||||
# Remove the proxy so it doesn't get called anymore.
|
||||
del self._proxies[observer]
|
||||
# Cancel the proxy task right away.
|
||||
await self._task_manager.cancel_task(proxy.task)
|
||||
await self.cancel_task(proxy.task)
|
||||
|
||||
# Remove the observer from the list.
|
||||
if observer in self._observers:
|
||||
@@ -122,7 +118,7 @@ class TaskObserver(BaseObserver):
|
||||
return
|
||||
|
||||
for proxy in self._proxies.values():
|
||||
await self._task_manager.cancel_task(proxy.task)
|
||||
await self.cancel_task(proxy.task)
|
||||
|
||||
async def cleanup(self):
|
||||
"""Cleanup all proxy observers."""
|
||||
@@ -157,9 +153,8 @@ class TaskObserver(BaseObserver):
|
||||
def _create_proxy(self, observer: BaseObserver) -> Proxy:
|
||||
"""Create a proxy for a single observer."""
|
||||
queue = asyncio.Queue()
|
||||
task = self._task_manager.create_task(
|
||||
self._proxy_task_handler(queue, observer),
|
||||
f"TaskObserver::{observer}::_proxy_task_handler",
|
||||
task = self.create_task(
|
||||
self._proxy_task_handler(queue, observer), f"{observer}::_proxy_task_handler"
|
||||
)
|
||||
proxy = Proxy(queue=queue, task=task, observer=observer)
|
||||
return proxy
|
||||
|
||||
286
src/pipecat/processors/aggregators/async_tool_messages.py
Normal file
286
src/pipecat/processors/aggregators/async_tool_messages.py
Normal file
@@ -0,0 +1,286 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Helpers for the async-tool message protocol used in LLM contexts.
|
||||
|
||||
When a function is registered with ``cancel_on_interruption=False``, the
|
||||
``LLMUserContextAggregator`` / ``LLMAssistantContextAggregator`` pair appends
|
||||
async-tool messages to the conversation context as the underlying task
|
||||
progresses:
|
||||
|
||||
- A ``started`` message (``role="tool"``) is appended immediately when the
|
||||
tool starts running.
|
||||
- An ``intermediate`` message (``role="developer"``) is appended each time an
|
||||
intermediate result is reported via
|
||||
``result_callback(..., FunctionCallResultProperties(is_final=False))``.
|
||||
- A ``final`` message (``role="developer"``) is appended when the task
|
||||
finishes.
|
||||
|
||||
This module is the single source of truth for the on-the-wire payload shape:
|
||||
|
||||
- The aggregator uses the ``build_*_message`` functions when injecting messages.
|
||||
- Realtime LLM services use ``parse_message`` to detect async-tool messages
|
||||
while iterating the context, then read ``payload.result`` and deliver it via
|
||||
their formal tool-result channel.
|
||||
|
||||
Internally, ``AsyncToolMessagePayload`` is the canonical structured form;
|
||||
the on-the-wire JSON string is always derived from it (never stored) so the
|
||||
two representations can't drift.
|
||||
|
||||
Consumers are expected to import the module rather than its individual
|
||||
functions, e.g.::
|
||||
|
||||
from pipecat.processors.aggregators import async_tool_messages
|
||||
...
|
||||
async_tool_messages.build_started_message(tool_call_id)
|
||||
async_tool_messages.parse_message(msg)
|
||||
"""
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Literal
|
||||
|
||||
from pipecat.processors.aggregators.llm_context import LLMStandardMessage
|
||||
|
||||
AsyncToolMessageKind = Literal["started", "intermediate", "final"]
|
||||
|
||||
# --- Payload shape (private; canonical source of truth) ---------------------
|
||||
|
||||
# The ``type`` field that identifies an async-tool message payload. Both the
|
||||
# builders and the parser use this constant; do not duplicate the literal.
|
||||
_PAYLOAD_TYPE = "async_tool"
|
||||
|
||||
# Status value for started / intermediate messages (task still running).
|
||||
_STATUS_RUNNING = "running"
|
||||
|
||||
# Status value for the final message (task complete).
|
||||
_STATUS_FINISHED = "finished"
|
||||
|
||||
# Description shipped on the started message. The text is intentionally
|
||||
# self-explanatory so a model reading the context can tell what's about to
|
||||
# happen even without out-of-band knowledge of the protocol.
|
||||
_STARTED_DESCRIPTION = (
|
||||
"An asynchronous task associated with this tool_call_id has started "
|
||||
"running. Expect results to arrive later as developer messages that look "
|
||||
"roughly like this one (with 'type=async_tool' and a matching tool_call_id) "
|
||||
"but with a 'result' field. Note that there *may* be more than one result "
|
||||
"(i.e., a stream of results), but there doesn't have to be (there may be "
|
||||
"only one). The last result will come in a message with 'status=finished'."
|
||||
)
|
||||
|
||||
# Description shipped on each intermediate-result message.
|
||||
_INTERMEDIATE_DESCRIPTION = (
|
||||
"This is an intermediate result for the asynchronous task associated with "
|
||||
"this tool_call_id. The task is still running. More intermediate results "
|
||||
"may follow, or the next result may be the final one with "
|
||||
"'status=finished'."
|
||||
)
|
||||
|
||||
# Description shipped on the final-result message.
|
||||
_FINAL_DESCRIPTION = (
|
||||
"This is the final result for the asynchronous task associated with this "
|
||||
"tool_call_id. The task has completed. No further results will arrive for "
|
||||
"this tool_call_id."
|
||||
)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class AsyncToolMessagePayload:
|
||||
"""The structured contents of an async-tool message in an LLM context.
|
||||
|
||||
Parameters:
|
||||
kind: Which of the three async-tool message stages this is.
|
||||
tool_call_id: The id of the tool invocation this payload relates to.
|
||||
status: ``"running"`` for started/intermediate, ``"finished"`` for
|
||||
the final message.
|
||||
description: Human-readable description from the payload. May be empty.
|
||||
result: For ``intermediate`` and ``final`` messages, the JSON-encoded
|
||||
result string (or the literal ``"COMPLETED"`` if the function
|
||||
returned no value). ``None`` for ``started`` messages.
|
||||
"""
|
||||
|
||||
kind: AsyncToolMessageKind
|
||||
tool_call_id: str
|
||||
status: Literal["running", "finished"]
|
||||
description: str
|
||||
result: str | None
|
||||
|
||||
|
||||
# --- Internal: payload ↔ on-the-wire forms -----------------------------------
|
||||
|
||||
|
||||
def _payload_to_json(payload: AsyncToolMessagePayload) -> str:
|
||||
"""Serialize a payload to its on-the-wire JSON string form.
|
||||
|
||||
Fields that don't apply to the payload's kind are omitted (notably
|
||||
``result`` is left out of ``started`` payloads, since the task hasn't
|
||||
produced a result yet).
|
||||
"""
|
||||
obj: dict[str, Any] = {
|
||||
"type": _PAYLOAD_TYPE,
|
||||
"status": payload.status,
|
||||
"tool_call_id": payload.tool_call_id,
|
||||
"description": payload.description,
|
||||
}
|
||||
if payload.result is not None:
|
||||
obj["result"] = payload.result
|
||||
return json.dumps(obj)
|
||||
|
||||
|
||||
def _payload_to_message(payload: AsyncToolMessagePayload) -> LLMStandardMessage:
|
||||
"""Wrap a payload in the LLM context message shape that matches its kind.
|
||||
|
||||
- ``started``: ``role="tool"`` plus ``tool_call_id`` at the top level
|
||||
(so the message can sit alongside other regular tool-result messages).
|
||||
- ``intermediate`` / ``final``: ``role="developer"``; ``tool_call_id``
|
||||
lives only inside the JSON payload.
|
||||
"""
|
||||
content = _payload_to_json(payload)
|
||||
if payload.kind == "started":
|
||||
return {
|
||||
"role": "tool",
|
||||
"content": content,
|
||||
"tool_call_id": payload.tool_call_id,
|
||||
}
|
||||
return {
|
||||
"role": "developer",
|
||||
"content": content,
|
||||
}
|
||||
|
||||
|
||||
# --- Builders ----------------------------------------------------------------
|
||||
|
||||
|
||||
def build_started_message(tool_call_id: str) -> LLMStandardMessage:
    """Create the ``started`` async-tool message for an LLM context.

    Intended to be appended to the context as soon as an async function call
    (one registered with ``cancel_on_interruption=False``) begins running, so
    the model knows a task is in flight and that results will follow later in
    ``developer``-role messages.

    Args:
        tool_call_id: The id of the tool invocation this message is for.

    Returns:
        A message ready to pass to ``LLMContext.add_message``.
    """
    # A started payload never carries a result; the serializer drops the key.
    started = AsyncToolMessagePayload(
        kind="started",
        status=_STATUS_RUNNING,
        tool_call_id=tool_call_id,
        description=_STARTED_DESCRIPTION,
        result=None,
    )
    return _payload_to_message(started)
|
||||
|
||||
|
||||
def build_intermediate_result_message(tool_call_id: str, result: str) -> LLMStandardMessage:
    """Create an intermediate-result async-tool message for an LLM context.

    Intended to be appended to the context each time a still-running async
    function reports a non-final result via
    ``result_callback(..., FunctionCallResultProperties(is_final=False))``.

    Args:
        tool_call_id: The id of the tool invocation the result is for.
        result: The already JSON-encoded result string. Encoding the
            function's actual return value (typically with ``json.dumps``)
            is the caller's job.

    Returns:
        A message ready to pass to ``LLMContext.add_message``.
    """
    # Status stays "running": more results may still follow this one.
    intermediate = AsyncToolMessagePayload(
        kind="intermediate",
        status=_STATUS_RUNNING,
        tool_call_id=tool_call_id,
        description=_INTERMEDIATE_DESCRIPTION,
        result=result,
    )
    return _payload_to_message(intermediate)
|
||||
|
||||
|
||||
def build_final_result_message(tool_call_id: str, result: str) -> LLMStandardMessage:
    """Create the final-result async-tool message for an LLM context.

    Intended to be appended to the context when the async function finishes;
    it is the last async-tool message for the given ``tool_call_id``.

    Args:
        tool_call_id: The id of the tool invocation the result is for.
        result: The JSON-encoded result string, or the literal ``"COMPLETED"``
            sentinel when the function returned ``None`` (the same convention
            used for synchronous tool calls).

    Returns:
        A message ready to pass to ``LLMContext.add_message``.
    """
    final = AsyncToolMessagePayload(
        kind="final",
        status=_STATUS_FINISHED,
        tool_call_id=tool_call_id,
        description=_FINAL_DESCRIPTION,
        result=result,
    )
    return _payload_to_message(final)
|
||||
|
||||
|
||||
# --- Parsing -----------------------------------------------------------------
|
||||
|
||||
|
||||
def parse_message(message: LLMStandardMessage) -> AsyncToolMessagePayload | None:
    """Decode the async-tool payload carried by a message, if there is one.

    A message is recognized only when its role is ``"tool"`` or
    ``"developer"``, its ``content`` is a string holding a JSON object whose
    ``type`` equals the async-tool payload type, its ``tool_call_id`` is a
    string, and its ``status`` is one of the two known status values.

    Lenient fields: a non-string ``description`` is replaced by ``""`` and a
    non-string ``result`` is treated as absent. The payload kind is derived
    from the decoded fields: no result means ``"started"``; otherwise a
    finished status means ``"final"`` and anything else ``"intermediate"``.

    Args:
        message: A standard message from the LLM context. Callers iterating
            over ``LLMContext.get_messages()`` should filter out
            ``LLMSpecificMessage`` entries first; only ``LLMStandardMessage``
            values can carry async-tool payloads.

    Returns:
        An ``AsyncToolMessagePayload`` for a recognized async-tool message,
        otherwise ``None``.
    """
    if message.get("role") not in ("tool", "developer"):
        return None

    raw_content = message.get("content")
    if not isinstance(raw_content, str):
        return None

    try:
        decoded = json.loads(raw_content)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass, so this covers both.
        return None

    if not isinstance(decoded, dict):
        return None
    if decoded.get("type") != _PAYLOAD_TYPE:
        return None

    tool_call_id = decoded.get("tool_call_id")
    status = decoded.get("status")
    if not isinstance(tool_call_id, str):
        return None
    if status not in (_STATUS_RUNNING, _STATUS_FINISHED):
        return None

    description = decoded.get("description", "")
    if not isinstance(description, str):
        description = ""

    # Anything that is not a string (including a missing key) counts as
    # "no result".
    result = decoded.get("result")
    if not isinstance(result, str):
        result = None

    kind: AsyncToolMessageKind
    if result is None:
        kind = "started"
    elif status == _STATUS_FINISHED:
        kind = "final"
    else:
        kind = "intermediate"

    return AsyncToolMessagePayload(
        kind=kind,
        tool_call_id=tool_call_id,
        status=status,
        description=description,
        result=result,
    )
|
||||
@@ -21,7 +21,7 @@ import io
|
||||
import wave
|
||||
from collections.abc import Callable
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, TypeAlias, TypeGuard, TypeVar
|
||||
from typing import Any, TypeAlias, TypeGuard, TypeVar, cast
|
||||
|
||||
from loguru import logger
|
||||
from openai._types import NOT_GIVEN as OPEN_AI_NOT_GIVEN
|
||||
@@ -129,13 +129,13 @@ class LLMContext:
|
||||
url: The URL of the image.
|
||||
text: Optional text to include with the image.
|
||||
"""
|
||||
content = []
|
||||
content: list[dict[str, Any]] = []
|
||||
if text:
|
||||
content.append({"type": "text", "text": text})
|
||||
|
||||
content.append({"type": "image_url", "image_url": {"url": url}})
|
||||
|
||||
return {"role": role, "content": content}
|
||||
return cast(LLMContextMessage, {"role": role, "content": content})
|
||||
|
||||
@staticmethod
|
||||
async def create_image_message(
|
||||
@@ -187,7 +187,7 @@ class LLMContext:
|
||||
audio_frames: List of audio frame objects to include.
|
||||
text: Optional text to include with the audio.
|
||||
"""
|
||||
content = [{"type": "text", "text": text}]
|
||||
content: list[dict[str, Any]] = [{"type": "text", "text": text}]
|
||||
|
||||
def encode_audio():
|
||||
sample_rate = audio_frames[0].sample_rate
|
||||
@@ -214,7 +214,7 @@ class LLMContext:
|
||||
}
|
||||
)
|
||||
|
||||
return {"role": role, "content": content}
|
||||
return cast(LLMContextMessage, {"role": role, "content": content})
|
||||
|
||||
@property
|
||||
def messages(self) -> list[LLMContextMessage]:
|
||||
@@ -295,7 +295,10 @@ class LLMContext:
|
||||
result.append(msg_copy)
|
||||
continue
|
||||
|
||||
msg = copy.deepcopy(message)
|
||||
# The standard message variant is a union of TypedDicts; the
|
||||
# mutations below operate on plain dicts at runtime. Treat as
|
||||
# such for the duration of the redaction loop.
|
||||
msg: dict[str, Any] = cast(dict[str, Any], copy.deepcopy(message))
|
||||
content = msg.get("content")
|
||||
if isinstance(content, list):
|
||||
for item in content:
|
||||
|
||||
@@ -44,6 +44,7 @@ from pipecat.frames.frames import (
|
||||
LLMContextSummaryRequestFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMMarkerFrame,
|
||||
LLMMessagesAppendFrame,
|
||||
LLMMessagesTransformFrame,
|
||||
LLMMessagesUpdateFrame,
|
||||
@@ -53,7 +54,6 @@ from pipecat.frames.frames import (
|
||||
LLMThoughtEndFrame,
|
||||
LLMThoughtStartFrame,
|
||||
LLMThoughtTextFrame,
|
||||
LLMUpdateSettingsFrame,
|
||||
StartFrame,
|
||||
TextFrame,
|
||||
TranscriptionFrame,
|
||||
@@ -67,25 +67,29 @@ from pipecat.frames.frames import (
|
||||
VADUserStartedSpeakingFrame,
|
||||
VADUserStoppedSpeakingFrame,
|
||||
)
|
||||
from pipecat.processors.aggregators import async_tool_messages
|
||||
from pipecat.processors.aggregators.llm_context import (
|
||||
LLMContext,
|
||||
LLMContextMessage,
|
||||
LLMSpecificMessage,
|
||||
NotGiven,
|
||||
is_given,
|
||||
)
|
||||
from pipecat.processors.aggregators.llm_context_summarizer import (
|
||||
LLMContextSummarizer,
|
||||
SummaryAppliedEvent,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.services.settings import LLMSettings
|
||||
from pipecat.turns.user_idle_controller import UserIdleController
|
||||
from pipecat.turns.user_mute import BaseUserMuteStrategy
|
||||
from pipecat.turns.user_start import BaseUserTurnStartStrategy, UserTurnStartedParams
|
||||
from pipecat.turns.user_stop import BaseUserTurnStopStrategy, UserTurnStoppedParams
|
||||
from pipecat.turns.user_turn_completion_mixin import UserTurnCompletionConfig
|
||||
from pipecat.turns.user_turn_controller import UserTurnController
|
||||
from pipecat.turns.user_turn_strategies import UserTurnStrategies
|
||||
from pipecat.turns.user_turn_strategies import (
|
||||
FilterIncompleteUserTurnStrategies,
|
||||
UserTurnStrategies,
|
||||
)
|
||||
from pipecat.utils.context.llm_context_summarization import (
|
||||
LLMAutoContextSummarizationConfig,
|
||||
LLMContextSummarizationConfig,
|
||||
@@ -99,6 +103,21 @@ class LLMUserAggregatorParams:
|
||||
"""Parameters for configuring LLM user aggregation behavior.
|
||||
|
||||
Parameters:
|
||||
add_tool_change_messages: When True, on each ``LLMSetToolsFrame`` the
|
||||
aggregator computes the diff against the currently advertised tools
|
||||
and appends a developer-role message to the context describing
|
||||
additions/removals. Helps the LLM stay coherent across
|
||||
mid-conversation tool changes, mitigating several flavors of
|
||||
tool-call-related hallucination: calling tools that have been
|
||||
removed, avoiding tools that have been re-added, and hallucinating
|
||||
output (made-up answers or tool-call-shaped non-tool-calls) when
|
||||
tools are unavailable. Only standard tools are diffed; custom
|
||||
(LLM-specific) tools are ignored. When using
|
||||
``LLMContextAggregatorPair``, prefer setting this via its
|
||||
``add_tool_change_messages`` argument instead. Defaults to False.
|
||||
audio_idle_timeout: Timeout in seconds to force speech stop when
|
||||
no audio frames are received while in SPEAKING state (e.g. user mutes
|
||||
mic mid-speech). Set to 0 to disable. Defaults to 1.0.
|
||||
user_turn_strategies: User turn start and stop strategies.
|
||||
user_mute_strategies: List of user mute strategies.
|
||||
user_turn_stop_timeout: Time in seconds to wait before considering the
|
||||
@@ -108,27 +127,64 @@ class LLMUserAggregatorParams:
|
||||
has been idle (not speaking) for this duration. Set to 0 to disable
|
||||
idle detection.
|
||||
vad_analyzer: Voice Activity Detection analyzer instance.
|
||||
audio_idle_timeout: Timeout in seconds to force speech stop when
|
||||
no audio frames are received while in SPEAKING state (e.g. user mutes
|
||||
mic mid-speech). Set to 0 to disable. Defaults to 1.0.
|
||||
filter_incomplete_user_turns: Whether to filter out incomplete user turns.
|
||||
When enabled, the LLM outputs a turn completion marker at the start of
|
||||
each response: ✓ (complete), ○ (incomplete short), or ◐ (incomplete long).
|
||||
Incomplete responses are suppressed and timeouts trigger re-prompting.
|
||||
user_turn_completion_config: Configuration for turn completion behavior including
|
||||
custom instructions, timeouts, and prompts. Only used when
|
||||
filter_incomplete_user_turns is True.
|
||||
filter_incomplete_user_turns: [DEPRECATED] Use
|
||||
``user_turn_strategies=FilterIncompleteUserTurnStrategies()``
|
||||
instead. When enabled, the LLM outputs a turn-completion
|
||||
marker at the start of each response: ✓ (complete), ○
|
||||
(incomplete short), or ◐ (incomplete long). Incomplete
|
||||
responses are suppressed and timeouts trigger re-prompting.
|
||||
|
||||
.. deprecated:: 1.2.0
|
||||
Use ``user_turn_strategies=FilterIncompleteUserTurnStrategies()``
|
||||
instead. Will be removed in version 2.0.0.
|
||||
|
||||
user_turn_completion_config: [DEPRECATED] Configuration for turn
|
||||
completion behavior including custom instructions, timeouts, and
|
||||
prompts. Only used when filter_incomplete_user_turns is True
|
||||
(deprecated path) — for the new strategy-based API, pass the config
|
||||
directly to ``FilterIncompleteUserTurnStrategies(config=...)``.
|
||||
|
||||
.. deprecated:: 1.2.0
|
||||
Pass the config directly to
|
||||
``FilterIncompleteUserTurnStrategies(config=...)`` instead.
|
||||
Will be removed in version 2.0.0.
|
||||
"""
|
||||
|
||||
add_tool_change_messages: bool = False
|
||||
audio_idle_timeout: float = 1.0
|
||||
user_turn_strategies: UserTurnStrategies | None = None
|
||||
user_mute_strategies: list[BaseUserMuteStrategy] = field(default_factory=list)
|
||||
user_turn_stop_timeout: float = 5.0
|
||||
user_idle_timeout: float = 0
|
||||
vad_analyzer: VADAnalyzer | None = None
|
||||
audio_idle_timeout: float = 1.0
|
||||
filter_incomplete_user_turns: bool = False
|
||||
user_turn_completion_config: UserTurnCompletionConfig | None = None
|
||||
|
||||
def __post_init__(self):
    """Emit deprecation warnings for the legacy incomplete-turn fields.

    Each deprecated field that has been set produces exactly one
    ``DeprecationWarning``:

    - ``filter_incomplete_user_turns`` (when truthy), and
    - ``user_turn_completion_config`` (when not ``None``).

    Previously ``user_turn_completion_config`` was checked twice (once by
    truthiness, once by ``is not None``), so setting it emitted two
    differently worded warnings for the same field. Only the ``is not None``
    check is kept, with the message that points at the actual replacement
    (``FilterIncompleteUserTurnStrategies(config=...)``).
    """
    if self.filter_incomplete_user_turns:
        warnings.warn(
            "LLMUserAggregatorParams.filter_incomplete_user_turns is deprecated. "
            "Use user_turn_strategies=FilterIncompleteUserTurnStrategies() instead.",
            DeprecationWarning,
            stacklevel=2,
        )

    # ``is not None`` rather than truthiness: any explicitly supplied config
    # object is use of the deprecated field, whatever its truth value.
    if self.user_turn_completion_config is not None:
        warnings.warn(
            "LLMUserAggregatorParams.user_turn_completion_config is deprecated. "
            "Pass the config directly to "
            "FilterIncompleteUserTurnStrategies(config=...) instead.",
            DeprecationWarning,
            stacklevel=2,
        )
|
||||
|
||||
|
||||
@dataclass
|
||||
class LLMAssistantAggregatorParams:
|
||||
@@ -143,14 +199,32 @@ class LLMAssistantAggregatorParams:
|
||||
summarization. Controls trigger thresholds, message preservation, and
|
||||
summarization prompts. If None, uses default
|
||||
``LLMAutoContextSummarizationConfig`` values.
|
||||
add_tool_change_messages: When True, on each ``LLMSetToolsFrame`` the
|
||||
aggregator computes the diff against the currently advertised tools
|
||||
and appends a developer-role message to the context describing
|
||||
additions/removals. Helps the LLM stay coherent across
|
||||
mid-conversation tool changes, mitigating several flavors of
|
||||
tool-call-related hallucination: calling tools that have been
|
||||
removed, avoiding tools that have been re-added, and hallucinating
|
||||
output (made-up answers or tool-call-shaped non-tool-calls) when
|
||||
tools are unavailable. Only standard tools are diffed; custom
|
||||
(LLM-specific) tools are ignored. When using
|
||||
``LLMContextAggregatorPair``, prefer setting this via its
|
||||
``add_tool_change_messages`` argument instead. Defaults to False.
|
||||
"""
|
||||
|
||||
enable_auto_context_summarization: bool = False
|
||||
auto_context_summarization_config: LLMAutoContextSummarizationConfig | None = None
|
||||
add_tool_change_messages: bool = False
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Deprecated field names — kept for backward compatibility.
|
||||
# Use enable_auto_context_summarization and auto_context_summarization_config instead.
|
||||
#
|
||||
# .. deprecated:: 1.2.0
|
||||
# Use ``enable_auto_context_summarization`` and
|
||||
# ``auto_context_summarization_config`` instead. Will be removed in
|
||||
# version 2.0.0.
|
||||
# ---------------------------------------------------------------------------
|
||||
enable_context_summarization: bool | None = None
|
||||
context_summarization_config: LLMContextSummarizationConfig | None = None
|
||||
@@ -248,20 +322,87 @@ class LLMContextAggregator(FrameProcessor):
|
||||
common functionality for context-based conversation management.
|
||||
"""
|
||||
|
||||
def __init__(self, *, context: LLMContext, role: str, **kwargs):
|
||||
# Developer-role messages appended to the context when tools are added/
|
||||
# removed via ``LLMSetToolsFrame`` (only when ``add_tool_change_messages``
|
||||
# is enabled on the aggregator's params). ``{function_names}`` is
|
||||
# substituted with a sorted, comma-separated, backtick-wrapped list.
|
||||
TOOL_ACTIVATION_MESSAGE_TEMPLATE = (
|
||||
"The following function(s) have just been added and may now be called: "
|
||||
"{function_names}. Any previously available functions remain available."
|
||||
)
|
||||
TOOL_DEACTIVATION_MESSAGE_TEMPLATE = (
|
||||
"The following function(s) have just been removed and should not be called: "
|
||||
"{function_names}. Any previously available functions remain available. "
|
||||
"The removed function(s) may become available again later, in which case "
|
||||
"you will be informed."
|
||||
)
|
||||
|
||||
def __init__(
    self,
    *,
    context: LLMContext,
    role: str,
    add_tool_change_messages: bool = False,
    **kwargs,
):
    """Set up the shared state of a context aggregator.

    Args:
        context: The LLM context used for conversation storage.
        role: The role this aggregator represents (e.g. "user", "assistant").
        add_tool_change_messages: Whether tool add/remove announcements
            should be appended to the context; see the field of the same
            name on the aggregator-specific params dataclasses. Subclasses
            forward it from their ``params``.
        **kwargs: Additional arguments passed to the parent class.
    """
    super().__init__(**kwargs)

    # Pending text parts, not yet written to the context.
    self._aggregation: list[TextPartForConcatenation] = []

    self._add_tool_change_messages = add_tool_change_messages
    self._role = role
    self._context = context
|
||||
|
||||
def _maybe_add_tool_change_messages(self, new_tools: ToolsSchema | NotGiven) -> None:
    """Announce tool additions/removals to the LLM via a developer message.

    Does nothing unless ``add_tool_change_messages`` was enabled on this
    aggregator, and does nothing when the standard-tool names in
    ``new_tools`` match those currently on the context. Custom
    (LLM-specific) tools are not part of the comparison.

    Both aggregators call this for every ``LLMSetToolsFrame`` they handle,
    before applying the new tools. The first one to see the frame computes a
    real diff against the shared context and adds the announcement; once the
    context reflects the new tools, the other aggregator's diff is empty, so
    no duplicate is added. This holds regardless of which aggregator handles
    the frame first.

    Args:
        new_tools: The tools about to be applied to the context (or the
            not-given sentinel, treated as "no tools").
    """
    if not self._add_tool_change_messages:
        return

    def _standard_tool_names(tools: ToolsSchema | NotGiven) -> set[str]:
        # Not-given tools contribute no names.
        return {tool.name for tool in tools.standard_tools} if is_given(tools) else set()

    def _render(tool_names: set[str]) -> str:
        # Sorted, comma-separated, backtick-wrapped.
        return ", ".join(f"`{n}`" for n in sorted(tool_names))

    previous = _standard_tool_names(self._context.tools)
    upcoming = _standard_tool_names(new_tools)
    added = upcoming - previous
    removed = previous - upcoming

    announcements: list[str] = []
    if added:
        announcements.append(
            self.TOOL_ACTIVATION_MESSAGE_TEMPLATE.format(function_names=_render(added))
        )
    if removed:
        announcements.append(
            self.TOOL_DEACTIVATION_MESSAGE_TEMPLATE.format(function_names=_render(removed))
        )

    # Nothing added and nothing removed: leave the context untouched.
    if not announcements:
        return

    self._context.add_message({"role": "developer", "content": " ".join(announcements)})
|
||||
|
||||
@property
|
||||
def messages(self) -> list[LLMContextMessage]:
|
||||
"""Get messages from the LLM context.
|
||||
@@ -434,20 +575,46 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
params: Configuration parameters for aggregation behavior.
|
||||
**kwargs: Additional arguments.
|
||||
"""
|
||||
super().__init__(context=context, role="user", **kwargs)
|
||||
self._params = params or LLMUserAggregatorParams()
|
||||
params = params or LLMUserAggregatorParams()
|
||||
super().__init__(
|
||||
context=context,
|
||||
role="user",
|
||||
add_tool_change_messages=params.add_tool_change_messages,
|
||||
**kwargs,
|
||||
)
|
||||
self._params = params
|
||||
|
||||
self._register_event_handler("on_user_turn_started")
|
||||
self._register_event_handler("on_user_turn_stopped")
|
||||
self._register_event_handler("on_user_turn_stop_timeout")
|
||||
self._register_event_handler("on_user_turn_idle")
|
||||
self._register_event_handler("on_user_turn_inference_triggered")
|
||||
self._register_event_handler("on_user_mute_started")
|
||||
self._register_event_handler("on_user_mute_stopped")
|
||||
|
||||
user_turn_strategies = self._params.user_turn_strategies or UserTurnStrategies()
|
||||
|
||||
# Deprecated path: translate filter_incomplete_user_turns into
|
||||
# the equivalent FilterIncompleteUserTurnStrategies wiring. The
|
||||
# DeprecationWarning is emitted in LLMUserAggregatorParams.__post_init__.
|
||||
if self._params.filter_incomplete_user_turns:
|
||||
user_turn_strategies = FilterIncompleteUserTurnStrategies(
|
||||
start=user_turn_strategies.start,
|
||||
stop=user_turn_strategies.stop,
|
||||
config=self._params.user_turn_completion_config,
|
||||
)
|
||||
self._params.user_turn_strategies = user_turn_strategies
|
||||
|
||||
self._user_is_muted = False
|
||||
self._user_turn_start_timestamp = ""
|
||||
# Full transcript across the user turn. Each
|
||||
# `_on_user_turn_inference_triggered` push captures only the
|
||||
# new segment since the previous push (push_aggregation resets
|
||||
# `_aggregation` after writing to context); we accumulate those
|
||||
# segments here so the eventual `on_user_turn_stopped` event
|
||||
# surfaces the full turn transcript even when several
|
||||
# inferences fire before finalization.
|
||||
self._full_user_turn_aggregation: str | None = None
|
||||
|
||||
self._user_turn_controller = UserTurnController(
|
||||
user_turn_strategies=user_turn_strategies,
|
||||
@@ -458,6 +625,9 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
self._user_turn_controller.add_event_handler(
|
||||
"on_user_turn_started", self._on_user_turn_started
|
||||
)
|
||||
self._user_turn_controller.add_event_handler(
|
||||
"on_user_turn_inference_triggered", self._on_user_turn_inference_triggered
|
||||
)
|
||||
self._user_turn_controller.add_event_handler(
|
||||
"on_user_turn_stopped", self._on_user_turn_stopped
|
||||
)
|
||||
@@ -536,6 +706,7 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
elif isinstance(frame, LLMMessagesTransformFrame):
|
||||
await self._handle_llm_messages_transform(frame)
|
||||
elif isinstance(frame, LLMSetToolsFrame):
|
||||
self._maybe_add_tool_change_messages(frame.tools)
|
||||
self.set_tools(frame.tools)
|
||||
# Push the LLMSetToolsFrame as well, since speech-to-speech LLM
|
||||
# services (like OpenAI Realtime) may need to know about tool
|
||||
@@ -575,21 +746,6 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
for s in self._params.user_mute_strategies:
|
||||
await s.setup(self.task_manager)
|
||||
|
||||
# Enable incomplete turn filtering on the LLM if configured
|
||||
if self._params.filter_incomplete_user_turns:
|
||||
# Get config or use defaults
|
||||
config = self._params.user_turn_completion_config or UserTurnCompletionConfig()
|
||||
|
||||
# Enable the feature on the LLM with config
|
||||
await self.push_frame(
|
||||
LLMUpdateSettingsFrame(
|
||||
delta=LLMSettings(
|
||||
filter_incomplete_user_turns=True,
|
||||
user_turn_completion_config=config,
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
async def _stop(self, frame: EndFrame):
|
||||
await self._maybe_emit_user_turn_stopped(on_session_end=True)
|
||||
await self._cleanup()
|
||||
@@ -729,6 +885,7 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
logger.debug(f"{self}: User started speaking (strategy: {strategy})")
|
||||
|
||||
self._user_turn_start_timestamp = time_now_iso8601()
|
||||
self._full_user_turn_aggregation = None
|
||||
|
||||
if params.enable_user_speaking_frames:
|
||||
await self.broadcast_frame(UserStartedSpeakingFrame)
|
||||
@@ -740,6 +897,30 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
|
||||
await self._call_event_handler("on_user_turn_started", strategy)
|
||||
|
||||
async def _on_user_turn_inference_triggered(
    self,
    controller: UserTurnController,
    strategy: BaseUserTurnStopStrategy,
):
    """Handle an inference trigger raised during an ongoing user turn.

    Pushes the current aggregation, folds the pushed segment into the
    turn-wide transcript, and then fires the public
    ``on_user_turn_inference_triggered`` event.

    Args:
        controller: The user turn controller that raised the event (unused).
        strategy: The stop strategy that triggered inference.
    """
    logger.debug(f"{self}: User turn inference triggered (strategy: {strategy})")

    # Pushing writes this segment of the user message to the context and
    # emits LLMContextFrame, which starts LLM inference. The segment is also
    # appended to `_full_user_turn_aggregation` so that, when several
    # inferences fire in one turn, earlier segments still reach the eventual
    # `on_user_turn_stopped` event.
    new_segment = await self.push_aggregation()
    if new_segment:
        so_far = self._full_user_turn_aggregation
        self._full_user_turn_aggregation = (
            f"{so_far} {new_segment}".strip() if so_far else new_segment
        )

    await self._call_event_handler("on_user_turn_inference_triggered", strategy)
|
||||
|
||||
async def _on_user_turn_stopped(
|
||||
self,
|
||||
controller: UserTurnController,
|
||||
@@ -774,15 +955,29 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
):
|
||||
"""Maybe emit user turn stopped event.
|
||||
|
||||
Earlier inference triggers in the same turn have already pushed
|
||||
their segments to the context and accumulated them into
|
||||
``self._full_user_turn_aggregation``. Any aggregation that
|
||||
arrived after the last inference trigger is flushed here so
|
||||
end-of-turn content is never lost from the public event.
|
||||
|
||||
Args:
|
||||
strategy: The strategy that triggered the turn stop.
|
||||
on_session_end: If True, only emit if there's unemitted content
|
||||
(avoids duplicate events when session ends).
|
||||
"""
|
||||
aggregation = await self.push_aggregation()
|
||||
if not on_session_end or aggregation:
|
||||
segment = await self.push_aggregation()
|
||||
full_aggregation = self._full_user_turn_aggregation
|
||||
self._full_user_turn_aggregation = None
|
||||
|
||||
if segment and full_aggregation:
|
||||
content = f"{full_aggregation} {segment}".strip()
|
||||
else:
|
||||
content = full_aggregation or segment
|
||||
|
||||
if not on_session_end or content:
|
||||
message = UserTurnStoppedMessage(
|
||||
content=aggregation, timestamp=self._user_turn_start_timestamp
|
||||
content=content, timestamp=self._user_turn_start_timestamp
|
||||
)
|
||||
await self._call_event_handler("on_user_turn_stopped", strategy, message)
|
||||
self._user_turn_start_timestamp = ""
|
||||
@@ -843,8 +1038,14 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
params: Configuration parameters for aggregation behavior.
|
||||
**kwargs: Additional arguments.
|
||||
"""
|
||||
super().__init__(context=context, role="assistant", **kwargs)
|
||||
self._params = params or LLMAssistantAggregatorParams()
|
||||
params = params or LLMAssistantAggregatorParams()
|
||||
super().__init__(
|
||||
context=context,
|
||||
role="assistant",
|
||||
add_tool_change_messages=params.add_tool_change_messages,
|
||||
**kwargs,
|
||||
)
|
||||
self._params = params
|
||||
|
||||
self._function_calls_in_progress: dict[str, FunctionCallInProgressFrame | None] = {}
|
||||
self._function_calls_image_results: dict[str, UserImageRawFrame] = {}
|
||||
@@ -927,13 +1128,15 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
await self._handle_end_or_cancel(frame)
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, LLMAssistantPushAggregationFrame):
|
||||
await self.push_aggregation()
|
||||
await self._handle_push_aggregation()
|
||||
elif isinstance(frame, LLMFullResponseStartFrame):
|
||||
await self._handle_llm_start(frame)
|
||||
elif isinstance(frame, LLMFullResponseEndFrame):
|
||||
await self._handle_llm_end(frame)
|
||||
elif isinstance(frame, TextFrame):
|
||||
await self._handle_text(frame)
|
||||
elif isinstance(frame, LLMMarkerFrame):
|
||||
await self._handle_marker_frame(frame)
|
||||
elif isinstance(frame, LLMThoughtStartFrame):
|
||||
await self._handle_thought_start(frame)
|
||||
elif isinstance(frame, LLMThoughtTextFrame):
|
||||
@@ -949,6 +1152,7 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
elif isinstance(frame, LLMMessagesTransformFrame):
|
||||
await self._handle_llm_messages_transform(frame)
|
||||
elif isinstance(frame, LLMSetToolsFrame):
|
||||
self._maybe_add_tool_change_messages(frame.tools)
|
||||
self.set_tools(frame.tools)
|
||||
elif isinstance(frame, LLMSetToolChoiceFrame):
|
||||
self.set_tool_choice(frame.tool_choice)
|
||||
@@ -1075,23 +1279,7 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
|
||||
is_async = not frame.cancel_on_interruption
|
||||
if is_async:
|
||||
self._context.add_message(
|
||||
{
|
||||
"role": "tool",
|
||||
"content": json.dumps(
|
||||
{
|
||||
"type": "async_tool",
|
||||
"status": "running",
|
||||
"tool_call_id": frame.tool_call_id,
|
||||
"description": "An asynchronous task associated with this tool_call_id has started running. "
|
||||
+ "Expect results to arrive later as developer messages that look roughly like this one (with 'type=async_tool' and a matching tool_call_id) but with a 'result' field. "
|
||||
+ "Note that there *may* be more than one result (i.e., a stream of results), but there doesn't have to be (there may be only one). "
|
||||
+ "The last result will come in a message with 'status=finished'.",
|
||||
}
|
||||
),
|
||||
"tool_call_id": frame.tool_call_id,
|
||||
}
|
||||
)
|
||||
self._context.add_message(async_tool_messages.build_started_message(frame.tool_call_id))
|
||||
else:
|
||||
self._context.add_message(
|
||||
{
|
||||
@@ -1204,19 +1392,7 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
|
||||
result = json.dumps(frame.result, ensure_ascii=False)
|
||||
self._context.add_message(
|
||||
{
|
||||
"role": "developer",
|
||||
"content": json.dumps(
|
||||
{
|
||||
"type": "async_tool",
|
||||
"tool_call_id": frame.tool_call_id,
|
||||
"status": "running",
|
||||
"description": "This is an intermediate result for the asynchronous task associated with this tool_call_id. "
|
||||
+ "The task is still running. More intermediate results may follow, or the next result may be the final one with 'status=finished'.",
|
||||
"result": result,
|
||||
}
|
||||
),
|
||||
}
|
||||
async_tool_messages.build_intermediate_result_message(frame.tool_call_id, result)
|
||||
)
|
||||
|
||||
async def _handle_function_call_finished(
|
||||
@@ -1237,19 +1413,7 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
# notified of the completed result instead of updating the IN_PROGRESS
|
||||
# tool message.
|
||||
self._context.add_message(
|
||||
{
|
||||
"role": "developer",
|
||||
"content": json.dumps(
|
||||
{
|
||||
"type": "async_tool",
|
||||
"tool_call_id": frame.tool_call_id,
|
||||
"status": "finished",
|
||||
"description": "This is the final result for the asynchronous task associated with this tool_call_id. "
|
||||
+ "The task has completed. No further results will arrive for this tool_call_id.",
|
||||
"result": result,
|
||||
}
|
||||
),
|
||||
}
|
||||
async_tool_messages.build_final_result_message(frame.tool_call_id, result)
|
||||
)
|
||||
else:
|
||||
self._update_function_call_result(frame.function_name, frame.tool_call_id, result)
|
||||
@@ -1309,6 +1473,17 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
async def _handle_llm_end(self, _: LLMFullResponseEndFrame):
|
||||
await self._trigger_assistant_turn_stopped()
|
||||
|
||||
async def _handle_push_aggregation(self):
    """Finish an assistant turn that has no surrounding LLM response cycle."""
    # TTSService emits LLMAssistantPushAggregationFrame when a
    # TTSSpeakFrame-driven utterance ends. No LLMFullResponseStartFrame
    # preceded it, so the turn-start timestamp was never set. Start a turn
    # here so that on_assistant_turn_stopped still fires for the greeting
    # text, as it did before LLMAssistantPushAggregationFrame existed.
    turn_is_open = bool(self._assistant_turn_start_timestamp)
    if not turn_is_open:
        await self._trigger_assistant_turn_started()
    await self._trigger_assistant_turn_stopped()
|
||||
|
||||
async def _handle_text(self, frame: TextFrame):
|
||||
# Skip TextFrame types not intended to build the assistant context
|
||||
if isinstance(frame, (TranscriptionFrame, TranslationFrame, InterimTranscriptionFrame)):
|
||||
@@ -1327,6 +1502,31 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
)
|
||||
)
|
||||
|
||||
async def _handle_marker_frame(self, frame: LLMMarkerFrame):
    """Record a marker either inside the running aggregation or on its own.

    Args:
        frame: The marker frame. Its ``append_to_context_immediately``
            flag selects between the two behaviors described below.
    """
    if not frame.append_to_context_immediately:
        # The marker belongs to an assistant response that is still being
        # aggregated. Adding it to the running aggregation lets
        # `push_aggregation` write marker + text as one context message
        # (e.g. the ✓ complete-turn signal prefixing the spoken response,
        # giving "✓ <response>" in context). Consumers still see clean
        # text because markers are stripped from the transcript via
        # `_maybe_strip_turn_completion_markers`.
        self._aggregation.append(
            TextPartForConcatenation(frame.marker, includes_inter_part_spaces=False)
        )
        return

    # Stand-alone marker: it is the whole assistant turn (e.g. the ○ / ◐
    # incomplete-turn signals, where the spoken response is suppressed and
    # the marker is the only artifact), so store it right away as its own
    # assistant message.
    self._context.add_message({"role": "assistant", "content": frame.marker})
    await self.push_context_frame()
    await self.push_frame(LLMContextAssistantTimestampFrame(timestamp=time_now_iso8601()))
|
||||
|
||||
async def _handle_thought_start(self, frame: LLMThoughtStartFrame):
|
||||
await self._reset_thought_aggregation()
|
||||
self._thought_append_to_context = frame.append_to_context
|
||||
@@ -1478,6 +1678,7 @@ class LLMContextAggregatorPair:
|
||||
*,
|
||||
user_params: LLMUserAggregatorParams | None = None,
|
||||
assistant_params: LLMAssistantAggregatorParams | None = None,
|
||||
add_tool_change_messages: bool | None = None,
|
||||
):
|
||||
"""Initialize the LLM context aggregator pair.
|
||||
|
||||
@@ -1485,9 +1686,22 @@ class LLMContextAggregatorPair:
|
||||
context: The context to be managed by the aggregators.
|
||||
user_params: Parameters for the user context aggregator.
|
||||
assistant_params: Parameters for the assistant context aggregator.
|
||||
add_tool_change_messages: When provided, sets the field of the
|
||||
same name on both ``user_params`` and ``assistant_params``,
|
||||
overriding any value already set on either. This is the
|
||||
preferred way to enable tool-change announcements: it ensures
|
||||
both aggregators participate, which makes the feature robust
|
||||
regardless of which aggregator handles a given
|
||||
``LLMSetToolsFrame``. The shared context guarantees the
|
||||
announcement is added exactly once (the second aggregator's
|
||||
diff is empty by the time it sees the frame). Leave as
|
||||
``None`` to respect per-params settings.
|
||||
"""
|
||||
user_params = user_params or LLMUserAggregatorParams()
|
||||
assistant_params = assistant_params or LLMAssistantAggregatorParams()
|
||||
if add_tool_change_messages is not None:
|
||||
user_params.add_tool_change_messages = add_tool_change_messages
|
||||
assistant_params.add_tool_change_messages = add_tool_change_messages
|
||||
self._user = LLMUserAggregator(context, params=user_params)
|
||||
self._assistant = LLMAssistantAggregator(context, params=assistant_params)
|
||||
|
||||
|
||||
@@ -16,10 +16,12 @@ from __future__ import annotations
|
||||
import asyncio
|
||||
import dataclasses
|
||||
import traceback
|
||||
from collections.abc import Awaitable, Callable, Coroutine
|
||||
import warnings
|
||||
from collections.abc import Awaitable, Callable
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Optional,
|
||||
)
|
||||
@@ -47,6 +49,9 @@ from pipecat.utils.asyncio.task_manager import BaseTaskManager
|
||||
from pipecat.utils.base_object import BaseObject
|
||||
from pipecat.utils.frame_queue import FrameQueue
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
|
||||
|
||||
class FrameDirection(Enum):
|
||||
"""Direction of frame flow in the processing pipeline.
|
||||
@@ -71,15 +76,45 @@ class FrameProcessorSetup:
|
||||
clock: The clock instance for timing operations.
|
||||
task_manager: The task manager for handling async operations.
|
||||
observer: Optional observer for monitoring frame processing events.
|
||||
tool_resources: Application-defined resources shared with processors
|
||||
for this pipeline run.
|
||||
pipeline_task: The :class:`PipelineTask` running this pipeline. Stored
|
||||
on each processor as ``self.pipeline_task`` so processors can
|
||||
reach task-scoped state (e.g. ``self.pipeline_task.app_resources``).
|
||||
tool_resources: Deprecated. :class:`PipelineTask` continues to populate
|
||||
this with ``app_resources`` so that custom :class:`FrameProcessor`
|
||||
subclasses whose ``setup()`` overrides read ``setup.tool_resources``
|
||||
keep working. New code should read
|
||||
``setup.pipeline_task.app_resources`` instead.
|
||||
|
||||
.. deprecated:: 1.2.0
|
||||
Reading this attribute emits a ``DeprecationWarning``. Read
|
||||
``setup.pipeline_task.app_resources`` instead.
|
||||
``tool_resources`` will be removed in a future version.
|
||||
"""
|
||||
|
||||
clock: BaseClock
|
||||
task_manager: BaseTaskManager
|
||||
observer: BaseObserver | None = None
|
||||
pipeline_task: PipelineTask | None = None
|
||||
tool_resources: Any = None
|
||||
|
||||
def __getattribute__(self, name: str) -> Any:
    """Return attribute ``name``, warning on reads of ``tool_resources``.

    Only reads are intercepted here; writes go through ``__setattr__``, so
    PipelineTask can keep populating the deprecated field for backwards
    compatibility without triggering the warning.

    Args:
        name: Name of the attribute being read.

    Returns:
        The attribute value as resolved by ``object.__getattribute__``.
    """
    value = object.__getattribute__(self, name)
    if name != "tool_resources" or value is None:
        # Either a regular attribute, or the deprecated field was never set.
        return value

    with warnings.catch_warnings():
        # Force the warning through for this call regardless of the
        # filters that are currently installed.
        warnings.simplefilter("always")
        warnings.warn(
            "`FrameProcessorSetup.tool_resources` is deprecated since 1.2.0; "
            "read `setup.pipeline_task.app_resources` instead.",
            DeprecationWarning,
            stacklevel=2,
        )
    return value
|
||||
|
||||
|
||||
class FrameProcessorQueue(asyncio.PriorityQueue):
|
||||
"""A priority queue for system frames and other frames.
|
||||
@@ -182,12 +217,12 @@ class FrameProcessor(BaseObject):
|
||||
# Clock
|
||||
self._clock: BaseClock | None = None
|
||||
|
||||
# Task Manager
|
||||
self._task_manager: BaseTaskManager | None = None
|
||||
|
||||
# Observer
|
||||
self._observer: BaseObserver | None = None
|
||||
|
||||
# Pipeline Task
|
||||
self._pipeline_task: PipelineTask | None = None
|
||||
|
||||
# Other properties
|
||||
self._enable_metrics = False
|
||||
self._enable_usage_metrics = False
|
||||
@@ -331,18 +366,20 @@ class FrameProcessor(BaseObject):
|
||||
return self._report_only_initial_ttfb
|
||||
|
||||
@property
|
||||
def task_manager(self) -> BaseTaskManager:
|
||||
"""Get the task manager for this processor.
|
||||
def pipeline_task(self) -> PipelineTask | None:
|
||||
"""Get the :class:`PipelineTask` this processor is running in.
|
||||
|
||||
Provides access to task-scoped state from inside a processor — most
|
||||
notably ``self.pipeline_task.app_resources`` for the application's
|
||||
shared bag of resources (DB handles, clients, feature flags, etc.).
|
||||
|
||||
Returns:
|
||||
The task manager instance.
|
||||
|
||||
Raises:
|
||||
Exception: If the task manager is not initialized.
|
||||
The :class:`PipelineTask` instance that set up this processor,
|
||||
or ``None`` if the processor has not yet been set up by one
|
||||
(for example, before the task has started, or when the processor
|
||||
was instantiated in isolation).
|
||||
"""
|
||||
if not self._task_manager:
|
||||
raise Exception(f"{self} TaskManager is still not initialized.")
|
||||
return self._task_manager
|
||||
return self._pipeline_task
|
||||
|
||||
def processors_with_metrics(self):
|
||||
"""Return processors that can generate metrics.
|
||||
@@ -457,50 +494,22 @@ class FrameProcessor(BaseObject):
|
||||
await self.stop_processing_metrics()
|
||||
await self.stop_text_aggregation_metrics()
|
||||
|
||||
def create_task(self, coroutine: Coroutine, name: str | None = None) -> asyncio.Task:
    """Start a task for ``coroutine`` through this processor's task manager.

    The task name is always prefixed with this processor's string form
    followed by ``::``. When no name is given, the coroutine's code name
    is used after the prefix.

    Args:
        coroutine: The coroutine the task will run.
        name: Optional task name to place after the prefix.

    Returns:
        The asyncio task created by the task manager.
    """
    name = f"{self}::{name}" if name else f"{self}::{coroutine.cr_code.co_name}"
    return self.task_manager.create_task(coroutine, name)
|
||||
|
||||
async def cancel_task(self, task: asyncio.Task, timeout: float | None = 1.0):
    """Cancel a task through this processor's task manager.

    A default timeout of 1 second is used to avoid potential freezes
    caused by libraries that swallow `asyncio.CancelledError`.

    Args:
        task: The task to cancel.
        timeout: Optional timeout for the cancellation.
    """
    await self.task_manager.cancel_task(task, timeout)
|
||||
|
||||
async def setup(self, setup: FrameProcessorSetup):
|
||||
"""Set up the processor with required components.
|
||||
|
||||
Args:
|
||||
setup: Configuration object containing setup parameters.
|
||||
"""
|
||||
await super().setup(setup.task_manager)
|
||||
self._clock = setup.clock
|
||||
self._task_manager = setup.task_manager
|
||||
self._observer = setup.observer
|
||||
self._pipeline_task = setup.pipeline_task
|
||||
|
||||
# Create processing tasks.
|
||||
self.__create_input_task()
|
||||
|
||||
if self._metrics is not None:
|
||||
await self._metrics.setup(self._task_manager)
|
||||
await self._metrics.setup(self.task_manager)
|
||||
|
||||
async def cleanup(self):
|
||||
"""Clean up processor resources."""
|
||||
@@ -822,14 +831,19 @@ class FrameProcessor(BaseObject):
|
||||
current_is_uninterruptible = isinstance(
|
||||
self.__process_current_frame, UninterruptibleFrame
|
||||
)
|
||||
if current_is_uninterruptible or self.__process_queue.has_uninterruptible:
|
||||
# We don't want to cancel an UninterruptibleFrame (either the
|
||||
# one currently being processed or one waiting in the queue),
|
||||
# so we simply cleanup the queue keeping only
|
||||
# UninterruptibleFrames.
|
||||
if current_is_uninterruptible:
|
||||
# The frame currently being processed is uninterruptible, so we
|
||||
# must not cancel it. Just flush non-uninterruptible frames from
|
||||
# the queue; any uninterruptible ones will be kept and processed
|
||||
# after the current frame finishes.
|
||||
self.__reset_process_queue()
|
||||
else:
|
||||
# Cancel and re-create the process task.
|
||||
# Cancel and re-create the process task. Previously this branch
|
||||
# was skipped when the queue contained an uninterruptible frame,
|
||||
# which caused slow non-uninterruptible frames to block
|
||||
# interruptions. Uninterruptible queued frames are safe here
|
||||
# because __create_process_task calls __reset_process_queue
|
||||
# internally, which always preserves them.
|
||||
await self.__cancel_process_task()
|
||||
self.__create_process_task()
|
||||
except Exception as e:
|
||||
|
||||
@@ -67,9 +67,20 @@ class LangchainProcessor(FrameProcessor):
|
||||
# The last one by the human is the one we want to send to the LLM.
|
||||
logger.debug(f"Got transcription frame {frame}")
|
||||
messages = frame.context.get_messages()
|
||||
text: str = messages[-1]["content"]
|
||||
# Historically this processor has only handled plain-text user
|
||||
# messages; the guards below make that contract explicit for the
|
||||
# type checker. TODO: maybe handle other message shapes (provider-
|
||||
# specific messages, multi-modal content lists, etc.).
|
||||
last_message = messages[-1] if messages else None
|
||||
if not isinstance(last_message, dict):
|
||||
await self.push_frame(frame, direction)
|
||||
return
|
||||
content = last_message.get("content")
|
||||
if not isinstance(content, str):
|
||||
await self.push_frame(frame, direction)
|
||||
return
|
||||
|
||||
await self._ainvoke(text.strip())
|
||||
await self._ainvoke(content.strip())
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
@@ -87,7 +98,10 @@ class LangchainProcessor(FrameProcessor):
|
||||
case str():
|
||||
return text
|
||||
case AIMessageChunk():
|
||||
return text.content
|
||||
# `content` is `str | list[...]` (multi-modal); stringify if
|
||||
# it's a list, since downstream consumers want plain text.
|
||||
content = text.content
|
||||
return content if isinstance(content, str) else str(content)
|
||||
case _:
|
||||
return ""
|
||||
|
||||
|
||||
@@ -10,6 +10,11 @@ from pipecat.processors.frameworks.rtvi.frames import (
|
||||
RTVIClientMessageFrame,
|
||||
RTVIServerMessageFrame,
|
||||
RTVIServerResponseFrame,
|
||||
RTVIUICancelTaskFrame,
|
||||
RTVIUICommandFrame,
|
||||
RTVIUIEventFrame,
|
||||
RTVIUISnapshotFrame,
|
||||
RTVIUITaskFrame,
|
||||
)
|
||||
from pipecat.processors.frameworks.rtvi.observer import (
|
||||
RTVIFunctionCallReportLevel,
|
||||
@@ -26,4 +31,9 @@ __all__ = [
|
||||
"RTVIProcessor",
|
||||
"RTVIServerMessageFrame",
|
||||
"RTVIServerResponseFrame",
|
||||
"RTVIUICancelTaskFrame",
|
||||
"RTVIUICommandFrame",
|
||||
"RTVIUIEventFrame",
|
||||
"RTVIUISnapshotFrame",
|
||||
"RTVIUITaskFrame",
|
||||
]
|
||||
|
||||
@@ -10,6 +10,7 @@ from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
from pipecat.frames.frames import SystemFrame
|
||||
from pipecat.processors.frameworks.rtvi.models import UITaskData
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -27,6 +28,132 @@ class RTVIServerMessageFrame(SystemFrame):
|
||||
return f"{self.name}(data: {self.data})"
|
||||
|
||||
|
||||
@dataclass
class RTVIUICommandFrame(SystemFrame):
    """Frame that carries a UI command destined for the client.

    This is the pipeline-side form of the ``ui-command`` RTVI message.
    Before the message reaches the transport, the observer wraps
    ``command`` and ``payload`` in a ``UICommandMessage`` envelope, giving
    the wire shape
    ``{label, type: "ui-command", data: {command, payload}}``.

    Parameters:
        command: Application-defined command name (for example
            ``"toast"`` or ``"navigate"``).
        payload: Application-defined payload. Pydantic command models
            (``Toast``, ``Navigate``, ``ScrollTo``, ...) should be turned
            into a plain dict with ``model_dump()`` before being stored
            here; any other dict is accepted as well.
    """

    command: str = ""
    payload: Any = None

    def __str__(self):
        """Return the frame name followed by the command it carries."""
        return f"{self.name}(command: {self.command})"
|
||||
|
||||
|
||||
@dataclass
class RTVIUITaskFrame(SystemFrame):
    """Frame that carries a UI task lifecycle envelope for the client.

    This is the pipeline-side form of the ``ui-task`` RTVI message. Before
    the message reaches the transport, the observer wraps ``data`` in a
    ``UITaskMessage`` envelope, giving the wire shape
    ``{label, type: "ui-task", data: <one of the four kinds>}``.

    Parameters:
        data: One of the four task-lifecycle data models from
            ``rtvi.models`` (``UITaskGroupStartedData``,
            ``UITaskUpdateData``, ``UITaskCompletedData``, or
            ``UITaskGroupCompletedData``). Each model's ``kind`` field
            identifies the lifecycle phase.
    """

    data: UITaskData | None = None

    def __str__(self):
        """Return the frame name followed by the task kind (``?`` if absent)."""
        kind = getattr(self.data, "kind", "?")
        return f"{self.name}(kind: {kind})"
|
||||
|
||||
|
||||
@dataclass
class RTVIUIEventFrame(SystemFrame):
    """Frame for a UI event received from the client.

    ``RTVIProcessor`` pushes this frame downstream each time a
    ``ui-event`` message arrives, in addition to firing the
    ``on_ui_message`` event handler. This follows the same
    frame-and-event pattern as ``client-message``: observers and
    processors that react at the pipeline level can match on the frame,
    while code that subscribes to events (such as the bridge in
    ``pipecat-ai-subagents``) keeps using the handler.

    Parameters:
        msg_id: RTVI message id assigned by the client.
        event: Application-defined event (the ``data.event`` field).
        payload: Application-defined payload (the ``data.payload`` field).
    """

    msg_id: str = ""
    event: str = ""
    payload: Any = None

    def __str__(self):
        """Return the frame name followed by the event it carries."""
        return f"{self.name}(event: {self.event})"
|
||||
|
||||
|
||||
@dataclass
class RTVIUISnapshotFrame(SystemFrame):
    """Frame for an accessibility snapshot received from the client.

    ``RTVIProcessor`` pushes this frame downstream each time a
    ``ui-snapshot`` message arrives, in addition to firing
    ``on_ui_message``. It holds the serialized accessibility tree that
    the client captured from its DOM.

    Parameters:
        msg_id: RTVI message id assigned by the client.
        tree: The serialized accessibility tree.
    """

    msg_id: str = ""
    tree: Any = None

    def __str__(self):
        """Return the frame name."""
        return f"{self.name}"
|
||||
|
||||
|
||||
@dataclass
class RTVIUICancelTaskFrame(SystemFrame):
    """Frame for a client request to cancel a user task group.

    ``RTVIProcessor`` pushes this frame downstream each time a
    ``ui-cancel-task`` message arrives, in addition to firing
    ``on_ui_message``. The server-side framework is expected to find the
    matching task group and cancel it, subject to the cancellable policy
    the group was registered with.

    Parameters:
        msg_id: RTVI message id assigned by the client.
        task_id: Id of the task group the client wants cancelled.
        reason: Optional human-readable reason.
    """

    msg_id: str = ""
    task_id: str = ""
    reason: str | None = None

    def __str__(self):
        """Return the frame name followed by the targeted task id."""
        return f"{self.name}(task_id: {self.task_id})"
|
||||
|
||||
|
||||
@dataclass
|
||||
class RTVIClientMessageFrame(SystemFrame):
|
||||
"""A frame for sending messages from the client to the RTVI server.
|
||||
|
||||
@@ -20,14 +20,14 @@ from typing import (
|
||||
Literal,
|
||||
)
|
||||
|
||||
from pydantic import BaseModel
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
AggregationType,
|
||||
)
|
||||
|
||||
# -- Constants --
|
||||
PROTOCOL_VERSION = "1.2.0"
|
||||
PROTOCOL_VERSION = "1.3.0"
|
||||
|
||||
MESSAGE_LABEL = "rtvi-ai"
|
||||
MessageLiteral = Literal["rtvi-ai"]
|
||||
@@ -549,3 +549,474 @@ class SystemLogMessage(BaseModel):
|
||||
label: MessageLiteral = MESSAGE_LABEL
|
||||
type: Literal["system-log"] = "system-log"
|
||||
data: TextMessageData
|
||||
|
||||
|
||||
# -- UI Agent Protocol -------------------------------------------------------
|
||||
#
|
||||
# A structured RTVI message vocabulary that lets server-side AI agents
|
||||
# observe and drive a GUI app on the client side. The protocol covers
|
||||
# five first-class RTVI message types:
|
||||
#
|
||||
# ui-event client-to-server event message
|
||||
# ui-command server-to-client command message
|
||||
# ui-snapshot client-to-server accessibility snapshot
|
||||
# ui-cancel-task client-to-server cancellation request
|
||||
# ui-task server-to-client task lifecycle envelope
|
||||
#
|
||||
# This section is data only (constants and payload models, no
|
||||
# behavior). Higher-level frameworks like ``pipecat-ai-subagents``
|
||||
# build the agent abstractions on top, and single-LLM Pipecat apps can
|
||||
# target the same wire format directly via custom tools that emit
|
||||
# typed RTVI messages with these types. The matching client-side
|
||||
# implementation lives in ``@pipecat-ai/client-js`` and
|
||||
# ``@pipecat-ai/client-react``.
|
||||
|
||||
# The wire-format ``type`` strings (``"ui-event"``, ``"ui-command"``,
|
||||
# ``"ui-snapshot"``, ``"ui-cancel-task"``, ``"ui-task"``) are pinned
|
||||
# as ``Literal[...]`` field defaults on the corresponding ``*Message``
|
||||
# pydantic class below, matching the convention used for every other
|
||||
# RTVI message type in this module.
|
||||
|
||||
# Each ``ui-task`` envelope carries a ``kind`` field that the client's
|
||||
# task reducer dispatches on. The four kinds form the lifecycle of a
|
||||
# user-facing task group:
|
||||
#
|
||||
# group_started → task_update* → task_completed × N → group_completed
|
||||
#
|
||||
# where N is the number of workers in the group. The kind strings are
|
||||
# pinned as ``Literal[...]`` defaults on the matching ``UITask*Data``
|
||||
# class below.
|
||||
|
||||
|
||||
# -- UI envelope data classes --
|
||||
|
||||
|
||||
class UIEventData(BaseModel):
    """Payload carried in the ``data`` field of a ``ui-event`` message.

    Parameters:
        event: Event identifier chosen by the application.
        payload: Optional application-defined payload; intentionally
            left without a schema.
    """

    event: str
    payload: Any | None = None
|
||||
|
||||
|
||||
class UICommandData(BaseModel):
    """Payload carried in the ``data`` field of a ``ui-command`` message.

    Parameters:
        command: Command identifier chosen by the application.
        payload: Optional application-defined payload, already a plain
            dict by the time it goes on the wire. The standard command
            payload models defined later in this module produce the
            right shape via ``model_dump()``.
    """

    command: str
    payload: Any | None = None
|
||||
|
||||
|
||||
class A11yNode(BaseModel):
    """A single node of the UI accessibility snapshot tree.

    Follows the client-side ``A11yNode`` wire shape. Unknown fields are
    accepted so that clients can send platform-specific or future
    metadata without breaking older servers.

    Parameters:
        ref: Stable element reference assigned by the client.
        role: ARIA-style role of the node.
        name: Accessible name, if any.
        value: Current value for inputs, progress indicators, etc., if any.
        state: Short state tags such as ``"focused"``, ``"disabled"`` or
            ``"offscreen"``, if any.
        level: Heading level, if any.
        colcount: Column count for grid-like containers, if any.
        rowcount: Row count for grid-like containers, if any.
        children: Child nodes, if any.
    """

    model_config = ConfigDict(extra="allow")

    ref: str
    role: str
    name: str | None = None
    value: str | None = None
    state: list[str] | None = None
    level: int | None = None
    colcount: int | None = None
    rowcount: int | None = None
    children: list["A11yNode"] | None = None
|
||||
|
||||
|
||||
class A11ySelection(BaseModel):
    """Text currently selected by the user, as reported in a UI snapshot.

    Unknown fields are accepted so that later additions to the client
    snapshot remain compatible.

    Parameters:
        ref: Ref of the element that holds the selection.
        text: The selected text.
        start_offset: Selection start offset, if provided.
        end_offset: Selection end offset, if provided.
    """

    model_config = ConfigDict(extra="allow")

    ref: str
    text: str
    start_offset: int | None = None
    end_offset: int | None = None
|
||||
|
||||
|
||||
class A11ySnapshot(BaseModel):
    """Accessibility snapshot the client sends in a ``ui-snapshot`` message.

    Follows the client-side ``A11ySnapshot`` wire shape. Unknown fields
    are accepted so that clients can add compatible metadata over time.

    Parameters:
        root: Root node of the accessibility tree.
        captured_at: Client-side capture time in epoch milliseconds.
        selection: The current text selection, if any.
    """

    model_config = ConfigDict(extra="allow")

    root: A11yNode
    captured_at: int
    selection: A11ySelection | None = None
|
||||
|
||||
|
||||
class UISnapshotData(BaseModel):
    """Payload carried in the ``data`` field of a ``ui-snapshot`` message.

    The snapshot tree follows the client-side ``A11ySnapshot`` wire shape.
    It stays forward-compatible because the snapshot models accept extra
    fields.

    Parameters:
        tree: The serialized accessibility tree.
    """

    tree: A11ySnapshot
|
||||
|
||||
|
||||
class UICancelTaskData(BaseModel):
    """Payload carried in the ``data`` field of a ``ui-cancel-task`` message.

    Parameters:
        task_id: Id of the task group the client wants cancelled.
        reason: Optional human-readable reason.
    """

    task_id: str
    reason: str | None = None
|
||||
|
||||
|
||||
class UITaskGroupStartedData(BaseModel):
    """Payload of a ``ui-task`` envelope whose kind is ``group_started``.

    Parameters:
        kind: Always ``"group_started"``.
        task_id: Task identifier shared by the whole group.
        agents: Names of the agents the work was dispatched to.
        label: Optional human-readable label for the group.
        cancellable: Whether the client may request cancellation.
        at: Group start time in epoch milliseconds.
    """

    kind: Literal["group_started"] = "group_started"
    task_id: str
    agents: list[str] | None = None
    label: str | None = None
    cancellable: bool = True
    at: int = 0
|
||||
|
||||
|
||||
class UITaskUpdateData(BaseModel):
    """Payload of a ``ui-task`` envelope whose kind is ``task_update``.

    Parameters:
        kind: Always ``"task_update"``.
        task_id: The shared task identifier.
        agent_name: Name of the worker that produced the update.
        data: The worker's update payload, forwarded verbatim.
        at: Time the update was emitted, in epoch milliseconds.
    """

    kind: Literal["task_update"] = "task_update"
    task_id: str
    agent_name: str
    data: Any | None = None
    at: int = 0
|
||||
|
||||
|
||||
class UITaskCompletedData(BaseModel):
    """Payload of a ``ui-task`` envelope whose kind is ``task_completed``.

    Parameters:
        kind: Always ``"task_completed"``.
        task_id: The shared task identifier.
        agent_name: Name of the worker that produced the response.
        status: Completion status string.
        response: The worker's response payload.
        at: Time the response was received, in epoch milliseconds.
    """

    kind: Literal["task_completed"] = "task_completed"
    task_id: str
    agent_name: str
    status: str
    response: Any | None = None
    at: int = 0
|
||||
|
||||
|
||||
class UITaskGroupCompletedData(BaseModel):
    """Payload of a ``ui-task`` envelope whose kind is ``group_completed``.

    Parameters:
        kind: Always ``"group_completed"``.
        task_id: The shared task identifier.
        at: Group completion time in epoch milliseconds.
    """

    kind: Literal["group_completed"] = "group_completed"
    task_id: str
    at: int = 0
|
||||
|
||||
|
||||
#: Union of the four task-lifecycle data shapes; the ``kind`` field on
#: each model tells them apart.
UITaskData = (
    UITaskGroupStartedData
    | UITaskUpdateData
    | UITaskCompletedData
    | UITaskGroupCompletedData
)
|
||||
|
||||
|
||||
# -- UI envelope message classes --
|
||||
|
||||
|
||||
class UIEventMessage(BaseModel):
    """Envelope for the RTVI ``ui-event`` message (client → server)."""

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["ui-event"] = "ui-event"
    id: str
    data: UIEventData
|
||||
|
||||
|
||||
class UICommandMessage(BaseModel):
    """Envelope for the RTVI ``ui-command`` message (server → client)."""

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["ui-command"] = "ui-command"
    data: UICommandData
|
||||
|
||||
|
||||
class UISnapshotMessage(BaseModel):
    """Envelope for the RTVI ``ui-snapshot`` message (client → server)."""

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["ui-snapshot"] = "ui-snapshot"
    id: str
    data: UISnapshotData
|
||||
|
||||
|
||||
class UICancelTaskMessage(BaseModel):
    """Envelope for the RTVI ``ui-cancel-task`` message (client → server)."""

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["ui-cancel-task"] = "ui-cancel-task"
    id: str
    data: UICancelTaskData
|
||||
|
||||
|
||||
class UITaskMessage(BaseModel):
    """Envelope for the RTVI ``ui-task`` message (server → client).

    The ``data`` field holds one of the four task-lifecycle data models,
    which are told apart by their ``kind`` field.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["ui-task"] = "ui-task"
    data: UITaskData
|
||||
|
||||
|
||||
# -- UI command payloads --
|
||||
#
|
||||
# These models describe commands that have matching default React
|
||||
# handlers in ``@pipecat-ai/client-react``'s ``standardHandlers``.
|
||||
# Apps can use them as-is, override the client handler to customize
|
||||
# rendering, or ignore them entirely and define their own command
|
||||
# names.
|
||||
#
|
||||
# Server-side helpers that send commands accept these models directly.
|
||||
# ``BaseModel.model_dump()`` converts them to the plain-dict shape
|
||||
# that travels over the wire.
|
||||
|
||||
|
||||
class Toast(BaseModel):
    """Payload for a transient notification displayed on the client.

    Parameters:
        title: Headline text (required).
        subtitle: Optional second line shown under the title.
        description: Optional body text.
        image_url: Optional leading image.
        duration_ms: Optional dismiss timer. When None, the client's
            default applies.
    """

    title: str
    subtitle: str | None = None
    description: str | None = None
    image_url: str | None = None
    duration_ms: int | None = None
|
||||
|
||||
|
||||
class Navigate(BaseModel):
    """Payload asking the client to navigate to a named view.

    Parameters:
        view: Application-defined view name (route, screen id, tab key,
            etc.).
        params: Optional parameters specific to the view.
    """

    view: str
    params: dict | None = None
|
||||
|
||||
|
||||
class ScrollTo(BaseModel):
    """Bring a target element into view by scrolling.

    Target resolution on the client tries ``ref`` first (a snapshot ref
    such as ``"e42"`` assigned by the a11y walker) and then falls back
    to ``target_id`` (``document.getElementById``). Provide whichever
    one is available; when acting on a node taken from ``<ui_state>``,
    ``ref`` is the usual choice.

    Parameters:
        ref: Snapshot ref from ``<ui_state>``.
        target_id: Element id registered on the client.
        behavior: Scroll behavior hint, typically ``"smooth"`` or
            ``"instant"``. Clients are free to ignore it.
    """

    ref: str | None = None
    target_id: str | None = None
    behavior: str | None = None
|
||||
|
||||
|
||||
class Highlight(BaseModel):
    """Draw brief attention to a target element (flash, glow, pulse).

    Parameters:
        ref: Snapshot ref from ``<ui_state>``.
        target_id: Element id registered on the client.
        duration_ms: How long the highlight lasts. When None, the
            client's default applies.
    """

    ref: str | None = None
    target_id: str | None = None
    duration_ms: int | None = None
|
||||
|
||||
|
||||
class Focus(BaseModel):
    """Give input focus to a target element.

    Parameters:
        ref: Snapshot ref from ``<ui_state>``.
        target_id: Element id registered on the client.
    """

    ref: str | None = None
    target_id: str | None = None
|
||||
|
||||
|
||||
class Click(BaseModel):
    """Trigger a click on a client-side element.

    This completes the form-fill loop for inputs that are not text
    (checkboxes, radios) and covers the remaining action vocabulary:
    submit buttons, links, and app-specific clickable nodes. On
    ``disabled`` targets the standard handler silently does nothing, so
    the agent cannot bypass UI affordances that the user is meant to
    control.

    Native ``<select>`` elements are better served by ``SetInputValue``,
    because clicking options doesn't reliably change the selection.
    Custom comboboxes (ARIA listbox + popup) need an app-defined command
    that matches the library's interaction model.

    Parameters:
        ref: Snapshot ref from ``<ui_state>``.
        target_id: Element id registered on the client. Serves as the
            fallback when ``ref`` is not set or has gone stale.
    """

    ref: str | None = None
    target_id: str | None = None
|
||||
|
||||
|
||||
class SetInputValue(BaseModel):
    """Populate a text input or textarea on the client.

    Intended for form-filling: once the agent has decided what belongs
    in a field (clarifying answer, tax form entry, etc.), it asks the
    client to write it. ``replace=True`` (the default) overwrites the
    existing value, while ``replace=False`` appends to it.

    On ``disabled``, ``readonly``, and ``<input type="hidden">`` targets
    the standard handler silently does nothing, so the agent cannot
    write into fields the user can't.

    Parameters:
        value: The text to write.
        ref: Snapshot ref from ``<ui_state>``, typically that of an
            ``<input>`` or ``<textarea>``.
        target_id: Element id registered on the client. Serves as the
            fallback when ``ref`` is not set or has gone stale.
        replace: True (the default) overwrites the current value;
            False appends to it.
    """

    value: str = ""
    ref: str | None = None
    target_id: str | None = None
    replace: bool = True
|
||||
|
||||
|
||||
class SelectText(BaseModel):
    """Select text on the page to show the user what the agent means.

    Counterpart of the ``selection`` field surfaced in the snapshot.
    Once the agent has decided what it is referring to, this command
    directs the user's attention to that paragraph or range.

    When both ``start_offset`` and ``end_offset`` are omitted, the whole
    text content of the target is selected (``Range.selectNodeContents``
    for document elements; ``el.select()`` for ``<input>`` /
    ``<textarea>``).

    Parameters:
        ref: Snapshot ref from ``<ui_state>``, typically that of a
            paragraph or input element.
        target_id: Element id registered on the client. Serves as the
            fallback when ``ref`` is not set or has gone stale.
        start_offset: Character offset within the target's text at
            which the selection starts. For ``<input>`` and
            ``<textarea>`` it is an offset into the value; for document
            elements it is computed against the concatenation of
            descendant text nodes in document order.
        end_offset: Exclusive end character offset, in the same
            coordinate system as ``start_offset``.
    """

    ref: str | None = None
    target_id: str | None = None
    start_offset: int | None = None
    end_offset: int | None = None
|
||||
|
||||
@@ -58,6 +58,8 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.processors.frameworks.rtvi.frames import (
|
||||
RTVIServerMessageFrame,
|
||||
RTVIServerResponseFrame,
|
||||
RTVIUICommandFrame,
|
||||
RTVIUITaskFrame,
|
||||
)
|
||||
from pipecat.transports.base_output import BaseOutputTransport
|
||||
from pipecat.utils.string import match_endofsentence
|
||||
@@ -430,6 +432,15 @@ class RTVIObserver(BaseObserver):
|
||||
elif isinstance(frame, RTVIServerMessageFrame):
|
||||
message = RTVI.ServerMessage(data=frame.data)
|
||||
await self.send_rtvi_message(message)
|
||||
elif isinstance(frame, RTVIUICommandFrame):
|
||||
message = RTVI.UICommandMessage(
|
||||
data=RTVI.UICommandData(command=frame.command, payload=frame.payload)
|
||||
)
|
||||
await self.send_rtvi_message(message)
|
||||
elif isinstance(frame, RTVIUITaskFrame):
|
||||
if frame.data is not None:
|
||||
message = RTVI.UITaskMessage(data=frame.data)
|
||||
await self.send_rtvi_message(message)
|
||||
elif isinstance(frame, RTVIServerResponseFrame):
|
||||
if frame.error is not None:
|
||||
await self._send_error_response(frame)
|
||||
|
||||
@@ -32,7 +32,12 @@ from pipecat.frames.frames import (
|
||||
SystemFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.processors.frameworks.rtvi.frames import RTVIClientMessageFrame
|
||||
from pipecat.processors.frameworks.rtvi.frames import (
|
||||
RTVIClientMessageFrame,
|
||||
RTVIUICancelTaskFrame,
|
||||
RTVIUIEventFrame,
|
||||
RTVIUISnapshotFrame,
|
||||
)
|
||||
from pipecat.processors.frameworks.rtvi.observer import RTVIObserver, RTVIObserverParams
|
||||
from pipecat.services.llm_service import (
|
||||
FunctionCallParams, # TODO(aleix): we shouldn't import `services` from `processors`
|
||||
@@ -76,6 +81,7 @@ class RTVIProcessor(FrameProcessor):
|
||||
self._register_event_handler("on_bot_started")
|
||||
self._register_event_handler("on_client_ready")
|
||||
self._register_event_handler("on_client_message")
|
||||
self._register_event_handler("on_ui_message")
|
||||
|
||||
self._input_transport = None
|
||||
self._transport = transport
|
||||
@@ -102,7 +108,7 @@ class RTVIProcessor(FrameProcessor):
|
||||
self._client_ready = True
|
||||
await self._call_event_handler("on_client_ready")
|
||||
|
||||
async def set_bot_ready(self, about: Mapping[str, Any] = None):
|
||||
async def set_bot_ready(self, about: Mapping[str, Any] | None = None):
|
||||
"""Mark the bot as ready and send the bot-ready message.
|
||||
|
||||
Args:
|
||||
@@ -288,6 +294,41 @@ class RTVIProcessor(FrameProcessor):
|
||||
case "client-message":
|
||||
data = RTVI.RawClientMessageData.model_validate(message.data)
|
||||
await self._handle_client_message(message.id, data)
|
||||
case "ui-event":
|
||||
event_data = RTVI.UIEventData.model_validate(message.data or {})
|
||||
await self.push_frame(
|
||||
RTVIUIEventFrame(
|
||||
msg_id=message.id,
|
||||
event=event_data.event,
|
||||
payload=event_data.payload,
|
||||
)
|
||||
)
|
||||
await self._call_event_handler(
|
||||
"on_ui_message",
|
||||
RTVI.UIEventMessage(id=message.id, data=event_data),
|
||||
)
|
||||
case "ui-snapshot":
|
||||
snapshot_data = RTVI.UISnapshotData.model_validate(message.data or {})
|
||||
await self.push_frame(
|
||||
RTVIUISnapshotFrame(msg_id=message.id, tree=snapshot_data.tree)
|
||||
)
|
||||
await self._call_event_handler(
|
||||
"on_ui_message",
|
||||
RTVI.UISnapshotMessage(id=message.id, data=snapshot_data),
|
||||
)
|
||||
case "ui-cancel-task":
|
||||
cancel_data = RTVI.UICancelTaskData.model_validate(message.data or {})
|
||||
await self.push_frame(
|
||||
RTVIUICancelTaskFrame(
|
||||
msg_id=message.id,
|
||||
task_id=cancel_data.task_id,
|
||||
reason=cancel_data.reason,
|
||||
)
|
||||
)
|
||||
await self._call_event_handler(
|
||||
"on_ui_message",
|
||||
RTVI.UICancelTaskMessage(id=message.id, data=cancel_data),
|
||||
)
|
||||
case "llm-function-call-result":
|
||||
data = RTVI.LLMFunctionCallResultData.model_validate(message.data)
|
||||
await self._handle_function_call_result(data)
|
||||
@@ -404,7 +445,7 @@ class RTVIProcessor(FrameProcessor):
|
||||
)
|
||||
await self.push_frame(frame)
|
||||
|
||||
async def _send_bot_ready(self, about: Mapping[str, Any] = None):
|
||||
async def _send_bot_ready(self, about: Mapping[str, Any] | None = None):
|
||||
"""Send the bot-ready message to the client.
|
||||
|
||||
Args:
|
||||
|
||||
@@ -71,9 +71,15 @@ class StrandsAgentsProcessor(FrameProcessor):
|
||||
await super().process_frame(frame, direction)
|
||||
if isinstance(frame, LLMContextFrame):
|
||||
messages = frame.context.get_messages()
|
||||
if messages:
|
||||
last_message = messages[-1]
|
||||
await self._ainvoke(str(last_message["content"]).strip())
|
||||
# Historically this processor has only handled plain-text user
|
||||
# messages; the guards below make that contract explicit for the
|
||||
# type checker. TODO: handle other message shapes (provider-
|
||||
# specific messages, multi-modal content lists, etc.).
|
||||
last_message = messages[-1] if messages else None
|
||||
if isinstance(last_message, dict):
|
||||
content = last_message.get("content")
|
||||
if isinstance(content, str):
|
||||
await self._ainvoke(content.strip())
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
@@ -91,6 +97,9 @@ class StrandsAgentsProcessor(FrameProcessor):
|
||||
await self.start_ttfb_metrics()
|
||||
|
||||
if self.graph:
|
||||
# `__init__` asserts `graph_exit_node` is set whenever `graph`
|
||||
# is, so this can't be None here.
|
||||
assert self.graph_exit_node is not None
|
||||
# Graph does not stream; await full result then emit assistant text
|
||||
graph_result = await self.graph.invoke_async(text)
|
||||
if ttfb_tracking:
|
||||
@@ -115,6 +124,9 @@ class StrandsAgentsProcessor(FrameProcessor):
|
||||
except Exception as parse_err:
|
||||
logger.warning(f"Failed to extract messages from GraphResult: {parse_err}")
|
||||
else:
|
||||
# `__init__` asserts at least one of `agent`/`graph` is set,
|
||||
# and we're in the `not self.graph` branch.
|
||||
assert self.agent is not None
|
||||
# Agent supports streaming events via async iterator
|
||||
async for event in self.agent.stream_async(text):
|
||||
# Push to TTS service
|
||||
|
||||
@@ -27,7 +27,7 @@ try:
|
||||
|
||||
gi.require_version("Gst", "1.0")
|
||||
gi.require_version("GstApp", "1.0")
|
||||
from gi.repository import Gst, GstApp
|
||||
from gi.repository import Gst, GstApp # pyright: ignore[reportAttributeAccessIssue]
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error(
|
||||
|
||||
@@ -20,7 +20,6 @@ from pipecat.metrics.metrics import (
|
||||
TTFBMetricsData,
|
||||
TTSUsageMetricsData,
|
||||
)
|
||||
from pipecat.utils.asyncio.task_manager import BaseTaskManager
|
||||
from pipecat.utils.base_object import BaseObject
|
||||
|
||||
|
||||
@@ -40,36 +39,12 @@ class FrameProcessorMetrics(BaseObject):
|
||||
processing times, and usage statistics.
|
||||
"""
|
||||
super().__init__()
|
||||
self._task_manager = None
|
||||
self._start_ttfb_time = 0
|
||||
self._start_processing_time = 0
|
||||
self._start_text_aggregation_time = 0
|
||||
self._last_ttfb_time = 0
|
||||
self._should_report_ttfb = True
|
||||
|
||||
async def setup(self, task_manager: BaseTaskManager):
|
||||
"""Set up the metrics collector with a task manager.
|
||||
|
||||
Args:
|
||||
task_manager: The task manager for handling async operations.
|
||||
"""
|
||||
self._task_manager = task_manager
|
||||
|
||||
async def cleanup(self):
|
||||
"""Clean up metrics collection resources."""
|
||||
await super().cleanup()
|
||||
|
||||
@property
|
||||
def task_manager(self) -> BaseTaskManager:
|
||||
"""Get the associated task manager.
|
||||
|
||||
Returns:
|
||||
The task manager instance for async operations.
|
||||
"""
|
||||
if self._task_manager is None:
|
||||
raise RuntimeError("task_manager not set; call setup() first")
|
||||
return self._task_manager
|
||||
|
||||
@property
|
||||
def ttfb(self) -> float | None:
|
||||
"""Get the current TTFB value in seconds.
|
||||
|
||||
@@ -19,6 +19,10 @@ All bots must implement a `bot(runner_args)` async function as the entry point.
|
||||
The server automatically discovers and executes this function when connections
|
||||
are established.
|
||||
|
||||
By default the runner starts a single FastAPI server that supports WebRTC, Daily,
|
||||
and telephony transports simultaneously. Clients declare which transport they want
|
||||
via the ``transport`` field in the ``/start`` request body (default: ``"webrtc"``).
|
||||
|
||||
Single transport example::
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
@@ -55,14 +59,33 @@ Supported transports:
|
||||
- WebRTC - Provides local WebRTC interface with prebuilt UI
|
||||
- Telephony - Handles webhook and WebSocket connections for Twilio, Telnyx, Plivo, Exotel
|
||||
|
||||
The ``/start`` endpoint accepts::
|
||||
|
||||
{
|
||||
"transport": "webrtc", // "webrtc" | "daily" | "twilio" | "telnyx" |
|
||||
// "plivo" | "exotel" — default: "webrtc"
|
||||
|
||||
// WebRTC-specific
|
||||
"enableDefaultIceServers": false,
|
||||
"body": {...},
|
||||
|
||||
// Daily-specific
|
||||
"createDailyRoom": true,
|
||||
"dailyRoomProperties": {...},
|
||||
"dailyMeetingTokenProperties": {...},
|
||||
"body": {...}
|
||||
}
|
||||
|
||||
To run locally:
|
||||
|
||||
- WebRTC: `python bot.py -t webrtc`
|
||||
- ESP32: `python bot.py -t webrtc --esp32 --host 192.168.1.100`
|
||||
- Daily (server): `python bot.py -t daily`
|
||||
- Daily (direct, testing only): `python bot.py -d`
|
||||
- Telephony: `python bot.py -t twilio -x your_username.ngrok.io`
|
||||
- Exotel: `python bot.py -t exotel` (no proxy needed, but ngrok connection to HTTP 7860 is required)
|
||||
- All transports (default): ``python bot.py``
|
||||
- WebRTC only: ``python bot.py -t webrtc``
|
||||
- ESP32: ``python bot.py -t webrtc --esp32 --host 192.168.1.100``
|
||||
- Daily only: ``python bot.py -t daily``
|
||||
- Daily (direct, testing only): ``python bot.py -d``
|
||||
- Telephony: ``python bot.py -t twilio -x your_username.ngrok.io``
|
||||
- Exotel: ``python bot.py -t exotel`` (no proxy needed, but ngrok connection to HTTP 7860 is required)
|
||||
- WhatsApp: ``python bot.py --whatsapp``
|
||||
"""
|
||||
|
||||
import argparse
|
||||
@@ -186,8 +209,33 @@ async def _run_telephony_bot(websocket: WebSocket, args: argparse.Namespace):
|
||||
await bot_module.bot(runner_args)
|
||||
|
||||
|
||||
async def _run_websocket_bot(websocket: WebSocket, args: argparse.Namespace):
    """Run the bot entry point over a plain WebSocket connection.

    Args:
        websocket: The client WebSocket handed to the bot.
        args: Parsed command-line arguments, attached to the runner
            arguments as ``cli_args``.
    """
    module = _get_bot_module()

    # Each connection gets its own freshly generated session id.
    new_session_id = str(uuid.uuid4())
    ws_runner_args = WebSocketRunnerArguments(
        websocket=websocket,
        transport_type="websocket",
        session_id=new_session_id,
    )
    ws_runner_args.cli_args = args

    await module.bot(ws_runner_args)
|
||||
|
||||
|
||||
def _setup_websocket_routes(app: FastAPI, args: argparse.Namespace):
    """Register the plain WebSocket endpoint at ``/ws-client``.

    Args:
        app: Application the route is added to.
        args: Parsed command-line arguments forwarded to the bot runner.
    """

    async def websocket_client_endpoint(websocket: WebSocket):
        """Accept a plain (non-telephony) WebSocket and run the bot on it."""
        await websocket.accept()
        logger.debug("Plain WebSocket connection accepted")
        await _run_websocket_bot(websocket, args)

    # Explicit registration; equivalent to decorating the handler above.
    app.websocket("/ws-client")(websocket_client_endpoint)
|
||||
|
||||
|
||||
def _configure_server_app(args: argparse.Namespace):
|
||||
"""Configure the module-level FastAPI app with transport-specific routes."""
|
||||
"""Configure the module-level FastAPI app with routes for all transports."""
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
@@ -196,24 +244,232 @@ def _configure_server_app(args: argparse.Namespace):
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
# Set up transport-specific routes
|
||||
if args.transport == "webrtc":
|
||||
_setup_webrtc_routes(app, args)
|
||||
if args.whatsapp:
|
||||
_setup_whatsapp_routes(app, args)
|
||||
elif args.transport == "daily":
|
||||
_setup_daily_routes(app, args)
|
||||
elif args.transport in TELEPHONY_TRANSPORTS:
|
||||
_setup_telephony_routes(app, args)
|
||||
else:
|
||||
logger.warning(f"Unknown transport type: {args.transport}")
|
||||
# Shared session store: session_id -> body data. Used by the WebRTC /start
|
||||
# flow and the /sessions/{session_id}/... proxy routes.
|
||||
active_sessions: dict[str, dict[str, Any]] = {}
|
||||
|
||||
_setup_frontend_routes(app)
|
||||
_setup_webrtc_routes(app, args, active_sessions)
|
||||
_setup_daily_routes(app, args)
|
||||
_setup_telephony_routes(app, args)
|
||||
_setup_websocket_routes(app, args)
|
||||
_setup_unified_start_route(app, args, active_sessions)
|
||||
|
||||
if args.whatsapp:
|
||||
_setup_whatsapp_routes(app, args)
|
||||
|
||||
|
||||
def _setup_webrtc_routes(app: FastAPI, args: argparse.Namespace):
|
||||
def _setup_unified_start_route(
    app: FastAPI, args: argparse.Namespace, active_sessions: dict[str, dict[str, Any]]
):
    """Register the unified POST /start and GET /status endpoints.

    Handles WebRTC, Daily, telephony, and plain WebSocket start flows. Clients
    specify which transport they want via the ``transport`` field in the
    request body. When ``-t`` was passed on the command line, requests for any
    other transport are rejected with HTTP 400.

    Args:
        app: Application the routes are added to.
        args: Parsed command-line arguments. ``transport``, ``host`` and
            ``port`` are read here, and the namespace is attached to the
            runner arguments of bots started from ``/start``.
        active_sessions: Shared ``session_id -> body`` store. The WebRTC
            start flow records each new session in it.
    """
    ALL_TRANSPORTS = ["webrtc", "daily", *TELEPHONY_TRANSPORTS, "websocket"]

    # The event loop only keeps weak references to tasks, so a bot task whose
    # handle is dropped may be garbage-collected before it finishes. Hold a
    # strong reference here until each task completes.
    background_tasks: set[asyncio.Task] = set()

    @app.get("/status")
    async def status():
        """Return the transports supported by this runner instance."""
        transports = [args.transport] if args.transport is not None else ALL_TRANSPORTS
        return {"status": "ready", "transports": transports}

    class IceServer(TypedDict, total=False):
        urls: str | list[str]

    class IceConfig(TypedDict):
        iceServers: list[IceServer]

    # Response shape of POST /start; each transport fills in only its own keys.
    class StartBotResult(TypedDict, total=False):
        sessionId: str
        iceConfig: IceConfig | None
        dailyRoom: str | None
        dailyToken: str | None
        wsUrl: str | None
        token: str | None

    @app.post("/start")
    async def start_agent(request: Request):
        """Start a bot session.

        Accepts::

            {
                "transport": "webrtc",  // "webrtc" | "daily" | "twilio" | "telnyx" |
                                        // "plivo" | "exotel" | "websocket"
                                        // default: "webrtc"

                // WebRTC-specific
                "enableDefaultIceServers": false,
                "body": {...},

                // Daily-specific
                "createDailyRoom": true,
                "dailyRoomProperties": {...},
                "dailyMeetingTokenProperties": {...},
                "body": {...}
            }

        A body that is missing, malformed, or not a JSON object is treated as
        an empty request.

        Raises:
            HTTPException: 400 when the requested transport is unknown or is
                not the one the server was restricted to with ``-t``.
        """
        try:
            request_data = await request.json()
            logger.debug(f"Received request: {request_data}")
        except Exception as e:
            logger.error(f"Failed to parse request body: {e}")
            request_data = {}

        # Valid JSON that is not an object (null, list, string, ...) has no
        # ``.get``; handle it like an unparseable body instead of a 500.
        if not isinstance(request_data, dict):
            logger.error(
                f"Expected a JSON object in request body, got {type(request_data).__name__}"
            )
            request_data = {}

        # Determine transport: explicit field → legacy Daily hint → CLI default → webrtc
        transport = request_data.get("transport")
        if transport is None and request_data.get("createDailyRoom", False):
            transport = "daily"
        if transport is None:
            transport = args.transport or "webrtc"

        # Enforce restriction when -t was explicitly set on the command line
        if args.transport is not None and transport != args.transport:
            raise HTTPException(
                status_code=400,
                detail=(
                    f"Transport '{transport}' is not allowed. "
                    f"Server is configured for '{args.transport}' only (-t {args.transport})."
                ),
            )

        if transport == "webrtc":
            # WebRTC: register the session; the bot starts when the WebRTC offer arrives.
            session_id = str(uuid.uuid4())
            active_sessions[session_id] = request_data.get("body", {})

            result = StartBotResult(
                sessionId=session_id,
            )
            if request_data.get("enableDefaultIceServers"):
                result["iceConfig"] = IceConfig(
                    iceServers=[IceServer(urls=["stun:stun.l.google.com:19302"])]
                )
            return result

        elif transport == "daily":
            create_daily_room = request_data.get("createDailyRoom", False)
            body = request_data.get("body", {})
            daily_room_properties_dict = request_data.get("dailyRoomProperties", None)
            daily_token_properties_dict = request_data.get("dailyMeetingTokenProperties", None)

            bot_module = _get_bot_module()

            existing_room_url = os.getenv("DAILY_ROOM_URL")
            session_id = str(uuid.uuid4())
            result: StartBotResult | None = None

            # Configure a room when explicitly requested or when a
            # pre-configured room is supplied through DAILY_ROOM_URL.
            if create_daily_room or existing_room_url:
                from pipecat.runner.daily import configure
                from pipecat.transports.daily.utils import (
                    DailyMeetingTokenProperties,
                    DailyRoomProperties,
                )

                async with aiohttp.ClientSession() as session:
                    room_properties = None
                    if daily_room_properties_dict:
                        # Fill in expiry defaults only where the caller did not set them.
                        daily_room_properties_dict.setdefault(
                            "exp", time.time() + PIPECAT_ROOM_EXP_HOURS * 3600
                        )
                        daily_room_properties_dict.setdefault("eject_at_room_exp", True)
                        try:
                            room_properties = DailyRoomProperties(**daily_room_properties_dict)
                            logger.debug(f"Using custom room properties: {room_properties}")
                        except Exception as e:
                            # Best effort: continue without custom room properties.
                            logger.error(f"Failed to parse dailyRoomProperties: {e}")

                    token_properties = None
                    if daily_token_properties_dict:
                        try:
                            token_properties = DailyMeetingTokenProperties(
                                **daily_token_properties_dict
                            )
                            logger.debug(f"Using custom token properties: {token_properties}")
                        except Exception as e:
                            # Best effort: continue without custom token properties.
                            logger.error(f"Failed to parse dailyMeetingTokenProperties: {e}")

                    room_url, token = await configure(
                        session,
                        room_exp_duration=PIPECAT_ROOM_EXP_HOURS,
                        room_properties=room_properties,
                        token_properties=token_properties,
                    )
                    runner_args = DailyRunnerArguments(
                        room_url=room_url, token=token, body=body, session_id=session_id
                    )
                    result = StartBotResult(
                        dailyRoom=room_url,
                        dailyToken=token,
                        sessionId=session_id,
                    )
            else:
                runner_args = RunnerArguments(body=body, session_id=session_id)

            runner_args.cli_args = args

            # Start the bot in the background, keeping a strong reference to
            # the task until it finishes.
            bot_task = asyncio.create_task(bot_module.bot(runner_args))
            background_tasks.add(bot_task)
            bot_task.add_done_callback(background_tasks.discard)
            return result

        elif transport in TELEPHONY_TRANSPORTS:
            # Telephony: the bot starts when the provider connects to /ws.
            # Return the WebSocket URL so the caller knows where to point their provider.
            scheme = "wss" if args.host != "localhost" else "ws"
            return StartBotResult(
                wsUrl=f"{scheme}://{args.host}:{args.port}/ws",
            )

        elif transport == "websocket":
            # Plain WebSocket: the bot starts when the client connects to /ws-client.
            scheme = "wss" if args.host != "localhost" else "ws"
            session_id = str(uuid.uuid4())
            return StartBotResult(
                wsUrl=f"{scheme}://{args.host}:{args.port}/ws-client",
                sessionId=session_id,
                token="mock_token",
            )

        else:
            raise HTTPException(
                status_code=400,
                detail=f"Unknown transport '{transport}'.",
            )
|
||||
|
||||
|
||||
def _resolve_download_path(folder: str, filename: str) -> Path:
|
||||
"""Resolve a download path and ensure it stays within the downloads folder."""
|
||||
allowed_base = Path(folder).resolve()
|
||||
file_path = (allowed_base / filename).resolve()
|
||||
|
||||
if not file_path.is_relative_to(allowed_base):
|
||||
raise HTTPException(status_code=403, detail="Access denied")
|
||||
|
||||
return file_path
|
||||
|
||||
|
||||
def _setup_frontend_routes(app: FastAPI):
    """Serve the prebuilt frontend at ``/client`` and redirect ``/`` to it.

    When the prebuilt frontend package cannot be imported, the failure is
    logged and no routes are registered.

    Args:
        app: Application the frontend mount and redirect are added to.
    """
    try:
        from pipecat_ai_prebuilt.frontend import PipecatPrebuiltUI
    except ImportError as import_error:
        logger.error(f"Prebuilt frontend not available: {import_error}")
        return

    app.mount("/client", PipecatPrebuiltUI)

    async def root_redirect():
        """Redirect root requests to client interface."""
        return RedirectResponse(url="/client/")

    # Explicit registration; equivalent to decorating the handler above.
    app.get("/", include_in_schema=False)(root_redirect)
|
||||
|
||||
|
||||
def _setup_webrtc_routes(
|
||||
app: FastAPI, args: argparse.Namespace, active_sessions: dict[str, dict[str, Any]]
|
||||
):
|
||||
"""Set up WebRTC-specific routes."""
|
||||
try:
|
||||
from pipecat_ai_small_webrtc_prebuilt.frontend import SmallWebRTCPrebuiltUI
|
||||
|
||||
from pipecat.transports.smallwebrtc.connection import SmallWebRTCConnection
|
||||
from pipecat.transports.smallwebrtc.request_handler import (
|
||||
IceCandidate,
|
||||
@@ -225,41 +481,20 @@ def _setup_webrtc_routes(app: FastAPI, args: argparse.Namespace):
|
||||
logger.error(f"WebRTC transport dependencies not installed: {e}")
|
||||
return
|
||||
|
||||
class IceServer(TypedDict, total=False):
|
||||
urls: str | list[str]
|
||||
|
||||
class IceConfig(TypedDict):
|
||||
iceServers: list[IceServer]
|
||||
|
||||
class StartBotResult(TypedDict, total=False):
|
||||
sessionId: str
|
||||
iceConfig: IceConfig | None
|
||||
|
||||
# In-memory store of active sessions: session_id -> session info
|
||||
active_sessions: dict[str, dict[str, Any]] = {}
|
||||
|
||||
# Mount the frontend
|
||||
app.mount("/client", SmallWebRTCPrebuiltUI)
|
||||
|
||||
@app.get("/", include_in_schema=False)
|
||||
async def root_redirect():
|
||||
"""Redirect root requests to client interface."""
|
||||
return RedirectResponse(url="/client/")
|
||||
|
||||
@app.get("/files/{filename:path}")
|
||||
async def download_file(filename: str):
|
||||
"""Handle file downloads."""
|
||||
if not args.folder:
|
||||
logger.warning(f"Attempting to dowload {filename}, but downloads folder not setup.")
|
||||
return
|
||||
logger.warning(f"Attempting to download {filename}, but downloads folder not setup.")
|
||||
raise HTTPException(404)
|
||||
|
||||
file_path = Path(args.folder) / filename
|
||||
if not os.path.exists(file_path):
|
||||
file_path = _resolve_download_path(args.folder, filename)
|
||||
if not file_path.exists():
|
||||
raise HTTPException(404)
|
||||
|
||||
media_type, _ = mimetypes.guess_type(file_path)
|
||||
|
||||
return FileResponse(path=file_path, media_type=media_type, filename=filename)
|
||||
return FileResponse(path=file_path, media_type=media_type, filename=file_path.name)
|
||||
|
||||
# Initialize the SmallWebRTC request handler
|
||||
small_webrtc_handler: SmallWebRTCRequestHandler = SmallWebRTCRequestHandler(
|
||||
@@ -304,29 +539,6 @@ def _setup_webrtc_routes(app: FastAPI, args: argparse.Namespace):
|
||||
await small_webrtc_handler.handle_patch_request(request)
|
||||
return {"status": "success"}
|
||||
|
||||
@app.post("/start")
|
||||
async def rtvi_start(request: Request):
|
||||
"""Mimic Pipecat Cloud's /start endpoint."""
|
||||
# Parse the request body
|
||||
try:
|
||||
request_data = await request.json()
|
||||
logger.debug(f"Received request: {request_data}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to parse request body: {e}")
|
||||
request_data = {}
|
||||
|
||||
# Store session info immediately in memory, replicate the behavior expected on Pipecat Cloud
|
||||
session_id = str(uuid.uuid4())
|
||||
active_sessions[session_id] = request_data.get("body", {})
|
||||
|
||||
result: StartBotResult = {"sessionId": session_id}
|
||||
if request_data.get("enableDefaultIceServers"):
|
||||
result["iceConfig"] = IceConfig(
|
||||
iceServers=[IceServer(urls=["stun:stun.l.google.com:19302"])]
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
@app.api_route(
|
||||
"/sessions/{session_id}/{path:path}",
|
||||
methods=["GET", "POST", "PUT", "PATCH", "DELETE"],
|
||||
@@ -552,12 +764,10 @@ def _setup_whatsapp_routes(app: FastAPI, args: argparse.Namespace):
|
||||
def _setup_daily_routes(app: FastAPI, args: argparse.Namespace):
|
||||
"""Set up Daily-specific routes."""
|
||||
|
||||
@app.get("/")
|
||||
@app.get("/daily")
|
||||
async def create_room_and_start_agent():
|
||||
"""Launch a Daily bot and redirect to room."""
|
||||
print("Starting bot with Daily transport and redirecting to Daily room")
|
||||
|
||||
import aiohttp
|
||||
logger.debug("Starting bot with Daily transport and redirecting to Daily room")
|
||||
|
||||
from pipecat.runner.daily import configure
|
||||
|
||||
@@ -573,105 +783,6 @@ def _setup_daily_routes(app: FastAPI, args: argparse.Namespace):
|
||||
asyncio.create_task(bot_module.bot(runner_args))
|
||||
return RedirectResponse(room_url)
|
||||
|
||||
@app.post("/start")
|
||||
async def start_agent(request: Request):
|
||||
"""Handler for /start endpoints.
|
||||
|
||||
Expects POST body like::
|
||||
{
|
||||
"createDailyRoom": true,
|
||||
"dailyRoomProperties": { "start_video_off": true },
|
||||
"dailyMeetingTokenProperties": { "is_owner": true, "user_name": "Bot" },
|
||||
"body": { "custom_data": "value" }
|
||||
}
|
||||
"""
|
||||
print("Starting bot with Daily transport")
|
||||
|
||||
# Parse the request body
|
||||
try:
|
||||
request_data = await request.json()
|
||||
logger.debug(f"Received request: {request_data}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to parse request body: {e}")
|
||||
request_data = {}
|
||||
|
||||
create_daily_room = request_data.get("createDailyRoom", False)
|
||||
body = request_data.get("body", {})
|
||||
daily_room_properties_dict = request_data.get("dailyRoomProperties", None)
|
||||
daily_token_properties_dict = request_data.get("dailyMeetingTokenProperties", None)
|
||||
|
||||
bot_module = _get_bot_module()
|
||||
|
||||
existing_room_url = os.getenv("DAILY_ROOM_URL")
|
||||
|
||||
session_id = str(uuid.uuid4())
|
||||
result = None
|
||||
|
||||
# Configure room if:
|
||||
# 1. Explicitly requested via createDailyRoom in payload
|
||||
# 2. Using pre-configured room from DAILY_ROOM_URL env var
|
||||
if create_daily_room or existing_room_url:
|
||||
import aiohttp
|
||||
|
||||
from pipecat.runner.daily import configure
|
||||
from pipecat.transports.daily.utils import (
|
||||
DailyMeetingTokenProperties,
|
||||
DailyRoomProperties,
|
||||
)
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
# Parse dailyRoomProperties if provided
|
||||
room_properties = None
|
||||
if daily_room_properties_dict:
|
||||
# Apply Pipecat Cloud's session policy if caller didn't override.
|
||||
daily_room_properties_dict.setdefault(
|
||||
"exp", time.time() + PIPECAT_ROOM_EXP_HOURS * 3600
|
||||
)
|
||||
daily_room_properties_dict.setdefault("eject_at_room_exp", True)
|
||||
try:
|
||||
room_properties = DailyRoomProperties(**daily_room_properties_dict)
|
||||
logger.debug(f"Using custom room properties: {room_properties}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to parse dailyRoomProperties: {e}")
|
||||
# Continue without custom properties
|
||||
|
||||
# Parse dailyMeetingTokenProperties if provided
|
||||
token_properties = None
|
||||
if daily_token_properties_dict:
|
||||
try:
|
||||
token_properties = DailyMeetingTokenProperties(
|
||||
**daily_token_properties_dict
|
||||
)
|
||||
logger.debug(f"Using custom token properties: {token_properties}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to parse dailyMeetingTokenProperties: {e}")
|
||||
# Continue without custom properties
|
||||
|
||||
room_url, token = await configure(
|
||||
session,
|
||||
room_exp_duration=PIPECAT_ROOM_EXP_HOURS,
|
||||
room_properties=room_properties,
|
||||
token_properties=token_properties,
|
||||
)
|
||||
runner_args = DailyRunnerArguments(
|
||||
room_url=room_url, token=token, body=body, session_id=session_id
|
||||
)
|
||||
result = {
|
||||
"dailyRoom": room_url,
|
||||
"dailyToken": token,
|
||||
"sessionId": session_id,
|
||||
}
|
||||
else:
|
||||
runner_args = RunnerArguments(body=body, session_id=session_id)
|
||||
|
||||
# Update CLI args.
|
||||
runner_args.cli_args = args
|
||||
|
||||
# Start the bot in the background
|
||||
asyncio.create_task(bot_module.bot(runner_args))
|
||||
|
||||
return result
|
||||
|
||||
if args.dialin:
|
||||
|
||||
@app.post("/daily-dialin-webhook")
|
||||
@@ -720,8 +831,6 @@ def _setup_daily_routes(app: FastAPI, args: argparse.Namespace):
|
||||
detail="Missing required fields: From, To, callId, callDomain",
|
||||
)
|
||||
|
||||
import aiohttp
|
||||
|
||||
from pipecat.runner.daily import configure
|
||||
from pipecat.runner.types import DailyDialinRequest, DialinSettings
|
||||
|
||||
@@ -790,44 +899,51 @@ def _setup_daily_routes(app: FastAPI, args: argparse.Namespace):
|
||||
|
||||
|
||||
def _setup_telephony_routes(app: FastAPI, args: argparse.Namespace):
|
||||
"""Set up telephony-specific routes."""
|
||||
# XML response templates (Exotel doesn't use XML webhooks)
|
||||
XML_TEMPLATES = {
|
||||
"twilio": f"""<?xml version="1.0" encoding="UTF-8"?>
|
||||
"""Set up telephony-specific routes.
|
||||
|
||||
The WebSocket endpoint (``/ws``) is always registered so providers can
|
||||
connect directly. The XML webhook (``POST /``) is only registered when a
|
||||
specific telephony transport is chosen via ``-t`` because the XML template
|
||||
is provider-specific and requires a proxy hostname (``--proxy``).
|
||||
"""
|
||||
if args.transport in TELEPHONY_TRANSPORTS:
|
||||
# XML response templates (Exotel doesn't use XML webhooks)
|
||||
XML_TEMPLATES = {
|
||||
"twilio": f"""<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Response>
|
||||
<Connect>
|
||||
<Stream url="wss://{args.proxy}/ws"></Stream>
|
||||
</Connect>
|
||||
<Pause length="40"/>
|
||||
</Response>""",
|
||||
"telnyx": f"""<?xml version="1.0" encoding="UTF-8"?>
|
||||
"telnyx": f"""<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Response>
|
||||
<Connect>
|
||||
<Stream url="wss://{args.proxy}/ws" bidirectionalMode="rtp"></Stream>
|
||||
</Connect>
|
||||
<Pause length="40"/>
|
||||
</Response>""",
|
||||
"plivo": f"""<?xml version="1.0" encoding="UTF-8"?>
|
||||
"plivo": f"""<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Response>
|
||||
<Stream bidirectional="true" keepCallAlive="true" contentType="audio/x-mulaw;rate=8000">wss://{args.proxy}/ws</Stream>
|
||||
</Response>""",
|
||||
}
|
||||
}
|
||||
|
||||
@app.post("/")
|
||||
async def start_call():
|
||||
"""Handle telephony webhook and return XML response."""
|
||||
if args.transport == "exotel":
|
||||
# Exotel doesn't use POST webhooks - redirect to proper documentation
|
||||
logger.debug("POST Exotel endpoint - not used")
|
||||
return {
|
||||
"error": "Exotel doesn't use POST webhooks",
|
||||
"websocket_url": f"wss://{args.proxy}/ws",
|
||||
"note": "Configure the WebSocket URL above in your Exotel App Bazaar Voicebot Applet",
|
||||
}
|
||||
else:
|
||||
logger.debug(f"POST {args.transport.upper()} XML")
|
||||
xml_content = XML_TEMPLATES.get(args.transport, "<Response></Response>")
|
||||
return HTMLResponse(content=xml_content, media_type="application/xml")
|
||||
@app.post("/")
|
||||
async def start_call():
|
||||
"""Handle telephony webhook and return XML response."""
|
||||
if args.transport == "exotel":
|
||||
# Exotel doesn't use POST webhooks - redirect to proper documentation
|
||||
logger.debug("POST Exotel endpoint - not used")
|
||||
return {
|
||||
"error": "Exotel doesn't use POST webhooks",
|
||||
"websocket_url": f"wss://{args.proxy}/ws",
|
||||
"note": "Configure the WebSocket URL above in your Exotel App Bazaar Voicebot Applet",
|
||||
}
|
||||
else:
|
||||
logger.debug(f"POST {args.transport.upper()} XML")
|
||||
xml_content = XML_TEMPLATES.get(args.transport, "<Response></Response>")
|
||||
return HTMLResponse(content=xml_content, media_type="application/xml")
|
||||
|
||||
@app.websocket("/ws")
|
||||
async def websocket_endpoint(websocket: WebSocket):
|
||||
@@ -836,11 +952,6 @@ def _setup_telephony_routes(app: FastAPI, args: argparse.Namespace):
|
||||
logger.debug("WebSocket connection accepted")
|
||||
await _run_telephony_bot(websocket, args)
|
||||
|
||||
@app.get("/")
|
||||
async def start_agent():
|
||||
"""Simple status endpoint for telephony transports."""
|
||||
return {"status": f"Bot started with {args.transport}"}
|
||||
|
||||
|
||||
async def _run_daily_direct(args: argparse.Namespace):
|
||||
"""Run Daily bot with direct connection (no FastAPI server)."""
|
||||
@@ -911,22 +1022,27 @@ def runner_port() -> int:
|
||||
def main(parser: argparse.ArgumentParser | None = None):
|
||||
"""Start the Pipecat development runner.
|
||||
|
||||
Parses command-line arguments and starts a FastAPI server configured
|
||||
for the specified transport type.
|
||||
Parses command-line arguments and starts a FastAPI server that supports
|
||||
WebRTC, Daily, and telephony transports simultaneously. Clients declare
|
||||
which transport to use via the ``transport`` field in the ``/start`` body.
|
||||
|
||||
When ``-t`` is provided, the server restricts ``/start`` to that transport
|
||||
only and displays transport-specific startup information.
|
||||
|
||||
The runner discovers and runs any ``bot(runner_args)`` function found in the
|
||||
calling module.
|
||||
|
||||
Command-line arguments:
|
||||
- --host: Server host address (default: localhost) 879
|
||||
- --host: Server host address (default: localhost)
|
||||
- --port: Server port (default: 7860)
|
||||
- -t/--transport: Transport type (daily, webrtc, twilio, telnyx, plivo, exotel)
|
||||
- -t/--transport: Restrict to a single transport and set as default for /start
|
||||
(daily, webrtc, twilio, telnyx, plivo, exotel). Omit to support all transports.
|
||||
- -x/--proxy: Public proxy hostname for telephony webhooks
|
||||
- -d/--direct: Connect directly to Daily room (automatically sets transport to daily)
|
||||
- -f/--folder: Path to downloads folder
|
||||
- --dialin: Enable Daily PSTN dial-in webhook handling (requires Daily transport)
|
||||
- --dialin: Enable Daily PSTN dial-in webhook handling
|
||||
- --esp32: Enable SDP munging for ESP32 compatibility (requires --host with IP address)
|
||||
- --whatsapp: Ensure requried WhatsApp environment variables are present
|
||||
- --whatsapp: Ensure required WhatsApp environment variables are present
|
||||
- -v/--verbose: Increase logging verbosity
|
||||
|
||||
Args:
|
||||
@@ -947,8 +1063,11 @@ def main(parser: argparse.ArgumentParser | None = None):
|
||||
"--transport",
|
||||
type=str,
|
||||
choices=["daily", "webrtc", *TELEPHONY_TRANSPORTS],
|
||||
default="webrtc",
|
||||
help="Transport type",
|
||||
default=None,
|
||||
help=(
|
||||
"Restrict the server to a single transport and set it as the default for /start. "
|
||||
"Omit to support all transports simultaneously (default behaviour)."
|
||||
),
|
||||
)
|
||||
parser.add_argument("-x", "--proxy", help="Public proxy host name")
|
||||
parser.add_argument(
|
||||
@@ -966,7 +1085,7 @@ def main(parser: argparse.ArgumentParser | None = None):
|
||||
"--dialin",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Enable Daily PSTN dial-in webhook handling (requires Daily transport)",
|
||||
help="Enable Daily PSTN dial-in webhook handling",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--esp32",
|
||||
@@ -978,7 +1097,7 @@ def main(parser: argparse.ArgumentParser | None = None):
|
||||
"--whatsapp",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Ensure requried WhatsApp environment variables are present",
|
||||
help="Ensure required WhatsApp environment variables are present",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
@@ -987,12 +1106,13 @@ def main(parser: argparse.ArgumentParser | None = None):
|
||||
if args.proxy:
|
||||
args.proxy = _validate_and_clean_proxy(args.proxy)
|
||||
|
||||
# Auto-set transport to daily if --direct is used without explicit transport
|
||||
if args.direct and args.transport == "webrtc": # webrtc is the default
|
||||
args.transport = "daily"
|
||||
elif args.direct and args.transport != "daily":
|
||||
logger.error("--direct flag only works with Daily transport (-t daily)")
|
||||
return
|
||||
# --direct implies Daily transport
|
||||
if args.direct:
|
||||
if args.transport is None or args.transport == "daily":
|
||||
args.transport = "daily"
|
||||
else:
|
||||
logger.error("--direct flag only works with Daily transport (-t daily)")
|
||||
return
|
||||
|
||||
# Validate ESP32 requirements
|
||||
if args.esp32 and args.host == "localhost":
|
||||
@@ -1000,7 +1120,7 @@ def main(parser: argparse.ArgumentParser | None = None):
|
||||
return
|
||||
|
||||
# Validate dial-in requirements
|
||||
if args.dialin and args.transport != "daily":
|
||||
if args.dialin and args.transport is not None and args.transport != "daily":
|
||||
logger.error("--dialin flag only works with Daily transport (-t daily)")
|
||||
return
|
||||
|
||||
@@ -1018,28 +1138,38 @@ def main(parser: argparse.ArgumentParser | None = None):
|
||||
asyncio.run(_run_daily_direct(args))
|
||||
return
|
||||
|
||||
# Print startup message for server-based transports
|
||||
if args.transport == "webrtc":
|
||||
print()
|
||||
# Print startup message
|
||||
print()
|
||||
if args.transport is None:
|
||||
print("🚀 Bot ready!")
|
||||
print(f" → WebRTC: http://{args.host}:{args.port}/client")
|
||||
print(f" → Daily: http://{args.host}:{args.port}/daily")
|
||||
print(f" → Telephony: ws://{args.host}:{args.port}/ws")
|
||||
elif args.transport == "webrtc":
|
||||
if args.esp32:
|
||||
print(f"🚀 Bot ready! (ESP32 mode)")
|
||||
print("🚀 Bot ready! (ESP32 mode)")
|
||||
elif args.whatsapp:
|
||||
print(f"🚀 Bot ready! (WhatsApp)")
|
||||
print("🚀 Bot ready! (WhatsApp)")
|
||||
else:
|
||||
print(f"🚀 Bot ready!")
|
||||
print("🚀 Bot ready! (WebRTC)")
|
||||
print(f" → Open http://{args.host}:{args.port}/client in your browser")
|
||||
print()
|
||||
elif args.transport == "daily":
|
||||
print()
|
||||
print(f"🚀 Bot ready!")
|
||||
print("🚀 Bot ready! (Daily)")
|
||||
if args.dialin:
|
||||
print(
|
||||
f" → Daily dial-in webhook: http://{args.host}:{args.port}/daily-dialin-webhook"
|
||||
)
|
||||
print(f" → Configure this URL in your Daily phone number settings")
|
||||
else:
|
||||
print(f" → Open http://{args.host}:{args.port} in your browser to start a session")
|
||||
print()
|
||||
print(
|
||||
f" → Open http://{args.host}:{args.port}/daily in your browser to start a session"
|
||||
)
|
||||
elif args.transport in TELEPHONY_TRANSPORTS:
|
||||
print(f"🚀 Bot ready! ({args.transport.capitalize()})")
|
||||
if args.proxy:
|
||||
print(f" → XML webhook: http://{args.host}:{args.port}/")
|
||||
print(f" → WebSocket: ws://{args.host}:{args.port}/ws")
|
||||
print()
|
||||
|
||||
RUNNER_DOWNLOADS_FOLDER = args.folder
|
||||
RUNNER_HOST = args.host
|
||||
|
||||
@@ -105,10 +105,14 @@ class WebSocketRunnerArguments(RunnerArguments):
|
||||
|
||||
Parameters:
|
||||
websocket: WebSocket connection for audio streaming
|
||||
transport_type: Transport type identifier. Set to ``"websocket"`` for plain
|
||||
WebSocket connections; ``None`` triggers auto-detection from the first
|
||||
telephony provider message.
|
||||
body: Additional request data
|
||||
"""
|
||||
|
||||
websocket: WebSocket
|
||||
transport_type: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@@ -509,35 +509,29 @@ async def create_transport(
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
# add_wav_header and serializer will be set automatically
|
||||
),
|
||||
"telnyx": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
# add_wav_header and serializer will be set automatically
|
||||
),
|
||||
"plivo": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
# add_wav_header and serializer will be set automatically
|
||||
),
|
||||
"exotel": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
# add_wav_header and serializer will be set automatically
|
||||
),
|
||||
}
|
||||
@@ -568,6 +562,12 @@ async def create_transport(
|
||||
)
|
||||
|
||||
elif isinstance(runner_args, WebSocketRunnerArguments):
|
||||
if runner_args.transport_type == "websocket":
|
||||
params = _get_transport_params("websocket", transport_params)
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketTransport
|
||||
|
||||
return FastAPIWebsocketTransport(websocket=runner_args.websocket, params=params)
|
||||
|
||||
# Parse once to determine the provider and get data
|
||||
transport_type, call_data = await parse_telephony_websocket(runner_args.websocket)
|
||||
params = _get_transport_params(transport_type, transport_params)
|
||||
|
||||
@@ -105,7 +105,7 @@ class AnthropicLLMSettings(LLMSettings):
|
||||
return instance
|
||||
|
||||
|
||||
class AnthropicLLMService(LLMService):
|
||||
class AnthropicLLMService(LLMService[AnthropicLLMAdapter]):
|
||||
"""LLM service for Anthropic's Claude models.
|
||||
|
||||
Provides inference capabilities with Claude models including support for
|
||||
@@ -293,7 +293,7 @@ class AnthropicLLMService(LLMService):
|
||||
effective_instruction = system_instruction or assert_given(
|
||||
self._settings.system_instruction
|
||||
)
|
||||
adapter: AnthropicLLMAdapter = self.get_llm_adapter()
|
||||
adapter = self.get_llm_adapter()
|
||||
invocation_params = adapter.get_llm_invocation_params(
|
||||
context,
|
||||
enable_prompt_caching=assert_given(self._settings.enable_prompt_caching),
|
||||
@@ -328,8 +328,8 @@ class AnthropicLLMService(LLMService):
|
||||
return next((block.text for block in response.content if hasattr(block, "text")), None)
|
||||
|
||||
def _get_llm_invocation_params(self, context: LLMContext) -> AnthropicLLMInvocationParams:
|
||||
adapter: AnthropicLLMAdapter = self.get_llm_adapter()
|
||||
params: AnthropicLLMInvocationParams = adapter.get_llm_invocation_params(
|
||||
adapter = self.get_llm_adapter()
|
||||
params = adapter.get_llm_invocation_params(
|
||||
context,
|
||||
enable_prompt_caching=assert_given(self._settings.enable_prompt_caching),
|
||||
system_instruction=assert_given(self._settings.system_instruction),
|
||||
|
||||
@@ -233,10 +233,11 @@ class AssemblyAISTTService(WebsocketSTTService):
|
||||
sample_rate = connection_params.sample_rate
|
||||
encoding = connection_params.encoding
|
||||
default_settings.model = connection_params.speech_model
|
||||
default_settings.formatted_finals = connection_params.formatted_finals
|
||||
default_settings.word_finalization_max_wait_time = (
|
||||
connection_params.word_finalization_max_wait_time
|
||||
)
|
||||
# Note: `formatted_finals` and `word_finalization_max_wait_time`
|
||||
# were added to Settings after this deprecated input model
|
||||
# was frozen and have no equivalent on
|
||||
# AssemblyAIConnectionParams; they are only configurable via
|
||||
# the canonical `settings=...` API.
|
||||
default_settings.end_of_turn_confidence_threshold = (
|
||||
connection_params.end_of_turn_confidence_threshold
|
||||
)
|
||||
|
||||
@@ -42,14 +42,17 @@ except ModuleNotFoundError as e:
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
def language_to_async_language(language: Language) -> str | None:
|
||||
def language_to_async_language(language: Language) -> str:
|
||||
"""Convert a Language enum to Async language code.
|
||||
|
||||
Args:
|
||||
language: The Language enum value to convert.
|
||||
|
||||
Returns:
|
||||
The corresponding Async language code, or None if not supported.
|
||||
The corresponding service language code. If ``language`` is not in
|
||||
the verified mapping, falls back to the base language code (e.g.,
|
||||
``en`` from ``en-US``) and logs a warning (via
|
||||
``resolve_language(..., use_base_code=True)``).
|
||||
"""
|
||||
LANGUAGE_MAP = {
|
||||
Language.EN: "en",
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user