Compare commits
234 Commits
mb/update-
...
mb/fix-sen
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c6235a34a9 | ||
|
|
76eef837b6 | ||
|
|
c9aaa463b7 | ||
|
|
6d582e41b7 | ||
|
|
ca29f62bff | ||
|
|
0dced68c3c | ||
|
|
8ab81d289a | ||
|
|
f457d00760 | ||
|
|
f5118c4412 | ||
|
|
a79fe40162 | ||
|
|
dcb4949e20 | ||
|
|
8b543e558d | ||
|
|
8181962236 | ||
|
|
98dc891640 | ||
|
|
71de0da570 | ||
|
|
b40c8bb81d | ||
|
|
43f1b59b86 | ||
|
|
a0a2bb3aa4 | ||
|
|
04a50df3d5 | ||
|
|
8c0edffaff | ||
|
|
fe6063fdbe | ||
|
|
195146adb2 | ||
|
|
cab9e18cc9 | ||
|
|
baef688e4e | ||
|
|
f1f43fe500 | ||
|
|
73b63f8d35 | ||
|
|
0c14b33e92 | ||
|
|
09beaccaf0 | ||
|
|
40557a1aae | ||
|
|
ecc4cc4a79 | ||
|
|
37be8805f4 | ||
|
|
93c7e64995 | ||
|
|
9de2bd61a9 | ||
|
|
566af71862 | ||
|
|
12064bd6e6 | ||
|
|
a962459151 | ||
|
|
8fc76a29bc | ||
|
|
e3019261a5 | ||
|
|
fa1f6f1c51 | ||
|
|
337f00c16c | ||
|
|
d50922cdcd | ||
|
|
47f5ca6265 | ||
|
|
2eddb6ffda | ||
|
|
560a6f2247 | ||
|
|
59ecb19000 | ||
|
|
cfb094b3c8 | ||
|
|
1f7e8e001b | ||
|
|
688b136141 | ||
|
|
809c4c1bc5 | ||
|
|
81ca5e6601 | ||
|
|
ebc49d2252 | ||
|
|
ff8d158e18 | ||
|
|
37980b0854 | ||
|
|
39ebc2c9c1 | ||
|
|
ab61d09ec1 | ||
|
|
e4afc0a13c | ||
|
|
dde3d2395b | ||
|
|
30b36c3d6e | ||
|
|
de4dfc3ed4 | ||
|
|
a0128516ff | ||
|
|
db3b8c7325 | ||
|
|
9273ec0f25 | ||
|
|
8dfa1187be | ||
|
|
e17fd580c6 | ||
|
|
3e3d50a855 | ||
|
|
402661ae03 | ||
|
|
69c6a95b8a | ||
|
|
4d49210a73 | ||
|
|
5f8a22ef2f | ||
|
|
606ad0826a | ||
|
|
57028255ee | ||
|
|
87ebbab758 | ||
|
|
bd401e8d6f | ||
|
|
f0dfab23e7 | ||
|
|
fbc907c371 | ||
|
|
40b5ef485d | ||
|
|
b30af3e155 | ||
|
|
446bb5cddf | ||
|
|
1c1ee94074 | ||
|
|
ac30083b45 | ||
|
|
ce579d4266 | ||
|
|
5a07b30c7a | ||
|
|
9da33f3897 | ||
|
|
5ca82ec61e | ||
|
|
0067c7df47 | ||
|
|
ab03db5b0c | ||
|
|
238d6bf9ab | ||
|
|
90ae85bab2 | ||
|
|
29e09b2053 | ||
|
|
bad9977e8c | ||
|
|
b987579d54 | ||
|
|
40f1f4ff11 | ||
|
|
a3ad31d0f6 | ||
|
|
8044c4170d | ||
|
|
bc51e7abc6 | ||
|
|
256ecf4d71 | ||
|
|
c16969c4f5 | ||
|
|
8ef64d8c8d | ||
|
|
4947d08733 | ||
|
|
b61846534d | ||
|
|
8f01cd220a | ||
|
|
3abaaf80e0 | ||
|
|
13890fa021 | ||
|
|
802af28888 | ||
|
|
24a628c85e | ||
|
|
ddab95835b | ||
|
|
cb13f4b4cb | ||
|
|
4793277d34 | ||
|
|
28c729cc36 | ||
|
|
4d07c7b77c | ||
|
|
4ff0567025 | ||
|
|
1377dec01b | ||
|
|
42f4d73a63 | ||
|
|
f1c1ebf852 | ||
|
|
eb6d43f6cb | ||
|
|
f387776985 | ||
|
|
5286591826 | ||
|
|
6831e63ec9 | ||
|
|
12bcb7db64 | ||
|
|
1b48b1d860 | ||
|
|
d161e2767f | ||
|
|
4e3af00b6d | ||
|
|
4015aedb86 | ||
|
|
75a6ee839b | ||
|
|
13ce02c896 | ||
|
|
2fd5885dc3 | ||
|
|
d743586bfb | ||
|
|
8051017895 | ||
|
|
dc7bf98ce5 | ||
|
|
609a43a191 | ||
|
|
4fb04422d9 | ||
|
|
2f74a7e674 | ||
|
|
5205f56087 | ||
|
|
694c792af3 | ||
|
|
406e82a842 | ||
|
|
837de5f893 | ||
|
|
10b9b1da2f | ||
|
|
7854a2ec83 | ||
|
|
ac7c69078f | ||
|
|
c9b4356ea6 | ||
|
|
b3e4421191 | ||
|
|
84058c3948 | ||
|
|
aebc781419 | ||
|
|
4160446f4c | ||
|
|
05a14af184 | ||
|
|
89d2ef2bde | ||
|
|
f550015efb | ||
|
|
8fa44863fb | ||
|
|
088cb56922 | ||
|
|
a789e5feea | ||
|
|
16ca44131c | ||
|
|
418860cf26 | ||
|
|
e2fc8b3dce | ||
|
|
8b641089f8 | ||
|
|
d36ed755ce | ||
|
|
7aaf64fe55 | ||
|
|
5f52008974 | ||
|
|
d520677b23 | ||
|
|
42bd1e9d40 | ||
|
|
7f0494aa04 | ||
|
|
b7ae2989ac | ||
|
|
2b2b0f8121 | ||
|
|
5ca33a2b00 | ||
|
|
938dcb613d | ||
|
|
bc748cf9d0 | ||
|
|
3b55d16a49 | ||
|
|
d7f31e0cbd | ||
|
|
c662a2d820 | ||
|
|
2c220ca54e | ||
|
|
89f0ff17c0 | ||
|
|
b5465364fa | ||
|
|
c024eb7b8c | ||
|
|
608570e89d | ||
|
|
3ad61a8a04 | ||
|
|
4c4bae2db6 | ||
|
|
901b6b5913 | ||
|
|
71cd0f1c87 | ||
|
|
a2a419e6db | ||
|
|
bbbbdc459a | ||
|
|
d203528dad | ||
|
|
4bcca7956e | ||
|
|
68a4cf4c68 | ||
|
|
0508ddddfb | ||
|
|
8714c9137f | ||
|
|
4c029fcfa7 | ||
|
|
5c86f8e687 | ||
|
|
54a4d8a9f8 | ||
|
|
38af514d95 | ||
|
|
6aa80c0b8e | ||
|
|
e720573e60 | ||
|
|
541a43905b | ||
|
|
707df913cd | ||
|
|
3f3d757581 | ||
|
|
7c781ce816 | ||
|
|
f3efc9da00 | ||
|
|
827a70104d | ||
|
|
a40327305c | ||
|
|
168af44429 | ||
|
|
5f8433476c | ||
|
|
6a6fea74f5 | ||
|
|
91b557ecbf | ||
|
|
be85291414 | ||
|
|
09f171b69d | ||
|
|
929fd98958 | ||
|
|
1cfbfcaf11 | ||
|
|
cd5a3c13bd | ||
|
|
9b871b0cc5 | ||
|
|
0d499a8aa3 | ||
|
|
45292ab13d | ||
|
|
be6ea0dbf6 | ||
|
|
fb18ae174e | ||
|
|
c4506523ab | ||
|
|
b360cb31dc | ||
|
|
07f104199c | ||
|
|
bc1949b4bf | ||
|
|
2035dd8b39 | ||
|
|
24c8189327 | ||
|
|
998ac32627 | ||
|
|
50645c1c4f | ||
|
|
8ce29ee8f2 | ||
|
|
7b8aeef4cc | ||
|
|
6a24457f0e | ||
|
|
2c01c2b5b3 | ||
|
|
1c2e114fa2 | ||
|
|
0f137e36c2 | ||
|
|
b7f12a96f1 | ||
|
|
3331f71e17 | ||
|
|
55d200e2d1 | ||
|
|
3fae00e067 | ||
|
|
78cdefd191 | ||
|
|
42502a4f3b | ||
|
|
fc67cc3302 | ||
|
|
e503ea7466 | ||
|
|
248206e234 |
2
.github/workflows/coverage.yaml
vendored
@@ -25,7 +25,7 @@ jobs:
|
||||
version: "latest"
|
||||
|
||||
- name: Set up Python
|
||||
run: uv python install 3.10
|
||||
run: uv python install 3.12
|
||||
|
||||
- name: Install system packages
|
||||
run: |
|
||||
|
||||
11
.github/workflows/sync-quickstart.yaml
vendored
@@ -23,17 +23,12 @@ jobs:
|
||||
token: ${{ secrets.QUICKSTART_SYNC_TOKEN }}
|
||||
path: quickstart-repo
|
||||
|
||||
- name: Sync files (excluding READMEs)
|
||||
- name: Sync files (excluding uv.lock and README.md)
|
||||
run: |
|
||||
# Copy code files only, skip READMEs
|
||||
cp examples/quickstart/bot.py quickstart-repo/
|
||||
cp examples/quickstart/requirements.txt quickstart-repo/
|
||||
cp examples/quickstart/env.example quickstart-repo/
|
||||
|
||||
# Copy any other files that aren't README.md
|
||||
# Copy all files except uv.lock and README.md
|
||||
find examples/quickstart -type f \
|
||||
-not -name "README.md" \
|
||||
-not -name "*.md" \
|
||||
-not -name "uv.lock" \
|
||||
-exec cp {} quickstart-repo/ \;
|
||||
|
||||
- name: Commit and push changes
|
||||
|
||||
2
.github/workflows/tests.yaml
vendored
@@ -29,7 +29,7 @@ jobs:
|
||||
version: "latest"
|
||||
|
||||
- name: Set up Python
|
||||
run: uv python install 3.10
|
||||
run: uv python install 3.12
|
||||
|
||||
- name: Install system packages
|
||||
run: |
|
||||
|
||||
42
.github/workflows/update-lockfile.yaml
vendored
@@ -1,42 +0,0 @@
|
||||
name: Update lockfile
|
||||
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- 'pyproject.toml'
|
||||
branches:
|
||||
- main
|
||||
workflow_dispatch: # Allows manual triggering from GitHub UI
|
||||
|
||||
jobs:
|
||||
update-lockfile:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
# This gives the workflow permission to push back to the repo
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v1
|
||||
|
||||
- name: Update lockfile
|
||||
run: uv lock
|
||||
|
||||
- name: Check for changes
|
||||
id: verify-changed-files
|
||||
run: |
|
||||
if [ -n "$(git status --porcelain)" ]; then
|
||||
echo "changed=true" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "changed=false" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Commit lockfile
|
||||
if: steps.verify-changed-files.outputs.changed == 'true'
|
||||
run: |
|
||||
git config --local user.email "action@github.com"
|
||||
git config --local user.name "GitHub Action"
|
||||
git add uv.lock
|
||||
git commit -m "chore: update uv.lock after dependency changes"
|
||||
git push
|
||||
284
CHANGELOG.md
@@ -5,18 +5,294 @@ All notable changes to **Pipecat** will be documented in this file.
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## Unreleased
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
|
||||
- Added a `cancel_timeout_secs` argument to `PipelineTask` which defines how
|
||||
long the pipeline has to complete cancellation. When `PipelineTask.cancel()`
|
||||
is called, a `CancelFrame` is pushed through the pipeline and must reach the
|
||||
end. If it does not reach the end within the specified time, a warning is
|
||||
shown and the wait is aborted.
|
||||
|
||||
- Added a new "universal" (LLM-agnostic) `LLMContext` and accompanying
|
||||
`LLMContextAggregatorPair`, which will eventually replace `OpenAILLMContext`
|
||||
(and the other under-the-hood contexts) and the other context aggregators.
|
||||
The new universal `LLMContext` machinery allows a single context to be shared
|
||||
between different LLMs, enabling runtime LLM switching and scenarios like
|
||||
failover.
|
||||
|
||||
From the developer's point of view, switching to using the new universal
|
||||
context machinery will usually be a matter of going from this:
|
||||
|
||||
```python
|
||||
context = OpenAILLMContext(messages, tools)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
```
|
||||
|
||||
To this:
|
||||
|
||||
```python
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
```
|
||||
|
||||
To start, the universal `LLMContext` is supported with the following LLM
|
||||
services:
|
||||
|
||||
- `OpenAILLMService`
|
||||
- `GoogleLLMService`
|
||||
|
||||
- Added a new `LLMSwitcher` class to enable runtime LLM switching, built atop a
|
||||
new generic `ServiceSwitcher`.
|
||||
|
||||
Switchers take a switching strategy. The first available strategy is
|
||||
`ServiceSwitcherStrategyManual`.
|
||||
|
||||
To switch LLMs at runtime, the LLMs must be sharing one instance of the new
|
||||
universal `LLMContext` (see above bullet).
|
||||
|
||||
```python
|
||||
# Instantiate your LLM services
|
||||
llm_openai = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
llm_google = GoogleLLMService(api_key=os.getenv("GOOGLE_API_KEY"))
|
||||
|
||||
# Instantiate a switcher
|
||||
# (ServiceSwitcherStrategyManual defaults to OpenAI, as it's first in the list)
|
||||
llm_switcher = LLMSwitcher(
|
||||
llms=[llm_openai, llm_google], strategy_type=ServiceSwitcherStrategyManual
|
||||
)
|
||||
|
||||
# Create your pipeline
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
context_aggregator.user(),
|
||||
llm_switcher,
|
||||
tts,
|
||||
transport.output(),
|
||||
context_aggregator.assistant(),
|
||||
]
|
||||
)
|
||||
task = PipelineTask(pipeline, params=PipelineParams(allow_interruptions=True))
|
||||
|
||||
# ...
|
||||
# Whenever is appropriate, switch LLMs!
|
||||
await task.queue_frames([ManuallySwitchServiceFrame(service=llm_google)])
|
||||
```
|
||||
|
||||
- Added an `LLMService.run_inference()` method to LLM services to enable
|
||||
direct, out-of-band (i.e. out-of-pipeline) inference.
|
||||
|
||||
### Changed
|
||||
|
||||
- Updated `SarvamTTSService` to use WebSocket streaming for real-time audio
|
||||
generation with multiple Indian languages, with HTTP support still available
|
||||
via `SarvamHttpTTSService`.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed a missing import in `SentryMetrics`.
|
||||
|
||||
- Fixed a `CartesiaTTSService` issue that was causing the application to hang
|
||||
after Cartesia's 5 minutes timed out.
|
||||
|
||||
## [0.0.81] - 2025-08-25
|
||||
|
||||
### Added
|
||||
|
||||
- Added `pipecat.extensions.voicemail`, a module for detecting voicemail vs.
|
||||
live conversation, primarily intended for use in outbound calling scenarios.
|
||||
The voicemail module is optimized for text LLMs only.
|
||||
|
||||
- Added new frames to the `idle_timeout_frames` arg: `TranscriptionFrame`,
|
||||
`InterimTranscriptionFrame`, `UserStartedSpeakingFrame`, and
|
||||
`UserStoppedSpeakingFrame`. These additions serve as indicators of user
|
||||
activity in the pipeline idle detection logic.
|
||||
|
||||
- Allow passing custom pipeline sink and source processors to a
|
||||
`Pipeline`. Pipeline source and sink processors are used to know and control
|
||||
what's coming in and out of a `Pipeline` processor.
|
||||
|
||||
- Added `FrameProcessor.pause_processing_system_frames()` and
|
||||
`FrameProcessor.resume_processing_system_frames()`. These allow to pause and
|
||||
resume the processing of system frame.
|
||||
|
||||
- Added new `on_process_frame()` observer method which makes it possible to know
|
||||
when a frame is being processed.
|
||||
|
||||
- Added new `FrameProcessor.entry_processor()` method. This allows you to access
|
||||
the first non-compound processor in a pipeline.
|
||||
|
||||
- Added `FrameProcessor` properties `processors`, `next` and `previous`.
|
||||
|
||||
- `ElevenLabsTTSService` now supports additional runtime changes to the `model`,
|
||||
`language`, and `voice_settings` parameters.
|
||||
|
||||
- Added `apply_text_normalization` support to `ElevenLabsTTSService` and
|
||||
`ElevenLabsHttpTTSService`.
|
||||
|
||||
- Added `MistralLLMService`, using Mistral's chat completion API.
|
||||
|
||||
- Added the ability to retry executing a chat completion after a timeout period
|
||||
for `OpenAILLMService` and its subclasses, `AnthropicLLMService`, and
|
||||
`AWSBedrockLLMService`. The LLM services accept new args:
|
||||
`retry_timeout_secs` and `retry_on_timeout`. This feature is disabled by
|
||||
default.
|
||||
|
||||
### Changed
|
||||
|
||||
- Updated `daily-python` to 0.19.7.
|
||||
|
||||
### Deprecated
|
||||
|
||||
- `FrameProcessor.wait_for_task()` is deprecated. Use `await task` or `await
|
||||
asyncio.wait_for(task, timeout)` instead.
|
||||
|
||||
### Removed
|
||||
|
||||
- Watchdog timers have been removed. They were introduced in 0.0.72 to help
|
||||
diagnose pipeline freezes. Unfortunately, they proved ineffective since they
|
||||
required developers to use Pipecat-specific queues, iterators, and events to
|
||||
correctly reset the timer, which limited their usefulness and added friction.
|
||||
|
||||
- Removed unused `FrameProcessor.set_parent()` and
|
||||
`FrameProcessor.get_parent()`.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue that would cause `PipelineRunner` and `PipelineTask` to not
|
||||
handle external asyncio task cancellation properly.
|
||||
|
||||
- Added `SpeechmaticsSTTService` exception handling on connection and sending.
|
||||
|
||||
- Replaced `asyncio.wait_for()` for `wait_for2.wait_for()` for Python <
|
||||
3.12. because of issues regarding task cancellation (i.e. cancellation is
|
||||
never propagated).
|
||||
See https://bugs.python.org/issue42130
|
||||
|
||||
- Fixed an `AudioBufferProcessor` issues that would cause audio overlap when
|
||||
setting a max buffer size.
|
||||
|
||||
- Fixed an issue where `AsyncAITTSService` had very high latency in responding
|
||||
by adding `force=true` when sending the flush command.
|
||||
|
||||
### Performance
|
||||
|
||||
- Improve `PipelineTask` performance by using direct mode processors and by
|
||||
removing unnecessary tasks.
|
||||
|
||||
- Improve `ParallelPipeline` performance by using direct mode, by not
|
||||
creating a task for each frame and every sub-pipeline and also by removing
|
||||
other unnecessary tasks.
|
||||
|
||||
- `Pipeline` performance improvements by using direct mode.
|
||||
|
||||
### Other
|
||||
|
||||
- Added `14w-function-calling-mistal.py` using `MistralLLMService`.
|
||||
|
||||
- Added `13j-azure-transcription.py` using `AzureSTTService`.
|
||||
|
||||
## [0.0.80] - 2025-08-13
|
||||
|
||||
### Added
|
||||
|
||||
- Added `GeminiTTSService` which uses Google Gemini to generate TTS output. The
|
||||
Gemini model can be prompted to insert styled speech to control the TTS
|
||||
output.
|
||||
|
||||
- Added Exotel support to Pipecat's development runner. You can now connect
|
||||
using the runner with `uv run bot.py -t exotel` and an ngrok connection to
|
||||
HTTP port 7860.
|
||||
|
||||
- Added `enable_direct_mode` argument to `FrameProcessor`. The direct mode is
|
||||
for processors which require very little I/O or compute resources, that is
|
||||
processors that can perform their task almost immediately. These type of
|
||||
processors don't need any of the internal tasks and queues usually created by
|
||||
frame processors which means overall application performance might be slightly
|
||||
increased. Use with care.
|
||||
|
||||
- Added TTFB metrics for `HeyGenVideoService` and `TavusVideoService`.
|
||||
|
||||
- Added `endpoint_id` parameter to `AzureSTTService`. ([Custom EndpointId](https://docs.azure.cn/en-us/ai-services/speech-service/how-to-recognize-speech?pivots=programming-language-python#use-a-custom-endpoint))
|
||||
|
||||
### Changed
|
||||
|
||||
- `WatchdogPriorityQueue` now requires the items to be inserted to always be
|
||||
tuples and the size of the tuple needs to be specified in the constructor when
|
||||
creating the queue with the `tuple_size` argument.
|
||||
|
||||
- Updated Moondream to revision `2025-01-09`.
|
||||
|
||||
- Updated `PlayHTHttpTTSService` to no longer use the `pyht` client to remove
|
||||
compatibility issues with other packages. Now you can use the PlayHT HTTP
|
||||
service with other services, like GoogleLLMService.
|
||||
|
||||
- Updated `pyproject.toml` to once again pin `numba` to `>=0.61.2` in order to
|
||||
resolve package versioning issues.
|
||||
|
||||
- Updated the `STTMuteFilter` to include `VADUserStartedSpeakingFrame` and
|
||||
`VADUserStoppedSpeakingFrame` in the list of frames to filter when the
|
||||
filtering is on.
|
||||
|
||||
### Performance
|
||||
|
||||
- Improving the latency of the `HeyGenVideoService`.
|
||||
|
||||
- Improved some frame processors performance by using the new frame processor
|
||||
direct mode. In direct mode a frame processor will process frames right away
|
||||
avoiding the need for internal queues and tasks. This is useful for some
|
||||
simple processors. For example, in processors that wrap other processors
|
||||
(e.g. `Pipeline`, `ParallelPipeline`), we add one processor before and one
|
||||
after the wrapped processors (internally, you will see them as sources and
|
||||
sinks). These sources and sinks don't do any special processing and they
|
||||
basically forward frames. So, for these simple processors we now enable the
|
||||
new direct mode which avoids creating any internal tasks (and queues) and
|
||||
therefore improves performance.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue with the `BaseWhisperSTTService` where the language was
|
||||
specified as an enum and not a string.
|
||||
|
||||
- Fixed an issue where `SmallWebRTCTransport` ended before TTS finished.
|
||||
|
||||
- Fixed an issue in `OpenAIRealtimeBetaLLMService` where specifying a `text`
|
||||
`modalities` didn't result in text being outputted from the model.
|
||||
|
||||
- Added SSML reserved character escaping to `AzureBaseTTSService` to properly
|
||||
handle special characters in text sent to Azure TTS. This fixes an issue
|
||||
where characters like `&`, `<`, `>`, `"`, and `'` in LLM-generated text would
|
||||
cause TTS failures.
|
||||
|
||||
- Fixed a `WatchdogPriorityQueue` issue that could cause an exception when
|
||||
compating watchdog cancel sentinel items with other items in the queue.
|
||||
|
||||
- Fixed an issue that would cause system frames to not be processed with higher
|
||||
priority than other frames. This could cause slower interruption times.
|
||||
|
||||
- Fixed an issue where retrying a websocket connection error would result in an
|
||||
error.
|
||||
|
||||
### Other
|
||||
|
||||
- Updated `15-switch-voices.py` and `15a-switch-languages.py` examples to show
|
||||
how to enclose complex logic (e.g. `ParallelPipeline`) into a single processor
|
||||
so the main pipeline becomes simpler.
|
||||
- Add foundation example `19b-openai-realtime-beta-text.py`, showing how to use
|
||||
`OpenAIRealtimeBetaLLMService` to output text to a TTS service.
|
||||
|
||||
- Add vision support to release evals so we can run the foundational examples 12
|
||||
series.
|
||||
|
||||
- Added foundational example `15a-switch-languages.py` to release evals. It is
|
||||
able to detect if we switched the language properly.
|
||||
|
||||
- Updated foundational examples to show how to enclose complex logic
|
||||
(e.g. `ParallelPipeline`) into a single processor so the main pipeline becomes
|
||||
simpler.
|
||||
|
||||
- Added `07n-interruptible-gemini.py`, demonstrating how to use
|
||||
`GeminiTTSService`.
|
||||
|
||||
## [0.0.79] - 2025-08-07
|
||||
|
||||
|
||||
@@ -31,6 +31,23 @@ git push origin your-branch-name
|
||||
|
||||
Our maintainers will review your PR, and once everything is good, your contributions will be merged!
|
||||
|
||||
## Dependency Management
|
||||
|
||||
This project uses [uv](https://docs.astral.sh/uv/) for dependency management. The `uv.lock` file is committed to ensure reproducible builds.
|
||||
|
||||
### Adding or Updating Dependencies
|
||||
|
||||
1. Edit `pyproject.toml` to add/update dependencies
|
||||
2. Run `uv lock` to update the lockfile with new dependency resolution
|
||||
3. Run `uv sync` to install the updated dependencies locally
|
||||
4. Always commit both files together:
|
||||
```bash
|
||||
git add pyproject.toml uv.lock
|
||||
git commit -m "feat: add new dependency for feature X"
|
||||
```
|
||||
|
||||
**Important:** Never manually edit `uv.lock`. It's auto-generated by `uv lock`.
|
||||
|
||||
## Code Style and Documentation
|
||||
|
||||
### Python Code Style
|
||||
|
||||
@@ -54,7 +54,7 @@ You can connect to Pipecat from any platform using our official SDKs:
|
||||
| Category | Services |
|
||||
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
|
||||
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
|
||||
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) |
|
||||
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
|
||||
@@ -114,7 +114,8 @@ You can get started with Pipecat running on your local machine, then move your a
|
||||
|
||||
### Prerequisites
|
||||
|
||||
**Python Version:** 3.10+
|
||||
**Minimum Python Version:** 3.10
|
||||
**Recommended Python Version:** 3.12
|
||||
|
||||
### Setup Steps
|
||||
|
||||
|
||||
@@ -1,10 +0,0 @@
|
||||
# Pipecat Docs
|
||||
|
||||
## [Architecture Overview](architecture.md)
|
||||
|
||||
Learn about the thinking behind the framework's design.
|
||||
|
||||
## [A Frame's Progress](frame-progress.md)
|
||||
|
||||
See how a Frame is processed through a Transport, a Pipeline, and a series of Frame Processors.
|
||||
|
||||
@@ -1,17 +0,0 @@
|
||||
# Pipecat architecture guide
|
||||
|
||||
## Frames
|
||||
|
||||
Frames can represent discrete chunks of data, for instance a chunk of text, a chunk of audio, or an image. They can also be used to as control flow, for instance a frame that indicates that there is no more data available, or that a user started or stopped talking. They can also represent more complex data structures, such as a message array used for an LLM completion.
|
||||
|
||||
## FrameProcessors
|
||||
|
||||
Frame processors operate on frames. Every frame processor implements a `process_frame` method that consumes one frame and produces zero or more frames. Frame processors can do simple transforms, such as concatenating text fragments into sentences, or they can treat frames as input for an AI Service, and emit chat completions based on message arrays or transform text into audio or images.
|
||||
|
||||
## Pipelines
|
||||
|
||||
Pipelines are lists of frame processors linked together. Frame processors can push frames upstream or downstream to their peers. A very simple pipeline might chain an LLM frame processor to a text-to-speech frame processor, with a transport as an output.
|
||||
|
||||
## Transports
|
||||
|
||||
Transports provide input and output frame processors to receive or send frames respectively. For example, the `DailyTransport` does this with a WebRTC session joined to a Daily.co room.
|
||||
@@ -1,46 +0,0 @@
|
||||
# A Frame's Progress
|
||||
|
||||
1. A user says “Hello, LLM” and the cloud transcription service delivers a transcription to the Transport.
|
||||

|
||||
|
||||
2. The Transport places a Transcription frame in the Pipeline’s source queue.
|
||||

|
||||
|
||||
3. The Pipeline passes the Transcription frame to the first Frame Processor in its list, the LLM User Message Aggregator.
|
||||

|
||||
|
||||
4. The LLM User Message Aggregator updates the LLM Context with a `{“user”: “Hello LLM”}` message.
|
||||

|
||||
|
||||
5. The LLM User Message Aggregator yields an LLM Message Frame, containing the updated LLM Context. The Pipeline passes this frame to the LLM Frame Processor.
|
||||

|
||||
|
||||
6. The LLM Frame Processor creates a streaming chat completion based on the LLM context and yields the first chunk of a response, Text Frame with the value “Hi, “. The Pipeline passes this frame to the TTS Frame Processor. The TTS Frame Processor aggregates this response but doesn’t yield anything, yet, because it’s waiting for a full sentence.
|
||||

|
||||
|
||||
7. The LLM Frame Processor yields another Text Frame with the value “there.”. The Pipeline passes this frame to the TTS Frame Processor.
|
||||

|
||||
|
||||
8. The TTS Frame Processor now has a full sentence, so it starts streaming audio based on “Hi, there.” It yields the first chunk of streaming audio as an Audio frame, which the Pipeline passes to the LLM Assistant Message Aggregator.
|
||||

|
||||
|
||||
9. The LLM Assistant Message Aggregator doesn’t do anything with Audio frames, so it immediately yields the frame, unchanged. This is the convention for all Frame Processors: frames that the processor doesn’t process should be immediately yielded.
|
||||

|
||||
|
||||
10. The Pipeline places the first Audio frame in its sink queue, which is being watched by the Transport. Since the frame is now in a queue, the Pipeline can continue processing other frames. Note that the source and sink queues form a sort of “boundary of concurrent processing” between a Pipeline and the outside world. In a Pipeline, Frames are processed sequentially; once a Frame is on a queue it can be processed in parallel with the frames being processed by the Pipeline. TODO: link to a more in-depth section about this.
|
||||

|
||||
|
||||
11. The TTS Frame Processor yields another Audio frame as the Transport transmits the first Audio frame.
|
||||

|
||||
|
||||
12. As before, the LLM Assistant Message Aggregator immediately yields the Audio frame and the Pipeline places the Audio frame in the sink queue.
|
||||

|
||||
|
||||
13. The TTS Frame Processor has no more frames to yield. The LLM Frame Processor emits an LLM Response End Frame, which the Pipeline passes to the TTS Frame Processor.
|
||||

|
||||
|
||||
14. The TTS Frame Processor immediately yields the LLM Response End Frame, so the Pipeline passes it along to the LLM Assistant Message Aggregator. The LLM Assistant Message Aggregator updates the LLM Context with the full response from the LLM. TODO TODO: I realized I forgot that the TSS Frame Processor also yields the Text frames that the LLM emitted so that the LLM Assistant Message Aggregator could accumulate them, arrggh.
|
||||

|
||||
|
||||
15. The system is quiet, and waiting for the next message from the Transport.
|
||||

|
||||
110
docs/frame.md
@@ -1,110 +0,0 @@
|
||||
# Understanding Different Frame Types in the Pipecat System
|
||||
|
||||
In the Pipecat system, frames are used to represent different types of data and control signals that flow through the pipeline. Understanding these frame types is crucial for working with the system effectively. This tutorial will cover the main categories of frames and their specific uses.
|
||||
|
||||
## 1. Base Frame Classes
|
||||
|
||||
### Frame
|
||||
The `Frame` class is the base class for all frames. It includes:
|
||||
- `id`: A unique identifier
|
||||
- `name`: A descriptive name
|
||||
- `pts`: Presentation timestamp (optional)
|
||||
|
||||
### DataFrame
|
||||
`DataFrame` is a subclass of `Frame` and serves as a base for most data-carrying frames.
|
||||
|
||||
## 2. Audio Frames
|
||||
|
||||
### AudioRawFrame
|
||||
Represents a chunk of audio with properties:
|
||||
- `audio`: Raw audio data
|
||||
- `sample_rate`: Audio sample rate
|
||||
- `num_channels`: Number of audio channels
|
||||
|
||||
Subclasses include:
|
||||
- `InputAudioRawFrame`: For audio from input sources
|
||||
- `OutputAudioRawFrame`: For audio to be played by output devices
|
||||
- `TTSAudioRawFrame`: For audio generated by Text-to-Speech services
|
||||
|
||||
## 3. Image Frames
|
||||
|
||||
### ImageRawFrame
|
||||
Represents an image with properties:
|
||||
- `image`: Raw image data
|
||||
- `size`: Image dimensions
|
||||
- `format`: Image format (e.g., JPEG, PNG)
|
||||
|
||||
Subclasses include:
|
||||
- `InputImageRawFrame`: For images from input sources
|
||||
- `OutputImageRawFrame`: For images to be displayed
|
||||
- `UserImageRawFrame`: For images associated with a specific user
|
||||
- `VisionImageRawFrame`: For images with associated text for description
|
||||
- `URLImageRawFrame`: For images with an associated URL
|
||||
|
||||
### SpriteFrame
|
||||
Represents an animated sprite, containing a list of `ImageRawFrame` objects.
|
||||
|
||||
## 4. Text and Transcription Frames
|
||||
|
||||
### TextFrame
|
||||
Represents a chunk of text, used for various purposes in the pipeline.
|
||||
|
||||
### TranscriptionFrame
|
||||
A specialized `TextFrame` for speech transcriptions, including:
|
||||
- `user_id`: ID of the speaking user
|
||||
- `timestamp`: When the transcription was generated
|
||||
- `language`: Detected language of the speech
|
||||
|
||||
### InterimTranscriptionFrame
|
||||
Similar to `TranscriptionFrame`, but for interim (not final) transcriptions.
|
||||
|
||||
## 5. LLM (Language Model) Frames
|
||||
|
||||
### LLMMessagesFrame
|
||||
Contains a list of messages for an LLM service to process.
|
||||
|
||||
### LLMMessagesAppendFrame and LLMMessagesUpdateFrame
|
||||
Used to modify the current context of LLM messages.
|
||||
|
||||
### LLMSetToolsFrame
|
||||
Specifies tools (functions) available for the LLM to use.
|
||||
|
||||
### LLMEnablePromptCachingFrame
|
||||
Controls prompt caching in certain LLMs.
|
||||
|
||||
## 6. System and Control Frames
|
||||
|
||||
### SystemFrame
|
||||
Base class for system-level frames.
|
||||
|
||||
Important system frames include:
|
||||
- `StartFrame`: Initiates a pipeline
|
||||
- `CancelFrame`: Stops a pipeline immediately
|
||||
- `ErrorFrame`: Notifies of errors (with `FatalErrorFrame` for unrecoverable errors)
|
||||
- `EndTaskFrame` and `CancelTaskFrame`: Control pipeline tasks
|
||||
- `StartInterruptionFrame` and `StopInterruptionFrame`: Indicate user speech for interruptions
|
||||
|
||||
### ControlFrame
|
||||
Base class for control-flow frames.
|
||||
|
||||
Notable control frames:
|
||||
- `EndFrame`: Signals the end of a pipeline
|
||||
- `LLMFullResponseStartFrame` and `LLMFullResponseEndFrame`: Bracket LLM responses
|
||||
- `UserStartedSpeakingFrame` and `UserStoppedSpeakingFrame`: Indicate user speech activity
|
||||
- `BotStartedSpeakingFrame` and `BotStoppedSpeakingFrame`: Indicate bot speech activity
|
||||
- `TTSStartedFrame` and `TTSStoppedFrame`: Bracket Text-to-Speech responses
|
||||
|
||||
## 7. Special Purpose Frames
|
||||
|
||||
### MetricsFrame
|
||||
Contains performance metrics data.
|
||||
|
||||
### FunctionCallInProgressFrame and FunctionCallResultFrame
|
||||
Used for handling LLM function (tool) calls.
|
||||
|
||||
### ServiceUpdateSettingsFrame
|
||||
Base class for updating service settings, with specific subclasses for LLM, TTS, and STT services.
|
||||
|
||||
## Conclusion
|
||||
|
||||
Understanding these frame types is essential for working with the Pipecat system. Each frame type serves a specific purpose in the pipeline, whether it's carrying data (like audio or images), controlling the flow of the pipeline, or managing system-level operations. By using the appropriate frame types, you can effectively process and transmit various kinds of information through your pipeline.
|
||||
|
Before Width: | Height: | Size: 98 KiB |
|
Before Width: | Height: | Size: 91 KiB |
|
Before Width: | Height: | Size: 92 KiB |
|
Before Width: | Height: | Size: 92 KiB |
|
Before Width: | Height: | Size: 98 KiB |
|
Before Width: | Height: | Size: 94 KiB |
|
Before Width: | Height: | Size: 94 KiB |
|
Before Width: | Height: | Size: 95 KiB |
|
Before Width: | Height: | Size: 94 KiB |
|
Before Width: | Height: | Size: 96 KiB |
|
Before Width: | Height: | Size: 110 KiB |
|
Before Width: | Height: | Size: 102 KiB |
|
Before Width: | Height: | Size: 111 KiB |
|
Before Width: | Height: | Size: 117 KiB |
|
Before Width: | Height: | Size: 98 KiB |
@@ -59,6 +59,9 @@ GOOGLE_VERTEX_TEST_CREDENTIALS=...
|
||||
LMNT_API_KEY=...
|
||||
LMNT_VOICE_ID=...
|
||||
|
||||
# Perplexity
|
||||
PERPLEXITY_API_KEY=...
|
||||
|
||||
# PlayHT
|
||||
PLAY_HT_USER_ID=...
|
||||
PLAY_HT_API_KEY=...
|
||||
|
||||
163
examples/foundational/07n-interruptible-gemini.py
Normal file
@@ -0,0 +1,163 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""
|
||||
A conversational AI bot using Gemini for both LLM and TTS.
|
||||
|
||||
This example demonstrates how to use Gemini's TTS capabilities with the new
|
||||
GeminiTTSService, which uses Gemini's TTS-specific models instead of Google Cloud TTS.
|
||||
|
||||
Features showcased:
|
||||
- Gemini LLM for conversation
|
||||
- Gemini TTS with natural voice control
|
||||
- Support for different voice personalities
|
||||
- Style and tone control through natural language prompts
|
||||
|
||||
Run with:
|
||||
python examples/foundational/gemini-tts.py
|
||||
|
||||
Make sure to set your environment variables:
|
||||
export GOOGLE_API_KEY=your_api_key_here
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.google.llm import GoogleLLMService
|
||||
from pipecat.services.google.stt import GoogleSTTService
|
||||
from pipecat.services.google.tts import GeminiTTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
|
||||
from pipecat.transports.services.daily import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot with Gemini TTS")
|
||||
|
||||
stt = GoogleSTTService(
|
||||
params=GoogleSTTService.InputParams(languages=Language.EN_US),
|
||||
credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
|
||||
)
|
||||
|
||||
tts = GeminiTTSService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
model="gemini-2.5-flash-preview-tts", # TTS-specific model
|
||||
voice_id="Charon",
|
||||
params=GeminiTTSService.InputParams(language=Language.EN_US),
|
||||
)
|
||||
|
||||
llm = GoogleLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
model="gemini-2.5-flash",
|
||||
)
|
||||
|
||||
# System message that instructs the AI on how to speak
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": """You are a helpful AI assistant in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way.
|
||||
|
||||
IMPORTANT: Since you're using Gemini TTS which supports natural voice control, you can include speaking instructions in your responses. For example:
|
||||
- "Say cheerfully: Welcome to our conversation!"
|
||||
- "Read this in a calm, professional tone: Here are the details you requested."
|
||||
- "Speak in an excited whisper: I have some great news to share!"
|
||||
- "Say slowly and clearly: Let me explain this step by step."
|
||||
|
||||
Feel free to use natural language instructions to control your voice style, tone, pace, and emotion. The TTS system will interpret these instructions and adjust the speech accordingly.
|
||||
|
||||
Your output will be converted to audio, so avoid special characters in your answers. Respond to what the user said in a creative and helpful way.""",
|
||||
},
|
||||
]
|
||||
|
||||
context = OpenAILLMContext(messages)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # Gemini TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation with a styled introduction
|
||||
messages.append(
|
||||
{
|
||||
"role": "system",
|
||||
"content": "Say cheerfully and warmly: Hello! I'm your AI assistant powered by Gemini's new TTS technology. I can speak with different voices, tones, and styles. How can I help you today?",
|
||||
}
|
||||
)
|
||||
await task.queue_frames([context_aggregator.user().get_context_frame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
126
examples/foundational/07z-interruptible-sarvam-http.py
Normal file
@@ -0,0 +1,126 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
|
||||
import aiohttp
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.services.sarvam.tts import SarvamHttpTTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
|
||||
from pipecat.transports.services.daily import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
# Create an HTTP session
|
||||
async with aiohttp.ClientSession() as session:
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = SarvamHttpTTSService(
|
||||
api_key=os.getenv("SARVAM_API_KEY"),
|
||||
aiohttp_session=session,
|
||||
params=SarvamHttpTTSService.InputParams(language=Language.EN),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = OpenAILLMContext(messages)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt,
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([context_aggregator.user().get_context_frame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -5,6 +5,7 @@
|
||||
#
|
||||
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
import aiohttp
|
||||
@@ -12,6 +13,7 @@ from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import TTSUpdateSettingsFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
@@ -21,7 +23,6 @@ from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.services.sarvam.tts import SarvamTTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
|
||||
from pipecat.transports.services.daily import DailyParams
|
||||
@@ -54,64 +55,64 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
# Create an HTTP session
|
||||
async with aiohttp.ClientSession() as session:
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = SarvamTTSService(
|
||||
api_key=os.getenv("SARVAM_API_KEY"),
|
||||
aiohttp_session=session,
|
||||
params=SarvamTTSService.InputParams(language=Language.EN),
|
||||
)
|
||||
tts = SarvamTTSService(
|
||||
api_key=os.getenv("SARVAM_API_KEY"),
|
||||
model="bulbul:v2",
|
||||
voice_id="manisha",
|
||||
)
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
context = OpenAILLMContext(messages)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt,
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
context = OpenAILLMContext(messages)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt,
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([context_aggregator.user().get_context_frame()])
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
# Optionally, you can wait for 30 seconds and then change the voice.
|
||||
# await asyncio.sleep(30)
|
||||
# await task.queue_frame(TTSUpdateSettingsFrame(settings={"voice": "anushka"}))
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([context_aggregator.user().get_context_frame()])
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
|
||||
88
examples/foundational/13j-azure-transcription.py
Normal file
@@ -0,0 +1,88 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import Frame, TranscriptionFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.azure.stt import AzureSTTService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
|
||||
from pipecat.transports.services.daily import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
class TranscriptionLogger(FrameProcessor):
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, TranscriptionFrame):
|
||||
print(f"Transcription: {frame.text}")
|
||||
|
||||
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = AzureSTTService(
|
||||
api_key=os.getenv("AZURE_SPEECH_API_KEY"),
|
||||
region=os.getenv("AZURE_SPEECH_REGION"),
|
||||
)
|
||||
|
||||
tl = TranscriptionLogger()
|
||||
|
||||
pipeline = Pipeline([transport.input(), stt, tl])
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
165
examples/foundational/14w-function-calling-mistral.py
Normal file
@@ -0,0 +1,165 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import TTSSpeakFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.mistral.llm import MistralLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
|
||||
from pipecat.transports.services.daily import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
async def fetch_weather_from_api(params: FunctionCallParams):
|
||||
await params.result_callback({"conditions": "nice", "temperature": "75"})
|
||||
|
||||
|
||||
async def fetch_restaurant_recommendation(params: FunctionCallParams):
|
||||
await params.result_callback({"name": "The Golden Dragon"})
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
llm = MistralLLMService(api_key=os.getenv("MISTRAL_API_KEY"))
|
||||
|
||||
# You can also register a function_name of None to get all functions
|
||||
# sent to the same callback with an additional function_name parameter.
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
|
||||
|
||||
weather_function = FunctionSchema(
|
||||
name="get_current_weather",
|
||||
description="Get the current weather",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
"description": "The temperature unit to use. Infer this from the user's location.",
|
||||
},
|
||||
},
|
||||
required=["location", "format"],
|
||||
)
|
||||
restaurant_function = FunctionSchema(
|
||||
name="get_restaurant_recommendation",
|
||||
description="Get a restaurant recommendation",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
},
|
||||
required=["location"],
|
||||
)
|
||||
tools = ToolsSchema(standard_tools=[weather_function, restaurant_function])
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = OpenAILLMContext(messages, tools)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
context_aggregator.user(),
|
||||
llm,
|
||||
tts,
|
||||
transport.output(),
|
||||
context_aggregator.assistant(),
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
await task.queue_frames([context_aggregator.user().get_context_frame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
170
examples/foundational/14x-function-calling-universal-context.py
Normal file
@@ -0,0 +1,170 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import TTSSpeakFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
|
||||
from pipecat.transports.services.daily import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
async def fetch_weather_from_api(params: FunctionCallParams):
|
||||
await params.result_callback({"conditions": "nice", "temperature": "75"})
|
||||
|
||||
|
||||
async def fetch_restaurant_recommendation(params: FunctionCallParams):
|
||||
await params.result_callback({"name": "The Golden Dragon"})
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
# You can also register a function_name of None to get all functions
|
||||
# sent to the same callback with an additional function_name parameter.
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
|
||||
|
||||
@llm.event_handler("on_function_calls_started")
|
||||
async def on_function_calls_started(service, function_calls):
|
||||
await tts.queue_frame(TTSSpeakFrame("Let me check on that."))
|
||||
|
||||
weather_function = FunctionSchema(
|
||||
name="get_current_weather",
|
||||
description="Get the current weather",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
"description": "The temperature unit to use. Infer this from the user's location.",
|
||||
},
|
||||
},
|
||||
required=["location", "format"],
|
||||
)
|
||||
restaurant_function = FunctionSchema(
|
||||
name="get_restaurant_recommendation",
|
||||
description="Get a restaurant recommendation",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
},
|
||||
required=["location"],
|
||||
)
|
||||
tools = ToolsSchema(standard_tools=[weather_function, restaurant_function])
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
context_aggregator.user(),
|
||||
llm,
|
||||
tts,
|
||||
transport.output(),
|
||||
context_aggregator.assistant(),
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
await task.queue_frames([context_aggregator.user().get_context_frame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -0,0 +1,229 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import TTSSpeakFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import (
|
||||
create_transport,
|
||||
get_transport_client_id,
|
||||
maybe_capture_participant_camera,
|
||||
)
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.google.llm import GoogleLLMService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.services.daily import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# Global variable to store the client ID
|
||||
client_id = ""
|
||||
|
||||
|
||||
async def get_weather(params: FunctionCallParams):
|
||||
location = params.arguments["location"]
|
||||
await params.result_callback(f"The weather in {location} is currently 72 degrees and sunny.")
|
||||
|
||||
|
||||
async def fetch_restaurant_recommendation(params: FunctionCallParams):
|
||||
await params.result_callback({"name": "The Golden Dragon"})
|
||||
|
||||
|
||||
async def get_image(params: FunctionCallParams):
|
||||
question = params.arguments["question"]
|
||||
logger.debug(f"Requesting image with user_id={client_id}, question={question}")
|
||||
|
||||
# Request the image frame
|
||||
await params.llm.request_image_frame(
|
||||
user_id=client_id,
|
||||
function_name=params.function_name,
|
||||
tool_call_id=params.tool_call_id,
|
||||
text_content=question,
|
||||
)
|
||||
|
||||
# Wait a short time for the frame to be processed
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
# Return a result to complete the function call
|
||||
await params.result_callback(
|
||||
f"I've captured an image from your camera and I'm analyzing what you asked about: {question}"
|
||||
)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_in_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_in_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
llm = GoogleLLMService(api_key=os.getenv("GOOGLE_API_KEY"), model="gemini-2.0-flash-001")
|
||||
llm.register_function("get_weather", get_weather)
|
||||
llm.register_function("get_image", get_image)
|
||||
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
|
||||
|
||||
@llm.event_handler("on_function_calls_started")
|
||||
async def on_function_calls_started(service, function_calls):
|
||||
await tts.queue_frame(TTSSpeakFrame("Let me check on that."))
|
||||
|
||||
weather_function = FunctionSchema(
|
||||
name="get_weather",
|
||||
description="Get the current weather",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
"description": "The temperature unit to use. Infer this from the user's location.",
|
||||
},
|
||||
},
|
||||
required=["location", "format"],
|
||||
)
|
||||
restaurant_function = FunctionSchema(
|
||||
name="get_restaurant_recommendation",
|
||||
description="Get a restaurant recommendation",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
},
|
||||
required=["location"],
|
||||
)
|
||||
get_image_function = FunctionSchema(
|
||||
name="get_image",
|
||||
description="Get an image from the video stream.",
|
||||
properties={
|
||||
"question": {
|
||||
"type": "string",
|
||||
"description": "The question that the user is asking about the image.",
|
||||
}
|
||||
},
|
||||
required=["question"],
|
||||
)
|
||||
tools = ToolsSchema(standard_tools=[weather_function, get_image_function, restaurant_function])
|
||||
|
||||
system_prompt = """\
|
||||
You are a helpful assistant who converses with a user and answers questions. Respond concisely to general questions.
|
||||
|
||||
Your response will be turned into speech so use only simple words and punctuation.
|
||||
|
||||
You have access to three tools: get_weather, get_restaurant_recommendation, and get_image.
|
||||
|
||||
You can respond to questions about the weather using the get_weather tool.
|
||||
|
||||
You can answer questions about the user's video stream using the get_image tool. Some examples of phrases that \
|
||||
indicate you should use the get_image tool are:
|
||||
- What do you see?
|
||||
- What's in the video?
|
||||
- Can you describe the video?
|
||||
- Tell me about what you see.
|
||||
- Tell me something interesting about what you see.
|
||||
- What's happening in the video?
|
||||
"""
|
||||
messages = [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": "Say hello."},
|
||||
]
|
||||
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
context_aggregator.user(),
|
||||
llm,
|
||||
tts,
|
||||
transport.output(),
|
||||
context_aggregator.assistant(),
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected: {client}")
|
||||
|
||||
await maybe_capture_participant_camera(transport, client)
|
||||
|
||||
global client_id
|
||||
client_id = get_transport_client_id(transport, client)
|
||||
|
||||
# Kick off the conversation.
|
||||
await task.queue_frames([context_aggregator.user().get_context_frame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -137,7 +137,7 @@ You have access to the following tools:
|
||||
- get_current_weather: Get the current weather for a given location.
|
||||
- get_restaurant_recommendation: Get a restaurant recommendation for a given location.
|
||||
|
||||
Remember, your responses should be short. Just one or two sentences, usually.""",
|
||||
Remember, your responses should be short. Just one or two sentences, usually. Respond in English.""",
|
||||
)
|
||||
|
||||
llm = OpenAIRealtimeBetaLLMService(
|
||||
@@ -158,16 +158,6 @@ Remember, your responses should be short. Just one or two sentences, usually."""
|
||||
# openai WebSocket API can understand.
|
||||
context = OpenAILLMContext(
|
||||
[{"role": "user", "content": "Say hello!"}],
|
||||
# [{"role": "user", "content": [{"type": "text", "text": "Say hello!"}]}],
|
||||
# [
|
||||
# {
|
||||
# "role": "user",
|
||||
# "content": [
|
||||
# {"type": "text", "text": "Say"},
|
||||
# {"type": "text", "text": "yo what's up!"},
|
||||
# ],
|
||||
# }
|
||||
# ],
|
||||
tools,
|
||||
)
|
||||
|
||||
|
||||
@@ -133,7 +133,7 @@ You have access to the following tools:
|
||||
- get_current_weather: Get the current weather for a given location.
|
||||
- get_restaurant_recommendation: Get a restaurant recommendation for a given location.
|
||||
|
||||
Remember, your responses should be short. Just one or two sentences, usually.""",
|
||||
Remember, your responses should be short. Just one or two sentences, usually. Respond in English.""",
|
||||
)
|
||||
|
||||
llm = AzureRealtimeBetaLLMService(
|
||||
|
||||
229
examples/foundational/19b-openai-realtime-beta-text.py
Normal file
@@ -0,0 +1,229 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import TranscriptionMessage
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.transcript_processor import TranscriptProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.openai_realtime_beta import (
|
||||
InputAudioNoiseReduction,
|
||||
InputAudioTranscription,
|
||||
OpenAIRealtimeBetaLLMService,
|
||||
SemanticTurnDetection,
|
||||
SessionProperties,
|
||||
)
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
|
||||
from pipecat.transports.services.daily import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
async def fetch_weather_from_api(params: FunctionCallParams):
|
||||
temperature = 75 if params.arguments["format"] == "fahrenheit" else 24
|
||||
await params.result_callback(
|
||||
{
|
||||
"conditions": "nice",
|
||||
"temperature": temperature,
|
||||
"format": params.arguments["format"],
|
||||
"timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
async def fetch_restaurant_recommendation(params: FunctionCallParams):
|
||||
await params.result_callback({"name": "The Golden Dragon"})
|
||||
|
||||
|
||||
weather_function = FunctionSchema(
|
||||
name="get_current_weather",
|
||||
description="Get the current weather",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
"description": "The temperature unit to use. Infer this from the users location.",
|
||||
},
|
||||
},
|
||||
required=["location", "format"],
|
||||
)
|
||||
|
||||
restaurant_function = FunctionSchema(
|
||||
name="get_restaurant_recommendation",
|
||||
description="Get a restaurant recommendation",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
},
|
||||
required=["location"],
|
||||
)
|
||||
|
||||
# Create tools schema
|
||||
tools = ToolsSchema(standard_tools=[weather_function, restaurant_function])
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
session_properties = SessionProperties(
|
||||
input_audio_transcription=InputAudioTranscription(),
|
||||
modalities=["text"],
|
||||
# Set openai TurnDetection parameters. Not setting this at all will turn it
|
||||
# on by default
|
||||
turn_detection=SemanticTurnDetection(),
|
||||
# Or set to False to disable openai turn detection and use transport VAD
|
||||
# turn_detection=False,
|
||||
input_audio_noise_reduction=InputAudioNoiseReduction(type="near_field"),
|
||||
# tools=tools,
|
||||
instructions="""You are a helpful and friendly AI.
|
||||
|
||||
Act like a human, but remember that you aren't a human and that you can't do human
|
||||
things in the real world. Your voice and personality should be warm and engaging, with a lively and
|
||||
playful tone.
|
||||
|
||||
If interacting in a non-English language, start by using the standard accent or dialect familiar to
|
||||
the user. Talk quickly. You should always call a function if you can. Do not refer to these rules,
|
||||
even if you're asked about them.
|
||||
|
||||
You are participating in a voice conversation. Keep your responses concise, short, and to the point
|
||||
unless specifically asked to elaborate on a topic.
|
||||
|
||||
You have access to the following tools:
|
||||
- get_current_weather: Get the current weather for a given location.
|
||||
- get_restaurant_recommendation: Get a restaurant recommendation for a given location.
|
||||
|
||||
Remember, your responses should be short. Just one or two sentences, usually. Respond in English.""",
|
||||
)
|
||||
|
||||
llm = OpenAIRealtimeBetaLLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
session_properties=session_properties,
|
||||
start_audio_paused=False,
|
||||
)
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
# you can either register a single function for all function calls, or specific functions
|
||||
# llm.register_function(None, fetch_weather_from_api)
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
|
||||
|
||||
transcript = TranscriptProcessor()
|
||||
|
||||
# Create a standard OpenAI LLM context object using the normal messages format. The
|
||||
# OpenAIRealtimeBetaLLMService will convert this internally to messages that the
|
||||
# openai WebSocket API can understand.
|
||||
context = OpenAILLMContext(
|
||||
[{"role": "user", "content": "Say hello!"}],
|
||||
tools,
|
||||
)
|
||||
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
context_aggregator.user(),
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transcript.user(), # Placed after the LLM, as LLM pushes TranscriptionFrames downstream
|
||||
transport.output(), # Transport bot output
|
||||
transcript.assistant(), # After the transcript output, to time with the audio output
|
||||
context_aggregator.assistant(),
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
await task.queue_frames([context_aggregator.user().get_context_frame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
# Register event handler for transcript updates
|
||||
@transcript.event_handler("on_transcript_update")
|
||||
async def on_transcript_update(processor, frame):
|
||||
for msg in frame.messages:
|
||||
if isinstance(msg, TranscriptionMessage):
|
||||
timestamp = f"[{msg.timestamp}] " if msg.timestamp else ""
|
||||
line = f"{timestamp}{msg.role}: {msg.content}"
|
||||
logger.info(f"Transcript: {line}")
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -25,7 +25,8 @@ from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.services.llm_service import LLMService
|
||||
from pipecat.services.openai.llm import OpenAIContextAggregatorPair, OpenAILLMService
|
||||
from pipecat.sync.event_notifier import EventNotifier
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
|
||||
@@ -34,6 +35,76 @@ from pipecat.transports.services.daily import DailyParams
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
class TurnDetectionLLM(Pipeline):
|
||||
def __init__(self, llm: LLMService, context_aggregator: OpenAIContextAggregatorPair):
|
||||
# This is the LLM that will be used to detect if the user has finished a
|
||||
# statement. This doesn't really need to be an LLM, we could use NLP
|
||||
# libraries for that, but it was easier as an example because we
|
||||
# leverage the context aggregators.
|
||||
statement_llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
statement_messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "Determine if the user's statement is a complete sentence or question, ending in a natural pause or punctuation. Return 'YES' if it is complete and 'NO' if it seems to leave a thought unfinished.",
|
||||
},
|
||||
]
|
||||
|
||||
statement_context = OpenAILLMContext(statement_messages)
|
||||
statement_context_aggregator = statement_llm.create_context_aggregator(statement_context)
|
||||
|
||||
# We have instructed the LLM to return 'YES' if it thinks the user
|
||||
# completed a sentence. So, if it's 'YES' we will return true in this
|
||||
# predicate which will wake up the notifier.
|
||||
async def wake_check_filter(frame):
|
||||
logger.debug(f"Completeness check frame: {frame}")
|
||||
return frame.text == "YES"
|
||||
|
||||
# This is a notifier that we use to synchronize the two LLMs.
|
||||
notifier = EventNotifier()
|
||||
|
||||
# This a filter that will wake up the notifier if the given predicate
|
||||
# (wake_check_filter) returns true.
|
||||
completness_check = WakeNotifierFilter(
|
||||
notifier, types=(TextFrame,), filter=wake_check_filter
|
||||
)
|
||||
|
||||
# This processor keeps the last context and will let it through once the
|
||||
# notifier is woken up. We start with the gate open because we send an
|
||||
# initial context frame to start the conversation.
|
||||
gated_context_aggregator = GatedOpenAILLMContextAggregator(
|
||||
notifier=notifier, start_open=True
|
||||
)
|
||||
|
||||
# Notify if the user hasn't said anything.
|
||||
async def user_idle_notifier(frame):
|
||||
await notifier.notify()
|
||||
|
||||
# Sometimes the LLM will fail detecting if a user has completed a
|
||||
# sentence, this will wake up the notifier if that happens.
|
||||
user_idle = UserIdleProcessor(callback=user_idle_notifier, timeout=3.0)
|
||||
|
||||
# The ParallePipeline input are the user transcripts. We have two
|
||||
# contexts. The first one will be used to determine if the user finished
|
||||
# a statement and if so the notifier will be woken up. The second
|
||||
# context is simply the regular context but it's gated waiting for the
|
||||
# notifier to be woken up.
|
||||
super().__init__(
|
||||
[
|
||||
ParallelPipeline(
|
||||
[
|
||||
statement_context_aggregator.user(),
|
||||
statement_llm,
|
||||
completness_check,
|
||||
NullFilter(),
|
||||
],
|
||||
[context_aggregator.user(), gated_context_aggregator, llm],
|
||||
),
|
||||
user_idle,
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
@@ -66,24 +137,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
# This is the LLM that will be used to detect if the user has finished a
|
||||
# statement. This doesn't really need to be an LLM, we could use NLP
|
||||
# libraries for that, but it was easier as an example because we
|
||||
# leverage the context aggregators.
|
||||
statement_llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
statement_messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "Determine if the user's statement is a complete sentence or question, ending in a natural pause or punctuation. Return 'YES' if it is complete and 'NO' if it seems to leave a thought unfinished.",
|
||||
},
|
||||
]
|
||||
|
||||
statement_context = OpenAILLMContext(statement_messages)
|
||||
statement_context_aggregator = statement_llm.create_context_aggregator(statement_context)
|
||||
|
||||
# This is the regular LLM.
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
llm_main = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
messages = [
|
||||
{
|
||||
@@ -93,53 +148,16 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
]
|
||||
|
||||
context = OpenAILLMContext(messages)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context_aggregator = llm_main.create_context_aggregator(context)
|
||||
|
||||
# We have instructed the LLM to return 'YES' if it thinks the user
|
||||
# completed a sentence. So, if it's 'YES' we will return true in this
|
||||
# predicate which will wake up the notifier.
|
||||
async def wake_check_filter(frame):
|
||||
return frame.text == "YES"
|
||||
# LLM + turn detection (with an extra LLM as a judge)
|
||||
llm = TurnDetectionLLM(llm_main, context_aggregator)
|
||||
|
||||
# This is a notifier that we use to synchronize the two LLMs.
|
||||
notifier = EventNotifier()
|
||||
|
||||
# This a filter that will wake up the notifier if the given predicate
|
||||
# (wake_check_filter) returns true.
|
||||
completness_check = WakeNotifierFilter(notifier, types=(TextFrame,), filter=wake_check_filter)
|
||||
|
||||
# This processor keeps the last context and will let it through once the
|
||||
# notifier is woken up. We start with the gate open because we send an
|
||||
# initial context frame to start the conversation.
|
||||
gated_context_aggregator = GatedOpenAILLMContextAggregator(notifier=notifier, start_open=True)
|
||||
|
||||
# Notify if the user hasn't said anything.
|
||||
async def user_idle_notifier(frame):
|
||||
await notifier.notify()
|
||||
|
||||
# Sometimes the LLM will fail detecting if a user has completed a
|
||||
# sentence, this will wake up the notifier if that happens.
|
||||
user_idle = UserIdleProcessor(callback=user_idle_notifier, timeout=3.0)
|
||||
|
||||
# The ParallePipeline input are the user transcripts. We have two
|
||||
# contexts. The first one will be used to determine if the user finished
|
||||
# a statement and if so the notifier will be woken up. The second
|
||||
# context is simply the regular context but it's gated waiting for the
|
||||
# notifier to be woken up.
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt,
|
||||
ParallelPipeline(
|
||||
[
|
||||
statement_context_aggregator.user(),
|
||||
statement_llm,
|
||||
completness_check,
|
||||
NullFilter(),
|
||||
],
|
||||
[context_aggregator.user(), gated_context_aggregator, llm],
|
||||
),
|
||||
user_idle,
|
||||
stt, # STT
|
||||
llm, # LLM with turn detection
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
|
||||
@@ -6,7 +6,6 @@
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import time
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
@@ -44,13 +43,14 @@ from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.llm_service import FunctionCallParams, LLMService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.sync.base_notifier import BaseNotifier
|
||||
from pipecat.sync.event_notifier import EventNotifier
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
|
||||
from pipecat.transports.services.daily import DailyParams
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
@@ -192,6 +192,75 @@ async def fetch_weather_from_api(params: FunctionCallParams):
|
||||
await params.result_callback({"conditions": "nice", "temperature": "75"})
|
||||
|
||||
|
||||
class TurnDetectionLLM(Pipeline):
|
||||
def __init__(self, llm: LLMService):
|
||||
# This is the LLM that will be used to detect if the user has finished a
|
||||
# statement. This doesn't really need to be an LLM, we could use NLP
|
||||
# libraries for that, but we have the machinery to use an LLM, so we
|
||||
# might as well!
|
||||
statement_llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
# We have instructed the LLM to return 'YES' if it thinks the user
|
||||
# completed a sentence. So, if it's 'YES' we will return true in this
|
||||
# predicate which will wake up the notifier.
|
||||
async def wake_check_filter(frame):
|
||||
logger.debug(f"Completeness check frame: {frame}")
|
||||
return frame.text == "YES"
|
||||
|
||||
# This is a notifier that we use to synchronize the two LLMs.
|
||||
notifier = EventNotifier()
|
||||
|
||||
# This turns the LLM context into an inference request to classify the user's speech
|
||||
# as complete or incomplete.
|
||||
statement_judge_context_filter = StatementJudgeContextFilter()
|
||||
|
||||
# This sends a UserStoppedSpeakingFrame and triggers the notifier event
|
||||
completeness_check = CompletenessCheck(notifier=notifier)
|
||||
|
||||
# # Notify if the user hasn't said anything.
|
||||
async def user_idle_notifier(frame):
|
||||
await notifier.notify()
|
||||
|
||||
# Sometimes the LLM will fail detecting if a user has completed a
|
||||
# sentence, this will wake up the notifier if that happens.
|
||||
user_idle = UserIdleProcessor(callback=user_idle_notifier, timeout=5.0)
|
||||
|
||||
# We start with the gate open because we send an initial context frame
|
||||
# to start the conversation.
|
||||
bot_output_gate = OutputGate(notifier=notifier, start_open=True)
|
||||
|
||||
async def pass_only_llm_trigger_frames(frame):
|
||||
return (
|
||||
isinstance(frame, OpenAILLMContextFrame)
|
||||
or isinstance(frame, StartInterruptionFrame)
|
||||
or isinstance(frame, StopInterruptionFrame)
|
||||
or isinstance(frame, FunctionCallInProgressFrame)
|
||||
or isinstance(frame, FunctionCallResultFrame)
|
||||
)
|
||||
|
||||
super().__init__(
|
||||
[
|
||||
ParallelPipeline(
|
||||
[
|
||||
# Ignore everything except an OpenAILLMContextFrame. Pass a specially constructed
|
||||
# simplified context frame to the statement classifier LLM. The only frame this
|
||||
# sub-pipeline will output is a UserStoppedSpeakingFrame.
|
||||
statement_judge_context_filter,
|
||||
statement_llm,
|
||||
completeness_check,
|
||||
],
|
||||
[
|
||||
# Block everything except frames that trigger LLM inference.
|
||||
FunctionFilter(filter=pass_only_llm_trigger_frames),
|
||||
llm,
|
||||
bot_output_gate, # Buffer all llm/tts output until notified.
|
||||
],
|
||||
),
|
||||
user_idle,
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
@@ -224,18 +293,13 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
# This is the LLM that will be used to detect if the user has finished a
|
||||
# statement. This doesn't really need to be an LLM, we could use NLP
|
||||
# libraries for that, but we have the machinery to use an LLM, so we might as well!
|
||||
statement_llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
# This is the regular LLM.
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
llm_main = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
# You can also register a function_name of None to get all functions
|
||||
# sent to the same callback with an additional function_name parameter.
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
llm_main.register_function("get_current_weather", fetch_weather_from_api)
|
||||
|
||||
@llm.event_handler("on_function_calls_started")
|
||||
@llm_main.event_handler("on_function_calls_started")
|
||||
async def on_function_calls_started(service, function_calls):
|
||||
await tts.queue_frame(TTSSpeakFrame("Let me check on that."))
|
||||
|
||||
@@ -272,69 +336,18 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
]
|
||||
|
||||
context = OpenAILLMContext(messages, tools)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context_aggregator = llm_main.create_context_aggregator(context)
|
||||
|
||||
# We have instructed the LLM to return 'YES' if it thinks the user
|
||||
# completed a sentence. So, if it's 'YES' we will return true in this
|
||||
# predicate which will wake up the notifier.
|
||||
async def wake_check_filter(frame):
|
||||
logger.debug(f"Completeness check frame: {frame}")
|
||||
return frame.text == "YES"
|
||||
|
||||
# This is a notifier that we use to synchronize the two LLMs.
|
||||
notifier = EventNotifier()
|
||||
|
||||
# This turns the LLM context into an inference request to classify the user's speech
|
||||
# as complete or incomplete.
|
||||
statement_judge_context_filter = StatementJudgeContextFilter()
|
||||
|
||||
# This sends a UserStoppedSpeakingFrame and triggers the notifier event
|
||||
completeness_check = CompletenessCheck(notifier=notifier)
|
||||
|
||||
# # Notify if the user hasn't said anything.
|
||||
async def user_idle_notifier(frame):
|
||||
await notifier.notify()
|
||||
|
||||
# Sometimes the LLM will fail detecting if a user has completed a
|
||||
# sentence, this will wake up the notifier if that happens.
|
||||
user_idle = UserIdleProcessor(callback=user_idle_notifier, timeout=5.0)
|
||||
|
||||
# We start with the gate open because we send an initial context frame
|
||||
# to start the conversation.
|
||||
bot_output_gate = OutputGate(notifier=notifier, start_open=True)
|
||||
|
||||
async def pass_only_llm_trigger_frames(frame):
|
||||
return (
|
||||
isinstance(frame, OpenAILLMContextFrame)
|
||||
or isinstance(frame, StartInterruptionFrame)
|
||||
or isinstance(frame, StopInterruptionFrame)
|
||||
or isinstance(frame, FunctionCallInProgressFrame)
|
||||
or isinstance(frame, FunctionCallResultFrame)
|
||||
)
|
||||
# LLM + turn detection (with an extra LLM as a judge)
|
||||
llm = TurnDetectionLLM(llm_main)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
context_aggregator.user(),
|
||||
ParallelPipeline(
|
||||
[
|
||||
# Ignore everything except an OpenAILLMContextFrame. Pass a specially constructed
|
||||
# simplified context frame to the statement classifier LLM. The only frame this
|
||||
# sub-pipeline will output is a UserStoppedSpeakingFrame.
|
||||
statement_judge_context_filter,
|
||||
statement_llm,
|
||||
completeness_check,
|
||||
],
|
||||
[
|
||||
# Block everything except frames that trigger LLM inference.
|
||||
FunctionFilter(filter=pass_only_llm_trigger_frames),
|
||||
llm,
|
||||
bot_output_gate, # Buffer all llm/tts output until notified.
|
||||
],
|
||||
),
|
||||
llm,
|
||||
tts,
|
||||
user_idle,
|
||||
transport.output(),
|
||||
context_aggregator.assistant(),
|
||||
]
|
||||
@@ -365,7 +378,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
await task.queue_frames(
|
||||
[
|
||||
UserStartedSpeakingFrame(),
|
||||
TranscriptionFrame(user_id="", timestamp=time.time(), text=message["message"]),
|
||||
TranscriptionFrame(
|
||||
user_id="", timestamp=time_now_iso8601(), text=message["message"]
|
||||
),
|
||||
UserStoppedSpeakingFrame(),
|
||||
]
|
||||
)
|
||||
|
||||
@@ -6,7 +6,6 @@
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import time
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
@@ -45,13 +44,14 @@ from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.anthropic.llm import AnthropicLLMService
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.llm_service import FunctionCallParams, LLMService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.sync.base_notifier import BaseNotifier
|
||||
from pipecat.sync.event_notifier import EventNotifier
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
|
||||
from pipecat.transports.services.daily import DailyParams
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
@@ -391,6 +391,75 @@ class OutputGate(FrameProcessor):
|
||||
break
|
||||
|
||||
|
||||
class TurnDetectionLLM(Pipeline):
|
||||
def __init__(self, llm: LLMService):
|
||||
# This is the LLM that will be used to detect if the user has finished a
|
||||
# statement. This doesn't really need to be an LLM, we could use NLP
|
||||
# libraries for that, but we have the machinery to use an LLM, so we might as well!
|
||||
statement_llm = AnthropicLLMService(api_key=os.getenv("ANTHROPIC_API_KEY"))
|
||||
|
||||
# This is a notifier that we use to synchronize the two LLMs.
|
||||
notifier = EventNotifier()
|
||||
|
||||
# This turns the LLM context into an inference request to classify the user's speech
|
||||
# as complete or incomplete.
|
||||
statement_judge_context_filter = StatementJudgeContextFilter()
|
||||
|
||||
# This sends a UserStoppedSpeakingFrame and triggers the notifier event
|
||||
completeness_check = CompletenessCheck(notifier=notifier)
|
||||
|
||||
# # Notify if the user hasn't said anything.
|
||||
async def user_idle_notifier(frame):
|
||||
await notifier.notify()
|
||||
|
||||
# Sometimes the LLM will fail detecting if a user has completed a
|
||||
# sentence, this will wake up the notifier if that happens.
|
||||
user_idle = UserIdleProcessor(callback=user_idle_notifier, timeout=5.0)
|
||||
|
||||
# We start with the gate open because we send an initial context frame
|
||||
# to start the conversation.
|
||||
bot_output_gate = OutputGate(notifier=notifier, start_open=True)
|
||||
|
||||
async def block_user_stopped_speaking(frame):
|
||||
return not isinstance(frame, UserStoppedSpeakingFrame)
|
||||
|
||||
async def pass_only_llm_trigger_frames(frame):
|
||||
return (
|
||||
isinstance(frame, OpenAILLMContextFrame)
|
||||
or isinstance(frame, StartInterruptionFrame)
|
||||
or isinstance(frame, StopInterruptionFrame)
|
||||
or isinstance(frame, FunctionCallInProgressFrame)
|
||||
or isinstance(frame, FunctionCallResultFrame)
|
||||
)
|
||||
|
||||
super().__init__(
|
||||
[
|
||||
ParallelPipeline(
|
||||
[
|
||||
# Pass everything except UserStoppedSpeaking to the elements after
|
||||
# this ParallelPipeline
|
||||
FunctionFilter(filter=block_user_stopped_speaking),
|
||||
],
|
||||
[
|
||||
# Ignore everything except an OpenAILLMContextFrame. Pass a specially constructed
|
||||
# simplified context frame to the statement classifier LLM. The only frame this
|
||||
# sub-pipeline will output is a UserStoppedSpeakingFrame.
|
||||
statement_judge_context_filter,
|
||||
statement_llm,
|
||||
completeness_check,
|
||||
],
|
||||
[
|
||||
# Block everything except frames that trigger LLM inference.
|
||||
FunctionFilter(filter=pass_only_llm_trigger_frames),
|
||||
llm,
|
||||
bot_output_gate, # Buffer all llm/tts output until notified.
|
||||
],
|
||||
),
|
||||
user_idle,
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
async def fetch_weather_from_api(params: FunctionCallParams):
|
||||
await params.result_callback({"conditions": "nice", "temperature": "75"})
|
||||
|
||||
@@ -427,18 +496,13 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
# This is the LLM that will be used to detect if the user has finished a
|
||||
# statement. This doesn't really need to be an LLM, we could use NLP
|
||||
# libraries for that, but we have the machinery to use an LLM, so we might as well!
|
||||
statement_llm = AnthropicLLMService(api_key=os.getenv("ANTHROPIC_API_KEY"))
|
||||
|
||||
# This is the regular LLM.
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
llm_main = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
# Register a function_name of None to get all functions
|
||||
# sent to the same callback with an additional function_name parameter.
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
llm_main.register_function("get_current_weather", fetch_weather_from_api)
|
||||
|
||||
@llm.event_handler("on_function_calls_started")
|
||||
@llm_main.event_handler("on_function_calls_started")
|
||||
async def on_function_calls_started(service, function_calls):
|
||||
await tts.queue_frame(TTSSpeakFrame("Let me check on that."))
|
||||
|
||||
@@ -475,76 +539,18 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
]
|
||||
|
||||
context = OpenAILLMContext(messages, tools)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context_aggregator = llm_main.create_context_aggregator(context)
|
||||
|
||||
# We have instructed the LLM to return 'YES' if it thinks the user
|
||||
# completed a sentence. So, if it's 'YES' we will return true in this
|
||||
# predicate which will wake up the notifier.
|
||||
async def wake_check_filter(frame):
|
||||
return frame.text == "YES"
|
||||
|
||||
# This is a notifier that we use to synchronize the two LLMs.
|
||||
notifier = EventNotifier()
|
||||
|
||||
# This turns the LLM context into an inference request to classify the user's speech
|
||||
# as complete or incomplete.
|
||||
statement_judge_context_filter = StatementJudgeContextFilter()
|
||||
|
||||
# This sends a UserStoppedSpeakingFrame and triggers the notifier event
|
||||
completeness_check = CompletenessCheck(notifier=notifier)
|
||||
|
||||
# # Notify if the user hasn't said anything.
|
||||
async def user_idle_notifier(frame):
|
||||
await notifier.notify()
|
||||
|
||||
# Sometimes the LLM will fail detecting if a user has completed a
|
||||
# sentence, this will wake up the notifier if that happens.
|
||||
user_idle = UserIdleProcessor(callback=user_idle_notifier, timeout=5.0)
|
||||
|
||||
# We start with the gate open because we send an initial context frame
|
||||
# to start the conversation.
|
||||
bot_output_gate = OutputGate(notifier=notifier, start_open=True)
|
||||
|
||||
async def block_user_stopped_speaking(frame):
|
||||
return not isinstance(frame, UserStoppedSpeakingFrame)
|
||||
|
||||
async def pass_only_llm_trigger_frames(frame):
|
||||
return (
|
||||
isinstance(frame, OpenAILLMContextFrame)
|
||||
or isinstance(frame, StartInterruptionFrame)
|
||||
or isinstance(frame, StopInterruptionFrame)
|
||||
or isinstance(frame, FunctionCallInProgressFrame)
|
||||
or isinstance(frame, FunctionCallResultFrame)
|
||||
)
|
||||
# LLM + turn detection (with an extra LLM as a judge)
|
||||
llm = TurnDetectionLLM(llm_main)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
context_aggregator.user(),
|
||||
ParallelPipeline(
|
||||
[
|
||||
# Pass everything except UserStoppedSpeaking to the elements after
|
||||
# this ParallelPipeline
|
||||
FunctionFilter(filter=block_user_stopped_speaking),
|
||||
],
|
||||
[
|
||||
# Ignore everything except an OpenAILLMContextFrame. Pass a specially constructed
|
||||
# simplified context frame to the statement classifier LLM. The only frame this
|
||||
# sub-pipeline will output is a UserStoppedSpeakingFrame.
|
||||
statement_judge_context_filter,
|
||||
statement_llm,
|
||||
completeness_check,
|
||||
],
|
||||
[
|
||||
# Block everything except frames that trigger LLM inference.
|
||||
FunctionFilter(filter=pass_only_llm_trigger_frames),
|
||||
llm,
|
||||
bot_output_gate, # Buffer all llm/tts output until notified.
|
||||
],
|
||||
),
|
||||
llm,
|
||||
tts,
|
||||
user_idle,
|
||||
transport.output(),
|
||||
context_aggregator.assistant(),
|
||||
]
|
||||
@@ -580,7 +586,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
await task.queue_frames(
|
||||
[
|
||||
UserStartedSpeakingFrame(),
|
||||
TranscriptionFrame(user_id="", timestamp=time.time(), text=message["message"]),
|
||||
TranscriptionFrame(
|
||||
user_id="", timestamp=time_now_iso8601(), text=message["message"]
|
||||
),
|
||||
UserStoppedSpeakingFrame(),
|
||||
]
|
||||
)
|
||||
|
||||
@@ -47,11 +47,13 @@ from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.google.llm import GoogleLLMContext, GoogleLLMService
|
||||
from pipecat.services.llm_service import LLMService
|
||||
from pipecat.sync.base_notifier import BaseNotifier
|
||||
from pipecat.sync.event_notifier import EventNotifier
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
|
||||
from pipecat.transports.services.daily import DailyParams
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
@@ -607,23 +609,90 @@ class OutputGate(FrameProcessor):
|
||||
self._gate_task = None
|
||||
|
||||
async def _gate_task_handler(self):
|
||||
while True:
|
||||
try:
|
||||
await self._notifier.wait()
|
||||
await self._notifier.wait()
|
||||
|
||||
transcription = await self._transcription_buffer.wait_for_transcription() or "-"
|
||||
self._context.add_message(Content(role="user", parts=[Part(text=transcription)]))
|
||||
transcription = await self._transcription_buffer.wait_for_transcription() or "-"
|
||||
self._context.add_message(Content(role="user", parts=[Part(text=transcription)]))
|
||||
|
||||
self.open_gate()
|
||||
for frame, direction in self._frames_buffer:
|
||||
await self.push_frame(frame, direction)
|
||||
self._frames_buffer = []
|
||||
except asyncio.CancelledError:
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"OutputGate error: {e}")
|
||||
raise e
|
||||
break
|
||||
self.open_gate()
|
||||
for frame, direction in self._frames_buffer:
|
||||
await self.push_frame(frame, direction)
|
||||
self._frames_buffer = []
|
||||
|
||||
|
||||
class TurnDetectionLLM(Pipeline):
|
||||
def __init__(self, llm: LLMService, context: OpenAILLMContext):
|
||||
# This is the LLM that will transcribe user speech.
|
||||
tx_llm = GoogleLLMService(
|
||||
name="Transcriber",
|
||||
model=TRANSCRIBER_MODEL,
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
temperature=0.0,
|
||||
system_instruction=transcriber_system_instruction,
|
||||
)
|
||||
|
||||
# This is the LLM that will classify user speech as complete or incomplete.
|
||||
classifier_llm = GoogleLLMService(
|
||||
name="Classifier",
|
||||
model=CLASSIFIER_MODEL,
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
temperature=0.0,
|
||||
system_instruction=classifier_system_instruction,
|
||||
)
|
||||
|
||||
# This is a notifier that we use to synchronize the two LLMs.
|
||||
notifier = EventNotifier()
|
||||
|
||||
# This turns the LLM context into an inference request to classify the user's speech
|
||||
# as complete or incomplete.
|
||||
# statement_judge_context_filter = StatementJudgeAudioContextAccumulator(notifier=notifier)
|
||||
|
||||
audio_accumulater = AudioAccumulator()
|
||||
# This sends a UserStoppedSpeakingFrame and triggers the notifier event
|
||||
completeness_check = CompletenessCheck(
|
||||
notifier=notifier, audio_accumulator=audio_accumulater
|
||||
)
|
||||
|
||||
async def block_user_stopped_speaking(frame):
|
||||
return not isinstance(frame, UserStoppedSpeakingFrame)
|
||||
|
||||
conversation_audio_context_assembler = ConversationAudioContextAssembler(context=context)
|
||||
|
||||
llm_aggregator_buffer = LLMAggregatorBuffer()
|
||||
|
||||
bot_output_gate = OutputGate(
|
||||
notifier=notifier, context=context, llm_transcription_buffer=llm_aggregator_buffer
|
||||
)
|
||||
|
||||
super().__init__(
|
||||
[
|
||||
audio_accumulater,
|
||||
ParallelPipeline(
|
||||
[
|
||||
# Pass everything except UserStoppedSpeaking to the elements after
|
||||
# this ParallelPipeline
|
||||
FunctionFilter(filter=block_user_stopped_speaking),
|
||||
],
|
||||
[
|
||||
ParallelPipeline(
|
||||
[
|
||||
classifier_llm,
|
||||
completeness_check,
|
||||
],
|
||||
[
|
||||
tx_llm,
|
||||
llm_aggregator_buffer,
|
||||
],
|
||||
)
|
||||
],
|
||||
[
|
||||
conversation_audio_context_assembler,
|
||||
llm,
|
||||
bot_output_gate, # buffer output until notified, then flush frames and update context
|
||||
],
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
@@ -656,24 +725,6 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
# This is the LLM that will transcribe user speech.
|
||||
tx_llm = GoogleLLMService(
|
||||
name="Transcriber",
|
||||
model=TRANSCRIBER_MODEL,
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
temperature=0.0,
|
||||
system_instruction=transcriber_system_instruction,
|
||||
)
|
||||
|
||||
# This is the LLM that will classify user speech as complete or incomplete.
|
||||
classifier_llm = GoogleLLMService(
|
||||
name="Classifier",
|
||||
model=CLASSIFIER_MODEL,
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
temperature=0.0,
|
||||
system_instruction=classifier_system_instruction,
|
||||
)
|
||||
|
||||
# This is the regular LLM that responds conversationally.
|
||||
conversation_llm = GoogleLLMService(
|
||||
name="Conversation",
|
||||
@@ -685,57 +736,12 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
context = OpenAILLMContext()
|
||||
context_aggregator = conversation_llm.create_context_aggregator(context)
|
||||
|
||||
# This is a notifier that we use to synchronize the two LLMs.
|
||||
notifier = EventNotifier()
|
||||
|
||||
# This turns the LLM context into an inference request to classify the user's speech
|
||||
# as complete or incomplete.
|
||||
# statement_judge_context_filter = StatementJudgeAudioContextAccumulator(notifier=notifier)
|
||||
|
||||
audio_accumulater = AudioAccumulator()
|
||||
# This sends a UserStoppedSpeakingFrame and triggers the notifier event
|
||||
completeness_check = CompletenessCheck(notifier=notifier, audio_accumulator=audio_accumulater)
|
||||
|
||||
async def block_user_stopped_speaking(frame):
|
||||
return not isinstance(frame, UserStoppedSpeakingFrame)
|
||||
|
||||
conversation_audio_context_assembler = ConversationAudioContextAssembler(context=context)
|
||||
|
||||
llm_aggregator_buffer = LLMAggregatorBuffer()
|
||||
|
||||
bot_output_gate = OutputGate(
|
||||
notifier=notifier, context=context, llm_transcription_buffer=llm_aggregator_buffer
|
||||
)
|
||||
llm = TurnDetectionLLM(conversation_llm, context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
audio_accumulater,
|
||||
ParallelPipeline(
|
||||
[
|
||||
# Pass everything except UserStoppedSpeaking to the elements after
|
||||
# this ParallelPipeline
|
||||
FunctionFilter(filter=block_user_stopped_speaking),
|
||||
],
|
||||
[
|
||||
ParallelPipeline(
|
||||
[
|
||||
classifier_llm,
|
||||
completeness_check,
|
||||
],
|
||||
[
|
||||
tx_llm,
|
||||
llm_aggregator_buffer,
|
||||
],
|
||||
)
|
||||
],
|
||||
[
|
||||
conversation_audio_context_assembler,
|
||||
conversation_llm,
|
||||
bot_output_gate, # buffer output until notified, then flush frames and update context
|
||||
# TempPrinter(),
|
||||
],
|
||||
),
|
||||
llm,
|
||||
tts,
|
||||
transport.output(),
|
||||
context_aggregator.assistant(),
|
||||
@@ -766,7 +772,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
await task.queue_frames(
|
||||
[
|
||||
UserStartedSpeakingFrame(),
|
||||
TranscriptionFrame(user_id="", timestamp=time.time(), text=message["message"]),
|
||||
TranscriptionFrame(
|
||||
user_id="", timestamp=time_now_iso8601(), text=message["message"]
|
||||
),
|
||||
UserStoppedSpeakingFrame(),
|
||||
]
|
||||
)
|
||||
|
||||
139
examples/foundational/44-voicemail-detection.py
Normal file
@@ -0,0 +1,139 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.extensions.voicemail.voicemail_detector import VoicemailDetector
|
||||
from pipecat.frames.frames import EndTaskFrame, TTSSpeakFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
|
||||
from pipecat.transports.services.daily import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
classifier_llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
voicemail = VoicemailDetector(llm=classifier_llm)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = OpenAILLMContext(messages)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
voicemail.detector(), # Voicemail detection — between STT and User context aggregator
|
||||
context_aggregator.user(),
|
||||
llm,
|
||||
tts,
|
||||
voicemail.gate(), # TTS gating — Immediately after the TTS service
|
||||
transport.output(),
|
||||
context_aggregator.assistant(),
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
@voicemail.event_handler("on_voicemail_detected")
|
||||
async def handle_voicemail(processor):
|
||||
logger.info("Voicemail detected! Leaving a message...")
|
||||
|
||||
# Push frames using standard Pipecat pattern
|
||||
await processor.push_frame(
|
||||
TTSSpeakFrame(
|
||||
"Hello, this is Jamie calling about your appointment. Please call me back at 555-0123 when you get this."
|
||||
)
|
||||
)
|
||||
|
||||
# NOTE: A common pattern is to end pipeline after the voicemail is left.
|
||||
# Uncomment the following line to end the pipeline after leaving the voicemail.
|
||||
# await processor.push_frame(EndTaskFrame(), FrameDirection.UPSTREAM)
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -4,40 +4,32 @@ This directory contains examples showing how to build voice and multimodal agent
|
||||
|
||||
## Setup
|
||||
|
||||
1. Make sure you have uv installed:
|
||||
1. Follow the [README](../../README.md#%EF%B8%8F-contributing-to-the-framework) steps to get your local environment configured.
|
||||
|
||||
```bash
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
```
|
||||
> **Run from root directory**: Make sure you are running the steps from the root directory.
|
||||
|
||||
> **Need help?** Refer to the [uv install documentation](https://docs.astral.sh/uv/getting-started/installation/).
|
||||
> **Using local audio?**: The `LocalAudioTransport` requires a system dependency for `portaudio`. Install the dependency to use the transport.
|
||||
|
||||
2. Create a venv and install example dependencies:
|
||||
|
||||
```bash
|
||||
uv sync --all-extras --no-extra krisp
|
||||
```
|
||||
|
||||
3. Create a `.env` file with your API keys:
|
||||
2. Copy the [`env.example`](../../env.example) file and add API keys for services you plan to use:
|
||||
|
||||
```bash
|
||||
cp env.example .env
|
||||
# Edit .env with your API keys
|
||||
```
|
||||
|
||||
4. Navigate to the examples directory:
|
||||
3. Navigate to the examples directory if you aren't already there:
|
||||
|
||||
```bash
|
||||
cd examples/foundational
|
||||
```
|
||||
|
||||
5. Run any example:
|
||||
4. Run any example:
|
||||
|
||||
```bash
|
||||
uv run python 01-say-one-thing.py
|
||||
```
|
||||
|
||||
6. Open the web interface at http://localhost:7860/client/ and click "Connect"
|
||||
5. Open the web interface at http://localhost:7860/client/ and click "Connect"
|
||||
|
||||
## Running examples with other transports
|
||||
|
||||
|
||||
16
examples/quickstart/Dockerfile
Normal file
@@ -0,0 +1,16 @@
|
||||
FROM dailyco/pipecat-base:latest
|
||||
|
||||
# Enable bytecode compilation
|
||||
ENV UV_COMPILE_BYTECODE=1
|
||||
|
||||
# Copy from the cache instead of linking since it's a mounted volume
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
# Install the project's dependencies using the lockfile and settings
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,source=uv.lock,target=uv.lock \
|
||||
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
|
||||
uv sync --locked --no-install-project --no-dev
|
||||
|
||||
# Copy the application code
|
||||
COPY ./bot.py bot.py
|
||||
@@ -1,87 +1,159 @@
|
||||
# Pipecat Quickstart
|
||||
|
||||
Run your first Pipecat bot in under 5 minutes. This example creates a voice AI bot that you can talk to in your browser.
|
||||
Build and deploy your first voice AI bot in under 10 minutes. Develop locally, then scale to production on Pipecat Cloud.
|
||||
|
||||
## Prerequisites
|
||||
**Two steps**: [🏠 Local Development](#run-your-bot-locally) → [☁️ Production Deployment](#deploy-to-production)
|
||||
|
||||
### Python 3.10+
|
||||
> 🎯 Quick start: Local bot in 5 minutes, production deployment in 5 more
|
||||
|
||||
Pipecat requires Python 3.10 or newer. Check your version:
|
||||
## Step 1: Local Development (5 min)
|
||||
|
||||
```bash
|
||||
python --version
|
||||
```
|
||||
### Prerequisites
|
||||
|
||||
If you need to upgrade Python, we recommend using a version manager like `uv` or `pyenv`.
|
||||
#### Environment
|
||||
|
||||
### AI Service API keys
|
||||
- Python 3.10 or later
|
||||
- [uv](https://docs.astral.sh/uv/getting-started/installation/) package manager installed
|
||||
|
||||
Pipecat orchestrates different AI services in a pipeline, ensuring low latency communication. In this quickstart example, we'll use:
|
||||
#### AI Service API keys
|
||||
|
||||
- [Deepgram](https://console.deepgram.com/signup) for Speech-to-Text transcriptions
|
||||
You'll need API keys from three services:
|
||||
|
||||
- [Deepgram](https://console.deepgram.com/signup) for Speech-to-Text
|
||||
- [OpenAI](https://auth.openai.com/create-account) for LLM inference
|
||||
- [Cartesia](https://play.cartesia.ai/sign-up) for Text-to-Speech audio generation
|
||||
- [Cartesia](https://play.cartesia.ai/sign-up) for Text-to-Speech
|
||||
|
||||
Have your API keys ready. We'll add them to your `.env` shortly.
|
||||
> 💡 **Tip**: Sign up for all three now. You'll need them for both local and cloud deployment.
|
||||
|
||||
## Setup
|
||||
### Setup
|
||||
|
||||
1. Set up a virtual environment
|
||||
Navigate to the quickstart directory and set up your environment.
|
||||
|
||||
From the `examples/quickstart` directory, run:
|
||||
1. Install dependencies:
|
||||
|
||||
```bash
|
||||
uv sync
|
||||
```
|
||||
|
||||
2. Configure your API keys:
|
||||
|
||||
Create a `.env` file:
|
||||
|
||||
```bash
|
||||
cp env.example .env
|
||||
```
|
||||
|
||||
Then, add your API keys:
|
||||
|
||||
```ini
|
||||
DEEPGRAM_API_KEY=your_deepgram_api_key
|
||||
OPENAI_API_KEY=your_openai_api_key
|
||||
CARTESIA_API_KEY=your_cartesia_api_key
|
||||
```
|
||||
|
||||
### Run your bot locally
|
||||
|
||||
```bash
|
||||
python -m venv .venv
|
||||
source .venv/bin/activate # On Windows: .venv\Scripts\activate
|
||||
uv run bot.py
|
||||
```
|
||||
|
||||
> Using `uv`? Create your venv using: `uv venv && source .venv/bin/activate`.
|
||||
|
||||
2. Install dependencies
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
> Using `uv`? Install requirements using: `uv pip install -r requirements.txt`.
|
||||
|
||||
3. Configure environment variables
|
||||
|
||||
Create a `.env` file:
|
||||
|
||||
```bash
|
||||
cp env.example .env
|
||||
```
|
||||
|
||||
Then, add your API keys:
|
||||
|
||||
```
|
||||
DEEPGRAM_API_KEY=your_deepgram_api_key
|
||||
OPENAI_API_KEY=your_openai_api_key
|
||||
CARTESIA_API_KEY=your_cartesia_api_key
|
||||
```
|
||||
|
||||
4. Run the example
|
||||
|
||||
Run your bot using:
|
||||
|
||||
```bash
|
||||
python bot.py
|
||||
```
|
||||
|
||||
> Using `uv`? Run your bot using: `uv run bot.py`.
|
||||
|
||||
**Open http://localhost:7860 in your browser** and click `Connect` to start talking to your bot.
|
||||
|
||||
> 💡 First run note: The initial startup may take ~10 seconds as Pipecat downloads required models, like the Silero VAD model.
|
||||
> 💡 First run note: The initial startup may take ~20 seconds as Pipecat downloads required models and imports.
|
||||
|
||||
## Troubleshooting
|
||||
🎉 **Success!** Your bot is running locally. Now let's deploy it to production so others can use it.
|
||||
|
||||
- **Browser permissions**: Make sure to allow microphone access when prompted by your browser.
|
||||
- **Connection issues**: If the WebRTC connection fails, first try a different browser. If that fails, make sure you don't have a VPN or firewall rules blocking traffic. WebRTC uses UDP to communicate.
|
||||
- **Audio issues**: Check that your microphone and speakers are working and not muted.
|
||||
---
|
||||
|
||||
## Next Steps
|
||||
## Step 2: Deploy to Production (5 min)
|
||||
|
||||
- **Read the docs**: Check out [Pipecat's docs](https://docs.pipecat.ai/) for guides and reference information.
|
||||
- **Join Discord**: Join [Pipecat's Discord server](https://discord.gg/pipecat) to get help and learn about what others are building.
|
||||
Transform your local bot into a production-ready service. Pipecat Cloud handles scaling, monitoring, and global deployment.
|
||||
|
||||
### Prerequisites
|
||||
|
||||
1. [Sign up for Pipecat Cloud](https://pipecat.daily.co/sign-up).
|
||||
|
||||
2. Install the Pipecat Cloud CLI:
|
||||
|
||||
```bash
|
||||
uv add pipecatcloud
|
||||
```
|
||||
|
||||
> 💡 Tip: You can run the `pipecatcloud` CLI using the `pcc` alias.
|
||||
|
||||
3. Set up Docker for building your bot image:
|
||||
|
||||
- **Install [Docker](https://www.docker.com/)** on your system
|
||||
- **Create a [Docker Hub](https://hub.docker.com/) account**
|
||||
- **Login to Docker Hub:**
|
||||
|
||||
```bash
|
||||
docker login
|
||||
```
|
||||
|
||||
### Configure your deployment
|
||||
|
||||
The `pcc-deploy.toml` file tells Pipecat Cloud how to run your bot. **Update the image field** with your Docker Hub username by editing `pcc-deploy.toml`.
|
||||
|
||||
```ini
|
||||
agent_name = "quickstart"
|
||||
image = "YOUR_DOCKERHUB_USERNAME/quickstart:0.1" # 👈 Update this line
|
||||
secret_set = "quickstart-secrets"
|
||||
|
||||
[scaling]
|
||||
min_agents = 1
|
||||
```
|
||||
|
||||
**Understanding the TOML file settings:**
|
||||
|
||||
- `agent_name`: Your bot's name in Pipecat Cloud
|
||||
- `image`: The Docker image to deploy (format: `username/image:version`)
|
||||
- `secret_set`: Where your API keys are stored securely
|
||||
- `min_agents`: Number of bot instances to keep ready (1 = instant start)
|
||||
|
||||
> 💡 Tip: [Set up `image_credentials`](https://docs.pipecat.ai/deployment/pipecat-cloud/fundamentals/secrets#image-pull-secrets) in your TOML file for authenticated image pulls
|
||||
|
||||
### Configure secrets
|
||||
|
||||
Upload your API keys to Pipecat Cloud's secure storage:
|
||||
|
||||
```bash
|
||||
uv run pcc secrets set quickstart-secrets --file .env
|
||||
```
|
||||
|
||||
This creates a secret set called `quickstart-secrets` (matching your TOML file) and uploads all your API keys from `.env`.
|
||||
|
||||
### Build and deploy
|
||||
|
||||
Build your Docker image and push to Docker Hub:
|
||||
|
||||
```bash
|
||||
# Update build.sh with your Docker Hub username, then:
|
||||
./build.sh
|
||||
```
|
||||
|
||||
Deploy to Pipecat Cloud:
|
||||
|
||||
```bash
|
||||
uv run pcc deploy
|
||||
```
|
||||
|
||||
### Connect to your agent
|
||||
|
||||
1. Open your [Pipecat Cloud dashboard](https://pipecat.daily.co/)
|
||||
2. Select your `quickstart` agent → **Sandbox**
|
||||
3. Allow microphone access and click **Connect**
|
||||
|
||||
---
|
||||
|
||||
## What's Next?
|
||||
|
||||
**🔧 Customize your bot**: Modify `bot.py` to change personality, add functions, or integrate with your data
|
||||
**📚 Learn more**: Check out [Pipecat's docs](https://docs.pipecat.ai/) for advanced features
|
||||
**💬 Get help**: Join [Pipecat's Discord](https://discord.gg/pipecat) to connect with the community
|
||||
|
||||
### Troubleshooting
|
||||
|
||||
- **Browser permissions**: Allow microphone access when prompted
|
||||
- **Connection issues**: Try a different browser or check VPN/firewall settings
|
||||
- **Audio issues**: Verify microphone and speakers are working and not muted
|
||||
|
||||
@@ -7,18 +7,16 @@
|
||||
"""Pipecat Quickstart Example.
|
||||
|
||||
The example runs a simple voice AI bot that you can connect to using your
|
||||
browser and speak with it.
|
||||
browser and speak with it. You can also deploy this bot to Pipecat Cloud.
|
||||
|
||||
Required AI services:
|
||||
- Deepgram (Speech-to-Text)
|
||||
- OpenAI (LLM)
|
||||
- Cartesia (Text-to-Speech)
|
||||
|
||||
The example connects between client and server using a P2P WebRTC connection.
|
||||
|
||||
Run the bot using::
|
||||
|
||||
python bot.py
|
||||
uv run bot.py
|
||||
"""
|
||||
|
||||
import os
|
||||
@@ -27,7 +25,7 @@ from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
print("🚀 Starting Pipecat bot...")
|
||||
print("⏳ Loading AI models (30-40 seconds first run, <2 seconds after)\n")
|
||||
print("⏳ Loading models and imports (20 seconds first run only)\n")
|
||||
|
||||
logger.info("Loading Silero VAD model...")
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
@@ -40,15 +38,12 @@ from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.frameworks.rtvi import RTVIConfig, RTVIObserver, RTVIProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
|
||||
logger.info("✅ Pipeline components loaded")
|
||||
|
||||
logger.info("Loading WebRTC transport...")
|
||||
from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
|
||||
from pipecat.transports.services.daily import DailyParams
|
||||
|
||||
logger.info("✅ All components loaded successfully!")
|
||||
|
||||
@@ -121,14 +116,20 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point for the bot starter."""
|
||||
|
||||
transport = SmallWebRTCTransport(
|
||||
params=TransportParams(
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
webrtc_connection=runner_args.webrtc_connection,
|
||||
)
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
}
|
||||
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
19
examples/quickstart/build.sh
Executable file
@@ -0,0 +1,19 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
VERSION="0.1"
|
||||
DOCKER_USERNAME="your_username"
|
||||
AGENT_NAME="quickstart"
|
||||
|
||||
# Build the Docker image with the correct context
|
||||
echo "Building Docker image..."
|
||||
docker build --platform=linux/arm64 -t "$DOCKER_USERNAME/$AGENT_NAME:$VERSION" -t "$DOCKER_USERNAME/$AGENT_NAME:latest" .
|
||||
|
||||
# Push the Docker images
|
||||
echo "Pushing Docker image $DOCKER_USERNAME/$AGENT_NAME:$VERSION..."
|
||||
docker push "$DOCKER_USERNAME/$AGENT_NAME:$VERSION"
|
||||
|
||||
echo "Pushing Docker image $DOCKER_USERNAME/$AGENT_NAME:latest..."
|
||||
docker push "$DOCKER_USERNAME/$AGENT_NAME:latest"
|
||||
|
||||
echo "Successfully built and pushed $DOCKER_USERNAME/$AGENT_NAME:$VERSION and $DOCKER_USERNAME/$AGENT_NAME:latest"
|
||||
@@ -1,3 +1,6 @@
|
||||
DEEPGRAM_API_KEY=your_deepgram_api_key
|
||||
OPENAI_API_KEY=your_openai_api_key
|
||||
CARTESIA_API_KEY=your_cartesia_api_key
|
||||
CARTESIA_API_KEY=your_cartesia_api_key
|
||||
|
||||
# Optional: Connect via Daily WebRTC locally
|
||||
DAILY_API_KEY=your_daily_api_key
|
||||
6
examples/quickstart/pcc-deploy.toml
Normal file
@@ -0,0 +1,6 @@
|
||||
agent_name = "quickstart"
|
||||
image = "your_username/quickstart:0.1"
|
||||
secret_set = "quickstart-secrets"
|
||||
|
||||
[scaling]
|
||||
min_agents = 1
|
||||
19
examples/quickstart/pyproject.toml
Normal file
@@ -0,0 +1,19 @@
|
||||
[project]
|
||||
name = "pipecat-quickstart"
|
||||
version = "0.1.0"
|
||||
description = "Quickstart example for building voice AI bots with Pipecat"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"pipecat-ai[webrtc,daily,silero,deepgram,openai,cartesia,runner]>=0.0.79",
|
||||
"pipecatcloud>=0.2.3"
|
||||
]
|
||||
|
||||
[dependency-groups]
|
||||
dev = [
|
||||
"ruff~=0.12.1",
|
||||
]
|
||||
|
||||
[tool.ruff]
|
||||
line-length = 100
|
||||
[tool.ruff.lint]
|
||||
select = ["I"]
|
||||
@@ -1 +0,0 @@
|
||||
pipecat-ai[webrtc,silero,deepgram,openai,cartesia,runner]>=0.0.77
|
||||
3148
examples/quickstart/uv.lock
generated
Normal file
@@ -36,6 +36,7 @@ dependencies = [
|
||||
"openai>=1.74.0,<=1.99.1",
|
||||
# Pinning numba to resolve package dependencies
|
||||
"numba==0.61.2",
|
||||
"wait_for2>=0.4.1; python_version<'3.12'",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
@@ -52,7 +53,7 @@ azure = [ "azure-cognitiveservices-speech~=1.42.0"]
|
||||
cartesia = [ "cartesia~=2.0.3", "websockets>=13.1,<15.0" ]
|
||||
cerebras = []
|
||||
deepseek = []
|
||||
daily = [ "daily-python~=0.19.6" ]
|
||||
daily = [ "daily-python~=0.19.7" ]
|
||||
deepgram = [ "deepgram-sdk~=4.7.0" ]
|
||||
elevenlabs = [ "websockets>=13.1,<15.0" ]
|
||||
fal = [ "fal-client~=0.5.9" ]
|
||||
@@ -73,8 +74,9 @@ lmnt = [ "websockets>=13.1,<15.0" ]
|
||||
local = [ "pyaudio~=0.2.14" ]
|
||||
mcp = [ "mcp[cli]~=1.9.4" ]
|
||||
mem0 = [ "mem0ai~=0.1.94" ]
|
||||
mistral = []
|
||||
mlx-whisper = [ "mlx-whisper~=0.4.2" ]
|
||||
moondream = [ "einops~=0.8.0", "timm~=1.0.13", "transformers>=4.48.0" ]
|
||||
moondream = [ "accelerate~=1.10.0", "einops~=0.8.0", "pyvips[binary]~=3.0.0", "timm~=1.0.13", "transformers>=4.48.0" ]
|
||||
nim = []
|
||||
neuphonic = [ "websockets>=13.1,<15.0" ]
|
||||
noisereduce = [ "noisereduce~=3.0.3" ]
|
||||
@@ -82,12 +84,13 @@ openai = [ "websockets>=13.1,<15.0" ]
|
||||
openpipe = [ "openpipe~=4.50.0" ]
|
||||
openrouter = []
|
||||
perplexity = []
|
||||
playht = [ "pyht>=0.1.6", "websockets>=13.1,<15.0" ]
|
||||
playht = [ "websockets>=13.1,<15.0" ]
|
||||
qwen = []
|
||||
rime = [ "websockets>=13.1,<15.0" ]
|
||||
riva = [ "nvidia-riva-client~=2.21.1" ]
|
||||
runner = [ "python-dotenv>=1.0.0,<2.0.0", "uvicorn>=0.32.0,<1.0.0", "fastapi>=0.115.6,<0.117.0", "pipecat-ai-small-webrtc-prebuilt>=1.0.0"]
|
||||
sambanova = []
|
||||
sarvam = [ "websockets>=13.1,<15.0" ]
|
||||
sentry = [ "sentry-sdk~=2.23.1" ]
|
||||
local-smart-turn = [ "coremltools>=8.0", "transformers", "torch>=2.5.0,<3", "torchaudio>=2.5.0,<3" ]
|
||||
remote-smart-turn = []
|
||||
@@ -113,7 +116,7 @@ dev = [
|
||||
"pre-commit~=4.2.0",
|
||||
"pyright~=1.1.402",
|
||||
"pytest~=8.4.1",
|
||||
"pytest-asyncio~=1.0.0",
|
||||
"pytest-asyncio~=1.1.0",
|
||||
"pytest-aiohttp==1.1.0",
|
||||
"ruff~=0.12.1",
|
||||
"setuptools~=78.1.1",
|
||||
@@ -124,7 +127,7 @@ dev = [
|
||||
docs = [
|
||||
"sphinx>=8.1.3",
|
||||
"sphinx-rtd-theme",
|
||||
"sphinx-markdown-builder",
|
||||
"sphinx-markdown-builder",
|
||||
"sphinx-autodoc-typehints",
|
||||
"toml",
|
||||
]
|
||||
|
||||
BIN
scripts/evals/assets/cat.jpg
Normal file
|
After Width: | Height: | Size: 63 KiB |
@@ -4,7 +4,6 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import io
|
||||
import os
|
||||
@@ -13,11 +12,12 @@ import time
|
||||
import wave
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import aiofiles
|
||||
from deepgram import LiveOptions
|
||||
from loguru import logger
|
||||
from PIL.ImageFile import ImageFile
|
||||
from utils import (
|
||||
EvalResult,
|
||||
load_module_from_path,
|
||||
@@ -30,7 +30,7 @@ from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import EndTaskFrame
|
||||
from pipecat.frames.frames import EndTaskFrame, OutputImageRawFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
@@ -49,6 +49,8 @@ SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
PIPELINE_IDLE_TIMEOUT_SECS = 60
|
||||
EVAL_TIMEOUT_SECS = 90
|
||||
|
||||
EvalPrompt = str | Tuple[str, ImageFile]
|
||||
|
||||
|
||||
class EvalRunner:
|
||||
def __init__(
|
||||
@@ -87,7 +89,13 @@ class EvalRunner:
|
||||
async def assert_eval_false(self):
|
||||
await self._queue.put(False)
|
||||
|
||||
async def run_eval(self, example_file: str, prompt: str, eval: Optional[str] = None):
|
||||
async def run_eval(
|
||||
self,
|
||||
example_file: str,
|
||||
prompt: EvalPrompt,
|
||||
eval: str,
|
||||
user_speaks_first: bool = False,
|
||||
):
|
||||
if not re.match(self._pattern, example_file):
|
||||
return
|
||||
|
||||
@@ -104,7 +112,9 @@ class EvalRunner:
|
||||
try:
|
||||
tasks = [
|
||||
asyncio.create_task(run_example_pipeline(script_path)),
|
||||
asyncio.create_task(run_eval_pipeline(self, example_file, prompt, eval)),
|
||||
asyncio.create_task(
|
||||
run_eval_pipeline(self, example_file, prompt, eval, user_speaks_first)
|
||||
),
|
||||
]
|
||||
_, pending = await asyncio.wait(tasks, timeout=EVAL_TIMEOUT_SECS)
|
||||
if pending:
|
||||
@@ -178,6 +188,7 @@ async def run_example_pipeline(script_path: Path):
|
||||
DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_in_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
)
|
||||
@@ -189,7 +200,11 @@ async def run_example_pipeline(script_path: Path):
|
||||
|
||||
|
||||
async def run_eval_pipeline(
|
||||
eval_runner: EvalRunner, example_file: str, prompt: str, eval: Optional[str]
|
||||
eval_runner: EvalRunner,
|
||||
example_file: str,
|
||||
prompt: EvalPrompt,
|
||||
eval: str,
|
||||
user_speaks_first: bool = False,
|
||||
):
|
||||
logger.info(f"Starting eval bot")
|
||||
|
||||
@@ -202,6 +217,7 @@ async def run_eval_pipeline(
|
||||
DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=2.0)),
|
||||
),
|
||||
)
|
||||
@@ -210,12 +226,15 @@ async def run_eval_pipeline(
|
||||
# 5" (in audio) this can be converted to "32 is 5".
|
||||
stt = DeepgramSTTService(
|
||||
api_key=os.getenv("DEEPGRAM_API_KEY"),
|
||||
live_options=LiveOptions(smart_format=False),
|
||||
live_options=LiveOptions(
|
||||
language="multi",
|
||||
smart_format=False,
|
||||
),
|
||||
)
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
voice_id="97f4b8fb-f2fe-444b-bb9a-c109783a857a", # Nathan
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
@@ -239,15 +258,25 @@ async def run_eval_pipeline(
|
||||
)
|
||||
tools = ToolsSchema(standard_tools=[eval_function])
|
||||
|
||||
# See if we need to include an eval prompt.
|
||||
eval_prompt = ""
|
||||
if eval:
|
||||
eval_prompt = f"The answer is correct if the user says [{eval}]."
|
||||
# Load example prompt depending on image.
|
||||
example_prompt = ""
|
||||
example_image: Optional[ImageFile] = None
|
||||
if isinstance(prompt, str):
|
||||
example_prompt = prompt
|
||||
elif isinstance(prompt, tuple):
|
||||
example_prompt, example_image = prompt
|
||||
|
||||
eval_prompt = f"The answer is correct if it's appropriate for the context and matches: {eval}."
|
||||
common_system_prompt = f"Call the eval function with your assessment only if the user answers the question. {eval_prompt}"
|
||||
if user_speaks_first:
|
||||
system_prompt = f"You are an LLM eval, be extremly brief. You will start the conversation by saying: '{example_prompt}'. {common_system_prompt}"
|
||||
else:
|
||||
system_prompt = f"You are an LLM eval, be extremly brief. Your goal is to first ask one question: {example_prompt}. {common_system_prompt}"
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": f"You are an LLM eval, be extremly brief. Your goal is to only ask one question: {prompt}. Call the eval function only if the user answers the question and check if the answer is correct (words as numbers are valid). {eval_prompt}",
|
||||
"content": system_prompt,
|
||||
},
|
||||
]
|
||||
|
||||
@@ -285,8 +314,24 @@ async def run_eval_pipeline(
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
if example_image:
|
||||
await task.queue_frame(
|
||||
OutputImageRawFrame(
|
||||
image=example_image.tobytes(),
|
||||
size=example_image.size,
|
||||
format="RGB",
|
||||
)
|
||||
)
|
||||
await audio_buffer.start_recording()
|
||||
|
||||
# Default behavior is for the bot to speak first
|
||||
# If the eval bot speaks first, we append the prompt to the messages
|
||||
if user_speaks_first:
|
||||
messages.append(
|
||||
{"role": "user", "content": f"Start by saying this exactly: '{prompt}'"}
|
||||
)
|
||||
await task.queue_frames([context_aggregator.user().get_context_frame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
@@ -296,6 +341,8 @@ async def run_eval_pipeline(
|
||||
async def on_pipeline_idle_timeout(task):
|
||||
await eval_runner.assert_eval_false()
|
||||
|
||||
runner = PipelineRunner()
|
||||
# TODO(aleix): We should handle SIGINT and SIGTERM so we can cancel both the
|
||||
# eval and the example.
|
||||
runner = PipelineRunner(handle_sigint=False)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
@@ -13,17 +13,24 @@ from pathlib import Path
|
||||
from dotenv import load_dotenv
|
||||
from eval import EvalRunner
|
||||
from loguru import logger
|
||||
from PIL import Image
|
||||
from utils import check_env_variables
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
|
||||
ASSETS_DIR = SCRIPT_DIR / "assets"
|
||||
|
||||
FOUNDATIONAL_DIR = SCRIPT_DIR.parent.parent / "examples" / "foundational"
|
||||
|
||||
# Speaking order constants
|
||||
USER_SPEAKS_FIRST = True
|
||||
BOT_SPEAKS_FIRST = False
|
||||
|
||||
# Math
|
||||
PROMPT_SIMPLE_MATH = "A simple math addition."
|
||||
EVAL_SIMPLE_MATH = "Correct math addition."
|
||||
|
||||
# Weather
|
||||
PROMPT_WEATHER = "What's the weather in San Francisco?"
|
||||
@@ -35,112 +42,173 @@ EVAL_WEATHER = (
|
||||
PROMPT_ONLINE_SEARCH = "What's the date right now in London?"
|
||||
EVAL_ONLINE_SEARCH = f"Today is {datetime.now(timezone.utc).strftime('%B %d, %Y')}."
|
||||
|
||||
# Switch language
|
||||
PROMPT_SWITCH_LANGUAGE = "Say something in Spanish."
|
||||
EVAL_SWITCH_LANGUAGE = "The user is now talking in Spanish."
|
||||
|
||||
# Vision
|
||||
PROMPT_VISION = ("What do you see?", Image.open(ASSETS_DIR / "cat.jpg"))
|
||||
EVAL_VISION = "A cat description."
|
||||
|
||||
# Voicemail
|
||||
PROMPT_VOICEMAIL = "Please leave a message after the beep."
|
||||
EVAL_VOICEMAIL = "Assess the conversation and determine if it is a voicemail."
|
||||
PROMPT_CONVERSATION = "Hello, this is Mark."
|
||||
EVAL_CONVERSATION = "A start of a conversation, not a voicemail."
|
||||
|
||||
TESTS_07 = [
|
||||
# 07 series
|
||||
("07-interruptible.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07-interruptible-cartesia-http.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07a-interruptible-speechmatics.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07aa-interruptible-soniox.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07ab-interruptible-inworld-http.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07ac-interruptible-asyncai.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07ac-interruptible-asyncai-http.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07b-interruptible-langchain.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07c-interruptible-deepgram.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07d-interruptible-elevenlabs.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07d-interruptible-elevenlabs-http.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07e-interruptible-playht.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07e-interruptible-playht-http.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07f-interruptible-azure.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07g-interruptible-openai.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07h-interruptible-openpipe.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07j-interruptible-gladia.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07k-interruptible-lmnt.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07l-interruptible-groq.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07m-interruptible-aws.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07n-interruptible-google.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07o-interruptible-assemblyai.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07q-interruptible-rime.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07q-interruptible-rime-http.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07r-interruptible-riva-nim.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07s-interruptible-google-audio-in.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07t-interruptible-fish.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07v-interruptible-neuphonic.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07v-interruptible-neuphonic-http.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07w-interruptible-fal.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07y-interruptible-minimax.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07z-interruptible-sarvam.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07-interruptible.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07-interruptible-cartesia-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07a-interruptible-speechmatics.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07aa-interruptible-soniox.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07ab-interruptible-inworld-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07ac-interruptible-asyncai.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07ac-interruptible-asyncai-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07b-interruptible-langchain.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07c-interruptible-deepgram.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07d-interruptible-elevenlabs.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
(
|
||||
"07d-interruptible-elevenlabs-http.py",
|
||||
PROMPT_SIMPLE_MATH,
|
||||
EVAL_SIMPLE_MATH,
|
||||
BOT_SPEAKS_FIRST,
|
||||
),
|
||||
("07e-interruptible-playht.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07e-interruptible-playht-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07f-interruptible-azure.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07g-interruptible-openai.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07h-interruptible-openpipe.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07j-interruptible-gladia.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07k-interruptible-lmnt.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07l-interruptible-groq.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07m-interruptible-aws.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07n-interruptible-gemini.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07n-interruptible-google.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07o-interruptible-assemblyai.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07q-interruptible-rime.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07q-interruptible-rime-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07r-interruptible-riva-nim.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
(
|
||||
"07s-interruptible-google-audio-in.py",
|
||||
PROMPT_SIMPLE_MATH,
|
||||
EVAL_SIMPLE_MATH,
|
||||
BOT_SPEAKS_FIRST,
|
||||
),
|
||||
("07t-interruptible-fish.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07v-interruptible-neuphonic.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07v-interruptible-neuphonic-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07w-interruptible-fal.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07y-interruptible-minimax.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07z-interruptible-sarvam.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
# Needs a local XTTS docker instance running.
|
||||
# ("07i-interruptible-xtts.py", PROMPT_SIMPLE_MATH, None),
|
||||
# ("07i-interruptible-xtts.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
# Needs a Krisp license.
|
||||
# ("07p-interruptible-krisp.py", PROMPT_SIMPLE_MATH, None),
|
||||
# ("07p-interruptible-krisp.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
# Needs GPU resources.
|
||||
# ("07u-interruptible-ultravox.py", PROMPT_SIMPLE_MATH, None),
|
||||
# ("07u-interruptible-ultravox.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
]
|
||||
|
||||
TESTS_12 = [
|
||||
("12-describe-video.py", PROMPT_VISION, EVAL_VISION, BOT_SPEAKS_FIRST),
|
||||
("12a-describe-video-gemini-flash.py", PROMPT_VISION, EVAL_VISION, BOT_SPEAKS_FIRST),
|
||||
("12b-describe-video-gpt-4o.py", PROMPT_VISION, EVAL_VISION, BOT_SPEAKS_FIRST),
|
||||
("12c-describe-video-anthropic.py", PROMPT_VISION, EVAL_VISION, BOT_SPEAKS_FIRST),
|
||||
]
|
||||
|
||||
TESTS_14 = [
|
||||
("14-function-calling.py", PROMPT_WEATHER, EVAL_WEATHER),
|
||||
("14a-function-calling-anthropic.py", PROMPT_WEATHER, EVAL_WEATHER),
|
||||
("14b-function-calling-anthropic-video.py", PROMPT_WEATHER, EVAL_WEATHER),
|
||||
("14d-function-calling-video.py", PROMPT_WEATHER, EVAL_WEATHER),
|
||||
("14e-function-calling-google.py", PROMPT_WEATHER, EVAL_WEATHER),
|
||||
("14f-function-calling-groq.py", PROMPT_WEATHER, EVAL_WEATHER),
|
||||
("14g-function-calling-grok.py", PROMPT_WEATHER, EVAL_WEATHER),
|
||||
("14h-function-calling-azure.py", PROMPT_WEATHER, EVAL_WEATHER),
|
||||
("14i-function-calling-fireworks.py", PROMPT_WEATHER, EVAL_WEATHER),
|
||||
("14j-function-calling-nim.py", PROMPT_WEATHER, EVAL_WEATHER),
|
||||
("14m-function-calling-openrouter.py", PROMPT_WEATHER, EVAL_WEATHER),
|
||||
("14n-function-calling-perplexity.py", PROMPT_WEATHER, EVAL_WEATHER),
|
||||
("14p-function-calling-gemini-vertex-ai.py", PROMPT_WEATHER, EVAL_WEATHER),
|
||||
("14q-function-calling-qwen.py", PROMPT_WEATHER, EVAL_WEATHER),
|
||||
("14r-function-calling-aws.py", PROMPT_WEATHER, EVAL_WEATHER),
|
||||
("14v-function-calling-openai.py", PROMPT_WEATHER, EVAL_WEATHER),
|
||||
("14-function-calling.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14a-function-calling-anthropic.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14b-function-calling-anthropic-video.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14d-function-calling-video.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14e-function-calling-google.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14f-function-calling-groq.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14g-function-calling-grok.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14h-function-calling-azure.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14i-function-calling-fireworks.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14j-function-calling-nim.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14m-function-calling-openrouter.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14n-function-calling-perplexity.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14p-function-calling-gemini-vertex-ai.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14q-function-calling-qwen.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14r-function-calling-aws.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14v-function-calling-openai.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14w-function-calling-mistral.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
# Currently not working.
|
||||
# ("14c-function-calling-together.py", PROMPT_WEATHER, EVAL_WEATHER),
|
||||
# ("14k-function-calling-cerebras.py", PROMPT_WEATHER, EVAL_WEATHER),
|
||||
# ("14l-function-calling-deepseek.py", PROMPT_WEATHER, EVAL_WEATHER),
|
||||
# ("14o-function-calling-gemini-openai-format.py", PROMPT_WEATHER, EVAL_WEATHER),
|
||||
# ("14c-function-calling-together.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
# ("14k-function-calling-cerebras.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
# ("14l-function-calling-deepseek.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
# ("14o-function-calling-gemini-openai-format.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
]
|
||||
|
||||
TESTS_15 = [
|
||||
("15a-switch-languages.py", PROMPT_SWITCH_LANGUAGE, EVAL_SWITCH_LANGUAGE, BOT_SPEAKS_FIRST),
|
||||
]
|
||||
|
||||
TESTS_19 = [
|
||||
("19-openai-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER),
|
||||
("19a-azure-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER),
|
||||
("19-openai-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("19a-azure-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("19b-openai-realtime-beta-text.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
]
|
||||
|
||||
TESTS_21 = [
|
||||
("21a-tavus-video-service.py", PROMPT_SIMPLE_MATH, None),
|
||||
("21a-tavus-video-service.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
]
|
||||
|
||||
TESTS_26 = [
|
||||
("26-gemini-multimodal-live.py", PROMPT_SIMPLE_MATH, None),
|
||||
("26a-gemini-multimodal-live-transcription.py", PROMPT_SIMPLE_MATH, None),
|
||||
("26b-gemini-multimodal-live-function-calling.py", PROMPT_WEATHER, EVAL_WEATHER),
|
||||
("26c-gemini-multimodal-live-video.py", PROMPT_SIMPLE_MATH, None),
|
||||
("26e-gemini-multimodal-google-search.py", PROMPT_ONLINE_SEARCH, EVAL_ONLINE_SEARCH),
|
||||
("26-gemini-multimodal-live.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
(
|
||||
"26a-gemini-multimodal-live-transcription.py",
|
||||
PROMPT_SIMPLE_MATH,
|
||||
EVAL_SIMPLE_MATH,
|
||||
BOT_SPEAKS_FIRST,
|
||||
),
|
||||
(
|
||||
"26b-gemini-multimodal-live-function-calling.py",
|
||||
PROMPT_WEATHER,
|
||||
EVAL_WEATHER,
|
||||
BOT_SPEAKS_FIRST,
|
||||
),
|
||||
("26c-gemini-multimodal-live-video.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
(
|
||||
"26e-gemini-multimodal-google-search.py",
|
||||
PROMPT_ONLINE_SEARCH,
|
||||
EVAL_ONLINE_SEARCH,
|
||||
BOT_SPEAKS_FIRST,
|
||||
),
|
||||
# Currently not working.
|
||||
# ("26d-gemini-multimodal-live-text.py", PROMPT_SIMPLE_MATH, None),
|
||||
# ("26d-gemini-multimodal-live-text.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
]
|
||||
|
||||
TESTS_27 = [
|
||||
("27-simli-layer.py", PROMPT_SIMPLE_MATH, None),
|
||||
("27-simli-layer.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
]
|
||||
|
||||
TESTS_40 = [
|
||||
("40-aws-nova-sonic.py", PROMPT_SIMPLE_MATH, None),
|
||||
("40-aws-nova-sonic.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
]
|
||||
|
||||
TESTS_43 = [
|
||||
("43a-heygen-video-service.py", PROMPT_SIMPLE_MATH, None),
|
||||
("43a-heygen-video-service.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
]
|
||||
|
||||
TESTS_44 = [
|
||||
("44-voicemail-detection.py", PROMPT_VOICEMAIL, EVAL_VOICEMAIL, USER_SPEAKS_FIRST),
|
||||
("44-voicemail-detection.py", PROMPT_CONVERSATION, EVAL_CONVERSATION, USER_SPEAKS_FIRST),
|
||||
]
|
||||
|
||||
TESTS = [
|
||||
*TESTS_07,
|
||||
*TESTS_12,
|
||||
*TESTS_14,
|
||||
*TESTS_15,
|
||||
*TESTS_19,
|
||||
*TESTS_21,
|
||||
*TESTS_26,
|
||||
*TESTS_27,
|
||||
*TESTS_40,
|
||||
*TESTS_43,
|
||||
*TESTS_44,
|
||||
]
|
||||
|
||||
|
||||
@@ -162,8 +230,11 @@ async def main(args: argparse.Namespace):
|
||||
log_level=log_level,
|
||||
)
|
||||
|
||||
for test, prompt, eval in TESTS:
|
||||
await runner.run_eval(test, prompt, eval)
|
||||
# Parse test config: (test, prompt, eval, user_speaks_first)
|
||||
for test_config in TESTS:
|
||||
test, prompt, eval, user_speaks_first = test_config
|
||||
|
||||
await runner.run_eval(test, prompt, eval, user_speaks_first)
|
||||
|
||||
runner.print_results()
|
||||
|
||||
|
||||
@@ -12,3 +12,20 @@ from loguru import logger
|
||||
__version__ = version("pipecat-ai")
|
||||
|
||||
logger.info(f"ᓚᘏᗢ Pipecat {__version__} (Python {sys.version}) ᓚᘏᗢ")
|
||||
|
||||
# We replace `asyncio.wait_for()` for `wait_for2.wait_for()` for Python < 3.12.
|
||||
#
|
||||
# In Python 3.12, `asyncio.wait_for()` is implemented in terms of
|
||||
# `asyncio.timeout()` which fixed a bunch of issues. However, this was never
|
||||
# backported (because of the lack of `async.timeout()`) and there are still many
|
||||
# remainig issues, specially in Python 3.10, in `async.wait_for()`.
|
||||
#
|
||||
# See https://github.com/python/cpython/pull/98518
|
||||
|
||||
import asyncio
|
||||
|
||||
if sys.version_info < (3, 12):
|
||||
import wait_for2
|
||||
|
||||
# Replace asyncio.wait_for.
|
||||
asyncio.wait_for = wait_for2.wait_for
|
||||
|
||||
@@ -11,21 +11,45 @@ adapters that handle tool format conversion and standardization.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, List, Union, cast
|
||||
from typing import Any, Generic, List, TypeVar, Union, cast
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext, NotGiven
|
||||
|
||||
# Should be a TypedDict
|
||||
TLLMInvocationParams = TypeVar("TLLMInvocationParams", bound=dict[str, Any])
|
||||
|
||||
|
||||
class BaseLLMAdapter(ABC):
|
||||
class BaseLLMAdapter(ABC, Generic[TLLMInvocationParams]):
|
||||
"""Abstract base class for LLM provider adapters.
|
||||
|
||||
Provides a standard interface for converting between Pipecat's standardized
|
||||
tool schemas and provider-specific tool formats. Subclasses must implement
|
||||
provider-specific conversion logic.
|
||||
Provides a standard interface for converting to provider-specific formats.
|
||||
|
||||
Handles:
|
||||
|
||||
- Extracting provider-specific parameters for LLM invocation from a
|
||||
universal LLM context
|
||||
- Converting standardized tools schema to provider-specific tool formats.
|
||||
- Extracting messages from the LLM context for the purposes of logging
|
||||
about the specific provider.
|
||||
|
||||
Subclasses must implement provider-specific conversion logic.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def get_llm_invocation_params(self, context: LLMContext) -> TLLMInvocationParams:
|
||||
"""Get provider-specific LLM invocation parameters from a universal LLM context.
|
||||
|
||||
Args:
|
||||
context: The LLM context containing messages, tools, etc.
|
||||
|
||||
Returns:
|
||||
Provider-specific parameters for invoking the LLM.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def to_provider_tools_format(self, tools_schema: ToolsSchema) -> List[Any]:
|
||||
"""Convert tools schema to the provider's specific format.
|
||||
@@ -38,7 +62,20 @@ class BaseLLMAdapter(ABC):
|
||||
"""
|
||||
pass
|
||||
|
||||
def from_standard_tools(self, tools: Any) -> List[Any]:
|
||||
@abstractmethod
|
||||
def get_messages_for_logging(self, context: LLMContext) -> List[dict[str, Any]]:
|
||||
"""Get messages from a universal LLM context in a format ready for logging about this provider.
|
||||
|
||||
Args:
|
||||
context: The LLM context containing messages.
|
||||
|
||||
Returns:
|
||||
List of messages in a format ready for logging about this
|
||||
provider.
|
||||
"""
|
||||
pass
|
||||
|
||||
def from_standard_tools(self, tools: Any) -> List[Any] | NotGiven:
|
||||
"""Convert tools from standard format to provider format.
|
||||
|
||||
Args:
|
||||
|
||||
@@ -6,20 +6,58 @@
|
||||
|
||||
"""Anthropic LLM adapter for Pipecat."""
|
||||
|
||||
from typing import Any, Dict, List
|
||||
from typing import Any, Dict, List, TypedDict
|
||||
|
||||
from pipecat.adapters.base_llm_adapter import BaseLLMAdapter
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
|
||||
|
||||
class AnthropicLLMAdapter(BaseLLMAdapter):
|
||||
class AnthropicLLMInvocationParams(TypedDict):
|
||||
"""Context-based parameters for invoking Anthropic's LLM API.
|
||||
|
||||
This is a placeholder until support for universal LLMContext machinery is added for Anthropic.
|
||||
"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class AnthropicLLMAdapter(BaseLLMAdapter[AnthropicLLMInvocationParams]):
|
||||
"""Adapter for converting tool schemas to Anthropic's function-calling format.
|
||||
|
||||
This adapter handles the conversion of Pipecat's standard function schemas
|
||||
to the specific format required by Anthropic's Claude models for function calling.
|
||||
"""
|
||||
|
||||
def get_llm_invocation_params(self, context: LLMContext) -> AnthropicLLMInvocationParams:
|
||||
"""Get Anthropic-specific LLM invocation parameters from a universal LLM context.
|
||||
|
||||
This is a placeholder until support for universal LLMContext machinery is added for Anthropic.
|
||||
|
||||
Args:
|
||||
context: The LLM context containing messages, tools, etc.
|
||||
|
||||
Returns:
|
||||
Dictionary of parameters for invoking Anthropic's LLM API.
|
||||
"""
|
||||
raise NotImplementedError("Universal LLMContext is not yet supported for Anthropic.")
|
||||
|
||||
def get_messages_for_logging(self, context) -> List[dict[str, Any]]:
|
||||
"""Get messages from a universal LLM context in a format ready for logging about Anthropic.
|
||||
|
||||
Removes or truncates sensitive data like image content for safe logging.
|
||||
|
||||
This is a placeholder until support for universal LLMContext machinery is added for Anthropic.
|
||||
|
||||
Args:
|
||||
context: The LLM context containing messages.
|
||||
|
||||
Returns:
|
||||
List of messages in a format ready for logging about Anthropic.
|
||||
"""
|
||||
raise NotImplementedError("Universal LLMContext is not yet supported for Anthropic.")
|
||||
|
||||
@staticmethod
|
||||
def _to_anthropic_function_format(function: FunctionSchema) -> Dict[str, Any]:
|
||||
"""Convert a single function schema to Anthropic's format.
|
||||
|
||||
@@ -7,20 +7,58 @@
|
||||
"""AWS Nova Sonic LLM adapter for Pipecat."""
|
||||
|
||||
import json
|
||||
from typing import Any, Dict, List
|
||||
from typing import Any, Dict, List, TypedDict
|
||||
|
||||
from pipecat.adapters.base_llm_adapter import BaseLLMAdapter
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
|
||||
|
||||
class AWSNovaSonicLLMAdapter(BaseLLMAdapter):
|
||||
class AWSNovaSonicLLMInvocationParams(TypedDict):
|
||||
"""Context-based parameters for invoking AWS Nova Sonic LLM API.
|
||||
|
||||
This is a placeholder until support for universal LLMContext machinery is added for AWS Nova Sonic.
|
||||
"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class AWSNovaSonicLLMAdapter(BaseLLMAdapter[AWSNovaSonicLLMInvocationParams]):
|
||||
"""Adapter for AWS Nova Sonic language models.
|
||||
|
||||
Converts Pipecat's standard function schemas into AWS Nova Sonic's
|
||||
specific function-calling format, enabling tool use with Nova Sonic models.
|
||||
"""
|
||||
|
||||
def get_llm_invocation_params(self, context: LLMContext) -> AWSNovaSonicLLMInvocationParams:
|
||||
"""Get AWS Nova Sonic-specific LLM invocation parameters from a universal LLM context.
|
||||
|
||||
This is a placeholder until support for universal LLMContext machinery is added for AWS Nova Sonic.
|
||||
|
||||
Args:
|
||||
context: The LLM context containing messages, tools, etc.
|
||||
|
||||
Returns:
|
||||
Dictionary of parameters for invoking AWS Nova Sonic's LLM API.
|
||||
"""
|
||||
raise NotImplementedError("Universal LLMContext is not yet supported for AWS Nova Sonic.")
|
||||
|
||||
def get_messages_for_logging(self, context) -> List[dict[str, Any]]:
|
||||
"""Get messages from a universal LLM context in a format ready for logging about AWS Nova Sonic.
|
||||
|
||||
Removes or truncates sensitive data like image content for safe logging.
|
||||
|
||||
This is a placeholder until support for universal LLMContext machinery is added for AWS Nova Sonic.
|
||||
|
||||
Args:
|
||||
context: The LLM context containing messages.
|
||||
|
||||
Returns:
|
||||
List of messages in a format ready for logging about AWS Nova Sonic.
|
||||
"""
|
||||
raise NotImplementedError("Universal LLMContext is not yet supported for AWS Nova Sonic.")
|
||||
|
||||
@staticmethod
|
||||
def _to_aws_nova_sonic_function_format(function: FunctionSchema) -> Dict[str, Any]:
|
||||
"""Convert a function schema to AWS Nova Sonic format.
|
||||
|
||||
@@ -6,20 +6,58 @@
|
||||
|
||||
"""AWS Bedrock LLM adapter for Pipecat."""
|
||||
|
||||
from typing import Any, Dict, List
|
||||
from typing import Any, Dict, List, TypedDict
|
||||
|
||||
from pipecat.adapters.base_llm_adapter import BaseLLMAdapter
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
|
||||
|
||||
class AWSBedrockLLMAdapter(BaseLLMAdapter):
|
||||
class AWSBedrockLLMInvocationParams(TypedDict):
|
||||
"""Context-based parameters for invoking AWS Bedrock's LLM API.
|
||||
|
||||
This is a placeholder until support for universal LLMContext machinery is added for Bedrock.
|
||||
"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class AWSBedrockLLMAdapter(BaseLLMAdapter[AWSBedrockLLMInvocationParams]):
|
||||
"""Adapter for AWS Bedrock LLM integration with Pipecat.
|
||||
|
||||
Provides conversion utilities for transforming Pipecat function schemas
|
||||
into AWS Bedrock's expected tool format for function calling capabilities.
|
||||
"""
|
||||
|
||||
def get_llm_invocation_params(self, context: LLMContext) -> AWSBedrockLLMInvocationParams:
|
||||
"""Get AWS Bedrock-specific LLM invocation parameters from a universal LLM context.
|
||||
|
||||
This is a placeholder until support for universal LLMContext machinery is added for Bedrock.
|
||||
|
||||
Args:
|
||||
context: The LLM context containing messages, tools, etc.
|
||||
|
||||
Returns:
|
||||
Dictionary of parameters for invoking AWS Bedrock's LLM API.
|
||||
"""
|
||||
raise NotImplementedError("Universal LLMContext is not yet supported for AWS Bedrock.")
|
||||
|
||||
def get_messages_for_logging(self, context) -> List[dict[str, Any]]:
|
||||
"""Get messages from a universal LLM context in a format ready for logging about AWS Bedrock.
|
||||
|
||||
Removes or truncates sensitive data like image content for safe logging.
|
||||
|
||||
This is a placeholder until support for universal LLMContext machinery is added for Bedrock.
|
||||
|
||||
Args:
|
||||
context: The LLM context containing messages.
|
||||
|
||||
Returns:
|
||||
List of messages in a format ready for logging about AWS Bedrock.
|
||||
"""
|
||||
raise NotImplementedError("Universal LLMContext is not yet supported for AWS Bedrock.")
|
||||
|
||||
@staticmethod
|
||||
def _to_bedrock_function_format(function: FunctionSchema) -> Dict[str, Any]:
|
||||
"""Convert a function schema to Bedrock's tool format.
|
||||
|
||||
@@ -6,20 +6,71 @@
|
||||
|
||||
"""Gemini LLM adapter for Pipecat."""
|
||||
|
||||
from typing import Any, Dict, List, Union
|
||||
import base64
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional, TypedDict
|
||||
|
||||
from loguru import logger
|
||||
from openai import NotGiven
|
||||
|
||||
from pipecat.adapters.base_llm_adapter import BaseLLMAdapter
|
||||
from pipecat.adapters.schemas.tools_schema import AdapterType, ToolsSchema
|
||||
from pipecat.processors.aggregators.llm_context import (
|
||||
LLMContext,
|
||||
LLMContextMessage,
|
||||
LLMSpecificMessage,
|
||||
LLMStandardMessage,
|
||||
)
|
||||
|
||||
try:
|
||||
from google.genai.types import (
|
||||
Blob,
|
||||
Content,
|
||||
FunctionCall,
|
||||
FunctionResponse,
|
||||
Part,
|
||||
)
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error("In order to use Google AI, you need to `pip install pipecat-ai[google]`.")
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
class GeminiLLMAdapter(BaseLLMAdapter):
|
||||
"""LLM adapter for Google's Gemini service.
|
||||
class GeminiLLMInvocationParams(TypedDict):
|
||||
"""Context-based parameters for invoking Gemini LLM."""
|
||||
|
||||
Provides tool schema conversion functionality to transform standard tool
|
||||
definitions into Gemini's specific function-calling format for use with
|
||||
Gemini LLM models.
|
||||
system_instruction: Optional[str]
|
||||
messages: List[Content]
|
||||
tools: List[Any] | NotGiven
|
||||
|
||||
|
||||
class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
"""Gemini-specific adapter for Pipecat.
|
||||
|
||||
Handles:
|
||||
- Extracting parameters for Gemini's API from a universal LLM context
|
||||
- Converting Pipecat's standardized tools schema to Gemini's function-calling format.
|
||||
- Extracting and sanitizing messages from the LLM context for logging with Gemini.
|
||||
"""
|
||||
|
||||
def get_llm_invocation_params(self, context: LLMContext) -> GeminiLLMInvocationParams:
|
||||
"""Get Gemini-specific LLM invocation parameters from a universal LLM context.
|
||||
|
||||
Args:
|
||||
context: The LLM context containing messages, tools, etc.
|
||||
|
||||
Returns:
|
||||
Dictionary of parameters for Gemini's API.
|
||||
"""
|
||||
messages = self._from_universal_context_messages(self._get_messages(context))
|
||||
return {
|
||||
"system_instruction": messages.system_instruction,
|
||||
"messages": messages.messages,
|
||||
# NOTE; LLMContext's tools are guaranteed to be a ToolsSchema (or NOT_GIVEN)
|
||||
"tools": self.from_standard_tools(context.tools),
|
||||
}
|
||||
|
||||
def to_provider_tools_format(self, tools_schema: ToolsSchema) -> List[Dict[str, Any]]:
|
||||
"""Convert tool schemas to Gemini's function-calling format.
|
||||
|
||||
@@ -39,3 +90,223 @@ class GeminiLLMAdapter(BaseLLMAdapter):
|
||||
custom_gemini_tools = tools_schema.custom_tools.get(AdapterType.GEMINI, [])
|
||||
|
||||
return formatted_standard_tools + custom_gemini_tools
|
||||
|
||||
def get_messages_for_logging(self, context: LLMContext) -> List[dict[str, Any]]:
|
||||
"""Get messages from a universal LLM context in a format ready for logging about Gemini.
|
||||
|
||||
Removes or truncates sensitive data like image content for safe logging.
|
||||
|
||||
Args:
|
||||
context: The LLM context containing messages.
|
||||
|
||||
Returns:
|
||||
List of messages in a format ready for logging about Gemini.
|
||||
"""
|
||||
# Get messages in Gemini's format
|
||||
messages = self._from_universal_context_messages(self._get_messages(context)).messages
|
||||
|
||||
# Sanitize messages for logging
|
||||
messages_for_logging = []
|
||||
for message in messages:
|
||||
obj = message.to_json_dict()
|
||||
try:
|
||||
if "parts" in obj:
|
||||
for part in obj["parts"]:
|
||||
if "inline_data" in part:
|
||||
part["inline_data"]["data"] = "..."
|
||||
except Exception as e:
|
||||
logger.debug(f"Error: {e}")
|
||||
messages_for_logging.append(obj)
|
||||
return messages_for_logging
|
||||
|
||||
def _get_messages(self, context: LLMContext) -> List[LLMContextMessage]:
|
||||
return context.get_messages("google")
|
||||
|
||||
@dataclass
|
||||
class ConvertedMessages:
|
||||
"""Container for Google-formatted messages converted from universal context."""
|
||||
|
||||
messages: List[Content]
|
||||
system_instruction: Optional[str] = None
|
||||
|
||||
def _from_universal_context_messages(
|
||||
self, universal_context_messages: List[LLMContextMessage]
|
||||
) -> ConvertedMessages:
|
||||
"""Restructures messages to ensure proper Google format and message ordering.
|
||||
|
||||
This method handles conversion of OpenAI-formatted messages to Google format,
|
||||
with special handling for function calls, function responses, and system messages.
|
||||
System messages are added back to the context as user messages when needed.
|
||||
|
||||
The final message order is preserved as:
|
||||
|
||||
1. Function calls (from model)
|
||||
2. Function responses (from user)
|
||||
3. Text messages (converted from system messages)
|
||||
|
||||
Note::
|
||||
|
||||
System messages are only added back when there are no regular text
|
||||
messages in the context, ensuring proper conversation continuity
|
||||
after function calls.
|
||||
"""
|
||||
system_instruction = None
|
||||
messages = []
|
||||
|
||||
# Process each message, preserving Google-formatted messages and converting others
|
||||
for message in universal_context_messages:
|
||||
if isinstance(message, LLMSpecificMessage):
|
||||
# Assume that LLMSpecificMessage wraps a message in Google format
|
||||
messages.append(message.message)
|
||||
continue
|
||||
|
||||
# Convert standard format to Google format
|
||||
converted = self._from_standard_message(
|
||||
message, already_have_system_instruction=bool(system_instruction)
|
||||
)
|
||||
if isinstance(converted, Content):
|
||||
# Regular (non-system) message
|
||||
messages.append(converted)
|
||||
else:
|
||||
# System instruction
|
||||
system_instruction = converted
|
||||
|
||||
# Check if we only have function-related messages (no regular text)
|
||||
has_regular_messages = any(
|
||||
len(msg.parts) == 1
|
||||
and getattr(msg.parts[0], "text", None)
|
||||
and not getattr(msg.parts[0], "function_call", None)
|
||||
and not getattr(msg.parts[0], "function_response", None)
|
||||
for msg in messages
|
||||
)
|
||||
|
||||
# Add system instruction back as a user message if we only have function messages
|
||||
if system_instruction and not has_regular_messages:
|
||||
messages.append(Content(role="user", parts=[Part(text=system_instruction)]))
|
||||
|
||||
# Remove any empty messages
|
||||
messages = [m for m in messages if m.parts]
|
||||
|
||||
return self.ConvertedMessages(messages=messages, system_instruction=system_instruction)
|
||||
|
||||
def _from_standard_message(
|
||||
self, message: LLMStandardMessage, already_have_system_instruction: bool
|
||||
) -> Content | str:
|
||||
"""Convert universal context message to Google Content object.
|
||||
|
||||
Handles conversion of text, images, and function calls to Google's
|
||||
format.
|
||||
System instructions are returned as a plain string.
|
||||
|
||||
Args:
|
||||
message: Message in universal context format.
|
||||
already_have_system_instruction: Whether we already have a system instruction
|
||||
|
||||
Returns:
|
||||
Content object with role and parts, or a plain string for system
|
||||
messages.
|
||||
|
||||
Examples:
|
||||
Standard text message::
|
||||
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hello there"
|
||||
}
|
||||
|
||||
Converts to Google Content with::
|
||||
|
||||
Content(
|
||||
role="user",
|
||||
parts=[Part(text="Hello there")]
|
||||
)
|
||||
|
||||
Standard function call message::
|
||||
|
||||
{
|
||||
"role": "assistant",
|
||||
"tool_calls": [
|
||||
{
|
||||
"function": {
|
||||
"name": "search",
|
||||
"arguments": '{"query": "test"}'
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
Converts to Google Content with::
|
||||
|
||||
Content(
|
||||
role="model",
|
||||
parts=[Part(function_call=FunctionCall(name="search", args={"query": "test"}))]
|
||||
)
|
||||
"""
|
||||
role = message["role"]
|
||||
content = message.get("content", [])
|
||||
if role == "system":
|
||||
if already_have_system_instruction:
|
||||
role = "user" # Convert system message to user role if we already have a system instruction
|
||||
else:
|
||||
# System instructions are returned as plain text
|
||||
if isinstance(content, str):
|
||||
return content
|
||||
elif isinstance(content, list):
|
||||
# If content is a list, we assume it's a list of text parts, per the standard
|
||||
return " ".join(part["text"] for part in content if part.get("type") == "text")
|
||||
elif role == "assistant":
|
||||
role = "model"
|
||||
|
||||
parts = []
|
||||
if message.get("tool_calls"):
|
||||
for tc in message["tool_calls"]:
|
||||
parts.append(
|
||||
Part(
|
||||
function_call=FunctionCall(
|
||||
name=tc["function"]["name"],
|
||||
args=json.loads(tc["function"]["arguments"]),
|
||||
)
|
||||
)
|
||||
)
|
||||
elif role == "tool":
|
||||
role = "model"
|
||||
try:
|
||||
response = json.loads(message["content"])
|
||||
if isinstance(response, dict):
|
||||
response_dict = response
|
||||
else:
|
||||
response_dict = {"value": response}
|
||||
except Exception as e:
|
||||
# Response might not be JSON-deserializable.
|
||||
# This occurs with a UserImageFrame, for example, where we get a plain "COMPLETED" string.
|
||||
response_dict = {"value": message["content"]}
|
||||
parts.append(
|
||||
Part(
|
||||
function_response=FunctionResponse(
|
||||
name="tool_call_result", # seems to work to hard-code the same name every time
|
||||
response=response_dict,
|
||||
)
|
||||
)
|
||||
)
|
||||
elif isinstance(content, str):
|
||||
parts.append(Part(text=content))
|
||||
elif isinstance(content, list):
|
||||
for c in content:
|
||||
if c["type"] == "text":
|
||||
parts.append(Part(text=c["text"]))
|
||||
elif c["type"] == "image_url":
|
||||
parts.append(
|
||||
Part(
|
||||
inline_data=Blob(
|
||||
mime_type="image/jpeg",
|
||||
data=base64.b64decode(c["image_url"]["url"].split(",")[1]),
|
||||
)
|
||||
)
|
||||
)
|
||||
elif c["type"] == "input_audio":
|
||||
input_audio = c["input_audio"]
|
||||
audio_bytes = base64.b64decode(input_audio["data"])
|
||||
parts.append(Part(inline_data=Blob(mime_type="audio/wav", data=audio_bytes)))
|
||||
|
||||
message = Content(role=role, parts=parts)
|
||||
return message
|
||||
|
||||
@@ -6,22 +6,63 @@
|
||||
|
||||
"""OpenAI LLM adapter for Pipecat."""
|
||||
|
||||
from typing import List
|
||||
import copy
|
||||
import json
|
||||
from typing import Any, List, TypedDict
|
||||
|
||||
from openai.types.chat import ChatCompletionToolParam
|
||||
from openai._types import NOT_GIVEN as OPEN_AI_NOT_GIVEN
|
||||
from openai._types import NotGiven as OpenAINotGiven
|
||||
from openai.types.chat import (
|
||||
ChatCompletionMessageParam,
|
||||
ChatCompletionToolChoiceOptionParam,
|
||||
ChatCompletionToolParam,
|
||||
)
|
||||
|
||||
from pipecat.adapters.base_llm_adapter import BaseLLMAdapter
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.processors.aggregators.llm_context import (
|
||||
LLMContext,
|
||||
LLMContextMessage,
|
||||
LLMContextToolChoice,
|
||||
NotGiven,
|
||||
)
|
||||
|
||||
|
||||
class OpenAILLMAdapter(BaseLLMAdapter):
|
||||
"""Adapter for converting tool schemas to OpenAI's format.
|
||||
class OpenAILLMInvocationParams(TypedDict):
|
||||
"""Context-based parameters for invoking OpenAI ChatCompletion API."""
|
||||
|
||||
Provides conversion utilities for transforming Pipecat's standard tool
|
||||
schemas into the format expected by OpenAI's ChatCompletion API for
|
||||
function calling capabilities.
|
||||
messages: List[ChatCompletionMessageParam]
|
||||
tools: List[ChatCompletionToolParam] | OpenAINotGiven
|
||||
tool_choice: ChatCompletionToolChoiceOptionParam | OpenAINotGiven
|
||||
|
||||
|
||||
class OpenAILLMAdapter(BaseLLMAdapter[OpenAILLMInvocationParams]):
|
||||
"""OpenAI-specific adapter for Pipecat.
|
||||
|
||||
Handles:
|
||||
|
||||
- Extracting parameters for OpenAI's ChatCompletion API from a universal
|
||||
LLM context
|
||||
- Converting Pipecat's standardized tools schema to OpenAI's function-calling format.
|
||||
- Extracting and sanitizing messages from the LLM context for logging about OpenAI.
|
||||
"""
|
||||
|
||||
def get_llm_invocation_params(self, context: LLMContext) -> OpenAILLMInvocationParams:
|
||||
"""Get OpenAI-specific LLM invocation parameters from a universal LLM context.
|
||||
|
||||
Args:
|
||||
context: The LLM context containing messages, tools, etc.
|
||||
|
||||
Returns:
|
||||
Dictionary of parameters for OpenAI's ChatCompletion API.
|
||||
"""
|
||||
return {
|
||||
"messages": self._from_universal_context_messages(self._get_messages(context)),
|
||||
# NOTE; LLMContext's tools are guaranteed to be a ToolsSchema (or NOT_GIVEN)
|
||||
"tools": self.from_standard_tools(context.tools),
|
||||
"tool_choice": context.tool_choice,
|
||||
}
|
||||
|
||||
def to_provider_tools_format(self, tools_schema: ToolsSchema) -> List[ChatCompletionToolParam]:
|
||||
"""Convert function schemas to OpenAI's function-calling format.
|
||||
|
||||
@@ -37,3 +78,43 @@ class OpenAILLMAdapter(BaseLLMAdapter):
|
||||
ChatCompletionToolParam(type="function", function=func.to_default_dict())
|
||||
for func in functions_schema
|
||||
]
|
||||
|
||||
def get_messages_for_logging(self, context: LLMContext) -> List[dict[str, Any]]:
|
||||
"""Get messages from a universal LLM context in a format ready for logging about OpenAI.
|
||||
|
||||
Removes or truncates sensitive data like image content for safe logging.
|
||||
|
||||
Args:
|
||||
context: The LLM context containing messages.
|
||||
|
||||
Returns:
|
||||
List of messages in a format ready for logging about OpenAI.
|
||||
"""
|
||||
msgs = []
|
||||
for message in self._get_messages(context):
|
||||
msg = copy.deepcopy(message)
|
||||
if "content" in msg:
|
||||
if isinstance(msg["content"], list):
|
||||
for item in msg["content"]:
|
||||
if item["type"] == "image_url":
|
||||
if item["image_url"]["url"].startswith("data:image/"):
|
||||
item["image_url"]["url"] = "data:image/..."
|
||||
if "mime_type" in msg and msg["mime_type"].startswith("image/"):
|
||||
msg["data"] = "..."
|
||||
msgs.append(msg)
|
||||
return json.dumps(msgs, ensure_ascii=False)
|
||||
|
||||
def _get_messages(self, context: LLMContext) -> List[LLMContextMessage]:
|
||||
return context.get_messages("openai")
|
||||
|
||||
def _from_universal_context_messages(
|
||||
self, messages: List[LLMContextMessage]
|
||||
) -> List[ChatCompletionMessageParam]:
|
||||
# Just a pass-through: messages are already the right type
|
||||
return messages
|
||||
|
||||
def _from_standard_tool_choice(
|
||||
self, tool_choice: LLMContextToolChoice | NotGiven
|
||||
) -> ChatCompletionToolChoiceOptionParam | OpenAINotGiven:
|
||||
# Just a pass-through: tool_choice is already the right type
|
||||
return tool_choice
|
||||
|
||||
@@ -6,11 +6,21 @@
|
||||
|
||||
"""OpenAI Realtime LLM adapter for Pipecat."""
|
||||
|
||||
from typing import Any, Dict, List, Union
|
||||
from typing import Any, Dict, List, TypedDict, Union
|
||||
|
||||
from pipecat.adapters.base_llm_adapter import BaseLLMAdapter
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
|
||||
|
||||
class OpenAIRealtimeLLMInvocationParams(TypedDict):
|
||||
"""Context-based parameters for invoking OpenAI Realtime API.
|
||||
|
||||
This is a placeholder until support for universal LLMContext machinery is added for OpenAI Realtime.
|
||||
"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class OpenAIRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
@@ -20,6 +30,34 @@ class OpenAIRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
OpenAI's Realtime API for function calling capabilities.
|
||||
"""
|
||||
|
||||
def get_llm_invocation_params(self, context: LLMContext) -> OpenAIRealtimeLLMInvocationParams:
|
||||
"""Get OpenAI Realtime-specific LLM invocation parameters from a universal LLM context.
|
||||
|
||||
This is a placeholder until support for universal LLMContext machinery is added for OpenAI Realtime.
|
||||
|
||||
Args:
|
||||
context: The LLM context containing messages, tools, etc.
|
||||
|
||||
Returns:
|
||||
Dictionary of parameters for invoking OpenAI Realtime's API.
|
||||
"""
|
||||
raise NotImplementedError("Universal LLMContext is not yet supported for OpenAI Realtime.")
|
||||
|
||||
def get_messages_for_logging(self, context) -> List[dict[str, Any]]:
|
||||
"""Get messages from a universal LLM context in a format ready for logging about OpenAI Realtime.
|
||||
|
||||
Removes or truncates sensitive data like image content for safe logging.
|
||||
|
||||
This is a placeholder until support for universal LLMContext machinery is added for OpenAI Realtime.
|
||||
|
||||
Args:
|
||||
context: The LLM context containing messages.
|
||||
|
||||
Returns:
|
||||
List of messages in a format ready for logging about OpenAI Realtime.
|
||||
"""
|
||||
raise NotImplementedError("Universal LLMContext is not yet supported for OpenAI Realtime.")
|
||||
|
||||
@staticmethod
|
||||
def _to_openai_realtime_function_format(function: FunctionSchema) -> Dict[str, Any]:
|
||||
"""Convert a function schema to OpenAI Realtime format.
|
||||
|
||||
@@ -28,7 +28,7 @@ SPEAKING_THRESHOLD = 20
|
||||
def create_default_resampler(**kwargs) -> BaseAudioResampler:
|
||||
"""Create a default audio resampler instance.
|
||||
|
||||
. deprecated:: 0.0.74
|
||||
.. deprecated:: 0.0.74
|
||||
This function is deprecated and will be removed in a future version.
|
||||
Use `create_stream_resampler` for real-time processing scenarios or
|
||||
`create_file_resampler` for batch processing of complete audio files.
|
||||
|
||||
0
src/pipecat/extensions/__init__.py
Normal file
0
src/pipecat/extensions/voicemail/__init__.py
Normal file
707
src/pipecat/extensions/voicemail/voicemail_detector.py
Normal file
@@ -0,0 +1,707 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Voicemail detection module for Pipecat.
|
||||
|
||||
This module provides voicemail detection capabilities using parallel pipeline
|
||||
processing to classify incoming calls as either voicemail messages or live
|
||||
conversations. It's specifically designed for outbound calling scenarios where
|
||||
a bot needs to determine if a human answered or if the call went to voicemail.
|
||||
|
||||
Note:
|
||||
The voicemail module is optimized for text LLMs only.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from typing import List, Optional
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
BotInterruptionFrame,
|
||||
EndFrame,
|
||||
Frame,
|
||||
LLMFullResponseEndFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMTextFrame,
|
||||
StopFrame,
|
||||
SystemFrame,
|
||||
TTSAudioRawFrame,
|
||||
TTSStartedFrame,
|
||||
TTSStoppedFrame,
|
||||
TTSTextFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
)
|
||||
from pipecat.pipeline.parallel_pipeline import ParallelPipeline
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor, FrameProcessorSetup
|
||||
from pipecat.services.llm_service import LLMService
|
||||
from pipecat.sync.base_notifier import BaseNotifier
|
||||
from pipecat.sync.event_notifier import EventNotifier
|
||||
|
||||
|
||||
class NotifierGate(FrameProcessor):
|
||||
"""Base gate processor that controls frame flow based on notifier signals.
|
||||
|
||||
This base class provides common gate functionality for processors that need to
|
||||
start open and close permanently when a notifier signals. Subclasses define
|
||||
which frames are allowed through when the gate is closed.
|
||||
|
||||
The gate starts open to allow initial processing and closes permanently once
|
||||
the notifier signals. This ensures controlled frame flow based on external
|
||||
decisions or events.
|
||||
"""
|
||||
|
||||
def __init__(self, notifier: BaseNotifier, task_name: str = "gate"):
|
||||
"""Initialize the notifier gate.
|
||||
|
||||
Args:
|
||||
notifier: Notifier that signals when the gate should close.
|
||||
task_name: Name for the notification waiting task (for debugging).
|
||||
"""
|
||||
super().__init__()
|
||||
self._notifier = notifier
|
||||
self._task_name = task_name
|
||||
self._gate_opened = True
|
||||
self._gate_task: Optional[asyncio.Task] = None
|
||||
|
||||
async def setup(self, setup: FrameProcessorSetup):
|
||||
"""Set up the processor with required components.
|
||||
|
||||
Args:
|
||||
setup: Configuration object containing setup parameters.
|
||||
"""
|
||||
await super().setup(setup)
|
||||
self._gate_task = self.create_task(self._wait_for_notification())
|
||||
|
||||
async def cleanup(self):
|
||||
"""Clean up the processor resources."""
|
||||
await super().cleanup()
|
||||
if self._gate_task:
|
||||
await self.cancel_task(self._gate_task)
|
||||
self._gate_task = None
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process frames and control gate state based on notifier signals.
|
||||
|
||||
Args:
|
||||
frame: The frame to process.
|
||||
direction: The direction of frame flow in the pipeline.
|
||||
"""
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
# Gate logic: open gate allows all frames, closed gate filters frames
|
||||
if self._gate_opened:
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(
|
||||
frame,
|
||||
(SystemFrame, EndFrame, StopFrame),
|
||||
):
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
async def _wait_for_notification(self):
|
||||
"""Wait for notifier signal and close the gate.
|
||||
|
||||
This method blocks until the notifier signals, then closes the gate
|
||||
permanently to change frame filtering behavior.
|
||||
"""
|
||||
await self._notifier.wait()
|
||||
|
||||
if self._gate_opened:
|
||||
self._gate_opened = False
|
||||
|
||||
|
||||
class ClassifierGate(NotifierGate):
|
||||
"""Gate processor that controls frame flow based on classification decisions.
|
||||
|
||||
Inherits from NotifierGate and starts open to allow initial classification
|
||||
processing. Closes permanently once a classification decision is made
|
||||
(CONVERSATION or VOICEMAIL). This ensures the classifier only runs until a
|
||||
definitive decision is reached, preventing unnecessary LLM calls and maintaining
|
||||
system efficiency.
|
||||
|
||||
When closed, only allows system frames and user speaking frames to continue.
|
||||
Speaking frames are needed for voicemail timing control, but not for conversation.
|
||||
"""
|
||||
|
||||
def __init__(self, gate_notifier: BaseNotifier, conversation_notifier: BaseNotifier):
|
||||
"""Initialize the classifier gate.
|
||||
|
||||
Args:
|
||||
gate_notifier: Notifier that signals when a classification decision has
|
||||
been made and the gate should close.
|
||||
conversation_notifier: Notifier that signals when conversation is detected.
|
||||
"""
|
||||
super().__init__(gate_notifier, task_name="classifier_gate")
|
||||
self._conversation_notifier = conversation_notifier
|
||||
self._conversation_detected = False
|
||||
self._conversation_task: Optional[asyncio.Task] = None
|
||||
|
||||
async def setup(self, setup: FrameProcessorSetup):
|
||||
"""Set up the processor with required components.
|
||||
|
||||
Args:
|
||||
setup: Configuration object containing setup parameters.
|
||||
"""
|
||||
await super().setup(setup)
|
||||
self._conversation_task = self.create_task(self._wait_for_conversation())
|
||||
|
||||
async def cleanup(self):
|
||||
"""Clean up the processor resources."""
|
||||
await super().cleanup()
|
||||
if self._conversation_task:
|
||||
await self.cancel_task(self._conversation_task)
|
||||
self._conversation_task = None
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process frames and control gate state based on notifier signals.
|
||||
|
||||
Args:
|
||||
frame: The frame to process.
|
||||
direction: The direction of frame flow in the pipeline.
|
||||
"""
|
||||
await FrameProcessor.process_frame(self, frame, direction)
|
||||
|
||||
# Gate logic: open gate allows all frames, closed gate filters frames
|
||||
if self._gate_opened:
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, (UserStartedSpeakingFrame, UserStoppedSpeakingFrame)):
|
||||
# Only allow speaking frames if conversation was NOT detected (i.e., voicemail case)
|
||||
# This prevents the UserContextAggregator from issuing a warning about no aggregation
|
||||
# to push.
|
||||
if not self._conversation_detected:
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, (SystemFrame, EndFrame, StopFrame)):
|
||||
# Always allow system frames through
|
||||
# This includes the UserStartedSpeakingFrame and UserStoppedSpeakingFrame
|
||||
# which are used to detect voicemail timing.
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
async def _wait_for_conversation(self):
|
||||
"""Wait for conversation detection notification and mark conversation detected."""
|
||||
await self._conversation_notifier.wait()
|
||||
self._conversation_detected = True
|
||||
|
||||
|
||||
class ConversationGate(NotifierGate):
|
||||
"""Gate processor that blocks conversation flow when voicemail is detected.
|
||||
|
||||
Inherits from NotifierGate and starts open to allow normal conversation
|
||||
processing. Closes permanently when voicemail is detected to prevent the
|
||||
main conversation LLM from processing additional input after voicemail
|
||||
classification.
|
||||
|
||||
When closed, only allows system frames and user speaking frames to continue.
|
||||
"""
|
||||
|
||||
def __init__(self, voicemail_notifier: BaseNotifier):
|
||||
"""Initialize the conversation gate.
|
||||
|
||||
Args:
|
||||
voicemail_notifier: Notifier that signals when voicemail has been
|
||||
detected and the conversation should be blocked.
|
||||
"""
|
||||
super().__init__(voicemail_notifier, task_name="conversation_gate")
|
||||
|
||||
|
||||
class ClassificationProcessor(FrameProcessor):
|
||||
"""Processor that handles LLM classification responses and triggers events.
|
||||
|
||||
This processor aggregates LLM text tokens into complete responses and analyzes
|
||||
them to determine if the call reached a voicemail system or a live person.
|
||||
It uses the LLM response frame delimiters (LLMFullResponseStartFrame and
|
||||
LLMFullResponseEndFrame) to ensure complete token aggregation regardless
|
||||
of how the LLM tokenizes the response words.
|
||||
|
||||
The processor expects responses containing either "CONVERSATION" (indicating
|
||||
a human answered) or "VOICEMAIL" (indicating an automated system). Once a
|
||||
decision is made, it triggers the appropriate notifications and event handlers.
|
||||
|
||||
For voicemail detection, the event handler timer starts immediately and is cancelled
|
||||
and restarted based on user speech patterns to ensure proper timing.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
gate_notifier: BaseNotifier,
|
||||
conversation_notifier: BaseNotifier,
|
||||
voicemail_notifier: BaseNotifier,
|
||||
voicemail_response_delay: float,
|
||||
):
|
||||
"""Initialize the voicemail processor.
|
||||
|
||||
Args:
|
||||
gate_notifier: Notifier to signal the ClassifierGate about classification
|
||||
decisions so it can close and stop processing.
|
||||
conversation_notifier: Notifier to signal the TTSGate to release
|
||||
all gated TTS frames for normal conversation flow.
|
||||
voicemail_notifier: Notifier to signal the TTSGate to clear
|
||||
gated TTS frames since voicemail was detected.
|
||||
voicemail_response_delay: Delay in seconds after user stops speaking
|
||||
before triggering the voicemail event handler. This ensures the voicemail
|
||||
greeting or user message is complete before responding.
|
||||
"""
|
||||
super().__init__()
|
||||
self._gate_notifier = gate_notifier
|
||||
self._conversation_notifier = conversation_notifier
|
||||
self._voicemail_notifier = voicemail_notifier
|
||||
self._voicemail_response_delay = voicemail_response_delay
|
||||
|
||||
# Register the voicemail detected event
|
||||
self._register_event_handler("on_voicemail_detected")
|
||||
|
||||
# Aggregation state for collecting complete LLM responses
|
||||
self._processing_response = False
|
||||
self._response_buffer = ""
|
||||
self._decision_made = False
|
||||
|
||||
# Voicemail timing state
|
||||
self._voicemail_detected = False
|
||||
self._voicemail_task: Optional[asyncio.Task] = None
|
||||
self._voicemail_event = asyncio.Event()
|
||||
self._voicemail_event.set()
|
||||
|
||||
async def setup(self, setup: FrameProcessorSetup):
|
||||
"""Set up the processor with required components.
|
||||
|
||||
Args:
|
||||
setup: Configuration object containing setup parameters.
|
||||
"""
|
||||
await super().setup(setup)
|
||||
self._voicemail_task = self.create_task(self._delayed_voicemail_handler())
|
||||
|
||||
async def cleanup(self):
|
||||
"""Clean up the processor resources."""
|
||||
await super().cleanup()
|
||||
if self._voicemail_task:
|
||||
await self.cancel_task(self._voicemail_task)
|
||||
self._voicemail_task = None
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process frames and handle LLM classification responses.
|
||||
|
||||
This method implements a state machine for aggregating LLM responses:
|
||||
1. LLMFullResponseStartFrame: Begin collecting tokens
|
||||
2. LLMTextFrame: Accumulate text tokens into buffer
|
||||
3. LLMFullResponseEndFrame: Process complete response and make decision
|
||||
4. UserStartedSpeakingFrame/UserStoppedSpeakingFrame: Manage voicemail timing
|
||||
|
||||
Args:
|
||||
frame: The frame to process.
|
||||
direction: The direction of frame flow in the pipeline.
|
||||
"""
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, LLMFullResponseStartFrame):
|
||||
# Begin aggregating a new LLM response
|
||||
self._processing_response = True
|
||||
self._response_buffer = ""
|
||||
|
||||
elif isinstance(frame, LLMFullResponseEndFrame):
|
||||
# Complete response received - make classification decision
|
||||
if self._processing_response and not self._decision_made:
|
||||
await self._process_classification(self._response_buffer.strip())
|
||||
self._processing_response = False
|
||||
self._response_buffer = ""
|
||||
|
||||
elif isinstance(frame, LLMTextFrame) and self._processing_response:
|
||||
# Accumulate text tokens from the streaming LLM response
|
||||
self._response_buffer += frame.text
|
||||
|
||||
elif isinstance(frame, UserStartedSpeakingFrame):
|
||||
# User started speaking - set the voicemail event
|
||||
if self._voicemail_detected:
|
||||
self._voicemail_event.set()
|
||||
|
||||
elif isinstance(frame, UserStoppedSpeakingFrame):
|
||||
# User stopped speaking - clear the voicemail event
|
||||
if self._voicemail_detected:
|
||||
self._voicemail_event.clear()
|
||||
|
||||
else:
|
||||
# Pass all non-LLM frames through
|
||||
# Blocking LLM frames prevents interference with the downstream LLM
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
async def _process_classification(self, full_response: str):
|
||||
"""Process the complete LLM classification response and trigger actions.
|
||||
|
||||
Analyzes the aggregated response text to determine if it contains
|
||||
"CONVERSATION" or "VOICEMAIL" and triggers the appropriate notifications
|
||||
and callbacks based on the classification result.
|
||||
|
||||
Args:
|
||||
full_response: The complete aggregated response text from the LLM.
|
||||
"""
|
||||
if self._decision_made:
|
||||
return
|
||||
|
||||
response = full_response.upper()
|
||||
logger.debug(f"{self}: Classifying response: '{full_response}'")
|
||||
|
||||
if "CONVERSATION" in response:
|
||||
# Human answered - continue normal conversation flow
|
||||
self._decision_made = True
|
||||
logger.info(f"{self}: CONVERSATION detected")
|
||||
await self._gate_notifier.notify() # Close the classifier gate
|
||||
await self._conversation_notifier.notify() # Release buffered TTS frames
|
||||
|
||||
elif "VOICEMAIL" in response:
|
||||
# Voicemail detected - trigger voicemail handling
|
||||
self._decision_made = True
|
||||
self._voicemail_detected = True
|
||||
logger.info(f"{self}: VOICEMAIL detected")
|
||||
await self._gate_notifier.notify() # Close the classifier gate
|
||||
await self._voicemail_notifier.notify() # Clear buffered TTS frames
|
||||
|
||||
# Interrupt the current pipeline to stop any ongoing processing
|
||||
await self.push_frame(BotInterruptionFrame(), FrameDirection.UPSTREAM)
|
||||
|
||||
# Set the voicemail event to trigger the voicemail handler
|
||||
self._voicemail_event.clear()
|
||||
|
||||
else:
|
||||
# This can happen if the LLM is interrupted before completing the response
|
||||
logger.debug(f"{self}: No classification found: '{full_response}'")
|
||||
|
||||
async def _delayed_voicemail_handler(self):
|
||||
"""Execute the voicemail event handler after the configured delay.
|
||||
|
||||
This method waits for the specified delay period, then triggers the
|
||||
developer's voicemail event handler. The timer can be cancelled and restarted
|
||||
based on user speech patterns to ensure proper timing.
|
||||
"""
|
||||
while True:
|
||||
try:
|
||||
await asyncio.wait_for(
|
||||
self._voicemail_event.wait(), timeout=self._voicemail_response_delay
|
||||
)
|
||||
await asyncio.sleep(0.1)
|
||||
except asyncio.TimeoutError:
|
||||
await self._call_event_handler("on_voicemail_detected")
|
||||
break
|
||||
|
||||
|
||||
class TTSGate(FrameProcessor):
|
||||
"""Gates TTS frames until voicemail classification decision is made.
|
||||
|
||||
This processor holds TTS output frames in a gate while the voicemail
|
||||
classification is in progress. This prevents audio from being played
|
||||
to the caller before determining if they're human or a voicemail system.
|
||||
|
||||
The gate operates in two modes based on the classification result:
|
||||
|
||||
- CONVERSATION: Opens the gate to release all held frames for normal dialogue
|
||||
- VOICEMAIL: Clears held frames since they're not needed for voicemail
|
||||
|
||||
The gating only applies to TTS-related frames (TTSTextFrame, TTSAudioRawFrame).
|
||||
All other frames pass through immediately to maintain proper pipeline flow.
|
||||
"""
|
||||
|
||||
def __init__(self, conversation_notifier: BaseNotifier, voicemail_notifier: BaseNotifier):
|
||||
"""Initialize the TTS gate.
|
||||
|
||||
Args:
|
||||
conversation_notifier: Notifier that signals when a conversation is
|
||||
detected and gated frames should be released for playback.
|
||||
voicemail_notifier: Notifier that signals when voicemail is detected
|
||||
and gated frames should be cleared (not played).
|
||||
"""
|
||||
super().__init__()
|
||||
self._conversation_notifier = conversation_notifier
|
||||
self._voicemail_notifier = voicemail_notifier
|
||||
self._frame_buffer: List[tuple[Frame, FrameDirection]] = []
|
||||
self._gating_active = True
|
||||
self._conversation_task: Optional[asyncio.Task] = None
|
||||
self._voicemail_task: Optional[asyncio.Task] = None
|
||||
|
||||
async def setup(self, setup: FrameProcessorSetup):
|
||||
"""Set up the processor with required components.
|
||||
|
||||
Args:
|
||||
setup: Configuration object containing setup parameters.
|
||||
"""
|
||||
await super().setup(setup)
|
||||
|
||||
self._conversation_task = self.create_task(self._wait_for_conversation())
|
||||
self._voicemail_task = self.create_task(self._wait_for_voicemail())
|
||||
|
||||
async def cleanup(self):
|
||||
"""Clean up the processor resources."""
|
||||
await super().cleanup()
|
||||
if self._conversation_task:
|
||||
await self.cancel_task(self._conversation_task)
|
||||
self._conversation_task = None
|
||||
if self._voicemail_task:
|
||||
await self.cancel_task(self._voicemail_task)
|
||||
self._voicemail_task = None
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process frames and handle gating logic based on frame type.
|
||||
|
||||
TTS frames are gated while classification is active. All other frames
|
||||
pass through immediately. The gating state is controlled by the
|
||||
classification notifications.
|
||||
|
||||
Args:
|
||||
frame: The frame to process.
|
||||
direction: The direction of frame flow in the pipeline.
|
||||
"""
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
# Core gating logic: hold TTS frames, pass everything else through
|
||||
if self._gating_active and isinstance(
|
||||
frame, (TTSStartedFrame, TTSStoppedFrame, TTSTextFrame, TTSAudioRawFrame)
|
||||
):
|
||||
# Gate TTS frames while waiting for classification decision
|
||||
self._frame_buffer.append((frame, direction))
|
||||
else:
|
||||
# Pass through all non-TTS frames immediately
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
async def _wait_for_conversation(self):
|
||||
"""Wait for conversation detection notification and release gated frames.
|
||||
|
||||
When a conversation is detected, all gated TTS frames are released
|
||||
in order to continue normal dialogue flow. This allows the bot to
|
||||
respond naturally to the human caller.
|
||||
"""
|
||||
await self._conversation_notifier.wait()
|
||||
|
||||
# Release all gated frames in original order
|
||||
self._gating_active = False
|
||||
for frame, direction in self._frame_buffer:
|
||||
await self.push_frame(frame, direction)
|
||||
self._frame_buffer.clear()
|
||||
|
||||
async def _wait_for_voicemail(self):
|
||||
"""Wait for voicemail detection notification and clear gated frames.
|
||||
|
||||
When voicemail is detected, all gated TTS frames are discarded
|
||||
since they were intended for human conversation and are not appropriate
|
||||
for voicemail systems. The developer event handlers will handle voicemail-
|
||||
specific audio output.
|
||||
"""
|
||||
await self._voicemail_notifier.wait()
|
||||
|
||||
# Clear gated frames without playing them
|
||||
self._gating_active = False
|
||||
self._frame_buffer.clear()
|
||||
|
||||
|
||||
class VoicemailDetector(ParallelPipeline):
|
||||
"""Parallel pipeline for detecting voicemail vs. live conversation in outbound calls.
|
||||
|
||||
This detector uses a parallel pipeline architecture to perform real-time
|
||||
classification of outbound phone calls without interrupting the conversation
|
||||
flow. It determines whether a human answered the phone or if the call went
|
||||
to a voicemail system.
|
||||
|
||||
Architecture:
|
||||
|
||||
- Conversation branch: Empty pipeline that allows normal frame flow
|
||||
- Classification branch: Contains the LLM classifier and decision logic
|
||||
|
||||
The system uses a gate mechanism to control when classification runs and
|
||||
a gating system to prevent TTS output until classification is complete.
|
||||
Once a decision is made, the appropriate action is taken:
|
||||
|
||||
- CONVERSATION: Continue normal bot dialogue
|
||||
- VOICEMAIL: Trigger developer event handler for custom voicemail handling
|
||||
|
||||
Example::
|
||||
|
||||
classification_llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
detector = VoicemailDetector(llm=classification_llm)
|
||||
|
||||
@detector.event_handler("on_voicemail_detected")
|
||||
async def handle_voicemail(processor):
|
||||
await processor.push_frame(TTSSpeakFrame("Please leave a message."))
|
||||
|
||||
pipeline = Pipeline([
|
||||
transport.input(),
|
||||
stt,
|
||||
detector.detector(), # Classification
|
||||
context_aggregator.user(),
|
||||
llm,
|
||||
tts,
|
||||
detector.gate(), # TTS gating
|
||||
transport.output(),
|
||||
context_aggregator.assistant(),
|
||||
])
|
||||
|
||||
# For custom prompts, append the required response instruction:
|
||||
custom_prompt = "Your custom classification logic here. " + VoicemailDetector.CLASSIFIER_RESPONSE_INSTRUCTION
|
||||
|
||||
Events:
|
||||
on_voicemail_detected: Triggered when voicemail is detected after the configured
|
||||
delay. The event handler receives one argument: the ClassificationProcessor
|
||||
instance which can be used to push frames.
|
||||
|
||||
Constants:
|
||||
CLASSIFIER_RESPONSE_INSTRUCTION: The exact text that must be included in custom
|
||||
system prompts to ensure proper classification functionality.
|
||||
"""
|
||||
|
||||
CLASSIFIER_RESPONSE_INSTRUCTION = 'Respond with ONLY "CONVERSATION" if a person answered, or "VOICEMAIL" if it\'s voicemail/recording.'
|
||||
|
||||
DEFAULT_SYSTEM_PROMPT = (
|
||||
"""You are a voicemail detection classifier for an OUTBOUND calling system. A bot has called a phone number and you need to determine if a human answered or if the call went to voicemail based on the provided text.
|
||||
|
||||
HUMAN ANSWERED - LIVE CONVERSATION (respond "CONVERSATION"):
|
||||
- Personal greetings: "Hello?", "Hi", "Yeah?", "John speaking"
|
||||
- Interactive responses: "Who is this?", "What do you want?", "Can I help you?"
|
||||
- Conversational tone expecting back-and-forth dialogue
|
||||
- Questions directed at the caller: "Hello? Anyone there?"
|
||||
- Informal responses: "Yep", "What's up?", "Speaking"
|
||||
- Natural, spontaneous speech patterns
|
||||
- Immediate acknowledgment of the call
|
||||
|
||||
VOICEMAIL SYSTEM (respond "VOICEMAIL"):
|
||||
- Automated voicemail greetings: "Hi, you've reached [name], please leave a message"
|
||||
- Phone carrier messages: "The number you have dialed is not in service", "Please leave a message", "All circuits are busy"
|
||||
- Professional voicemail: "This is [name], I'm not available right now"
|
||||
- Instructions about leaving messages: "leave a message", "leave your name and number"
|
||||
- References to callback or messaging: "call me back", "I'll get back to you"
|
||||
- Carrier system messages: "mailbox is full", "has not been set up"
|
||||
- Business hours messages: "our office is currently closed"
|
||||
|
||||
"""
|
||||
+ CLASSIFIER_RESPONSE_INSTRUCTION
|
||||
)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
llm: LLMService,
|
||||
voicemail_response_delay: float = 2.0,
|
||||
custom_system_prompt: Optional[str] = None,
|
||||
):
|
||||
"""Initialize the voicemail detector with classification and buffering components.
|
||||
|
||||
Args:
|
||||
llm: LLM service used for voicemail vs conversation classification.
|
||||
Should be fast and reliable for real-time classification.
|
||||
voicemail_response_delay: Delay in seconds after user stops speaking
|
||||
before triggering the voicemail event handler. This allows voicemail
|
||||
responses to be played back after a short delay to ensure the response
|
||||
occurs during the voicemail recording. Default is 2.0 seconds.
|
||||
custom_system_prompt: Optional custom system prompt for classification. If None,
|
||||
uses the default prompt optimized for outbound calling scenarios.
|
||||
Custom prompts should instruct the LLM to respond with exactly
|
||||
"CONVERSATION" or "VOICEMAIL" for proper detection functionality.
|
||||
"""
|
||||
self._classifier_llm = llm
|
||||
self._prompt = (
|
||||
custom_system_prompt if custom_system_prompt is not None else self.DEFAULT_SYSTEM_PROMPT
|
||||
)
|
||||
self._voicemail_response_delay = voicemail_response_delay
|
||||
|
||||
# Validate custom prompts to ensure they work with the detection logic
|
||||
if custom_system_prompt is not None:
|
||||
self._validate_prompt(custom_system_prompt)
|
||||
|
||||
# Set up the LLM context with the classification prompt
|
||||
self._messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": self._prompt,
|
||||
},
|
||||
]
|
||||
|
||||
# Create the LLM context and aggregators for conversation management
|
||||
self._context = OpenAILLMContext(self._messages)
|
||||
self._context_aggregator = llm.create_context_aggregator(self._context)
|
||||
|
||||
# Create notification system for coordinating between components
|
||||
self._gate_notifier = EventNotifier() # Signals classification completion
|
||||
self._conversation_notifier = EventNotifier() # Signals conversation detected
|
||||
self._voicemail_notifier = EventNotifier() # Signals voicemail detected
|
||||
|
||||
# Create the processor components
|
||||
self._classifier_gate = ClassifierGate(self._gate_notifier, self._conversation_notifier)
|
||||
self._conversation_gate = ConversationGate(self._voicemail_notifier)
|
||||
self._classification_processor = ClassificationProcessor(
|
||||
gate_notifier=self._gate_notifier,
|
||||
conversation_notifier=self._conversation_notifier,
|
||||
voicemail_notifier=self._voicemail_notifier,
|
||||
voicemail_response_delay=voicemail_response_delay,
|
||||
)
|
||||
self._voicemail_gate = TTSGate(self._conversation_notifier, self._voicemail_notifier)
|
||||
|
||||
# Initialize the parallel pipeline with conversation and classifier branches
|
||||
super().__init__(
|
||||
# Conversation branch: gate to blocks after voicemail detection
|
||||
[self._conversation_gate],
|
||||
# Classification branch: gate -> context -> LLM -> processor -> context
|
||||
[
|
||||
self._classifier_gate,
|
||||
self._context_aggregator.user(),
|
||||
self._classifier_llm,
|
||||
self._classification_processor,
|
||||
self._context_aggregator.assistant(),
|
||||
],
|
||||
)
|
||||
|
||||
# Register the voicemail detected event after super().__init__()
|
||||
self._register_event_handler("on_voicemail_detected")
|
||||
|
||||
def _validate_prompt(self, prompt: str) -> None:
|
||||
"""Validate custom prompt contains required response format instructions.
|
||||
|
||||
Custom prompts must instruct the LLM to respond with exactly "CONVERSATION"
|
||||
or "VOICEMAIL" for the detection logic to work properly. This method
|
||||
checks for the presence of these keywords and warns if they're missing.
|
||||
|
||||
Args:
|
||||
prompt: The custom system prompt to validate.
|
||||
"""
|
||||
has_conversation = "CONVERSATION" in prompt
|
||||
has_voicemail = "VOICEMAIL" in prompt
|
||||
|
||||
if not has_conversation or not has_voicemail:
|
||||
logger.warning(
|
||||
"Custom system prompt should instruct the LLM to respond with exactly "
|
||||
'"CONVERSATION" or "VOICEMAIL" for proper detection functionality. '
|
||||
f"Consider appending VoicemailDetector.CLASSIFIER_RESPONSE_INSTRUCTION to your prompt: "
|
||||
f'"{self.CLASSIFIER_RESPONSE_INSTRUCTION}"'
|
||||
)
|
||||
|
||||
def detector(self) -> "VoicemailDetector":
|
||||
"""Get the detector pipeline for placement after STT in the main pipeline.
|
||||
|
||||
This should be placed after the STT service and before the context
|
||||
aggregator in your main pipeline to enable voicemail classification.
|
||||
|
||||
Returns:
|
||||
The VoicemailDetector instance itself (which is a ParallelPipeline).
|
||||
"""
|
||||
return self
|
||||
|
||||
def gate(self) -> TTSGate:
|
||||
"""Get the gate processor for placement after TTS in the main pipeline.
|
||||
|
||||
This should be placed after the TTS service and before the transport
|
||||
output to enable TTS frame gating during classification.
|
||||
|
||||
Returns:
|
||||
The TTSGate processor instance.
|
||||
"""
|
||||
return self._voicemail_gate
|
||||
|
||||
def add_event_handler(self, event_name: str, handler):
|
||||
"""Add an event handler for voicemail detection events.
|
||||
|
||||
Args:
|
||||
event_name: The name of the event to handle.
|
||||
handler: The function to call when the event occurs.
|
||||
"""
|
||||
if event_name == "on_voicemail_detected":
|
||||
self._classification_processor.add_event_handler(event_name, handler)
|
||||
else:
|
||||
super().add_event_handler(event_name, handler)
|
||||
@@ -27,6 +27,7 @@ from typing import (
|
||||
Tuple,
|
||||
)
|
||||
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.interruptions.base_interruption_strategy import BaseInterruptionStrategy
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
@@ -36,6 +37,7 @@ from pipecat.utils.time import nanoseconds_to_str
|
||||
from pipecat.utils.utils import obj_count, obj_id
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext, NotGiven
|
||||
from pipecat.processors.frame_processor import FrameProcessor
|
||||
|
||||
|
||||
@@ -228,7 +230,7 @@ class OutputImageRawFrame(DataFrame, ImageRawFrame):
|
||||
|
||||
def __str__(self):
|
||||
pts = format_pts(self.pts)
|
||||
return f"{self.name}(pts: {pts}, size: {self.size}, format: {self.format})"
|
||||
return f"{self.name}(pts: {pts}, destination: {self.transport_destination}, size: {self.size}, format: {self.format})"
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -403,6 +405,11 @@ class OpenAILLMContextAssistantTimestampFrame(DataFrame):
|
||||
timestamp: str
|
||||
|
||||
|
||||
# A more universal (LLM-agnostic) name for
|
||||
# OpenAILLMContextAssistantTimestampFrame, matching LLMContext
|
||||
LLMContextAssistantTimestampFrame = OpenAILLMContextAssistantTimestampFrame
|
||||
|
||||
|
||||
@dataclass
|
||||
class TranscriptionMessage:
|
||||
"""A message in a conversation transcript.
|
||||
@@ -474,6 +481,20 @@ class TranscriptionUpdateFrame(DataFrame):
|
||||
return f"{self.name}(pts: {pts}, messages: {len(self.messages)})"
|
||||
|
||||
|
||||
@dataclass
|
||||
class LLMContextFrame(Frame):
|
||||
"""Frame containing a universal LLM context.
|
||||
|
||||
Used as a signal to LLM services to ingest the provided context and
|
||||
generate a response based on it.
|
||||
|
||||
Parameters:
|
||||
context: The LLM context containing messages, tools, and configuration.
|
||||
"""
|
||||
|
||||
context: "LLMContext"
|
||||
|
||||
|
||||
@dataclass
|
||||
class LLMMessagesFrame(DataFrame):
|
||||
"""Frame containing LLM messages for chat completion.
|
||||
@@ -556,7 +577,7 @@ class LLMSetToolsFrame(DataFrame):
|
||||
tools: List of tool/function definitions for the LLM.
|
||||
"""
|
||||
|
||||
tools: List[dict]
|
||||
tools: List[dict] | ToolsSchema | "NotGiven"
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -1445,3 +1466,20 @@ class MixerEnableFrame(MixerControlFrame):
|
||||
"""
|
||||
|
||||
enable: bool
|
||||
|
||||
|
||||
@dataclass
|
||||
class ServiceSwitcherFrame(ControlFrame):
|
||||
"""A base class for frames that control ServiceSwitcher behavior."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class ManuallySwitchServiceFrame(ServiceSwitcherFrame):
|
||||
"""A frame to request a manual switch in the active service in a ServiceSwitcher.
|
||||
|
||||
Handled by ServiceSwitcherStrategyManual to switch the active service.
|
||||
"""
|
||||
|
||||
service: "FrameProcessor"
|
||||
|
||||
@@ -11,7 +11,6 @@ processors without modifying the pipeline structure. Observers can be used
|
||||
for logging, debugging, analytics, and monitoring pipeline behavior.
|
||||
"""
|
||||
|
||||
from abc import abstractmethod
|
||||
from dataclasses import dataclass
|
||||
|
||||
from typing_extensions import TYPE_CHECKING
|
||||
@@ -23,6 +22,28 @@ if TYPE_CHECKING:
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
|
||||
|
||||
@dataclass
|
||||
class FrameProcessed:
|
||||
"""Event data for frame processing in the pipeline.
|
||||
|
||||
Represents an event where a frame is being processed by a processor. This
|
||||
data structure is typically used by observers to track the flow of frames
|
||||
through the pipeline for logging, debugging, or analytics purposes.
|
||||
|
||||
Parameters:
|
||||
processor: The processor processing the frame.
|
||||
frame: The frame being processed.
|
||||
direction: The direction of the frame (e.g., downstream or upstream).
|
||||
timestamp: The time when the frame was pushed, based on the pipeline clock.
|
||||
|
||||
"""
|
||||
|
||||
processor: "FrameProcessor"
|
||||
frame: Frame
|
||||
direction: "FrameDirection"
|
||||
timestamp: int
|
||||
|
||||
|
||||
@dataclass
|
||||
class FramePushed:
|
||||
"""Event data for frame transfers between processors in the pipeline.
|
||||
@@ -56,7 +77,18 @@ class BaseObserver(BaseObject):
|
||||
performance analysis, and analytics collection.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
async def on_process_frame(self, data: FrameProcessed):
|
||||
"""Handle the event when a frame is being processed by a processor.
|
||||
|
||||
This method should be implemented by subclasses to define specific
|
||||
behavior (e.g., logging, monitoring, debugging) when a frame is
|
||||
being processed by a processor.
|
||||
|
||||
Args:
|
||||
data: The event data containing details about the frame processing.
|
||||
"""
|
||||
pass
|
||||
|
||||
async def on_push_frame(self, data: FramePushed):
|
||||
"""Handle the event when a frame is pushed from one processor to another.
|
||||
|
||||
|
||||
@@ -6,31 +6,12 @@
|
||||
|
||||
"""Base pipeline implementation for frame processing."""
|
||||
|
||||
from abc import abstractmethod
|
||||
from typing import List
|
||||
|
||||
from pipecat.processors.frame_processor import FrameProcessor
|
||||
|
||||
|
||||
class BasePipeline(FrameProcessor):
|
||||
"""Base class for all pipeline implementations.
|
||||
"""Base class for all pipeline implementations."""
|
||||
|
||||
Provides the foundation for pipeline processors that need to support
|
||||
metrics collection from their contained processors.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, **kwargs):
|
||||
"""Initialize the base pipeline."""
|
||||
super().__init__()
|
||||
|
||||
@abstractmethod
|
||||
def processors_with_metrics(self) -> List[FrameProcessor]:
|
||||
"""Return processors that can generate metrics.
|
||||
|
||||
Implementing classes should collect and return all processors within
|
||||
their pipeline that support metrics generation.
|
||||
|
||||
Returns:
|
||||
List of frame processors that support metrics collection.
|
||||
"""
|
||||
pass
|
||||
super().__init__(**kwargs)
|
||||
|
||||
84
src/pipecat/pipeline/llm_switcher.py
Normal file
@@ -0,0 +1,84 @@
|
||||
#
|
||||
# Copyright (c) 2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""LLM switcher for switching between different LLMs at runtime, with different switching strategies."""
|
||||
|
||||
from typing import Any, List, Optional, Type
|
||||
|
||||
from pipecat.pipeline.service_switcher import ServiceSwitcher, StrategyType
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.services.llm_service import LLMService
|
||||
|
||||
|
||||
class LLMSwitcher(ServiceSwitcher[StrategyType]):
|
||||
"""A pipeline that switches between different LLMs at runtime."""
|
||||
|
||||
def __init__(self, llms: List[LLMService], strategy_type: Type[StrategyType]):
|
||||
"""Initialize the service switcher with a list of LLMs and a switching strategy."""
|
||||
super().__init__(llms, strategy_type)
|
||||
|
||||
@property
|
||||
def llms(self) -> List[LLMService]:
|
||||
"""Get the list of LLMs managed by this switcher."""
|
||||
return self.services
|
||||
|
||||
@property
|
||||
def active_llm(self) -> Optional[LLMService]:
|
||||
"""Get the currently active LLM, if any."""
|
||||
return self.strategy.active_service
|
||||
|
||||
async def run_inference(
|
||||
self, context: LLMContext, system_instruction: Optional[str] = None
|
||||
) -> Optional[str]:
|
||||
"""Run a one-shot, out-of-band (i.e. out-of-pipeline) inference with the given LLM context, using the currently active LLM.
|
||||
|
||||
Args:
|
||||
context: The LLM context containing conversation history.
|
||||
system_instruction: Optional system instruction to guide the LLM's
|
||||
behavior. You could also (again, optionally) provide a system
|
||||
instruction directly in the context. If both are provided, the
|
||||
one in the context takes precedence.
|
||||
|
||||
Returns:
|
||||
The LLM's response as a string, or None if no response is generated.
|
||||
"""
|
||||
if self.active_llm:
|
||||
return await self.active_llm.run_inference(
|
||||
context=context, system_instruction=system_instruction
|
||||
)
|
||||
return None
|
||||
|
||||
def register_function(
|
||||
self,
|
||||
function_name: Optional[str],
|
||||
handler: Any,
|
||||
start_callback=None,
|
||||
*,
|
||||
cancel_on_interruption: bool = True,
|
||||
):
|
||||
"""Register a function handler for LLM function calls, on all LLMs, active or not.
|
||||
|
||||
Args:
|
||||
function_name: The name of the function to handle. Use None to handle
|
||||
all function calls with a catch-all handler.
|
||||
handler: The function handler. Should accept a single FunctionCallParams
|
||||
parameter.
|
||||
start_callback: Legacy callback function (deprecated). Put initialization
|
||||
code at the top of your handler instead.
|
||||
|
||||
.. deprecated:: 0.0.59
|
||||
The `start_callback` parameter is deprecated and will be removed in a future version.
|
||||
|
||||
cancel_on_interruption: Whether to cancel this function call when an
|
||||
interruption occurs. Defaults to True.
|
||||
"""
|
||||
for llm in self.llms:
|
||||
llm.register_function(
|
||||
function_name=function_name,
|
||||
handler=handler,
|
||||
start_callback=start_callback,
|
||||
cancel_on_interruption=cancel_on_interruption,
|
||||
)
|
||||
@@ -11,106 +11,15 @@ sub-pipelines concurrently, with coordination for system frames and proper
|
||||
handling of pipeline lifecycle events.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from itertools import chain
|
||||
from typing import Awaitable, Callable, Dict, List
|
||||
from typing import Dict, List
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
EndFrame,
|
||||
Frame,
|
||||
StartFrame,
|
||||
StartInterruptionFrame,
|
||||
SystemFrame,
|
||||
)
|
||||
from pipecat.frames.frames import EndFrame, Frame, StartFrame
|
||||
from pipecat.pipeline.base_pipeline import BasePipeline
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.pipeline import Pipeline, PipelineSink, PipelineSource
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor, FrameProcessorSetup
|
||||
from pipecat.utils.asyncio.watchdog_queue import WatchdogQueue
|
||||
|
||||
|
||||
class ParallelPipelineSource(FrameProcessor):
|
||||
"""Source processor for parallel pipeline branches.
|
||||
|
||||
Handles frame routing for parallel pipeline inputs, directing system frames
|
||||
to the parent push function and other upstream frames to a queue for processing.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
upstream_queue: asyncio.Queue,
|
||||
push_frame_func: Callable[[Frame, FrameDirection], Awaitable[None]],
|
||||
):
|
||||
"""Initialize the parallel pipeline source.
|
||||
|
||||
Args:
|
||||
upstream_queue: Queue for collecting upstream frames from this branch.
|
||||
push_frame_func: Function to push frames to the parent parallel pipeline.
|
||||
"""
|
||||
super().__init__()
|
||||
self._up_queue = upstream_queue
|
||||
self._push_frame_func = push_frame_func
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process frames with special handling for system frames.
|
||||
|
||||
Args:
|
||||
frame: The frame to process.
|
||||
direction: The direction of frame flow.
|
||||
"""
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
match direction:
|
||||
case FrameDirection.UPSTREAM:
|
||||
if isinstance(frame, SystemFrame):
|
||||
await self._push_frame_func(frame, direction)
|
||||
else:
|
||||
await self._up_queue.put(frame)
|
||||
case FrameDirection.DOWNSTREAM:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
class ParallelPipelineSink(FrameProcessor):
|
||||
"""Sink processor for parallel pipeline branches.
|
||||
|
||||
Handles frame routing for parallel pipeline outputs, directing system frames
|
||||
to the parent push function and other downstream frames to a queue for coordination.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
downstream_queue: asyncio.Queue,
|
||||
push_frame_func: Callable[[Frame, FrameDirection], Awaitable[None]],
|
||||
):
|
||||
"""Initialize the parallel pipeline sink.
|
||||
|
||||
Args:
|
||||
downstream_queue: Queue for collecting downstream frames from this branch.
|
||||
push_frame_func: Function to push frames to the parent parallel pipeline.
|
||||
"""
|
||||
super().__init__()
|
||||
self._down_queue = downstream_queue
|
||||
self._push_frame_func = push_frame_func
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process frames with special handling for system frames.
|
||||
|
||||
Args:
|
||||
frame: The frame to process.
|
||||
direction: The direction of frame flow.
|
||||
"""
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
match direction:
|
||||
case FrameDirection.UPSTREAM:
|
||||
await self.push_frame(frame, direction)
|
||||
case FrameDirection.DOWNSTREAM:
|
||||
if isinstance(frame, SystemFrame):
|
||||
await self._push_frame_func(frame, direction)
|
||||
else:
|
||||
await self._down_queue.put(frame)
|
||||
|
||||
|
||||
class ParallelPipeline(BasePipeline):
|
||||
@@ -132,28 +41,69 @@ class ParallelPipeline(BasePipeline):
|
||||
Exception: If no processor lists are provided.
|
||||
TypeError: If any argument is not a list of processors.
|
||||
"""
|
||||
# We don't set it to direct mode because we use frame pausing and that
|
||||
# requires queues.
|
||||
super().__init__()
|
||||
|
||||
if len(args) == 0:
|
||||
raise Exception(f"ParallelPipeline needs at least one argument")
|
||||
|
||||
self._args = args
|
||||
self._sources = []
|
||||
self._sinks = []
|
||||
self._pipelines = []
|
||||
|
||||
self._seen_ids = set()
|
||||
self._endframe_counter: Dict[int, int] = {}
|
||||
self._start_frame_counter: Dict[int, int] = {}
|
||||
self._started = False
|
||||
self._frame_counter: Dict[int, int] = {}
|
||||
|
||||
self._up_task = None
|
||||
self._down_task = None
|
||||
logger.debug(f"Creating {self} pipelines")
|
||||
|
||||
for processors in args:
|
||||
if not isinstance(processors, list):
|
||||
raise TypeError(f"ParallelPipeline argument {processors} is not a list")
|
||||
|
||||
num_pipelines = len(self._pipelines)
|
||||
|
||||
# We add a source before the pipeline and a sink after so we control
|
||||
# the frames that are pushed upstream and downstream.
|
||||
source = PipelineSource(
|
||||
self._parallel_push_frame, name=f"{self}::Source{num_pipelines}"
|
||||
)
|
||||
sink = PipelineSink(self._pipeline_sink_push_frame, name=f"{self}::Sink{num_pipelines}")
|
||||
|
||||
# Create pipeline
|
||||
pipeline = Pipeline(processors, source=source, sink=sink)
|
||||
self._pipelines.append(pipeline)
|
||||
|
||||
logger.debug(f"Finished creating {self} pipelines")
|
||||
|
||||
#
|
||||
# BasePipeline
|
||||
# Frame processor
|
||||
#
|
||||
|
||||
@property
|
||||
def processors(self):
|
||||
"""Return the list of sub-processors contained within this processor.
|
||||
|
||||
Only compound processors (e.g. pipelines and parallel pipelines) have
|
||||
sub-processors. Non-compound processors will return an empty list.
|
||||
|
||||
Returns:
|
||||
The list of sub-processors if this is a compound processor.
|
||||
"""
|
||||
return self._pipelines
|
||||
|
||||
@property
|
||||
def entry_processors(self) -> List["FrameProcessor"]:
|
||||
"""Return the list of entry processors for this processor.
|
||||
|
||||
Entry processors are the first processors in a compound processor
|
||||
(e.g. pipelines, parallel pipelines). Note that pipelines can also be an
|
||||
entry processor as pipelines are processors themselves. Non-compound
|
||||
processors will simply return an empty list.
|
||||
|
||||
Returns:
|
||||
The list of entry processors.
|
||||
"""
|
||||
return self._pipelines
|
||||
|
||||
def processors_with_metrics(self) -> List[FrameProcessor]:
|
||||
"""Collect processors that can generate metrics from all parallel branches.
|
||||
|
||||
@@ -162,10 +112,6 @@ class ParallelPipeline(BasePipeline):
|
||||
"""
|
||||
return list(chain.from_iterable(p.processors_with_metrics() for p in self._pipelines))
|
||||
|
||||
#
|
||||
# Frame processor
|
||||
#
|
||||
|
||||
async def setup(self, setup: FrameProcessorSetup):
|
||||
"""Set up the parallel pipeline and all its branches.
|
||||
|
||||
@@ -176,39 +122,14 @@ class ParallelPipeline(BasePipeline):
|
||||
TypeError: If any processor list argument is not actually a list.
|
||||
"""
|
||||
await super().setup(setup)
|
||||
|
||||
self._up_queue = WatchdogQueue(setup.task_manager)
|
||||
self._down_queue = WatchdogQueue(setup.task_manager)
|
||||
|
||||
logger.debug(f"Creating {self} pipelines")
|
||||
for processors in self._args:
|
||||
if not isinstance(processors, list):
|
||||
raise TypeError(f"ParallelPipeline argument {processors} is not a list")
|
||||
|
||||
# We will add a source before the pipeline and a sink after.
|
||||
source = ParallelPipelineSource(self._up_queue, self._parallel_push_frame)
|
||||
sink = ParallelPipelineSink(self._down_queue, self._pipeline_sink_push_frame)
|
||||
self._sources.append(source)
|
||||
self._sinks.append(sink)
|
||||
|
||||
# Create pipeline
|
||||
pipeline = Pipeline(processors)
|
||||
source.link(pipeline)
|
||||
pipeline.link(sink)
|
||||
self._pipelines.append(pipeline)
|
||||
|
||||
logger.debug(f"Finished creating {self} pipelines")
|
||||
|
||||
await asyncio.gather(*[s.setup(setup) for s in self._sources])
|
||||
await asyncio.gather(*[p.setup(setup) for p in self._pipelines])
|
||||
await asyncio.gather(*[s.setup(setup) for s in self._sinks])
|
||||
for p in self._pipelines:
|
||||
await p.setup(setup)
|
||||
|
||||
async def cleanup(self):
|
||||
"""Clean up the parallel pipeline and all its branches."""
|
||||
await super().cleanup()
|
||||
await asyncio.gather(*[s.cleanup() for s in self._sources])
|
||||
await asyncio.gather(*[p.cleanup() for p in self._pipelines])
|
||||
await asyncio.gather(*[s.cleanup() for s in self._sinks])
|
||||
for p in self._pipelines:
|
||||
await p.cleanup()
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process frames through all parallel branches with lifecycle coordination.
|
||||
@@ -219,79 +140,15 @@ class ParallelPipeline(BasePipeline):
|
||||
"""
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, StartFrame):
|
||||
self._start_frame_counter[frame.id] = len(self._pipelines)
|
||||
elif isinstance(frame, EndFrame):
|
||||
self._endframe_counter[frame.id] = len(self._pipelines)
|
||||
elif isinstance(frame, CancelFrame):
|
||||
await self._cancel()
|
||||
# Parallel pipeline synchronized frames.
|
||||
if isinstance(frame, (StartFrame, EndFrame)):
|
||||
self._frame_counter[frame.id] = len(self._pipelines)
|
||||
await self.pause_processing_system_frames()
|
||||
await self.pause_processing_frames()
|
||||
|
||||
if direction == FrameDirection.UPSTREAM:
|
||||
# If we get an upstream frame we process it in each sink.
|
||||
await asyncio.gather(*[s.queue_frame(frame, direction) for s in self._sinks])
|
||||
elif direction == FrameDirection.DOWNSTREAM:
|
||||
# If we get a downstream frame we process it in each source.
|
||||
await asyncio.gather(*[s.queue_frame(frame, direction) for s in self._sources])
|
||||
|
||||
# Handle interruptions after everything has been cancelled.
|
||||
if isinstance(frame, StartInterruptionFrame):
|
||||
await self._handle_interruption()
|
||||
# Wait for tasks to finish.
|
||||
elif isinstance(frame, EndFrame):
|
||||
await self._stop()
|
||||
|
||||
async def _start(self, frame: StartFrame):
|
||||
"""Start the parallel pipeline processing tasks."""
|
||||
await self._create_tasks()
|
||||
|
||||
async def _stop(self):
|
||||
"""Stop all parallel pipeline processing tasks."""
|
||||
if self._up_task:
|
||||
# The up task doesn't receive an EndFrame, so we just cancel it.
|
||||
await self.cancel_task(self._up_task)
|
||||
self._up_task = None
|
||||
|
||||
if self._down_task:
|
||||
# The down tasks waits for the last EndFrame sent by the internal
|
||||
# pipelines.
|
||||
await self._down_task
|
||||
self._down_task = None
|
||||
|
||||
async def _cancel(self):
|
||||
"""Cancel all parallel pipeline processing tasks."""
|
||||
if self._up_task:
|
||||
self._up_queue.cancel()
|
||||
await self.cancel_task(self._up_task)
|
||||
self._up_task = None
|
||||
if self._down_task:
|
||||
self._down_queue.cancel()
|
||||
await self.cancel_task(self._down_task)
|
||||
self._down_task = None
|
||||
|
||||
async def _create_tasks(self):
|
||||
"""Create upstream and downstream processing tasks if not already running."""
|
||||
if not self._up_task:
|
||||
self._up_task = self.create_task(self._process_up_queue())
|
||||
if not self._down_task:
|
||||
self._down_task = self.create_task(self._process_down_queue())
|
||||
|
||||
async def _drain_queue(self, queue: asyncio.Queue):
|
||||
try:
|
||||
while not queue.empty():
|
||||
queue.get_nowait()
|
||||
except asyncio.QueueEmpty:
|
||||
logger.debug(f"Draining {self} queue already empty")
|
||||
|
||||
async def _drain_queues(self):
|
||||
"""Drain all frames from upstream and downstream queues."""
|
||||
await self._drain_queue(self._up_queue)
|
||||
await self._drain_queue(self._down_queue)
|
||||
|
||||
async def _handle_interruption(self):
|
||||
"""Handle interruption by cancelling tasks, draining queues, and restarting."""
|
||||
await self._cancel()
|
||||
await self._drain_queues()
|
||||
await self._create_tasks()
|
||||
# Process frames in each of the sub-pipelines.
|
||||
for p in self._pipelines:
|
||||
await p.queue_frame(frame, direction)
|
||||
|
||||
async def _parallel_push_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Push frames while avoiding duplicates using frame ID tracking."""
|
||||
@@ -300,52 +157,18 @@ class ParallelPipeline(BasePipeline):
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
async def _pipeline_sink_push_frame(self, frame: Frame, direction: FrameDirection):
|
||||
if isinstance(frame, StartFrame):
|
||||
# Decrement counter and check if all pipelines have processed the StartFrame
|
||||
start_frame_counter = self._start_frame_counter.get(frame.id, 0)
|
||||
if start_frame_counter > 0:
|
||||
self._start_frame_counter[frame.id] -= 1
|
||||
start_frame_counter = self._start_frame_counter[frame.id]
|
||||
# Parallel pipeline synchronized frames.
|
||||
if isinstance(frame, (StartFrame, EndFrame)):
|
||||
# Decrement counter.
|
||||
frame_counter = self._frame_counter.get(frame.id, 0)
|
||||
if frame_counter > 0:
|
||||
self._frame_counter[frame.id] -= 1
|
||||
frame_counter = self._frame_counter[frame.id]
|
||||
|
||||
# Only push the StartFrame when all pipelines have processed it
|
||||
if start_frame_counter == 0:
|
||||
self._started = True
|
||||
await self._start(frame)
|
||||
# Only push the frame when all pipelines have processed it.
|
||||
if frame_counter == 0:
|
||||
await self._parallel_push_frame(frame, direction)
|
||||
await self.resume_processing_system_frames()
|
||||
await self.resume_processing_frames()
|
||||
else:
|
||||
if self._started:
|
||||
await self._parallel_push_frame(frame, direction)
|
||||
else:
|
||||
await self._down_queue.put(frame)
|
||||
|
||||
async def _process_up_queue(self):
|
||||
"""Process upstream frames from all parallel branches."""
|
||||
while True:
|
||||
frame = await self._up_queue.get()
|
||||
await self._parallel_push_frame(frame, FrameDirection.UPSTREAM)
|
||||
self._up_queue.task_done()
|
||||
|
||||
async def _process_down_queue(self):
|
||||
"""Process downstream frames with EndFrame coordination.
|
||||
|
||||
Coordinates EndFrames to ensure they are only pushed upstream once
|
||||
all parallel branches have completed processing them.
|
||||
"""
|
||||
running = True
|
||||
while running:
|
||||
frame = await self._down_queue.get()
|
||||
|
||||
endframe_counter = self._endframe_counter.get(frame.id, 0)
|
||||
|
||||
# If we have a counter, decrement it.
|
||||
if endframe_counter > 0:
|
||||
self._endframe_counter[frame.id] -= 1
|
||||
endframe_counter = self._endframe_counter[frame.id]
|
||||
|
||||
# If we don't have a counter or we reached 0, push the frame.
|
||||
if endframe_counter == 0:
|
||||
await self._parallel_push_frame(frame, FrameDirection.DOWNSTREAM)
|
||||
|
||||
running = not (endframe_counter == 0 and isinstance(frame, EndFrame))
|
||||
|
||||
self._down_queue.task_done()
|
||||
await self._parallel_push_frame(frame, direction)
|
||||
|
||||
@@ -11,7 +11,7 @@ in sequence and manages frame flow between them, along with helper classes
|
||||
for pipeline source and sink operations.
|
||||
"""
|
||||
|
||||
from typing import Callable, Coroutine, List
|
||||
from typing import Callable, Coroutine, List, Optional
|
||||
|
||||
from pipecat.frames.frames import Frame
|
||||
from pipecat.pipeline.base_pipeline import BasePipeline
|
||||
@@ -26,13 +26,14 @@ class PipelineSource(FrameProcessor):
|
||||
provided upstream handler function.
|
||||
"""
|
||||
|
||||
def __init__(self, upstream_push_frame: Callable[[Frame, FrameDirection], Coroutine]):
|
||||
def __init__(self, upstream_push_frame: Callable[[Frame, FrameDirection], Coroutine], **kwargs):
|
||||
"""Initialize the pipeline source.
|
||||
|
||||
Args:
|
||||
upstream_push_frame: Coroutine function to handle upstream frames.
|
||||
**kwargs: Additional arguments passed to parent class.
|
||||
"""
|
||||
super().__init__()
|
||||
super().__init__(enable_direct_mode=True, **kwargs)
|
||||
self._upstream_push_frame = upstream_push_frame
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
@@ -59,13 +60,16 @@ class PipelineSink(FrameProcessor):
|
||||
provided downstream handler function.
|
||||
"""
|
||||
|
||||
def __init__(self, downstream_push_frame: Callable[[Frame, FrameDirection], Coroutine]):
|
||||
def __init__(
|
||||
self, downstream_push_frame: Callable[[Frame, FrameDirection], Coroutine], **kwargs
|
||||
):
|
||||
"""Initialize the pipeline sink.
|
||||
|
||||
Args:
|
||||
downstream_push_frame: Coroutine function to handle downstream frames.
|
||||
**kwargs: Additional arguments passed to parent class.
|
||||
"""
|
||||
super().__init__()
|
||||
super().__init__(enable_direct_mode=True, **kwargs)
|
||||
self._downstream_push_frame = downstream_push_frame
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
@@ -92,26 +96,60 @@ class Pipeline(BasePipeline):
|
||||
provides metrics collection from contained processors.
|
||||
"""
|
||||
|
||||
def __init__(self, processors: List[FrameProcessor]):
|
||||
def __init__(
|
||||
self,
|
||||
processors: List[FrameProcessor],
|
||||
*,
|
||||
source: Optional[FrameProcessor] = None,
|
||||
sink: Optional[FrameProcessor] = None,
|
||||
):
|
||||
"""Initialize the pipeline with a list of processors.
|
||||
|
||||
Args:
|
||||
processors: List of frame processors to connect in sequence.
|
||||
source: An optional pipeline source processor.
|
||||
sink: An optional pipeline sink processor.
|
||||
"""
|
||||
super().__init__()
|
||||
super().__init__(enable_direct_mode=True)
|
||||
|
||||
# Add a source and a sink queue so we can forward frames upstream and
|
||||
# downstream outside of the pipeline.
|
||||
self._source = PipelineSource(self.push_frame)
|
||||
self._sink = PipelineSink(self.push_frame)
|
||||
self._source = source or PipelineSource(self.push_frame, name=f"{self}::Source")
|
||||
self._sink = sink or PipelineSink(self.push_frame, name=f"{self}::Sink")
|
||||
self._processors: List[FrameProcessor] = [self._source] + processors + [self._sink]
|
||||
|
||||
self._link_processors()
|
||||
|
||||
#
|
||||
# BasePipeline
|
||||
# Frame processor
|
||||
#
|
||||
|
||||
@property
|
||||
def processors(self):
|
||||
"""Return the list of sub-processors contained within this processor.
|
||||
|
||||
Only compound processors (e.g. pipelines and parallel pipelines) have
|
||||
sub-processors. Non-compound processors will return an empty list.
|
||||
|
||||
Returns:
|
||||
The list of sub-processors if this is a compound processor.
|
||||
"""
|
||||
return self._processors
|
||||
|
||||
@property
|
||||
def entry_processors(self) -> List["FrameProcessor"]:
|
||||
"""Return the list of entry processors for this processor.
|
||||
|
||||
Entry processors are the first processors in a compound processor
|
||||
(e.g. pipelines, parallel pipelines). Note that pipelines can also be an
|
||||
entry processor as pipelines are processors themselves. Non-compound
|
||||
processors will simply return an empty list.
|
||||
|
||||
Returns:
|
||||
The list of entry processors.
|
||||
"""
|
||||
return [self._source]
|
||||
|
||||
def processors_with_metrics(self):
|
||||
"""Return processors that can generate metrics.
|
||||
|
||||
@@ -122,17 +160,12 @@ class Pipeline(BasePipeline):
|
||||
List of frame processors that can generate metrics.
|
||||
"""
|
||||
services = []
|
||||
for p in self._processors:
|
||||
if isinstance(p, BasePipeline):
|
||||
services.extend(p.processors_with_metrics())
|
||||
elif p.can_generate_metrics():
|
||||
for p in self.processors:
|
||||
if p.can_generate_metrics():
|
||||
services.append(p)
|
||||
services.extend(p.processors_with_metrics())
|
||||
return services
|
||||
|
||||
#
|
||||
# Frame processor
|
||||
#
|
||||
|
||||
async def setup(self, setup: FrameProcessorSetup):
|
||||
"""Set up the pipeline and all contained processors.
|
||||
|
||||
@@ -175,7 +208,5 @@ class Pipeline(BasePipeline):
|
||||
"""Link all processors in sequence and set their parent."""
|
||||
prev = self._processors[0]
|
||||
for curr in self._processors[1:]:
|
||||
prev.set_parent(self)
|
||||
prev.link(curr)
|
||||
prev = curr
|
||||
prev.set_parent(self)
|
||||
|
||||
@@ -71,7 +71,10 @@ class PipelineRunner(BaseObject):
|
||||
logger.debug(f"Runner {self} started running {task}")
|
||||
self._tasks[task.name] = task
|
||||
params = PipelineTaskParams(loop=self._loop)
|
||||
await task.run(params)
|
||||
try:
|
||||
await task.run(params)
|
||||
except asyncio.CancelledError:
|
||||
await self._cancel()
|
||||
del self._tasks[task.name]
|
||||
|
||||
# Cleanup base object.
|
||||
@@ -95,6 +98,10 @@ class PipelineRunner(BaseObject):
|
||||
async def cancel(self):
|
||||
"""Cancel all running tasks immediately."""
|
||||
logger.debug(f"Cancelling runner {self}")
|
||||
await self._cancel()
|
||||
|
||||
async def _cancel(self):
|
||||
"""Cancel all running tasks immediately."""
|
||||
await asyncio.gather(*[t.cancel() for t in self._tasks.values()])
|
||||
|
||||
def _setup_sigint(self):
|
||||
|
||||
144
src/pipecat/pipeline/service_switcher.py
Normal file
@@ -0,0 +1,144 @@
|
||||
#
|
||||
# Copyright (c) 2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Service switcher for switching between different services at runtime, with different switching strategies."""
|
||||
|
||||
from typing import Any, Generic, List, Optional, Type, TypeVar
|
||||
|
||||
from pipecat.frames.frames import Frame, ManuallySwitchServiceFrame, ServiceSwitcherFrame
|
||||
from pipecat.pipeline.parallel_pipeline import ParallelPipeline
|
||||
from pipecat.processors.filters.function_filter import FunctionFilter
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
|
||||
|
||||
class ServiceSwitcherStrategy:
|
||||
"""Base class for service switching strategies."""
|
||||
|
||||
def __init__(self, services: List[FrameProcessor]):
|
||||
"""Initialize the service switcher strategy with a list of services."""
|
||||
self.services = services
|
||||
self.active_service: Optional[FrameProcessor] = None
|
||||
|
||||
def is_active(self, service: FrameProcessor) -> bool:
|
||||
"""Determine if the given service is the currently active one.
|
||||
|
||||
This method should be overridden by subclasses to implement specific logic.
|
||||
|
||||
Args:
|
||||
service: The service to check.
|
||||
|
||||
Returns:
|
||||
True if the given service is the active one, False otherwise.
|
||||
"""
|
||||
raise NotImplementedError("Subclasses must implement this method.")
|
||||
|
||||
def handle_frame(self, frame: ServiceSwitcherFrame, direction: FrameDirection):
|
||||
"""Handle a frame that controls service switching.
|
||||
|
||||
This method can be overridden by subclasses to implement specific logic
|
||||
for handling frames that control service switching.
|
||||
|
||||
Args:
|
||||
frame: The frame to handle.
|
||||
direction: The direction of the frame (upstream or downstream).
|
||||
"""
|
||||
raise NotImplementedError("Subclasses must implement this method.")
|
||||
|
||||
|
||||
class ServiceSwitcherStrategyManual(ServiceSwitcherStrategy):
|
||||
"""A strategy for switching between services manually.
|
||||
|
||||
This strategy allows the user to manually select which service is active.
|
||||
The initial active service is the first one in the list.
|
||||
"""
|
||||
|
||||
def __init__(self, services: List[FrameProcessor]):
|
||||
"""Initialize the manual service switcher strategy with a list of services."""
|
||||
super().__init__(services)
|
||||
self.active_service = services[0] if services else None
|
||||
|
||||
def is_active(self, service: FrameProcessor) -> bool:
|
||||
"""Check if the given service is the currently active one.
|
||||
|
||||
Args:
|
||||
service: The service to check.
|
||||
|
||||
Returns:
|
||||
True if the given service is the active one, False otherwise.
|
||||
"""
|
||||
return service == self.active_service
|
||||
|
||||
def handle_frame(self, frame: ServiceSwitcherFrame, direction: FrameDirection):
|
||||
"""Handle a frame that controls service switching.
|
||||
|
||||
Args:
|
||||
frame: The frame to handle.
|
||||
direction: The direction of the frame (upstream or downstream).
|
||||
"""
|
||||
if isinstance(frame, ManuallySwitchServiceFrame):
|
||||
self._set_active(frame.service)
|
||||
else:
|
||||
raise ValueError(f"Unsupported frame type: {type(frame)}")
|
||||
|
||||
def _set_active(self, service: FrameProcessor):
|
||||
"""Set the active service to the given one.
|
||||
|
||||
Args:
|
||||
service: The service to set as active.
|
||||
"""
|
||||
if service in self.services:
|
||||
self.active_service = service
|
||||
else:
|
||||
raise ValueError(f"Service {service} is not in the list of available services.")
|
||||
|
||||
|
||||
StrategyType = TypeVar("StrategyType", bound=ServiceSwitcherStrategy)
|
||||
|
||||
|
||||
class ServiceSwitcher(ParallelPipeline, Generic[StrategyType]):
|
||||
"""A pipeline that switches between different services at runtime."""
|
||||
|
||||
def __init__(self, services: List[FrameProcessor], strategy_type: Type[StrategyType]):
|
||||
"""Initialize the service switcher with a list of services and a switching strategy."""
|
||||
strategy = strategy_type(services)
|
||||
super().__init__(*self._make_pipeline_definitions(services, strategy))
|
||||
self.services = services
|
||||
self.strategy = strategy
|
||||
|
||||
@staticmethod
|
||||
def _make_pipeline_definitions(
|
||||
services: List[FrameProcessor], strategy: ServiceSwitcherStrategy
|
||||
) -> List[Any]:
|
||||
pipelines = []
|
||||
for service in services:
|
||||
pipelines.append(ServiceSwitcher._make_pipeline_definition(service, strategy))
|
||||
return pipelines
|
||||
|
||||
@staticmethod
|
||||
def _make_pipeline_definition(
|
||||
service: FrameProcessor, strategy: ServiceSwitcherStrategy
|
||||
) -> Any:
|
||||
async def filter(frame) -> bool:
|
||||
_ = frame
|
||||
return strategy.is_active(service)
|
||||
|
||||
return [
|
||||
FunctionFilter(filter, direction=FrameDirection.DOWNSTREAM),
|
||||
service,
|
||||
FunctionFilter(filter, direction=FrameDirection.UPSTREAM),
|
||||
]
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process a frame, handling frames which affect service switching.
|
||||
|
||||
Args:
|
||||
frame: The frame to process.
|
||||
direction: The direction of the frame (upstream or downstream).
|
||||
"""
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, ServiceSwitcherFrame):
|
||||
self.strategy.handle_frame(frame, direction)
|
||||
@@ -22,7 +22,6 @@ from pipecat.frames.frames import ControlFrame, EndFrame, Frame, SystemFrame
|
||||
from pipecat.pipeline.base_pipeline import BasePipeline
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor, FrameProcessorSetup
|
||||
from pipecat.utils.asyncio.watchdog_queue import WatchdogQueue
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -49,7 +48,7 @@ class SyncParallelPipelineSource(FrameProcessor):
|
||||
Args:
|
||||
upstream_queue: Queue for collecting upstream frames from the pipeline.
|
||||
"""
|
||||
super().__init__()
|
||||
super().__init__(enable_direct_mode=True)
|
||||
self._up_queue = upstream_queue
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
@@ -81,7 +80,7 @@ class SyncParallelPipelineSink(FrameProcessor):
|
||||
Args:
|
||||
downstream_queue: Queue for collecting downstream frames from the pipeline.
|
||||
"""
|
||||
super().__init__()
|
||||
super().__init__(enable_direct_mode=True)
|
||||
self._down_queue = downstream_queue
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
@@ -128,40 +127,15 @@ class SyncParallelPipeline(BasePipeline):
|
||||
if len(args) == 0:
|
||||
raise Exception(f"SyncParallelPipeline needs at least one argument")
|
||||
|
||||
self._args = args
|
||||
self._sinks = []
|
||||
self._sources = []
|
||||
self._pipelines = []
|
||||
|
||||
#
|
||||
# BasePipeline
|
||||
#
|
||||
|
||||
def processors_with_metrics(self) -> List[FrameProcessor]:
|
||||
"""Collect processors that can generate metrics from all parallel pipelines.
|
||||
|
||||
Returns:
|
||||
List of frame processors that support metrics collection from all parallel paths.
|
||||
"""
|
||||
return list(chain.from_iterable(p.processors_with_metrics() for p in self._pipelines))
|
||||
|
||||
#
|
||||
# Frame processor
|
||||
#
|
||||
|
||||
async def setup(self, setup: FrameProcessorSetup):
|
||||
"""Set up the parallel pipeline and all contained processors.
|
||||
|
||||
Args:
|
||||
setup: Configuration for frame processor setup.
|
||||
"""
|
||||
await super().setup(setup)
|
||||
|
||||
self._up_queue = WatchdogQueue(setup.task_manager)
|
||||
self._down_queue = WatchdogQueue(setup.task_manager)
|
||||
self._up_queue = asyncio.Queue()
|
||||
self._down_queue = asyncio.Queue()
|
||||
|
||||
logger.debug(f"Creating {self} pipelines")
|
||||
for processors in self._args:
|
||||
for processors in args:
|
||||
if not isinstance(processors, list):
|
||||
raise TypeError(f"SyncParallelPipeline argument {processors} is not a list")
|
||||
|
||||
@@ -171,29 +145,68 @@ class SyncParallelPipeline(BasePipeline):
|
||||
source = SyncParallelPipelineSource(up_queue)
|
||||
sink = SyncParallelPipelineSink(down_queue)
|
||||
|
||||
# Create pipeline
|
||||
pipeline = Pipeline(processors)
|
||||
source.link(pipeline)
|
||||
pipeline.link(sink)
|
||||
self._pipelines.append(pipeline)
|
||||
|
||||
# Keep track of sources and sinks. We also keep the output queue of
|
||||
# the source and the sinks so we can use it later.
|
||||
self._sources.append({"processor": source, "queue": down_queue})
|
||||
self._sinks.append({"processor": sink, "queue": up_queue})
|
||||
|
||||
# Create pipeline
|
||||
pipeline = Pipeline(processors, source=source, sink=sink)
|
||||
self._pipelines.append(pipeline)
|
||||
|
||||
logger.debug(f"Finished creating {self} pipelines")
|
||||
|
||||
await asyncio.gather(*[s["processor"].setup(setup) for s in self._sources])
|
||||
#
|
||||
# Frame processor
|
||||
#
|
||||
|
||||
@property
|
||||
def processors(self):
|
||||
"""Return the list of sub-processors contained within this processor.
|
||||
|
||||
Only compound processors (e.g. pipelines and parallel pipelines) have
|
||||
sub-processors. Non-compound processors will return an empty list.
|
||||
|
||||
Returns:
|
||||
The list of sub-processors if this is a compound processor.
|
||||
"""
|
||||
return self._pipelines
|
||||
|
||||
@property
|
||||
def entry_processors(self) -> List["FrameProcessor"]:
|
||||
"""Return the list of entry processors for this processor.
|
||||
|
||||
Entry processors are the first processors in a compound processor
|
||||
(e.g. pipelines, parallel pipelines). Note that pipelines can also be an
|
||||
entry processor as pipelines are processors themselves. Non-compound
|
||||
processors will simply return an empty list.
|
||||
|
||||
Returns:
|
||||
The list of entry processors.
|
||||
"""
|
||||
return self._sources
|
||||
|
||||
def processors_with_metrics(self) -> List[FrameProcessor]:
|
||||
"""Collect processors that can generate metrics from all parallel pipelines.
|
||||
|
||||
Returns:
|
||||
List of frame processors that support metrics collection from all parallel paths.
|
||||
"""
|
||||
return list(chain.from_iterable(p.processors_with_metrics() for p in self._pipelines))
|
||||
|
||||
async def setup(self, setup: FrameProcessorSetup):
|
||||
"""Set up the parallel pipeline and all contained processors.
|
||||
|
||||
Args:
|
||||
setup: Configuration for frame processor setup.
|
||||
"""
|
||||
await super().setup(setup)
|
||||
await asyncio.gather(*[p.setup(setup) for p in self._pipelines])
|
||||
await asyncio.gather(*[s["processor"].setup(setup) for s in self._sinks])
|
||||
|
||||
async def cleanup(self):
|
||||
"""Clean up the parallel pipeline and all contained processors."""
|
||||
await super().cleanup()
|
||||
await asyncio.gather(*[s["processor"].cleanup() for s in self._sources])
|
||||
await asyncio.gather(*[p.cleanup() for p in self._pipelines])
|
||||
await asyncio.gather(*[s["processor"].cleanup() for s in self._sinks])
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process frames through all parallel pipelines with synchronization.
|
||||
|
||||
@@ -32,31 +32,33 @@ from pipecat.frames.frames import (
|
||||
Frame,
|
||||
HeartbeatFrame,
|
||||
InputAudioRawFrame,
|
||||
InterimTranscriptionFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
MetricsFrame,
|
||||
StartFrame,
|
||||
StopFrame,
|
||||
StopTaskFrame,
|
||||
TranscriptionFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
)
|
||||
from pipecat.metrics.metrics import ProcessingMetricsData, TTFBMetricsData
|
||||
from pipecat.observers.base_observer import BaseObserver
|
||||
from pipecat.observers.turn_tracking_observer import TurnTrackingObserver
|
||||
from pipecat.pipeline.base_pipeline import BasePipeline
|
||||
from pipecat.pipeline.base_task import BasePipelineTask, PipelineTaskParams
|
||||
from pipecat.pipeline.pipeline import Pipeline, PipelineSink, PipelineSource
|
||||
from pipecat.pipeline.task_observer import TaskObserver
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor, FrameProcessorSetup
|
||||
from pipecat.utils.asyncio.task_manager import (
|
||||
WATCHDOG_TIMEOUT,
|
||||
BaseTaskManager,
|
||||
TaskManager,
|
||||
TaskManagerParams,
|
||||
)
|
||||
from pipecat.utils.asyncio.watchdog_queue import WatchdogQueue
|
||||
from pipecat.utils.asyncio.task_manager import BaseTaskManager, TaskManager, TaskManagerParams
|
||||
from pipecat.utils.tracing.setup import is_tracing_available
|
||||
from pipecat.utils.tracing.turn_trace_observer import TurnTraceObserver
|
||||
|
||||
HEARTBEAT_SECONDS = 1.0
|
||||
HEARTBEAT_MONITOR_SECONDS = HEARTBEAT_SECONDS * 10
|
||||
HEARTBEAT_SECS = 1.0
|
||||
HEARTBEAT_MONITOR_SECS = HEARTBEAT_SECS * 10
|
||||
|
||||
IDLE_TIMEOUT_SECS = 300
|
||||
|
||||
CANCEL_TIMEOUT_SECS = 10.0
|
||||
|
||||
|
||||
class PipelineParams(BaseModel):
|
||||
@@ -93,7 +95,7 @@ class PipelineParams(BaseModel):
|
||||
enable_heartbeats: bool = False
|
||||
enable_metrics: bool = False
|
||||
enable_usage_metrics: bool = False
|
||||
heartbeats_period_secs: float = HEARTBEAT_SECONDS
|
||||
heartbeats_period_secs: float = HEARTBEAT_SECS
|
||||
interruption_strategies: List[BaseInterruptionStrategy] = Field(default_factory=list)
|
||||
observers: List[BaseObserver] = Field(default_factory=list)
|
||||
report_only_initial_ttfb: bool = False
|
||||
@@ -101,70 +103,6 @@ class PipelineParams(BaseModel):
|
||||
start_metadata: Dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
|
||||
class PipelineTaskSource(FrameProcessor):
|
||||
"""Source processor for pipeline tasks that handles frame routing.
|
||||
|
||||
This is the source processor that is linked at the beginning of the
|
||||
pipeline given to the pipeline task. It allows us to easily push frames
|
||||
downstream to the pipeline and also receive upstream frames coming from the
|
||||
pipeline.
|
||||
"""
|
||||
|
||||
def __init__(self, up_queue: asyncio.Queue, **kwargs):
|
||||
"""Initialize the pipeline task source.
|
||||
|
||||
Args:
|
||||
up_queue: Queue for upstream frame processing.
|
||||
**kwargs: Additional arguments passed to the parent class.
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
self._up_queue = up_queue
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process frames and route them based on direction.
|
||||
|
||||
Args:
|
||||
frame: The frame to process.
|
||||
direction: The direction of frame flow.
|
||||
"""
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
match direction:
|
||||
case FrameDirection.UPSTREAM:
|
||||
await self._up_queue.put(frame)
|
||||
case FrameDirection.DOWNSTREAM:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
class PipelineTaskSink(FrameProcessor):
|
||||
"""Sink processor for pipeline tasks that handles final frame processing.
|
||||
|
||||
This is the sink processor that is linked at the end of the pipeline
|
||||
given to the pipeline task. It allows us to receive downstream frames and
|
||||
act on them, for example, waiting to receive an EndFrame.
|
||||
"""
|
||||
|
||||
def __init__(self, down_queue: asyncio.Queue, **kwargs):
|
||||
"""Initialize the pipeline task sink.
|
||||
|
||||
Args:
|
||||
down_queue: Queue for downstream frame processing.
|
||||
**kwargs: Additional arguments passed to the parent class.
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
self._down_queue = down_queue
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process frames and route them to the downstream queue.
|
||||
|
||||
Args:
|
||||
frame: The frame to process.
|
||||
direction: The direction of frame flow.
|
||||
"""
|
||||
await super().process_frame(frame, direction)
|
||||
await self._down_queue.put(frame)
|
||||
|
||||
|
||||
class PipelineTask(BasePipelineTask):
|
||||
"""Manages the execution of a pipeline, handling frame processing and task lifecycle.
|
||||
|
||||
@@ -196,26 +134,28 @@ class PipelineTask(BasePipelineTask):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
pipeline: BasePipeline,
|
||||
pipeline: FrameProcessor,
|
||||
*,
|
||||
params: Optional[PipelineParams] = None,
|
||||
additional_span_attributes: Optional[dict] = None,
|
||||
cancel_on_idle_timeout: bool = True,
|
||||
cancel_timeout_secs: float = CANCEL_TIMEOUT_SECS,
|
||||
check_dangling_tasks: bool = True,
|
||||
clock: Optional[BaseClock] = None,
|
||||
conversation_id: Optional[str] = None,
|
||||
enable_tracing: bool = False,
|
||||
enable_turn_tracking: bool = True,
|
||||
enable_watchdog_logging: bool = False,
|
||||
enable_watchdog_timers: bool = False,
|
||||
idle_timeout_frames: Tuple[Type[Frame], ...] = (
|
||||
BotSpeakingFrame,
|
||||
InterimTranscriptionFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
TranscriptionFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
),
|
||||
idle_timeout_secs: Optional[float] = 300,
|
||||
idle_timeout_secs: Optional[float] = IDLE_TIMEOUT_SECS,
|
||||
observers: Optional[List[BaseObserver]] = None,
|
||||
task_manager: Optional[BaseTaskManager] = None,
|
||||
watchdog_timeout_secs: float = WATCHDOG_TIMEOUT,
|
||||
):
|
||||
"""Initialize the PipelineTask.
|
||||
|
||||
@@ -226,13 +166,13 @@ class PipelineTask(BasePipelineTask):
|
||||
OpenTelemetry conversation span attributes.
|
||||
cancel_on_idle_timeout: Whether the pipeline task should be cancelled if
|
||||
the idle timeout is reached.
|
||||
cancel_timeout_secs: Timeout (in seconds) to wait for cancellation to happen
|
||||
cleanly.
|
||||
check_dangling_tasks: Whether to check for processors' tasks finishing properly.
|
||||
clock: Clock implementation for timing operations.
|
||||
conversation_id: Optional custom ID for the conversation.
|
||||
enable_tracing: Whether to enable tracing.
|
||||
enable_turn_tracking: Whether to enable turn tracking.
|
||||
enable_watchdog_logging: Whether to print task processing times.
|
||||
enable_watchdog_timers: Whether to enable task watchdog timers.
|
||||
idle_timeout_frames: A tuple with the frames that should trigger an idle
|
||||
timeout if not received within `idle_timeout_seconds`.
|
||||
idle_timeout_secs: Timeout (in seconds) to consider pipeline idle or
|
||||
@@ -240,24 +180,19 @@ class PipelineTask(BasePipelineTask):
|
||||
automatically.
|
||||
observers: List of observers for monitoring pipeline execution.
|
||||
task_manager: Optional task manager for handling asyncio tasks.
|
||||
watchdog_timeout_secs: Watchdog timer timeout (in seconds). A warning
|
||||
will be logged if the watchdog timer is not reset before this timeout.
|
||||
"""
|
||||
super().__init__()
|
||||
self._pipeline = pipeline
|
||||
self._params = params or PipelineParams()
|
||||
self._additional_span_attributes = additional_span_attributes or {}
|
||||
self._cancel_on_idle_timeout = cancel_on_idle_timeout
|
||||
self._cancel_timeout_secs = cancel_timeout_secs
|
||||
self._check_dangling_tasks = check_dangling_tasks
|
||||
self._clock = clock or SystemClock()
|
||||
self._conversation_id = conversation_id
|
||||
self._enable_tracing = enable_tracing and is_tracing_available()
|
||||
self._enable_turn_tracking = enable_turn_tracking
|
||||
self._enable_watchdog_logging = enable_watchdog_logging
|
||||
self._enable_watchdog_timers = enable_watchdog_timers
|
||||
self._idle_timeout_frames = idle_timeout_frames
|
||||
self._idle_timeout_secs = idle_timeout_secs
|
||||
self._watchdog_timeout_secs = watchdog_timeout_secs
|
||||
if self._params.observers:
|
||||
import warnings
|
||||
|
||||
@@ -288,40 +223,30 @@ class PipelineTask(BasePipelineTask):
|
||||
# PipelineTask and its frame processors.
|
||||
self._task_manager = task_manager or TaskManager()
|
||||
|
||||
# This queue receives frames coming from the pipeline upstream.
|
||||
self._up_queue = WatchdogQueue(self._task_manager)
|
||||
self._process_up_task: Optional[asyncio.Task] = None
|
||||
# This queue receives frames coming from the pipeline downstream.
|
||||
self._down_queue = WatchdogQueue(self._task_manager)
|
||||
self._process_down_task: Optional[asyncio.Task] = None
|
||||
# This queue is the queue used to push frames to the pipeline.
|
||||
self._push_queue = WatchdogQueue(self._task_manager)
|
||||
self._push_queue = asyncio.Queue()
|
||||
self._process_push_task: Optional[asyncio.Task] = None
|
||||
# This is the heartbeat queue. When a heartbeat frame is received in the
|
||||
# down queue we add it to the heartbeat queue for processing.
|
||||
self._heartbeat_queue = WatchdogQueue(self._task_manager)
|
||||
self._heartbeat_queue = asyncio.Queue()
|
||||
self._heartbeat_push_task: Optional[asyncio.Task] = None
|
||||
self._heartbeat_monitor_task: Optional[asyncio.Task] = None
|
||||
# This is the idle queue. When frames are received downstream they are
|
||||
# put in the queue. If no frame is received the pipeline is considered
|
||||
# idle.
|
||||
self._idle_queue = WatchdogQueue(self._task_manager)
|
||||
self._idle_queue = asyncio.Queue()
|
||||
self._idle_monitor_task: Optional[asyncio.Task] = None
|
||||
# This event is used to indicate a finalize frame (e.g. EndFrame,
|
||||
# StopFrame) has been received in the down queue.
|
||||
self._pipeline_end_event = asyncio.Event()
|
||||
|
||||
# This is a source processor that we connect to the provided
|
||||
# pipeline. This source processor allows up to receive and react to
|
||||
# upstream frames.
|
||||
self._source = PipelineTaskSource(self._up_queue)
|
||||
self._source.link(pipeline)
|
||||
|
||||
# This is a sink processor that we connect to the provided
|
||||
# pipeline. This sink processor allows up to receive and react to
|
||||
# downstream frames.
|
||||
self._sink = PipelineTaskSink(self._down_queue)
|
||||
pipeline.link(self._sink)
|
||||
# This is the final pipeline. It is composed of a source processor,
|
||||
# followed by the user pipeline, and ending with a sink processor. The
|
||||
# source allows us to receive and react to upstream frames, and the sink
|
||||
# allows us to receive and react to downstream frames.
|
||||
source = PipelineSource(self._source_push_frame, name=f"{self}::Source")
|
||||
sink = PipelineSink(self._sink_push_frame, name=f"{self}::Sink")
|
||||
self._pipeline = Pipeline([pipeline], source=source, sink=sink)
|
||||
|
||||
# The task observer acts as a proxy to the provided observers. This way,
|
||||
# we only need to pass a single observer (using the StartFrame) which
|
||||
@@ -446,24 +371,44 @@ class PipelineTask(BasePipelineTask):
|
||||
|
||||
# Create all main tasks and wait of the main push task. This is the
|
||||
# task that pushes frames to the very beginning of our pipeline (our
|
||||
# controlled PipelineTaskSource processor).
|
||||
# controlled source processor).
|
||||
push_task = await self._create_tasks()
|
||||
await self._task_manager.wait_for_task(push_task)
|
||||
await push_task
|
||||
|
||||
# We have already cleaned up the pipeline inside the task.
|
||||
cleanup_pipeline = False
|
||||
except asyncio.CancelledError:
|
||||
# We are awaiting on the push task and it might be cancelled
|
||||
# (e.g. Ctrl-C). This means we will get a CancelledError here as
|
||||
# well, because you get a CancelledError in every place you are
|
||||
# awaiting a task.
|
||||
pass
|
||||
finally:
|
||||
await self._cancel_tasks()
|
||||
await self._cleanup(cleanup_pipeline)
|
||||
if self._check_dangling_tasks:
|
||||
self._print_dangling_tasks()
|
||||
|
||||
# Pipeline has finished nicely.
|
||||
self._finished = True
|
||||
except asyncio.CancelledError:
|
||||
# Raise exception back to the pipeline runner so it can cancel this
|
||||
# task properly.
|
||||
raise
|
||||
finally:
|
||||
# We can reach this point for different reasons:
|
||||
#
|
||||
# 1. The task has finished properly (e.g. `EndFrame`).
|
||||
# 2. By calling `PipelineTask.cancel()`.
|
||||
# 3. By asyncio task cancellation.
|
||||
#
|
||||
# Case (1) will execute the code below without issues because
|
||||
# `self._finished` is true.
|
||||
#
|
||||
# Case (2) will execute the code below without issues because
|
||||
# `self._cancelled` is true.
|
||||
#
|
||||
# Case (3) will raise the exception above (because we are cancelling
|
||||
# the asyncio task). This will be then captured by the
|
||||
# `PipelineRunner` which will call `PipelineTask.cancel()` and
|
||||
# therefore becoming case (2).
|
||||
if self._finished or self._cancelled:
|
||||
logger.debug(f"Pipeline task {self} is finishing cleanup...")
|
||||
await self._cancel_tasks()
|
||||
await self._cleanup(cleanup_pipeline)
|
||||
if self._check_dangling_tasks:
|
||||
self._print_dangling_tasks()
|
||||
self._finished = True
|
||||
logger.debug(f"Pipeline task {self} has finished")
|
||||
|
||||
async def queue_frame(self, frame: Frame):
|
||||
"""Queue a single frame to be pushed down the pipeline.
|
||||
@@ -491,12 +436,13 @@ class PipelineTask(BasePipelineTask):
|
||||
if not self._cancelled:
|
||||
logger.debug(f"Cancelling pipeline task {self}")
|
||||
self._cancelled = True
|
||||
cancel_frame = CancelFrame()
|
||||
# Make sure everything is cleaned up downstream. This is sent
|
||||
# out-of-band from the main streaming task which is what we want since
|
||||
# we want to cancel right away.
|
||||
await self._source.push_frame(CancelFrame())
|
||||
# Wait for CancelFrame to make it throught the pipeline.
|
||||
await self._wait_for_pipeline_end()
|
||||
await self._pipeline.queue_frame(cancel_frame)
|
||||
# Wait for CancelFrame to make it through the pipeline.
|
||||
await self._wait_for_pipeline_end(cancel_frame)
|
||||
# Only cancel the push task, we don't want to be able to process any
|
||||
# other frame after cancel. Everything else will be cancelled in
|
||||
# run().
|
||||
@@ -506,12 +452,6 @@ class PipelineTask(BasePipelineTask):
|
||||
|
||||
async def _create_tasks(self):
|
||||
"""Create and start all pipeline processing tasks."""
|
||||
self._process_up_task = self._task_manager.create_task(
|
||||
self._process_up_queue(), f"{self}::_process_up_queue"
|
||||
)
|
||||
self._process_down_task = self._task_manager.create_task(
|
||||
self._process_down_queue(), f"{self}::_process_down_queue"
|
||||
)
|
||||
self._process_push_task = self._task_manager.create_task(
|
||||
self._process_push_queue(), f"{self}::_process_push_queue"
|
||||
)
|
||||
@@ -545,14 +485,6 @@ class PipelineTask(BasePipelineTask):
|
||||
await self._task_manager.cancel_task(self._process_push_task)
|
||||
self._process_push_task = None
|
||||
|
||||
if self._process_up_task:
|
||||
await self._task_manager.cancel_task(self._process_up_task)
|
||||
self._process_up_task = None
|
||||
|
||||
if self._process_down_task:
|
||||
await self._task_manager.cancel_task(self._process_down_task)
|
||||
self._process_down_task = None
|
||||
|
||||
await self._maybe_cancel_heartbeat_tasks()
|
||||
await self._maybe_cancel_idle_task()
|
||||
|
||||
@@ -572,7 +504,6 @@ class PipelineTask(BasePipelineTask):
|
||||
async def _maybe_cancel_idle_task(self):
|
||||
"""Cancel idle monitoring task if it is running."""
|
||||
if self._idle_timeout_secs and self._idle_monitor_task:
|
||||
self._idle_queue.cancel()
|
||||
await self._task_manager.cancel_task(self._idle_monitor_task)
|
||||
self._idle_monitor_task = None
|
||||
|
||||
@@ -585,30 +516,43 @@ class PipelineTask(BasePipelineTask):
|
||||
data.append(ProcessingMetricsData(processor=p.name, value=0.0))
|
||||
return MetricsFrame(data=data)
|
||||
|
||||
async def _wait_for_pipeline_end(self):
|
||||
async def _wait_for_pipeline_end(self, frame: Frame):
|
||||
"""Wait for the pipeline to signal completion."""
|
||||
await self._pipeline_end_event.wait()
|
||||
|
||||
async def wait_for_cancel():
|
||||
try:
|
||||
await asyncio.wait_for(
|
||||
self._pipeline_end_event.wait(), timeout=self._cancel_timeout_secs
|
||||
)
|
||||
logger.debug(f"{self}: {frame} reached the end of the pipeline.")
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning(
|
||||
f"{self}: timeout waiting for {frame} to reach the end of the pipeline (being blocked somewhere?)."
|
||||
)
|
||||
finally:
|
||||
await self._call_event_handler("on_pipeline_cancelled", frame)
|
||||
|
||||
logger.debug(f"{self}: waiting for {frame} to reach the end of the pipeline...")
|
||||
|
||||
if isinstance(frame, CancelFrame):
|
||||
await wait_for_cancel()
|
||||
else:
|
||||
await self._pipeline_end_event.wait()
|
||||
logger.debug(f"{self}: {frame} reached the end of the pipeline.")
|
||||
|
||||
self._pipeline_end_event.clear()
|
||||
|
||||
async def _setup(self, params: PipelineTaskParams):
|
||||
"""Set up the pipeline task and all processors."""
|
||||
mgr_params = TaskManagerParams(
|
||||
loop=params.loop,
|
||||
enable_watchdog_logging=self._enable_watchdog_logging,
|
||||
enable_watchdog_timers=self._enable_watchdog_timers,
|
||||
watchdog_timeout=self._watchdog_timeout_secs,
|
||||
)
|
||||
mgr_params = TaskManagerParams(loop=params.loop)
|
||||
self._task_manager.setup(mgr_params)
|
||||
|
||||
setup = FrameProcessorSetup(
|
||||
clock=self._clock,
|
||||
task_manager=self._task_manager,
|
||||
observer=self._observer,
|
||||
watchdog_timers_enabled=self._enable_watchdog_timers,
|
||||
)
|
||||
await self._source.setup(setup)
|
||||
await self._pipeline.setup(setup)
|
||||
await self._sink.setup(setup)
|
||||
|
||||
async def _cleanup(self, cleanup_pipeline: bool):
|
||||
"""Clean up the pipeline task and processors."""
|
||||
@@ -620,10 +564,8 @@ class PipelineTask(BasePipelineTask):
|
||||
self._turn_trace_observer.end_conversation_tracing()
|
||||
|
||||
# Cleanup pipeline processors.
|
||||
await self._source.cleanup()
|
||||
if cleanup_pipeline:
|
||||
await self._pipeline.cleanup()
|
||||
await self._sink.cleanup()
|
||||
|
||||
async def _process_push_queue(self):
|
||||
"""Process frames from the push queue and send them through the pipeline.
|
||||
@@ -647,24 +589,24 @@ class PipelineTask(BasePipelineTask):
|
||||
interruption_strategies=self._params.interruption_strategies,
|
||||
)
|
||||
start_frame.metadata = self._params.start_metadata
|
||||
await self._source.queue_frame(start_frame, FrameDirection.DOWNSTREAM)
|
||||
await self._pipeline.queue_frame(start_frame)
|
||||
|
||||
if self._params.enable_metrics and self._params.send_initial_empty_metrics:
|
||||
await self._source.queue_frame(self._initial_metrics_frame(), FrameDirection.DOWNSTREAM)
|
||||
await self._pipeline.queue_frame(self._initial_metrics_frame())
|
||||
|
||||
running = True
|
||||
cleanup_pipeline = True
|
||||
while running:
|
||||
frame = await self._push_queue.get()
|
||||
await self._source.queue_frame(frame, FrameDirection.DOWNSTREAM)
|
||||
await self._pipeline.queue_frame(frame)
|
||||
if isinstance(frame, (CancelFrame, EndFrame, StopFrame)):
|
||||
await self._wait_for_pipeline_end()
|
||||
await self._wait_for_pipeline_end(frame)
|
||||
running = not isinstance(frame, (CancelFrame, EndFrame, StopFrame))
|
||||
cleanup_pipeline = not isinstance(frame, StopFrame)
|
||||
self._push_queue.task_done()
|
||||
await self._cleanup(cleanup_pipeline)
|
||||
|
||||
async def _process_up_queue(self):
|
||||
async def _source_push_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process frames coming upstream from the pipeline.
|
||||
|
||||
This is the task that processes frames coming upstream from the
|
||||
@@ -672,33 +614,29 @@ class PipelineTask(BasePipelineTask):
|
||||
pipeline to be stopped (e.g. EndTaskFrame) in which case we would send
|
||||
an EndFrame down the pipeline.
|
||||
"""
|
||||
while True:
|
||||
frame = await self._up_queue.get()
|
||||
if isinstance(frame, self._reached_upstream_types):
|
||||
await self._call_event_handler("on_frame_reached_upstream", frame)
|
||||
|
||||
if isinstance(frame, self._reached_upstream_types):
|
||||
await self._call_event_handler("on_frame_reached_upstream", frame)
|
||||
|
||||
if isinstance(frame, EndTaskFrame):
|
||||
# Tell the task we should end nicely.
|
||||
await self.queue_frame(EndFrame())
|
||||
elif isinstance(frame, CancelTaskFrame):
|
||||
# Tell the task we should end right away.
|
||||
if isinstance(frame, EndTaskFrame):
|
||||
# Tell the task we should end nicely.
|
||||
await self.queue_frame(EndFrame())
|
||||
elif isinstance(frame, CancelTaskFrame):
|
||||
# Tell the task we should end right away.
|
||||
await self.queue_frame(CancelFrame())
|
||||
elif isinstance(frame, StopTaskFrame):
|
||||
# Tell the task we should stop nicely.
|
||||
await self.queue_frame(StopFrame())
|
||||
elif isinstance(frame, ErrorFrame):
|
||||
if frame.fatal:
|
||||
logger.error(f"A fatal error occurred: {frame}")
|
||||
# Cancel all tasks downstream.
|
||||
await self.queue_frame(CancelFrame())
|
||||
elif isinstance(frame, StopTaskFrame):
|
||||
# Tell the task we should stop nicely.
|
||||
await self.queue_frame(StopFrame())
|
||||
elif isinstance(frame, ErrorFrame):
|
||||
if frame.fatal:
|
||||
logger.error(f"A fatal error occurred: {frame}")
|
||||
# Cancel all tasks downstream.
|
||||
await self.queue_frame(CancelFrame())
|
||||
# Tell the task we should stop.
|
||||
await self.queue_frame(StopTaskFrame())
|
||||
else:
|
||||
logger.warning(f"Something went wrong: {frame}")
|
||||
self._up_queue.task_done()
|
||||
# Tell the task we should stop.
|
||||
await self.queue_frame(StopTaskFrame())
|
||||
else:
|
||||
logger.warning(f"Something went wrong: {frame}")
|
||||
|
||||
async def _process_down_queue(self):
|
||||
async def _sink_push_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process frames coming downstream from the pipeline.
|
||||
|
||||
This tasks process frames coming downstream from the pipeline. For
|
||||
@@ -706,34 +644,29 @@ class PipelineTask(BasePipelineTask):
|
||||
processors have handled the EndFrame and therefore we can exit the task
|
||||
cleanly.
|
||||
"""
|
||||
while True:
|
||||
frame = await self._down_queue.get()
|
||||
# Queue received frame to the idle queue so we can monitor idle
|
||||
# pipelines.
|
||||
await self._idle_queue.put(frame)
|
||||
|
||||
# Queue received frame to the idle queue so we can monitor idle
|
||||
# pipelines.
|
||||
await self._idle_queue.put(frame)
|
||||
if isinstance(frame, self._reached_downstream_types):
|
||||
await self._call_event_handler("on_frame_reached_downstream", frame)
|
||||
|
||||
if isinstance(frame, self._reached_downstream_types):
|
||||
await self._call_event_handler("on_frame_reached_downstream", frame)
|
||||
if isinstance(frame, StartFrame):
|
||||
await self._call_event_handler("on_pipeline_started", frame)
|
||||
|
||||
if isinstance(frame, StartFrame):
|
||||
await self._call_event_handler("on_pipeline_started", frame)
|
||||
|
||||
# Start heartbeat tasks now that StartFrame has been processed
|
||||
# by all processors in the pipeline
|
||||
self._maybe_start_heartbeat_tasks()
|
||||
elif isinstance(frame, EndFrame):
|
||||
await self._call_event_handler("on_pipeline_ended", frame)
|
||||
self._pipeline_end_event.set()
|
||||
elif isinstance(frame, StopFrame):
|
||||
await self._call_event_handler("on_pipeline_stopped", frame)
|
||||
self._pipeline_end_event.set()
|
||||
elif isinstance(frame, CancelFrame):
|
||||
await self._call_event_handler("on_pipeline_cancelled", frame)
|
||||
self._pipeline_end_event.set()
|
||||
elif isinstance(frame, HeartbeatFrame):
|
||||
await self._heartbeat_queue.put(frame)
|
||||
self._down_queue.task_done()
|
||||
# Start heartbeat tasks now that StartFrame has been processed
|
||||
# by all processors in the pipeline
|
||||
self._maybe_start_heartbeat_tasks()
|
||||
elif isinstance(frame, EndFrame):
|
||||
await self._call_event_handler("on_pipeline_ended", frame)
|
||||
self._pipeline_end_event.set()
|
||||
elif isinstance(frame, StopFrame):
|
||||
await self._call_event_handler("on_pipeline_stopped", frame)
|
||||
self._pipeline_end_event.set()
|
||||
elif isinstance(frame, CancelFrame):
|
||||
self._pipeline_end_event.set()
|
||||
elif isinstance(frame, HeartbeatFrame):
|
||||
await self._heartbeat_queue.put(frame)
|
||||
|
||||
async def _heartbeat_push_handler(self):
|
||||
"""Push heartbeat frames at regular intervals."""
|
||||
@@ -741,7 +674,7 @@ class PipelineTask(BasePipelineTask):
|
||||
# Don't use `queue_frame()` because if an EndFrame is queued the
|
||||
# task will just stop waiting for the pipeline to finish not
|
||||
# allowing more frames to be pushed.
|
||||
await self._source.queue_frame(HeartbeatFrame(timestamp=self._clock.get_time()))
|
||||
await self._pipeline.queue_frame(HeartbeatFrame(timestamp=self._clock.get_time()))
|
||||
await asyncio.sleep(self._params.heartbeats_period_secs)
|
||||
|
||||
async def _heartbeat_monitor_handler(self):
|
||||
@@ -752,7 +685,7 @@ class PipelineTask(BasePipelineTask):
|
||||
the time that a heartbeat frame takes to processes, that is how long it
|
||||
takes for the heartbeat frame to traverse all the pipeline.
|
||||
"""
|
||||
wait_time = HEARTBEAT_MONITOR_SECONDS
|
||||
wait_time = HEARTBEAT_MONITOR_SECS
|
||||
while True:
|
||||
try:
|
||||
frame = await asyncio.wait_for(self._heartbeat_queue.get(), timeout=wait_time)
|
||||
@@ -816,6 +749,10 @@ class PipelineTask(BasePipelineTask):
|
||||
Returns:
|
||||
Whether the pipeline task should continue running.
|
||||
"""
|
||||
# If we are cancelling, just exit the task.
|
||||
if self._cancelled:
|
||||
return True
|
||||
|
||||
logger.warning("Idle timeout detected. Last 10 frames received:")
|
||||
for i, frame in enumerate(last_frames, 1):
|
||||
logger.warning(f"Frame {i}: {frame}")
|
||||
|
||||
@@ -13,13 +13,12 @@ the main pipeline execution.
|
||||
|
||||
import asyncio
|
||||
import inspect
|
||||
from typing import Dict, List, Optional
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from attr import dataclass
|
||||
|
||||
from pipecat.observers.base_observer import BaseObserver, FramePushed
|
||||
from pipecat.observers.base_observer import BaseObserver, FrameProcessed, FramePushed
|
||||
from pipecat.utils.asyncio.task_manager import BaseTaskManager
|
||||
from pipecat.utils.asyncio.watchdog_queue import WatchdogQueue
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -86,7 +85,7 @@ class TaskObserver(BaseObserver):
|
||||
|
||||
# If we already started, create a new proxy for the observer.
|
||||
# Otherwise, it will be created in start().
|
||||
if self._started():
|
||||
if self._proxies:
|
||||
proxy = self._create_proxy(observer)
|
||||
self._proxies[observer] = proxy
|
||||
|
||||
@@ -97,7 +96,7 @@ class TaskObserver(BaseObserver):
|
||||
observer: The observer to remove.
|
||||
"""
|
||||
# If the observer has a proxy, remove it.
|
||||
if observer in self._proxies:
|
||||
if self._proxies and observer in self._proxies:
|
||||
proxy = self._proxies[observer]
|
||||
# Remove the proxy so it doesn't get called anymore.
|
||||
del self._proxies[observer]
|
||||
@@ -120,22 +119,25 @@ class TaskObserver(BaseObserver):
|
||||
for proxy in self._proxies.values():
|
||||
await self._task_manager.cancel_task(proxy.task)
|
||||
|
||||
async def on_process_frame(self, data: FramePushed):
|
||||
"""Queue frame data for all managed observers.
|
||||
|
||||
Args:
|
||||
data: The frame push event data to distribute to observers.
|
||||
"""
|
||||
await self._send_to_proxy(data)
|
||||
|
||||
async def on_push_frame(self, data: FramePushed):
|
||||
"""Queue frame data for all managed observers.
|
||||
|
||||
Args:
|
||||
data: The frame push event data to distribute to observers.
|
||||
"""
|
||||
for proxy in self._proxies.values():
|
||||
await proxy.queue.put(data)
|
||||
|
||||
def _started(self) -> bool:
|
||||
"""Check if the task observer has been started."""
|
||||
return self._proxies is not None
|
||||
await self._send_to_proxy(data)
|
||||
|
||||
def _create_proxy(self, observer: BaseObserver) -> Proxy:
|
||||
"""Create a proxy for a single observer."""
|
||||
queue = WatchdogQueue(self._task_manager)
|
||||
queue = asyncio.Queue()
|
||||
task = self._task_manager.create_task(
|
||||
self._proxy_task_handler(queue, observer),
|
||||
f"TaskObserver::{observer}::_proxy_task_handler",
|
||||
@@ -151,6 +153,10 @@ class TaskObserver(BaseObserver):
|
||||
proxies[observer] = proxy
|
||||
return proxies
|
||||
|
||||
async def _send_to_proxy(self, data: Any):
|
||||
for proxy in self._proxies.values():
|
||||
await proxy.queue.put(data)
|
||||
|
||||
async def _proxy_task_handler(self, queue: asyncio.Queue, observer: BaseObserver):
|
||||
"""Handle frame processing for a single observer."""
|
||||
on_push_frame_deprecated = False
|
||||
@@ -169,11 +175,15 @@ class TaskObserver(BaseObserver):
|
||||
|
||||
while True:
|
||||
data = await queue.get()
|
||||
if on_push_frame_deprecated:
|
||||
await observer.on_push_frame(
|
||||
data.src, data.dst, data.frame, data.direction, data.timestamp
|
||||
)
|
||||
else:
|
||||
await observer.on_push_frame(data)
|
||||
|
||||
if isinstance(data, FramePushed):
|
||||
if on_push_frame_deprecated:
|
||||
await observer.on_push_frame(
|
||||
data.src, data.dst, data.frame, data.direction, data.timestamp
|
||||
)
|
||||
else:
|
||||
await observer.on_push_frame(data)
|
||||
elif isinstance(data, FrameProcessed):
|
||||
await observer.on_process_frame(data)
|
||||
|
||||
queue.task_done()
|
||||
|
||||
@@ -24,7 +24,7 @@ from pipecat.frames.frames import (
|
||||
StartFrame,
|
||||
TranscriptionFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor, FrameProcessorSetup
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
|
||||
|
||||
@@ -64,7 +64,11 @@ class DTMFAggregator(FrameProcessor):
|
||||
|
||||
self._digit_event = asyncio.Event()
|
||||
self._aggregation_task: Optional[asyncio.Task] = None
|
||||
self._interruption_task: Optional[asyncio.Task] = None
|
||||
|
||||
async def cleanup(self) -> None:
|
||||
"""Clean up resources."""
|
||||
await super().cleanup()
|
||||
await self._stop_aggregation_task()
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
|
||||
"""Process incoming frames and handle DTMF aggregation.
|
||||
@@ -82,7 +86,6 @@ class DTMFAggregator(FrameProcessor):
|
||||
if self._aggregation:
|
||||
await self._flush_aggregation()
|
||||
await self._stop_aggregation_task()
|
||||
await self._stop_interruption_task()
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, InputDTMFFrame):
|
||||
# Push the DTMF frame downstream first
|
||||
@@ -102,7 +105,7 @@ class DTMFAggregator(FrameProcessor):
|
||||
|
||||
# For first digit, schedule interruption in separate task
|
||||
if is_first_digit:
|
||||
self._interruption_task = self.create_task(self._send_interruption_task())
|
||||
await self.push_frame(BotInterruptionFrame(), FrameDirection.UPSTREAM)
|
||||
|
||||
# Check for immediate flush conditions
|
||||
if frame.button == self._termination_digit:
|
||||
@@ -111,16 +114,6 @@ class DTMFAggregator(FrameProcessor):
|
||||
# Signal digit received for timeout handling
|
||||
self._digit_event.set()
|
||||
|
||||
async def _send_interruption_task(self):
|
||||
"""Send interruption frame safely in a separate task."""
|
||||
await self.push_frame(BotInterruptionFrame(), FrameDirection.UPSTREAM)
|
||||
|
||||
async def _stop_interruption_task(self) -> None:
|
||||
"""Stops the interruption task."""
|
||||
if self._interruption_task:
|
||||
await self.cancel_task(self._interruption_task)
|
||||
self._interruption_task = None
|
||||
|
||||
def _create_aggregation_task(self) -> None:
|
||||
"""Creates the aggregation task if it hasn't been created yet."""
|
||||
if not self._aggregation_task:
|
||||
@@ -139,7 +132,6 @@ class DTMFAggregator(FrameProcessor):
|
||||
await asyncio.wait_for(self._digit_event.wait(), timeout=self._idle_timeout)
|
||||
self._digit_event.clear()
|
||||
except asyncio.TimeoutError:
|
||||
self.reset_watchdog()
|
||||
if self._aggregation:
|
||||
await self._flush_aggregation()
|
||||
|
||||
@@ -157,8 +149,3 @@ class DTMFAggregator(FrameProcessor):
|
||||
await self.push_frame(transcription_frame)
|
||||
|
||||
self._aggregation = ""
|
||||
|
||||
async def cleanup(self) -> None:
|
||||
"""Clean up resources."""
|
||||
await super().cleanup()
|
||||
await self._stop_aggregation_task()
|
||||
|
||||
277
src/pipecat/processors/aggregators/llm_context.py
Normal file
@@ -0,0 +1,277 @@
|
||||
#
|
||||
# Copyright (c) 2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Universal LLM context management for LLM services in Pipecat.
|
||||
|
||||
Context contents are represented in a universal format (based on OpenAI)
|
||||
that supports a union of known Pipecat LLM service functionality.
|
||||
|
||||
Whenever an LLM service needs to access context, it does a just-in-time
|
||||
translation from this universal context into whatever format it needs, using a
|
||||
service-specific adapter.
|
||||
"""
|
||||
|
||||
import base64
|
||||
import io
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, List, Optional, TypeAlias, Union
|
||||
|
||||
from loguru import logger
|
||||
from openai._types import NOT_GIVEN as OPEN_AI_NOT_GIVEN
|
||||
from openai._types import NotGiven as OpenAINotGiven
|
||||
from openai.types.chat import (
|
||||
ChatCompletionMessageParam,
|
||||
ChatCompletionToolChoiceOptionParam,
|
||||
)
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.frames.frames import AudioRawFrame
|
||||
|
||||
# "Re-export" types from OpenAI that we're using as universal context types.
|
||||
# NOTE: if universal message types need to someday diverge from OpenAI's, we
|
||||
# should consider managing our own definitions. But we should do so carefully,
|
||||
# as the OpenAI messages are somewhat of a standard and we want to continue
|
||||
# supporting them.
|
||||
LLMStandardMessage = ChatCompletionMessageParam
|
||||
LLMContextToolChoice = ChatCompletionToolChoiceOptionParam
|
||||
NOT_GIVEN = OPEN_AI_NOT_GIVEN
|
||||
NotGiven = OpenAINotGiven
|
||||
|
||||
|
||||
@dataclass
|
||||
class LLMSpecificMessage:
|
||||
"""A container for a context message that is specific to a particular LLM service.
|
||||
|
||||
Enables the use of service-specific message types while maintaining
|
||||
compatibility with the universal LLM context format.
|
||||
"""
|
||||
|
||||
llm: str
|
||||
message: Any
|
||||
|
||||
|
||||
LLMContextMessage: TypeAlias = Union[LLMStandardMessage, LLMSpecificMessage]
|
||||
|
||||
|
||||
class LLMContext:
|
||||
"""Manages conversation context for LLM interactions.
|
||||
|
||||
Handles message history, tool definitions, tool choices, and multimedia
|
||||
content for LLM conversations. Provides methods for message manipulation,
|
||||
and content formatting.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
messages: Optional[List[LLMContextMessage]] = None,
|
||||
tools: ToolsSchema | NotGiven = NOT_GIVEN,
|
||||
tool_choice: LLMContextToolChoice | NotGiven = NOT_GIVEN,
|
||||
):
|
||||
"""Initialize the LLM context.
|
||||
|
||||
Args:
|
||||
messages: Initial list of conversation messages.
|
||||
tools: Available tools for the LLM to use.
|
||||
tool_choice: Tool selection strategy for the LLM.
|
||||
"""
|
||||
self._messages: List[LLMContextMessage] = messages if messages else []
|
||||
self._tools: ToolsSchema | NotGiven = LLMContext._normalize_and_validate_tools(tools)
|
||||
self._tool_choice: LLMContextToolChoice | NotGiven = tool_choice
|
||||
|
||||
def get_messages(self, llm_specific_filter: Optional[str] = None) -> List[LLMContextMessage]:
|
||||
"""Get the current messages list.
|
||||
|
||||
Args:
|
||||
llm_specific_filter: Optional filter to return LLM-specific
|
||||
messages for the given LLM, in addition to the standard
|
||||
messages. If messages end up being filtered, an error will be
|
||||
logged.
|
||||
|
||||
Returns:
|
||||
List of conversation messages.
|
||||
"""
|
||||
if llm_specific_filter is None:
|
||||
return self._messages
|
||||
filtered_messages = [
|
||||
msg
|
||||
for msg in self._messages
|
||||
if not isinstance(msg, LLMSpecificMessage) or msg.llm == llm_specific_filter
|
||||
]
|
||||
if len(filtered_messages) < len(self._messages):
|
||||
logger.error(
|
||||
f"Attempted to use incompatible LLMSpecificMessages with LLM '{llm_specific_filter}'."
|
||||
)
|
||||
return filtered_messages
|
||||
|
||||
@property
|
||||
def tools(self) -> ToolsSchema | NotGiven:
|
||||
"""Get the tools list.
|
||||
|
||||
Returns:
|
||||
Tools list.
|
||||
"""
|
||||
return self._tools
|
||||
|
||||
@property
|
||||
def tool_choice(self) -> LLMContextToolChoice | NotGiven:
|
||||
"""Get the current tool choice setting.
|
||||
|
||||
Returns:
|
||||
The tool choice configuration.
|
||||
"""
|
||||
return self._tool_choice
|
||||
|
||||
def add_message(self, message: LLMContextMessage):
|
||||
"""Add a single message to the context.
|
||||
|
||||
Args:
|
||||
message: The message to add to the conversation history.
|
||||
"""
|
||||
self._messages.append(message)
|
||||
|
||||
def add_messages(self, messages: List[LLMContextMessage]):
|
||||
"""Add multiple messages to the context.
|
||||
|
||||
Args:
|
||||
messages: List of messages to add to the conversation history.
|
||||
"""
|
||||
self._messages.extend(messages)
|
||||
|
||||
def set_messages(self, messages: List[LLMContextMessage]):
|
||||
"""Replace all messages in the context.
|
||||
|
||||
Args:
|
||||
messages: New list of messages to replace the current history.
|
||||
"""
|
||||
self._messages[:] = messages
|
||||
|
||||
def set_tools(self, tools: ToolsSchema | NotGiven = NOT_GIVEN):
|
||||
"""Set the available tools for the LLM.
|
||||
|
||||
Args:
|
||||
tools: A ToolsSchema or NOT_GIVEN to disable tools.
|
||||
"""
|
||||
self._tools = LLMContext._normalize_and_validate_tools(tools)
|
||||
|
||||
def set_tool_choice(self, tool_choice: LLMContextToolChoice | NotGiven):
|
||||
"""Set the tool choice configuration.
|
||||
|
||||
Args:
|
||||
tool_choice: Tool selection strategy for the LLM.
|
||||
"""
|
||||
self._tool_choice = tool_choice
|
||||
|
||||
def add_image_frame_message(
|
||||
self, *, format: str, size: tuple[int, int], image: bytes, text: str = None
|
||||
):
|
||||
"""Add a message containing an image frame.
|
||||
|
||||
Args:
|
||||
format: Image format (e.g., 'RGB', 'RGBA').
|
||||
size: Image dimensions as (width, height) tuple.
|
||||
image: Raw image bytes.
|
||||
text: Optional text to include with the image.
|
||||
"""
|
||||
buffer = io.BytesIO()
|
||||
Image.frombytes(format, size, image).save(buffer, format="JPEG")
|
||||
encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
|
||||
|
||||
content = []
|
||||
if text:
|
||||
content.append({"type": "text", "text": text})
|
||||
content.append(
|
||||
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}},
|
||||
)
|
||||
self.add_message({"role": "user", "content": content})
|
||||
|
||||
def add_audio_frames_message(
|
||||
self, *, audio_frames: list[AudioRawFrame], text: str = "Audio follows"
|
||||
):
|
||||
"""Add a message containing audio frames.
|
||||
|
||||
Args:
|
||||
audio_frames: List of audio frame objects to include.
|
||||
text: Optional text to include with the audio.
|
||||
"""
|
||||
if not audio_frames:
|
||||
return
|
||||
|
||||
sample_rate = audio_frames[0].sample_rate
|
||||
num_channels = audio_frames[0].num_channels
|
||||
|
||||
content = []
|
||||
content.append({"type": "text", "text": text})
|
||||
data = b"".join(frame.audio for frame in audio_frames)
|
||||
data = bytes(
|
||||
self._create_wav_header(
|
||||
sample_rate,
|
||||
num_channels,
|
||||
16,
|
||||
len(data),
|
||||
)
|
||||
+ data
|
||||
)
|
||||
encoded_audio = base64.b64encode(data).decode("utf-8")
|
||||
content.append(
|
||||
{
|
||||
"type": "input_audio",
|
||||
"input_audio": {"data": encoded_audio, "format": "wav"},
|
||||
}
|
||||
)
|
||||
self.add_message({"role": "user", "content": content})
|
||||
|
||||
def _create_wav_header(self, sample_rate, num_channels, bits_per_sample, data_size):
|
||||
"""Create a WAV file header for audio data.
|
||||
|
||||
Args:
|
||||
sample_rate: Audio sample rate in Hz.
|
||||
num_channels: Number of audio channels.
|
||||
bits_per_sample: Bits per audio sample.
|
||||
data_size: Size of audio data in bytes.
|
||||
|
||||
Returns:
|
||||
WAV header as a bytearray.
|
||||
"""
|
||||
# RIFF chunk descriptor
|
||||
header = bytearray()
|
||||
header.extend(b"RIFF") # ChunkID
|
||||
header.extend((data_size + 36).to_bytes(4, "little")) # ChunkSize: total size - 8
|
||||
header.extend(b"WAVE") # Format
|
||||
# "fmt " sub-chunk
|
||||
header.extend(b"fmt ") # Subchunk1ID
|
||||
header.extend((16).to_bytes(4, "little")) # Subchunk1Size (16 for PCM)
|
||||
header.extend((1).to_bytes(2, "little")) # AudioFormat (1 for PCM)
|
||||
header.extend(num_channels.to_bytes(2, "little")) # NumChannels
|
||||
header.extend(sample_rate.to_bytes(4, "little")) # SampleRate
|
||||
# Calculate byte rate and block align
|
||||
byte_rate = sample_rate * num_channels * (bits_per_sample // 8)
|
||||
block_align = num_channels * (bits_per_sample // 8)
|
||||
header.extend(byte_rate.to_bytes(4, "little")) # ByteRate
|
||||
header.extend(block_align.to_bytes(2, "little")) # BlockAlign
|
||||
header.extend(bits_per_sample.to_bytes(2, "little")) # BitsPerSample
|
||||
# "data" sub-chunk
|
||||
header.extend(b"data") # Subchunk2ID
|
||||
header.extend(data_size.to_bytes(4, "little")) # Subchunk2Size
|
||||
return header
|
||||
|
||||
@staticmethod
|
||||
def _normalize_and_validate_tools(tools: ToolsSchema | NotGiven) -> ToolsSchema | NotGiven:
|
||||
"""Normalize and validate the given tools.
|
||||
|
||||
Raises:
|
||||
TypeError: If tools are not a ToolsSchema or NotGiven.
|
||||
"""
|
||||
if isinstance(tools, ToolsSchema):
|
||||
if not tools.standard_tools and not tools.custom_tools:
|
||||
return NOT_GIVEN
|
||||
return tools
|
||||
elif tools is NOT_GIVEN:
|
||||
return NOT_GIVEN
|
||||
else:
|
||||
raise TypeError(
|
||||
f"In LLMContext, tools must be a ToolsSchema object or NOT_GIVEN. Got type: {type(tools)}",
|
||||
)
|
||||
@@ -670,7 +670,7 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
|
||||
if self._vad_params
|
||||
else self._params.turn_emulated_vad_timeout
|
||||
)
|
||||
await asyncio.wait_for(self._aggregation_event.wait(), timeout)
|
||||
await asyncio.wait_for(self._aggregation_event.wait(), timeout=timeout)
|
||||
await self._maybe_emulate_user_speaking()
|
||||
except asyncio.TimeoutError:
|
||||
if not self._user_speaking:
|
||||
@@ -684,7 +684,6 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
|
||||
)
|
||||
self._emulating_vad = False
|
||||
finally:
|
||||
self.reset_watchdog()
|
||||
self._aggregation_event.clear()
|
||||
|
||||
async def _maybe_emulate_user_speaking(self):
|
||||
@@ -986,10 +985,6 @@ class LLMAssistantContextAggregator(LLMContextResponseAggregator):
|
||||
|
||||
def _context_updated_task_finished(self, task: asyncio.Task):
|
||||
self._context_updated_tasks.discard(task)
|
||||
# The task is finished so this should exit immediately. We need to do
|
||||
# this because otherwise the task manager would report a dangling task
|
||||
# if we don't remove it.
|
||||
asyncio.run_coroutine_threadsafe(self.wait_for_task(task), self.get_event_loop())
|
||||
|
||||
|
||||
class LLMUserResponseAggregator(LLMUserContextAggregator):
|
||||
|
||||
829
src/pipecat/processors/aggregators/llm_response_universal.py
Normal file
@@ -0,0 +1,829 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""LLM response aggregators for handling conversation context and message aggregation.
|
||||
|
||||
This module provides aggregators that process and accumulate LLM responses, user inputs,
|
||||
and conversation context. These aggregators handle the flow between speech-to-text,
|
||||
LLM processing, and text-to-speech components in conversational AI pipelines.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Literal, Optional, Set
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.interruptions.base_interruption_strategy import BaseInterruptionStrategy
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import (
|
||||
BotInterruptionFrame,
|
||||
BotStartedSpeakingFrame,
|
||||
BotStoppedSpeakingFrame,
|
||||
CancelFrame,
|
||||
EmulateUserStartedSpeakingFrame,
|
||||
EmulateUserStoppedSpeakingFrame,
|
||||
EndFrame,
|
||||
Frame,
|
||||
FunctionCallCancelFrame,
|
||||
FunctionCallInProgressFrame,
|
||||
FunctionCallResultFrame,
|
||||
FunctionCallsStartedFrame,
|
||||
InputAudioRawFrame,
|
||||
InterimTranscriptionFrame,
|
||||
LLMContextAssistantTimestampFrame,
|
||||
LLMContextFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMMessagesAppendFrame,
|
||||
LLMMessagesUpdateFrame,
|
||||
LLMSetToolChoiceFrame,
|
||||
LLMSetToolsFrame,
|
||||
SpeechControlParamsFrame,
|
||||
StartFrame,
|
||||
StartInterruptionFrame,
|
||||
TextFrame,
|
||||
TranscriptionFrame,
|
||||
UserImageRawFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
)
|
||||
from pipecat.processors.aggregators.llm_context import (
|
||||
LLMContext,
|
||||
LLMContextMessage,
|
||||
LLMSpecificMessage,
|
||||
NotGiven,
|
||||
)
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantAggregatorParams,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
|
||||
|
||||
class LLMContextAggregator(FrameProcessor):
|
||||
"""Base LLM aggregator that uses an LLMContext for conversation storage.
|
||||
|
||||
This aggregator maintains conversation state using an LLMContext and
|
||||
pushes LLMContextFrame objects as aggregation frames. It provides
|
||||
common functionality for context-based conversation management.
|
||||
"""
|
||||
|
||||
def __init__(self, *, context: LLMContext, role: str, **kwargs):
|
||||
"""Initialize the context response aggregator.
|
||||
|
||||
Args:
|
||||
context: The LLM context to use for conversation storage.
|
||||
role: The role this aggregator represents (e.g. "user", "assistant").
|
||||
**kwargs: Additional arguments passed to parent class.
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
self._context = context
|
||||
self._role = role
|
||||
|
||||
self._aggregation: str = ""
|
||||
|
||||
@property
|
||||
def messages(self) -> List[LLMContextMessage]:
|
||||
"""Get messages from the LLM context.
|
||||
|
||||
Returns:
|
||||
List of message dictionaries from the context.
|
||||
"""
|
||||
return self._context.get_messages()
|
||||
|
||||
@property
|
||||
def role(self) -> str:
|
||||
"""Get the role for this aggregator.
|
||||
|
||||
Returns:
|
||||
The role string for this aggregator.
|
||||
"""
|
||||
return self._role
|
||||
|
||||
@property
|
||||
def context(self):
|
||||
"""Get the LLM context.
|
||||
|
||||
Returns:
|
||||
The LLMContext instance used by this aggregator.
|
||||
"""
|
||||
return self._context
|
||||
|
||||
def get_context_frame(self) -> LLMContextFrame:
|
||||
"""Create a context frame with the current context.
|
||||
|
||||
Returns:
|
||||
LLMContextFrame containing the current context.
|
||||
"""
|
||||
return LLMContextFrame(context=self._context)
|
||||
|
||||
async def push_context_frame(self, direction: FrameDirection = FrameDirection.DOWNSTREAM):
|
||||
"""Push a context frame in the specified direction.
|
||||
|
||||
Args:
|
||||
direction: The direction to push the frame (upstream or downstream).
|
||||
"""
|
||||
frame = self.get_context_frame()
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
def add_messages(self, messages):
|
||||
"""Add messages to the context.
|
||||
|
||||
Args:
|
||||
messages: Messages to add to the conversation context.
|
||||
"""
|
||||
self._context.add_messages(messages)
|
||||
|
||||
def set_messages(self, messages):
|
||||
"""Set the context messages.
|
||||
|
||||
Args:
|
||||
messages: Messages to replace the current context messages.
|
||||
"""
|
||||
self._context.set_messages(messages)
|
||||
|
||||
def set_tools(self, tools: ToolsSchema | NotGiven):
|
||||
"""Set tools in the context.
|
||||
|
||||
Args:
|
||||
tools: List of tool definitions to set in the context.
|
||||
"""
|
||||
self._context.set_tools(tools)
|
||||
|
||||
def set_tool_choice(self, tool_choice: Literal["none", "auto", "required"] | dict):
|
||||
"""Set tool choice in the context.
|
||||
|
||||
Args:
|
||||
tool_choice: Tool choice configuration for the context.
|
||||
"""
|
||||
self._context.set_tool_choice(tool_choice)
|
||||
|
||||
async def reset(self):
|
||||
"""Reset the aggregation state."""
|
||||
self._aggregation = ""
|
||||
|
||||
|
||||
class LLMUserAggregator(LLMContextAggregator):
|
||||
"""User LLM aggregator that processes speech-to-text transcriptions.
|
||||
|
||||
This aggregator handles the complex logic of aggregating user speech transcriptions
|
||||
from STT services. It manages multiple scenarios including:
|
||||
|
||||
- Transcriptions received between VAD events
|
||||
- Transcriptions received outside VAD events
|
||||
- Interim vs final transcriptions
|
||||
- User interruptions during bot speech
|
||||
- Emulated VAD for whispered or short utterances
|
||||
|
||||
The aggregator uses timeouts to handle cases where transcriptions arrive
|
||||
after VAD events or when no VAD is available.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
context: LLMContext,
|
||||
*,
|
||||
params: Optional[LLMUserAggregatorParams] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the user context aggregator.
|
||||
|
||||
Args:
|
||||
context: The LLM context for conversation storage.
|
||||
params: Configuration parameters for aggregation behavior.
|
||||
**kwargs: Additional arguments. Supports deprecated 'aggregation_timeout'.
|
||||
"""
|
||||
super().__init__(context=context, role="user", **kwargs)
|
||||
self._params = params or LLMUserAggregatorParams()
|
||||
self._vad_params: Optional[VADParams] = None
|
||||
self._turn_params: Optional[SmartTurnParams] = None
|
||||
|
||||
if "aggregation_timeout" in kwargs:
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"Parameter 'aggregation_timeout' is deprecated, use 'params' instead.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
|
||||
self._params.aggregation_timeout = kwargs["aggregation_timeout"]
|
||||
|
||||
self._user_speaking = False
|
||||
self._bot_speaking = False
|
||||
self._was_bot_speaking = False
|
||||
self._emulating_vad = False
|
||||
self._seen_interim_results = False
|
||||
self._waiting_for_aggregation = False
|
||||
|
||||
self._aggregation_event = asyncio.Event()
|
||||
self._aggregation_task = None
|
||||
|
||||
async def reset(self):
|
||||
"""Reset the aggregation state and interruption strategies."""
|
||||
await super().reset()
|
||||
self._was_bot_speaking = False
|
||||
self._seen_interim_results = False
|
||||
self._waiting_for_aggregation = False
|
||||
[await s.reset() for s in self._interruption_strategies]
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process frames for user speech aggregation and context management.
|
||||
|
||||
Args:
|
||||
frame: The frame to process.
|
||||
direction: The direction of frame flow in the pipeline.
|
||||
"""
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, StartFrame):
|
||||
# Push StartFrame before start(), because we want StartFrame to be
|
||||
# processed by every processor before any other frame is processed.
|
||||
await self.push_frame(frame, direction)
|
||||
await self._start(frame)
|
||||
elif isinstance(frame, EndFrame):
|
||||
# Push EndFrame before stop(), because stop() waits on the task to
|
||||
# finish and the task finishes when EndFrame is processed.
|
||||
await self.push_frame(frame, direction)
|
||||
await self._stop(frame)
|
||||
elif isinstance(frame, CancelFrame):
|
||||
await self._cancel(frame)
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, InputAudioRawFrame):
|
||||
await self._handle_input_audio(frame)
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, UserStartedSpeakingFrame):
|
||||
await self._handle_user_started_speaking(frame)
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, UserStoppedSpeakingFrame):
|
||||
await self._handle_user_stopped_speaking(frame)
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, BotStartedSpeakingFrame):
|
||||
await self._handle_bot_started_speaking(frame)
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, BotStoppedSpeakingFrame):
|
||||
await self._handle_bot_stopped_speaking(frame)
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, TranscriptionFrame):
|
||||
await self._handle_transcription(frame)
|
||||
elif isinstance(frame, InterimTranscriptionFrame):
|
||||
await self._handle_interim_transcription(frame)
|
||||
elif isinstance(frame, LLMMessagesAppendFrame):
|
||||
await self._handle_llm_messages_append(frame)
|
||||
elif isinstance(frame, LLMMessagesUpdateFrame):
|
||||
await self._handle_llm_messages_update(frame)
|
||||
elif isinstance(frame, LLMSetToolsFrame):
|
||||
self.set_tools(frame.tools)
|
||||
elif isinstance(frame, LLMSetToolChoiceFrame):
|
||||
self.set_tool_choice(frame.tool_choice)
|
||||
elif isinstance(frame, SpeechControlParamsFrame):
|
||||
self._vad_params = frame.vad_params
|
||||
self._turn_params = frame.turn_params
|
||||
await self.push_frame(frame, direction)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
async def _process_aggregation(self):
|
||||
"""Process the current aggregation and push it downstream."""
|
||||
aggregation = self._aggregation
|
||||
await self.reset()
|
||||
self._context.add_message({"role": self.role, "content": aggregation})
|
||||
frame = LLMContextFrame(self._context)
|
||||
await self.push_frame(frame)
|
||||
|
||||
async def _push_aggregation(self):
|
||||
"""Push the current aggregation based on interruption strategies and conditions."""
|
||||
if len(self._aggregation) > 0:
|
||||
if self.interruption_strategies and self._bot_speaking:
|
||||
should_interrupt = await self._should_interrupt_based_on_strategies()
|
||||
|
||||
if should_interrupt:
|
||||
logger.debug(
|
||||
"Interruption conditions met - pushing BotInterruptionFrame and aggregation"
|
||||
)
|
||||
await self.push_frame(BotInterruptionFrame(), FrameDirection.UPSTREAM)
|
||||
await self._process_aggregation()
|
||||
else:
|
||||
logger.debug("Interruption conditions not met - not pushing aggregation")
|
||||
# Don't process aggregation, just reset it
|
||||
await self.reset()
|
||||
else:
|
||||
# No interruption config - normal behavior (always push aggregation)
|
||||
await self._process_aggregation()
|
||||
# Handles the case where both the user and the bot are not speaking,
|
||||
# and the bot was previously speaking before the user interruption.
|
||||
# Normally, when the user stops speaking, new text is expected,
|
||||
# which triggers the bot to respond. However, if no new text
|
||||
# is received, this safeguard ensures
|
||||
# the bot doesn't hang indefinitely while waiting to speak again.
|
||||
elif not self._seen_interim_results and self._was_bot_speaking and not self._bot_speaking:
|
||||
logger.warning("User stopped speaking but no new aggregation received.")
|
||||
# Resetting it so we don't trigger this twice
|
||||
self._was_bot_speaking = False
|
||||
# TODO: we are not enabling this for now, due to some STT services which can take as long as 2 seconds two return a transcription
|
||||
# So we need more tests and probably make this feature configurable, disabled it by default.
|
||||
# We are just pushing the same previous context to be processed again in this case
|
||||
# await self.push_frame(LLMContextFrame(self._context))
|
||||
|
||||
async def _should_interrupt_based_on_strategies(self) -> bool:
|
||||
"""Check if interruption should occur based on configured strategies.
|
||||
|
||||
Returns:
|
||||
True if any interruption strategy indicates interruption should occur.
|
||||
"""
|
||||
|
||||
async def should_interrupt(strategy: BaseInterruptionStrategy):
|
||||
await strategy.append_text(self._aggregation)
|
||||
return await strategy.should_interrupt()
|
||||
|
||||
return any([await should_interrupt(s) for s in self._interruption_strategies])
|
||||
|
||||
async def _start(self, frame: StartFrame):
|
||||
self._create_aggregation_task()
|
||||
|
||||
async def _stop(self, frame: EndFrame):
|
||||
await self._cancel_aggregation_task()
|
||||
|
||||
async def _cancel(self, frame: CancelFrame):
|
||||
await self._cancel_aggregation_task()
|
||||
|
||||
async def _handle_llm_messages_append(self, frame: LLMMessagesAppendFrame):
|
||||
self.add_messages(frame.messages)
|
||||
if frame.run_llm:
|
||||
await self.push_context_frame()
|
||||
|
||||
async def _handle_llm_messages_update(self, frame: LLMMessagesUpdateFrame):
|
||||
self.set_messages(frame.messages)
|
||||
if frame.run_llm:
|
||||
await self.push_context_frame()
|
||||
|
||||
async def _handle_input_audio(self, frame: InputAudioRawFrame):
|
||||
for s in self.interruption_strategies:
|
||||
await s.append_audio(frame.audio, frame.sample_rate)
|
||||
|
||||
async def _handle_user_started_speaking(self, frame: UserStartedSpeakingFrame):
|
||||
self._user_speaking = True
|
||||
self._waiting_for_aggregation = True
|
||||
self._was_bot_speaking = self._bot_speaking
|
||||
|
||||
# If we get a non-emulated UserStartedSpeakingFrame but we are in the
|
||||
# middle of emulating VAD, let's stop emulating VAD (i.e. don't send the
|
||||
# EmulateUserStoppedSpeakingFrame).
|
||||
if not frame.emulated and self._emulating_vad:
|
||||
self._emulating_vad = False
|
||||
|
||||
async def _handle_user_stopped_speaking(self, _: UserStoppedSpeakingFrame):
|
||||
self._user_speaking = False
|
||||
# We just stopped speaking. Let's see if there's some aggregation to
|
||||
# push. If the last thing we saw is an interim transcription, let's wait
|
||||
# pushing the aggregation as we will probably get a final transcription.
|
||||
if len(self._aggregation) > 0:
|
||||
if not self._seen_interim_results:
|
||||
await self._push_aggregation()
|
||||
# Handles the case where both the user and the bot are not speaking,
|
||||
# and the bot was previously speaking before the user interruption.
|
||||
# So in this case we are resetting the aggregation timer
|
||||
elif not self._seen_interim_results and self._was_bot_speaking and not self._bot_speaking:
|
||||
# Reset aggregation timer.
|
||||
self._aggregation_event.set()
|
||||
|
||||
async def _handle_bot_started_speaking(self, _: BotStartedSpeakingFrame):
|
||||
self._bot_speaking = True
|
||||
|
||||
async def _handle_bot_stopped_speaking(self, _: BotStoppedSpeakingFrame):
|
||||
self._bot_speaking = False
|
||||
|
||||
async def _handle_transcription(self, frame: TranscriptionFrame):
|
||||
text = frame.text
|
||||
|
||||
# Make sure we really have some text.
|
||||
if not text.strip():
|
||||
return
|
||||
|
||||
self._aggregation += f" {text}" if self._aggregation else text
|
||||
# We just got a final result, so let's reset interim results.
|
||||
self._seen_interim_results = False
|
||||
# Reset aggregation timer.
|
||||
self._aggregation_event.set()
|
||||
|
||||
async def _handle_interim_transcription(self, _: InterimTranscriptionFrame):
|
||||
self._seen_interim_results = True
|
||||
|
||||
def _create_aggregation_task(self):
|
||||
if not self._aggregation_task:
|
||||
self._aggregation_task = self.create_task(self._aggregation_task_handler())
|
||||
|
||||
async def _cancel_aggregation_task(self):
|
||||
if self._aggregation_task:
|
||||
await self.cancel_task(self._aggregation_task)
|
||||
self._aggregation_task = None
|
||||
|
||||
async def _aggregation_task_handler(self):
|
||||
while True:
|
||||
try:
|
||||
# The _aggregation_task_handler handles two distinct timeout scenarios:
|
||||
#
|
||||
# 1. When emulating_vad=True: Wait for emulated VAD timeout before
|
||||
# pushing aggregation (simulating VAD behavior when no actual VAD
|
||||
# detection occurred).
|
||||
#
|
||||
# 2. When emulating_vad=False: Use aggregation_timeout as a buffer
|
||||
# to wait for potential late-arriving transcription frames after
|
||||
# a real VAD event.
|
||||
#
|
||||
# For emulated VAD scenarios, the timeout strategy depends on whether
|
||||
# a turn analyzer is configured:
|
||||
#
|
||||
# - WITH turn analyzer: Use turn_emulated_vad_timeout parameter because
|
||||
# the VAD's stop_secs is set very low (e.g. 0.2s) for rapid speech
|
||||
# chunking to feed the turn analyzer. This low value is too fast
|
||||
# for emulated VAD scenarios where we need to allow users time to
|
||||
# finish speaking (e.g. 0.8s).
|
||||
#
|
||||
# - WITHOUT turn analyzer: Use VAD's stop_secs directly to maintain
|
||||
# consistent user experience between real VAD detection and
|
||||
# emulated VAD scenarios.
|
||||
if not self._emulating_vad:
|
||||
timeout = self._params.aggregation_timeout
|
||||
elif self._turn_params:
|
||||
timeout = self._params.turn_emulated_vad_timeout
|
||||
else:
|
||||
# Use VAD stop_secs when no turn analyzer is present, fallback if no VAD params
|
||||
timeout = (
|
||||
self._vad_params.stop_secs
|
||||
if self._vad_params
|
||||
else self._params.turn_emulated_vad_timeout
|
||||
)
|
||||
await asyncio.wait_for(self._aggregation_event.wait(), timeout=timeout)
|
||||
await self._maybe_emulate_user_speaking()
|
||||
except asyncio.TimeoutError:
|
||||
if not self._user_speaking:
|
||||
await self._push_aggregation()
|
||||
|
||||
# If we are emulating VAD we still need to send the user stopped
|
||||
# speaking frame.
|
||||
if self._emulating_vad:
|
||||
await self.push_frame(
|
||||
EmulateUserStoppedSpeakingFrame(), FrameDirection.UPSTREAM
|
||||
)
|
||||
self._emulating_vad = False
|
||||
finally:
|
||||
self._aggregation_event.clear()
|
||||
|
||||
async def _maybe_emulate_user_speaking(self):
|
||||
"""Maybe emulate user speaking based on transcription.
|
||||
|
||||
Emulate user speaking if we got a transcription but it was not
|
||||
detected by VAD. Behavior when bot is speaking depends on the
|
||||
enable_emulated_vad_interruptions parameter.
|
||||
"""
|
||||
# Check if we received a transcription but VAD was not able to detect
|
||||
# voice (e.g. when you whisper a short utterance). In that case, we need
|
||||
# to emulate VAD (i.e. user start/stopped speaking), but we do it only
|
||||
# if the bot is not speaking. If the bot is speaking and we really have
|
||||
# a short utterance we don't really want to interrupt the bot.
|
||||
if (
|
||||
not self._user_speaking
|
||||
and not self._waiting_for_aggregation
|
||||
and len(self._aggregation) > 0
|
||||
):
|
||||
if self._bot_speaking and not self._params.enable_emulated_vad_interruptions:
|
||||
# If emulated VAD interruptions are disabled and bot is speaking, ignore
|
||||
logger.debug("Ignoring user speaking emulation, bot is speaking.")
|
||||
await self.reset()
|
||||
else:
|
||||
# Either bot is not speaking, or emulated VAD interruptions are enabled
|
||||
# - trigger user speaking emulation.
|
||||
await self.push_frame(EmulateUserStartedSpeakingFrame(), FrameDirection.UPSTREAM)
|
||||
self._emulating_vad = True
|
||||
|
||||
|
||||
class LLMAssistantAggregator(LLMContextAggregator):
|
||||
"""Assistant LLM aggregator that processes bot responses and function calls.
|
||||
|
||||
This aggregator handles the complex logic of processing assistant responses including:
|
||||
|
||||
- Text frame aggregation between response start/end markers
|
||||
- Function call lifecycle management
|
||||
- Context updates with timestamps
|
||||
- Tool execution and result handling
|
||||
- Interruption handling during responses
|
||||
|
||||
The aggregator manages function calls in progress and coordinates between
|
||||
text generation and tool execution phases of LLM responses.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
context: LLMContext,
|
||||
*,
|
||||
params: Optional[LLMAssistantAggregatorParams] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the assistant context aggregator.
|
||||
|
||||
Args:
|
||||
context: The OpenAI LLM context for conversation storage.
|
||||
params: Configuration parameters for aggregation behavior.
|
||||
**kwargs: Additional arguments. Supports deprecated 'expect_stripped_words'.
|
||||
"""
|
||||
super().__init__(context=context, role="assistant", **kwargs)
|
||||
self._params = params or LLMAssistantAggregatorParams()
|
||||
|
||||
if "expect_stripped_words" in kwargs:
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"Parameter 'expect_stripped_words' is deprecated, use 'params' instead.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
|
||||
self._params.expect_stripped_words = kwargs["expect_stripped_words"]
|
||||
|
||||
self._started = 0
|
||||
self._function_calls_in_progress: Dict[str, Optional[FunctionCallInProgressFrame]] = {}
|
||||
self._context_updated_tasks: Set[asyncio.Task] = set()
|
||||
|
||||
@property
|
||||
def has_function_calls_in_progress(self) -> bool:
|
||||
"""Check if there are any function calls currently in progress.
|
||||
|
||||
Returns:
|
||||
True if function calls are in progress, False otherwise.
|
||||
"""
|
||||
return bool(self._function_calls_in_progress)
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process frames for assistant response aggregation and function call management.
|
||||
|
||||
Args:
|
||||
frame: The frame to process.
|
||||
direction: The direction of frame flow in the pipeline.
|
||||
"""
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, StartInterruptionFrame):
|
||||
await self._handle_interruptions(frame)
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, LLMFullResponseStartFrame):
|
||||
await self._handle_llm_start(frame)
|
||||
elif isinstance(frame, LLMFullResponseEndFrame):
|
||||
await self._handle_llm_end(frame)
|
||||
elif isinstance(frame, TextFrame):
|
||||
await self._handle_text(frame)
|
||||
elif isinstance(frame, LLMMessagesAppendFrame):
|
||||
await self._handle_llm_messages_append(frame)
|
||||
elif isinstance(frame, LLMMessagesUpdateFrame):
|
||||
await self._handle_llm_messages_update(frame)
|
||||
elif isinstance(frame, LLMSetToolsFrame):
|
||||
self.set_tools(frame.tools)
|
||||
elif isinstance(frame, LLMSetToolChoiceFrame):
|
||||
self.set_tool_choice(frame.tool_choice)
|
||||
elif isinstance(frame, FunctionCallsStartedFrame):
|
||||
await self._handle_function_calls_started(frame)
|
||||
elif isinstance(frame, FunctionCallInProgressFrame):
|
||||
await self._handle_function_call_in_progress(frame)
|
||||
elif isinstance(frame, FunctionCallResultFrame):
|
||||
await self._handle_function_call_result(frame)
|
||||
elif isinstance(frame, FunctionCallCancelFrame):
|
||||
await self._handle_function_call_cancel(frame)
|
||||
elif isinstance(frame, UserImageRawFrame) and frame.request and frame.request.tool_call_id:
|
||||
await self._handle_user_image_frame(frame)
|
||||
elif isinstance(frame, BotStoppedSpeakingFrame):
|
||||
await self._push_aggregation()
|
||||
await self.push_frame(frame, direction)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
async def _push_aggregation(self):
|
||||
"""Push the current assistant aggregation with timestamp."""
|
||||
if not self._aggregation:
|
||||
return
|
||||
|
||||
aggregation = self._aggregation.strip()
|
||||
await self.reset()
|
||||
|
||||
if aggregation:
|
||||
self._context.add_message({"role": "assistant", "content": aggregation})
|
||||
|
||||
# Push context frame
|
||||
await self.push_context_frame()
|
||||
|
||||
# Push timestamp frame with current time
|
||||
timestamp_frame = LLMContextAssistantTimestampFrame(timestamp=time_now_iso8601())
|
||||
await self.push_frame(timestamp_frame)
|
||||
|
||||
async def _handle_llm_messages_append(self, frame: LLMMessagesAppendFrame):
|
||||
self.add_messages(frame.messages)
|
||||
if frame.run_llm:
|
||||
await self.push_context_frame(FrameDirection.UPSTREAM)
|
||||
|
||||
async def _handle_llm_messages_update(self, frame: LLMMessagesUpdateFrame):
|
||||
self.set_messages(frame.messages)
|
||||
if frame.run_llm:
|
||||
await self.push_context_frame(FrameDirection.UPSTREAM)
|
||||
|
||||
async def _handle_interruptions(self, frame: StartInterruptionFrame):
|
||||
await self._push_aggregation()
|
||||
self._started = 0
|
||||
await self.reset()
|
||||
|
||||
async def _handle_function_calls_started(self, frame: FunctionCallsStartedFrame):
|
||||
function_names = [f"{f.function_name}:{f.tool_call_id}" for f in frame.function_calls]
|
||||
logger.debug(f"{self} FunctionCallsStartedFrame: {function_names}")
|
||||
for function_call in frame.function_calls:
|
||||
self._function_calls_in_progress[function_call.tool_call_id] = None
|
||||
|
||||
async def _handle_function_call_in_progress(self, frame: FunctionCallInProgressFrame):
|
||||
logger.debug(
|
||||
f"{self} FunctionCallInProgressFrame: [{frame.function_name}:{frame.tool_call_id}]"
|
||||
)
|
||||
|
||||
# Update context with the in-progress function call
|
||||
self._context.add_message(
|
||||
{
|
||||
"role": "assistant",
|
||||
"tool_calls": [
|
||||
{
|
||||
"id": frame.tool_call_id,
|
||||
"function": {
|
||||
"name": frame.function_name,
|
||||
"arguments": json.dumps(frame.arguments),
|
||||
},
|
||||
"type": "function",
|
||||
}
|
||||
],
|
||||
}
|
||||
)
|
||||
self._context.add_message(
|
||||
{
|
||||
"role": "tool",
|
||||
"content": "IN_PROGRESS",
|
||||
"tool_call_id": frame.tool_call_id,
|
||||
}
|
||||
)
|
||||
|
||||
self._function_calls_in_progress[frame.tool_call_id] = frame
|
||||
|
||||
async def _handle_function_call_result(self, frame: FunctionCallResultFrame):
|
||||
logger.debug(
|
||||
f"{self} FunctionCallResultFrame: [{frame.function_name}:{frame.tool_call_id}]"
|
||||
)
|
||||
if frame.tool_call_id not in self._function_calls_in_progress:
|
||||
logger.warning(
|
||||
f"FunctionCallResultFrame tool_call_id [{frame.tool_call_id}] is not running"
|
||||
)
|
||||
return
|
||||
|
||||
del self._function_calls_in_progress[frame.tool_call_id]
|
||||
|
||||
properties = frame.properties
|
||||
|
||||
# Update context with the function call result
|
||||
if frame.result:
|
||||
result = json.dumps(frame.result)
|
||||
self._update_function_call_result(frame.function_name, frame.tool_call_id, result)
|
||||
else:
|
||||
self._update_function_call_result(frame.function_name, frame.tool_call_id, "COMPLETED")
|
||||
|
||||
run_llm = False
|
||||
|
||||
# Run inference if the function call result requires it.
|
||||
if frame.result:
|
||||
if properties and properties.run_llm is not None:
|
||||
# If the tool call result has a run_llm property, use it.
|
||||
run_llm = properties.run_llm
|
||||
elif frame.run_llm is not None:
|
||||
# If the frame is indicating we should run the LLM, do it.
|
||||
run_llm = frame.run_llm
|
||||
else:
|
||||
# If this is the last function call in progress, run the LLM.
|
||||
run_llm = not bool(self._function_calls_in_progress)
|
||||
|
||||
if run_llm:
|
||||
await self.push_context_frame(FrameDirection.UPSTREAM)
|
||||
|
||||
# Call the `on_context_updated` callback once the function call result
|
||||
# is added to the context. Also, run this in a separate task to make
|
||||
# sure we don't block the pipeline.
|
||||
if properties and properties.on_context_updated:
|
||||
task_name = f"{frame.function_name}:{frame.tool_call_id}:on_context_updated"
|
||||
task = self.create_task(properties.on_context_updated(), task_name)
|
||||
self._context_updated_tasks.add(task)
|
||||
task.add_done_callback(self._context_updated_task_finished)
|
||||
|
||||
async def _handle_function_call_cancel(self, frame: FunctionCallCancelFrame):
|
||||
logger.debug(
|
||||
f"{self} FunctionCallCancelFrame: [{frame.function_name}:{frame.tool_call_id}]"
|
||||
)
|
||||
if frame.tool_call_id not in self._function_calls_in_progress:
|
||||
return
|
||||
|
||||
if self._function_calls_in_progress[frame.tool_call_id].cancel_on_interruption:
|
||||
# Update context with the function call cancellation
|
||||
self._update_function_call_result(frame.function_name, frame.tool_call_id, "CANCELLED")
|
||||
del self._function_calls_in_progress[frame.tool_call_id]
|
||||
|
||||
def _update_function_call_result(self, function_name: str, tool_call_id: str, result: Any):
|
||||
for message in self._context.get_messages():
|
||||
if (
|
||||
not isinstance(message, LLMSpecificMessage)
|
||||
and message["role"] == "tool"
|
||||
and message["tool_call_id"]
|
||||
and message["tool_call_id"] == tool_call_id
|
||||
):
|
||||
message["content"] = result
|
||||
|
||||
async def _handle_user_image_frame(self, frame: UserImageRawFrame):
|
||||
logger.debug(
|
||||
f"{self} UserImageRawFrame: [{frame.request.function_name}:{frame.request.tool_call_id}]"
|
||||
)
|
||||
|
||||
if frame.request.tool_call_id not in self._function_calls_in_progress:
|
||||
logger.warning(
|
||||
f"UserImageRawFrame tool_call_id [{frame.request.tool_call_id}] is not running"
|
||||
)
|
||||
return
|
||||
|
||||
del self._function_calls_in_progress[frame.request.tool_call_id]
|
||||
|
||||
# Update context with the image frame
|
||||
self._update_function_call_result(
|
||||
frame.request.function_name, frame.request.tool_call_id, "COMPLETED"
|
||||
)
|
||||
self._context.add_image_frame_message(
|
||||
format=frame.format,
|
||||
size=frame.size,
|
||||
image=frame.image,
|
||||
text=frame.request.context,
|
||||
)
|
||||
|
||||
await self._push_aggregation()
|
||||
await self.push_context_frame(FrameDirection.UPSTREAM)
|
||||
|
||||
async def _handle_llm_start(self, _: LLMFullResponseStartFrame):
|
||||
self._started += 1
|
||||
|
||||
async def _handle_llm_end(self, _: LLMFullResponseEndFrame):
|
||||
self._started -= 1
|
||||
await self._push_aggregation()
|
||||
|
||||
async def _handle_text(self, frame: TextFrame):
|
||||
if not self._started:
|
||||
return
|
||||
|
||||
if self._params.expect_stripped_words:
|
||||
self._aggregation += f" {frame.text}" if self._aggregation else frame.text
|
||||
else:
|
||||
self._aggregation += frame.text
|
||||
|
||||
def _context_updated_task_finished(self, task: asyncio.Task):
|
||||
self._context_updated_tasks.discard(task)
|
||||
|
||||
|
||||
class LLMContextAggregatorPair:
|
||||
"""Pair of LLM context aggregators for updating context with user and assistant messages."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
context: LLMContext,
|
||||
*,
|
||||
user_params: LLMUserAggregatorParams = LLMUserAggregatorParams(),
|
||||
assistant_params: LLMAssistantAggregatorParams = LLMAssistantAggregatorParams(),
|
||||
):
|
||||
"""Initialize the LLM context aggregator pair.
|
||||
|
||||
Args:
|
||||
context: The context to be managed by the aggregators.
|
||||
user_params: Parameters for the user context aggregator.
|
||||
assistant_params: Parameters for the assistant context aggregator.
|
||||
"""
|
||||
self._user = LLMUserAggregator(context, params=user_params)
|
||||
self._assistant = LLMAssistantAggregator(context, params=assistant_params)
|
||||
|
||||
def user(self) -> LLMUserAggregator:
|
||||
"""Get the user context aggregator.
|
||||
|
||||
Returns:
|
||||
The user context aggregator instance.
|
||||
"""
|
||||
return self._user
|
||||
|
||||
def assistant(self) -> LLMAssistantAggregator:
|
||||
"""Get the assistant context aggregator.
|
||||
|
||||
Returns:
|
||||
The assistant context aggregator instance.
|
||||
"""
|
||||
return self._assistant
|
||||
@@ -178,6 +178,7 @@ class AudioBufferProcessor(FrameProcessor):
|
||||
Calls audio handlers with any remaining buffered audio before stopping.
|
||||
"""
|
||||
await self._call_on_audio_data_handler()
|
||||
self._reset_recording()
|
||||
self._recording = False
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
@@ -230,6 +231,7 @@ class AudioBufferProcessor(FrameProcessor):
|
||||
|
||||
if self._buffer_size > 0 and len(self._user_audio_buffer) > self._buffer_size:
|
||||
await self._call_on_audio_data_handler()
|
||||
self._reset_recording()
|
||||
|
||||
# Process turn recording with preprocessed data.
|
||||
if self._enable_turn_audio:
|
||||
@@ -288,8 +290,6 @@ class AudioBufferProcessor(FrameProcessor):
|
||||
self._num_channels,
|
||||
)
|
||||
|
||||
self._reset_audio_buffers()
|
||||
|
||||
def _buffer_has_audio(self, buffer: bytearray) -> bool:
|
||||
"""Check if a buffer contains audio data."""
|
||||
return buffer is not None and len(buffer) > 0
|
||||
|
||||
@@ -12,7 +12,6 @@ from typing import Awaitable, Callable, Optional
|
||||
from pipecat.frames.frames import CancelFrame, EndFrame, Frame, StartFrame
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.processors.producer_processor import ProducerProcessor, identity_transformer
|
||||
from pipecat.utils.asyncio.watchdog_queue import WatchdogQueue
|
||||
|
||||
|
||||
class ConsumerProcessor(FrameProcessor):
|
||||
@@ -66,7 +65,7 @@ class ConsumerProcessor(FrameProcessor):
|
||||
async def _start(self, _: StartFrame):
|
||||
"""Start the consumer task and register with the producer."""
|
||||
if not self._consumer_task:
|
||||
self._queue: WatchdogQueue = self._producer.add_consumer()
|
||||
self._queue = self._producer.add_consumer()
|
||||
self._consumer_task = self.create_task(self._consumer_task_handler())
|
||||
|
||||
async def _stop(self, _: EndFrame):
|
||||
@@ -77,7 +76,6 @@ class ConsumerProcessor(FrameProcessor):
|
||||
async def _cancel(self, _: CancelFrame):
|
||||
"""Cancel the consumer task."""
|
||||
if self._consumer_task:
|
||||
self._queue.cancel()
|
||||
await self.cancel_task(self._consumer_task)
|
||||
|
||||
async def _consumer_task_handler(self):
|
||||
|
||||
@@ -32,6 +32,8 @@ from pipecat.frames.frames import (
|
||||
TranscriptionFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
VADUserStartedSpeakingFrame,
|
||||
VADUserStoppedSpeakingFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
|
||||
@@ -205,6 +207,8 @@ class STTMuteFilter(FrameProcessor):
|
||||
(
|
||||
StartInterruptionFrame,
|
||||
StopInterruptionFrame,
|
||||
VADUserStartedSpeakingFrame,
|
||||
VADUserStoppedSpeakingFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
InputAudioRawFrame,
|
||||
|
||||
@@ -14,7 +14,7 @@ management, and frame flow control mechanisms.
|
||||
import asyncio
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import Any, Awaitable, Callable, Coroutine, List, Optional, Sequence
|
||||
from typing import Any, Awaitable, Callable, Coroutine, List, Optional, Sequence, Tuple
|
||||
|
||||
from loguru import logger
|
||||
|
||||
@@ -34,11 +34,9 @@ from pipecat.frames.frames import (
|
||||
SystemFrame,
|
||||
)
|
||||
from pipecat.metrics.metrics import LLMTokenUsage, MetricsData
|
||||
from pipecat.observers.base_observer import BaseObserver, FramePushed
|
||||
from pipecat.observers.base_observer import BaseObserver, FrameProcessed, FramePushed
|
||||
from pipecat.processors.metrics.frame_processor_metrics import FrameProcessorMetrics
|
||||
from pipecat.utils.asyncio.task_manager import BaseTaskManager
|
||||
from pipecat.utils.asyncio.watchdog_event import WatchdogEvent
|
||||
from pipecat.utils.asyncio.watchdog_queue import WatchdogQueue
|
||||
from pipecat.utils.base_object import BaseObject
|
||||
|
||||
|
||||
@@ -54,6 +52,9 @@ class FrameDirection(Enum):
|
||||
UPSTREAM = 2
|
||||
|
||||
|
||||
FrameCallback = Callable[["FrameProcessor", Frame, FrameDirection], Awaitable[None]]
|
||||
|
||||
|
||||
@dataclass
|
||||
class FrameProcessorSetup:
|
||||
"""Configuration parameters for frame processor initialization.
|
||||
@@ -62,59 +63,54 @@ class FrameProcessorSetup:
|
||||
clock: The clock instance for timing operations.
|
||||
task_manager: The task manager for handling async operations.
|
||||
observer: Optional observer for monitoring frame processing events.
|
||||
watchdog_timers_enabled: Whether to enable watchdog timers by default.
|
||||
"""
|
||||
|
||||
clock: BaseClock
|
||||
task_manager: BaseTaskManager
|
||||
observer: Optional[BaseObserver] = None
|
||||
watchdog_timers_enabled: bool = False
|
||||
|
||||
|
||||
class FrameProcessorQueue(WatchdogQueue):
|
||||
class FrameProcessorQueue(asyncio.PriorityQueue):
|
||||
"""A priority queue for systems frames and other frames.
|
||||
|
||||
This is a specialized queue for frame processors that separates and
|
||||
prioritizes system frames over other frames.
|
||||
|
||||
This queue uses two internal `WatchdogQueue` instances:
|
||||
- One for system-level frames (`SystemFrame`)
|
||||
- One for regular frames
|
||||
|
||||
It ensures that `SystemFrame` objects are processed before any other
|
||||
frames. Additionally, it uses an `asyncio.Event` to signal when new items
|
||||
have been added to either queue, allowing consumers to wait efficiently when
|
||||
the queue is empty.
|
||||
prioritizes system frames over other frames. It ensures that `SystemFrame`
|
||||
objects are processed before any other frames by using a priority queue.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, manager: BaseTaskManager):
|
||||
HIGH_PRIORITY = 1
|
||||
LOW_PRIORITY = 2
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the FrameProcessorQueue.
|
||||
|
||||
Args:
|
||||
manager (BaseTaskManager): The task manager used by the internal watchdog queues.
|
||||
|
||||
"""
|
||||
super().__init__(manager)
|
||||
self.__event = WatchdogEvent(manager)
|
||||
self.__main_queue = WatchdogQueue(manager)
|
||||
self.__system_queue = WatchdogQueue(manager)
|
||||
super().__init__()
|
||||
self.__high_counter = 0
|
||||
self.__low_counter = 0
|
||||
|
||||
async def put(self, item: Any):
|
||||
"""Put an item into the appropriate queue.
|
||||
async def put(self, item: Tuple[Frame, FrameDirection, FrameCallback]):
|
||||
"""Put an item into the priority queue.
|
||||
|
||||
System frames (`SystemFrame`) are placed into the system queue and all others
|
||||
into the regular queue. Signals the event to wake up any waiting consumers.
|
||||
System frames (`SystemFrame`) have higher priority than any other
|
||||
frames. If a non-frame item (e.g. a watchdog cancellation sentinel) is
|
||||
provided it will have the highest priority.
|
||||
|
||||
Args:
|
||||
item (Any): The item to enqueue.
|
||||
|
||||
"""
|
||||
if isinstance(item, SystemFrame):
|
||||
await self.__system_queue.put(item)
|
||||
frame, _, _ = item
|
||||
if isinstance(frame, SystemFrame):
|
||||
self.__high_counter += 1
|
||||
await super().put((self.HIGH_PRIORITY, self.__high_counter, item))
|
||||
else:
|
||||
await self.__main_queue.put(item)
|
||||
self.__event.set()
|
||||
self.__low_counter += 1
|
||||
await super().put((self.LOW_PRIORITY, self.__low_counter, item))
|
||||
|
||||
async def get(self) -> Any:
|
||||
"""Retrieve the next item from the queue.
|
||||
@@ -126,38 +122,9 @@ class FrameProcessorQueue(WatchdogQueue):
|
||||
Any: The next item from the system or main queue.
|
||||
|
||||
"""
|
||||
# Wait for an item in any of the queues if they are empty.
|
||||
if self.__main_queue.empty() and self.__system_queue.empty():
|
||||
await self.__event.wait()
|
||||
|
||||
# Prioritize system frames.
|
||||
if self.__system_queue.qsize() > 0:
|
||||
item = await self.__system_queue.get()
|
||||
self.__system_queue.task_done()
|
||||
else:
|
||||
item = await self.__main_queue.get()
|
||||
self.__main_queue.task_done()
|
||||
|
||||
# Clear the event only if all queues are empty.
|
||||
if self.__main_queue.empty() and self.__system_queue.empty():
|
||||
self.__event.clear()
|
||||
|
||||
_, _, item = await super().get()
|
||||
return item
|
||||
|
||||
def cancel(self):
|
||||
"""Cancel both internal queues.
|
||||
|
||||
This method is used to stop processing and release any pending tasks
|
||||
in both the system and main queues. Typically used during shutdown
|
||||
or cleanup to prevent further processing of frames.
|
||||
|
||||
"""
|
||||
self.__main_queue.cancel()
|
||||
self.__system_queue.cancel()
|
||||
|
||||
|
||||
FrameCallback = Callable[["FrameProcessor", Frame, FrameDirection], Awaitable[None]]
|
||||
|
||||
|
||||
class FrameProcessor(BaseObject):
|
||||
"""Base class for all frame processors in the pipeline.
|
||||
@@ -175,35 +142,24 @@ class FrameProcessor(BaseObject):
|
||||
self,
|
||||
*,
|
||||
name: Optional[str] = None,
|
||||
enable_watchdog_logging: Optional[bool] = None,
|
||||
enable_watchdog_timers: Optional[bool] = None,
|
||||
enable_direct_mode: bool = False,
|
||||
metrics: Optional[FrameProcessorMetrics] = None,
|
||||
watchdog_timeout_secs: Optional[float] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the frame processor.
|
||||
|
||||
Args:
|
||||
name: Optional name for this processor instance.
|
||||
enable_watchdog_logging: Whether to enable watchdog logging for tasks.
|
||||
enable_watchdog_timers: Whether to enable watchdog timers for tasks.
|
||||
enable_direct_mode: Whether to process frames immediately or use internal queues.
|
||||
metrics: Optional metrics collector for this processor.
|
||||
watchdog_timeout_secs: Timeout in seconds for watchdog operations.
|
||||
**kwargs: Additional arguments passed to parent class.
|
||||
"""
|
||||
super().__init__(name=name)
|
||||
self._parent: Optional["FrameProcessor"] = None
|
||||
super().__init__(name=name, **kwargs)
|
||||
self._prev: Optional["FrameProcessor"] = None
|
||||
self._next: Optional["FrameProcessor"] = None
|
||||
|
||||
# Enable watchdog timers for all tasks created by this frame processor.
|
||||
self._enable_watchdog_timers = enable_watchdog_timers
|
||||
|
||||
# Enable watchdog logging for all tasks created by this frame processor.
|
||||
self._enable_watchdog_logging = enable_watchdog_logging
|
||||
|
||||
# Allow this frame processor to control their tasks timeout.
|
||||
self._watchdog_timeout_secs = watchdog_timeout_secs
|
||||
# Enable direct mode to skip queues and process frames right away.
|
||||
self._enable_direct_mode = enable_direct_mode
|
||||
|
||||
# Clock
|
||||
self._clock: Optional[BaseClock] = None
|
||||
@@ -246,6 +202,8 @@ class FrameProcessor(BaseObject):
|
||||
|
||||
# The input task that handles all types of frames. It processes system
|
||||
# frames right away and queues non-system frames for later processing.
|
||||
self.__should_block_system_frames = False
|
||||
self.__input_event: Optional[asyncio.Event] = None
|
||||
self.__input_frame_task: Optional[asyncio.Task] = None
|
||||
|
||||
# The process task processes non-system frames. Non-system frames will
|
||||
@@ -254,9 +212,8 @@ class FrameProcessor(BaseObject):
|
||||
# called. To resume processing frames we need to call
|
||||
# `resume_processing_frames()` which will wake up the event.
|
||||
self.__should_block_frames = False
|
||||
self.__process_event = None
|
||||
self.__process_event: Optional[asyncio.Event] = None
|
||||
self.__process_frame_task: Optional[asyncio.Task] = None
|
||||
self.__process_queue = None
|
||||
|
||||
@property
|
||||
def id(self) -> int:
|
||||
@@ -276,6 +233,50 @@ class FrameProcessor(BaseObject):
|
||||
"""
|
||||
return self._name
|
||||
|
||||
@property
|
||||
def processors(self) -> List["FrameProcessor"]:
|
||||
"""Return the list of sub-processors contained within this processor.
|
||||
|
||||
Only compound processors (e.g. pipelines and parallel pipelines) have
|
||||
sub-processors. Non-compound processors will return an empty list.
|
||||
|
||||
Returns:
|
||||
The list of sub-processors if this is a compound processor.
|
||||
"""
|
||||
return []
|
||||
|
||||
@property
|
||||
def entry_processors(self) -> List["FrameProcessor"]:
|
||||
"""Return the list of entry processors for this processor.
|
||||
|
||||
Entry processors are the first processors in a compound processor
|
||||
(e.g. pipelines, parallel pipelines). Note that pipelines can also be an
|
||||
entry processor as pipelines are processors themselves. Non-compound
|
||||
processors will simply return an empty list.
|
||||
|
||||
Returns:
|
||||
The list of entry processors.
|
||||
"""
|
||||
return []
|
||||
|
||||
@property
|
||||
def next(self) -> Optional["FrameProcessor"]:
|
||||
"""Get the next processor.
|
||||
|
||||
Returns:
|
||||
The next processor, or None if there's no next processor.
|
||||
"""
|
||||
return self._next
|
||||
|
||||
@property
|
||||
def previous(self) -> Optional["FrameProcessor"]:
|
||||
"""Get the previous processor.
|
||||
|
||||
Returns:
|
||||
The previous processor, or None if there's no previous processor.
|
||||
"""
|
||||
return self._prev
|
||||
|
||||
@property
|
||||
def interruptions_allowed(self):
|
||||
"""Check if interruptions are allowed for this processor.
|
||||
@@ -335,6 +336,17 @@ class FrameProcessor(BaseObject):
|
||||
raise Exception(f"{self} TaskManager is still not initialized.")
|
||||
return self._task_manager
|
||||
|
||||
def processors_with_metrics(self):
|
||||
"""Return processors that can generate metrics.
|
||||
|
||||
Recursively collects all processors that support metrics generation,
|
||||
including those from nested processors.
|
||||
|
||||
Returns:
|
||||
List of frame processors that can generate metrics.
|
||||
"""
|
||||
return []
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
"""Check if this processor can generate metrics.
|
||||
|
||||
@@ -402,23 +414,12 @@ class FrameProcessor(BaseObject):
|
||||
await self.stop_ttfb_metrics()
|
||||
await self.stop_processing_metrics()
|
||||
|
||||
def create_task(
|
||||
self,
|
||||
coroutine: Coroutine,
|
||||
name: Optional[str] = None,
|
||||
*,
|
||||
enable_watchdog_logging: Optional[bool] = None,
|
||||
enable_watchdog_timers: Optional[bool] = None,
|
||||
watchdog_timeout_secs: Optional[float] = None,
|
||||
) -> asyncio.Task:
|
||||
def create_task(self, coroutine: Coroutine, name: Optional[str] = None) -> asyncio.Task:
|
||||
"""Create a new task managed by this processor.
|
||||
|
||||
Args:
|
||||
coroutine: The coroutine to run in the task.
|
||||
name: Optional name for the task.
|
||||
enable_watchdog_logging: Whether to enable watchdog logging.
|
||||
enable_watchdog_timers: Whether to enable watchdog timers.
|
||||
watchdog_timeout_secs: Timeout in seconds for watchdog operations.
|
||||
|
||||
Returns:
|
||||
The created asyncio task.
|
||||
@@ -427,21 +428,7 @@ class FrameProcessor(BaseObject):
|
||||
name = f"{self}::{name}"
|
||||
else:
|
||||
name = f"{self}::{coroutine.cr_code.co_name}"
|
||||
return self.task_manager.create_task(
|
||||
coroutine,
|
||||
name,
|
||||
enable_watchdog_logging=(
|
||||
enable_watchdog_logging
|
||||
if enable_watchdog_logging
|
||||
else self._enable_watchdog_logging
|
||||
),
|
||||
enable_watchdog_timers=(
|
||||
enable_watchdog_timers if enable_watchdog_timers else self._enable_watchdog_timers
|
||||
),
|
||||
watchdog_timeout=(
|
||||
watchdog_timeout_secs if watchdog_timeout_secs else self._watchdog_timeout_secs
|
||||
),
|
||||
)
|
||||
return self.task_manager.create_task(coroutine, name)
|
||||
|
||||
async def cancel_task(self, task: asyncio.Task, timeout: Optional[float] = None):
|
||||
"""Cancel a task managed by this processor.
|
||||
@@ -455,15 +442,27 @@ class FrameProcessor(BaseObject):
|
||||
async def wait_for_task(self, task: asyncio.Task, timeout: Optional[float] = None):
|
||||
"""Wait for a task to complete.
|
||||
|
||||
.. deprecated:: 0.0.81
|
||||
This function is deprecated, use `await task` or
|
||||
`await asyncio.wait_for(task, timeout) instead.
|
||||
|
||||
Args:
|
||||
task: The task to wait for.
|
||||
timeout: Optional timeout for waiting.
|
||||
"""
|
||||
await self.task_manager.wait_for_task(task, timeout)
|
||||
import warnings
|
||||
|
||||
def reset_watchdog(self):
|
||||
"""Reset the watchdog timer for the current task."""
|
||||
self.task_manager.task_reset_watchdog()
|
||||
warnings.warn(
|
||||
"`FrameProcessor.wait_for_task()` is deprecated. "
|
||||
"Use `await task` or `await asyncio.wait_for(task, timeout)` instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
if timeout:
|
||||
await asyncio.wait_for(task, timeout)
|
||||
else:
|
||||
await task
|
||||
|
||||
async def setup(self, setup: FrameProcessorSetup):
|
||||
"""Set up the processor with required components.
|
||||
@@ -474,11 +473,6 @@ class FrameProcessor(BaseObject):
|
||||
self._clock = setup.clock
|
||||
self._task_manager = setup.task_manager
|
||||
self._observer = setup.observer
|
||||
self._watchdog_timers_enabled = (
|
||||
self._enable_watchdog_timers
|
||||
if self._enable_watchdog_timers
|
||||
else setup.watchdog_timers_enabled
|
||||
)
|
||||
|
||||
# Create processing tasks.
|
||||
self.__create_input_task()
|
||||
@@ -504,30 +498,6 @@ class FrameProcessor(BaseObject):
|
||||
processor._prev = self
|
||||
logger.debug(f"Linking {self} -> {self._next}")
|
||||
|
||||
def get_event_loop(self) -> asyncio.AbstractEventLoop:
|
||||
"""Get the event loop used by this processor.
|
||||
|
||||
Returns:
|
||||
The asyncio event loop.
|
||||
"""
|
||||
return self.task_manager.get_event_loop()
|
||||
|
||||
def set_parent(self, parent: "FrameProcessor"):
|
||||
"""Set the parent processor for this processor.
|
||||
|
||||
Args:
|
||||
parent: The parent processor.
|
||||
"""
|
||||
self._parent = parent
|
||||
|
||||
def get_parent(self) -> Optional["FrameProcessor"]:
|
||||
"""Get the parent processor.
|
||||
|
||||
Returns:
|
||||
The parent processor, or None if no parent is set.
|
||||
"""
|
||||
return self._parent
|
||||
|
||||
def get_clock(self) -> BaseClock:
|
||||
"""Get the clock used by this processor.
|
||||
|
||||
@@ -541,6 +511,14 @@ class FrameProcessor(BaseObject):
|
||||
raise Exception(f"{self} Clock is still not initialized.")
|
||||
return self._clock
|
||||
|
||||
def get_event_loop(self) -> asyncio.AbstractEventLoop:
|
||||
"""Get the event loop used by this processor.
|
||||
|
||||
Returns:
|
||||
The asyncio event loop.
|
||||
"""
|
||||
return self.task_manager.get_event_loop()
|
||||
|
||||
async def queue_frame(
|
||||
self,
|
||||
frame: Frame,
|
||||
@@ -558,19 +536,33 @@ class FrameProcessor(BaseObject):
|
||||
if self._cancelling:
|
||||
return
|
||||
|
||||
await self.__input_queue.put((frame, direction, callback))
|
||||
if self._enable_direct_mode:
|
||||
await self.__process_frame(frame, direction, callback)
|
||||
else:
|
||||
await self.__input_queue.put((frame, direction, callback))
|
||||
|
||||
async def pause_processing_frames(self):
|
||||
"""Pause processing of queued frames."""
|
||||
logger.trace(f"{self}: pausing frame processing")
|
||||
self.__should_block_frames = True
|
||||
|
||||
async def pause_processing_system_frames(self):
|
||||
"""Pause processing of queued system frames."""
|
||||
logger.trace(f"{self}: pausing system frame processing")
|
||||
self.__should_block_system_frames = True
|
||||
|
||||
async def resume_processing_frames(self):
|
||||
"""Resume processing of queued frames."""
|
||||
logger.trace(f"{self}: resuming frame processing")
|
||||
if self.__process_event:
|
||||
self.__process_event.set()
|
||||
|
||||
async def resume_processing_system_frames(self):
|
||||
"""Resume processing of queued system frames."""
|
||||
logger.trace(f"{self}: resuming system frame processing")
|
||||
if self.__input_event:
|
||||
self.__input_event.set()
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process a frame.
|
||||
|
||||
@@ -578,6 +570,16 @@ class FrameProcessor(BaseObject):
|
||||
frame: The frame to process.
|
||||
direction: The direction of frame flow.
|
||||
"""
|
||||
if self._observer:
|
||||
timestamp = self._clock.get_time() if self._clock else 0
|
||||
data = FrameProcessed(
|
||||
processor=self,
|
||||
frame=frame,
|
||||
direction=direction,
|
||||
timestamp=timestamp,
|
||||
)
|
||||
await self._observer.on_process_frame(data)
|
||||
|
||||
if isinstance(frame, StartFrame):
|
||||
await self.__start(frame)
|
||||
elif isinstance(frame, StartInterruptionFrame):
|
||||
@@ -730,36 +732,39 @@ class FrameProcessor(BaseObject):
|
||||
|
||||
def __create_input_task(self):
|
||||
"""Create the frame input processing task."""
|
||||
if self._enable_direct_mode:
|
||||
return
|
||||
|
||||
if not self.__input_frame_task:
|
||||
self.__input_queue = FrameProcessorQueue(self.task_manager)
|
||||
self.__input_event = asyncio.Event()
|
||||
self.__input_queue = FrameProcessorQueue()
|
||||
self.__input_frame_task = self.create_task(self.__input_frame_task_handler())
|
||||
|
||||
async def __cancel_input_task(self):
|
||||
"""Cancel the frame input processing task."""
|
||||
if self.__input_frame_task:
|
||||
self.__input_queue.cancel()
|
||||
await self.cancel_task(self.__input_frame_task)
|
||||
self.__input_frame_task = None
|
||||
|
||||
def __create_process_task(self):
|
||||
"""Create the non-system frame processing task."""
|
||||
if self._enable_direct_mode:
|
||||
return
|
||||
|
||||
if not self.__process_frame_task:
|
||||
self.__should_block_frames = False
|
||||
if not self.__process_event:
|
||||
self.__process_event = WatchdogEvent(self.task_manager)
|
||||
self.__process_event.clear()
|
||||
self.__process_queue = WatchdogQueue(self.task_manager)
|
||||
self.__process_event = asyncio.Event()
|
||||
self.__process_queue = asyncio.Queue()
|
||||
self.__process_frame_task = self.create_task(self.__process_frame_task_handler())
|
||||
|
||||
async def __cancel_process_task(self):
|
||||
"""Cancel the non-system frame processing task."""
|
||||
if self.__process_frame_task:
|
||||
self.__process_queue.cancel()
|
||||
await self.cancel_task(self.__process_frame_task)
|
||||
self.__process_frame_task = None
|
||||
|
||||
async def __process_frame(
|
||||
self, frame: Frame, direction: FrameDirection, callback: FrameCallback
|
||||
self, frame: Frame, direction: FrameDirection, callback: Optional[FrameCallback]
|
||||
):
|
||||
try:
|
||||
# Process the frame.
|
||||
@@ -779,6 +784,13 @@ class FrameProcessor(BaseObject):
|
||||
|
||||
"""
|
||||
while True:
|
||||
if self.__should_block_system_frames and self.__input_event:
|
||||
logger.trace(f"{self}: system frame processing paused")
|
||||
await self.__input_event.wait()
|
||||
self.__input_event.clear()
|
||||
self.__should_block_system_frames = False
|
||||
logger.trace(f"{self}: system frame processing resumed")
|
||||
|
||||
(frame, direction, callback) = await self.__input_queue.get()
|
||||
|
||||
if isinstance(frame, SystemFrame):
|
||||
@@ -790,6 +802,8 @@ class FrameProcessor(BaseObject):
|
||||
f"{self}: __process_queue is None when processing frame {frame.name}"
|
||||
)
|
||||
|
||||
self.__input_queue.task_done()
|
||||
|
||||
async def __process_frame_task_handler(self):
|
||||
"""Handle non-system frames from the process queue."""
|
||||
while True:
|
||||
@@ -803,3 +817,5 @@ class FrameProcessor(BaseObject):
|
||||
(frame, direction, callback) = await self.__process_queue.get()
|
||||
|
||||
await self.__process_frame(frame, direction, callback)
|
||||
|
||||
self.__process_queue.task_done()
|
||||
|
||||
@@ -42,6 +42,7 @@ from pipecat.frames.frames import (
|
||||
FunctionCallResultFrame,
|
||||
InputAudioRawFrame,
|
||||
InterimTranscriptionFrame,
|
||||
LLMContextFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMMessagesAppendFrame,
|
||||
@@ -72,11 +73,9 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.services.llm_service import (
|
||||
FunctionCallParams, # TODO(aleix): we shouldn't import `services` from `processors`
|
||||
)
|
||||
from pipecat.services.openai.llm import OpenAIContextAggregatorPair
|
||||
from pipecat.transports.base_input import BaseInputTransport
|
||||
from pipecat.transports.base_output import BaseOutputTransport
|
||||
from pipecat.transports.base_transport import BaseTransport
|
||||
from pipecat.utils.asyncio.watchdog_queue import WatchdogQueue
|
||||
from pipecat.utils.string import match_endofsentence
|
||||
|
||||
RTVI_PROTOCOL_VERSION = "1.0.0"
|
||||
@@ -918,7 +917,10 @@ class RTVIObserver(BaseObserver):
|
||||
and self._params.user_transcription_enabled
|
||||
):
|
||||
await self._handle_user_transcriptions(frame)
|
||||
elif isinstance(frame, OpenAILLMContextFrame) and self._params.user_llm_enabled:
|
||||
elif (
|
||||
isinstance(frame, (OpenAILLMContextFrame, LLMContextFrame))
|
||||
and self._params.user_llm_enabled
|
||||
):
|
||||
await self._handle_context(frame)
|
||||
elif isinstance(frame, LLMFullResponseStartFrame) and self._params.bot_llm_enabled:
|
||||
await self.push_transport_message_urgent(RTVIBotLLMStartedMessage())
|
||||
@@ -1019,16 +1021,20 @@ class RTVIObserver(BaseObserver):
|
||||
if message:
|
||||
await self.push_transport_message_urgent(message)
|
||||
|
||||
async def _handle_context(self, frame: OpenAILLMContextFrame):
|
||||
async def _handle_context(self, frame: OpenAILLMContextFrame | LLMContextFrame):
|
||||
"""Process LLM context frames to extract user messages for the RTVI client."""
|
||||
try:
|
||||
messages = frame.context.messages
|
||||
if isinstance(frame, OpenAILLMContextFrame):
|
||||
messages = frame.context.messages
|
||||
else:
|
||||
messages = frame.context.get_messages()
|
||||
if not messages:
|
||||
return
|
||||
|
||||
message = messages[-1]
|
||||
|
||||
# Handle Google LLM format (protobuf objects with attributes)
|
||||
# Note: not possible if frame is a universal LLMContextFrame
|
||||
if hasattr(message, "role") and message.role == "user" and hasattr(message, "parts"):
|
||||
text = "".join(part.text for part in message.parts if hasattr(part, "text"))
|
||||
if text:
|
||||
@@ -1315,10 +1321,10 @@ class RTVIProcessor(FrameProcessor):
|
||||
async def _start(self, frame: StartFrame):
|
||||
"""Start the RTVI processor tasks."""
|
||||
if not self._action_task:
|
||||
self._action_queue = WatchdogQueue(self.task_manager)
|
||||
self._action_queue = asyncio.Queue()
|
||||
self._action_task = self.create_task(self._action_task_handler())
|
||||
if not self._message_task:
|
||||
self._message_queue = WatchdogQueue(self.task_manager)
|
||||
self._message_queue = asyncio.Queue()
|
||||
self._message_task = self.create_task(self._message_task_handler())
|
||||
await self._call_event_handler("on_bot_started")
|
||||
|
||||
@@ -1333,12 +1339,10 @@ class RTVIProcessor(FrameProcessor):
|
||||
async def _cancel_tasks(self):
|
||||
"""Cancel all running tasks."""
|
||||
if self._action_task:
|
||||
self._action_queue.cancel()
|
||||
await self.cancel_task(self._action_task)
|
||||
self._action_task = None
|
||||
|
||||
if self._message_task:
|
||||
self._message_queue.cancel()
|
||||
await self.cancel_task(self._message_task)
|
||||
self._message_task = None
|
||||
|
||||
|
||||
@@ -11,7 +11,6 @@ from typing import Awaitable, Callable, List, Optional
|
||||
|
||||
from pipecat.frames.frames import Frame, StartFrame
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.utils.asyncio.watchdog_event import WatchdogEvent
|
||||
|
||||
|
||||
class IdleFrameProcessor(FrameProcessor):
|
||||
@@ -78,7 +77,7 @@ class IdleFrameProcessor(FrameProcessor):
|
||||
def _create_idle_task(self):
|
||||
"""Create and start the idle monitoring task."""
|
||||
if not self._idle_task:
|
||||
self._idle_event = WatchdogEvent(self.task_manager)
|
||||
self._idle_event = asyncio.Event()
|
||||
self._idle_task = self.create_task(self._idle_task_handler())
|
||||
|
||||
async def _idle_task_handler(self):
|
||||
|
||||
@@ -6,10 +6,11 @@
|
||||
|
||||
"""Sentry integration for frame processor metrics."""
|
||||
|
||||
import asyncio
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.utils.asyncio.task_manager import BaseTaskManager
|
||||
from pipecat.utils.asyncio.watchdog_queue import WatchdogQueue
|
||||
|
||||
try:
|
||||
import sentry_sdk
|
||||
@@ -51,7 +52,7 @@ class SentryMetrics(FrameProcessorMetrics):
|
||||
"""
|
||||
await super().setup(task_manager)
|
||||
if self._sentry_available:
|
||||
self._sentry_queue = WatchdogQueue(task_manager)
|
||||
self._sentry_queue = asyncio.Queue()
|
||||
self._sentry_task = self.task_manager.create_task(
|
||||
self._sentry_task_handler(), name=f"{self}::_sentry_task_handler"
|
||||
)
|
||||
@@ -64,7 +65,7 @@ class SentryMetrics(FrameProcessorMetrics):
|
||||
await super().cleanup()
|
||||
if self._sentry_task:
|
||||
await self._sentry_queue.put(None)
|
||||
await self.task_manager.wait_for_task(self._sentry_task)
|
||||
await self._sentry_task
|
||||
self._sentry_task = None
|
||||
logger.trace(f"{self} Flushing Sentry metrics")
|
||||
sentry_sdk.flush(timeout=5.0)
|
||||
|
||||
@@ -11,7 +11,6 @@ from typing import Awaitable, Callable, List
|
||||
|
||||
from pipecat.frames.frames import Frame
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.utils.asyncio.watchdog_queue import WatchdogQueue
|
||||
|
||||
|
||||
async def identity_transformer(frame: Frame):
|
||||
@@ -64,7 +63,7 @@ class ProducerProcessor(FrameProcessor):
|
||||
Returns:
|
||||
asyncio.Queue: The queue for the newly added consumer.
|
||||
"""
|
||||
queue = WatchdogQueue(self.task_manager)
|
||||
queue = asyncio.Queue()
|
||||
self._consumers.append(queue)
|
||||
return queue
|
||||
|
||||
|
||||
@@ -22,7 +22,6 @@ from pipecat.frames.frames import (
|
||||
UserStoppedSpeakingFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.utils.asyncio.watchdog_event import WatchdogEvent
|
||||
|
||||
|
||||
class UserIdleProcessor(FrameProcessor):
|
||||
@@ -78,7 +77,7 @@ class UserIdleProcessor(FrameProcessor):
|
||||
self._interrupted = False
|
||||
self._conversation_started = False
|
||||
self._idle_task = None
|
||||
self._idle_event = None
|
||||
self._idle_event = asyncio.Event()
|
||||
|
||||
def _wrap_callback(
|
||||
self,
|
||||
@@ -138,9 +137,6 @@ class UserIdleProcessor(FrameProcessor):
|
||||
"""
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, StartFrame):
|
||||
self._idle_event = WatchdogEvent(self.task_manager)
|
||||
|
||||
# Check for end frames before processing
|
||||
if isinstance(frame, (EndFrame, CancelFrame)):
|
||||
# Stop the idle task, if it exists
|
||||
|
||||
@@ -53,7 +53,7 @@ Supported transports:
|
||||
|
||||
- Daily - Creates rooms and tokens, runs bot as participant
|
||||
- WebRTC - Provides local WebRTC interface with prebuilt UI
|
||||
- Telephony - Handles webhook and WebSocket connections for Twilio, Telnyx, Plivo
|
||||
- Telephony - Handles webhook and WebSocket connections for Twilio, Telnyx, Plivo, Exotel
|
||||
|
||||
To run locally:
|
||||
|
||||
@@ -62,6 +62,7 @@ To run locally:
|
||||
- Daily (server): `python bot.py -t daily`
|
||||
- Daily (direct, testing only): `python bot.py -d`
|
||||
- Telephony: `python bot.py -t twilio -x your_username.ngrok.io`
|
||||
- Exotel: `python bot.py -t exotel` (no proxy needed, but ngrok connection to HTTP 7860 is required)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
@@ -145,7 +146,6 @@ async def _run_telephony_bot(websocket: WebSocket):
|
||||
|
||||
# Just pass the WebSocket - let the bot handle parsing
|
||||
runner_args = WebSocketRunnerArguments(websocket=websocket)
|
||||
runner_args.handle_sigint = False
|
||||
|
||||
await bot_module.bot(runner_args)
|
||||
|
||||
@@ -169,7 +169,7 @@ def _create_server_app(
|
||||
_setup_webrtc_routes(app, esp32_mode=esp32_mode, host=host)
|
||||
elif transport_type == "daily":
|
||||
_setup_daily_routes(app)
|
||||
elif transport_type in ["twilio", "telnyx", "plivo"]:
|
||||
elif transport_type in ["twilio", "telnyx", "plivo", "exotel"]:
|
||||
_setup_telephony_routes(app, transport_type, proxy)
|
||||
else:
|
||||
logger.warning(f"Unknown transport type: {transport_type}")
|
||||
@@ -223,7 +223,6 @@ def _setup_webrtc_routes(app: FastAPI, esp32_mode: bool = False, host: str = "lo
|
||||
|
||||
bot_module = _get_bot_module()
|
||||
runner_args = SmallWebRTCRunnerArguments(webrtc_connection=pipecat_connection)
|
||||
runner_args.handle_sigint = False
|
||||
background_tasks.add_task(bot_module.bot, runner_args)
|
||||
|
||||
answer = pipecat_connection.get_answer()
|
||||
@@ -266,7 +265,6 @@ def _setup_daily_routes(app: FastAPI):
|
||||
# Start the bot in the background with empty body for GET requests
|
||||
bot_module = _get_bot_module()
|
||||
runner_args = DailyRunnerArguments(room_url=room_url, token=token)
|
||||
runner_args.handle_sigint = False
|
||||
asyncio.create_task(bot_module.bot(runner_args))
|
||||
return RedirectResponse(room_url)
|
||||
|
||||
@@ -311,7 +309,6 @@ def _setup_daily_routes(app: FastAPI):
|
||||
# Start the bot in the background with extracted body data
|
||||
bot_module = _get_bot_module()
|
||||
runner_args = DailyRunnerArguments(room_url=room_url, token=token, body=bot_body)
|
||||
runner_args.handle_sigint = False
|
||||
asyncio.create_task(bot_module.bot(runner_args))
|
||||
# Match PCC /start endpoint response format:
|
||||
return {"dailyRoom": room_url, "dailyToken": token}
|
||||
@@ -337,7 +334,7 @@ def _setup_daily_routes(app: FastAPI):
|
||||
|
||||
def _setup_telephony_routes(app: FastAPI, transport_type: str, proxy: str):
|
||||
"""Set up telephony-specific routes."""
|
||||
# XML response templates
|
||||
# XML response templates (Exotel doesn't use XML webhooks)
|
||||
XML_TEMPLATES = {
|
||||
"twilio": f"""<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Response>
|
||||
@@ -362,9 +359,18 @@ def _setup_telephony_routes(app: FastAPI, transport_type: str, proxy: str):
|
||||
@app.post("/")
|
||||
async def start_call():
|
||||
"""Handle telephony webhook and return XML response."""
|
||||
logger.debug(f"POST {transport_type.upper()} XML")
|
||||
xml_content = XML_TEMPLATES.get(transport_type, "<Response></Response>")
|
||||
return HTMLResponse(content=xml_content, media_type="application/xml")
|
||||
if transport_type == "exotel":
|
||||
# Exotel doesn't use POST webhooks - redirect to proper documentation
|
||||
logger.debug("POST Exotel endpoint - not used")
|
||||
return {
|
||||
"error": "Exotel doesn't use POST webhooks",
|
||||
"websocket_url": f"wss://{proxy}/ws",
|
||||
"note": "Configure the WebSocket URL above in your Exotel App Bazaar Voicebot Applet",
|
||||
}
|
||||
else:
|
||||
logger.debug(f"POST {transport_type.upper()} XML")
|
||||
xml_content = XML_TEMPLATES.get(transport_type, "<Response></Response>")
|
||||
return HTMLResponse(content=xml_content, media_type="application/xml")
|
||||
|
||||
@app.websocket("/ws")
|
||||
async def websocket_endpoint(websocket: WebSocket):
|
||||
@@ -396,6 +402,7 @@ async def _run_daily_direct():
|
||||
|
||||
# Direct connections have no request body, so use empty dict
|
||||
runner_args = DailyRunnerArguments(room_url=room_url, token=token)
|
||||
runner_args.handle_sigint = True
|
||||
|
||||
# Get the bot module and run it directly
|
||||
bot_module = _get_bot_module()
|
||||
@@ -440,7 +447,7 @@ def main():
|
||||
Args:
|
||||
--host: Server host address (default: localhost)
|
||||
--port: Server port (default: 7860)
|
||||
-t/--transport: Transport type (daily, webrtc, twilio, telnyx, plivo)
|
||||
-t/--transport: Transport type (daily, webrtc, twilio, telnyx, plivo, exotel)
|
||||
-x/--proxy: Public proxy hostname for telephony webhooks
|
||||
--esp32: Enable SDP munging for ESP32 compatibility (requires --host with IP address)
|
||||
-d/--direct: Connect directly to Daily room (automatically sets transport to daily)
|
||||
@@ -455,7 +462,7 @@ def main():
|
||||
"-t",
|
||||
"--transport",
|
||||
type=str,
|
||||
choices=["daily", "webrtc", "twilio", "telnyx", "plivo"],
|
||||
choices=["daily", "webrtc", "twilio", "telnyx", "plivo", "exotel"],
|
||||
default="webrtc",
|
||||
help="Transport type",
|
||||
)
|
||||
|
||||
@@ -25,7 +25,7 @@ class RunnerArguments:
|
||||
pipeline_idle_timeout_secs: int = field(init=False)
|
||||
|
||||
def __post_init__(self):
|
||||
self.handle_sigint = True
|
||||
self.handle_sigint = False
|
||||
self.handle_sigterm = False
|
||||
self.pipeline_idle_timeout_secs = 300
|
||||
|
||||
|
||||
@@ -77,6 +77,17 @@ def _detect_transport_type_from_message(message_data: dict) -> str:
|
||||
logger.trace("Auto-detected: PLIVO")
|
||||
return "plivo"
|
||||
|
||||
# Exotel detection
|
||||
if (
|
||||
message_data.get("event") == "start"
|
||||
and "start" in message_data
|
||||
and "stream_sid" in message_data.get("start", {})
|
||||
and "call_sid" in message_data.get("start", {})
|
||||
and "account_sid" in message_data.get("start", {})
|
||||
):
|
||||
logger.trace("Auto-detected: EXOTEL")
|
||||
return "exotel"
|
||||
|
||||
logger.trace("Auto-detection failed - unknown format")
|
||||
return "unknown"
|
||||
|
||||
@@ -91,6 +102,7 @@ async def parse_telephony_websocket(websocket: WebSocket):
|
||||
- Twilio: {"stream_id": str, "call_id": str}
|
||||
- Telnyx: {"stream_id": str, "call_control_id": str, "outbound_encoding": str}
|
||||
- Plivo: {"stream_id": str, "call_id": str}
|
||||
- Exotel: {"stream_id": str, "call_id": str, "account_sid": str}
|
||||
|
||||
Example usage::
|
||||
|
||||
@@ -160,6 +172,14 @@ async def parse_telephony_websocket(websocket: WebSocket):
|
||||
"call_id": start_data.get("callId"),
|
||||
}
|
||||
|
||||
elif transport_type == "exotel":
|
||||
start_data = call_data_raw.get("start", {})
|
||||
call_data = {
|
||||
"stream_id": start_data.get("stream_sid"),
|
||||
"call_id": start_data.get("call_sid"),
|
||||
"account_sid": start_data.get("account_sid"),
|
||||
}
|
||||
|
||||
else:
|
||||
call_data = {}
|
||||
|
||||
@@ -240,6 +260,7 @@ async def maybe_capture_participant_screen(
|
||||
await transport.capture_participant_video(
|
||||
client["id"], framerate=framerate, video_source="screenVideo"
|
||||
)
|
||||
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
@@ -379,10 +400,17 @@ async def _create_telephony_transport(
|
||||
auth_id=os.getenv("PLIVO_AUTH_ID", ""),
|
||||
auth_token=os.getenv("PLIVO_AUTH_TOKEN", ""),
|
||||
)
|
||||
elif transport_type == "exotel":
|
||||
from pipecat.serializers.exotel import ExotelFrameSerializer
|
||||
|
||||
params.serializer = ExotelFrameSerializer(
|
||||
stream_sid=call_data["stream_id"],
|
||||
call_sid=call_data["call_id"],
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unsupported telephony provider: {transport_type}. "
|
||||
f"Supported providers: twilio, telnyx, plivo"
|
||||
f"Supported providers: twilio, telnyx, plivo, exotel"
|
||||
)
|
||||
|
||||
return FastAPIWebsocketTransport(websocket=websocket, params=params)
|
||||
@@ -399,7 +427,7 @@ async def create_transport(
|
||||
Args:
|
||||
runner_args: Arguments from the runner.
|
||||
transport_params: Dict mapping transport names to parameter factory functions.
|
||||
Keys should be: "daily", "webrtc", "twilio", "telnyx", "plivo"
|
||||
Keys should be: "daily", "webrtc", "twilio", "telnyx", "plivo", "exotel"
|
||||
Values should be functions that return transport parameters when called.
|
||||
|
||||
Returns:
|
||||
@@ -440,6 +468,12 @@ async def create_transport(
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
# add_wav_header and serializer will be set automatically
|
||||
),
|
||||
"exotel": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
# add_wav_header and serializer will be set automatically
|
||||
),
|
||||
}
|
||||
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
|
||||
@@ -31,6 +31,7 @@ from pipecat.frames.frames import (
|
||||
FunctionCallCancelFrame,
|
||||
FunctionCallInProgressFrame,
|
||||
FunctionCallResultFrame,
|
||||
LLMContextFrame,
|
||||
LLMEnablePromptCachingFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
@@ -41,6 +42,7 @@ from pipecat.frames.frames import (
|
||||
VisionImageRawFrame,
|
||||
)
|
||||
from pipecat.metrics.metrics import LLMTokenUsage
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantAggregatorParams,
|
||||
LLMAssistantContextAggregator,
|
||||
@@ -53,11 +55,10 @@ from pipecat.processors.aggregators.openai_llm_context import (
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.llm_service import FunctionCallFromLLM, LLMService
|
||||
from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator
|
||||
from pipecat.utils.tracing.service_decorators import traced_llm
|
||||
|
||||
try:
|
||||
from anthropic import NOT_GIVEN, AsyncAnthropic, NotGiven
|
||||
from anthropic import NOT_GIVEN, APITimeoutError, AsyncAnthropic, NotGiven
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error("In order to use Anthropic, you need to `pip install pipecat-ai[anthropic]`.")
|
||||
@@ -133,6 +134,8 @@ class AnthropicLLMService(LLMService):
|
||||
model: str = "claude-sonnet-4-20250514",
|
||||
params: Optional[InputParams] = None,
|
||||
client=None,
|
||||
retry_timeout_secs: Optional[float] = 5.0,
|
||||
retry_on_timeout: Optional[bool] = False,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the Anthropic LLM service.
|
||||
@@ -142,6 +145,8 @@ class AnthropicLLMService(LLMService):
|
||||
model: Model name to use. Defaults to "claude-sonnet-4-20250514".
|
||||
params: Optional model parameters for inference.
|
||||
client: Optional custom Anthropic client instance.
|
||||
retry_timeout_secs: Request timeout in seconds for retry logic.
|
||||
retry_on_timeout: Whether to retry the request once if it times out.
|
||||
**kwargs: Additional arguments passed to parent LLMService.
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
@@ -150,6 +155,8 @@ class AnthropicLLMService(LLMService):
|
||||
api_key=api_key
|
||||
) # if the client is provided, use it and remove it, otherwise create a new one
|
||||
self.set_model_name(model)
|
||||
self._retry_timeout_secs = retry_timeout_secs
|
||||
self._retry_on_timeout = retry_on_timeout
|
||||
self._settings = {
|
||||
"max_tokens": params.max_tokens,
|
||||
"enable_prompt_caching_beta": params.enable_prompt_caching_beta or False,
|
||||
@@ -167,6 +174,71 @@ class AnthropicLLMService(LLMService):
|
||||
"""
|
||||
return True
|
||||
|
||||
async def _create_message_stream(self, api_call, params):
|
||||
"""Create message stream with optional timeout and retry.
|
||||
|
||||
Args:
|
||||
api_call: The Anthropic API method to call.
|
||||
params: Parameters for the API call.
|
||||
|
||||
Returns:
|
||||
Async stream of message events.
|
||||
"""
|
||||
if self._retry_on_timeout:
|
||||
try:
|
||||
response = await asyncio.wait_for(
|
||||
api_call(**params), timeout=self._retry_timeout_secs
|
||||
)
|
||||
return response
|
||||
except (APITimeoutError, asyncio.TimeoutError):
|
||||
# Retry, this time without a timeout so we get a response
|
||||
logger.debug(f"{self}: Retrying message creation due to timeout")
|
||||
response = await api_call(**params)
|
||||
return response
|
||||
else:
|
||||
response = await api_call(**params)
|
||||
return response
|
||||
|
||||
async def run_inference(
|
||||
self, context: LLMContext | OpenAILLMContext, system_instruction: Optional[str] = None
|
||||
) -> Optional[str]:
|
||||
"""Run a one-shot, out-of-band (i.e. out-of-pipeline) inference with the given LLM context.
|
||||
|
||||
Args:
|
||||
context: The LLM context containing conversation history.
|
||||
system_instruction: Optional system instruction to guide the LLM's
|
||||
behavior. You could also (again, optionally) provide a system
|
||||
instruction directly in the context. If both are provided, the
|
||||
one in the context takes precedence.
|
||||
|
||||
Returns:
|
||||
The LLM's response as a string, or None if no response is generated.
|
||||
"""
|
||||
messages = []
|
||||
system = []
|
||||
if isinstance(context, LLMContext):
|
||||
# Future code will be something like this:
|
||||
# adapter = self.get_llm_adapter()
|
||||
# params: AnthropicLLMInvocationParams = adapter.get_llm_invocation_params(context)
|
||||
# messages = params["messages"]
|
||||
# system = params["system_instruction"]
|
||||
raise NotImplementedError("Universal LLMContext is not yet supported for Anthropic.")
|
||||
else:
|
||||
context = AnthropicLLMContext.upgrade_to_anthropic(context)
|
||||
messages = context.messages
|
||||
system = getattr(context, "system", None) or system_instruction
|
||||
|
||||
# LLM completion
|
||||
response = await self._client.messages.create(
|
||||
model=self.model_name,
|
||||
messages=messages,
|
||||
system=system,
|
||||
max_tokens=8192,
|
||||
stream=False,
|
||||
)
|
||||
|
||||
return response.content[0].text
|
||||
|
||||
@property
|
||||
def enable_prompt_caching_beta(self) -> bool:
|
||||
"""Check if prompt caching beta feature is enabled.
|
||||
@@ -250,7 +322,7 @@ class AnthropicLLMService(LLMService):
|
||||
|
||||
params.update(self._settings["extra"])
|
||||
|
||||
response = await api_call(**params)
|
||||
response = await self._create_message_stream(api_call, params)
|
||||
|
||||
await self.stop_ttfb_metrics()
|
||||
|
||||
@@ -259,7 +331,7 @@ class AnthropicLLMService(LLMService):
|
||||
json_accumulator = ""
|
||||
|
||||
function_calls = []
|
||||
async for event in WatchdogAsyncIterator(response, manager=self.task_manager):
|
||||
async for event in response:
|
||||
# Aggregate streaming content, create frames, trigger events
|
||||
|
||||
if event.type == "content_block_delta":
|
||||
@@ -378,6 +450,8 @@ class AnthropicLLMService(LLMService):
|
||||
context = None
|
||||
if isinstance(frame, OpenAILLMContextFrame):
|
||||
context: "AnthropicLLMContext" = AnthropicLLMContext.upgrade_to_anthropic(frame.context)
|
||||
elif isinstance(frame, LLMContextFrame):
|
||||
raise NotImplementedError("Universal LLMContext is not yet supported for Anthropic.")
|
||||
elif isinstance(frame, LLMMessagesFrame):
|
||||
context = AnthropicLLMContext.from_messages(frame.messages)
|
||||
elif isinstance(frame, VisionImageRawFrame):
|
||||
|
||||
@@ -219,10 +219,7 @@ class AssemblyAISTTService(STTService):
|
||||
await self._websocket.send(json.dumps({"type": "Terminate"}))
|
||||
|
||||
try:
|
||||
await asyncio.wait_for(
|
||||
self._termination_event.wait(),
|
||||
timeout=5.0,
|
||||
)
|
||||
await asyncio.wait_for(self._termination_event.wait(), timeout=5.0)
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning("Timed out waiting for termination message from server")
|
||||
|
||||
@@ -247,11 +244,9 @@ class AssemblyAISTTService(STTService):
|
||||
try:
|
||||
while self._connected:
|
||||
try:
|
||||
message = await asyncio.wait_for(self._websocket.recv(), timeout=1.0)
|
||||
message = await self._websocket.recv()
|
||||
data = json.loads(message)
|
||||
await self._handle_message(data)
|
||||
except asyncio.TimeoutError:
|
||||
self.reset_watchdog()
|
||||
except websockets.exceptions.ConnectionClosedOK:
|
||||
break
|
||||
except Exception as e:
|
||||
|
||||
@@ -29,7 +29,6 @@ from pipecat.frames.frames import (
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.tts_service import InterruptibleTTSService, TTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator
|
||||
from pipecat.utils.tracing.service_decorators import traced_tts
|
||||
|
||||
try:
|
||||
@@ -276,9 +275,7 @@ class AsyncAITTSService(InterruptibleTTSService):
|
||||
self._started = False
|
||||
|
||||
async def _receive_messages(self):
|
||||
async for message in WatchdogAsyncIterator(
|
||||
self._get_websocket(), manager=self.task_manager
|
||||
):
|
||||
async for message in self._get_websocket():
|
||||
msg = json.loads(message)
|
||||
if not msg:
|
||||
continue
|
||||
@@ -301,9 +298,8 @@ class AsyncAITTSService(InterruptibleTTSService):
|
||||
|
||||
async def _keepalive_task_handler(self):
|
||||
"""Send periodic keepalive messages to maintain WebSocket connection."""
|
||||
KEEPALIVE_SLEEP = 10 if self.task_manager.task_watchdog_enabled else 3
|
||||
KEEPALIVE_SLEEP = 3
|
||||
while True:
|
||||
self.reset_watchdog()
|
||||
await asyncio.sleep(KEEPALIVE_SLEEP)
|
||||
try:
|
||||
if self._websocket and self._websocket.state is State.OPEN:
|
||||
@@ -335,7 +331,7 @@ class AsyncAITTSService(InterruptibleTTSService):
|
||||
yield TTSStartedFrame()
|
||||
self._started = True
|
||||
|
||||
msg = self._build_msg(text=text)
|
||||
msg = self._build_msg(text=text, force=True)
|
||||
|
||||
try:
|
||||
await self._get_websocket().send(msg)
|
||||
|
||||
@@ -31,6 +31,7 @@ from pipecat.frames.frames import (
|
||||
FunctionCallFromLLM,
|
||||
FunctionCallInProgressFrame,
|
||||
FunctionCallResultFrame,
|
||||
LLMContextFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMMessagesFrame,
|
||||
@@ -40,6 +41,7 @@ from pipecat.frames.frames import (
|
||||
VisionImageRawFrame,
|
||||
)
|
||||
from pipecat.metrics.metrics import LLMTokenUsage
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantAggregatorParams,
|
||||
LLMAssistantContextAggregator,
|
||||
@@ -58,6 +60,7 @@ try:
|
||||
import aioboto3
|
||||
import httpx
|
||||
from botocore.config import Config
|
||||
from botocore.exceptions import ReadTimeoutError
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error(
|
||||
@@ -724,6 +727,8 @@ class AWSBedrockLLMService(LLMService):
|
||||
aws_region: str = "us-east-1",
|
||||
params: Optional[InputParams] = None,
|
||||
client_config: Optional[Config] = None,
|
||||
retry_timeout_secs: Optional[float] = 5.0,
|
||||
retry_on_timeout: Optional[bool] = False,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the AWS Bedrock LLM service.
|
||||
@@ -736,6 +741,8 @@ class AWSBedrockLLMService(LLMService):
|
||||
aws_region: AWS region for the Bedrock service.
|
||||
params: Model parameters and configuration.
|
||||
client_config: Custom boto3 client configuration.
|
||||
retry_timeout_secs: Request timeout in seconds for retry logic.
|
||||
retry_on_timeout: Whether to retry the request once if it times out.
|
||||
**kwargs: Additional arguments passed to parent LLMService.
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
@@ -762,6 +769,8 @@ class AWSBedrockLLMService(LLMService):
|
||||
}
|
||||
|
||||
self.set_model_name(model)
|
||||
self._retry_timeout_secs = retry_timeout_secs
|
||||
self._retry_on_timeout = retry_on_timeout
|
||||
self._settings = {
|
||||
"max_tokens": params.max_tokens,
|
||||
"temperature": params.temperature,
|
||||
@@ -782,6 +791,106 @@ class AWSBedrockLLMService(LLMService):
|
||||
"""
|
||||
return True
|
||||
|
||||
async def run_inference(
|
||||
self, context: LLMContext | OpenAILLMContext, system_instruction: Optional[str] = None
|
||||
) -> Optional[str]:
|
||||
"""Run a one-shot, out-of-band (i.e. out-of-pipeline) inference with the given LLM context.
|
||||
|
||||
Args:
|
||||
context: The LLM context containing conversation history.
|
||||
system_instruction: Optional system instruction to guide the LLM's
|
||||
behavior. You could also (again, optionally) provide a system
|
||||
instruction directly in the context. If both are provided, the
|
||||
one in the context takes precedence.
|
||||
|
||||
Returns:
|
||||
The LLM's response as a string, or None if no response is generated.
|
||||
"""
|
||||
try:
|
||||
messages = []
|
||||
system = []
|
||||
if isinstance(context, LLMContext):
|
||||
# Future code will be something like this:
|
||||
# adapter = self.get_llm_adapter()
|
||||
# params: AWSBedrockLLMInvocationParams = adapter.get_llm_invocation_params(context)
|
||||
# messages = params["messages"]
|
||||
# system = params["system_instruction"]
|
||||
raise NotImplementedError(
|
||||
"Universal LLMContext is not yet supported for AWS Bedrock."
|
||||
)
|
||||
else:
|
||||
context = AWSBedrockLLMContext.upgrade_to_bedrock(context)
|
||||
messages = context.messages
|
||||
system = getattr(context, "system", None) or system_instruction
|
||||
|
||||
# Determine if we're using Claude or Nova based on model ID
|
||||
model_id = self.model_name
|
||||
|
||||
# Prepare request parameters
|
||||
request_params = {
|
||||
"modelId": model_id,
|
||||
"messages": messages,
|
||||
"inferenceConfig": {
|
||||
"maxTokens": 8192,
|
||||
"temperature": 0.7,
|
||||
"topP": 0.9,
|
||||
},
|
||||
}
|
||||
|
||||
if system:
|
||||
request_params["system"] = [{"text": system}]
|
||||
|
||||
async with self._aws_session.client(
|
||||
service_name="bedrock-runtime", **self._aws_params
|
||||
) as client:
|
||||
# Call Bedrock without streaming
|
||||
response = await client.converse(**request_params)
|
||||
|
||||
# Extract the response text
|
||||
if (
|
||||
"output" in response
|
||||
and "message" in response["output"]
|
||||
and "content" in response["output"]["message"]
|
||||
):
|
||||
content = response["output"]["message"]["content"]
|
||||
if isinstance(content, list):
|
||||
for item in content:
|
||||
if item.get("text"):
|
||||
return item["text"]
|
||||
elif isinstance(content, str):
|
||||
return content
|
||||
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Bedrock summary generation failed: {e}", exc_info=True)
|
||||
return None
|
||||
|
||||
async def _create_converse_stream(self, client, request_params):
|
||||
"""Create converse stream with optional timeout and retry.
|
||||
|
||||
Args:
|
||||
client: The AWS Bedrock client instance.
|
||||
request_params: Parameters for the converse_stream call.
|
||||
|
||||
Returns:
|
||||
Async stream of response events.
|
||||
"""
|
||||
if self._retry_on_timeout:
|
||||
try:
|
||||
response = await asyncio.wait_for(
|
||||
await client.converse_stream(**request_params), timeout=self._retry_timeout_secs
|
||||
)
|
||||
return response
|
||||
except (ReadTimeoutError, asyncio.TimeoutError) as e:
|
||||
# Retry, this time without a timeout so we get a response
|
||||
logger.debug(f"{self}: Retrying converse_stream due to timeout")
|
||||
response = await client.converse_stream(**request_params)
|
||||
return response
|
||||
else:
|
||||
response = await client.converse_stream(**request_params)
|
||||
return response
|
||||
|
||||
def create_context_aggregator(
|
||||
self,
|
||||
context: OpenAILLMContext,
|
||||
@@ -911,7 +1020,7 @@ class AWSBedrockLLMService(LLMService):
|
||||
service_name="bedrock-runtime", **self._aws_params
|
||||
) as client:
|
||||
# Call AWS Bedrock with streaming
|
||||
response = await client.converse_stream(**request_params)
|
||||
response = await self._create_converse_stream(client, request_params)
|
||||
|
||||
await self.stop_ttfb_metrics()
|
||||
|
||||
@@ -922,8 +1031,6 @@ class AWSBedrockLLMService(LLMService):
|
||||
function_calls = []
|
||||
|
||||
async for event in response["stream"]:
|
||||
self.reset_watchdog()
|
||||
|
||||
# Handle text content
|
||||
if "contentBlockDelta" in event:
|
||||
delta = event["contentBlockDelta"]["delta"]
|
||||
@@ -1014,6 +1121,8 @@ class AWSBedrockLLMService(LLMService):
|
||||
context = None
|
||||
if isinstance(frame, OpenAILLMContextFrame):
|
||||
context = AWSBedrockLLMContext.upgrade_to_bedrock(frame.context)
|
||||
if isinstance(frame, LLMContextFrame):
|
||||
raise NotImplementedError("Universal LLMContext is not yet supported for AWS Bedrock.")
|
||||
elif isinstance(frame, LLMMessagesFrame):
|
||||
context = AWSBedrockLLMContext.from_messages(frame.messages)
|
||||
elif isinstance(frame, VisionImageRawFrame):
|
||||
|
||||
@@ -480,7 +480,7 @@ class AWSTranscribeSTTService(STTService):
|
||||
break
|
||||
|
||||
try:
|
||||
response = await asyncio.wait_for(self._ws_client.recv(), timeout=1.0)
|
||||
response = await self._ws_client.recv()
|
||||
|
||||
headers, payload = decode_event(response)
|
||||
|
||||
@@ -531,8 +531,6 @@ class AWSTranscribeSTTService(STTService):
|
||||
else:
|
||||
logger.debug(f"{self} Other message type received: {headers}")
|
||||
logger.debug(f"{self} Payload: {payload}")
|
||||
except asyncio.TimeoutError:
|
||||
self.reset_watchdog()
|
||||
except websockets.exceptions.ConnectionClosed as e:
|
||||
logger.error(
|
||||
f"{self} WebSocket connection closed in receive loop with code {e.code}: {e.reason}"
|
||||
|
||||