Compare commits
391 Commits
hush/endFr
...
v0.0.98
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f9fef78070 | ||
|
|
92970c7873 | ||
|
|
491d298c10 | ||
|
|
c46a20328d | ||
|
|
7e4dbf42e8 | ||
|
|
159e403ae4 | ||
|
|
d3d50ac580 | ||
|
|
e03e5f3a59 | ||
|
|
65e4719cec | ||
|
|
d07b37b288 | ||
|
|
ca97d9dc4b | ||
|
|
4c20483a7e | ||
|
|
6d84f36d05 | ||
|
|
0b6e8f5bca | ||
|
|
cdd6f5aa6a | ||
|
|
f1a0d547ce | ||
|
|
b1b7fc6357 | ||
|
|
b3403e884d | ||
|
|
16e304016d | ||
|
|
21a55f6aae | ||
|
|
310df33de6 | ||
|
|
c8a86059fb | ||
|
|
c537d7bafb | ||
|
|
1fce68cef1 | ||
|
|
ecd9ec4ad2 | ||
|
|
db983cb693 | ||
|
|
5b30f1b1ef | ||
|
|
5f7dbfe775 | ||
|
|
2bb6ba59fc | ||
|
|
ac7b06faba | ||
|
|
afa7573834 | ||
|
|
f2eb9eeb56 | ||
|
|
9e49e09360 | ||
|
|
b5221cd2c1 | ||
|
|
796f3aeff3 | ||
|
|
de94790b94 | ||
|
|
bd3bf9a00e | ||
|
|
92f934031d | ||
|
|
11b92d89d0 | ||
|
|
0d1a122582 | ||
|
|
24b5efb9d8 | ||
|
|
eeb3b85e39 | ||
|
|
8255770b6c | ||
|
|
d3f918eb58 | ||
|
|
36c6549426 | ||
|
|
88d909d468 | ||
|
|
21e346abe2 | ||
|
|
70a80847a7 | ||
|
|
27647fc067 | ||
|
|
85fe6d4c34 | ||
|
|
4cd971e4bd | ||
|
|
54926f390d | ||
|
|
50362ca37e | ||
|
|
a14c911fb2 | ||
|
|
a5e42337a4 | ||
|
|
4f848e9631 | ||
|
|
93df7044fa | ||
|
|
e604e9b490 | ||
|
|
2e4fa3f8db | ||
|
|
5f6448a8a4 | ||
|
|
6cda357ce8 | ||
|
|
7e87f61d17 | ||
|
|
ccdf83800b | ||
|
|
4b81be7acf | ||
|
|
abc2ad8cbc | ||
|
|
64471d65f8 | ||
|
|
3c4991a41f | ||
|
|
71d6516a14 | ||
|
|
22288648e6 | ||
|
|
a6ee040d82 | ||
|
|
87fc860cd5 | ||
|
|
b25ad21941 | ||
|
|
debcea3baa | ||
|
|
c2abe42a64 | ||
|
|
56dee06a29 | ||
|
|
60cc14cafd | ||
|
|
1e98094394 | ||
|
|
ccdd6cde52 | ||
|
|
12979293ad | ||
|
|
28248e9b00 | ||
|
|
0e88ad672e | ||
|
|
f41c3dcbc3 | ||
|
|
645e1802f8 | ||
|
|
6636da682c | ||
|
|
10a32c943f | ||
|
|
455579ffcc | ||
|
|
c37da6ab78 | ||
|
|
1892854516 | ||
|
|
735e597bf2 | ||
|
|
52980a69c5 | ||
|
|
ff2f1dac82 | ||
|
|
3cbfbb997e | ||
|
|
3e66cb50e0 | ||
|
|
b821dd2507 | ||
|
|
0c5bccd1f1 | ||
|
|
926514ca18 | ||
|
|
ca5e668f4a | ||
|
|
53de6c0b9a | ||
|
|
b22ac8292f | ||
|
|
83877ab1e6 | ||
|
|
2a6a0d83db | ||
|
|
6ca117a3c1 | ||
|
|
4fcb099fd7 | ||
|
|
c5ff5cc219 | ||
|
|
88289f578a | ||
|
|
229ff794d6 | ||
|
|
096db3eb6c | ||
|
|
cfd1cada8c | ||
|
|
ee435b6f1e | ||
|
|
d289b38ba7 | ||
|
|
b0f63c3785 | ||
|
|
1249ee3de3 | ||
|
|
b09d8bd595 | ||
|
|
540a48b1b6 | ||
|
|
aa0529ff82 | ||
|
|
7e92597c0e | ||
|
|
99f89351fa | ||
|
|
0b4d984be6 | ||
|
|
17203ba3e6 | ||
|
|
924831089c | ||
|
|
329b8ac426 | ||
|
|
61674d7758 | ||
|
|
b9990811b5 | ||
|
|
8ccc2cbf31 | ||
|
|
f4e33fc8dd | ||
|
|
5bfea84bd5 | ||
|
|
ef703e9d16 | ||
|
|
44aa11737b | ||
|
|
49f1f7d6a2 | ||
|
|
4ea51ff67c | ||
|
|
747bd4f737 | ||
|
|
15f5583fd2 | ||
|
|
c8c6f424cd | ||
|
|
0cdf0c4504 | ||
|
|
217f03b9cc | ||
|
|
12093fcffc | ||
|
|
e5fb643cf5 | ||
|
|
4517475db7 | ||
|
|
92b6e8d66b | ||
|
|
3be1a7afaa | ||
|
|
15df3c06e8 | ||
|
|
f0af0a6b96 | ||
|
|
4cefe1357c | ||
|
|
4df0a9bf73 | ||
|
|
9ef139d020 | ||
|
|
9103d4ae05 | ||
|
|
bd63b6cefa | ||
|
|
4d03270bc3 | ||
|
|
0debcee761 | ||
|
|
6aee72c5b4 | ||
|
|
8d62cfb1b6 | ||
|
|
41214236ab | ||
|
|
b25963a63b | ||
|
|
8c6ef21d84 | ||
|
|
f729b1625b | ||
|
|
0ffaa09c95 | ||
|
|
f6e31b7e89 | ||
|
|
49b2b12e04 | ||
|
|
7ad3969690 | ||
|
|
af089a65ae | ||
|
|
48422dd442 | ||
|
|
fed6a8b669 | ||
|
|
82e0253a62 | ||
|
|
a7f26dca60 | ||
|
|
459ef27f3f | ||
|
|
464cfa5ccb | ||
|
|
9289881a80 | ||
|
|
34033cd454 | ||
|
|
47c21c9579 | ||
|
|
3b0bcf0b66 | ||
|
|
c4a8308027 | ||
|
|
e9f76dcaf2 | ||
|
|
21b2229b2b | ||
|
|
11aa9c9e68 | ||
|
|
9f4680e9bd | ||
|
|
04443a3820 | ||
|
|
1571cc58ac | ||
|
|
dea80cf946 | ||
|
|
91dec044c4 | ||
|
|
8cf4267d87 | ||
|
|
0ee7cab6c6 | ||
|
|
74c2039bfb | ||
|
|
66088837cd | ||
|
|
07ebf8534a | ||
|
|
fce4cfba15 | ||
|
|
af52833ca0 | ||
|
|
9fdf756375 | ||
|
|
283bbb385c | ||
|
|
8c6b2edb25 | ||
|
|
6ab30f9b87 | ||
|
|
3d93285bdf | ||
|
|
7261cd28f2 | ||
|
|
33eeb8ce44 | ||
|
|
ebda94ca98 | ||
|
|
40b17cff8f | ||
|
|
7ba0ebba11 | ||
|
|
b39087027c | ||
|
|
e65974c870 | ||
|
|
b1e5d68d97 | ||
|
|
39bca074d7 | ||
|
|
b5e79f9dc5 | ||
|
|
613b96819f | ||
|
|
57c24670ea | ||
|
|
d79dd94019 | ||
|
|
fa8e7458e1 | ||
|
|
4d66191963 | ||
|
|
7e9d67002e | ||
|
|
ffbb6e5937 | ||
|
|
535b85cf90 | ||
|
|
8dc9872ed5 | ||
|
|
f37a53cc25 | ||
|
|
9cce28c64c | ||
|
|
3ca94363ec | ||
|
|
9dd882ecf8 | ||
|
|
0bbb14eb9b | ||
|
|
050f287ec4 | ||
|
|
e6f5561785 | ||
|
|
2df91f4b37 | ||
|
|
7db49b9067 | ||
|
|
7c497bdc89 | ||
|
|
1aa4247d2b | ||
|
|
1ffa9ff51f | ||
|
|
435b53f1a0 | ||
|
|
406bdfad0d | ||
|
|
acba544e6f | ||
|
|
5d93c64ee5 | ||
|
|
de10bc8803 | ||
|
|
36f5c1722d | ||
|
|
a8280522e5 | ||
|
|
05d65dfdd3 | ||
|
|
a3962e3b47 | ||
|
|
cd231cf829 | ||
|
|
9fafc1692d | ||
|
|
7648d0436c | ||
|
|
bff8747e38 | ||
|
|
d227c0c097 | ||
|
|
9ccde60521 | ||
|
|
b84a40666c | ||
|
|
e72b135a4c | ||
|
|
2235d8f5a2 | ||
|
|
6e20a50a4b | ||
|
|
89d9ca045a | ||
|
|
4b95ee92eb | ||
|
|
d481ac6cc6 | ||
|
|
e5a91296b5 | ||
|
|
d8d10a0685 | ||
|
|
6dd9ed03b1 | ||
|
|
d486c80804 | ||
|
|
dedea7c420 | ||
|
|
b78eb5de6b | ||
|
|
95aa13beb1 | ||
|
|
88ce85342c | ||
|
|
bedd40ae8b | ||
|
|
fda327b3ee | ||
|
|
ace95b6e6d | ||
|
|
26c5c28c5c | ||
|
|
81f862749d | ||
|
|
b8bf7b4132 | ||
|
|
d90121ef3b | ||
|
|
d0b7b4fb0a | ||
|
|
4acc317923 | ||
|
|
7caf5751ee | ||
|
|
1330ef3ad6 | ||
|
|
9efb21d61e | ||
|
|
6d93b8e9d8 | ||
|
|
6f527e509e | ||
|
|
6cf1d0417e | ||
|
|
19d8b0dfc2 | ||
|
|
7fa0cbf2a9 | ||
|
|
36c4bc2df2 | ||
|
|
42be0183af | ||
|
|
7961f8a664 | ||
|
|
4ca143e8af | ||
|
|
2607699664 | ||
|
|
47fa3b8556 | ||
|
|
fa0100c38b | ||
|
|
e5142c1210 | ||
|
|
5907b51c7d | ||
|
|
9e4ec4f7f3 | ||
|
|
e2161ea63d | ||
|
|
7c81f66241 | ||
|
|
60da466379 | ||
|
|
12c29b71f3 | ||
|
|
b52b108932 | ||
|
|
a357ff0205 | ||
|
|
0ece8b5894 | ||
|
|
782b257bbb | ||
|
|
ab8dcd6ede | ||
|
|
012c2f7dde | ||
|
|
87fdd8f006 | ||
|
|
7bdac02837 | ||
|
|
861567bc59 | ||
|
|
d0ff43134a | ||
|
|
3458b74fc9 | ||
|
|
a6202c4d1a | ||
|
|
3c3141796a | ||
|
|
8b8b57b09c | ||
|
|
4f30a48ecd | ||
|
|
ecbc41045c | ||
|
|
e1528d0f0c | ||
|
|
6b6d760cf1 | ||
|
|
7a4372a909 | ||
|
|
0e820a01b9 | ||
|
|
24266c238f | ||
|
|
dcc20f86e1 | ||
|
|
ec8964425a | ||
|
|
26918728df | ||
|
|
954849379b | ||
|
|
06542a2dbc | ||
|
|
59d40eac45 | ||
|
|
17cf6c56cf | ||
|
|
616e6ba351 | ||
|
|
f3cb5e0106 | ||
|
|
c89f230c99 | ||
|
|
69cd5716cd | ||
|
|
ab58f72322 | ||
|
|
ead361f665 | ||
|
|
fa6b8851ed | ||
|
|
1cc69d475d | ||
|
|
51bdd8b728 | ||
|
|
30ff488714 | ||
|
|
0707141998 | ||
|
|
cc861d6b70 | ||
|
|
de4e9c54f6 | ||
|
|
da671cd232 | ||
|
|
1d9696e614 | ||
|
|
510f3df6b7 | ||
|
|
68292bd75f | ||
|
|
42423bff41 | ||
|
|
c3d2a25229 | ||
|
|
cf1a9c1548 | ||
|
|
51ba245e10 | ||
|
|
39b4e61837 | ||
|
|
ceaf53fdb0 | ||
|
|
f93276c64f | ||
|
|
62a0f0c0f5 | ||
|
|
793aca6b8b | ||
|
|
1fcaf3a4bf | ||
|
|
afeef94900 | ||
|
|
860d9c4f29 | ||
|
|
4393191166 | ||
|
|
88daad524e | ||
|
|
66c58f8155 | ||
|
|
7bbb5be910 | ||
|
|
0dcb65bd56 | ||
|
|
2784b0f438 | ||
|
|
6484855139 | ||
|
|
771469b834 | ||
|
|
a60618b0ca | ||
|
|
3d21faaac2 | ||
|
|
f325eeb95b | ||
|
|
4c3fd42b1c | ||
|
|
c2309efd7e | ||
|
|
4ae1819645 | ||
|
|
a38f208135 | ||
|
|
d1eb837890 | ||
|
|
153201542b | ||
|
|
9137e50043 | ||
|
|
8dbe119a73 | ||
|
|
26f96d0be8 | ||
|
|
9944e6faf0 | ||
|
|
c1573c1f76 | ||
|
|
9f45ad4d2e | ||
|
|
fccc91e923 | ||
|
|
a510b276e6 | ||
|
|
6481094638 | ||
|
|
3132e12265 | ||
|
|
12af3f79d0 | ||
|
|
4835617b16 | ||
|
|
9283108240 | ||
|
|
515eaeeb1a | ||
|
|
5095fc6a64 | ||
|
|
7eedb33d50 | ||
|
|
47f78df497 | ||
|
|
74154b26a2 | ||
|
|
0c3c26b7b8 | ||
|
|
64417ef4ff | ||
|
|
f3b254e335 | ||
|
|
f27119a712 | ||
|
|
2a51d0f1e5 | ||
|
|
9156e21727 | ||
|
|
a5145be16e | ||
|
|
b104a59b10 | ||
|
|
04dbbabc03 | ||
|
|
19cc0177b8 | ||
|
|
77cd106795 | ||
|
|
71869a116d | ||
|
|
2f2bde9856 | ||
|
|
7de8838deb | ||
|
|
9bf88bbf14 | ||
|
|
35593b8574 |
174
.github/workflows/generate-changelog.yml
vendored
Normal file
174
.github/workflows/generate-changelog.yml
vendored
Normal file
@@ -0,0 +1,174 @@
|
||||
name: Generate Changelog for Release
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
version:
|
||||
description: "Release version (e.g., 0.0.97)"
|
||||
required: true
|
||||
type: string
|
||||
date:
|
||||
description: "Release date (YYYY-MM-DD format, defaults to today)"
|
||||
required: false
|
||||
type: string
|
||||
default: ""
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
pull-requests: write
|
||||
|
||||
jobs:
|
||||
generate-changelog:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.12"
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v4
|
||||
with:
|
||||
enable-cache: true
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
uv sync --group dev
|
||||
|
||||
- name: Set release date
|
||||
id: set_date
|
||||
run: |
|
||||
if [ -z "${{ inputs.date }}" ]; then
|
||||
RELEASE_DATE=$(date +%Y-%m-%d)
|
||||
echo "Using today's date: $RELEASE_DATE"
|
||||
else
|
||||
RELEASE_DATE="${{ inputs.date }}"
|
||||
echo "Using provided date: $RELEASE_DATE"
|
||||
fi
|
||||
echo "release_date=$RELEASE_DATE" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Validate inputs
|
||||
run: |
|
||||
# Validate version format (basic check)
|
||||
if ! [[ "${{ inputs.version }}" =~ ^[0-9]+\.[0-9]+\.[0-9]+.*$ ]]; then
|
||||
echo "Error: Version must be in format X.Y.Z (e.g., 0.0.97)"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Validate date format if provided
|
||||
if [ -n "${{ inputs.date }}" ]; then
|
||||
if ! date -d "${{ inputs.date }}" >/dev/null 2>&1; then
|
||||
# Try macOS date format
|
||||
if ! date -j -f "%Y-%m-%d" "${{ inputs.date }}" >/dev/null 2>&1; then
|
||||
echo "Error: Date must be in YYYY-MM-DD format (e.g., 2025-12-04)"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
- name: Check for changelog fragments
|
||||
id: check_fragments
|
||||
run: |
|
||||
FRAGMENT_COUNT=$(find changelog -name "*.md" ! -name "_template.md.j2" | wc -l | tr -d ' ')
|
||||
echo "fragment_count=$FRAGMENT_COUNT" >> $GITHUB_OUTPUT
|
||||
|
||||
if [ "$FRAGMENT_COUNT" -eq "0" ]; then
|
||||
echo "❌ Error: No changelog fragments found in changelog/"
|
||||
echo ""
|
||||
echo "Cannot create a release without changelog entries."
|
||||
echo "Add changelog fragments to the changelog/ directory (e.g., 1234.added.md) and try again."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Validate fragment types
|
||||
VALID_TYPES="added changed deprecated removed fixed security"
|
||||
INVALID_FRAGMENTS=""
|
||||
|
||||
for file in changelog/*.md; do
|
||||
# Skip template
|
||||
if [[ "$file" == "changelog/_template.md.j2" ]]; then
|
||||
continue
|
||||
fi
|
||||
|
||||
# Extract type from filename (e.g., 1234.added.md -> added)
|
||||
filename=$(basename "$file")
|
||||
# Handle both 1234.added.md and 1234.added.2.md patterns
|
||||
type=$(echo "$filename" | sed -E 's/^[0-9]+\.([a-z]+)(\.[0-9]+)?\.md$/\1/')
|
||||
|
||||
# Check if type is valid
|
||||
if ! echo "$VALID_TYPES" | grep -wq "$type"; then
|
||||
INVALID_FRAGMENTS="$INVALID_FRAGMENTS\n - $filename (type: '$type')"
|
||||
fi
|
||||
done
|
||||
|
||||
if [ -n "$INVALID_FRAGMENTS" ]; then
|
||||
echo "❌ Error: Invalid changelog fragment types found:"
|
||||
echo -e "$INVALID_FRAGMENTS"
|
||||
echo ""
|
||||
echo "Valid types are: $VALID_TYPES"
|
||||
echo "Example: 1234.added.md, 5678.fixed.md"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "✓ Found $FRAGMENT_COUNT changelog fragment(s)"
|
||||
echo "has_fragments=true" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Preview changelog
|
||||
run: |
|
||||
echo "## Preview of changelog for version ${{ inputs.version }}"
|
||||
echo ""
|
||||
uv run towncrier build --draft --version "${{ inputs.version }}" --date "${{ steps.set_date.outputs.release_date }}"
|
||||
|
||||
- name: Build changelog
|
||||
run: |
|
||||
uv run towncrier build --version "${{ inputs.version }}" --date "${{ steps.set_date.outputs.release_date }}" --yes
|
||||
|
||||
- name: Create Pull Request
|
||||
uses: peter-evans/create-pull-request@v7
|
||||
with:
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
commit-message: "Update changelog for version ${{ inputs.version }}"
|
||||
title: "Release ${{ inputs.version }} - Changelog Update"
|
||||
body: |
|
||||
## Changelog Update for Release ${{ inputs.version }}
|
||||
|
||||
This PR updates the CHANGELOG.md with all changes for version **${{ inputs.version }}**.
|
||||
|
||||
### Summary
|
||||
- **Version:** ${{ inputs.version }}
|
||||
- **Date:** ${{ steps.set_date.outputs.release_date }}
|
||||
- **Fragments processed:** ${{ steps.check_fragments.outputs.fragment_count }}
|
||||
|
||||
### What this PR does
|
||||
- ✅ Adds new release section to CHANGELOG.md
|
||||
- ✅ Removes processed changelog fragments
|
||||
- ✅ Ready to merge for release
|
||||
|
||||
### Next Steps
|
||||
1. Review the changelog entries below
|
||||
2. Make any necessary edits to CHANGELOG.md if needed
|
||||
3. Merge this PR
|
||||
4. Continue with your release process
|
||||
|
||||
---
|
||||
|
||||
<details>
|
||||
<summary>📋 Preview of changes</summary>
|
||||
|
||||
The changelog has been updated with entries from the following fragments:
|
||||
|
||||
```bash
|
||||
${{ steps.check_fragments.outputs.fragment_count }} fragments processed
|
||||
```
|
||||
|
||||
</details>
|
||||
branch: changelog-${{ inputs.version }}
|
||||
delete-branch: true
|
||||
labels: |
|
||||
changelog
|
||||
release
|
||||
1
.github/workflows/python-compatibility.yaml
vendored
1
.github/workflows/python-compatibility.yaml
vendored
@@ -50,7 +50,6 @@ jobs:
|
||||
run: |
|
||||
uv sync --group dev --all-extras \
|
||||
--no-extra krisp \
|
||||
--no-extra ultravox \
|
||||
--no-extra local-smart-turn \
|
||||
--no-extra moondream \
|
||||
--no-extra mlx-whisper
|
||||
|
||||
@@ -11,7 +11,7 @@ build:
|
||||
jobs:
|
||||
post_install:
|
||||
- pip install uv
|
||||
- UV_PROJECT_ENVIRONMENT=$READTHEDOCS_VIRTUALENV_PATH uv sync --group docs --all-extras --no-extra krisp --no-extra gstreamer --no-extra ultravox --no-extra local_smart_turn --no-extra moondream --no-extra riva --no-extra mlx-whisper
|
||||
- UV_PROJECT_ENVIRONMENT=$READTHEDOCS_VIRTUALENV_PATH uv sync --group docs --all-extras --no-extra krisp --no-extra gstreamer --no-extra local_smart_turn --no-extra moondream --no-extra riva --no-extra mlx-whisper
|
||||
|
||||
sphinx:
|
||||
configuration: docs/api/conf.py
|
||||
|
||||
692
CHANGELOG.md
692
CHANGELOG.md
@@ -5,22 +5,695 @@ All notable changes to **Pipecat** will be documented in this file.
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [Unreleased]
|
||||
<!-- towncrier release notes start -->
|
||||
|
||||
## [0.0.98] - 2025-12-17
|
||||
|
||||
### Added
|
||||
|
||||
- Added `RimeNonJsonTTSService` which supports non-JSON streaming mode. This
|
||||
new class supports websocket streaming for the Arcana model.
|
||||
(PR [#3085](https://github.com/pipecat-ai/pipecat/pull/3085))
|
||||
|
||||
- Added additional functionality related to "thinking", for Google and
|
||||
Anthropic LLMs.
|
||||
|
||||
1. New typed parameters for Google and Anthropic LLMs that control the
|
||||
models' thinking behavior (like how much thinking to do, and whether to
|
||||
output thoughts or thought summaries):
|
||||
- `AnthropicLLMService.ThinkingConfig`
|
||||
- `GoogleLLMService.ThinkingConfig`
|
||||
2. New frames for representing thoughts output by LLMs:
|
||||
- `LLMThoughtStartFrame`
|
||||
- `LLMThoughtTextFrame`
|
||||
- `LLMThoughtEndFrame`
|
||||
3. A generic mechanism for recording LLM thoughts to context, used
|
||||
specifically to support Anthropic, whose thought signatures are expected
|
||||
to appear alongside the text of the thoughts within assistant context
|
||||
messages. See:
|
||||
- `LLMThoughtEndFrame.signature`
|
||||
- `LLMAssistantAggregator` handling of the above field
|
||||
- `AnthropicLLMAdapter` handling of `"thought"` context messages
|
||||
4. Google-specific logic for inserting thought signatures into the context,
|
||||
to help maintain thinking continuity in a chain of LLM calls. See:
|
||||
- `GoogleLLMService` sending `LLMMessagesAppendFrame`s to add
|
||||
LLM-specific
|
||||
`"thought_signature"` messages to context
|
||||
- `GeminiLLMAdapter` handling of `"thought_signature"` messages
|
||||
5. An expansion of `TranscriptProcessor` to process LLM thoughts in
|
||||
addition to user and assistant utterances. See:
|
||||
- `TranscriptProcessor(process_thoughts=True)` (defaults to `False`)
|
||||
- `ThoughtTranscriptionMessage`, which is now also emitted with the
|
||||
`"on_transcript_update"` event
|
||||
(PR [#3175](https://github.com/pipecat-ai/pipecat/pull/3175))
|
||||
|
||||
- Data and control frames can now be marked as non-interruptible by using the
|
||||
`UninterruptibleFrame` mixin. Frames marked as `UninterruptibleFrame` will
|
||||
not be interrupted during processing, and any queued frames of this type will
|
||||
be retained in the internal queues. This is useful when you need ordered
|
||||
frames (data or control) that should not be discarded or cancelled due to
|
||||
interruptions.
|
||||
(PR [#3189](https://github.com/pipecat-ai/pipecat/pull/3189))
|
||||
|
||||
- Added `on_conversation_detected` event to `VoicemaiDetector`.
|
||||
(PR [#3207](https://github.com/pipecat-ai/pipecat/pull/3207))
|
||||
|
||||
- Added `x-goog-api-client` header with Pipecat's version to all Google
|
||||
services' requests.
|
||||
(PR [#3208](https://github.com/pipecat-ai/pipecat/pull/3208))
|
||||
|
||||
- Added support for the HeyGen LiveAvatar API (see https://www.liveavatar.com/).
|
||||
(PR [#3210](https://github.com/pipecat-ai/pipecat/pull/3210))
|
||||
|
||||
- Added to `AWSNovaSonicLLMService` functionality related to the new (and now
|
||||
default) Nova 2 Sonic model (`"amazon.nova-2-sonic-v1:0"`):
|
||||
|
||||
- Added the `endpointing_sensitivity` parameter to control how quickly the
|
||||
model decides the user has stopped speaking.
|
||||
- Made the assistant-response-trigger hack a no-op. It's only needed for
|
||||
the older Nova Sonic model.
|
||||
(PR [#3212](https://github.com/pipecat-ai/pipecat/pull/3212))
|
||||
|
||||
- [Ultravox Realtime](https://docs.ultravox.ai) is now a supported
|
||||
speech-to-speech service.
|
||||
|
||||
- Added `UltravoxRealtimeLLMService` for the integration.
|
||||
- Added `49-ultravox-realtime.py` example (with tool calling).
|
||||
(PR [#3227](https://github.com/pipecat-ai/pipecat/pull/3227))
|
||||
|
||||
- Added Daily PSTN dial-in support to the development runner with `--dialin`
|
||||
flag. This includes:
|
||||
|
||||
- `/daily-dialin-webhook` endpoint that handles incoming Daily PSTN webhooks
|
||||
- Automatic Daily room creation with SIP configuration
|
||||
- `DialinSettings` and `DailyDialinRequest` types in `pipecat.runner.types`
|
||||
for type-safe dial-in data
|
||||
- The runner now mimics Pipecat Cloud's dial-in webhook handling for local
|
||||
development
|
||||
(PR [#3235](https://github.com/pipecat-ai/pipecat/pull/3235))
|
||||
|
||||
- Add Gladia session id to logs for `GladiaSTTService`.
|
||||
(PR [#3236](https://github.com/pipecat-ai/pipecat/pull/3236))
|
||||
|
||||
- Added `InworldHttpTTSService` which uses Inworld's HTTP based TTS service in
|
||||
either streaming or non-streaming mode. Note: This class was previously named
|
||||
`InworldTTSService`.
|
||||
(PR [#3239](https://github.com/pipecat-ai/pipecat/pull/3239))
|
||||
|
||||
- Added `language_hints_strict` parameter to `SonioxSTTService` to strictly
|
||||
enforces language hints. This ensures that transcription occurs in the
|
||||
specified language.
|
||||
(PR [#3245](https://github.com/pipecat-ai/pipecat/pull/3245))
|
||||
|
||||
- Added Pipecat library version info to the `about` field in the `bot-ready`
|
||||
RTVI message.
|
||||
(PR [#3248](https://github.com/pipecat-ai/pipecat/pull/3248))
|
||||
|
||||
- Added `VisionFullResponseStartFrame`, `VisionFullResponseEndFrame` and
|
||||
`VisionTextFrame`. This are used by vision services similar to LLM
|
||||
services.
|
||||
(PR [#3252](https://github.com/pipecat-ai/pipecat/pull/3252))
|
||||
|
||||
### Changed
|
||||
|
||||
- `FunctionCallInProgressFrame` and `FunctionCallResultFrame` have changed from
|
||||
system frames to a control frame and a data frame, respectively, and are
|
||||
now both marked as `UninterruptibleFrame`.
|
||||
(PR [#3189](https://github.com/pipecat-ai/pipecat/pull/3189))
|
||||
|
||||
- `UserBotLatencyLogObserver` now uses `VADUserStartedSpeakingFrame` and
|
||||
`VADUserStoppedSpeakingFrame` to determine latency from user stopped speaking
|
||||
to bot started speaking.
|
||||
(PR [#3206](https://github.com/pipecat-ai/pipecat/pull/3206))
|
||||
|
||||
- Updated `HeyGenVideoService` and `HeyGenTransport` to support both HeyGen
|
||||
APIs (Interactive Avatar and Live Avatar).
|
||||
Using them is as simple as specifying the `service_type` when creating the
|
||||
`HeyGenVideoService` and the `HeyGenTransport`:
|
||||
|
||||
```python
|
||||
heyGen = HeyGenVideoService(
|
||||
api_key=os.getenv("HEYGEN_LIVE_AVATAR_API_KEY"),
|
||||
service_type=ServiceType.LIVE_AVATAR,
|
||||
session=session,
|
||||
)
|
||||
```
|
||||
|
||||
(PR [#3210](https://github.com/pipecat-ai/pipecat/pull/3210))
|
||||
|
||||
- Made `"amazon.nova-2-sonic-v1:0"` the new default model for
|
||||
`AWSNovaSonicLLMService`.
|
||||
(PR [#3212](https://github.com/pipecat-ai/pipecat/pull/3212))
|
||||
|
||||
- Updated the `run_inference` methods in the LLM service classes
|
||||
(`AnthropicLLMService`, `AWSBedrockLLMService`, `GoogleLLMService`, and
|
||||
`OpenAILLMService` and its base classes) to use the provided LLM
|
||||
configuration parameters.
|
||||
(PR [#3214](https://github.com/pipecat-ai/pipecat/pull/3214))
|
||||
|
||||
- Updated default models for:
|
||||
|
||||
- `GeminiLiveLLMService` to `gemini-2.5-flash-native-audio-preview-12-2025`.
|
||||
- `GeminiLiveVertexLLMService` to `gemini-live-2.5-flash-native-audio`.
|
||||
(PR [#3228](https://github.com/pipecat-ai/pipecat/pull/3228))
|
||||
|
||||
- Changed the `reason` field in `EndFrame`, `CancelFrame`, `EndTaskFrame`, and
|
||||
`CancelTaskFrame` from `str` to `Any` to indicate that it can hold values
|
||||
other than strings.
|
||||
(PR [#3231](https://github.com/pipecat-ai/pipecat/pull/3231))
|
||||
|
||||
- Updated websocket STT services to use the `WebsocketSTTService` base class.
|
||||
This base class manages the websocket connection and handles reconnects.
|
||||
Updated services:
|
||||
|
||||
- `AssemblyAISTTService`
|
||||
- `AWSTranscribeSTTService`
|
||||
- `GladiaSTTService`
|
||||
- `SonioxSTTService`
|
||||
(PR [#3236](https://github.com/pipecat-ai/pipecat/pull/3236))
|
||||
|
||||
- Changed Inworld's TTS service implementations:
|
||||
|
||||
- Previously, the HTTP implementation was named `InworldTTSService`. That
|
||||
has been moved to `InworldHttpTTSService`. This service now supports
|
||||
word-timestamp alignment data in both streaming and non-streaming modes.
|
||||
- Updated the `InworldTTSService` class to use Inworld's Websocket API.
|
||||
This class now has support for word-timestamp alignment data and tracks
|
||||
contexts for each user turn.
|
||||
(PR [#3239](https://github.com/pipecat-ai/pipecat/pull/3239))
|
||||
|
||||
- ⚠️ Breaking change: `WordTTSService.start_word_timestamps()` and
|
||||
`WordTTSService.reset_word_timestamps()` are now async.
|
||||
(PR [#3240](https://github.com/pipecat-ai/pipecat/pull/3240))
|
||||
|
||||
- Updated the current RTVI version to 1.1.0 to reflect recent additions and
|
||||
deprecations.
|
||||
|
||||
- New RTVI Messages: `send-text` and `bot-output`
|
||||
- Deprecated Messages: `append-to-context` and `bot-transcription`
|
||||
(PR [#3248](https://github.com/pipecat-ai/pipecat/pull/3248))
|
||||
|
||||
- `MoondreamService` now pushes `VisionFullResponseStartFrame`,
|
||||
`VisionFullResponseEndFrame` and `VisionTextFrame`.
|
||||
(PR [#3252](https://github.com/pipecat-ai/pipecat/pull/3252))
|
||||
|
||||
### Deprecated
|
||||
|
||||
- `FalSmartTurnAnalyzer` and `LocalSmartTurnAnalyzer` are deprecated and will
|
||||
be removed in a future version. Use `LocalSmartTurnAnalyzerV3` instead.
|
||||
(PR [#3219](https://github.com/pipecat-ai/pipecat/pull/3219))
|
||||
|
||||
### Removed
|
||||
|
||||
- Removed the deprecated VLLM-based open source Ultravox STT service.
|
||||
(PR [#3227](https://github.com/pipecat-ai/pipecat/pull/3227))
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed a bug in `AWSNovaSonicLLMService` where we would mishandle cancelled
|
||||
tool calls in the context, resulting in errors.
|
||||
(PR [#3212](https://github.com/pipecat-ai/pipecat/pull/3212))
|
||||
|
||||
- Better support conversation history with Gemini 2.5 Flash Image (model
|
||||
"gemini-2.5-flash-image"). Prior to this fix, the model had no memory of
|
||||
previous images it had generated, so it wouldn't be able to iterate on
|
||||
them.
|
||||
(PR [#3224](https://github.com/pipecat-ai/pipecat/pull/3224))
|
||||
|
||||
- Support conversations with Gemini 3 Pro Image (model
|
||||
"gemini-3-pro-image-preview"). Prior to this fix, after the model generated
|
||||
an image the conversation would not be able to progress.
|
||||
(PR [#3224](https://github.com/pipecat-ai/pipecat/pull/3224))
|
||||
|
||||
- Fixed an issue where `ElevenLabsHttpTTSService` was not updating
|
||||
voice settings when receiving a `TTSUpdateSettingsFrame`.
|
||||
(PR [#3226](https://github.com/pipecat-ai/pipecat/pull/3226))
|
||||
|
||||
- Fixed the return type for `SmallWebRTCRequestHandler.handle_web_request()`
|
||||
function.
|
||||
(PR [#3230](https://github.com/pipecat-ai/pipecat/pull/3230))
|
||||
|
||||
- Fix a bug in LLM context audio content handling
|
||||
(PR [#3234](https://github.com/pipecat-ai/pipecat/pull/3234))
|
||||
|
||||
- In `GladiaSTTService`, reset the `_bytes_sent` counter on connecting the
|
||||
websocket. This avoids unnecessary audio buffer trimming.
|
||||
(PR [#3236](https://github.com/pipecat-ai/pipecat/pull/3236))
|
||||
|
||||
- Fixed a TTS service word-timestamp issue that could cause generated
|
||||
`TTSTextFrame` instances to have an incorrect pts (`pts = -1`).
|
||||
(PR [#3240](https://github.com/pipecat-ai/pipecat/pull/3240))
|
||||
|
||||
- Fixed an issue in `SimpleTextAggreagtor` where spaces were not being stripped
|
||||
before returning the aggregation. This resulted in an extra space for TTS
|
||||
services that don't support word-timestamp alignment data.
|
||||
(PR [#3247](https://github.com/pipecat-ai/pipecat/pull/3247))
|
||||
|
||||
## [0.0.97] - 2025-12-05
|
||||
|
||||
### Added
|
||||
|
||||
- Added new Gradium services, `GradiumSTTService` and `GradiumTTSService`, for
|
||||
speech-to-text and text-to-speech functionality using Gradium's API.
|
||||
|
||||
- Additions for `AsyncAITTSService` and `AsyncAIHttpTTSService`:
|
||||
|
||||
- Added new `languages`: `pt`, `nl`, `ar`, `ru`, `ro`, `ja`, `he`, `hy`,
|
||||
`tr`, `hi`, `zh`.
|
||||
- Updated the default model to `asyncflow_multilingual_v1.0` for improved
|
||||
accuracy and broader language coverage.
|
||||
|
||||
- Added optional tool and tool output filters for MCP services.
|
||||
|
||||
### Changed
|
||||
|
||||
- Updated Deepgram logging to include Deepgram request IDs for improved
|
||||
debugging.
|
||||
|
||||
- Text Aggregation Improvements:
|
||||
|
||||
- **Breaking Change**: `BaseTextAggregator.aggregate()` now returns
|
||||
`AsyncIterator[Aggregation]` instead of `Optional[Aggregation]`. This
|
||||
enables the aggregator to return multiple results based on the provided
|
||||
text.
|
||||
- Refactored text aggregators to use inheritance: `SkipTagsAggregator` and
|
||||
`PatternPairAggregator` now inherit from `SimpleTextAggregator`, reusing
|
||||
the base class's sentence detection logic.
|
||||
|
||||
- Improved interruption handling to prevent bots from repeating themselves. LLM
|
||||
services that return multiple sentences in a single response (e.g.,
|
||||
`GoogleLLMService`) are now split into individual sentences before being sent
|
||||
to TTS. This ensures interruptions occur at sentence boundaries, preventing
|
||||
the bot from repeating content after being interrupted during long responses.
|
||||
|
||||
- Updated `AICFilter` to use Quail STT as the default model
|
||||
(`AICModelType.QUAIL_STT`). Quail STT is optimized for human-to-machine
|
||||
interaction (e.g., voice agents, speech-to-text) and operates at a native
|
||||
sample rate of 16 kHz with fixed enhancement parameters.
|
||||
|
||||
- If an unexpected exception is caught, or if `FrameProcessor.push_error()` is
|
||||
called with an exception, the file name and line number where the exception
|
||||
occured are now logged.
|
||||
|
||||
- Updated Smart Turn model weights to v3.1.
|
||||
|
||||
- Smart Turn analyzer now uses the full context of the turn rather than just
|
||||
the audio since VAD last triggered.
|
||||
|
||||
- Updated `CartesiaSTTService` to return the full transcription `result` in the
|
||||
`TranscriptionFrame` and `InterimTranscriptionFrame`. This provides access to
|
||||
word timestamp data.
|
||||
|
||||
- `HumeTTSService` changes:
|
||||
|
||||
- Added tracking headers (`X-Hume-Client-Name` and `X-Hume-Client-Version`)
|
||||
to all requests made by `HumeTTSService` to the Hume API for better usage
|
||||
tracking and analytics.
|
||||
- Added `stop()` and `cancel()` cleanup methods to `HumeTTSService` to
|
||||
properly close the HTTP client and prevent resource leaks.
|
||||
|
||||
### Deprecated
|
||||
|
||||
- NVIDIA Services name changes (all functionality is unchanged):
|
||||
|
||||
- `NimLLMService` is now deprecated, use `NvidiaLLMService` instead.
|
||||
- `RivaSTTService` is now deprecated, use `NvidiaSTTService` instead.
|
||||
- `RivaTTSService` is now deprecated, use `NvidiaTTSService` instead.
|
||||
- Use `uv pip install pipecat-ai[nvidia]` instead of
|
||||
`uv pip install pipecat-ai[riva]`
|
||||
|
||||
- The `noise_gate_enable` parameter in `AICFilter` is deprecated and no longer
|
||||
has any effect. Noise gating is now handled automatically by the AIC VAD
|
||||
system. Use `AICFilter.create_vad_analyzer()` for VAD functionality instead.
|
||||
|
||||
- Package `pipecat.sync` is deprecated, use `pipecat.utils.sync` instead.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed bug in `PatternPairAggregator` where pattern handlers could be called
|
||||
multiple times for `KEEP` or `AGGREGATE` patterns.
|
||||
|
||||
- Fixed sentence aggregation to correctly handle ambiguous punctuation in
|
||||
streaming text, such as currency ("$29.95") and abbreviations ("Mr. Smith").
|
||||
|
||||
- Fixed an issue in `AWSTranscribeSTTService` where the `region` arg was always
|
||||
set to `us-east-1` when providing an AWS_REGION env var.
|
||||
|
||||
- Fixed an issue in `SarvamTTSService` where the last sentence was not being
|
||||
spoken. Now, audio is flushed when the TTS services receives the
|
||||
`LLMFullResponseEndFrame` or `EndFrame`.
|
||||
|
||||
- Fixed an issue in `DeepgramTTSService` where a `TTSStoppedFrame` was
|
||||
incorrectly pushed after a functional call. This caused an issue with the
|
||||
voice-ui-kit's conversational panel rending of the LLM output after a
|
||||
function call.
|
||||
|
||||
- Fixed an issue where `LLMTextFrame.skip_tts` was being overwritten by LLM
|
||||
services.
|
||||
|
||||
- Fixed an issue that caused `WebsocketService` instances to attempt
|
||||
reconnection during shutdown.
|
||||
|
||||
- Fixed an issue in `ElevenLabsTTSService` where character usage metrics were
|
||||
only reported on the first TTS generation per turn.
|
||||
|
||||
## [0.0.96] - 2025-11-26 🦃 "Happy Thanksgiving!" 🦃
|
||||
|
||||
### Added
|
||||
|
||||
- Added `AWSBedrockAgentCoreProcessor` to support invoking an AgentCore-hosted
|
||||
agent in a Pipecat pipeline.
|
||||
|
||||
- Enhanced error handling across the framework:
|
||||
|
||||
- Added `on_error` callback to `FrameProcessor` for centralized error
|
||||
handling.
|
||||
|
||||
- Renamed `push_error(error: ErrorFrame)` to `push_error_frame(error: ErrorFrame)`
|
||||
for clarity.
|
||||
|
||||
- Added new `push_error` method for simplified error reporting:
|
||||
|
||||
```python
|
||||
async def push_error(error_msg: str,
|
||||
exception: Optional[Exception] = None,
|
||||
fatal: bool = False)
|
||||
```
|
||||
|
||||
- Standardized error logging by replacing `logger.exception` calls with
|
||||
`logger.error` throughout the codebase.
|
||||
|
||||
- Added `cache_read_input_tokens`, `cache_creation_input_tokens` and
|
||||
`reasoning_tokens` to OTel spans for LLM call
|
||||
|
||||
- Added `LiveKitRESTHelper` utility class for managing LiveKit rooms via REST API.
|
||||
|
||||
- Added `DeepgramSageMakerSTTService` which connects to a SageMaker hosted
|
||||
Deepgram STT model. Added `07c-interruptible-deepgram-sagemaker.py`
|
||||
foundational example.
|
||||
|
||||
- Added `SageMakerBidiClient` to connect to SageMaker hosted BiDi compatible
|
||||
services.
|
||||
|
||||
- Added support for `include_timestamps` and `enable_logging` in
|
||||
`ElevenLabsRealtimeSTTService`. When `include_timestamps` is enabled,
|
||||
timestamp data is included in the `TranscriptionFrame`'s `result`
|
||||
parameter.
|
||||
|
||||
- Added optional speaking rate control to `InworldTTSService`.
|
||||
|
||||
- Introduced a new `AggregatedTextFrame` type to support passing text along with
|
||||
an `aggregated_by` field to describe the type of text
|
||||
included. `TTSTextFrame`s now inherit from `AggregatedTextFrame`. With this
|
||||
inheritance, an observer can watch for `AggregatedTextFrame`s to accumlate the
|
||||
perceived output and determine whether or not the text was spoken based on if
|
||||
that frame is also a `TTSTextFrame`.
|
||||
|
||||
With this frame, the llm token stream can be transformed into custom
|
||||
composable chunks, allowing for aggregation outside the TTS service. This
|
||||
makes it possible to listen for or handle those aggregations and sets the
|
||||
stage for doing things like composing a best effort of the perceived llm
|
||||
output in a more digestable form and to do so whether or not it is processed
|
||||
by a TTS or if even a TTS exists.
|
||||
|
||||
- Introduced `LLMTextProcessor`: A new processor meant to allow customization
|
||||
for how LLMTextFrames should be aggregated and considered. It's purpose is to
|
||||
turn `LLMTextFrame`s into `AggregatedTextFrame`s. By default, a TTSService
|
||||
will still aggregate `LLMTextFrame`s by sentence for the service to
|
||||
consume. However, if you wish to override how the llm text is aggregated, you
|
||||
should no longer override the TTS's internal text_aggregator, but instead,
|
||||
insert this processor between your LLM and TTS in the pipeline.
|
||||
|
||||
- New `bot-output` RTVI message to represent what the bot actually "says".
|
||||
|
||||
- The `RTVIObserver` now emits `bot-output` messages based off the new
|
||||
`AggregatedTextFrame`s (`bot-tts-text` and `bot-llm-text` are still
|
||||
supported and generated, but `bot-transcript` is now deprecated in lieu of
|
||||
this new, more thorough, message).
|
||||
|
||||
- The new `RTVIBotOutputMessage` includes the fields:
|
||||
|
||||
- `spoken`: A boolean indicating whether the text was spoken by TTS
|
||||
|
||||
- `aggregated_by`: A string representing how the text was aggregated
|
||||
("sentence", "word", "my custom aggregation")
|
||||
|
||||
- Introduced new fields to `RTVIObserver` to support the new `bot-output`
|
||||
messaging:
|
||||
|
||||
- `bot_output_enabled`: Defaults to True. Set to false to disable bot-output
|
||||
messages.
|
||||
|
||||
- `skip_aggregator_types`: Defaults to `None`. Set to a list of strings that
|
||||
match aggregation types that should not be included in bot-output
|
||||
messages. (Ex. `credit_card`)
|
||||
|
||||
- Introduced new methods, `add_text_transformer()` and
|
||||
`remove_text_transformer()`, to `RTVIObserver` to support providing (and
|
||||
subsequently removing) callbacks for various types of aggregations (or all
|
||||
aggregations with `*`) that can modify the text before being sent as a
|
||||
`bot-output` or `tts-text` message. (Think obscuring the credit card or
|
||||
inserting extra detail the client might want that the context doesn't need.)
|
||||
|
||||
- In `MiniMaxHttpTTSService`:
|
||||
|
||||
- Added support for speech-2.6-hd and speech-2.6-turbo models
|
||||
|
||||
- Added languages: Afrikaans, Bulgarian, Catalan, Danish, Persian, Filipino,
|
||||
Hebrew, Croatian, Hungarian, Malay, Norwegian, Nynorsk, Slovak, Slovenian,
|
||||
Swedish, and Tamil
|
||||
|
||||
- Added new emotions: calm and fluent
|
||||
|
||||
- Added `enable_logging` to `SimliVideoService` input parameters. It's disabled
|
||||
by default.
|
||||
|
||||
### Changed
|
||||
|
||||
- Updated `FishAudioTTSService` default model to `s1`.
|
||||
|
||||
- Updated `DeepgramTTSService` to use Deepgram's TTS websocket API. ⚠️ This is
|
||||
a potential breaking change, which only affects you if you're self-hosting
|
||||
`DeepgramTTSService`. The new service uses Websockets and improves TTFB
|
||||
latency.
|
||||
|
||||
- Updated `daily-python` to 0.22.0.
|
||||
|
||||
- `BaseTextAggregator` changes:
|
||||
|
||||
Modified the BaseTextAggregator type so that when text gets aggregated,
|
||||
metadata can be associated with it. Currently, that just means a `type`, so
|
||||
that the aggregation can be classified or described. Changes made to support
|
||||
this:
|
||||
|
||||
- ⚠️ IMPORTANT: Aggregators are now expected to strip leading/trailing white
|
||||
space characters before returning their aggregation from `aggregation()` or
|
||||
`.text`. This way all aggregators have a consistent contract allowing
|
||||
downstream use to know how to stitch aggregations back together.
|
||||
|
||||
- Introduced a new `Aggregation` dataclass to represent both the aggregated
|
||||
`text` and a string identifying the `type` of aggregation (ex. "sentence",
|
||||
"word", "my custom aggregation")
|
||||
|
||||
- ⚠️ Breaking change: `BaseTextAggregator.text` now returns an `Aggregation`
|
||||
(instead of `str`).
|
||||
|
||||
Before:
|
||||
|
||||
```python
|
||||
aggregated_text = myAggregator.text
|
||||
```
|
||||
|
||||
Now:
|
||||
|
||||
```python
|
||||
aggregated_text = myAggregator.text.text
|
||||
```
|
||||
|
||||
- ⚠️ Breaking change: `BaseTextAggregator.aggregate()` now returns
|
||||
`Optional[Aggregation]` (instead of `Optional[str]`).
|
||||
|
||||
Before:
|
||||
|
||||
```python
|
||||
aggregation = myAggregator.aggregate(text)
|
||||
print(f"successfully aggregated text: {aggregation}")
|
||||
```
|
||||
|
||||
Now:
|
||||
|
||||
```python
|
||||
aggregation = myAggregator.aggregate(text)
|
||||
if aggregation:
|
||||
print(f"successfully aggregated text: {aggregation.text}")
|
||||
```
|
||||
|
||||
- `SimpleTextAggregator`, `SkipTagsAggregator`, `PatternPairAggregator`
|
||||
updated to produce/consume `Aggregation` objects.
|
||||
|
||||
- All uses of the above Aggregators have been updated accordingly.
|
||||
|
||||
- Augmented the `PatternPairAggregator` so that matched patterns can be treated
|
||||
as their own aggregation, taking advantage of the new. To that end:
|
||||
|
||||
- Introduced a new, preferred version of `add_pattern` to support a new option
|
||||
for treating a match as a separate aggregation returned from
|
||||
`aggregate()`. This replaces the now deprecated `add_pattern_pair` method
|
||||
and you provide a `MatchAction` in lieu of the `remove_match` field.
|
||||
|
||||
- `MatchAction` enum: `REMOVE`, `KEEP`, `AGGREGATE`, allowing customization
|
||||
for how a match should be handled.
|
||||
|
||||
- `REMOVE`: The text along with its delimiters will be removed from the
|
||||
streaming text. Sentence aggregation will continue on as if this text
|
||||
did not exist.
|
||||
|
||||
- `KEEP`: The delimiters will be removed, but the content between them
|
||||
will be kept. Sentence aggregation will continue on with the internal
|
||||
text included.
|
||||
|
||||
- `AGGREGATE`: The delimiters will be removed and the content between will
|
||||
be treated as a separate aggregation. Any text before the start of the
|
||||
pattern will be returned early, whether or not a complete sentence was
|
||||
found. Then the pattern will be returned. Then the aggregation will
|
||||
continue on sentence matching after the closing delimiter is found. The
|
||||
content between the delimiters is not aggregated by sentence. It is
|
||||
aggregated as one single block of text.
|
||||
|
||||
- `PatternMatch` now extends `Aggregation` and provides richer info to
|
||||
handlers.
|
||||
|
||||
- ⚠️ Breaking change: The `PatternMatch` type returned to handlers registered
|
||||
via `on_pattern_match` has been updated to subclass from the new
|
||||
`Aggregation` type, which means that `content` has been replaced with
|
||||
`text` and `pattern_id` has been replaced with `type`:
|
||||
|
||||
```python
|
||||
async dev on_match_tag(match: PatternMatch):
|
||||
pattern = match.type # instead of match.pattern_id
|
||||
text = match.text # instead of match.content
|
||||
```
|
||||
|
||||
- `TextFrame` now includes the field `append_to_context` to support setting
|
||||
whether or not the encompassing text should be added to the LLM context (by
|
||||
the LLM assistant aggregator). It defaults to `True`.
|
||||
|
||||
- `TTSService` base class updates:
|
||||
|
||||
- `TTSService`s now accept a new `skip_aggregator_types` to avoid speaking
|
||||
certain aggregation types (now determined/returned by the aggregator)
|
||||
|
||||
- Introduced the ability to do a just-in-time transform of text before it gets
|
||||
sent to the TTS service via callbacks you can set up via a new init field,
|
||||
`text_transforms` or a new method `add_text_transformer()`. This makes it
|
||||
possible to do things like introduce TTS-specific tags for spelling or
|
||||
emotion or change the pronunciation of something on the
|
||||
fly. `remove_text_transformer` has also been added to support removing a
|
||||
registered transform callback.
|
||||
|
||||
- TTS services push `AggregatedTextFrame` in addition to `TTSTextFrame`s when
|
||||
either an aggregation occurs that should not be spoken or when the TTS
|
||||
service supports word-by-word timestamping. In the latter case, the
|
||||
`TTSService` preliminarily generates an `AggregatedTextFrame`, aggregated by
|
||||
sentence to generate the full sentence content as early as possible.
|
||||
|
||||
- Updated `CartesiaTTSService`:
|
||||
|
||||
- Modified use of custom default text_aggregator to avoid deprecation warnings
|
||||
and push users towards use of transformers or the `LLMTextProcessor`
|
||||
|
||||
- Added convenience methods for taking advantage of Cartesia's SSML tags:
|
||||
spell, emotion, pauses, volume, and speed.
|
||||
|
||||
- Updated `RimeTTSService`:
|
||||
|
||||
- Modified use of custom default text_aggregator to avoid deprecation warnings
|
||||
and push users towards use of transformers or the `LLMTextProcessor`
|
||||
|
||||
- Added convenience methods for taking advantage of Rime's customization
|
||||
options: spell, pauses, pronunciations, and inline speed control.
|
||||
|
||||
### Deprecated
|
||||
|
||||
- The TTS constructor field, `text_aggregator` is deprecated in favor of the new
|
||||
`LLMTextProcessor`. TTSServices still have an internal aggregator for support
|
||||
of default behavior, but if you want to override the aggregation behavior, you
|
||||
should use the new processor.
|
||||
|
||||
- The RTVI `bot-transcription` event is deprecated in favor of the new
|
||||
`bot-output` message which is the canonical representation of bot output
|
||||
(spoken or not). The code still emits a transcription message for backwards
|
||||
compatibility while transition occurs.
|
||||
|
||||
- Deprecated `add_pattern_pair` in the `PatternPairAggregator` which takes a
|
||||
`pattern_id` and `remove_match` field in favor of the new `add_pattern` method
|
||||
which takes a `type` and an `action`
|
||||
|
||||
- `english_normalization` input parameter for `MiniMaxHttpTTSService` is
|
||||
deprecated, use `test_normalization` instead.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue in `AWSBedrockLLMService` where the `aws_region` arg was
|
||||
always set to `us-east-1` when providing an AWS_REGION env var.
|
||||
|
||||
- Fixed an issue with `DeepgramFluxSTTService` where it sometimes failed to reconnect.
|
||||
|
||||
- Fixed an issue in `ElevenLabsRealtimeSTTService` where dynamic language
|
||||
updates were not working.
|
||||
|
||||
- Fixed an issue in `ElevenLabsRealtimeSTTService` where setting the sample
|
||||
rate would result in transcripts failing.
|
||||
|
||||
- Fixed `InworldTTSService` audio config payload to use camelCase keys expected
|
||||
by the Inworld API.
|
||||
|
||||
## [0.0.95] - 2025-11-18
|
||||
|
||||
### Added
|
||||
|
||||
- Added ai-coustics integrated VAD (`AICVADAnalyzer`) with `AICFilter` factory and
|
||||
example wiring; leverages the enhancement model for robust detection with no
|
||||
ONNX dependency or added processing complexity.
|
||||
|
||||
- Added a watchdog to `DeepgramFluxSTTService` to prevent dangling tasks in case the
|
||||
user was speaking and we stop receiving audio.
|
||||
|
||||
- Introduced a minimum confidence parameter in `DeepgramFluxSTTService` to avoid
|
||||
generating transcriptions below a defined threshold.
|
||||
|
||||
- Added `ElevenLabsRealtimeSTTService` which implements the Realtime STT
|
||||
service from ElevenLabs.
|
||||
|
||||
- Added a `TTSService.includes_inter_frame_spaces` property getter, so that TTS
|
||||
services that subclass `TTSService` can indicate whether the text in the
|
||||
`TTSTextFrame`s they push already contain any necessary inter-frame spaces.
|
||||
- Added word-level timestamps support to Hume TTS service
|
||||
|
||||
### Changed
|
||||
|
||||
- ⚠️ Breaking change: `LLMContext.create_image_message()`,
|
||||
`LLMContext.create_audio_message()`, `LLMContext.add_image_frame_message()`
|
||||
and `LLMContext.add_audio_frames_message()` are now async methods. This fixes
|
||||
an issue where the asyncio event loop would be blocked while encoding audio or
|
||||
images.
|
||||
|
||||
- `ConsumerProcessor` now queues frames from the producer internally instead of
|
||||
pushing them directly. This allows us to subclass consumer processors and
|
||||
manipulate frames before they are pushed.
|
||||
|
||||
- `BaseTextFilter` only require subclasses to implement the `filter()` method.
|
||||
|
||||
- Extracted the logic for retrying connections, and create a new `send_with_retry`
|
||||
method inside `WebSocketService`.
|
||||
|
||||
- Refactored `DeepgramFluxSTTService` to automatically reconnect if sending a
|
||||
message fails.
|
||||
|
||||
- Updated all STT and TTS services to use consistent error handling pattern with
|
||||
`push_error()` method for better pipeline error event integration.
|
||||
|
||||
- Added support for `maybe_capture_participant_camera()` and
|
||||
`maybe_capture_participant_screen()` for `SmallWebRTCTransport` in the runner
|
||||
utils.
|
||||
|
||||
- Added Hindi support for Rime TTS services.
|
||||
|
||||
- Updated `GeminiTTSService` to use Google Cloud Text-to-Speech streaming API
|
||||
@@ -40,6 +713,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed a `SimliVideoService` connection issue.
|
||||
|
||||
- Fixed an issue in the `Runner` where, when using `SmallWebRTCTransport`, the
|
||||
`request_data` was not being passed to the `SmallWebRTCRunnerArguments` body.
|
||||
|
||||
- Fixed subtle issue of assistant context messages ending up with double spaces
|
||||
between words or sentences.
|
||||
|
||||
@@ -54,12 +732,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
- Prevented `HeyGenVideoService` from automatically disconnecting after 5 minutes.
|
||||
|
||||
### Added
|
||||
|
||||
- Added ai-coustics integrated VAD (`AICVADAnalyzer`) with `AICFilter` factory and
|
||||
example wiring; leverages the enhancement model for robust detection with no
|
||||
ONNX dependency or added processing complexity.
|
||||
|
||||
## [0.0.94] - 2025-11-10
|
||||
|
||||
### Changed
|
||||
|
||||
@@ -79,7 +79,7 @@ Once your PR is submitted, post in the `#community-integrations` Discord channel
|
||||
|
||||
**Examples:**
|
||||
|
||||
- [RivaSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/riva/stt.py)
|
||||
- [NvidiaSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/nvidia/stt.py)
|
||||
- [FalSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/fal/stt.py)
|
||||
|
||||
#### Key requirements:
|
||||
|
||||
105
CONTRIBUTING.md
105
CONTRIBUTING.md
@@ -17,24 +17,121 @@ We welcome contributions of all kinds! Your help is appreciated. Follow these st
|
||||
git checkout -b your-branch-name
|
||||
```
|
||||
4. **Make your changes**: Edit or add files as necessary.
|
||||
5. **Test your changes**: Ensure that your changes look correct and follow the style set in the codebase.
|
||||
6. **Commit your changes**: Once you're satisfied with your changes, commit them with a meaningful message.
|
||||
5. **Add a changelog entry**: Create a changelog fragment file (see [Changelog Entries](#changelog-entries) below).
|
||||
6. **Test your changes**: Ensure that your changes look correct and follow the style set in the codebase.
|
||||
7. **Commit your changes**: Once you're satisfied with your changes, commit them with a meaningful message.
|
||||
|
||||
```bash
|
||||
git commit -m "Description of your changes"
|
||||
```
|
||||
|
||||
7. **Push your changes**: Push your branch to your forked repository.
|
||||
8. **Push your changes**: Push your branch to your forked repository.
|
||||
|
||||
```bash
|
||||
git push origin your-branch-name
|
||||
```
|
||||
|
||||
8. **Submit a Pull Request (PR)**: Open a PR from your forked repository to the main branch of this repo.
|
||||
9. **Submit a Pull Request (PR)**: Open a PR from your forked repository to the main branch of this repo.
|
||||
> Important: Describe the changes you've made clearly!
|
||||
|
||||
Our maintainers will review your PR, and once everything is good, your contributions will be merged!
|
||||
|
||||
## Changelog Entries
|
||||
|
||||
Every pull request that makes a user-facing change should include a changelog entry. We use a changelog fragment system to avoid merge conflicts.
|
||||
|
||||
### Creating a Changelog Fragment
|
||||
|
||||
1. Create a new file in the `changelog/` directory with this naming pattern:
|
||||
|
||||
```
|
||||
<PR_number>.<type>.md
|
||||
```
|
||||
|
||||
2. Choose the appropriate type:
|
||||
|
||||
- `added.md` - New features
|
||||
- `changed.md` - Changes in existing functionality
|
||||
- `deprecated.md` - Soon-to-be removed features
|
||||
- `removed.md` - Removed features
|
||||
- `fixed.md` - Bug fixes
|
||||
- `security.md` - Security fixes
|
||||
|
||||
3. Write your changelog entry as a Markdown bullet point. Include the `-` at the start:
|
||||
|
||||
**Example files:**
|
||||
|
||||
`changelog/1234.added.md`:
|
||||
|
||||
```markdown
|
||||
- Added support for Anthropic Claude 3.5 Sonnet with improved streaming performance.
|
||||
```
|
||||
|
||||
`changelog/5678.fixed.md`:
|
||||
|
||||
```markdown
|
||||
- Fixed an issue where audio frames were dropped during high-load scenarios.
|
||||
```
|
||||
|
||||
**For entries with nested bullets:**
|
||||
|
||||
`changelog/1234.changed.md`:
|
||||
|
||||
```markdown
|
||||
- Updated service configuration:
|
||||
|
||||
- Changed default timeout to 30 seconds
|
||||
- Added retry logic for failed connections
|
||||
```
|
||||
|
||||
### Multiple Changes in One PR
|
||||
|
||||
**Different types of changes:** Create separate fragment files for each type:
|
||||
|
||||
```
|
||||
changelog/1234.added.md
|
||||
changelog/1234.fixed.md
|
||||
```
|
||||
|
||||
**Multiple changes of the same type:** Create numbered fragment files:
|
||||
|
||||
```
|
||||
changelog/1234.changed.md
|
||||
changelog/1234.changed.2.md
|
||||
```
|
||||
|
||||
**Related changes:** Use nested bullets in a single fragment:
|
||||
|
||||
```markdown
|
||||
- Updated service configuration:
|
||||
|
||||
- Changed default timeout to 30 seconds
|
||||
- Added retry logic for failed connections
|
||||
```
|
||||
|
||||
**Rule of thumb:** One logical change per fragment file. If changes are unrelated, use separate files.
|
||||
|
||||
### Preview Your Changes
|
||||
|
||||
To see what your changelog entry will look like:
|
||||
|
||||
```bash
|
||||
towncrier build --draft --version Unreleased
|
||||
```
|
||||
|
||||
This won't modify any files, just show you a preview.
|
||||
|
||||
### When to Skip Changelog Entries
|
||||
|
||||
You can skip adding a changelog entry for:
|
||||
|
||||
- Documentation-only changes
|
||||
- Internal refactoring with no user-facing impact
|
||||
- Test-only changes
|
||||
- CI/build configuration changes
|
||||
|
||||
If you're unsure whether your change needs a changelog entry, ask in your PR!
|
||||
|
||||
## Dependency Management
|
||||
|
||||
This project uses [uv](https://docs.astral.sh/uv/) for dependency management. The `uv.lock` file is committed to ensure reproducible builds.
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
</div></h1>
|
||||
|
||||
[](https://pypi.org/project/pipecat-ai)  [](https://codecov.io/gh/pipecat-ai/pipecat) [](https://docs.pipecat.ai) [](https://discord.gg/pipecat) [](https://deepwiki.com/pipecat-ai/pipecat)
|
||||
[](https://getmanta.ai/pipecat)
|
||||
|
||||
# 🎙️ Pipecat: Real-Time Voice & Multimodal AI Agents
|
||||
|
||||
@@ -74,10 +73,10 @@ Catch new features, interviews, and how-tos on our [Pipecat TV](https://www.yout
|
||||
|
||||
| Category | Services |
|
||||
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Sarvam](https://docs.pipecat.ai/server/services/stt/sarvam), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Gradium](https://docs.pipecat.ai/server/services/stt/gradium), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Sarvam](https://docs.pipecat.ai/server/services/stt/sarvam), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
|
||||
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [Speechmatics](https://docs.pipecat.ai/server/services/tts/speechmatics), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) |
|
||||
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Gradium](https://docs.pipecat.ai/server/services/tts/gradium), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [Speechmatics](https://docs.pipecat.ai/server/services/tts/speechmatics), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai), Ultravox, |
|
||||
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
|
||||
| Serializers | [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx) |
|
||||
| Video | [HeyGen](https://docs.pipecat.ai/server/services/video/heygen), [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) |
|
||||
@@ -154,7 +153,6 @@ You can get started with Pipecat running on your local machine, then move your a
|
||||
--no-extra gstreamer \
|
||||
--no-extra krisp \
|
||||
--no-extra local \
|
||||
--no-extra ultravox # (ultravox not fully supported on macOS)
|
||||
```
|
||||
|
||||
3. Install the git pre-commit hooks:
|
||||
|
||||
16
changelog/_template.md.j2
Normal file
16
changelog/_template.md.j2
Normal file
@@ -0,0 +1,16 @@
|
||||
{% for section, _ in sections.items() %}
|
||||
{% if sections[section] %}
|
||||
{% for category, val in definitions.items() if category in sections[section]%}
|
||||
### {{ definitions[category]['name'] }}
|
||||
|
||||
{% for text, values in sections[section][category].items() %}
|
||||
{{ text }}
|
||||
(PR {{ values|join(', ') }})
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
No significant changes.
|
||||
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
# Build docs using uv
|
||||
echo "Installing dependencies with uv..."
|
||||
uv sync --group docs --all-extras --no-extra krisp --no-extra gstreamer --no-extra ultravox --no-extra local_smart_turn --no-extra moondream --no-extra riva --no-extra mlx-whisper
|
||||
uv sync --group docs --all-extras --no-extra krisp --no-extra gstreamer --no-extra local_smart_turn --no-extra moondream --no-extra riva --no-extra mlx-whisper
|
||||
|
||||
# Check if sphinx-build is available
|
||||
if ! uv run sphinx-build --version &> /dev/null; then
|
||||
@@ -24,4 +24,4 @@ if [ $? -eq 0 ]; then
|
||||
else
|
||||
echo "Documentation build failed!" >&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
@@ -61,9 +61,6 @@ autodoc_mock_imports = [
|
||||
# OpenCV - sometimes has import issues during docs build
|
||||
"cv2",
|
||||
# Heavy ML packages excluded from ReadTheDocs
|
||||
# ultravox dependencies
|
||||
"vllm",
|
||||
"vllm.engine.arg_utils",
|
||||
# local-smart-turn dependencies
|
||||
"coremltools",
|
||||
"coremltools.models",
|
||||
@@ -119,7 +116,6 @@ def import_core_modules():
|
||||
"pipecat.observers",
|
||||
"pipecat.runner",
|
||||
"pipecat.serializers",
|
||||
"pipecat.sync",
|
||||
"pipecat.transcriptions",
|
||||
"pipecat.utils",
|
||||
]
|
||||
|
||||
@@ -30,7 +30,6 @@ Quick Links
|
||||
Runner <api/pipecat.runner>
|
||||
Serializers <api/pipecat.serializers>
|
||||
Services <api/pipecat.services>
|
||||
Sync <api/pipecat.sync>
|
||||
Transcriptions <api/pipecat.transcriptions>
|
||||
Transports <api/pipecat.transports>
|
||||
Utils <api/pipecat.utils>
|
||||
Utils <api/pipecat.utils>
|
||||
|
||||
10
env.example
10
env.example
@@ -44,6 +44,7 @@ DAILY_SAMPLE_ROOM_URL=https://...
|
||||
|
||||
# Deepgram
|
||||
DEEPGRAM_API_KEY=...
|
||||
SAGEMAKER_ENDPOINT_NAME=...
|
||||
|
||||
# DeepSeek
|
||||
DEEPSEEK_API_KEY=...
|
||||
@@ -72,6 +73,9 @@ GOOGLE_CLOUD_PROJECT_ID=...
|
||||
GOOGLE_CLOUD_LOCATION=...
|
||||
GOOGLE_TEST_CREDENTIALS=...
|
||||
|
||||
# Gradium
|
||||
GRAPDIUM_API_KEY=...
|
||||
|
||||
# Grok
|
||||
GROK_API_KEY=...
|
||||
|
||||
@@ -80,6 +84,7 @@ GROQ_API_KEY=...
|
||||
|
||||
# Heygen
|
||||
HEYGEN_API_KEY=...
|
||||
HEYGEN_LIVE_AVATAR_API_KEY=...
|
||||
|
||||
# Hume
|
||||
HUME_API_KEY=...
|
||||
@@ -186,8 +191,11 @@ TOGETHER_API_KEY=...
|
||||
TWILIO_ACCOUNT_SID=...
|
||||
TWILIO_AUTH_TOKEN=...
|
||||
|
||||
# Ultravox Realtime
|
||||
ULTRAVOX_API_KEY=...
|
||||
|
||||
# WhatsApp
|
||||
WHATSAPP_TOKEN=...
|
||||
WHATSAPP_WEBHOOK_VERIFICATION_TOKEN=...
|
||||
WHATSAPP_PHONE_NUMBER_ID=...
|
||||
WHATSAPP_APP_SECRET=...
|
||||
WHATSAPP_APP_SECRET=...
|
||||
|
||||
@@ -15,7 +15,7 @@ from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.riva.tts import FastPitchTTSService
|
||||
from pipecat.services.nvidia.tts import NvidiaTTSService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
@@ -36,7 +36,7 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
tts = FastPitchTTSService(api_key=os.getenv("NVIDIA_API_KEY"))
|
||||
tts = NvidiaTTSService(api_key=os.getenv("NVIDIA_API_KEY"))
|
||||
|
||||
task = PipelineTask(
|
||||
Pipeline([tts, transport.output()]),
|
||||
@@ -4,7 +4,6 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
|
||||
import aiohttp
|
||||
@@ -15,26 +14,26 @@ from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.frames.frames import LLMRunFrame, TTSTextFrame
|
||||
from pipecat.observers.loggers.debug_log_observer import DebugLogObserver, FrameEndpoint
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frameworks.rtvi import RTVIObserver, RTVIProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.inworld.tts import InworldTTSService
|
||||
from pipecat.services.inworld.tts import InworldHttpTTSService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_output import BaseOutputTransport
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
@@ -58,22 +57,18 @@ transport_params = {
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
logger.info("Starting bot")
|
||||
|
||||
# Create an HTTP session
|
||||
async with aiohttp.ClientSession() as session:
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
# Inworld TTS Service - Unified streaming and non-streaming
|
||||
# Set streaming=True for real-time audio, streaming=False for complete audio generation
|
||||
streaming = True # Toggle this to switch between modes
|
||||
|
||||
tts = InworldTTSService(
|
||||
tts = InworldHttpTTSService(
|
||||
api_key=os.getenv("INWORLD_API_KEY", ""),
|
||||
aiohttp_session=session,
|
||||
voice_id="Ashley",
|
||||
model="inworld-tts-1",
|
||||
streaming=streaming, # True: real-time chunks, False: complete audio then playback
|
||||
# Set to False for non-streaming mode or True for streaming mode.
|
||||
streaming=True,
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
@@ -81,22 +76,25 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are very knowledgable about dogs. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.",
|
||||
"content": "You are a helpful AI demonstrating Inworld AI's TTS. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a friendly and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
rtvi = RTVIProcessor()
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
transport.input(),
|
||||
rtvi,
|
||||
stt,
|
||||
context_aggregator.user(),
|
||||
llm,
|
||||
tts,
|
||||
transport.output(),
|
||||
context_aggregator.assistant(),
|
||||
]
|
||||
)
|
||||
|
||||
@@ -106,19 +104,27 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
observers=[
|
||||
RTVIObserver(rtvi),
|
||||
DebugLogObserver(
|
||||
frame_types={
|
||||
TTSTextFrame: (BaseOutputTransport, FrameEndpoint.SOURCE),
|
||||
}
|
||||
),
|
||||
],
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
logger.info("Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
logger.info("Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
141
examples/foundational/07ab-interruptible-inworld.py
Normal file
141
examples/foundational/07ab-interruptible-inworld.py
Normal file
@@ -0,0 +1,141 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame, TTSTextFrame
|
||||
from pipecat.observers.loggers.debug_log_observer import DebugLogObserver, FrameEndpoint
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frameworks.rtvi import RTVIConfig, RTVIObserver, RTVIProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.inworld.tts import InworldTTSService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_output import BaseOutputTransport
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info("Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = InworldTTSService(
|
||||
api_key=os.getenv("INWORLD_API_KEY", ""),
|
||||
voice_id="Ashley",
|
||||
model="inworld-tts-1",
|
||||
temperature=1.1,
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful AI demonstrating Inworld AI's TTS. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a friendly and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
rtvi = RTVIProcessor(config=RTVIConfig(config=[]))
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
rtvi,
|
||||
stt,
|
||||
context_aggregator.user(),
|
||||
llm,
|
||||
tts,
|
||||
transport.output(),
|
||||
context_aggregator.assistant(),
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
observers=[
|
||||
RTVIObserver(rtvi),
|
||||
DebugLogObserver(
|
||||
frame_types={
|
||||
TTSTextFrame: (BaseOutputTransport, FrameEndpoint.SOURCE),
|
||||
}
|
||||
),
|
||||
],
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info("Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info("Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -13,24 +13,29 @@ from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.frames.frames import LLMRunFrame, TTSTextFrame
|
||||
from pipecat.observers.loggers.debug_log_observer import DebugLogObserver, FrameEndpoint
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
)
|
||||
from pipecat.processors.frameworks.rtvi import RTVIConfig, RTVIObserver, RTVIProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.hume.tts import HUME_SAMPLE_RATE, HumeTTSService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_output import BaseOutputTransport
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
@@ -88,7 +93,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
stt,
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
tts, # TTS (HumeTTSService with word timestamps)
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
@@ -102,7 +107,14 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
audio_out_sample_rate=HUME_SAMPLE_RATE,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
observers=[RTVIObserver(rtvi)],
|
||||
observers=[
|
||||
RTVIObserver(rtvi),
|
||||
DebugLogObserver(
|
||||
frame_types={
|
||||
TTSTextFrame: (BaseOutputTransport, FrameEndpoint.SOURCE),
|
||||
}
|
||||
),
|
||||
],
|
||||
)
|
||||
|
||||
@rtvi.event_handler("on_client_ready")
|
||||
@@ -112,6 +124,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
logger.info(
|
||||
"💡 Word timestamps are enabled! Watch the console for TTSTextFrame logs showing each word with its PTS."
|
||||
)
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
@@ -14,32 +13,23 @@ from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.ultravox.stt import UltravoxSTTService
|
||||
from pipecat.services.gradium.stt import GradiumSTTService
|
||||
from pipecat.services.gradium.tts import GradiumTTSService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
# NOTE: This example requires GPU resources to run efficiently.
|
||||
# The Ultravox model is compute-intensive and performs best with GPU acceleration.
|
||||
# This can be deployed on cloud GPU providers like Cerebrium.ai for optimal performance.
|
||||
|
||||
|
||||
# Want to initialize the ultravox processor since it takes time to load the model and dont
|
||||
# want to load it every time the pipeline is run
|
||||
ultravox_processor = UltravoxSTTService(
|
||||
model_name="fixie-ai/ultravox-v0_5-llama-3_1-8b",
|
||||
hf_token=os.getenv("HF_TOKEN"),
|
||||
)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
@@ -68,17 +58,34 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.environ.get("CARTESIA_API_KEY"),
|
||||
voice_id="97f4b8fb-f2fe-444b-bb9a-c109783a857a",
|
||||
stt = GradiumSTTService(api_key=os.getenv("GRADIUM_API_KEY"))
|
||||
|
||||
tts = GradiumTTSService(
|
||||
api_key=os.getenv("GRADIUM_API_KEY"),
|
||||
voice_id="YTpq7expH9539ERJ",
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
ultravox_processor,
|
||||
stt,
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
@@ -94,6 +101,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
@@ -52,7 +52,10 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramFluxSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramFluxSTTService(
|
||||
api_key=os.getenv("DEEPGRAM_API_KEY"),
|
||||
params=DeepgramFluxSTTService.InputParams(min_confidence=0.3),
|
||||
)
|
||||
|
||||
tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-2-andromeda-en")
|
||||
|
||||
|
||||
137
examples/foundational/07c-interruptible-deepgram-sagemaker.py
Normal file
137
examples/foundational/07c-interruptible-deepgram-sagemaker.py
Normal file
@@ -0,0 +1,137 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.aws.llm import AWSBedrockLLMService
|
||||
from pipecat.services.deepgram.stt_sagemaker import DeepgramSageMakerSTTService
|
||||
from pipecat.services.deepgram.tts import DeepgramTTSService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
# Initialize Deepgram SageMaker STT Service
|
||||
# This requires:
|
||||
# - AWS credentials configured (via environment variables or AWS CLI)
|
||||
# - A deployed SageMaker endpoint with Deepgram model
|
||||
stt = DeepgramSageMakerSTTService(
|
||||
endpoint_name=os.getenv("SAGEMAKER_ENDPOINT_NAME"),
|
||||
region=os.getenv("AWS_REGION"),
|
||||
)
|
||||
|
||||
tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-2-andromeda-en")
|
||||
|
||||
llm = AWSBedrockLLMService(
|
||||
aws_region=os.getenv("AWS_REGION"),
|
||||
model="us.amazon.nova-pro-v1:0",
|
||||
params=AWSBedrockLLMService.InputParams(temperature=0.8),
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -71,9 +71,9 @@ def build_agent(model_id: str, max_tokens: int):
|
||||
@tool
|
||||
def check_weather(location: str) -> str:
|
||||
if location.lower() == "san francisco":
|
||||
return "The weather in San Francisco is sunny and 30 degrees."
|
||||
return "The weather in San Francisco is sunny and 75 degrees."
|
||||
elif location.lower() == "sydney":
|
||||
return "The weather in Sydney is cloudy and 20 degrees."
|
||||
return "The weather in Sydney is cloudy and 60 degrees."
|
||||
else:
|
||||
return "I'm not sure about the weather in that location."
|
||||
|
||||
|
||||
@@ -89,6 +89,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
llm = GoogleLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
model="gemini-2.5-flash-image",
|
||||
# model="gemini-3-pro-image-preview", # A more powerful model, but slower
|
||||
)
|
||||
|
||||
messages = [
|
||||
|
||||
@@ -136,7 +136,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
messages.append(
|
||||
{
|
||||
"role": "system",
|
||||
"content": "Hello! I'm your AI assistant. I can help you with a variety of tasks. What would you like to know?",
|
||||
"content": "You are an AI assistant. You can help with a variety of tasks. Introduce yourself and ask the user what they would like to know.",
|
||||
}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@@ -75,8 +75,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
llm = GoogleLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
model="gemini-2.5-flash",
|
||||
# turn on thinking if you want it
|
||||
# params=GoogleLLMService.InputParams(extra={"thinking_config": {"thinking_budget": 4096}}),)
|
||||
# force a certain amount of thinking if you want it
|
||||
# params=GoogleLLMService.InputParams(
|
||||
# thinking=GoogleLLMService.ThinkingConfig(thinking_budget=4096)
|
||||
# ),
|
||||
)
|
||||
|
||||
messages = [
|
||||
|
||||
@@ -75,8 +75,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
llm = GoogleLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
model="gemini-2.5-flash",
|
||||
# turn on thinking if you want it
|
||||
# params=GoogleLLMService.InputParams(extra={"thinking_config": {"thinking_budget": 4096}}),)
|
||||
# force a certain amount of thinking if you want it
|
||||
# params=GoogleLLMService.InputParams(
|
||||
# thinking=GoogleLLMService.ThinkingConfig(thinking_budget=4096)
|
||||
# ),
|
||||
)
|
||||
|
||||
messages = [
|
||||
|
||||
@@ -22,9 +22,9 @@ from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.nim.llm import NimLLMService
|
||||
from pipecat.services.riva.stt import RivaSTTService
|
||||
from pipecat.services.riva.tts import RivaTTSService
|
||||
from pipecat.services.nvidia.llm import NvidiaLLMService
|
||||
from pipecat.services.nvidia.stt import NvidiaSTTService
|
||||
from pipecat.services.nvidia.tts import NvidiaTTSService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
@@ -59,11 +59,13 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = RivaSTTService(api_key=os.getenv("NVIDIA_API_KEY"))
|
||||
stt = NvidiaSTTService(api_key=os.getenv("NVIDIA_API_KEY"))
|
||||
|
||||
llm = NimLLMService(api_key=os.getenv("NVIDIA_API_KEY"), model="meta/llama-3.1-405b-instruct")
|
||||
llm = NvidiaLLMService(
|
||||
api_key=os.getenv("NVIDIA_API_KEY"), model="meta/llama-3.1-405b-instruct"
|
||||
)
|
||||
|
||||
tts = RivaTTSService(api_key=os.getenv("NVIDIA_API_KEY"))
|
||||
tts = NvidiaTTSService(api_key=os.getenv("NVIDIA_API_KEY"))
|
||||
|
||||
messages = [
|
||||
{
|
||||
@@ -224,8 +224,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
llm = GoogleLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
model="gemini-2.5-flash",
|
||||
# turn on thinking if you want it
|
||||
# params=GoogleLLMService.InputParams(extra={"thinking_config": {"thinking_budget": 4096}}),
|
||||
# force a certain amount of thinking if you want it
|
||||
# params=GoogleLLMService.InputParams(
|
||||
# thinking=GoogleLLMService.ThinkingConfig(thinking_budget=4096)
|
||||
# ),
|
||||
)
|
||||
|
||||
tts = GoogleTTSService(
|
||||
|
||||
@@ -110,7 +110,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
# Kick off the conversation.
|
||||
image = Image.open(image_path)
|
||||
message = LLMContext.create_image_message(
|
||||
message = await LLMContext.create_image_message(
|
||||
image=image.tobytes(),
|
||||
format="RGB",
|
||||
size=image.size,
|
||||
|
||||
@@ -110,7 +110,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
# Kick off the conversation.
|
||||
image = Image.open(image_path)
|
||||
message = LLMContext.create_image_message(
|
||||
message = await LLMContext.create_image_message(
|
||||
image=image.tobytes(),
|
||||
format="RGB",
|
||||
size=image.size,
|
||||
|
||||
@@ -117,7 +117,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
# Kick off the conversation.
|
||||
image = Image.open(image_path)
|
||||
message = LLMContext.create_image_message(
|
||||
message = await LLMContext.create_image_message(
|
||||
image=image.tobytes(),
|
||||
format="RGB",
|
||||
size=image.size,
|
||||
|
||||
@@ -110,7 +110,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
# Kick off the conversation.
|
||||
image = Image.open(image_path)
|
||||
message = LLMContext.create_image_message(
|
||||
message = await LLMContext.create_image_message(
|
||||
image=image.tobytes(),
|
||||
format="RGB",
|
||||
size=image.size,
|
||||
|
||||
@@ -15,14 +15,21 @@ from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame, UserImageRequestFrame
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
LLMFullResponseEndFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMRunFrame,
|
||||
TextFrame,
|
||||
UserImageRequestFrame,
|
||||
)
|
||||
from pipecat.pipeline.parallel_pipeline import ParallelPipeline
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import (
|
||||
create_transport,
|
||||
@@ -66,6 +73,27 @@ async def fetch_user_image(params: FunctionCallParams):
|
||||
# await params.result_callback({"result": "Image is being captured."})
|
||||
|
||||
|
||||
class MoondreamTextFrameWrapper(FrameProcessor):
|
||||
"""Wraps Moondream-provided TextFrames with LLM response start/end frames.
|
||||
|
||||
This processor detects TextFrames and automatically wraps them with
|
||||
LLMFullResponseStartFrame and LLMFullResponseEndFrame to provide proper
|
||||
response boundaries for downstream processors.
|
||||
"""
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
# If we receive a TextFrame, wrap it with response start/end frames
|
||||
if isinstance(frame, TextFrame):
|
||||
await self.push_frame(LLMFullResponseStartFrame(), direction)
|
||||
await self.push_frame(frame, direction)
|
||||
await self.push_frame(LLMFullResponseEndFrame(), direction)
|
||||
else:
|
||||
# For all other frames, just pass them through
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
@@ -130,6 +158,12 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
# If you run into weird description, try with use_cpu=True
|
||||
moondream = MoondreamService()
|
||||
|
||||
# Wrap TextFrames with LLM response start/end frames, which makes Moondream
|
||||
# output be treated like LLM responses for the purpose of context
|
||||
# aggregation. Without this, the assistant context aggregator would ignore
|
||||
# Moondream output (if the TTS service is disabled).
|
||||
moondream_text_wrapper = MoondreamTextFrameWrapper()
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
@@ -137,7 +171,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
context_aggregator.user(), # User responses
|
||||
ParallelPipeline(
|
||||
[llm], # LLM
|
||||
[moondream],
|
||||
[moondream, moondream_text_wrapper],
|
||||
),
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
|
||||
@@ -76,7 +76,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
llm = FireworksLLMService(
|
||||
api_key=os.getenv("FIREWORKS_API_KEY"),
|
||||
model="accounts/fireworks/models/llama-v3p1-405b-instruct",
|
||||
model="accounts/fireworks/models/gpt-oss-20b",
|
||||
)
|
||||
# You can also register a function_name of None to get all functions
|
||||
# sent to the same callback with an additional function_name parameter.
|
||||
|
||||
@@ -27,7 +27,7 @@ from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.nim.llm import NimLLMService
|
||||
from pipecat.services.nvidia.llm import NvidiaLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
@@ -75,11 +75,11 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
# text_filters=[MarkdownTextFilter()],
|
||||
)
|
||||
|
||||
llm = NimLLMService(
|
||||
llm = NvidiaLLMService(
|
||||
api_key=os.getenv("NVIDIA_API_KEY"),
|
||||
model="nvidia/llama-3.3-nemotron-super-49b-v1.5",
|
||||
# Recommended when turning thinking off
|
||||
params=NimLLMService.InputParams(temperature=0.0),
|
||||
params=NvidiaLLMService.InputParams(temperature=0.0),
|
||||
)
|
||||
# You can also register a function_name of None to get all functions
|
||||
# sent to the same callback with an additional function_name parameter.
|
||||
@@ -14,20 +14,13 @@ from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.adapters.services.open_ai_realtime_adapter import OpenAIRealtimeLLMAdapter
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import (
|
||||
LLMRunFrame,
|
||||
LLMSetToolsFrame,
|
||||
LLMUpdateSettingsFrame,
|
||||
TranscriptionMessage,
|
||||
)
|
||||
from pipecat.frames.frames import LLMRunFrame, LLMSetToolsFrame, TranscriptionMessage
|
||||
from pipecat.observers.loggers.transcription_log_observer import TranscriptionLogObserver
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantAggregatorParams
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.transcript_processor import TranscriptProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
|
||||
@@ -19,7 +19,6 @@ from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantAggregatorParams
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
|
||||
@@ -28,10 +28,10 @@ from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.llm_service import LLMService
|
||||
from pipecat.services.openai.llm import OpenAIContextAggregatorPair, OpenAILLMService
|
||||
from pipecat.sync.event_notifier import EventNotifier
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
from pipecat.utils.sync.event_notifier import EventNotifier
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
@@ -45,11 +45,11 @@ from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.llm_service import FunctionCallParams, LLMService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.sync.base_notifier import BaseNotifier
|
||||
from pipecat.sync.event_notifier import EventNotifier
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
from pipecat.utils.sync.base_notifier import BaseNotifier
|
||||
from pipecat.utils.sync.event_notifier import EventNotifier
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
@@ -46,11 +46,11 @@ from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.llm_service import FunctionCallParams, LLMService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.sync.base_notifier import BaseNotifier
|
||||
from pipecat.sync.event_notifier import EventNotifier
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
from pipecat.utils.sync.base_notifier import BaseNotifier
|
||||
from pipecat.utils.sync.event_notifier import EventNotifier
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
@@ -47,11 +47,11 @@ from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.google.llm import GoogleLLMService
|
||||
from pipecat.services.llm_service import LLMService
|
||||
from pipecat.sync.base_notifier import BaseNotifier
|
||||
from pipecat.sync.event_notifier import EventNotifier
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
from pipecat.utils.sync.base_notifier import BaseNotifier
|
||||
from pipecat.utils.sync.event_notifier import EventNotifier
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
|
||||
load_dotenv(override=True)
|
||||
@@ -391,7 +391,7 @@ class AudioAccumulator(FrameProcessor):
|
||||
)
|
||||
self._user_speaking = False
|
||||
context = LLMContext()
|
||||
context.add_audio_frames_message(audio_frames=self._audio_frames)
|
||||
await context.add_audio_frames_message(audio_frames=self._audio_frames)
|
||||
await self.push_frame(LLMContextFrame(context=context))
|
||||
elif isinstance(frame, InputAudioRawFrame):
|
||||
# Append the audio frame to our buffer. Treat the buffer as a ring buffer, dropping the oldest
|
||||
|
||||
@@ -17,7 +17,6 @@ from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantAggregatorParams
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.transcript_processor import TranscriptProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
|
||||
@@ -20,7 +20,6 @@ from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantAggregatorParams
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
|
||||
@@ -18,7 +18,6 @@ from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantAggregatorParams
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import (
|
||||
|
||||
@@ -150,7 +150,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
LLMLogObserver(),
|
||||
DebugLogObserver(
|
||||
frame_types={
|
||||
TTSTextFrame: (BaseOutputTransport, FrameEndpoint.DESTINATION),
|
||||
TTSTextFrame: (BaseOutputTransport, FrameEndpoint.SOURCE),
|
||||
UserStartedSpeakingFrame: (BaseInputTransport, FrameEndpoint.SOURCE),
|
||||
EndFrame: None,
|
||||
}
|
||||
|
||||
@@ -62,7 +62,11 @@ from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
from pipecat.utils.text.pattern_pair_aggregator import PatternMatch, PatternPairAggregator
|
||||
from pipecat.utils.text.pattern_pair_aggregator import (
|
||||
MatchAction,
|
||||
PatternMatch,
|
||||
PatternPairAggregator,
|
||||
)
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
@@ -106,16 +110,16 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
pattern_aggregator = PatternPairAggregator()
|
||||
|
||||
# Add pattern for voice switching
|
||||
pattern_aggregator.add_pattern_pair(
|
||||
pattern_id="voice_tag",
|
||||
pattern_aggregator.add_pattern(
|
||||
type="voice",
|
||||
start_pattern="<voice>",
|
||||
end_pattern="</voice>",
|
||||
remove_match=True,
|
||||
action=MatchAction.REMOVE, # Remove tags from final text
|
||||
)
|
||||
|
||||
# Register handler for voice switching
|
||||
async def on_voice_tag(match: PatternMatch):
|
||||
voice_name = match.content.strip().lower()
|
||||
voice_name = match.text.strip().lower()
|
||||
if voice_name in VOICE_IDS:
|
||||
# First flush any existing audio to finish the current context
|
||||
await tts.flush_audio()
|
||||
@@ -125,7 +129,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
else:
|
||||
logger.warning(f"Unknown voice: {voice_name}")
|
||||
|
||||
pattern_aggregator.on_pattern_match("voice_tag", on_voice_tag)
|
||||
pattern_aggregator.on_pattern_match("voice", on_voice_tag)
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
|
||||
@@ -64,11 +64,14 @@ class UrlToImageProcessor(FrameProcessor):
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
def extract_url(self, text: str):
|
||||
data = json.loads(text)
|
||||
if "artObject" in data:
|
||||
return data["artObject"]["webImage"]["url"]
|
||||
if "artworks" in data and len(data["artworks"]):
|
||||
return data["artworks"][0]["webImage"]["url"]
|
||||
try:
|
||||
data = json.loads(text)
|
||||
if "artObject" in data:
|
||||
return data["artObject"]["webImage"]["url"]
|
||||
if "artworks" in data and len(data["artworks"]):
|
||||
return data["artworks"][0]["webImage"]["url"]
|
||||
except:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
@@ -88,6 +91,23 @@ class UrlToImageProcessor(FrameProcessor):
|
||||
logger.error(error_msg)
|
||||
|
||||
|
||||
# full list of tools available from rijksmuseum MCP:
|
||||
# - get_artwork_details
|
||||
# - get_artwork_image
|
||||
# - get_user_sets
|
||||
# - get_user_set_details
|
||||
# - open_image_in_browser
|
||||
# - get_artist_timeline
|
||||
|
||||
mcp_tools_filter = ["get_artwork_details", "get_artwork_image", "open_image_in_browser"]
|
||||
|
||||
|
||||
def open_image_output_filter(output: str):
|
||||
pattern = r"Successfully opened image in browser: "
|
||||
text_to_print = re.sub(pattern, "", output)
|
||||
print(f"🖼️ link to high resolution artwork: {text_to_print}")
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
@@ -136,7 +156,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
# https://github.com/r-huijts/rijksmuseum-mcp
|
||||
args=["-y", "mcp-server-rijksmuseum"],
|
||||
env={"RIJKSMUSEUM_API_KEY": os.getenv("RIJKSMUSEUM_API_KEY")},
|
||||
)
|
||||
),
|
||||
# Optional
|
||||
tools_filter=mcp_tools_filter, # Optional
|
||||
tools_output_filters={"open_image_in_browser": open_image_output_filter},
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"error setting up mcp")
|
||||
@@ -155,7 +178,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
You are a helpful LLM in a WebRTC call.
|
||||
Your goal is to demonstrate your capabilities in a succinct way.
|
||||
You have access to tools to search the Rijksmuseum collection.
|
||||
Offer, for example, to show the earliest Rembrandt work from the museum. Use the `search_artwork` tool.
|
||||
Offer, for example, to show a floral still life, use the `search_artwork` tool.
|
||||
The tool may respond with a JSON object with an `artworks` array. Choose the art from that array.
|
||||
Once the tool has responded, tell the user the title and use the `open_image_in_browser` tool.
|
||||
Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points.
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
@@ -15,7 +16,7 @@ import aiohttp
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from mcp import StdioServerParameters
|
||||
from mcp.client.session_group import SseServerParameters
|
||||
from mcp.client.session_group import StreamableHttpParameters
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
@@ -66,11 +67,14 @@ class UrlToImageProcessor(FrameProcessor):
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
def extract_url(self, text: str):
|
||||
pattern = r"!\[[^\]]*\]\((https?://[^)]+\.(png|jpg|jpeg|PNG|JPG|JPEG|gif))\)"
|
||||
match = re.search(pattern, text)
|
||||
if match:
|
||||
return match.group(1)
|
||||
return None
|
||||
try:
|
||||
data = json.loads(text)
|
||||
if "artObject" in data:
|
||||
return data["artObject"]["webImage"]["url"]
|
||||
if "artworks" in data and len(data["artworks"]):
|
||||
return data["artworks"][0]["webImage"]["url"]
|
||||
except:
|
||||
pass
|
||||
|
||||
async def run_image_process(self, image_url: str):
|
||||
try:
|
||||
@@ -132,10 +136,11 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
system = f"""
|
||||
You are a helpful LLM in a WebRTC call.
|
||||
Your goal is to demonstrate your capabilities in a succinct way.
|
||||
You have access to tools to search the Rijksmuseum collection.
|
||||
Offer, for example, to show the earliest Rembrandt work from the museum. Use the `search_artwork` tool.
|
||||
You have access to tools to search the Rijksmuseum collection and the user's GitHub repositories and account.
|
||||
Offer, for example, to show a floral still life, use the `search_artwork` tool.
|
||||
The tool may respond with a JSON object with an `artworks` array. Choose the art from that array.
|
||||
Once the tool has responded, tell the user the title and use the `open_image_in_browser` tool.
|
||||
You can also offer to answer users questions about their GitHub repositories and account.
|
||||
Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points.
|
||||
Respond to what the user said in a creative and helpful way.
|
||||
Don't overexplain what you are doing.
|
||||
@@ -145,11 +150,11 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
messages = [{"role": "system", "content": system}]
|
||||
|
||||
try:
|
||||
mcp = MCPClient(
|
||||
rijksmuseum_mcp = MCPClient(
|
||||
server_params=StdioServerParameters(
|
||||
command=shutil.which("npx"),
|
||||
# https://github.com/r-huijts/rijksmuseum-mcp
|
||||
args=["-y", "mcp-server-error setting up mcp"],
|
||||
args=["-y", "mcp-server-rijksmuseum"],
|
||||
env={"RIJKSMUSEUM_API_KEY": os.getenv("RIJKSMUSEUM_API_KEY")},
|
||||
)
|
||||
)
|
||||
@@ -157,24 +162,32 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.error(f"error setting up rijksmuseum mcp")
|
||||
logger.exception("error trace:")
|
||||
try:
|
||||
# https://docs.mcp.run/integrating/tutorials/mcp-run-sse-openai-agents/
|
||||
# ie. "https://www.mcp.run/api/mcp/sse?..."
|
||||
# ensure the profile has a tool or few installed
|
||||
mcp_run = MCPClient(server_params=SseServerParameters(url=os.getenv("MCP_RUN_SSE_URL")))
|
||||
# Github MCP docs: https://github.com/github/github-mcp-server
|
||||
# Enable Github Copilot on your GitHub account. Free tier is ok. (https://github.com/settings/copilot)
|
||||
# Generate a personal access token. It must be a Fine-grained token, classic tokens are not supported. (https://github.com/settings/personal-access-tokens)
|
||||
# Set permissions you want to use (eg. "all repositories", "profile: read/write", etc)
|
||||
github_mcp = MCPClient(
|
||||
server_params=StreamableHttpParameters(
|
||||
url="https://api.githubcopilot.com/mcp/",
|
||||
headers={
|
||||
"Authorization": f"Bearer {os.getenv('GITHUB_PERSONAL_ACCESS_TOKEN')}"
|
||||
},
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"error setting up mcp.run")
|
||||
logger.exception("error trace:")
|
||||
|
||||
tools = {}
|
||||
run_tools = {}
|
||||
rijksmuseum_tools = {}
|
||||
github_tools = {}
|
||||
try:
|
||||
tools = await mcp.register_tools(llm)
|
||||
run_tools = await mcp_run.register_tools(llm)
|
||||
rijksmuseum_tools = await rijksmuseum_mcp.register_tools(llm)
|
||||
github_tools = await github_mcp.register_tools(llm)
|
||||
except Exception as e:
|
||||
logger.error(f"error registering tools")
|
||||
logger.exception("error trace:")
|
||||
|
||||
all_standard_tools = run_tools.standard_tools + tools.standard_tools
|
||||
all_standard_tools = rijksmuseum_tools.standard_tools + github_tools.standard_tools
|
||||
all_tools = ToolsSchema(standard_tools=all_standard_tools)
|
||||
|
||||
context = LLMContext(messages, all_tools)
|
||||
@@ -226,9 +239,9 @@ async def bot(runner_args: RunnerArguments):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if not os.getenv("RIJKSMUSEUM_API_KEY") or not os.getenv("MCP_RUN_SSE_URL"):
|
||||
if not os.getenv("RIJKSMUSEUM_API_KEY") or not os.getenv("GITHUB_PERSONAL_ACCESS_TOKEN"):
|
||||
logger.error(
|
||||
f"Please set RIJKSMUSEUM_API_KEY and MCP_RUN_SSE_URL environment variables. See https://github.com/r-huijts/rijksmuseum-mcp and https://mcp.run"
|
||||
f"Please set `RIJKSMUSEUM_API_KEY` and `GITHUB_PERSONAL_ACCESS_TOKEN` environment variables. See https://github.com/r-huijts/rijksmuseum-mcp."
|
||||
)
|
||||
import sys
|
||||
|
||||
@@ -5,7 +5,9 @@
|
||||
#
|
||||
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
from datetime import datetime
|
||||
|
||||
from dotenv import load_dotenv
|
||||
@@ -33,11 +35,21 @@ load_dotenv(override=True)
|
||||
|
||||
|
||||
async def fetch_weather_from_api(params: FunctionCallParams):
|
||||
temperature = 75 if params.arguments["format"] == "fahrenheit" else 24
|
||||
temperature = (
|
||||
random.randint(60, 85)
|
||||
if params.arguments["format"] == "fahrenheit"
|
||||
else random.randint(15, 30)
|
||||
)
|
||||
# Simulate a long network delay.
|
||||
# You can continue chatting while waiting for this to complete.
|
||||
# With Nova 2 Sonic (the default model), the assistant will respond
|
||||
# appropriately once the function call is complete.
|
||||
await asyncio.sleep(5)
|
||||
await params.result_callback(
|
||||
{
|
||||
"conditions": "nice",
|
||||
"temperature": temperature,
|
||||
"location": params.arguments["location"],
|
||||
"format": params.arguments["format"],
|
||||
"timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
|
||||
}
|
||||
@@ -91,23 +103,31 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
# Specify initial system instruction.
|
||||
# HACK: note that, for now, we need to inject a special bit of text into this instruction to
|
||||
# allow the first assistant response to be programmatically triggered (which happens in the
|
||||
# on_client_connected handler, below)
|
||||
system_instruction = (
|
||||
"You are a friendly assistant. The user and you will engage in a spoken dialog exchanging "
|
||||
"the transcripts of a natural real-time conversation. Keep your responses short, generally "
|
||||
"two or three sentences for chatty scenarios. "
|
||||
f"{AWSNovaSonicLLMService.AWAIT_TRIGGER_ASSISTANT_RESPONSE_INSTRUCTION}"
|
||||
"two or three sentences for chatty scenarios."
|
||||
# HACK: if using the older Nova Sonic (pre-2) model, note that you need to inject a special
|
||||
# bit of text into this instruction to allow the first assistant response to be
|
||||
# programmatically triggered (which happens in the on_client_connected handler)
|
||||
# f"{AWSNovaSonicLLMService.AWAIT_TRIGGER_ASSISTANT_RESPONSE_INSTRUCTION}"
|
||||
)
|
||||
|
||||
# Create the AWS Nova Sonic LLM service
|
||||
llm = AWSNovaSonicLLMService(
|
||||
secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
|
||||
access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
|
||||
region=os.getenv("AWS_REGION"), # as of 2025-05-06, us-east-1 is the only supported region
|
||||
# as of 2025-12-09, these are the supported regions:
|
||||
# - Nova 2 Sonic (the default model):
|
||||
# - us-east-1
|
||||
# - us-west-2
|
||||
# - ap-northeast-1
|
||||
# - Nova Sonic (the older model):
|
||||
# - us-east-1
|
||||
# - ap-northeast-1
|
||||
region=os.getenv("AWS_REGION"),
|
||||
session_token=os.getenv("AWS_SESSION_TOKEN"),
|
||||
voice_id="tiffany", # matthew, tiffany, amy
|
||||
voice_id="tiffany",
|
||||
# you could choose to pass instruction here rather than via context
|
||||
# system_instruction=system_instruction
|
||||
# you could choose to pass tools here rather than via context
|
||||
@@ -117,7 +137,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
# Register function for function calls
|
||||
# you can either register a single function for all function calls, or specific functions
|
||||
# llm.register_function(None, fetch_weather_from_api)
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
llm.register_function(
|
||||
"get_current_weather", fetch_weather_from_api, cancel_on_interruption=False
|
||||
)
|
||||
|
||||
# Set up context and context management.
|
||||
context = LLMContext(
|
||||
@@ -159,10 +181,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
# HACK: for now, we need this special way of triggering the first assistant response in AWS
|
||||
# Nova Sonic. Note that this trigger requires a special corresponding bit of text in the
|
||||
# system instruction. In the future, simply queueing the context frame should be sufficient.
|
||||
await llm.trigger_assistant_response()
|
||||
# HACK: if using the older Nova Sonic (pre-2) model, you need this special way of
|
||||
# triggering the first assistant response. Note that this trigger requires a special
|
||||
# corresponding bit of text in the system instruction.
|
||||
# await llm.trigger_assistant_response()
|
||||
|
||||
# Handle client disconnection events
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -25,7 +25,7 @@ from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.google.llm import GoogleLLMService
|
||||
from pipecat.services.heygen.api import AvatarQuality, NewSessionRequest
|
||||
from pipecat.services.heygen.client import ServiceType
|
||||
from pipecat.services.heygen.video import HeyGenVideoService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams, DailyTransport
|
||||
@@ -73,11 +73,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
llm = GoogleLLMService(api_key=os.getenv("GOOGLE_API_KEY"))
|
||||
|
||||
heyGen = HeyGenVideoService(
|
||||
api_key=os.getenv("HEYGEN_API_KEY"),
|
||||
api_key=os.getenv("HEYGEN_LIVE_AVATAR_API_KEY"),
|
||||
service_type=ServiceType.LIVE_AVATAR,
|
||||
session=session,
|
||||
session_request=NewSessionRequest(
|
||||
avatar_id="Shawn_Therapist_public", version="v2", quality=AvatarQuality.high
|
||||
),
|
||||
)
|
||||
|
||||
messages = [
|
||||
|
||||
@@ -113,8 +113,12 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
@voicemail.event_handler("on_conversation_detected")
|
||||
async def on_conversation_detected(processor):
|
||||
logger.info("Conversation detected!")
|
||||
|
||||
@voicemail.event_handler("on_voicemail_detected")
|
||||
async def handle_voicemail(processor):
|
||||
async def on_voicemail_detected(processor):
|
||||
logger.info("Voicemail detected! Leaving a message...")
|
||||
|
||||
# Push frames using standard Pipecat pattern
|
||||
|
||||
221
examples/foundational/49-ultravox-realtime.py
Normal file
221
examples/foundational/49-ultravox-realtime.py
Normal file
@@ -0,0 +1,221 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import datetime
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.ultravox.llm import OneShotInputParams, UltravoxRealtimeLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
# Load environment variables
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def get_secret_menu(params: FunctionCallParams):
|
||||
category = params.arguments.get("category", "both")
|
||||
logger.debug(f"Fetching secret menu with category: {category}")
|
||||
items = []
|
||||
if category in {"donuts", "both"}:
|
||||
items.append(
|
||||
{
|
||||
"name": "Butter Pecan Ice Cream (one scoop)",
|
||||
"price": "$2.99",
|
||||
}
|
||||
)
|
||||
if category in {"drinks", "both"}:
|
||||
items.append(
|
||||
{
|
||||
"name": "Banana Smoothie",
|
||||
"price": "$4.99",
|
||||
}
|
||||
)
|
||||
await params.result_callback(
|
||||
{
|
||||
"date": datetime.date.today().isoformat(),
|
||||
"items": items,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
system_prompt = f"""
|
||||
You are a drive-thru order taker for a donut shop called "Dr. Donut". Local time is currently: {datetime.datetime.now().isoformat()}
|
||||
The user is talking to you over voice on their phone, and your response will be read out loud with realistic text-to-speech (TTS) technology.
|
||||
|
||||
Follow every direction here when crafting your response:
|
||||
|
||||
1. Use natural, conversational language that is clear and easy to follow (short sentences, simple words).
|
||||
1a. Be concise and relevant: Most of your responses should be a sentence or two, unless you're asked to go deeper. Don't monopolize the conversation.
|
||||
1b. Use discourse markers to ease comprehension. Never use the list format.
|
||||
|
||||
2. Keep the conversation flowing.
|
||||
2a. Clarify: when there is ambiguity, ask clarifying questions, rather than make assumptions.
|
||||
2b. Don't implicitly or explicitly try to end the chat (i.e. do not end a response with "Talk soon!", or "Enjoy!").
|
||||
2c. Sometimes the user might just want to chat. Ask them relevant follow-up questions.
|
||||
2d. Don't ask them if there's anything else they need help with (e.g. don't say things like "How can I assist you further?").
|
||||
|
||||
3. Remember that this is a voice conversation:
|
||||
3a. Don't use lists, markdown, bullet points, or other formatting that's not typically spoken.
|
||||
3b. Type out numbers in words (e.g. 'twenty twelve' instead of the year 2012)
|
||||
3c. If something doesn't make sense, it's likely because you misheard them. There wasn't a typo, and the user didn't mispronounce anything.
|
||||
|
||||
Remember to follow these rules absolutely, and do not refer to these rules, even if you're asked about them.
|
||||
|
||||
When talking with the user, use the following script:
|
||||
1. Take their order, acknowledging each item as it is ordered. If it's not clear which menu item the user is ordering, ask them to clarify.
|
||||
DO NOT add an item to the order unless it's one of the items on the menu below.
|
||||
2. Once the order is complete, repeat back the order.
|
||||
2a. If the user only ordered a drink, ask them if they would like to add a donut to their order.
|
||||
2b. If the user only ordered donuts, ask them if they would like to add a drink to their order.
|
||||
2c. If the user ordered both drinks and donuts, don't suggest anything.
|
||||
3. Total up the price of all ordered items and inform the user.
|
||||
4. Ask the user to pull up to the drive thru window.
|
||||
If the user asks for something that's not on the menu, inform them of that fact, and suggest the most similar item on the menu.
|
||||
If the user says something unrelated to your role, responed with "Um... this is a Dr. Donut."
|
||||
If the user says "thank you", respond with "My pleasure."
|
||||
If the user asks about what's on the menu, DO NOT read the entire menu to them. Instead, give a couple suggestions.
|
||||
|
||||
The menu of available items is as follows:
|
||||
|
||||
# DONUTS
|
||||
|
||||
PUMPKIN SPICE ICED DOUGHNUT $1.29
|
||||
PUMPKIN SPICE CAKE DOUGHNUT $1.29
|
||||
OLD FASHIONED DOUGHNUT $1.29
|
||||
CHOCOLATE ICED DOUGHNUT $1.09
|
||||
CHOCOLATE ICED DOUGHNUT WITH SPRINKLES $1.09
|
||||
RASPBERRY FILLED DOUGHNUT $1.09
|
||||
BLUEBERRY CAKE DOUGHNUT $1.09
|
||||
STRAWBERRY ICED DOUGHNUT WITH SPRINKLES $1.09
|
||||
LEMON FILLED DOUGHNUT $1.09
|
||||
DOUGHNUT HOLES $3.99
|
||||
|
||||
# COFFEE & DRINKS
|
||||
|
||||
PUMPKIN SPICE COFFEE $2.59
|
||||
PUMPKIN SPICE LATTE $4.59
|
||||
REGULAR BREWED COFFEE $1.79
|
||||
DECAF BREWED COFFEE $1.79
|
||||
LATTE $3.49
|
||||
CAPPUCINO $3.49
|
||||
CARAMEL MACCHIATO $3.49
|
||||
MOCHA LATTE $3.49
|
||||
CARAMEL MOCHA LATTE $3.49
|
||||
|
||||
There is also a secret menu that changes daily. If the user asks about it, use the get_secret_menu tool to look up today's secret menu items.
|
||||
"""
|
||||
|
||||
secret_menu_function = FunctionSchema(
|
||||
name="get_secret_menu",
|
||||
description="Get today's secret menu items",
|
||||
properties={
|
||||
"category": {
|
||||
"type": "string",
|
||||
"enum": ["donuts", "drinks", "both"],
|
||||
"description": "The category of secret menu items to retrieve. Defaults to both.",
|
||||
},
|
||||
},
|
||||
required=[],
|
||||
)
|
||||
|
||||
llm = UltravoxRealtimeLLMService(
|
||||
params=OneShotInputParams(
|
||||
api_key=os.getenv("ULTRAVOX_API_KEY"),
|
||||
system_prompt=system_prompt,
|
||||
temperature=0.3,
|
||||
max_duration=datetime.timedelta(minutes=3),
|
||||
),
|
||||
one_shot_selected_tools=ToolsSchema(standard_tools=[secret_menu_function]),
|
||||
)
|
||||
|
||||
llm.register_function("get_secret_menu", get_secret_menu)
|
||||
|
||||
# Necessary to complete the function call lifecycle in Pipecat.
|
||||
context_aggregator = LLMContextAggregatorPair(LLMContext([]))
|
||||
|
||||
# Build the pipeline
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
context_aggregator.user(),
|
||||
llm,
|
||||
context_aggregator.assistant(),
|
||||
transport.output(),
|
||||
]
|
||||
)
|
||||
|
||||
# Configure the pipeline task
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
# Handle client connection event
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
|
||||
# Handle client disconnection events
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
# Run the pipeline
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -4,29 +4,27 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from mcp.client.session_group import SseServerParameters
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.frames.frames import LLMRunFrame, ThoughtTranscriptionMessage, TranscriptionMessage
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.transcript_processor import TranscriptProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.anthropic.llm import AnthropicLLMService
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.mcp_service import MCPClient
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
@@ -69,48 +67,35 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
)
|
||||
|
||||
llm = AnthropicLLMService(
|
||||
api_key=os.getenv("ANTHROPIC_API_KEY"), model="claude-3-7-sonnet-latest"
|
||||
api_key=os.getenv("ANTHROPIC_API_KEY"),
|
||||
params=AnthropicLLMService.InputParams(
|
||||
thinking=AnthropicLLMService.ThinkingConfig(type="enabled", budget_tokens=2048)
|
||||
),
|
||||
)
|
||||
|
||||
try:
|
||||
# https://docs.mcp.run/integrating/tutorials/mcp-run-sse-openai-agents/
|
||||
mcp = MCPClient(server_params=SseServerParameters(url=os.getenv("MCP_RUN_SSE_URL")))
|
||||
except Exception as e:
|
||||
logger.error(f"error setting up mcp")
|
||||
logger.exception("error trace:")
|
||||
transcript = TranscriptProcessor(process_thoughts=True)
|
||||
|
||||
tools = {}
|
||||
try:
|
||||
tools = await mcp.register_tools(llm)
|
||||
except Exception as e:
|
||||
logger.error(f"error registering tools")
|
||||
logger.exception("error trace:")
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
system = f"""
|
||||
You are a helpful LLM in a WebRTC call.
|
||||
Your goal is to demonstrate your capabilities in a succinct way.
|
||||
You have access to a number of tools provided by mcp.run. Use any and all tools to help users.
|
||||
Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points.
|
||||
Respond to what the user said in a creative and helpful way.
|
||||
When asked for today's date, use 'https://www.datetoday.net/'.
|
||||
Don't overexplain what you are doing.
|
||||
Just respond with short sentences when you are carrying out tool calls.
|
||||
"""
|
||||
|
||||
messages = [{"role": "system", "content": system}]
|
||||
|
||||
context = LLMContext(messages, tools)
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt,
|
||||
context_aggregator.user(), # User spoken responses
|
||||
transcript.user(), # User transcripts
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses and tool context
|
||||
transcript.assistant(), # Assistant transcripts (including thoughts)
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
@@ -125,8 +110,24 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected: {client}")
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Say hello briefly.",
|
||||
}
|
||||
)
|
||||
# Here are some example prompts conducive to demonstrating
|
||||
# thinking (picked from Google and Anthropic docs).
|
||||
# messages.append(
|
||||
# {
|
||||
# "role": "user",
|
||||
# "content": "Analogize photosynthesis and growing up. Keep your answer concise.",
|
||||
# # "content": "Compare and contrast electric cars and hybrid cars."
|
||||
# # "content": "Are there an infinite number of prime numbers such that n mod 4 == 3?"
|
||||
# }
|
||||
# )
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
@@ -134,6 +135,15 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
# Register event handler for transcript updates
|
||||
@transcript.event_handler("on_transcript_update")
|
||||
async def on_transcript_update(processor, frame):
|
||||
for msg in frame.messages:
|
||||
if isinstance(msg, (ThoughtTranscriptionMessage, TranscriptionMessage)):
|
||||
timestamp = f"[{msg.timestamp}] " if msg.timestamp else ""
|
||||
role = "THOUGHT" if isinstance(msg, ThoughtTranscriptionMessage) else msg.role
|
||||
logger.info(f"Transcript: {timestamp}{role}: {msg.content}")
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
@@ -146,14 +156,6 @@ async def bot(runner_args: RunnerArguments):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if not os.getenv("MCP_RUN_SSE_URL"):
|
||||
logger.error(
|
||||
f"Please set MCP_RUN_SSE_URL environment variable for this example. See https://mcp.run"
|
||||
)
|
||||
import sys
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
167
examples/foundational/49b-thinking-google.py
Normal file
167
examples/foundational/49b-thinking-google.py
Normal file
@@ -0,0 +1,167 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame, ThoughtTranscriptionMessage, TranscriptionMessage
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.transcript_processor import TranscriptProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.google.llm import GoogleLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
llm = GoogleLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
# model="gemini-3-pro-preview", # A more powerful reasoning model, but slower
|
||||
params=GoogleLLMService.InputParams(
|
||||
thinking=GoogleLLMService.ThinkingConfig(
|
||||
# thinking_level="low", # Use this field instead of thinking_budget for Gemini 3 Pro. Defaults to "high".
|
||||
thinking_budget=-1, # Dynamic thinking
|
||||
include_thoughts=True,
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
transcript = TranscriptProcessor(process_thoughts=True)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt,
|
||||
transcript.user(), # User transcripts
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
transcript.assistant(), # Assistant transcripts (including thoughts)
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Say hello briefly.",
|
||||
}
|
||||
)
|
||||
# Replace the above with one of these example prompts to demonstrate
|
||||
# thinking.
|
||||
# These examples come from Gemini and Anthropic docs.
|
||||
# messages.append(
|
||||
# {
|
||||
# "role": "user",
|
||||
# "content": "Analogize photosynthesis and growing up. Keep your answer concise.",
|
||||
# # "content": "Compare and contrast electric cars and hybrid cars."
|
||||
# # "content": "Are there an infinite number of prime numbers such that n mod 4 == 3?"
|
||||
# }
|
||||
# )
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
# Register event handler for transcript updates
|
||||
@transcript.event_handler("on_transcript_update")
|
||||
async def on_transcript_update(processor, frame):
|
||||
for msg in frame.messages:
|
||||
if isinstance(msg, (ThoughtTranscriptionMessage, TranscriptionMessage)):
|
||||
timestamp = f"[{msg.timestamp}] " if msg.timestamp else ""
|
||||
role = "THOUGHT" if isinstance(msg, ThoughtTranscriptionMessage) else msg.role
|
||||
logger.info(f"Transcript: {timestamp}{role}: {msg.content}")
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
185
examples/foundational/49c-thinking-functions-anthropic.py
Normal file
185
examples/foundational/49c-thinking-functions-anthropic.py
Normal file
@@ -0,0 +1,185 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame, ThoughtTranscriptionMessage, TranscriptionMessage
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.transcript_processor import TranscriptProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.anthropic.llm import AnthropicLLMService
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
async def check_flight_status(params: FunctionCallParams, flight_number: str):
|
||||
"""Check the status of a flight. Returns status (e.g., "on time", "delayed") and departure time.
|
||||
|
||||
Args:
|
||||
flight_number (str): The flight number, e.g. "AA100".
|
||||
"""
|
||||
await params.result_callback({"status": "delayed", "departure_time": "14:30"})
|
||||
|
||||
|
||||
async def book_taxi(params: FunctionCallParams, time: str):
|
||||
"""Book a taxi for a given time. Returns status (e.g., "done").
|
||||
|
||||
Args:
|
||||
time (str): The time to book the taxi for, e.g. "15:00".
|
||||
"""
|
||||
await params.result_callback({"status": "done"})
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
llm = AnthropicLLMService(
|
||||
api_key=os.getenv("ANTHROPIC_API_KEY"),
|
||||
params=AnthropicLLMService.InputParams(
|
||||
thinking=AnthropicLLMService.ThinkingConfig(type="enabled", budget_tokens=2048)
|
||||
),
|
||||
)
|
||||
|
||||
llm.register_direct_function(check_flight_status)
|
||||
llm.register_direct_function(book_taxi)
|
||||
|
||||
tools = ToolsSchema(standard_tools=[check_flight_status, book_taxi])
|
||||
|
||||
transcript = TranscriptProcessor(process_thoughts=True)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt,
|
||||
transcript.user(), # User transcripts
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
transcript.assistant(), # Assistant transcripts (including thoughts)
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Say hello briefly.",
|
||||
}
|
||||
)
|
||||
# Here is an example prompt conducive to demonstrating thinking and
|
||||
# function calling.
|
||||
# This example comes from Gemini docs.
|
||||
# messages.append(
|
||||
# {
|
||||
# "role": "user",
|
||||
# "content": "Check the status of flight AA100 and, if it's delayed, book me a taxi 2 hours before its departure time.",
|
||||
# }
|
||||
# )
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
@transcript.event_handler("on_transcript_update")
|
||||
async def on_transcript_update(processor, frame):
|
||||
for msg in frame.messages:
|
||||
if isinstance(msg, (ThoughtTranscriptionMessage, TranscriptionMessage)):
|
||||
timestamp = f"[{msg.timestamp}] " if msg.timestamp else ""
|
||||
role = "THOUGHT" if isinstance(msg, ThoughtTranscriptionMessage) else msg.role
|
||||
logger.info(f"Transcript: {timestamp}{role}: {msg.content}")
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
190
examples/foundational/49d-thinking-functions-google.py
Normal file
190
examples/foundational/49d-thinking-functions-google.py
Normal file
@@ -0,0 +1,190 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame, ThoughtTranscriptionMessage, TranscriptionMessage
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.transcript_processor import TranscriptProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.google.llm import GoogleLLMService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
async def check_flight_status(params: FunctionCallParams, flight_number: str):
|
||||
"""Check the status of a flight. Returns status (e.g., "on time", "delayed") and departure time.
|
||||
|
||||
Args:
|
||||
flight_number (str): The flight number, e.g. "AA100".
|
||||
"""
|
||||
await params.result_callback({"status": "delayed", "departure_time": "14:30"})
|
||||
|
||||
|
||||
async def book_taxi(params: FunctionCallParams, time: str):
|
||||
"""Book a taxi for a given time. Returns status (e.g., "done").
|
||||
|
||||
Args:
|
||||
time (str): The time to book the taxi for, e.g. "15:00".
|
||||
"""
|
||||
await params.result_callback({"status": "done"})
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
llm = GoogleLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
# model="gemini-3-pro-preview", # A more powerful reasoning model, but slower
|
||||
params=GoogleLLMService.InputParams(
|
||||
thinking=GoogleLLMService.ThinkingConfig(
|
||||
# thinking_level="low", # Use this field instead of thinking_budget for Gemini 3 Pro. Defaults to "high".
|
||||
thinking_budget=-1, # Dynamic thinking
|
||||
include_thoughts=True,
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
llm.register_direct_function(check_flight_status)
|
||||
llm.register_direct_function(book_taxi)
|
||||
|
||||
tools = ToolsSchema(standard_tools=[check_flight_status, book_taxi])
|
||||
|
||||
transcript = TranscriptProcessor(process_thoughts=True)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt,
|
||||
transcript.user(), # User transcripts
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
transcript.assistant(), # Assistant transcripts (including thoughts)
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Say hello briefly.",
|
||||
}
|
||||
)
|
||||
# Replace the above with one of these example prompts to demonstrate
|
||||
# thinking and function calling.
|
||||
# This example comes from Gemini docs.
|
||||
# messages.append(
|
||||
# {
|
||||
# "role": "user",
|
||||
# "content": "Check the status of flight AA100 and, if it's delayed, book me a taxi 2 hours before its departure time.",
|
||||
# }
|
||||
# )
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
@transcript.event_handler("on_transcript_update")
|
||||
async def on_transcript_update(processor, frame):
|
||||
for msg in frame.messages:
|
||||
if isinstance(msg, (ThoughtTranscriptionMessage, TranscriptionMessage)):
|
||||
timestamp = f"[{msg.timestamp}] " if msg.timestamp else ""
|
||||
role = "THOUGHT" if isinstance(msg, ThoughtTranscriptionMessage) else msg.role
|
||||
logger.info(f"Transcript: {timestamp}{role}: {msg.content}")
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -45,61 +45,63 @@ Source = "https://github.com/pipecat-ai/pipecat"
|
||||
Website = "https://pipecat.ai"
|
||||
|
||||
[project.optional-dependencies]
|
||||
aic = [ "aic-sdk~=1.1.0" ]
|
||||
aic = [ "aic-sdk~=1.2.0" ]
|
||||
anthropic = [ "anthropic~=0.49.0" ]
|
||||
assemblyai = [ "pipecat-ai[websockets-base]" ]
|
||||
asyncai = [ "pipecat-ai[websockets-base]" ]
|
||||
aws = [ "aioboto3~=15.0.0", "pipecat-ai[websockets-base]" ]
|
||||
aws-nova-sonic = [ "aws_sdk_bedrock_runtime~=0.1.1; python_version>='3.12'" ]
|
||||
aws = [ "aioboto3~=15.5.0", "pipecat-ai[websockets-base]" ]
|
||||
aws-nova-sonic = [ "aws_sdk_bedrock_runtime~=0.2.0; python_version>='3.12'" ]
|
||||
azure = [ "azure-cognitiveservices-speech~=1.42.0"]
|
||||
cartesia = [ "cartesia~=2.0.3", "pipecat-ai[websockets-base]" ]
|
||||
cerebras = []
|
||||
daily = [ "daily-python~=0.22.0" ]
|
||||
deepgram = [ "deepgram-sdk~=4.7.0", "pipecat-ai[websockets-base]" ]
|
||||
deepseek = []
|
||||
daily = [ "daily-python~=0.21.0" ]
|
||||
deepgram = [ "deepgram-sdk~=4.7.0" ]
|
||||
elevenlabs = [ "pipecat-ai[websockets-base]" ]
|
||||
fal = [ "fal-client~=0.5.9" ]
|
||||
fireworks = []
|
||||
fish = [ "ormsgpack~=1.7.0", "pipecat-ai[websockets-base]" ]
|
||||
gladia = [ "pipecat-ai[websockets-base]" ]
|
||||
google = [ "google-cloud-speech>=2.33.0,<3", "google-cloud-texttospeech>=2.31.0,<3", "google-genai>=1.41.0,<2", "pipecat-ai[websockets-base]" ]
|
||||
google = [ "google-cloud-speech>=2.33.0,<3", "google-cloud-texttospeech>=2.31.0,<3", "google-genai>=1.51.0,<2", "pipecat-ai[websockets-base]" ]
|
||||
gradium = [ "pipecat-ai[websockets-base]" ]
|
||||
grok = []
|
||||
groq = [ "groq~=0.23.0" ]
|
||||
gstreamer = [ "pygobject~=3.50.0" ]
|
||||
heygen = [ "livekit>=1.0.13", "pipecat-ai[websockets-base]" ]
|
||||
hume = [ "hume>=0.11.2" ]
|
||||
inworld = []
|
||||
krisp = [ "pipecat-ai-krisp~=0.4.0" ]
|
||||
koala = [ "pvkoala~=2.0.3" ]
|
||||
krisp = [ "pipecat-ai-krisp~=0.4.0" ]
|
||||
langchain = [ "langchain~=0.3.20", "langchain-community~=0.3.20", "langchain-openai~=0.3.9" ]
|
||||
livekit = [ "livekit~=1.0.13", "livekit-api~=1.0.5", "tenacity>=8.2.3,<10.0.0" ]
|
||||
livekit = [ "livekit~=1.0.13", "livekit-api~=1.0.5", "tenacity>=8.2.3,<10.0.0", "pyjwt>=2.10.1" ]
|
||||
lmnt = [ "pipecat-ai[websockets-base]" ]
|
||||
local = [ "pyaudio~=0.2.14" ]
|
||||
local-smart-turn = [ "coremltools>=8.0", "transformers", "torch>=2.5.0,<3", "torchaudio>=2.5.0,<3" ]
|
||||
local-smart-turn-v3 = [ "transformers", "onnxruntime>=1.20.1,<2" ]
|
||||
mcp = [ "mcp[cli]>=1.11.0,<2" ]
|
||||
mem0 = [ "mem0ai~=0.1.94" ]
|
||||
mistral = []
|
||||
mlx-whisper = [ "mlx-whisper~=0.4.2" ]
|
||||
moondream = [ "accelerate~=1.10.0", "einops~=0.8.0", "pyvips[binary]~=3.0.0", "timm~=1.0.13", "transformers>=4.48.0" ]
|
||||
nim = []
|
||||
neuphonic = [ "pipecat-ai[websockets-base]" ]
|
||||
noisereduce = [ "noisereduce~=3.0.3" ]
|
||||
nvidia = [ "nvidia-riva-client~=2.21.1" ]
|
||||
openai = [ "pipecat-ai[websockets-base]" ]
|
||||
openpipe = [ "openpipe>=4.50.0,<6" ]
|
||||
openrouter = []
|
||||
perplexity = []
|
||||
playht = [ "pipecat-ai[websockets-base]" ]
|
||||
qwen = []
|
||||
remote-smart-turn = []
|
||||
rime = [ "pipecat-ai[websockets-base]" ]
|
||||
riva = [ "nvidia-riva-client~=2.21.1" ]
|
||||
riva = [ "pipecat-ai[nvidia]" ]
|
||||
runner = [ "python-dotenv>=1.0.0,<2.0.0", "uvicorn>=0.32.0,<1.0.0", "fastapi>=0.115.6,<0.122.0", "pipecat-ai-small-webrtc-prebuilt>=1.0.0"]
|
||||
sagemaker = ["aws_sdk_sagemaker_runtime_http2; python_version>='3.12'"]
|
||||
sambanova = []
|
||||
sarvam = [ "sarvamai==0.1.21", "pipecat-ai[websockets-base]" ]
|
||||
sentry = [ "sentry-sdk>=2.28.0,<3" ]
|
||||
local-smart-turn = [ "coremltools>=8.0", "transformers", "torch>=2.5.0,<3", "torchaudio>=2.5.0,<3" ]
|
||||
local-smart-turn-v3 = [ "transformers", "onnxruntime>=1.20.1,<2" ]
|
||||
remote-smart-turn = []
|
||||
silero = [ "onnxruntime>=1.20.1,<2" ]
|
||||
simli = [ "simli-ai~=0.1.25"]
|
||||
simli = [ "simli-ai~=1.0.3"]
|
||||
soniox = [ "pipecat-ai[websockets-base]" ]
|
||||
soundfile = [ "soundfile~=0.13.1" ]
|
||||
speechmatics = [ "speechmatics-rt>=0.5.0" ]
|
||||
@@ -107,7 +109,7 @@ strands = [ "strands-agents>=1.9.1,<2" ]
|
||||
tavus=[]
|
||||
together = []
|
||||
tracing = [ "opentelemetry-sdk>=1.33.0", "opentelemetry-api>=1.33.0", "opentelemetry-instrumentation>=0.54b0" ]
|
||||
ultravox = [ "transformers>=4.48.0", "vllm>=0.9.0" ]
|
||||
ultravox = [ "pipecat-ai[websockets-base]" ]
|
||||
webrtc = [ "aiortc>=1.13.0,<2", "opencv-python>=4.11.0.86,<5" ]
|
||||
websocket = [ "pipecat-ai[websockets-base]", "fastapi>=0.115.6,<0.122.0" ]
|
||||
websockets-base = [ "websockets>=13.1,<16.0" ]
|
||||
@@ -128,6 +130,7 @@ dev = [
|
||||
"setuptools~=78.1.1",
|
||||
"setuptools_scm~=8.3.1",
|
||||
"python-dotenv>=1.0.1,<2.0.0",
|
||||
"towncrier~=25.8.0",
|
||||
]
|
||||
|
||||
docs = [
|
||||
@@ -158,7 +161,7 @@ where = ["src"]
|
||||
"src/pipecat/audio/dtmf/dtmf-star.wav",
|
||||
]
|
||||
"pipecat.services.aws_nova_sonic" = ["src/pipecat/services/aws_nova_sonic/ready.wav"]
|
||||
"pipecat.audio.turn.smart_turn.data" = ["src/pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx"]
|
||||
"pipecat.audio.turn.smart_turn.data" = ["src/pipecat/audio/turn/smart_turn/data/smart-turn-v3.1-cpu.onnx"]
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
addopts = "--verbose"
|
||||
@@ -205,3 +208,45 @@ convention = "google"
|
||||
command_line = "--module pytest"
|
||||
source = ["src"]
|
||||
omit = ["*/tests/*"]
|
||||
|
||||
[tool.towncrier]
|
||||
package = "pipecat"
|
||||
package_dir = "src"
|
||||
filename = "CHANGELOG.md"
|
||||
directory = "changelog"
|
||||
start_string = "<!-- towncrier release notes start -->\n"
|
||||
template = "changelog/_template.md.j2"
|
||||
title_format = "## [{version}] - {project_date}"
|
||||
issue_format = "[#{issue}](https://github.com/pipecat-ai/pipecat/pull/{issue})"
|
||||
underlines = ["", "", ""]
|
||||
wrap = true
|
||||
|
||||
[[tool.towncrier.type]]
|
||||
directory = "added"
|
||||
name = "Added"
|
||||
showcontent = true
|
||||
|
||||
[[tool.towncrier.type]]
|
||||
directory = "changed"
|
||||
name = "Changed"
|
||||
showcontent = true
|
||||
|
||||
[[tool.towncrier.type]]
|
||||
directory = "deprecated"
|
||||
name = "Deprecated"
|
||||
showcontent = true
|
||||
|
||||
[[tool.towncrier.type]]
|
||||
directory = "removed"
|
||||
name = "Removed"
|
||||
showcontent = true
|
||||
|
||||
[[tool.towncrier.type]]
|
||||
directory = "fixed"
|
||||
name = "Fixed"
|
||||
showcontent = true
|
||||
|
||||
[[tool.towncrier.type]]
|
||||
directory = "security"
|
||||
name = "Security"
|
||||
showcontent = true
|
||||
|
||||
@@ -31,7 +31,13 @@ from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import EndTaskFrame, LLMRunFrame, OutputImageRawFrame
|
||||
from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
EndFrame,
|
||||
EndTaskFrame,
|
||||
LLMRunFrame,
|
||||
OutputImageRawFrame,
|
||||
)
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
@@ -50,6 +56,7 @@ SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
|
||||
PIPELINE_IDLE_TIMEOUT_SECS = 60
|
||||
EVAL_TIMEOUT_SECS = 120
|
||||
EVAL_RESULT_TIMEOUT_SECS = 10
|
||||
|
||||
EvalPrompt = str | Tuple[str, ImageFile]
|
||||
|
||||
@@ -78,7 +85,7 @@ class EvalRunner:
|
||||
self._log_level = log_level
|
||||
self._total_success = 0
|
||||
self._tests: List[EvalResult] = []
|
||||
self._queue = asyncio.Queue()
|
||||
self._result_future: Optional[asyncio.Future[bool]] = None
|
||||
|
||||
# We to save runner files.
|
||||
name = name or f"{datetime.now().strftime('%Y%m%d_%H%M%S')}"
|
||||
@@ -88,16 +95,16 @@ class EvalRunner:
|
||||
os.makedirs(self._logs_dir, exist_ok=True)
|
||||
os.makedirs(self._recordings_dir, exist_ok=True)
|
||||
|
||||
async def assert_eval(self, params: FunctionCallParams):
|
||||
async def function_assert_eval(self, params: FunctionCallParams):
|
||||
result = params.arguments["result"]
|
||||
reasoning = params.arguments["reasoning"]
|
||||
logger.debug(f"🧠 EVAL REASONING(result: {result}): {reasoning}")
|
||||
await self._queue.put(result)
|
||||
await params.result_callback(None)
|
||||
await params.llm.push_frame(EndTaskFrame(), FrameDirection.UPSTREAM)
|
||||
await params.llm.push_frame(EndTaskFrame(reason=result), FrameDirection.UPSTREAM)
|
||||
|
||||
async def assert_eval_false(self):
|
||||
await self._queue.put(False)
|
||||
async def assert_eval(self, result: bool):
|
||||
if self._result_future:
|
||||
self._result_future.set_result(result)
|
||||
|
||||
async def run_eval(
|
||||
self,
|
||||
@@ -117,6 +124,9 @@ class EvalRunner:
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
# Create a future to store the eval result.
|
||||
self._result_future = asyncio.get_running_loop().create_future()
|
||||
|
||||
try:
|
||||
tasks = [
|
||||
asyncio.create_task(run_example_pipeline(script_path, eval_config)),
|
||||
@@ -136,8 +146,10 @@ class EvalRunner:
|
||||
logger.error(f"ERROR: Unable to run {example_file}: {e}")
|
||||
|
||||
try:
|
||||
result = await asyncio.wait_for(self._queue.get(), timeout=1.0)
|
||||
# Wait for the future to resolve.
|
||||
result = await asyncio.wait_for(self._result_future, timeout=EVAL_RESULT_TIMEOUT_SECS)
|
||||
except asyncio.TimeoutError:
|
||||
logger.error(f"ERROR: Timeout waiting for eval result.")
|
||||
result = False
|
||||
|
||||
if result:
|
||||
@@ -244,19 +256,25 @@ async def run_eval_pipeline(
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
llm.register_function("eval_function", eval_runner.assert_eval)
|
||||
llm.register_function("eval_function", eval_runner.function_assert_eval)
|
||||
|
||||
eval_function = FunctionSchema(
|
||||
name="eval_function",
|
||||
description="Called when the user answers a question.",
|
||||
description=(
|
||||
"Determines whether the user's response satisfies the evaluation "
|
||||
"criteria defined for the current prompt or interaction."
|
||||
),
|
||||
properties={
|
||||
"result": {
|
||||
"type": "boolean",
|
||||
"description": "Whether the answer is correct or not",
|
||||
"description": "Whether the user's response meets the evaluation criteria.",
|
||||
},
|
||||
"reasoning": {
|
||||
"type": "string",
|
||||
"description": "Why the answer was considered correct or invalid",
|
||||
"description": (
|
||||
"A concise explanation of how the user's response did or did "
|
||||
"not satisfy the evaluation criteria."
|
||||
),
|
||||
},
|
||||
},
|
||||
required=["result", "reasoning"],
|
||||
@@ -278,9 +296,9 @@ async def run_eval_pipeline(
|
||||
"Ignore greetings, comments, non-answers, or requests for clarification."
|
||||
)
|
||||
if eval_config.eval_speaks_first:
|
||||
system_prompt = f"You are an evaluation agent, be extremly brief. You will start the conversation by saying: '{example_prompt}'. {common_system_prompt}"
|
||||
system_prompt = f"You are an evaluation agent, be extremly brief. Numerical word answers are allowed. You will start the conversation by saying: '{example_prompt}'. {common_system_prompt}"
|
||||
else:
|
||||
system_prompt = f"You are an evaluation agent, be extremly brief. First, ask one question: {example_prompt}. {common_system_prompt}"
|
||||
system_prompt = f"You are an evaluation agent, be extremly brief. Numerical word answers are allowed. First, ask one question: {example_prompt}. {common_system_prompt}"
|
||||
|
||||
messages = [
|
||||
{
|
||||
@@ -346,9 +364,12 @@ async def run_eval_pipeline(
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
@task.event_handler("on_idle_timeout")
|
||||
async def on_pipeline_idle_timeout(task):
|
||||
await eval_runner.assert_eval_false()
|
||||
@task.event_handler("on_pipeline_finished")
|
||||
async def on_pipeline_finished(task, frame):
|
||||
if isinstance(frame, EndFrame):
|
||||
await eval_runner.assert_eval(frame.reason)
|
||||
elif isinstance(frame, CancelFrame):
|
||||
await eval_runner.assert_eval(False)
|
||||
|
||||
# TODO(aleix): We should handle SIGINT and SIGTERM so we can cancel both the
|
||||
# eval and the example.
|
||||
|
||||
@@ -30,13 +30,13 @@ EVAL_SIMPLE_MATH = EvalConfig(
|
||||
)
|
||||
|
||||
EVAL_WEATHER = EvalConfig(
|
||||
prompt="What's the weather in San Francisco?",
|
||||
eval="The user says something specific about the current weather in San Francisco, including the degrees.",
|
||||
prompt="What's the weather in San Francisco? Temperature should be in fahrenheits.",
|
||||
eval="The user talks about the weather in San Francisco, including the degrees.",
|
||||
)
|
||||
|
||||
EVAL_ONLINE_SEARCH = EvalConfig(
|
||||
prompt="What's the date right now in London?",
|
||||
eval=f"The user says today is {datetime.now(timezone.utc).strftime('%B %d, %Y')} in London.",
|
||||
prompt="What's the current date in UTC?",
|
||||
eval=f"Current date in UTC is {datetime.now(timezone.utc).strftime('%A, %B %d, %Y')}.",
|
||||
)
|
||||
|
||||
EVAL_SWITCH_LANGUAGE = EvalConfig(
|
||||
@@ -64,16 +64,21 @@ def EVAL_VISION_IMAGE(*, eval_speaks_first: bool = False):
|
||||
|
||||
EVAL_VOICEMAIL = EvalConfig(
|
||||
prompt="Please leave a message.",
|
||||
eval="The user leaves a voicemail message.",
|
||||
eval="The user provides a reasonable voicemail message.",
|
||||
eval_speaks_first=True,
|
||||
)
|
||||
|
||||
EVAL_CONVERSATION = EvalConfig(
|
||||
prompt="Hello, this is Mark.",
|
||||
eval="The user replies with a greeting.",
|
||||
eval="The user provides any reasonable conversational response to the greeting.",
|
||||
eval_speaks_first=True,
|
||||
)
|
||||
|
||||
EVAL_FLIGHT_STATUS = EvalConfig(
|
||||
prompt="Check the status of flight AA100.",
|
||||
eval="The user says something about the status of flight AA100, such as whether it's on time or delayed.",
|
||||
)
|
||||
|
||||
|
||||
TESTS_07 = [
|
||||
# 07 series
|
||||
@@ -81,6 +86,7 @@ TESTS_07 = [
|
||||
("07-interruptible-cartesia-http.py", EVAL_SIMPLE_MATH),
|
||||
("07a-interruptible-speechmatics.py", EVAL_SIMPLE_MATH),
|
||||
("07aa-interruptible-soniox.py", EVAL_SIMPLE_MATH),
|
||||
("07ab-interruptible-inworld.py", EVAL_SIMPLE_MATH),
|
||||
("07ab-interruptible-inworld-http.py", EVAL_SIMPLE_MATH),
|
||||
("07ac-interruptible-asyncai.py", EVAL_SIMPLE_MATH),
|
||||
("07ac-interruptible-asyncai-http.py", EVAL_SIMPLE_MATH),
|
||||
@@ -103,7 +109,7 @@ TESTS_07 = [
|
||||
("07o-interruptible-assemblyai.py", EVAL_SIMPLE_MATH),
|
||||
("07q-interruptible-rime.py", EVAL_SIMPLE_MATH),
|
||||
("07q-interruptible-rime-http.py", EVAL_SIMPLE_MATH),
|
||||
("07r-interruptible-riva-nim.py", EVAL_SIMPLE_MATH),
|
||||
("07r-interruptible-nvidia.py", EVAL_SIMPLE_MATH),
|
||||
("07s-interruptible-google-audio-in.py", EVAL_SIMPLE_MATH),
|
||||
("07t-interruptible-fish.py", EVAL_SIMPLE_MATH),
|
||||
("07v-interruptible-neuphonic.py", EVAL_SIMPLE_MATH),
|
||||
@@ -116,8 +122,6 @@ TESTS_07 = [
|
||||
# ("07i-interruptible-xtts.py", EVAL_SIMPLE_MATH),
|
||||
# Needs a Krisp license.
|
||||
# ("07p-interruptible-krisp.py", EVAL_SIMPLE_MATH),
|
||||
# Needs GPU resources.
|
||||
# ("07u-interruptible-ultravox.py", EVAL_SIMPLE_MATH),
|
||||
]
|
||||
|
||||
TESTS_12 = [
|
||||
@@ -136,7 +140,7 @@ TESTS_14 = [
|
||||
("14g-function-calling-grok.py", EVAL_WEATHER),
|
||||
("14h-function-calling-azure.py", EVAL_WEATHER),
|
||||
("14i-function-calling-fireworks.py", EVAL_WEATHER),
|
||||
("14j-function-calling-nim.py", EVAL_WEATHER),
|
||||
("14j-function-calling-nvidia.py", EVAL_WEATHER),
|
||||
("14k-function-calling-cerebras.py", EVAL_WEATHER),
|
||||
("14m-function-calling-openrouter.py", EVAL_WEATHER),
|
||||
("14n-function-calling-perplexity.py", EVAL_WEATHER),
|
||||
@@ -204,6 +208,13 @@ TESTS_44 = [
|
||||
("44-voicemail-detection.py", EVAL_CONVERSATION),
|
||||
]
|
||||
|
||||
TESTS_49 = [
|
||||
("49a-thinking-anthropic.py", EVAL_SIMPLE_MATH),
|
||||
("49b-thinking-google.py", EVAL_SIMPLE_MATH),
|
||||
("49c-thinking-functions-anthropic.py", EVAL_FLIGHT_STATUS),
|
||||
("49d-thinking-functions-google.py", EVAL_FLIGHT_STATUS),
|
||||
]
|
||||
|
||||
TESTS = [
|
||||
*TESTS_07,
|
||||
*TESTS_12,
|
||||
@@ -216,6 +227,7 @@ TESTS = [
|
||||
*TESTS_40,
|
||||
*TESTS_43,
|
||||
*TESTS_44,
|
||||
*TESTS_49,
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -5,14 +5,20 @@
|
||||
#
|
||||
|
||||
import sys
|
||||
from importlib.metadata import version
|
||||
from importlib.metadata import version as lib_version
|
||||
|
||||
from loguru import logger
|
||||
|
||||
__version__ = version("pipecat-ai")
|
||||
__version__ = lib_version("pipecat-ai")
|
||||
|
||||
logger.info(f"ᓚᘏᗢ Pipecat {__version__} (Python {sys.version}) ᓚᘏᗢ")
|
||||
|
||||
|
||||
def version() -> str:
|
||||
"""Returns the Pipecat version."""
|
||||
return __version__
|
||||
|
||||
|
||||
# We replace `asyncio.wait_for()` for `wait_for2.wait_for()` for Python < 3.12.
|
||||
#
|
||||
# In Python 3.12, `asyncio.wait_for()` is implemented in terms of
|
||||
|
||||
@@ -94,6 +94,8 @@ class AnthropicLLMAdapter(BaseLLMAdapter[AnthropicLLMInvocationParams]):
|
||||
for item in msg["content"]:
|
||||
if item["type"] == "image":
|
||||
item["source"]["data"] = "..."
|
||||
if item["type"] == "thinking" and item.get("signature"):
|
||||
item["signature"] = "..."
|
||||
messages_for_logging.append(msg)
|
||||
return messages_for_logging
|
||||
|
||||
@@ -165,9 +167,44 @@ class AnthropicLLMAdapter(BaseLLMAdapter[AnthropicLLMInvocationParams]):
|
||||
|
||||
def _from_universal_context_message(self, message: LLMContextMessage) -> MessageParam:
|
||||
if isinstance(message, LLMSpecificMessage):
|
||||
return copy.deepcopy(message.message)
|
||||
return self._from_anthropic_specific_message(message)
|
||||
return self._from_standard_message(message)
|
||||
|
||||
def _from_anthropic_specific_message(self, message: LLMSpecificMessage) -> MessageParam:
|
||||
"""Convert LLMSpecificMessage to Anthropic format.
|
||||
|
||||
Anthropic-specific messages may either be special thought messages that
|
||||
need to be handled in a special way, or messages already in Anthropic
|
||||
format.
|
||||
|
||||
Args:
|
||||
message: Anthropic-specific message.
|
||||
"""
|
||||
# Handle special case of thought messages.
|
||||
# These can be converted to standalone "assistant" messages; later
|
||||
# these thinking messages will be properly merged into the assistant
|
||||
# response messages before the context is sent to Anthropic for the
|
||||
# next turn.
|
||||
if (
|
||||
isinstance(message.message, dict)
|
||||
and message.message.get("type") == "thought"
|
||||
and (text := message.message.get("text"))
|
||||
and (signature := message.message.get("signature"))
|
||||
):
|
||||
return {
|
||||
"role": "assistant",
|
||||
"content": [
|
||||
{
|
||||
"type": "thinking",
|
||||
"thinking": text,
|
||||
"signature": signature,
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
# Fall back to assuming that the message is already in Anthropic format
|
||||
return copy.deepcopy(message.message)
|
||||
|
||||
def _from_standard_message(self, message: LLMStandardMessage) -> MessageParam:
|
||||
"""Convert standard universal context message to Anthropic format.
|
||||
|
||||
@@ -246,11 +283,14 @@ class AnthropicLLMAdapter(BaseLLMAdapter[AnthropicLLMInvocationParams]):
|
||||
# handle image_url -> image conversion
|
||||
if item["type"] == "image_url":
|
||||
if item["image_url"]["url"].startswith("data:"):
|
||||
# Extract MIME type from data URL (format: "data:image/jpeg;base64,...")
|
||||
url = item["image_url"]["url"]
|
||||
mime_type = url.split(":")[1].split(";")[0]
|
||||
item["type"] = "image"
|
||||
item["source"] = {
|
||||
"type": "base64",
|
||||
"media_type": "image/jpeg",
|
||||
"data": item["image_url"]["url"].split(",")[1],
|
||||
"media_type": mime_type,
|
||||
"data": url.split(",")[1],
|
||||
}
|
||||
del item["image_url"]
|
||||
elif item["image_url"]["url"].startswith("http"):
|
||||
|
||||
@@ -257,14 +257,15 @@ class AWSBedrockLLMAdapter(BaseLLMAdapter[AWSBedrockLLMInvocationParams]):
|
||||
# handle image_url -> image conversion
|
||||
if item["type"] == "image_url":
|
||||
if item["image_url"]["url"].startswith("data:"):
|
||||
# Extract format from data URL (format: "data:image/jpeg;base64,...")
|
||||
url = item["image_url"]["url"]
|
||||
mime_type = url.split(":")[1].split(";")[0]
|
||||
# Bedrock expects format like "jpeg", "png" etc., not "image/jpeg"
|
||||
image_format = mime_type.split("/")[1]
|
||||
new_item = {
|
||||
"image": {
|
||||
"format": "jpeg",
|
||||
"source": {
|
||||
"bytes": base64.b64decode(
|
||||
item["image_url"]["url"].split(",")[1]
|
||||
)
|
||||
},
|
||||
"format": image_format,
|
||||
"source": {"bytes": base64.b64decode(url.split(",")[1])},
|
||||
}
|
||||
}
|
||||
new_content.append(new_item)
|
||||
|
||||
@@ -151,6 +151,8 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
for part in obj["parts"]:
|
||||
if "inline_data" in part:
|
||||
part["inline_data"]["data"] = "..."
|
||||
if "thought_signature" in part:
|
||||
part["thought_signature"] = "..."
|
||||
except Exception as e:
|
||||
logger.debug(f"Error: {e}")
|
||||
messages_for_logging.append(obj)
|
||||
@@ -209,16 +211,37 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
system_instruction = None
|
||||
messages = []
|
||||
tool_call_id_to_name_mapping = {}
|
||||
thought_signature_dicts = []
|
||||
|
||||
# Process each message, preserving Google-formatted messages and converting others
|
||||
# Process each message, converting to Google format as needed
|
||||
for message in universal_context_messages:
|
||||
result = self._from_universal_context_message(
|
||||
# We have a Google-specific message; this may either be a
|
||||
# thought-signature-containing message that we need to handle in a
|
||||
# special way, or a message already in Google format that we can
|
||||
# use directly
|
||||
if isinstance(message, LLMSpecificMessage):
|
||||
if (
|
||||
isinstance(message.message, dict)
|
||||
and message.message.get("type") == "thought_signature"
|
||||
):
|
||||
thought_signature_dicts.append(message.message)
|
||||
continue
|
||||
|
||||
# Fall back to assuming that the message is already in Google
|
||||
# format
|
||||
messages.append(message.message)
|
||||
continue
|
||||
|
||||
# We have a standard universal context message; convert it to
|
||||
# Google format
|
||||
result = self._from_standard_message(
|
||||
message,
|
||||
params=self.MessageConversionParams(
|
||||
already_have_system_instruction=bool(system_instruction),
|
||||
tool_call_id_to_name_mapping=tool_call_id_to_name_mapping,
|
||||
),
|
||||
)
|
||||
|
||||
# Each result is either a Content or a system instruction
|
||||
if result.content:
|
||||
messages.append(result.content)
|
||||
@@ -229,6 +252,9 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
if result.tool_call_id_to_name_mapping:
|
||||
tool_call_id_to_name_mapping.update(result.tool_call_id_to_name_mapping)
|
||||
|
||||
# Apply thought signatures to the corresponding messages
|
||||
self._apply_thought_signatures_to_messages(thought_signature_dicts, messages)
|
||||
|
||||
# Check if we only have function-related messages (no regular text)
|
||||
has_regular_messages = any(
|
||||
len(msg.parts) == 1
|
||||
@@ -247,13 +273,6 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
|
||||
return self.ConvertedMessages(messages=messages, system_instruction=system_instruction)
|
||||
|
||||
def _from_universal_context_message(
|
||||
self, message: LLMContextMessage, *, params: MessageConversionParams
|
||||
) -> MessageConversionResult:
|
||||
if isinstance(message, LLMSpecificMessage):
|
||||
return self.MessageConversionResult(content=message.message)
|
||||
return self._from_standard_message(message, params=params)
|
||||
|
||||
def _from_standard_message(
|
||||
self, message: LLMStandardMessage, *, params: MessageConversionParams
|
||||
) -> MessageConversionResult:
|
||||
@@ -380,11 +399,14 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
if c["type"] == "text":
|
||||
parts.append(Part(text=c["text"]))
|
||||
elif c["type"] == "image_url" and c["image_url"]["url"].startswith("data:"):
|
||||
# Extract MIME type from data URL (format: "data:image/jpeg;base64,...")
|
||||
url = c["image_url"]["url"]
|
||||
mime_type = url.split(":")[1].split(";")[0]
|
||||
parts.append(
|
||||
Part(
|
||||
inline_data=Blob(
|
||||
mime_type="image/jpeg",
|
||||
data=base64.b64decode(c["image_url"]["url"].split(",")[1]),
|
||||
mime_type=mime_type,
|
||||
data=base64.b64decode(url.split(",")[1]),
|
||||
)
|
||||
)
|
||||
)
|
||||
@@ -410,3 +432,139 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
content=Content(role=role, parts=parts),
|
||||
tool_call_id_to_name_mapping=tool_call_id_to_name_mapping,
|
||||
)
|
||||
|
||||
def _apply_thought_signatures_to_messages(
|
||||
self, thought_signature_dicts: List[dict], messages: List[Content]
|
||||
) -> None:
|
||||
"""Apply thought signatures to corresponding assistant messages.
|
||||
|
||||
See GoogleLLMService for more details about thought signatures.
|
||||
|
||||
Args:
|
||||
thought_signature_dicts: A list of dicts containing:
|
||||
- "signature": a thought signature
|
||||
- "bookmark": a bookmark to identify the message part to apply the signature to.
|
||||
The bookmark may contain one of:
|
||||
- "function_call" (a function call ID string)
|
||||
- "text" (a text string)
|
||||
- "inline_data" (a Blob)
|
||||
The list of thought signature dicts is in order.
|
||||
messages: List of messages to apply the thought signatures to.
|
||||
"""
|
||||
if not thought_signature_dicts:
|
||||
return
|
||||
|
||||
# For debugging, print out thought signatures and their bookmarks
|
||||
logger.debug(f"Thought signatures to apply: {len(thought_signature_dicts)}")
|
||||
for ts in thought_signature_dicts:
|
||||
bookmark = ts.get("bookmark")
|
||||
if bookmark.get("function_call"):
|
||||
logger.trace(f" - To function call: {bookmark['function_call']}")
|
||||
elif bookmark.get("text"):
|
||||
text = bookmark["text"]
|
||||
log_display_text = f"{text[:50]}..." if len(text) > 50 else text
|
||||
logger.trace(f" - To text: {log_display_text}")
|
||||
elif bookmark.get("inline_data"):
|
||||
logger.trace(f" - To inline data")
|
||||
|
||||
# Get all assistant messages
|
||||
assistant_messages = [
|
||||
message
|
||||
for message in messages
|
||||
if isinstance(message, Content) and message.role == "model"
|
||||
]
|
||||
|
||||
# Apply thought signatures to the corresponding assistant messages.
|
||||
# Thought signatures are already in message order.
|
||||
thought_signatures_applied = 0
|
||||
message_start_index = 0 # Track where to start searching for the next matching message.
|
||||
for thought_signature_dict in thought_signature_dicts:
|
||||
signature = thought_signature_dict.get("signature")
|
||||
bookmark = thought_signature_dict.get("bookmark")
|
||||
if not signature or not bookmark:
|
||||
continue
|
||||
|
||||
# Search through remaining assistant messages for a match
|
||||
for i in range(message_start_index, len(assistant_messages)):
|
||||
message = assistant_messages[i]
|
||||
if not message.parts:
|
||||
continue
|
||||
|
||||
# We're assuming that the thought signature always applies to the last part
|
||||
last_part = message.parts[-1]
|
||||
|
||||
# If the bookmark matches the part...
|
||||
if self._thought_signature_bookmark_matches_part(bookmark, last_part):
|
||||
# Apply the thought signature
|
||||
last_part.thought_signature = signature
|
||||
thought_signatures_applied += 1
|
||||
|
||||
# Update the start index and stop searching for a match
|
||||
message_start_index = i + 1
|
||||
break
|
||||
|
||||
# For debugging, print out how many thought signatures were applied
|
||||
logger.debug(f"Applied {thought_signatures_applied} thought signatures.")
|
||||
|
||||
def _thought_signature_bookmark_matches_part(self, bookmark: dict, part: Part) -> bool:
|
||||
if function_call_bookmark := bookmark.get("function_call"):
|
||||
return self._thought_signature_function_call_bookmark_matches_part(
|
||||
function_call_bookmark, part
|
||||
)
|
||||
elif text_bookmark := bookmark.get("text"):
|
||||
return self._thought_signature_text_bookmark_matches_part(text_bookmark, part)
|
||||
elif inline_data := bookmark.get("inline_data"):
|
||||
return self._thought_signature_inline_data_bookmark_matches_part(inline_data, part)
|
||||
else:
|
||||
logger.warning(f"Unknown thought signature bookmark type: {bookmark}")
|
||||
|
||||
return False
|
||||
|
||||
def _thought_signature_function_call_bookmark_matches_part(
|
||||
self, bookmark_function_call_id: str, part: Part
|
||||
) -> bool:
|
||||
if (
|
||||
hasattr(part, "function_call")
|
||||
and part.function_call
|
||||
and part.function_call.id == bookmark_function_call_id
|
||||
):
|
||||
logger.trace(f"Thought signature function call match: {bookmark_function_call_id}")
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _thought_signature_text_bookmark_matches_part(self, bookmark_text: str, part: Part) -> bool:
|
||||
if hasattr(part, "text") and part.text:
|
||||
# Normalize whitespace for comparison
|
||||
bookmark_text = " ".join(bookmark_text.split())
|
||||
part_text = " ".join(part.text.split())
|
||||
# Check that either:
|
||||
# - the part text is the same as the bookmark text
|
||||
# - a prefix of the bookmark text (in case the part text was truncated due to interruption)
|
||||
# - the bookmark text is a prefix of the part text (in case the bookmark represents just first chunk of multi-chunk text)
|
||||
if (
|
||||
part_text == bookmark_text
|
||||
or bookmark_text.startswith(part_text)
|
||||
or part_text.startswith(bookmark_text)
|
||||
):
|
||||
log_display_text = f"{part.text[:50]}..." if len(part.text) > 50 else part.text
|
||||
logger.trace(f"Thought signature text match: {log_display_text}")
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _thought_signature_inline_data_bookmark_matches_part(
|
||||
self, bookmark_inline_data: Blob, part: Part
|
||||
) -> bool:
|
||||
if (
|
||||
hasattr(part, "inline_data")
|
||||
and part.inline_data
|
||||
# Comparing length should be good enough for matching inline data,
|
||||
# especially since we're already matching thought signatures in
|
||||
# strict message order. Comparing actual data is expensive.
|
||||
and len(part.inline_data.data) == len(bookmark_inline_data.data)
|
||||
):
|
||||
logger.trace(f"Thought signature inline data match")
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
@@ -39,7 +39,7 @@ class AICFilter(BaseAudioFilter):
|
||||
self,
|
||||
*,
|
||||
license_key: str = "",
|
||||
model_type: AICModelType = AICModelType.QUAIL_L,
|
||||
model_type: AICModelType = AICModelType.QUAIL_STT,
|
||||
enhancement_level: Optional[float] = 1.0,
|
||||
voice_gain: Optional[float] = 1.0,
|
||||
noise_gate_enable: Optional[bool] = True,
|
||||
@@ -52,12 +52,27 @@ class AICFilter(BaseAudioFilter):
|
||||
enhancement_level: Optional overall enhancement strength (0.0..1.0).
|
||||
voice_gain: Optional linear gain applied to detected speech (0.0..4.0).
|
||||
noise_gate_enable: Optional enable/disable noise gate (default: True).
|
||||
|
||||
.. deprecated:: 1.3.0
|
||||
The `noise_gate_enable` parameter is deprecated and no longer has any effect.
|
||||
It will be removed in a future version.
|
||||
"""
|
||||
self._license_key = license_key
|
||||
self._model_type = model_type
|
||||
|
||||
self._enhancement_level = enhancement_level
|
||||
self._voice_gain = voice_gain
|
||||
if noise_gate_enable is not None:
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"Parameter `noise_gate_enable` is deprecated and no longer has any effect. "
|
||||
"It will be removed in a future version. Use AIC VAD instead (create_vad_analyzer()).",
|
||||
DeprecationWarning,
|
||||
)
|
||||
|
||||
self._noise_gate_enable = noise_gate_enable
|
||||
|
||||
self._enabled = True
|
||||
@@ -149,10 +164,6 @@ class AICFilter(BaseAudioFilter):
|
||||
)
|
||||
if self._voice_gain is not None:
|
||||
self._aic.set_parameter(AICParameter.VOICE_GAIN, float(self._voice_gain))
|
||||
if self._noise_gate_enable is not None:
|
||||
self._aic.set_parameter(
|
||||
AICParameter.NOISE_GATE_ENABLE, 1.0 if bool(self._noise_gate_enable) else 0.0
|
||||
)
|
||||
|
||||
self._aic_ready = True
|
||||
|
||||
|
||||
@@ -28,7 +28,6 @@ from pipecat.metrics.metrics import MetricsData, SmartTurnMetricsData
|
||||
STOP_SECS = 3
|
||||
PRE_SPEECH_MS = 0
|
||||
MAX_DURATION_SECONDS = 8 # Max allowed segment duration
|
||||
USE_ONLY_LAST_VAD_SEGMENT = True
|
||||
|
||||
|
||||
class SmartTurnParams(BaseTurnParams):
|
||||
@@ -43,8 +42,6 @@ class SmartTurnParams(BaseTurnParams):
|
||||
stop_secs: float = STOP_SECS
|
||||
pre_speech_ms: float = PRE_SPEECH_MS
|
||||
max_duration_secs: float = MAX_DURATION_SECONDS
|
||||
# not exposing this for now yet until the model can handle it.
|
||||
# use_only_last_vad_segment: bool = USE_ONLY_LAST_VAD_SEGMENT
|
||||
|
||||
|
||||
class SmartTurnTimeoutException(Exception):
|
||||
@@ -160,7 +157,7 @@ class BaseSmartTurn(BaseTurnAnalyzer):
|
||||
state, result = await loop.run_in_executor(
|
||||
self._executor, self._process_speech_segment, self._audio_buffer
|
||||
)
|
||||
if state == EndOfTurnState.COMPLETE or USE_ONLY_LAST_VAD_SEGMENT:
|
||||
if state == EndOfTurnState.COMPLETE:
|
||||
self._clear(state)
|
||||
logger.debug(f"End of Turn result: {state}")
|
||||
return state, result
|
||||
|
||||
Binary file not shown.
@@ -14,6 +14,7 @@ Note: To learn more about the smart-turn model, visit:
|
||||
- https://github.com/pipecat-ai/smart-turn
|
||||
"""
|
||||
|
||||
import warnings
|
||||
from typing import Optional
|
||||
|
||||
import aiohttp
|
||||
@@ -26,6 +27,10 @@ class FalSmartTurnAnalyzer(HttpSmartTurnAnalyzer):
|
||||
|
||||
Extends HttpSmartTurnAnalyzer to provide integration with Fal.ai's
|
||||
smart turn detection API endpoint with proper authentication.
|
||||
|
||||
.. deprecated:: 0.98.0
|
||||
FalSmartTurnAnalyzer is deprecated and will be removed in a future version.
|
||||
Use LocalSmartTurnAnalyzerV3 instead.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -48,3 +53,12 @@ class FalSmartTurnAnalyzer(HttpSmartTurnAnalyzer):
|
||||
if api_key:
|
||||
headers = {"Authorization": f"Key {api_key}"}
|
||||
super().__init__(url=url, aiohttp_session=aiohttp_session, headers=headers, **kwargs)
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"FalSmartTurnAnalyzer is deprecated and will be removed in a future version. "
|
||||
"Use LocalSmartTurnAnalyzerV3 instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
@@ -10,6 +10,7 @@ This module provides a smart turn analyzer that uses PyTorch models for
|
||||
local end-of-turn detection without requiring network connectivity.
|
||||
"""
|
||||
|
||||
import warnings
|
||||
from typing import Any, Dict
|
||||
|
||||
import numpy as np
|
||||
@@ -34,6 +35,10 @@ class LocalSmartTurnAnalyzer(BaseSmartTurn):
|
||||
Provides end-of-turn detection using locally-stored PyTorch models,
|
||||
enabling offline operation without network dependencies. Uses
|
||||
Wav2Vec2-BERT architecture for audio sequence classification.
|
||||
|
||||
.. deprecated:: 0.98.0
|
||||
LocalSmartTurnAnalyzer is deprecated and will be removed in a future version.
|
||||
Use LocalSmartTurnAnalyzerV3 instead.
|
||||
"""
|
||||
|
||||
def __init__(self, *, smart_turn_model_path: str, **kwargs):
|
||||
@@ -46,6 +51,15 @@ class LocalSmartTurnAnalyzer(BaseSmartTurn):
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"LocalSmartTurnAnalyzer is deprecated and will be removed in a future version. "
|
||||
"Use LocalSmartTurnAnalyzerV3 instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
if not smart_turn_model_path:
|
||||
# Define the path to the pretrained model on Hugging Face
|
||||
smart_turn_model_path = "pipecat-ai/smart-turn"
|
||||
|
||||
@@ -42,17 +42,15 @@ class LocalSmartTurnAnalyzerV3(BaseSmartTurn):
|
||||
|
||||
Args:
|
||||
smart_turn_model_path: Path to the ONNX model file. If this is not
|
||||
set, the bundled smart-turn-v3.0 model will be used.
|
||||
set, the bundled smart-turn-v3.1-cpu model will be used.
|
||||
cpu_count: The number of CPUs to use for inference. Defaults to 1.
|
||||
**kwargs: Additional arguments passed to BaseSmartTurn.
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
|
||||
logger.debug("Loading Local Smart Turn v3 model...")
|
||||
|
||||
if not smart_turn_model_path:
|
||||
# Load bundled model
|
||||
model_name = "smart-turn-v3.0.onnx"
|
||||
model_name = "smart-turn-v3.1-cpu.onnx"
|
||||
package_path = "pipecat.audio.turn.smart_turn.data"
|
||||
|
||||
try:
|
||||
@@ -70,6 +68,8 @@ class LocalSmartTurnAnalyzerV3(BaseSmartTurn):
|
||||
impresources.files(package_path).joinpath(model_name)
|
||||
)
|
||||
|
||||
logger.debug(f"Loading Local Smart Turn v3.x model from {smart_turn_model_path}...")
|
||||
|
||||
so = ort.SessionOptions()
|
||||
so.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
|
||||
so.inter_op_num_threads = 1
|
||||
@@ -79,7 +79,7 @@ class LocalSmartTurnAnalyzerV3(BaseSmartTurn):
|
||||
self._feature_extractor = WhisperFeatureExtractor(chunk_length=8)
|
||||
self._session = ort.InferenceSession(smart_turn_model_path, sess_options=so)
|
||||
|
||||
logger.debug("Loaded Local Smart Turn v3")
|
||||
logger.debug("Loaded Local Smart Turn v3.x")
|
||||
|
||||
def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
||||
"""Predict end-of-turn using local ONNX model."""
|
||||
|
||||
@@ -18,8 +18,10 @@ from loguru import logger
|
||||
from pipecat.audio.dtmf.types import KeypadEntry
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import (
|
||||
EndFrame,
|
||||
Frame,
|
||||
LLMContextFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
LLMMessagesUpdateFrame,
|
||||
LLMTextFrame,
|
||||
OutputDTMFUrgentFrame,
|
||||
@@ -31,7 +33,11 @@ from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContextFrame
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.services.llm_service import LLMService
|
||||
from pipecat.utils.text.pattern_pair_aggregator import PatternMatch, PatternPairAggregator
|
||||
from pipecat.utils.text.pattern_pair_aggregator import (
|
||||
MatchAction,
|
||||
PatternMatch,
|
||||
PatternPairAggregator,
|
||||
)
|
||||
|
||||
|
||||
class IVRStatus(Enum):
|
||||
@@ -114,15 +120,15 @@ class IVRProcessor(FrameProcessor):
|
||||
def _setup_xml_patterns(self):
|
||||
"""Set up XML pattern detection and handlers."""
|
||||
# Register DTMF pattern
|
||||
self._aggregator.add_pattern_pair("dtmf", "<dtmf>", "</dtmf>", remove_match=True)
|
||||
self._aggregator.add_pattern("dtmf", "<dtmf>", "</dtmf>", action=MatchAction.REMOVE)
|
||||
self._aggregator.on_pattern_match("dtmf", self._handle_dtmf_action)
|
||||
|
||||
# Register mode pattern
|
||||
self._aggregator.add_pattern_pair("mode", "<mode>", "</mode>", remove_match=True)
|
||||
self._aggregator.add_pattern("mode", "<mode>", "</mode>", action=MatchAction.REMOVE)
|
||||
self._aggregator.on_pattern_match("mode", self._handle_mode_action)
|
||||
|
||||
# Register IVR pattern
|
||||
self._aggregator.add_pattern_pair("ivr", "<ivr>", "</ivr>", remove_match=True)
|
||||
self._aggregator.add_pattern("ivr", "<ivr>", "</ivr>", action=MatchAction.REMOVE)
|
||||
self._aggregator.on_pattern_match("ivr", self._handle_ivr_action)
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
@@ -145,10 +151,17 @@ class IVRProcessor(FrameProcessor):
|
||||
|
||||
elif isinstance(frame, LLMTextFrame):
|
||||
# Process text through the pattern aggregator
|
||||
result = await self._aggregator.aggregate(frame.text)
|
||||
if result:
|
||||
async for result in self._aggregator.aggregate(frame.text):
|
||||
# Push aggregated text that doesn't contain XML patterns
|
||||
await self.push_frame(LLMTextFrame(result), direction)
|
||||
await self.push_frame(LLMTextFrame(result.text), direction)
|
||||
|
||||
elif isinstance(frame, (LLMFullResponseEndFrame, EndFrame)):
|
||||
# Flush any remaining text from the aggregator
|
||||
remaining = await self._aggregator.flush()
|
||||
if remaining:
|
||||
await self.push_frame(LLMTextFrame(remaining.text), direction)
|
||||
# Push the end frame
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
@@ -159,7 +172,7 @@ class IVRProcessor(FrameProcessor):
|
||||
Args:
|
||||
match: The pattern match containing DTMF content.
|
||||
"""
|
||||
value = match.content
|
||||
value = match.text
|
||||
logger.debug(f"DTMF detected: {value}")
|
||||
|
||||
try:
|
||||
@@ -180,7 +193,7 @@ class IVRProcessor(FrameProcessor):
|
||||
Args:
|
||||
match: The pattern match containing IVR status content.
|
||||
"""
|
||||
status = match.content
|
||||
status = match.text
|
||||
logger.trace(f"IVR status detected: {status}")
|
||||
|
||||
# Convert string to enum, with validation
|
||||
@@ -211,7 +224,7 @@ class IVRProcessor(FrameProcessor):
|
||||
Args:
|
||||
match: The pattern match containing mode content.
|
||||
"""
|
||||
mode = match.content
|
||||
mode = match.text
|
||||
logger.debug(f"Mode detected: {mode}")
|
||||
if mode == "conversation":
|
||||
await self._handle_conversation()
|
||||
|
||||
@@ -40,8 +40,8 @@ from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor, FrameProcessorSetup
|
||||
from pipecat.services.llm_service import LLMService
|
||||
from pipecat.sync.base_notifier import BaseNotifier
|
||||
from pipecat.sync.event_notifier import EventNotifier
|
||||
from pipecat.utils.sync.base_notifier import BaseNotifier
|
||||
from pipecat.utils.sync.event_notifier import EventNotifier
|
||||
|
||||
|
||||
class NotifierGate(FrameProcessor):
|
||||
@@ -252,7 +252,8 @@ class ClassificationProcessor(FrameProcessor):
|
||||
self._voicemail_notifier = voicemail_notifier
|
||||
self._voicemail_response_delay = voicemail_response_delay
|
||||
|
||||
# Register the voicemail detected event
|
||||
# Register the conversation and voicemail detected events
|
||||
self._register_event_handler("on_conversation_detected")
|
||||
self._register_event_handler("on_voicemail_detected")
|
||||
|
||||
# Aggregation state for collecting complete LLM responses
|
||||
@@ -350,6 +351,7 @@ class ClassificationProcessor(FrameProcessor):
|
||||
logger.info(f"{self}: CONVERSATION detected")
|
||||
await self._gate_notifier.notify() # Close the classifier gate
|
||||
await self._conversation_notifier.notify() # Release buffered TTS frames
|
||||
await self._call_event_handler("on_conversation_detected")
|
||||
|
||||
elif "VOICEMAIL" in response:
|
||||
# Voicemail detected - trigger voicemail handling
|
||||
@@ -539,6 +541,9 @@ class VoicemailDetector(ParallelPipeline):
|
||||
custom_prompt = "Your custom classification logic here. " + VoicemailDetector.CLASSIFIER_RESPONSE_INSTRUCTION
|
||||
|
||||
Events:
|
||||
on_conversation_detected: Triggered when a human conversation is detected. The
|
||||
event handler receives one argument: the ClassificationProcessor instance
|
||||
which can be used to push frames.
|
||||
on_voicemail_detected: Triggered when voicemail is detected after the configured
|
||||
delay. The event handler receives one argument: the ClassificationProcessor
|
||||
instance which can be used to push frames.
|
||||
@@ -701,7 +706,7 @@ VOICEMAIL SYSTEM (respond "VOICEMAIL"):
|
||||
event_name: The name of the event to handle.
|
||||
handler: The function to call when the event occurs.
|
||||
"""
|
||||
if event_name == "on_voicemail_detected":
|
||||
if event_name in ("on_conversation_detected", "on_voicemail_detected"):
|
||||
self._classification_processor.add_event_handler(event_name, handler)
|
||||
else:
|
||||
super().add_event_handler(event_name, handler)
|
||||
|
||||
@@ -12,6 +12,7 @@ and LLM processing.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
@@ -37,7 +38,7 @@ from pipecat.utils.time import nanoseconds_to_str
|
||||
from pipecat.utils.utils import obj_count, obj_id
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext, NotGiven
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext, LLMContextMessage, NotGiven
|
||||
from pipecat.processors.frame_processor import FrameProcessor
|
||||
|
||||
|
||||
@@ -185,6 +186,20 @@ class ControlFrame(Frame):
|
||||
#
|
||||
|
||||
|
||||
@dataclass
|
||||
class UninterruptibleFrame:
|
||||
"""A marker for data or control frames that must not be interrupted.
|
||||
|
||||
Frames with this mixin are still ordered normally, but unlike other frames,
|
||||
they are preserved during interruptions: they remain in internal queues and
|
||||
any task processing them will not be cancelled. This ensures the frame is
|
||||
always delivered and processed to completion.
|
||||
|
||||
"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class AudioRawFrame:
|
||||
"""A frame containing a chunk of raw audio.
|
||||
@@ -212,7 +227,7 @@ class ImageRawFrame:
|
||||
Parameters:
|
||||
image: Raw image bytes.
|
||||
size: Image dimensions as (width, height) tuple.
|
||||
format: Image format (e.g., 'JPEG', 'PNG').
|
||||
format: Image format (e.g., 'RGB', 'RGBA').
|
||||
"""
|
||||
|
||||
image: bytes
|
||||
@@ -329,7 +344,7 @@ class TextFrame(DataFrame):
|
||||
"""
|
||||
|
||||
text: str
|
||||
skip_tts: bool = field(init=False)
|
||||
skip_tts: Optional[bool] = field(init=False)
|
||||
# Whether any necessary inter-frame (leading/trailing) spaces are already
|
||||
# included in the text.
|
||||
# NOTE: Ideally this would be available at init time with a default value,
|
||||
@@ -337,11 +352,14 @@ class TextFrame(DataFrame):
|
||||
# mandatory fields of theirs to have defaults to preserve
|
||||
# non-default-before-default argument order)
|
||||
includes_inter_frame_spaces: bool = field(init=False)
|
||||
# Whether this text frame should be appended to the LLM context.
|
||||
append_to_context: bool = field(init=False)
|
||||
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
self.skip_tts = False
|
||||
self.skip_tts = None
|
||||
self.includes_inter_frame_spaces = False
|
||||
self.append_to_context = True
|
||||
|
||||
def __str__(self):
|
||||
pts = format_pts(self.pts)
|
||||
@@ -352,11 +370,45 @@ class TextFrame(DataFrame):
|
||||
class LLMTextFrame(TextFrame):
|
||||
"""Text frame generated by LLM services."""
|
||||
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
# LLM services send text frames with all necessary spaces included
|
||||
self.includes_inter_frame_spaces = True
|
||||
|
||||
|
||||
class AggregationType(str, Enum):
|
||||
"""Built-in aggregation strings."""
|
||||
|
||||
SENTENCE = "sentence"
|
||||
WORD = "word"
|
||||
|
||||
def __str__(self):
|
||||
return self.value
|
||||
|
||||
|
||||
@dataclass
|
||||
class AggregatedTextFrame(TextFrame):
|
||||
"""Text frame representing an aggregation of TextFrames.
|
||||
|
||||
This frame contains multiple TextFrames aggregated together for processing
|
||||
or output along with a field to indicate how they are aggregated.
|
||||
|
||||
Parameters:
|
||||
aggregated_by: Method used to aggregate the text frames.
|
||||
"""
|
||||
|
||||
aggregated_by: AggregationType | str
|
||||
|
||||
|
||||
@dataclass
|
||||
class VisionTextFrame(LLMTextFrame):
|
||||
"""Text frame generated by vision services."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class TTSTextFrame(TextFrame):
|
||||
class TTSTextFrame(AggregatedTextFrame):
|
||||
"""Text frame generated by Text-to-Speech services."""
|
||||
|
||||
pass
|
||||
@@ -467,6 +519,15 @@ class TranscriptionMessage:
|
||||
timestamp: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class ThoughtTranscriptionMessage:
|
||||
"""An LLM thought message in a conversation transcript."""
|
||||
|
||||
role: Literal["assistant"] = field(default="assistant", init=False)
|
||||
content: str
|
||||
timestamp: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class TranscriptionUpdateFrame(DataFrame):
|
||||
"""Frame containing new messages added to conversation transcript.
|
||||
@@ -511,7 +572,7 @@ class TranscriptionUpdateFrame(DataFrame):
|
||||
messages: List of new transcript messages that were added.
|
||||
"""
|
||||
|
||||
messages: List[TranscriptionMessage]
|
||||
messages: List[TranscriptionMessage | ThoughtTranscriptionMessage]
|
||||
|
||||
def __str__(self):
|
||||
pts = format_pts(self.pts)
|
||||
@@ -532,6 +593,75 @@ class LLMContextFrame(Frame):
|
||||
context: "LLMContext"
|
||||
|
||||
|
||||
@dataclass
|
||||
class LLMThoughtStartFrame(ControlFrame):
|
||||
"""Frame indicating the start of an LLM thought.
|
||||
|
||||
Parameters:
|
||||
append_to_context: Whether the thought should be appended to the LLM context.
|
||||
If it is appended, the `llm` field is required, since it will be
|
||||
appended as an `LLMSpecificMessage`.
|
||||
llm: Optional identifier of the LLM provider for LLM-specific handling.
|
||||
Only required if `append_to_context` is True, as the thought is
|
||||
appended to context as an `LLMSpecificMessage`.
|
||||
"""
|
||||
|
||||
append_to_context: bool = False
|
||||
llm: Optional[str] = None
|
||||
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
if self.append_to_context and self.llm is None:
|
||||
raise ValueError("When append_to_context is True, llm must be set")
|
||||
|
||||
def __str__(self):
|
||||
pts = format_pts(self.pts)
|
||||
return (
|
||||
f"{self.name}(pts: {pts}, append_to_context: {self.append_to_context}, llm: {self.llm})"
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class LLMThoughtTextFrame(DataFrame):
|
||||
"""Frame containing the text (or text chunk) of an LLM thought.
|
||||
|
||||
Note that despite this containing text, it is a DataFrame and not a
|
||||
TextFrame, to avoid most typical text processing, such as TTS.
|
||||
|
||||
Parameters:
|
||||
text: The text (or text chunk) of the thought.
|
||||
"""
|
||||
|
||||
text: str
|
||||
includes_inter_frame_spaces: bool = field(init=False)
|
||||
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
# Assume that thought text chunks include all necessary spaces
|
||||
self.includes_inter_frame_spaces = True
|
||||
|
||||
def __str__(self):
|
||||
pts = format_pts(self.pts)
|
||||
return f"{self.name}(pts: {pts}, thought text: {self.text})"
|
||||
|
||||
|
||||
@dataclass
|
||||
class LLMThoughtEndFrame(ControlFrame):
|
||||
"""Frame indicating the end of an LLM thought.
|
||||
|
||||
Parameters:
|
||||
signature: Optional signature associated with the thought.
|
||||
This is used by Anthropic, which includes a signature at the end of
|
||||
each thought.
|
||||
"""
|
||||
|
||||
signature: Any = None
|
||||
|
||||
def __str__(self):
|
||||
pts = format_pts(self.pts)
|
||||
return f"{self.name}(pts: {pts}, signature: {self.signature})"
|
||||
|
||||
|
||||
@dataclass
|
||||
class LLMMessagesFrame(DataFrame):
|
||||
"""Frame containing LLM messages for chat completion.
|
||||
@@ -665,6 +795,44 @@ class LLMConfigureOutputFrame(DataFrame):
|
||||
skip_tts: bool
|
||||
|
||||
|
||||
@dataclass
|
||||
class FunctionCallResultProperties:
|
||||
"""Properties for configuring function call result behavior.
|
||||
|
||||
Parameters:
|
||||
run_llm: Whether to run the LLM after receiving this result.
|
||||
on_context_updated: Callback to execute when context is updated.
|
||||
"""
|
||||
|
||||
run_llm: Optional[bool] = None
|
||||
on_context_updated: Optional[Callable[[], Awaitable[None]]] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class FunctionCallResultFrame(DataFrame, UninterruptibleFrame):
|
||||
"""Frame containing the result of an LLM function call.
|
||||
|
||||
This is an uninterruptible frame because once a result is generated we
|
||||
always want to update the context.
|
||||
|
||||
Parameters:
|
||||
function_name: Name of the function that was executed.
|
||||
tool_call_id: Unique identifier for the function call.
|
||||
arguments: Arguments that were passed to the function.
|
||||
result: The result returned by the function.
|
||||
run_llm: Whether to run the LLM after this result.
|
||||
properties: Additional properties for result handling.
|
||||
|
||||
"""
|
||||
|
||||
function_name: str
|
||||
tool_call_id: str
|
||||
arguments: Any
|
||||
result: Any
|
||||
run_llm: Optional[bool] = None
|
||||
properties: Optional[FunctionCallResultProperties] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class TTSSpeakFrame(DataFrame):
|
||||
"""Frame containing text that should be spoken by TTS.
|
||||
@@ -786,7 +954,7 @@ class CancelFrame(SystemFrame):
|
||||
reason: Optional reason for pushing a cancel frame.
|
||||
"""
|
||||
|
||||
reason: Optional[str] = None
|
||||
reason: Optional[Any] = None
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.name}(reason: {self.reason})"
|
||||
@@ -804,11 +972,13 @@ class ErrorFrame(SystemFrame):
|
||||
error: Description of the error that occurred.
|
||||
fatal: Whether the error is fatal and requires bot shutdown.
|
||||
processor: The frame processor that generated the error.
|
||||
exception: The exception that occurred.
|
||||
"""
|
||||
|
||||
error: str
|
||||
fatal: bool = False
|
||||
processor: Optional["FrameProcessor"] = None
|
||||
exception: Optional[Exception] = None
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.name}(error: {self.error}, fatal: {self.fatal})"
|
||||
@@ -1056,23 +1226,6 @@ class FunctionCallsStartedFrame(SystemFrame):
|
||||
function_calls: Sequence[FunctionCallFromLLM]
|
||||
|
||||
|
||||
@dataclass
|
||||
class FunctionCallInProgressFrame(SystemFrame):
|
||||
"""Frame signaling that a function call is currently executing.
|
||||
|
||||
Parameters:
|
||||
function_name: Name of the function being executed.
|
||||
tool_call_id: Unique identifier for this function call.
|
||||
arguments: Arguments passed to the function.
|
||||
cancel_on_interruption: Whether to cancel this call if interrupted.
|
||||
"""
|
||||
|
||||
function_name: str
|
||||
tool_call_id: str
|
||||
arguments: Any
|
||||
cancel_on_interruption: bool = False
|
||||
|
||||
|
||||
@dataclass
|
||||
class FunctionCallCancelFrame(SystemFrame):
|
||||
"""Frame signaling that a function call has been cancelled.
|
||||
@@ -1086,40 +1239,6 @@ class FunctionCallCancelFrame(SystemFrame):
|
||||
tool_call_id: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class FunctionCallResultProperties:
|
||||
"""Properties for configuring function call result behavior.
|
||||
|
||||
Parameters:
|
||||
run_llm: Whether to run the LLM after receiving this result.
|
||||
on_context_updated: Callback to execute when context is updated.
|
||||
"""
|
||||
|
||||
run_llm: Optional[bool] = None
|
||||
on_context_updated: Optional[Callable[[], Awaitable[None]]] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class FunctionCallResultFrame(SystemFrame):
|
||||
"""Frame containing the result of an LLM function call.
|
||||
|
||||
Parameters:
|
||||
function_name: Name of the function that was executed.
|
||||
tool_call_id: Unique identifier for the function call.
|
||||
arguments: Arguments that were passed to the function.
|
||||
result: The result returned by the function.
|
||||
run_llm: Whether to run the LLM after this result.
|
||||
properties: Additional properties for result handling.
|
||||
"""
|
||||
|
||||
function_name: str
|
||||
tool_call_id: str
|
||||
arguments: Any
|
||||
result: Any
|
||||
run_llm: Optional[bool] = None
|
||||
properties: Optional[FunctionCallResultProperties] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class STTMuteFrame(SystemFrame):
|
||||
"""Frame to mute/unmute the Speech-to-Text service.
|
||||
@@ -1354,6 +1473,23 @@ class UserImageRawFrame(InputImageRawFrame):
|
||||
return f"{self.name}(pts: {pts}, user: {self.user_id}, source: {self.transport_source}, size: {self.size}, format: {self.format}, text: {self.text}, append_to_context: {self.append_to_context})"
|
||||
|
||||
|
||||
@dataclass
|
||||
class AssistantImageRawFrame(OutputImageRawFrame):
|
||||
"""Frame containing an image generated by the assistant.
|
||||
|
||||
Contains both the raw frame for display (superclass functionality) as well
|
||||
as the original image, which can get used directly in LLM contexts.
|
||||
|
||||
Parameters:
|
||||
original_data: The original image data, which can get used directly in
|
||||
an LLM context message without further encoding.
|
||||
original_mime_type: The MIME type of the original image data.
|
||||
"""
|
||||
|
||||
original_data: Optional[bytes] = None
|
||||
original_mime_type: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class InputDTMFFrame(DTMFFrame, SystemFrame):
|
||||
"""DTMF keypress input frame from transport."""
|
||||
@@ -1421,7 +1557,7 @@ class EndTaskFrame(TaskFrame):
|
||||
reason: Optional reason for pushing an end frame.
|
||||
"""
|
||||
|
||||
reason: Optional[str] = None
|
||||
reason: Optional[Any] = None
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.name}(reason: {self.reason})"
|
||||
@@ -1439,7 +1575,7 @@ class CancelTaskFrame(TaskFrame):
|
||||
reason: Optional reason for pushing a cancel frame.
|
||||
"""
|
||||
|
||||
reason: Optional[str] = None
|
||||
reason: Optional[Any] = None
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.name}(reason: {self.reason})"
|
||||
@@ -1518,7 +1654,7 @@ class EndFrame(ControlFrame):
|
||||
reason: Optional reason for pushing an end frame.
|
||||
"""
|
||||
|
||||
reason: Optional[str] = None
|
||||
reason: Optional[Any] = None
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.name}(reason: {self.reason})"
|
||||
@@ -1599,22 +1735,61 @@ class LLMFullResponseStartFrame(ControlFrame):
|
||||
more TextFrames and a final LLMFullResponseEndFrame.
|
||||
"""
|
||||
|
||||
skip_tts: bool = field(init=False)
|
||||
skip_tts: Optional[bool] = field(init=False)
|
||||
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
self.skip_tts = False
|
||||
self.skip_tts = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class LLMFullResponseEndFrame(ControlFrame):
|
||||
"""Frame indicating the end of an LLM response."""
|
||||
|
||||
skip_tts: bool = field(init=False)
|
||||
skip_tts: Optional[bool] = field(init=False)
|
||||
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
self.skip_tts = False
|
||||
self.skip_tts = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class FunctionCallInProgressFrame(ControlFrame, UninterruptibleFrame):
|
||||
"""Frame signaling that a function call is currently executing.
|
||||
|
||||
This is an uninterruptible frame because we always want to update the
|
||||
context.
|
||||
|
||||
Parameters:
|
||||
function_name: Name of the function being executed.
|
||||
tool_call_id: Unique identifier for this function call.
|
||||
arguments: Arguments passed to the function.
|
||||
cancel_on_interruption: Whether to cancel this call if interrupted.
|
||||
"""
|
||||
|
||||
function_name: str
|
||||
tool_call_id: str
|
||||
arguments: Any
|
||||
cancel_on_interruption: bool = False
|
||||
|
||||
|
||||
@dataclass
|
||||
class VisionFullResponseStartFrame(LLMFullResponseStartFrame):
|
||||
"""Frame indicating the beginning of a vision model response.
|
||||
|
||||
Used to indicate the beginning of a vision model response. Followed by one
|
||||
or more VisionTextFrames and a final VisionFullResponseEndFrame.
|
||||
|
||||
"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class VisionFullResponseEndFrame(LLMFullResponseEndFrame):
|
||||
"""Frame indicating the end of a Vision model response."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@@ -15,8 +15,8 @@ from pipecat.frames.frames import (
|
||||
BotStartedSpeakingFrame,
|
||||
CancelFrame,
|
||||
EndFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
VADUserStartedSpeakingFrame,
|
||||
VADUserStoppedSpeakingFrame,
|
||||
)
|
||||
from pipecat.observers.base_observer import BaseObserver, FramePushed
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
@@ -36,7 +36,7 @@ class UserBotLatencyLogObserver(BaseObserver):
|
||||
to calculate response latencies.
|
||||
"""
|
||||
super().__init__()
|
||||
self._processed_frames = set()
|
||||
self._user_bot_latency_processed_frames = set()
|
||||
self._user_stopped_time = 0
|
||||
self._latencies = []
|
||||
|
||||
@@ -51,14 +51,14 @@ class UserBotLatencyLogObserver(BaseObserver):
|
||||
return
|
||||
|
||||
# Skip already processed frames
|
||||
if data.frame.id in self._processed_frames:
|
||||
if data.frame.id in self._user_bot_latency_processed_frames:
|
||||
return
|
||||
|
||||
self._processed_frames.add(data.frame.id)
|
||||
self._user_bot_latency_processed_frames.add(data.frame.id)
|
||||
|
||||
if isinstance(data.frame, UserStartedSpeakingFrame):
|
||||
if isinstance(data.frame, VADUserStartedSpeakingFrame):
|
||||
self._user_stopped_time = 0
|
||||
elif isinstance(data.frame, UserStoppedSpeakingFrame):
|
||||
elif isinstance(data.frame, VADUserStoppedSpeakingFrame):
|
||||
self._user_stopped_time = time.time()
|
||||
elif isinstance(data.frame, (EndFrame, CancelFrame)):
|
||||
self._log_summary()
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
from pipecat.frames.frames import CancelFrame, EndFrame, Frame, LLMContextFrame, StartFrame
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContextFrame
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.sync.base_notifier import BaseNotifier
|
||||
from pipecat.utils.sync.base_notifier import BaseNotifier
|
||||
|
||||
|
||||
class GatedLLMContextAggregator(FrameProcessor):
|
||||
|
||||
@@ -14,6 +14,7 @@ translation from this universal context into whatever format it needs, using a
|
||||
service-specific adapter.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import io
|
||||
import wave
|
||||
@@ -137,7 +138,7 @@ class LLMContext:
|
||||
return {"role": role, "content": content}
|
||||
|
||||
@staticmethod
|
||||
def create_image_message(
|
||||
async def create_image_message(
|
||||
*,
|
||||
role: str = "user",
|
||||
format: str,
|
||||
@@ -149,20 +150,34 @@ class LLMContext:
|
||||
|
||||
Args:
|
||||
role: The role of this message (defaults to "user").
|
||||
format: Image format (e.g., 'RGB', 'RGBA').
|
||||
format: Image format (e.g., 'RGB', 'RGBA', or, if already encoded,
|
||||
the MIME type like 'image/jpeg').
|
||||
size: Image dimensions as (width, height) tuple.
|
||||
image: Raw image bytes.
|
||||
text: Optional text to include with the image.
|
||||
"""
|
||||
buffer = io.BytesIO()
|
||||
Image.frombytes(format, size, image).save(buffer, format="JPEG")
|
||||
encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
|
||||
url = f"data:image/jpeg;base64,{encoded_image}"
|
||||
# Format is a mime type: image is already encoded
|
||||
image_already_encoded = format.startswith("image/")
|
||||
|
||||
def encode_image():
|
||||
if image_already_encoded:
|
||||
bytes = image
|
||||
else:
|
||||
# Encode to JPEG
|
||||
buffer = io.BytesIO()
|
||||
Image.frombytes(format, size, image).save(buffer, format="JPEG")
|
||||
bytes = buffer.getvalue()
|
||||
encoded_image = base64.b64encode(bytes).decode("utf-8")
|
||||
return encoded_image
|
||||
|
||||
encoded_image = await asyncio.to_thread(encode_image)
|
||||
|
||||
url = f"data:{format if image_already_encoded else 'image/jpeg'};base64,{encoded_image}"
|
||||
|
||||
return LLMContext.create_image_url_message(role=role, url=url, text=text)
|
||||
|
||||
@staticmethod
|
||||
def create_audio_message(
|
||||
async def create_audio_message(
|
||||
*, role: str = "user", audio_frames: list[AudioRawFrame], text: str = "Audio follows"
|
||||
) -> LLMContextMessage:
|
||||
"""Create a context message containing audio.
|
||||
@@ -172,21 +187,25 @@ class LLMContext:
|
||||
audio_frames: List of audio frame objects to include.
|
||||
text: Optional text to include with the audio.
|
||||
"""
|
||||
sample_rate = audio_frames[0].sample_rate
|
||||
num_channels = audio_frames[0].num_channels
|
||||
content = [{"type": "text", "text": text}]
|
||||
|
||||
content = []
|
||||
content.append({"type": "text", "text": text})
|
||||
data = b"".join(frame.audio for frame in audio_frames)
|
||||
async def encode_audio():
|
||||
sample_rate = audio_frames[0].sample_rate
|
||||
num_channels = audio_frames[0].num_channels
|
||||
|
||||
with io.BytesIO() as buffer:
|
||||
with wave.open(buffer, "wb") as wf:
|
||||
wf.setsampwidth(2)
|
||||
wf.setnchannels(num_channels)
|
||||
wf.setframerate(sample_rate)
|
||||
wf.writeframes(data)
|
||||
data = b"".join(frame.audio for frame in audio_frames)
|
||||
|
||||
encoded_audio = base64.b64encode(buffer.getvalue()).decode("utf-8")
|
||||
with io.BytesIO() as buffer:
|
||||
with wave.open(buffer, "wb") as wf:
|
||||
wf.setsampwidth(2)
|
||||
wf.setnchannels(num_channels)
|
||||
wf.setframerate(sample_rate)
|
||||
wf.writeframes(data)
|
||||
|
||||
encoded_audio = base64.b64encode(buffer.getvalue()).decode("utf-8")
|
||||
return encoded_audio
|
||||
|
||||
encoded_audio = await asyncio.to_thread(encode_audio)
|
||||
|
||||
content.append(
|
||||
{
|
||||
@@ -321,21 +340,31 @@ class LLMContext:
|
||||
"""
|
||||
self._tool_choice = tool_choice
|
||||
|
||||
def add_image_frame_message(
|
||||
self, *, format: str, size: tuple[int, int], image: bytes, text: Optional[str] = None
|
||||
async def add_image_frame_message(
|
||||
self,
|
||||
*,
|
||||
format: str,
|
||||
size: tuple[int, int],
|
||||
image: bytes,
|
||||
text: Optional[str] = None,
|
||||
role: str = "user",
|
||||
):
|
||||
"""Add a message containing an image frame.
|
||||
|
||||
Args:
|
||||
format: Image format (e.g., 'RGB', 'RGBA').
|
||||
format: Image format (e.g., 'RGB', 'RGBA', or, if already encoded,
|
||||
the MIME type like 'image/jpeg').
|
||||
size: Image dimensions as (width, height) tuple.
|
||||
image: Raw image bytes.
|
||||
text: Optional text to include with the image.
|
||||
role: The role of this message (defaults to "user").
|
||||
"""
|
||||
message = LLMContext.create_image_message(format=format, size=size, image=image, text=text)
|
||||
message = await LLMContext.create_image_message(
|
||||
role=role, format=format, size=size, image=image, text=text
|
||||
)
|
||||
self.add_message(message)
|
||||
|
||||
def add_audio_frames_message(
|
||||
async def add_audio_frames_message(
|
||||
self, *, audio_frames: list[AudioRawFrame], text: str = "Audio follows"
|
||||
):
|
||||
"""Add a message containing audio frames.
|
||||
@@ -344,7 +373,7 @@ class LLMContext:
|
||||
audio_frames: List of audio frame objects to include.
|
||||
text: Optional text to include with the audio.
|
||||
"""
|
||||
message = LLMContext.create_audio_message(audio_frames=audio_frames, text=text)
|
||||
message = await LLMContext.create_audio_message(audio_frames=audio_frames, text=text)
|
||||
self.add_message(message)
|
||||
|
||||
@staticmethod
|
||||
|
||||
@@ -1001,7 +1001,7 @@ class LLMAssistantContextAggregator(LLMContextResponseAggregator):
|
||||
await self.push_aggregation()
|
||||
|
||||
async def _handle_text(self, frame: TextFrame):
|
||||
if not self._started:
|
||||
if not self._started or not frame.append_to_context:
|
||||
return
|
||||
|
||||
if self._params.expect_stripped_words:
|
||||
|
||||
@@ -24,6 +24,7 @@ from pipecat.audio.interruptions.base_interruption_strategy import BaseInterrupt
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import (
|
||||
AssistantImageRawFrame,
|
||||
BotStartedSpeakingFrame,
|
||||
BotStoppedSpeakingFrame,
|
||||
CancelFrame,
|
||||
@@ -47,6 +48,9 @@ from pipecat.frames.frames import (
|
||||
LLMRunFrame,
|
||||
LLMSetToolChoiceFrame,
|
||||
LLMSetToolsFrame,
|
||||
LLMThoughtEndFrame,
|
||||
LLMThoughtStartFrame,
|
||||
LLMThoughtTextFrame,
|
||||
SpeechControlParamsFrame,
|
||||
StartFrame,
|
||||
TextFrame,
|
||||
@@ -66,7 +70,7 @@ from pipecat.processors.aggregators.llm_response import (
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.utils.string import concatenate_aggregated_text
|
||||
from pipecat.utils.string import TextPartForConcatenation, concatenate_aggregated_text
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
|
||||
|
||||
@@ -90,15 +94,7 @@ class LLMContextAggregator(FrameProcessor):
|
||||
self._context = context
|
||||
self._role = role
|
||||
|
||||
self._aggregation: List[str] = []
|
||||
|
||||
# Whether to add spaces between text parts.
|
||||
# (Currently only used by LLMAssistantAggregator, but could be expanded
|
||||
# to LLMUserAggregator in the future if needed; that would require
|
||||
# additional work since LLMUserAggregator currently trims spaces from
|
||||
# incoming frames before determining whether it "really" received any
|
||||
# text).
|
||||
self._add_spaces = True
|
||||
self._aggregation: List[TextPartForConcatenation] = []
|
||||
|
||||
@property
|
||||
def messages(self) -> List[LLMContextMessage]:
|
||||
@@ -191,7 +187,7 @@ class LLMContextAggregator(FrameProcessor):
|
||||
Returns:
|
||||
The concatenated aggregation string.
|
||||
"""
|
||||
return concatenate_aggregated_text(self._aggregation, self._add_spaces)
|
||||
return concatenate_aggregated_text(self._aggregation)
|
||||
|
||||
|
||||
class LLMUserAggregator(LLMContextAggregator):
|
||||
@@ -441,7 +437,12 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
if not text.strip():
|
||||
return
|
||||
|
||||
self._aggregation.append(text)
|
||||
# Transcriptions never include inter-part spaces (so far).
|
||||
self._aggregation.append(
|
||||
TextPartForConcatenation(
|
||||
text, includes_inter_part_spaces=frame.includes_inter_frame_spaces
|
||||
)
|
||||
)
|
||||
# We just got a final result, so let's reset interim results.
|
||||
self._seen_interim_results = False
|
||||
# Reset aggregation timer.
|
||||
@@ -595,6 +596,10 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
self._function_calls_in_progress: Dict[str, Optional[FunctionCallInProgressFrame]] = {}
|
||||
self._context_updated_tasks: Set[asyncio.Task] = set()
|
||||
|
||||
self._thought_aggregation_enabled = False
|
||||
self._thought_llm: str = ""
|
||||
self._thought_aggregation: List[TextPartForConcatenation] = []
|
||||
|
||||
@property
|
||||
def has_function_calls_in_progress(self) -> bool:
|
||||
"""Check if there are any function calls currently in progress.
|
||||
@@ -604,6 +609,17 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
"""
|
||||
return bool(self._function_calls_in_progress)
|
||||
|
||||
async def reset(self):
|
||||
"""Reset the aggregation state."""
|
||||
await super().reset()
|
||||
await self._reset_thought_aggregation() # Just to be safe
|
||||
|
||||
async def _reset_thought_aggregation(self):
|
||||
"""Reset the thought aggregation state."""
|
||||
self._thought_aggregation_enabled = False
|
||||
self._thought_llm = ""
|
||||
self._thought_aggregation = []
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process frames for assistant response aggregation and function call management.
|
||||
|
||||
@@ -622,6 +638,12 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
await self._handle_llm_end(frame)
|
||||
elif isinstance(frame, TextFrame):
|
||||
await self._handle_text(frame)
|
||||
elif isinstance(frame, LLMThoughtStartFrame):
|
||||
await self._handle_thought_start(frame)
|
||||
elif isinstance(frame, LLMThoughtTextFrame):
|
||||
await self._handle_thought_text(frame)
|
||||
elif isinstance(frame, LLMThoughtEndFrame):
|
||||
await self._handle_thought_end(frame)
|
||||
elif isinstance(frame, LLMRunFrame):
|
||||
await self._handle_llm_run(frame)
|
||||
elif isinstance(frame, LLMMessagesAppendFrame):
|
||||
@@ -642,6 +664,8 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
await self._handle_function_call_cancel(frame)
|
||||
elif isinstance(frame, UserImageRawFrame):
|
||||
await self._handle_user_image_frame(frame)
|
||||
elif isinstance(frame, AssistantImageRawFrame):
|
||||
await self._handle_assistant_image_frame(frame)
|
||||
elif isinstance(frame, BotStoppedSpeakingFrame):
|
||||
await self.push_aggregation()
|
||||
await self.push_frame(frame, direction)
|
||||
@@ -796,7 +820,7 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
|
||||
logger.debug(f"{self} Appending UserImageRawFrame to LLM context (size: {frame.size})")
|
||||
|
||||
self._context.add_image_frame_message(
|
||||
await self._context.add_image_frame_message(
|
||||
format=frame.format,
|
||||
size=frame.size,
|
||||
image=frame.image,
|
||||
@@ -806,6 +830,24 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
await self.push_aggregation()
|
||||
await self.push_context_frame(FrameDirection.UPSTREAM)
|
||||
|
||||
async def _handle_assistant_image_frame(self, frame: AssistantImageRawFrame):
|
||||
logger.debug(f"{self} Appending AssistantImageRawFrame to LLM context (size: {frame.size})")
|
||||
|
||||
if frame.original_data and frame.original_mime_type:
|
||||
await self._context.add_image_frame_message(
|
||||
format=frame.original_mime_type,
|
||||
size=frame.size, # Technically doesn't matter, since already encoded
|
||||
image=frame.original_data,
|
||||
role="assistant",
|
||||
)
|
||||
else:
|
||||
await self._context.add_image_frame_message(
|
||||
format=frame.format,
|
||||
size=frame.size,
|
||||
image=frame.image,
|
||||
role="assistant",
|
||||
)
|
||||
|
||||
async def _handle_llm_start(self, _: LLMFullResponseStartFrame):
|
||||
self._started += 1
|
||||
|
||||
@@ -814,18 +856,59 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
await self.push_aggregation()
|
||||
|
||||
async def _handle_text(self, frame: TextFrame):
|
||||
if not self._started:
|
||||
if not self._started or not frame.append_to_context:
|
||||
return
|
||||
|
||||
# Make sure we really have text (spaces count, too!)
|
||||
if len(frame.text) == 0:
|
||||
return
|
||||
|
||||
# Track whether we need to add spaces between text parts
|
||||
# Assumption: we can just keep track of the latest frame's value
|
||||
self._add_spaces = not frame.includes_inter_frame_spaces
|
||||
self._aggregation.append(
|
||||
TextPartForConcatenation(
|
||||
frame.text, includes_inter_part_spaces=frame.includes_inter_frame_spaces
|
||||
)
|
||||
)
|
||||
|
||||
self._aggregation.append(frame.text)
|
||||
async def _handle_thought_start(self, frame: LLMThoughtStartFrame):
|
||||
if not self._started:
|
||||
return
|
||||
|
||||
await self._reset_thought_aggregation()
|
||||
self._thought_aggregation_enabled = frame.append_to_context
|
||||
self._thought_llm = frame.llm
|
||||
|
||||
async def _handle_thought_text(self, frame: LLMThoughtTextFrame):
|
||||
if not self._started or not self._thought_aggregation_enabled:
|
||||
return
|
||||
|
||||
# Make sure we really have text (spaces count, too!)
|
||||
if len(frame.text) == 0:
|
||||
return
|
||||
|
||||
self._thought_aggregation.append(
|
||||
TextPartForConcatenation(
|
||||
frame.text, includes_inter_part_spaces=frame.includes_inter_frame_spaces
|
||||
)
|
||||
)
|
||||
|
||||
async def _handle_thought_end(self, frame: LLMThoughtEndFrame):
|
||||
if not self._started or not self._thought_aggregation_enabled:
|
||||
return
|
||||
|
||||
thought = concatenate_aggregated_text(self._thought_aggregation)
|
||||
llm = self._thought_llm
|
||||
await self._reset_thought_aggregation()
|
||||
|
||||
self._context.add_message(
|
||||
LLMSpecificMessage(
|
||||
llm=llm,
|
||||
message={
|
||||
"type": "thought",
|
||||
"text": thought,
|
||||
"signature": frame.signature,
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
def _context_updated_task_finished(self, task: asyncio.Task):
|
||||
self._context_updated_tasks.discard(task)
|
||||
|
||||
103
src/pipecat/processors/aggregators/llm_text_processor.py
Normal file
103
src/pipecat/processors/aggregators/llm_text_processor.py
Normal file
@@ -0,0 +1,103 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""LLM text processor module for processing and aggregating raw LLM output text.
|
||||
|
||||
This processor will convert LLMTextFrames into AggregatedTextFrames based on the
|
||||
configured text aggregator. Using the customizable aggregator, it provides
|
||||
functionality to handle or manipulate LLM text frames before they are sent to other
|
||||
components such as TTS services or context aggregators. It can be used to pre-aggregate
|
||||
and categorize, modify, or filter direct output tokens from the LLM.
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
AggregatedTextFrame,
|
||||
EndFrame,
|
||||
Frame,
|
||||
InterruptionFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
LLMTextFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.utils.text.base_text_aggregator import BaseTextAggregator
|
||||
from pipecat.utils.text.simple_text_aggregator import SimpleTextAggregator
|
||||
|
||||
|
||||
class LLMTextProcessor(FrameProcessor):
|
||||
"""A processor for handling or manipulating LLM text frames before they are processed further.
|
||||
|
||||
This processor will convert LLMTextFrames into AggregatedTextFrames based on the configured
|
||||
text aggregator. Using the customizable aggregator, it provides functionality to handle or
|
||||
manipulate LLM text frames before they are sent to other components such as TTS services or
|
||||
context aggregators. It can be used to pre-aggregate and categorize, modify, or filter direct
|
||||
output tokens from the LLM.
|
||||
"""
|
||||
|
||||
def __init__(self, *, text_aggregator: Optional[BaseTextAggregator] = None, **kwargs):
|
||||
"""Initialize the LLM text processor.
|
||||
|
||||
Args:
|
||||
text_aggregator: An optional text aggregator to use for processing LLM text frames. By
|
||||
default, a SimpleTextAggregator aggregating by sentence will be used.
|
||||
**kwargs: Additional arguments passed to parent class.
|
||||
|
||||
TODO: Allow transformations per aggregation type or all (and deprecate the TTS filters).
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
self._text_aggregator: BaseTextAggregator = text_aggregator or SimpleTextAggregator()
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process an LLMTextFrames using the aggregator to generate AggregatedTextFrames.
|
||||
|
||||
Args:
|
||||
frame: The frame to process.
|
||||
direction: The direction of frame flow in the pipeline.
|
||||
"""
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, InterruptionFrame):
|
||||
await self._handle_interruption(frame)
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, LLMTextFrame):
|
||||
await self._handle_llm_text(frame)
|
||||
elif isinstance(frame, LLMFullResponseEndFrame):
|
||||
await self._handle_llm_end(frame.skip_tts)
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, EndFrame):
|
||||
await self._handle_llm_end()
|
||||
await self.push_frame(frame, direction)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
async def _handle_interruption(self, _):
|
||||
"""Handle interruptions by resetting the text aggregator."""
|
||||
await self._text_aggregator.handle_interruption()
|
||||
|
||||
async def reset(self):
|
||||
"""Reset the internal state of the text processor and its aggregator."""
|
||||
await self._text_aggregator.reset()
|
||||
|
||||
async def _handle_llm_text(self, in_frame: LLMTextFrame):
|
||||
async for aggregation in self._text_aggregator.aggregate(in_frame.text):
|
||||
out_frame = AggregatedTextFrame(
|
||||
text=aggregation.text,
|
||||
aggregated_by=aggregation.type,
|
||||
)
|
||||
out_frame.skip_tts = in_frame.skip_tts
|
||||
await self.push_frame(out_frame)
|
||||
|
||||
async def _handle_llm_end(self, skip_tts: Optional[bool] = None):
|
||||
# Flush any remaining text
|
||||
remaining = await self._text_aggregator.flush()
|
||||
if remaining:
|
||||
out_frame = AggregatedTextFrame(
|
||||
text=remaining.text,
|
||||
aggregated_by=remaining.type,
|
||||
)
|
||||
out_frame.skip_tts = skip_tts
|
||||
await self.push_frame(out_frame)
|
||||
@@ -83,4 +83,4 @@ class ConsumerProcessor(FrameProcessor):
|
||||
while True:
|
||||
frame = await self._queue.get()
|
||||
new_frame = await self._transformer(frame)
|
||||
await self.push_frame(new_frame, self._direction)
|
||||
await self.queue_frame(new_frame, self._direction)
|
||||
|
||||
@@ -126,6 +126,4 @@ class WakeCheckFilter(FrameProcessor):
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
except Exception as e:
|
||||
error_msg = f"Error in wake word filter: {e}"
|
||||
logger.exception(error_msg)
|
||||
await self.push_error(ErrorFrame(error_msg))
|
||||
await self.push_error(error_msg=f"Error in wake word filter: {e}", exception=e)
|
||||
|
||||
@@ -10,7 +10,7 @@ from typing import Awaitable, Callable, Tuple, Type
|
||||
|
||||
from pipecat.frames.frames import Frame
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.sync.base_notifier import BaseNotifier
|
||||
from pipecat.utils.sync.base_notifier import BaseNotifier
|
||||
|
||||
|
||||
class WakeNotifierFilter(FrameProcessor):
|
||||
|
||||
@@ -12,6 +12,7 @@ management, and frame flow control mechanisms.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import traceback
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import Any, Awaitable, Callable, Coroutine, List, Optional, Sequence, Tuple, Type
|
||||
@@ -32,6 +33,7 @@ from pipecat.frames.frames import (
|
||||
InterruptionTaskFrame,
|
||||
StartFrame,
|
||||
SystemFrame,
|
||||
UninterruptibleFrame,
|
||||
)
|
||||
from pipecat.metrics.metrics import LLMTokenUsage, MetricsData
|
||||
from pipecat.observers.base_observer import BaseObserver, FrameProcessed, FramePushed
|
||||
@@ -142,6 +144,7 @@ class FrameProcessor(BaseObject):
|
||||
- on_after_process_frame: Called after a frame is processed
|
||||
- on_before_push_frame: Called before a frame is pushed
|
||||
- on_after_push_frame: Called after a frame is pushed
|
||||
- on_error: Called when an error is raised in the frame processing.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -209,6 +212,7 @@ class FrameProcessor(BaseObject):
|
||||
# The input task that handles all types of frames. It processes system
|
||||
# frames right away and queues non-system frames for later processing.
|
||||
self.__should_block_system_frames = False
|
||||
self.__input_queue = FrameProcessorQueue()
|
||||
self.__input_event: Optional[asyncio.Event] = None
|
||||
self.__input_frame_task: Optional[asyncio.Task] = None
|
||||
|
||||
@@ -218,8 +222,10 @@ class FrameProcessor(BaseObject):
|
||||
# called. To resume processing frames we need to call
|
||||
# `resume_processing_frames()` which will wake up the event.
|
||||
self.__should_block_frames = False
|
||||
self.__process_queue = asyncio.Queue()
|
||||
self.__process_event: Optional[asyncio.Event] = None
|
||||
self.__process_frame_task: Optional[asyncio.Task] = None
|
||||
self.__process_current_frame: Optional[Frame] = None
|
||||
|
||||
# To interrupt a pipeline, we push an `InterruptionTaskFrame` upstream.
|
||||
# Then we wait for the corresponding `InterruptionFrame` to travel from
|
||||
@@ -234,6 +240,7 @@ class FrameProcessor(BaseObject):
|
||||
self._register_event_handler("on_after_process_frame", sync=True)
|
||||
self._register_event_handler("on_before_push_frame", sync=True)
|
||||
self._register_event_handler("on_after_push_frame", sync=True)
|
||||
self._register_event_handler("on_error", sync=True)
|
||||
|
||||
@property
|
||||
def id(self) -> int:
|
||||
@@ -630,7 +637,43 @@ class FrameProcessor(BaseObject):
|
||||
elif isinstance(frame, (FrameProcessorResumeFrame, FrameProcessorResumeUrgentFrame)):
|
||||
await self.__resume(frame)
|
||||
|
||||
async def push_error(self, error: ErrorFrame):
|
||||
async def push_error(
|
||||
self,
|
||||
error_msg: str,
|
||||
exception: Optional[Exception] = None,
|
||||
fatal: bool = False,
|
||||
):
|
||||
"""Creates and pushes an ErrorFrame upstream.
|
||||
|
||||
Creates and pushes an ErrorFrame upstream to notify other processors in the
|
||||
pipeline about an error condition. The error frame will include context about
|
||||
which processor generated the error.
|
||||
|
||||
Args:
|
||||
error_msg: Descriptive message explaining the error condition.
|
||||
exception: Optional exception object that caused the error, if available.
|
||||
This provides additional context for debugging and error handling.
|
||||
fatal: Whether this error should be considered fatal to the pipeline.
|
||||
Fatal errors typically cause the entire pipeline to stop processing.
|
||||
Defaults to False for non-fatal errors.
|
||||
|
||||
Example::
|
||||
|
||||
```python
|
||||
# Non-fatal error
|
||||
await self.push_error("Failed to process audio chunk, skipping")
|
||||
|
||||
# Fatal error with exception context
|
||||
try:
|
||||
result = some_critical_operation()
|
||||
except Exception as e:
|
||||
await self.push_error("Critical operation failed", exception=e, fatal=True)
|
||||
```
|
||||
"""
|
||||
error_frame = ErrorFrame(error=error_msg, fatal=fatal, exception=exception, processor=self)
|
||||
await self.push_error_frame(error=error_frame)
|
||||
|
||||
async def push_error_frame(self, error: ErrorFrame):
|
||||
"""Push an error frame upstream.
|
||||
|
||||
Args:
|
||||
@@ -638,6 +681,18 @@ class FrameProcessor(BaseObject):
|
||||
"""
|
||||
if not error.processor:
|
||||
error.processor = self
|
||||
await self._call_event_handler("on_error", error)
|
||||
|
||||
if error.exception:
|
||||
tb = traceback.extract_tb(error.exception.__traceback__)
|
||||
last = tb[-1]
|
||||
error_message = (
|
||||
f"{error.processor} exception ({last.filename}:{last.lineno}): {error.error}"
|
||||
)
|
||||
else:
|
||||
error_message = f"{error.processor} error: {error.error}"
|
||||
|
||||
logger.error(error_message)
|
||||
await self.push_frame(error, FrameDirection.UPSTREAM)
|
||||
|
||||
async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
|
||||
@@ -754,13 +809,19 @@ class FrameProcessor(BaseObject):
|
||||
# interruption). Instead we just drain the queue because this is
|
||||
# an interruption.
|
||||
self.__reset_process_task()
|
||||
elif isinstance(self.__process_current_frame, UninterruptibleFrame):
|
||||
# We don't want to cancel UninterruptibleFrame, so we simply
|
||||
# cleanup the queue.
|
||||
self.__reset_process_queue()
|
||||
else:
|
||||
# Cancel and re-create the process task including the queue.
|
||||
# Cancel and re-create the process task.
|
||||
await self.__cancel_process_task()
|
||||
self.__create_process_task()
|
||||
except Exception as e:
|
||||
logger.exception(f"Uncaught exception in {self} when handling _start_interruption: {e}")
|
||||
await self.push_error(ErrorFrame(str(e)))
|
||||
await self.push_error(
|
||||
error_msg=f"Uncaught exception handling _start_interruption: {e}",
|
||||
exception=e,
|
||||
)
|
||||
|
||||
async def __internal_push_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Internal method to push frames to adjacent processors.
|
||||
@@ -797,8 +858,7 @@ class FrameProcessor(BaseObject):
|
||||
await self._observer.on_push_frame(data)
|
||||
await self._prev.queue_frame(frame, direction)
|
||||
except Exception as e:
|
||||
logger.exception(f"Uncaught exception in {self}: {e}")
|
||||
await self.push_error(ErrorFrame(str(e)))
|
||||
await self.push_error(error_msg=f"Uncaught exception: {e}", exception=e)
|
||||
|
||||
def _check_started(self, frame: Frame):
|
||||
"""Check if the processor has been started.
|
||||
@@ -820,7 +880,6 @@ class FrameProcessor(BaseObject):
|
||||
|
||||
if not self.__input_frame_task:
|
||||
self.__input_event = asyncio.Event()
|
||||
self.__input_queue = FrameProcessorQueue()
|
||||
self.__input_frame_task = self.create_task(self.__input_frame_task_handler())
|
||||
|
||||
async def __cancel_input_task(self):
|
||||
@@ -838,9 +897,7 @@ class FrameProcessor(BaseObject):
|
||||
return
|
||||
|
||||
if not self.__process_frame_task:
|
||||
self.__should_block_frames = False
|
||||
self.__process_event = asyncio.Event()
|
||||
self.__process_queue = asyncio.Queue()
|
||||
self.__reset_process_task()
|
||||
self.__process_frame_task = self.create_task(self.__process_frame_task_handler())
|
||||
|
||||
def __reset_process_task(self):
|
||||
@@ -850,10 +907,26 @@ class FrameProcessor(BaseObject):
|
||||
|
||||
self.__should_block_frames = False
|
||||
self.__process_event = asyncio.Event()
|
||||
self.__reset_process_queue()
|
||||
|
||||
def __reset_process_queue(self):
|
||||
"""Reset non-system frame processing queue."""
|
||||
# Create a new queue to insert UninterruptibleFrame frames.
|
||||
new_queue = asyncio.Queue()
|
||||
|
||||
# Process current queue and keep UninterruptibleFrame frames.
|
||||
while not self.__process_queue.empty():
|
||||
self.__process_queue.get_nowait()
|
||||
item = self.__process_queue.get_nowait()
|
||||
if isinstance(item, UninterruptibleFrame):
|
||||
new_queue.put_nowait(item)
|
||||
self.__process_queue.task_done()
|
||||
|
||||
# Put back UninterruptibleFrame frames into our process queue.
|
||||
while not new_queue.empty():
|
||||
item = new_queue.get_nowait()
|
||||
self.__process_queue.put_nowait(item)
|
||||
new_queue.task_done()
|
||||
|
||||
async def __cancel_process_task(self):
|
||||
"""Cancel the non-system frame processing task."""
|
||||
if self.__process_frame_task:
|
||||
@@ -874,8 +947,7 @@ class FrameProcessor(BaseObject):
|
||||
|
||||
await self._call_event_handler("on_after_process_frame", frame)
|
||||
except Exception as e:
|
||||
logger.exception(f"{self}: error processing frame: {e}")
|
||||
await self.push_error(ErrorFrame(str(e)))
|
||||
await self.push_error(error_msg=f"Error processing frame: {e}", exception=e)
|
||||
|
||||
async def __input_frame_task_handler(self):
|
||||
"""Handle frames from the input queue.
|
||||
@@ -908,8 +980,12 @@ class FrameProcessor(BaseObject):
|
||||
async def __process_frame_task_handler(self):
|
||||
"""Handle non-system frames from the process queue."""
|
||||
while True:
|
||||
self.__process_current_frame = None
|
||||
|
||||
(frame, direction, callback) = await self.__process_queue.get()
|
||||
|
||||
self.__process_current_frame = frame
|
||||
|
||||
if self.__should_block_frames and self.__process_event:
|
||||
logger.trace(f"{self}: frame processing paused")
|
||||
await self.__process_event.wait()
|
||||
|
||||
@@ -24,7 +24,7 @@ try:
|
||||
from langchain_core.messages import AIMessageChunk
|
||||
from langchain_core.runnables import Runnable
|
||||
except ModuleNotFoundError as e:
|
||||
logger.exception("In order to use Langchain, you need to `pip install pipecat-ai[langchain]`. ")
|
||||
logger.error("In order to use Langchain, you need to `pip install pipecat-ai[langchain]`. ")
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
@@ -113,6 +113,6 @@ class LangchainProcessor(FrameProcessor):
|
||||
except GeneratorExit:
|
||||
logger.warning(f"{self} generator was closed prematurely")
|
||||
except Exception as e:
|
||||
logger.exception(f"{self} an unknown error occurred: {e}")
|
||||
await self.push_error(error_msg=f"Unknown error occurred: {e}", exception=e)
|
||||
finally:
|
||||
await self.push_frame(LLMFullResponseEndFrame())
|
||||
|
||||
@@ -24,14 +24,18 @@ from typing import (
|
||||
Literal,
|
||||
Mapping,
|
||||
Optional,
|
||||
Tuple,
|
||||
Union,
|
||||
)
|
||||
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel, Field, PrivateAttr, ValidationError
|
||||
|
||||
from pipecat import version as pipecat_version
|
||||
from pipecat.audio.utils import calculate_audio_volume
|
||||
from pipecat.frames.frames import (
|
||||
AggregatedTextFrame,
|
||||
AggregationType,
|
||||
BotStartedSpeakingFrame,
|
||||
BotStoppedSpeakingFrame,
|
||||
CancelFrame,
|
||||
@@ -82,7 +86,7 @@ from pipecat.transports.base_output import BaseOutputTransport
|
||||
from pipecat.transports.base_transport import BaseTransport
|
||||
from pipecat.utils.string import match_endofsentence
|
||||
|
||||
RTVI_PROTOCOL_VERSION = "1.0.0"
|
||||
RTVI_PROTOCOL_VERSION = "1.1.0"
|
||||
|
||||
RTVI_MESSAGE_LABEL = "rtvi-ai"
|
||||
RTVIMessageLiteral = Literal["rtvi-ai"]
|
||||
@@ -704,6 +708,29 @@ class RTVITextMessageData(BaseModel):
|
||||
text: str
|
||||
|
||||
|
||||
class RTVIBotOutputMessageData(RTVITextMessageData):
|
||||
"""Data for bot output RTVI messages.
|
||||
|
||||
Extends RTVITextMessageData to include metadata about the output.
|
||||
"""
|
||||
|
||||
spoken: bool = False # Indicates if the text has been spoken by TTS
|
||||
aggregated_by: AggregationType | str
|
||||
# Indicates what form the text is in (e.g., by word, sentence, etc.)
|
||||
|
||||
|
||||
class RTVIBotOutputMessage(BaseModel):
|
||||
"""Message containing bot output text.
|
||||
|
||||
An event meant to holistically represent what the bot is outputting,
|
||||
along with metadata about the output and if it has been spoken.
|
||||
"""
|
||||
|
||||
label: RTVIMessageLiteral = RTVI_MESSAGE_LABEL
|
||||
type: Literal["bot-output"] = "bot-output"
|
||||
data: RTVIBotOutputMessageData
|
||||
|
||||
|
||||
class RTVIBotTranscriptionMessage(BaseModel):
|
||||
"""Message containing bot transcription text.
|
||||
|
||||
@@ -896,6 +923,7 @@ class RTVIObserverParams:
|
||||
Parameter `errors_enabled` is deprecated. Error messages are always enabled.
|
||||
|
||||
Parameters:
|
||||
bot_output_enabled: Indicates if bot output messages should be sent.
|
||||
bot_llm_enabled: Indicates if the bot's LLM messages should be sent.
|
||||
bot_tts_enabled: Indicates if the bot's TTS messages should be sent.
|
||||
bot_speaking_enabled: Indicates if the bot's started/stopped speaking messages should be sent.
|
||||
@@ -907,9 +935,17 @@ class RTVIObserverParams:
|
||||
metrics_enabled: Indicates if metrics messages should be sent.
|
||||
system_logs_enabled: Indicates if system logs should be sent.
|
||||
errors_enabled: [Deprecated] Indicates if errors messages should be sent.
|
||||
skip_aggregator_types: List of aggregation types to skip sending as tts/output messages.
|
||||
Note: if using this to avoid sending secure information, be sure to also disable
|
||||
bot_llm_enabled to avoid leaking through LLM messages.
|
||||
bot_output_transforms: A list of callables to transform text before just before sending it
|
||||
to TTS. Each callable takes the aggregated text and its type, and returns the
|
||||
transformed text. To register, provide a list of tuples of
|
||||
(aggregation_type | '*', transform_function).
|
||||
audio_level_period_secs: How often audio levels should be sent if enabled.
|
||||
"""
|
||||
|
||||
bot_output_enabled: bool = True
|
||||
bot_llm_enabled: bool = True
|
||||
bot_tts_enabled: bool = True
|
||||
bot_speaking_enabled: bool = True
|
||||
@@ -921,6 +957,15 @@ class RTVIObserverParams:
|
||||
metrics_enabled: bool = True
|
||||
system_logs_enabled: bool = False
|
||||
errors_enabled: Optional[bool] = None
|
||||
skip_aggregator_types: Optional[List[AggregationType | str]] = None
|
||||
bot_output_transforms: Optional[
|
||||
List[
|
||||
Tuple[
|
||||
AggregationType | str,
|
||||
Callable[[str, AggregationType | str], Awaitable[str]],
|
||||
]
|
||||
]
|
||||
] = None
|
||||
audio_level_period_secs: float = 0.15
|
||||
|
||||
|
||||
@@ -973,8 +1018,45 @@ class RTVIObserver(BaseObserver):
|
||||
DeprecationWarning,
|
||||
)
|
||||
|
||||
self._aggregation_transforms: List[
|
||||
Tuple[AggregationType | str, Callable[[str, AggregationType | str], Awaitable[str]]]
|
||||
] = self._params.bot_output_transforms or []
|
||||
|
||||
def add_bot_output_transformer(
|
||||
self,
|
||||
transform_function: Callable[[str, AggregationType | str], Awaitable[str]],
|
||||
aggregation_type: AggregationType | str = "*",
|
||||
):
|
||||
"""Transform text for a specific aggregation type before sending as Bot Output or TTS.
|
||||
|
||||
Args:
|
||||
transform_function: The function to apply for transformation. This function should take
|
||||
the text and aggregation type as input and return the transformed text.
|
||||
Ex.: async def my_transform(text: str, aggregation_type: str) -> str:
|
||||
aggregation_type: The type of aggregation to transform. This value defaults to "*" to
|
||||
handle all text before sending to the client.
|
||||
"""
|
||||
self._aggregation_transforms.append((aggregation_type, transform_function))
|
||||
|
||||
def remove_bot_output_transformer(
|
||||
self,
|
||||
transform_function: Callable[[str, AggregationType | str], Awaitable[str]],
|
||||
aggregation_type: AggregationType | str = "*",
|
||||
):
|
||||
"""Remove a text transformer for a specific aggregation type.
|
||||
|
||||
Args:
|
||||
transform_function: The function to remove.
|
||||
aggregation_type: The type of aggregation to remove the transformer for.
|
||||
"""
|
||||
self._aggregation_transforms = [
|
||||
(agg_type, func)
|
||||
for agg_type, func in self._aggregation_transforms
|
||||
if not (agg_type == aggregation_type and func == transform_function)
|
||||
]
|
||||
|
||||
async def _logger_sink(self, message):
|
||||
"""Logger sink so we cna send system logs to RTVI clients."""
|
||||
"""Logger sink so we can send system logs to RTVI clients."""
|
||||
message = RTVISystemLogMessage(data=RTVITextMessageData(text=message))
|
||||
await self.send_rtvi_message(message)
|
||||
|
||||
@@ -1048,12 +1130,15 @@ class RTVIObserver(BaseObserver):
|
||||
await self.send_rtvi_message(RTVIBotTTSStartedMessage())
|
||||
elif isinstance(frame, TTSStoppedFrame) and self._params.bot_tts_enabled:
|
||||
await self.send_rtvi_message(RTVIBotTTSStoppedMessage())
|
||||
elif isinstance(frame, TTSTextFrame) and self._params.bot_tts_enabled:
|
||||
if isinstance(src, BaseOutputTransport):
|
||||
message = RTVIBotTTSTextMessage(data=RTVITextMessageData(text=frame.text))
|
||||
await self.send_rtvi_message(message)
|
||||
else:
|
||||
elif isinstance(frame, AggregatedTextFrame) and (
|
||||
self._params.bot_output_enabled or self._params.bot_tts_enabled
|
||||
):
|
||||
if isinstance(frame, TTSTextFrame) and not isinstance(src, BaseOutputTransport):
|
||||
# This check is to make sure we handle the frame when it has gone
|
||||
# through the transport and has correct timing.
|
||||
mark_as_seen = False
|
||||
else:
|
||||
await self._handle_aggregated_llm_text(frame)
|
||||
elif isinstance(frame, MetricsFrame) and self._params.metrics_enabled:
|
||||
await self._handle_metrics(frame)
|
||||
elif isinstance(frame, RTVIServerMessageFrame):
|
||||
@@ -1084,15 +1169,6 @@ class RTVIObserver(BaseObserver):
|
||||
if mark_as_seen:
|
||||
self._frames_seen.add(frame.id)
|
||||
|
||||
async def _push_bot_transcription(self):
|
||||
"""Push accumulated bot transcription as a message."""
|
||||
if len(self._bot_transcription) > 0:
|
||||
message = RTVIBotTranscriptionMessage(
|
||||
data=RTVITextMessageData(text=self._bot_transcription)
|
||||
)
|
||||
await self.send_rtvi_message(message)
|
||||
self._bot_transcription = ""
|
||||
|
||||
async def _handle_interruptions(self, frame: Frame):
|
||||
"""Handle user speaking interruption frames."""
|
||||
message = None
|
||||
@@ -1115,14 +1191,45 @@ class RTVIObserver(BaseObserver):
|
||||
if message:
|
||||
await self.send_rtvi_message(message)
|
||||
|
||||
async def _handle_aggregated_llm_text(self, frame: AggregatedTextFrame):
|
||||
"""Handle aggregated LLM text output frames."""
|
||||
# Skip certain aggregator types if configured to do so.
|
||||
if (
|
||||
self._params.skip_aggregator_types
|
||||
and frame.aggregated_by in self._params.skip_aggregator_types
|
||||
):
|
||||
return
|
||||
|
||||
text = frame.text
|
||||
type = frame.aggregated_by
|
||||
for aggregation_type, transform in self._aggregation_transforms:
|
||||
if aggregation_type == type or aggregation_type == "*":
|
||||
text = await transform(text, type)
|
||||
|
||||
isTTS = isinstance(frame, TTSTextFrame)
|
||||
if self._params.bot_output_enabled:
|
||||
message = RTVIBotOutputMessage(
|
||||
data=RTVIBotOutputMessageData(text=text, spoken=isTTS, aggregated_by=type)
|
||||
)
|
||||
await self.send_rtvi_message(message)
|
||||
|
||||
if isTTS and self._params.bot_tts_enabled:
|
||||
tts_message = RTVIBotTTSTextMessage(data=RTVITextMessageData(text=text))
|
||||
await self.send_rtvi_message(tts_message)
|
||||
|
||||
async def _handle_llm_text_frame(self, frame: LLMTextFrame):
|
||||
"""Handle LLM text output frames."""
|
||||
message = RTVIBotLLMTextMessage(data=RTVITextMessageData(text=frame.text))
|
||||
await self.send_rtvi_message(message)
|
||||
|
||||
# TODO (mrkb): Remove all this logic when we fully deprecate bot-transcription messages.
|
||||
self._bot_transcription += frame.text
|
||||
if match_endofsentence(self._bot_transcription):
|
||||
await self._push_bot_transcription()
|
||||
|
||||
if match_endofsentence(self._bot_transcription) and len(self._bot_transcription) > 0:
|
||||
await self.send_rtvi_message(
|
||||
RTVIBotTranscriptionMessage(data=RTVITextMessageData(text=self._bot_transcription))
|
||||
)
|
||||
self._bot_transcription = ""
|
||||
|
||||
async def _handle_user_transcriptions(self, frame: Frame):
|
||||
"""Handle user transcription frames."""
|
||||
@@ -1248,7 +1355,7 @@ class RTVIProcessor(FrameProcessor):
|
||||
# Default to 0.3.0 which is the last version before actually having a
|
||||
# "client-version".
|
||||
self._client_version = [0, 3, 0]
|
||||
self._skip_tts: bool = False # Keep in sync with llm_service.py
|
||||
self._llm_skip_tts: bool = False # Keep in sync with llm_service.py's configuration.
|
||||
|
||||
self._registered_actions: Dict[str, RTVIAction] = {}
|
||||
self._registered_services: Dict[str, RTVIService] = {}
|
||||
@@ -1311,15 +1418,20 @@ class RTVIProcessor(FrameProcessor):
|
||||
self._client_ready = True
|
||||
await self._call_event_handler("on_client_ready")
|
||||
|
||||
async def set_bot_ready(self):
|
||||
"""Mark the bot as ready and send the bot-ready message."""
|
||||
async def set_bot_ready(self, about: Mapping[str, Any] = None):
|
||||
"""Mark the bot as ready and send the bot-ready message.
|
||||
|
||||
Args:
|
||||
about: Optional information about the bot to include in the ready message.
|
||||
If left as None, the Pipecat library and version will be used.
|
||||
"""
|
||||
self._bot_ready = True
|
||||
# Only call the (deprecated) _update_config method if the we're using a
|
||||
# config (which is deprecated). Otherwise we'd always print an
|
||||
# unnecessary deprecation warning.
|
||||
if self._config.config:
|
||||
await self._update_config(self._config, False)
|
||||
await self._send_bot_ready()
|
||||
await self._send_bot_ready(about=about)
|
||||
|
||||
async def interrupt_bot(self):
|
||||
"""Send a bot interruption frame upstream."""
|
||||
@@ -1441,7 +1553,7 @@ class RTVIProcessor(FrameProcessor):
|
||||
elif isinstance(frame, RTVIActionFrame):
|
||||
await self._action_queue.put(frame)
|
||||
elif isinstance(frame, LLMConfigureOutputFrame):
|
||||
self._skip_tts = frame.skip_tts
|
||||
self._llm_skip_tts = frame.skip_tts
|
||||
await self.push_frame(frame, direction)
|
||||
# Other frames
|
||||
else:
|
||||
@@ -1697,9 +1809,9 @@ class RTVIProcessor(FrameProcessor):
|
||||
opts = data.options if data.options is not None else RTVISendTextOptions()
|
||||
if opts.run_immediately:
|
||||
await self.interrupt_bot()
|
||||
cur_skip_tts = self._skip_tts
|
||||
cur_llm_skip_tts = self._llm_skip_tts
|
||||
should_skip_tts = not opts.audio_response
|
||||
toggle_skip_tts = cur_skip_tts != should_skip_tts
|
||||
toggle_skip_tts = cur_llm_skip_tts != should_skip_tts
|
||||
if toggle_skip_tts:
|
||||
output_frame = LLMConfigureOutputFrame(skip_tts=should_skip_tts)
|
||||
await self.push_frame(output_frame)
|
||||
@@ -1709,7 +1821,7 @@ class RTVIProcessor(FrameProcessor):
|
||||
)
|
||||
await self.push_frame(text_frame)
|
||||
if toggle_skip_tts:
|
||||
output_frame = LLMConfigureOutputFrame(skip_tts=cur_skip_tts)
|
||||
output_frame = LLMConfigureOutputFrame(skip_tts=cur_llm_skip_tts)
|
||||
await self.push_frame(output_frame)
|
||||
|
||||
async def _handle_update_context(self, data: RTVIAppendToContextData):
|
||||
@@ -1767,14 +1879,21 @@ class RTVIProcessor(FrameProcessor):
|
||||
message = RTVIActionResponse(id=request_id, data=RTVIActionResponseData(result=result))
|
||||
await self.push_transport_message(message)
|
||||
|
||||
async def _send_bot_ready(self):
|
||||
"""Send the bot-ready message to the client."""
|
||||
async def _send_bot_ready(self, about: Mapping[str, Any] = None):
|
||||
"""Send the bot-ready message to the client.
|
||||
|
||||
Args:
|
||||
about: Optional information about the bot to include in the ready message.
|
||||
If left as None, the pipecat library and version will be used.
|
||||
"""
|
||||
config = None
|
||||
if self._client_version and self._client_version[0] < 1:
|
||||
config = self._config.config
|
||||
if not about:
|
||||
about = {"library": "pipecat-ai", "library_version": f"{pipecat_version()}"}
|
||||
message = RTVIBotReady(
|
||||
id=self._client_ready_id,
|
||||
data=RTVIBotReadyData(version=RTVI_PROTOCOL_VERSION, config=config),
|
||||
data=RTVIBotReadyData(version=RTVI_PROTOCOL_VERSION, about=about, config=config),
|
||||
)
|
||||
await self.push_transport_message(message)
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ try:
|
||||
from strands import Agent
|
||||
from strands.multiagent.graph import Graph
|
||||
except ModuleNotFoundError as e:
|
||||
logger.exception("In order to use Strands Agents, you need to `pip install strands-agents`.")
|
||||
logger.error("In order to use Strands Agents, you need to `pip install strands-agents`.")
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
@@ -143,7 +143,7 @@ class StrandsAgentsProcessor(FrameProcessor):
|
||||
except GeneratorExit:
|
||||
logger.warning(f"{self} generator was closed prematurely")
|
||||
except Exception as e:
|
||||
logger.exception(f"{self} an unknown error occurred: {e}")
|
||||
await self.push_error(error_msg=f"Unknown error occurred: {e}", exception=e)
|
||||
finally:
|
||||
if ttfb_tracking:
|
||||
await self.stop_ttfb_metrics()
|
||||
|
||||
@@ -20,13 +20,17 @@ from pipecat.frames.frames import (
|
||||
EndFrame,
|
||||
Frame,
|
||||
InterruptionFrame,
|
||||
LLMThoughtEndFrame,
|
||||
LLMThoughtStartFrame,
|
||||
LLMThoughtTextFrame,
|
||||
ThoughtTranscriptionMessage,
|
||||
TranscriptionFrame,
|
||||
TranscriptionMessage,
|
||||
TranscriptionUpdateFrame,
|
||||
TTSTextFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.utils.string import concatenate_aggregated_text
|
||||
from pipecat.utils.string import TextPartForConcatenation, concatenate_aggregated_text
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
|
||||
|
||||
@@ -81,98 +85,98 @@ class UserTranscriptProcessor(BaseTranscriptProcessor):
|
||||
|
||||
|
||||
class AssistantTranscriptProcessor(BaseTranscriptProcessor):
|
||||
"""Processes assistant TTS text frames into timestamped conversation messages.
|
||||
"""Processes assistant TTS text frames and LLM thought frames into timestamped messages.
|
||||
|
||||
This processor aggregates TTS text frames into complete utterances and emits them as
|
||||
transcript messages. Utterances are completed when:
|
||||
This processor aggregates both TTS text frames and LLM thought frames into
|
||||
complete utterances and thoughts, emitting them as transcript messages.
|
||||
|
||||
An assistant utterance is completed when:
|
||||
- The bot stops speaking (BotStoppedSpeakingFrame)
|
||||
- The bot is interrupted (InterruptionFrame)
|
||||
- The pipeline ends (EndFrame)
|
||||
- The pipeline ends (EndFrame, CancelFrame)
|
||||
|
||||
A thought is completed when:
|
||||
- The thought ends (LLMThoughtEndFrame)
|
||||
- The bot is interrupted (InterruptionFrame)
|
||||
- The pipeline ends (EndFrame, CancelFrame)
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
def __init__(self, *, process_thoughts: bool = False, **kwargs):
|
||||
"""Initialize processor with aggregation state.
|
||||
|
||||
Args:
|
||||
process_thoughts: Whether to process LLM thought frames. Defaults to False.
|
||||
**kwargs: Additional arguments passed to parent class.
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
self._current_text_parts: List[str] = []
|
||||
self._aggregation_start_time: Optional[str] = None
|
||||
|
||||
# Whether to add spaces between text parts.
|
||||
# (The use of this could be expanded to the UserTranscriptProcessor in
|
||||
# the future if needed; currently the UserTranscriptProcessor assumes
|
||||
# that user transcription frames do not need aggregation).
|
||||
self._add_spaces = True
|
||||
self._process_thoughts = process_thoughts
|
||||
self._current_assistant_text_parts: List[TextPartForConcatenation] = []
|
||||
self._assistant_text_start_time: Optional[str] = None
|
||||
|
||||
async def _emit_aggregated_text(self):
|
||||
self._current_thought_parts: List[TextPartForConcatenation] = []
|
||||
self._thought_start_time: Optional[str] = None
|
||||
self._thought_active = False
|
||||
|
||||
async def _emit_aggregated_assistant_text(self):
|
||||
"""Aggregates and emits text fragments as a transcript message.
|
||||
|
||||
This method uses a heuristic to automatically detect whether text fragments
|
||||
contain embedded spacing (spaces at the beginning or end of fragments) or not,
|
||||
and applies the appropriate joining strategy. It handles fragments from different
|
||||
TTS services with different formatting patterns.
|
||||
|
||||
Examples:
|
||||
Fragments with embedded spacing (concatenated)::
|
||||
|
||||
TTSTextFrame: ["Hello"]
|
||||
TTSTextFrame: [" there"] # Leading space
|
||||
TTSTextFrame: ["!"]
|
||||
TTSTextFrame: [" How"] # Leading space
|
||||
TTSTextFrame: ["'s"]
|
||||
TTSTextFrame: [" it"] # Leading space
|
||||
|
||||
Result: "Hello there! How's it"
|
||||
|
||||
Fragments with trailing spaces (concatenated)::
|
||||
|
||||
TTSTextFrame: ["Hel"]
|
||||
TTSTextFrame: ["lo "] # Trailing space
|
||||
TTSTextFrame: ["to "] # Trailing space
|
||||
TTSTextFrame: ["you"]
|
||||
|
||||
Result: "Hello to you"
|
||||
|
||||
Word-by-word fragments without spacing (joined with spaces)::
|
||||
|
||||
TTSTextFrame: ["Hello"]
|
||||
TTSTextFrame: ["there"]
|
||||
TTSTextFrame: ["how"]
|
||||
TTSTextFrame: ["are"]
|
||||
TTSTextFrame: ["you"]
|
||||
|
||||
Result: "Hello there how are you"
|
||||
This method aggregates text fragments that may arrive in multiple
|
||||
TTSTextFrame instances and emits them as a single TranscriptionMessage.
|
||||
"""
|
||||
if self._current_text_parts and self._aggregation_start_time:
|
||||
content = concatenate_aggregated_text(self._current_text_parts, self._add_spaces)
|
||||
if self._current_assistant_text_parts and self._assistant_text_start_time:
|
||||
content = concatenate_aggregated_text(self._current_assistant_text_parts)
|
||||
if content:
|
||||
logger.trace(f"Emitting aggregated assistant message: {content}")
|
||||
message = TranscriptionMessage(
|
||||
role="assistant",
|
||||
content=content,
|
||||
timestamp=self._aggregation_start_time,
|
||||
timestamp=self._assistant_text_start_time,
|
||||
)
|
||||
await self._emit_update([message])
|
||||
else:
|
||||
logger.trace("No content to emit after stripping whitespace")
|
||||
|
||||
# Reset aggregation state
|
||||
self._current_text_parts = []
|
||||
self._aggregation_start_time = None
|
||||
self._current_assistant_text_parts = []
|
||||
self._assistant_text_start_time = None
|
||||
|
||||
async def _emit_aggregated_thought(self):
|
||||
"""Aggregates and emits thought text fragments as a thought transcript message.
|
||||
|
||||
This method aggregates thought fragments that may arrive in multiple
|
||||
LLMThoughtTextFrame instances and emits them as a single ThoughtTranscriptionMessage.
|
||||
"""
|
||||
if self._current_thought_parts and self._thought_start_time:
|
||||
content = concatenate_aggregated_text(self._current_thought_parts)
|
||||
if content:
|
||||
logger.trace(f"Emitting aggregated thought message: {content}")
|
||||
message = ThoughtTranscriptionMessage(
|
||||
content=content,
|
||||
timestamp=self._thought_start_time,
|
||||
)
|
||||
await self._emit_update([message])
|
||||
else:
|
||||
logger.trace("No thought content to emit after stripping whitespace")
|
||||
|
||||
# Reset aggregation state
|
||||
self._current_thought_parts = []
|
||||
self._thought_start_time = None
|
||||
self._thought_active = False
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process frames into assistant conversation messages.
|
||||
"""Process frames into assistant conversation messages and thought messages.
|
||||
|
||||
Handles different frame types:
|
||||
|
||||
- TTSTextFrame: Aggregates text for current utterance
|
||||
- LLMThoughtStartFrame: Begins aggregating a new thought
|
||||
- LLMThoughtTextFrame: Aggregates text for current thought
|
||||
- LLMThoughtEndFrame: Completes current thought
|
||||
- BotStoppedSpeakingFrame: Completes current utterance
|
||||
- InterruptionFrame: Completes current utterance due to interruption
|
||||
- EndFrame: Completes current utterance at pipeline end
|
||||
- CancelFrame: Completes current utterance due to cancellation
|
||||
- InterruptionFrame: Completes current utterance and thought due to interruption
|
||||
- EndFrame: Completes current utterance and thought at pipeline end
|
||||
- CancelFrame: Completes current utterance and thought due to cancellation
|
||||
|
||||
Args:
|
||||
frame: Input frame to process.
|
||||
@@ -184,24 +188,53 @@ class AssistantTranscriptProcessor(BaseTranscriptProcessor):
|
||||
# Push frame first otherwise our emitted transcription update frame
|
||||
# might get cleaned up.
|
||||
await self.push_frame(frame, direction)
|
||||
# Emit accumulated text with interruptions
|
||||
await self._emit_aggregated_text()
|
||||
# Emit accumulated text and thought with interruptions
|
||||
await self._emit_aggregated_assistant_text()
|
||||
if self._process_thoughts and self._thought_active:
|
||||
await self._emit_aggregated_thought()
|
||||
elif isinstance(frame, LLMThoughtStartFrame):
|
||||
# Start a new thought
|
||||
if self._process_thoughts:
|
||||
self._thought_active = True
|
||||
self._thought_start_time = time_now_iso8601()
|
||||
self._current_thought_parts = []
|
||||
# Push frame.
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, LLMThoughtTextFrame):
|
||||
# Aggregate thought text if we have an active thought
|
||||
if self._process_thoughts and self._thought_active:
|
||||
self._current_thought_parts.append(
|
||||
TextPartForConcatenation(
|
||||
frame.text, includes_inter_part_spaces=frame.includes_inter_frame_spaces
|
||||
)
|
||||
)
|
||||
# Push frame.
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, LLMThoughtEndFrame):
|
||||
# Emit accumulated thought when thought ends
|
||||
if self._process_thoughts and self._thought_active:
|
||||
await self._emit_aggregated_thought()
|
||||
# Push frame.
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, TTSTextFrame):
|
||||
# Start timestamp on first text part
|
||||
if not self._aggregation_start_time:
|
||||
self._aggregation_start_time = time_now_iso8601()
|
||||
if not self._assistant_text_start_time:
|
||||
self._assistant_text_start_time = time_now_iso8601()
|
||||
|
||||
# Track whether we need to add spaces between text parts
|
||||
# Assumption: we can just keep track of the latest frame's value
|
||||
self._add_spaces = not frame.includes_inter_frame_spaces
|
||||
|
||||
self._current_text_parts.append(frame.text)
|
||||
self._current_assistant_text_parts.append(
|
||||
TextPartForConcatenation(
|
||||
frame.text, includes_inter_part_spaces=frame.includes_inter_frame_spaces
|
||||
)
|
||||
)
|
||||
|
||||
# Push frame.
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, (BotStoppedSpeakingFrame, EndFrame)):
|
||||
# Emit accumulated text when bot finishes speaking or pipeline ends.
|
||||
await self._emit_aggregated_text()
|
||||
await self._emit_aggregated_assistant_text()
|
||||
# Emit accumulated thought at pipeline end if still active
|
||||
if isinstance(frame, EndFrame) and self._process_thoughts and self._thought_active:
|
||||
await self._emit_aggregated_thought()
|
||||
# Push frame.
|
||||
await self.push_frame(frame, direction)
|
||||
else:
|
||||
@@ -212,7 +245,8 @@ class TranscriptProcessor:
|
||||
"""Factory for creating and managing transcript processors.
|
||||
|
||||
Provides unified access to user and assistant transcript processors
|
||||
with shared event handling.
|
||||
with shared event handling. The assistant processor handles both TTS text
|
||||
and LLM thought frames.
|
||||
|
||||
Example::
|
||||
|
||||
@@ -227,7 +261,7 @@ class TranscriptProcessor:
|
||||
llm,
|
||||
tts,
|
||||
transport.output(),
|
||||
transcript.assistant_tts(), # Assistant transcripts
|
||||
transcript.assistant(), # Assistant transcripts (including thoughts)
|
||||
context_aggregator.assistant(),
|
||||
]
|
||||
)
|
||||
@@ -237,8 +271,14 @@ class TranscriptProcessor:
|
||||
print(f"New messages: {frame.messages}")
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize factory."""
|
||||
def __init__(self, *, process_thoughts: bool = False):
|
||||
"""Initialize factory.
|
||||
|
||||
Args:
|
||||
process_thoughts: Whether the assistant processor should handle LLM thought
|
||||
frames. Defaults to False.
|
||||
"""
|
||||
self._process_thoughts = process_thoughts
|
||||
self._user_processor = None
|
||||
self._assistant_processor = None
|
||||
self._event_handlers = {}
|
||||
@@ -273,7 +313,9 @@ class TranscriptProcessor:
|
||||
The assistant transcript processor instance.
|
||||
"""
|
||||
if self._assistant_processor is None:
|
||||
self._assistant_processor = AssistantTranscriptProcessor(**kwargs)
|
||||
self._assistant_processor = AssistantTranscriptProcessor(
|
||||
process_thoughts=self._process_thoughts, **kwargs
|
||||
)
|
||||
# Apply any registered event handlers
|
||||
for event_name, handler in self._event_handlers.items():
|
||||
|
||||
|
||||
@@ -171,6 +171,7 @@ def _create_server_app(
|
||||
esp32_mode: bool = False,
|
||||
whatsapp_enabled: bool = False,
|
||||
folder: Optional[str] = None,
|
||||
dialin_enabled: bool = False,
|
||||
):
|
||||
"""Create FastAPI app with transport-specific routes."""
|
||||
app = FastAPI()
|
||||
@@ -189,7 +190,7 @@ def _create_server_app(
|
||||
if whatsapp_enabled:
|
||||
_setup_whatsapp_routes(app)
|
||||
elif transport_type == "daily":
|
||||
_setup_daily_routes(app)
|
||||
_setup_daily_routes(app, dialin_enabled=dialin_enabled)
|
||||
elif transport_type in TELEPHONY_TRANSPORTS:
|
||||
_setup_telephony_routes(app, transport_type=transport_type, proxy=proxy)
|
||||
else:
|
||||
@@ -264,7 +265,10 @@ def _setup_webrtc_routes(
|
||||
# Prepare runner arguments with the callback to run your bot
|
||||
async def webrtc_connection_callback(connection):
|
||||
bot_module = _get_bot_module()
|
||||
runner_args = SmallWebRTCRunnerArguments(webrtc_connection=connection)
|
||||
|
||||
runner_args = SmallWebRTCRunnerArguments(
|
||||
webrtc_connection=connection, body=request.request_data
|
||||
)
|
||||
background_tasks.add_task(bot_module.bot, runner_args)
|
||||
|
||||
# Delegate handling to SmallWebRTCRequestHandler
|
||||
@@ -299,7 +303,7 @@ def _setup_webrtc_routes(
|
||||
result: StartBotResult = {"sessionId": session_id}
|
||||
if request_data.get("enableDefaultIceServers"):
|
||||
result["iceConfig"] = IceConfig(
|
||||
iceServers=[IceServer(urls="stun:stun.l.google.com:19302")]
|
||||
iceServers=[IceServer(urls=["stun:stun.l.google.com:19302"])]
|
||||
)
|
||||
|
||||
return result
|
||||
@@ -326,7 +330,8 @@ def _setup_webrtc_routes(
|
||||
type=request_data["type"],
|
||||
pc_id=request_data.get("pc_id"),
|
||||
restart_pc=request_data.get("restart_pc"),
|
||||
request_data=request_data,
|
||||
request_data=request_data.get("request_data")
|
||||
or request_data.get("requestData"),
|
||||
)
|
||||
return await offer(webrtc_request, background_tasks)
|
||||
elif request.method == HTTPMethod.PATCH.value:
|
||||
@@ -529,8 +534,13 @@ def _setup_whatsapp_routes(app: FastAPI):
|
||||
_add_lifespan_to_app(app, whatsapp_lifespan)
|
||||
|
||||
|
||||
def _setup_daily_routes(app: FastAPI):
|
||||
"""Set up Daily-specific routes."""
|
||||
def _setup_daily_routes(app: FastAPI, dialin_enabled: bool = False):
|
||||
"""Set up Daily-specific routes.
|
||||
|
||||
Args:
|
||||
app: FastAPI application instance
|
||||
dialin_enabled: If True, adds /daily-dialin-webhook endpoint for PSTN dial-in handling
|
||||
"""
|
||||
|
||||
@app.get("/")
|
||||
async def create_room_and_start_agent():
|
||||
@@ -635,6 +645,116 @@ def _setup_daily_routes(app: FastAPI):
|
||||
|
||||
return result
|
||||
|
||||
if dialin_enabled:
|
||||
|
||||
@app.post("/daily-dialin-webhook")
|
||||
async def handle_dialin_webhook(request: Request):
|
||||
"""Handle incoming Daily PSTN dial-in webhook.
|
||||
|
||||
This endpoint mimics Pipecat Cloud's dial-in webhook handler.
|
||||
It receives Daily webhook data, creates a SIP-enabled room, and starts the bot.
|
||||
|
||||
Expected webhook payload::
|
||||
|
||||
{
|
||||
"From": "+15551234567",
|
||||
"To": "+15559876543",
|
||||
"callId": "uuid-call-id",
|
||||
"callDomain": "uuid-call-domain",
|
||||
"sipHeaders": {...} // optional
|
||||
}
|
||||
|
||||
Returns::
|
||||
|
||||
{
|
||||
"dailyRoom": "https://...",
|
||||
"dailyToken": "...",
|
||||
"sessionId": "uuid"
|
||||
}
|
||||
"""
|
||||
logger.debug("Received Daily dial-in webhook")
|
||||
|
||||
try:
|
||||
data = await request.json()
|
||||
logger.debug(f"Webhook data: {data}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to parse webhook data: {e}")
|
||||
raise HTTPException(status_code=400, detail="Invalid JSON payload")
|
||||
|
||||
# Handle webhook verification test (sent by Daily when configuring webhook)
|
||||
if data.get("test") or data.get("Test"):
|
||||
logger.debug("Webhook verification test received")
|
||||
return {"status": "OK"}
|
||||
|
||||
# Validate required fields
|
||||
if not all(key in data for key in ["From", "To", "callId", "callDomain"]):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="Missing required fields: From, To, callId, callDomain",
|
||||
)
|
||||
|
||||
import aiohttp
|
||||
|
||||
from pipecat.runner.daily import configure
|
||||
from pipecat.runner.types import DailyDialinRequest, DialinSettings
|
||||
|
||||
# Create Daily room with SIP capabilities
|
||||
async with aiohttp.ClientSession() as session:
|
||||
try:
|
||||
room_config = await configure(session, sip_caller_phone=data.get("From"))
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to create Daily room: {e}")
|
||||
raise HTTPException(
|
||||
status_code=500, detail=f"Failed to create Daily room: {str(e)}"
|
||||
)
|
||||
|
||||
# Get Daily API URL from environment, fallback to production
|
||||
daily_api_url = os.getenv("DAILY_API_URL", "https://api.daily.co/v1")
|
||||
|
||||
# Get Daily API key from environment
|
||||
daily_api_key = os.getenv("DAILY_API_KEY")
|
||||
if not daily_api_key:
|
||||
logger.error("DAILY_API_KEY not found in environment")
|
||||
raise HTTPException(
|
||||
status_code=500, detail="DAILY_API_KEY not configured on server"
|
||||
)
|
||||
|
||||
# Prepare dial-in settings matching Pipecat Cloud structure
|
||||
dialin_settings = DialinSettings(
|
||||
call_id=data.get("callId"),
|
||||
call_domain=data.get("callDomain"),
|
||||
To=data.get("To"),
|
||||
From=data.get("From"),
|
||||
sip_headers=data.get("sipHeaders"),
|
||||
)
|
||||
|
||||
# Create request body matching Pipecat Cloud payload
|
||||
request_body = DailyDialinRequest(
|
||||
dialin_settings=dialin_settings,
|
||||
daily_api_key=daily_api_key,
|
||||
daily_api_url=daily_api_url,
|
||||
)
|
||||
|
||||
# Start bot with dial-in context
|
||||
bot_module = _get_bot_module()
|
||||
runner_args = DailyRunnerArguments(
|
||||
room_url=room_config.room_url,
|
||||
token=room_config.token,
|
||||
body=request_body.model_dump(),
|
||||
)
|
||||
|
||||
asyncio.create_task(bot_module.bot(runner_args))
|
||||
|
||||
# Generate session ID
|
||||
session_id = str(uuid.uuid4())
|
||||
|
||||
# Return response matching Pipecat Cloud format
|
||||
return {
|
||||
"dailyRoom": room_config.room_url,
|
||||
"dailyToken": room_config.token,
|
||||
"sessionId": session_id,
|
||||
}
|
||||
|
||||
|
||||
def _setup_telephony_routes(app: FastAPI, *, transport_type: str, proxy: str):
|
||||
"""Set up telephony-specific routes."""
|
||||
@@ -809,6 +929,12 @@ def main():
|
||||
default=False,
|
||||
help="Ensure requried WhatsApp environment variables are present",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dialin",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Enable Daily PSTN dial-in webhook handling (requires Daily transport)",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -828,6 +954,11 @@ def main():
|
||||
logger.error("For ESP32, you need to specify `--host IP` so we can do SDP munging.")
|
||||
return
|
||||
|
||||
# Validate dial-in requirements
|
||||
if args.dialin and args.transport != "daily":
|
||||
logger.error("--dialin flag only works with Daily transport (-t daily)")
|
||||
return
|
||||
|
||||
# Log level
|
||||
logger.remove()
|
||||
logger.add(sys.stderr, level="TRACE" if args.verbose else "DEBUG")
|
||||
@@ -856,7 +987,13 @@ def main():
|
||||
elif args.transport == "daily":
|
||||
print()
|
||||
print(f"🚀 Bot ready!")
|
||||
print(f" → Open http://{args.host}:{args.port} in your browser to start a session")
|
||||
if args.dialin:
|
||||
print(
|
||||
f" → Daily dial-in webhook: http://{args.host}:{args.port}/daily-dialin-webhook"
|
||||
)
|
||||
print(f" → Configure this URL in your Daily phone number settings")
|
||||
else:
|
||||
print(f" → Open http://{args.host}:{args.port} in your browser to start a session")
|
||||
print()
|
||||
|
||||
RUNNER_DOWNLOADS_FOLDER = args.folder
|
||||
@@ -871,6 +1008,7 @@ def main():
|
||||
esp32_mode=args.esp32,
|
||||
whatsapp_enabled=args.whatsapp,
|
||||
folder=args.folder,
|
||||
dialin_enabled=args.dialin,
|
||||
)
|
||||
|
||||
# Run the server
|
||||
|
||||
@@ -11,9 +11,48 @@ information to bot functions.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Optional
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from fastapi import WebSocket
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class DialinSettings(BaseModel):
|
||||
"""Dial-in settings from the Daily webhook.
|
||||
|
||||
This model matches the structure sent by Pipecat Cloud and Daily.co webhooks
|
||||
for incoming PSTN/SIP calls.
|
||||
|
||||
Parameters:
|
||||
call_id: Unique identifier for the call (UUID representing sessionId in SIP Network)
|
||||
call_domain: Daily domain for the call (UUID representing Daily Domain on SIP Network)
|
||||
To: The dialed phone number (optional)
|
||||
From: The caller's phone number (optional)
|
||||
sip_headers: Optional SIP headers from the call
|
||||
"""
|
||||
|
||||
call_id: str
|
||||
call_domain: str
|
||||
To: Optional[str] = None
|
||||
From: Optional[str] = None
|
||||
sip_headers: Optional[Dict[str, str]] = None
|
||||
|
||||
|
||||
class DailyDialinRequest(BaseModel):
|
||||
"""Request data for Daily PSTN dial-in requests.
|
||||
|
||||
This is the structure passed in runner_args.body for dial-in calls.
|
||||
It matches the payload structure from Pipecat Cloud's dial-in webhook handler.
|
||||
|
||||
Parameters:
|
||||
dialin_settings: Dial-in configuration including call_id, call_domain, To, From
|
||||
daily_api_key: Daily API key for pinlessCallUpdate (required for dial-in)
|
||||
daily_api_url: Daily API URL (staging or production)
|
||||
"""
|
||||
|
||||
dialin_settings: DialinSettings
|
||||
daily_api_key: str
|
||||
daily_api_url: str
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@@ -281,6 +281,14 @@ async def maybe_capture_participant_camera(
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
from pipecat.transports.smallwebrtc.transport import SmallWebRTCTransport
|
||||
|
||||
if isinstance(transport, SmallWebRTCTransport):
|
||||
await transport.capture_participant_video(video_source="camera")
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
async def maybe_capture_participant_screen(
|
||||
transport: BaseTransport, client: Any, framerate: int = 0
|
||||
@@ -303,6 +311,14 @@ async def maybe_capture_participant_screen(
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
from pipecat.transports.smallwebrtc.transport import SmallWebRTCTransport
|
||||
|
||||
if isinstance(transport, SmallWebRTCTransport):
|
||||
await transport.capture_participant_video(video_source="screenVideo")
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
def _smallwebrtc_sdp_cleanup_ice_candidates(text: str, pattern: str) -> str:
|
||||
"""Clean up ICE candidates in SDP text for SmallWebRTC.
|
||||
|
||||
@@ -199,7 +199,7 @@ class PlivoFrameSerializer(FrameSerializer):
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.exception(f"Failed to hang up Plivo call: {e}")
|
||||
logger.error(f"Failed to hang up Plivo call: {e}")
|
||||
|
||||
async def deserialize(self, data: str | bytes) -> Frame | None:
|
||||
"""Deserializes Plivo WebSocket data to Pipecat frames.
|
||||
|
||||
@@ -225,7 +225,7 @@ class TelnyxFrameSerializer(FrameSerializer):
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.exception(f"Failed to hang up Telnyx call: {e}")
|
||||
logger.error(f"Failed to hang up Telnyx call: {e}")
|
||||
|
||||
async def deserialize(self, data: str | bytes) -> Frame | None:
|
||||
"""Deserializes Telnyx WebSocket data to Pipecat frames.
|
||||
|
||||
@@ -236,7 +236,7 @@ class TwilioFrameSerializer(FrameSerializer):
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.exception(f"Failed to hang up Twilio call: {e}")
|
||||
logger.error(f"Failed to hang up Twilio call: {e}")
|
||||
|
||||
async def deserialize(self, data: str | bytes) -> Frame | None:
|
||||
"""Deserializes Twilio WebSocket data to Pipecat frames.
|
||||
|
||||
@@ -166,6 +166,6 @@ class AIService(FrameProcessor):
|
||||
async for f in generator:
|
||||
if f:
|
||||
if isinstance(f, ErrorFrame):
|
||||
await self.push_error(f)
|
||||
await self.push_error_frame(f)
|
||||
else:
|
||||
await self.push_frame(f)
|
||||
|
||||
@@ -17,7 +17,7 @@ import io
|
||||
import json
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
from typing import Any, Dict, List, Literal, Optional, Union
|
||||
|
||||
import httpx
|
||||
from loguru import logger
|
||||
@@ -40,6 +40,9 @@ from pipecat.frames.frames import (
|
||||
LLMFullResponseStartFrame,
|
||||
LLMMessagesFrame,
|
||||
LLMTextFrame,
|
||||
LLMThoughtEndFrame,
|
||||
LLMThoughtStartFrame,
|
||||
LLMThoughtTextFrame,
|
||||
LLMUpdateSettingsFrame,
|
||||
UserImageRawFrame,
|
||||
)
|
||||
@@ -110,6 +113,24 @@ class AnthropicLLMService(LLMService):
|
||||
# Overriding the default adapter to use the Anthropic one.
|
||||
adapter_class = AnthropicLLMAdapter
|
||||
|
||||
class ThinkingConfig(BaseModel):
|
||||
"""Configuration for extended thinking.
|
||||
|
||||
Parameters:
|
||||
type: Type of thinking mode (currently only "enabled" or "disabled").
|
||||
budget_tokens: Maximum number of tokens for thinking.
|
||||
With today's models, the minimum is 1024.
|
||||
Only allowed if type is "enabled".
|
||||
"""
|
||||
|
||||
# Why `| str` here? To not break compatibility in case Anthropic adds
|
||||
# more types in the future.
|
||||
type: Literal["enabled", "disabled"] | str
|
||||
|
||||
# Why not enforce minimnum of 1024 here? To not break compatibility in
|
||||
# case Anthropic changes this requirement in the future.
|
||||
budget_tokens: int
|
||||
|
||||
class InputParams(BaseModel):
|
||||
"""Input parameters for Anthropic model inference.
|
||||
|
||||
@@ -124,6 +145,10 @@ class AnthropicLLMService(LLMService):
|
||||
temperature: Sampling temperature between 0.0 and 1.0.
|
||||
top_k: Top-k sampling parameter.
|
||||
top_p: Top-p sampling parameter between 0.0 and 1.0.
|
||||
thinking: Extended thinking configuration.
|
||||
Enabling extended thinking causes the model to spend more time "thinking" before responding.
|
||||
It also causes this service to emit LLMThinking*Frames during response generation.
|
||||
Extended thinking is disabled by default.
|
||||
extra: Additional parameters to pass to the API.
|
||||
"""
|
||||
|
||||
@@ -133,6 +158,9 @@ class AnthropicLLMService(LLMService):
|
||||
temperature: Optional[float] = Field(default_factory=lambda: NOT_GIVEN, ge=0.0, le=1.0)
|
||||
top_k: Optional[int] = Field(default_factory=lambda: NOT_GIVEN, ge=0)
|
||||
top_p: Optional[float] = Field(default_factory=lambda: NOT_GIVEN, ge=0.0, le=1.0)
|
||||
thinking: Optional["AnthropicLLMService.ThinkingConfig"] = Field(
|
||||
default_factory=lambda: NOT_GIVEN
|
||||
)
|
||||
extra: Optional[Dict[str, Any]] = Field(default_factory=dict)
|
||||
|
||||
def model_post_init(self, __context):
|
||||
@@ -191,6 +219,7 @@ class AnthropicLLMService(LLMService):
|
||||
"temperature": params.temperature,
|
||||
"top_k": params.top_k,
|
||||
"top_p": params.top_p,
|
||||
"thinking": params.thinking,
|
||||
"extra": params.extra if isinstance(params.extra, dict) else {},
|
||||
}
|
||||
|
||||
@@ -238,28 +267,43 @@ class AnthropicLLMService(LLMService):
|
||||
"""
|
||||
messages = []
|
||||
system = NOT_GIVEN
|
||||
tools = []
|
||||
if isinstance(context, LLMContext):
|
||||
adapter: AnthropicLLMAdapter = self.get_llm_adapter()
|
||||
params = adapter.get_llm_invocation_params(
|
||||
invocation_params = adapter.get_llm_invocation_params(
|
||||
context, enable_prompt_caching=self._settings["enable_prompt_caching"]
|
||||
)
|
||||
messages = params["messages"]
|
||||
system = params["system"]
|
||||
messages = invocation_params["messages"]
|
||||
system = invocation_params["system"]
|
||||
tools = invocation_params["tools"]
|
||||
else:
|
||||
context = AnthropicLLMContext.upgrade_to_anthropic(context)
|
||||
messages = context.messages
|
||||
system = getattr(context, "system", NOT_GIVEN)
|
||||
tools = context.tools or []
|
||||
|
||||
# Build params using the same method as streaming completions
|
||||
params = {
|
||||
"model": self.model_name,
|
||||
"max_tokens": self._settings["max_tokens"],
|
||||
"stream": False,
|
||||
"temperature": self._settings["temperature"],
|
||||
"top_k": self._settings["top_k"],
|
||||
"top_p": self._settings["top_p"],
|
||||
"messages": messages,
|
||||
"system": system,
|
||||
"tools": tools,
|
||||
"betas": ["interleaved-thinking-2025-05-14"],
|
||||
}
|
||||
if self._settings["thinking"]:
|
||||
params["thinking"] = self._settings["thinking"].model_dump(exclude_unset=True)
|
||||
|
||||
params.update(self._settings["extra"])
|
||||
|
||||
# LLM completion
|
||||
response = await self._client.messages.create(
|
||||
model=self.model_name,
|
||||
messages=messages,
|
||||
system=system,
|
||||
max_tokens=8192,
|
||||
stream=False,
|
||||
)
|
||||
response = await self._client.beta.messages.create(**params)
|
||||
|
||||
return response.content[0].text
|
||||
return next((block.text for block in response.content if hasattr(block, "text")), None)
|
||||
|
||||
def create_context_aggregator(
|
||||
self,
|
||||
@@ -354,12 +398,21 @@ class AnthropicLLMService(LLMService):
|
||||
"top_p": self._settings["top_p"],
|
||||
}
|
||||
|
||||
# Add thinking parameter if set
|
||||
if self._settings["thinking"]:
|
||||
params["thinking"] = self._settings["thinking"].model_dump(exclude_unset=True)
|
||||
|
||||
# Messages, system, tools
|
||||
params.update(params_from_context)
|
||||
|
||||
params.update(self._settings["extra"])
|
||||
|
||||
response = await self._create_message_stream(self._client.messages.create, params)
|
||||
# "Interleaved thinking" needed to allow thinking between sequences
|
||||
# of function calls, when extended thinking is enabled.
|
||||
# Note that this requires us to use `client.beta`, below.
|
||||
params.update({"betas": ["interleaved-thinking-2025-05-14"]})
|
||||
|
||||
response = await self._create_message_stream(self._client.beta.messages.create, params)
|
||||
|
||||
await self.stop_ttfb_metrics()
|
||||
|
||||
@@ -373,19 +426,28 @@ class AnthropicLLMService(LLMService):
|
||||
|
||||
if event.type == "content_block_delta":
|
||||
if hasattr(event.delta, "text"):
|
||||
frame = LLMTextFrame(event.delta.text)
|
||||
frame.includes_inter_frame_spaces = True
|
||||
await self.push_frame(frame)
|
||||
await self.push_frame(LLMTextFrame(event.delta.text))
|
||||
completion_tokens_estimate += self._estimate_tokens(event.delta.text)
|
||||
elif hasattr(event.delta, "partial_json") and tool_use_block:
|
||||
json_accumulator += event.delta.partial_json
|
||||
completion_tokens_estimate += self._estimate_tokens(
|
||||
event.delta.partial_json
|
||||
)
|
||||
elif hasattr(event.delta, "thinking"):
|
||||
await self.push_frame(LLMThoughtTextFrame(text=event.delta.thinking))
|
||||
elif hasattr(event.delta, "signature"):
|
||||
await self.push_frame(LLMThoughtEndFrame(signature=event.delta.signature))
|
||||
elif event.type == "content_block_start":
|
||||
if event.content_block.type == "tool_use":
|
||||
tool_use_block = event.content_block
|
||||
json_accumulator = ""
|
||||
elif event.content_block.type == "thinking":
|
||||
await self.push_frame(
|
||||
LLMThoughtStartFrame(
|
||||
append_to_context=True,
|
||||
llm=self.get_llm_adapter().id_for_llm_specific_messages,
|
||||
)
|
||||
)
|
||||
elif (
|
||||
event.type == "message_delta"
|
||||
and hasattr(event.delta, "stop_reason")
|
||||
@@ -460,8 +522,7 @@ class AnthropicLLMService(LLMService):
|
||||
except httpx.TimeoutException:
|
||||
await self._call_event_handler("on_completion_timeout")
|
||||
except Exception as e:
|
||||
logger.exception(f"{self} exception: {e}")
|
||||
await self.push_error(ErrorFrame(f"{e}"))
|
||||
await self.push_error(error_msg=f"Unknown error occurred: {e}", exception=e)
|
||||
finally:
|
||||
await self.stop_processing_metrics()
|
||||
await self.push_frame(LLMFullResponseEndFrame())
|
||||
|
||||
@@ -17,11 +17,10 @@ from urllib.parse import urlencode
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat import __version__ as pipecat_version
|
||||
from pipecat import version as pipecat_version
|
||||
from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
EndFrame,
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
InterimTranscriptionFrame,
|
||||
StartFrame,
|
||||
@@ -30,7 +29,7 @@ from pipecat.frames.frames import (
|
||||
UserStoppedSpeakingFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.stt_service import STTService
|
||||
from pipecat.services.stt_service import WebsocketSTTService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
from pipecat.utils.tracing.service_decorators import traced_stt
|
||||
@@ -44,15 +43,15 @@ from .models import (
|
||||
)
|
||||
|
||||
try:
|
||||
import websockets
|
||||
from websockets.asyncio.client import connect as websocket_connect
|
||||
from websockets.protocol import State
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error('In order to use AssemblyAI, you need to `pip install "pipecat-ai[assemblyai]"`.')
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
class AssemblyAISTTService(STTService):
|
||||
class AssemblyAISTTService(WebsocketSTTService):
|
||||
"""AssemblyAI real-time speech-to-text service.
|
||||
|
||||
Provides real-time speech transcription using AssemblyAI's WebSocket API.
|
||||
@@ -80,15 +79,14 @@ class AssemblyAISTTService(STTService):
|
||||
vad_force_turn_endpoint: Whether to force turn endpoint on VAD stop. Defaults to True.
|
||||
**kwargs: Additional arguments passed to parent STTService class.
|
||||
"""
|
||||
super().__init__(sample_rate=connection_params.sample_rate, **kwargs)
|
||||
|
||||
self._api_key = api_key
|
||||
self._language = language
|
||||
self._api_endpoint_base_url = api_endpoint_base_url
|
||||
self._connection_params = connection_params
|
||||
self._vad_force_turn_endpoint = vad_force_turn_endpoint
|
||||
|
||||
super().__init__(sample_rate=self._connection_params.sample_rate, **kwargs)
|
||||
|
||||
self._websocket = None
|
||||
self._termination_event = asyncio.Event()
|
||||
self._received_termination = False
|
||||
self._connected = False
|
||||
@@ -114,7 +112,7 @@ class AssemblyAISTTService(STTService):
|
||||
frame: Start frame to begin processing.
|
||||
"""
|
||||
await super().start(frame)
|
||||
self._chunk_size_bytes = int(self._chunk_size_ms * self._sample_rate * 2 / 1000)
|
||||
self._chunk_size_bytes = int(self._chunk_size_ms * self.sample_rate * 2 / 1000)
|
||||
await self._connect()
|
||||
|
||||
async def stop(self, frame: EndFrame):
|
||||
@@ -146,10 +144,11 @@ class AssemblyAISTTService(STTService):
|
||||
"""
|
||||
self._audio_buffer.extend(audio)
|
||||
|
||||
while len(self._audio_buffer) >= self._chunk_size_bytes:
|
||||
chunk = bytes(self._audio_buffer[: self._chunk_size_bytes])
|
||||
self._audio_buffer = self._audio_buffer[self._chunk_size_bytes :]
|
||||
await self._websocket.send(chunk)
|
||||
if self._websocket and self._websocket.state is State.OPEN:
|
||||
while len(self._audio_buffer) >= self._chunk_size_bytes:
|
||||
chunk = bytes(self._audio_buffer[: self._chunk_size_bytes])
|
||||
self._audio_buffer = self._audio_buffer[self._chunk_size_bytes :]
|
||||
await self._websocket.send(chunk)
|
||||
|
||||
yield None
|
||||
|
||||
@@ -164,7 +163,11 @@ class AssemblyAISTTService(STTService):
|
||||
if isinstance(frame, UserStartedSpeakingFrame):
|
||||
await self.start_ttfb_metrics()
|
||||
elif isinstance(frame, UserStoppedSpeakingFrame):
|
||||
if self._vad_force_turn_endpoint:
|
||||
if (
|
||||
self._vad_force_turn_endpoint
|
||||
and self._websocket
|
||||
and self._websocket.state is State.OPEN
|
||||
):
|
||||
await self._websocket.send(json.dumps({"type": "ForceEndpoint"}))
|
||||
await self.start_processing_metrics()
|
||||
|
||||
@@ -191,28 +194,20 @@ class AssemblyAISTTService(STTService):
|
||||
return self._api_endpoint_base_url
|
||||
|
||||
async def _connect(self):
|
||||
try:
|
||||
ws_url = self._build_ws_url()
|
||||
headers = {
|
||||
"Authorization": self._api_key,
|
||||
"User-Agent": f"AssemblyAI/1.0 (integration=Pipecat/{pipecat_version})",
|
||||
}
|
||||
self._websocket = await websocket_connect(
|
||||
ws_url,
|
||||
additional_headers=headers,
|
||||
)
|
||||
self._connected = True
|
||||
self._receive_task = self.create_task(self._receive_task_handler())
|
||||
"""Connect to the AssemblyAI service.
|
||||
|
||||
await self._call_event_handler("on_connected")
|
||||
except Exception as e:
|
||||
logger.error(f"{self} exception: {e}")
|
||||
self._connected = False
|
||||
await self.push_error(ErrorFrame(error=f"{self} error: {e}"))
|
||||
raise
|
||||
Establishes websocket connection and starts receive task.
|
||||
"""
|
||||
await self._connect_websocket()
|
||||
|
||||
if self._websocket and not self._receive_task:
|
||||
self._receive_task = self.create_task(self._receive_task_handler(self._report_error))
|
||||
|
||||
async def _disconnect(self):
|
||||
"""Disconnect from AssemblyAI WebSocket and wait for termination message."""
|
||||
"""Disconnect from the AssemblyAI service.
|
||||
|
||||
Sends termination message, waits for acknowledgment, and cleans up.
|
||||
"""
|
||||
if not self._connected or not self._websocket:
|
||||
return
|
||||
|
||||
@@ -220,55 +215,96 @@ class AssemblyAISTTService(STTService):
|
||||
self._termination_event.clear()
|
||||
self._received_termination = False
|
||||
|
||||
if len(self._audio_buffer) > 0:
|
||||
await self._websocket.send(bytes(self._audio_buffer))
|
||||
self._audio_buffer.clear()
|
||||
|
||||
try:
|
||||
await self._websocket.send(json.dumps({"type": "Terminate"}))
|
||||
if self._websocket.state is State.OPEN:
|
||||
# Send any remaining audio
|
||||
if len(self._audio_buffer) > 0:
|
||||
await self._websocket.send(bytes(self._audio_buffer))
|
||||
self._audio_buffer.clear()
|
||||
|
||||
# Send termination message and wait for acknowledgment
|
||||
try:
|
||||
await asyncio.wait_for(self._termination_event.wait(), timeout=5.0)
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning("Timed out waiting for termination message from server")
|
||||
await self._websocket.send(json.dumps({"type": "Terminate"}))
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"{self} exception: {e}")
|
||||
await self.push_error(ErrorFrame(error=f"{self} error: {e}"))
|
||||
try:
|
||||
await asyncio.wait_for(self._termination_event.wait(), timeout=5.0)
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning("Timed out waiting for termination message from server")
|
||||
|
||||
if self._receive_task:
|
||||
await self.cancel_task(self._receive_task)
|
||||
|
||||
await self._websocket.close()
|
||||
except Exception as e:
|
||||
await self.push_error(error_msg=f"Unknown error occurred: {e}", exception=e)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"{self} exception: {e}")
|
||||
await self.push_error(ErrorFrame(error=f"{self} error: {e}"))
|
||||
await self.push_error(error_msg=f"Unknown error occurred: {e}", exception=e)
|
||||
finally:
|
||||
# Clean up tasks and connection
|
||||
if self._receive_task:
|
||||
await self.cancel_task(self._receive_task)
|
||||
self._receive_task = None
|
||||
|
||||
await self._disconnect_websocket()
|
||||
|
||||
async def _connect_websocket(self):
|
||||
"""Establish the websocket connection to AssemblyAI."""
|
||||
try:
|
||||
if self._websocket and self._websocket.state is State.OPEN:
|
||||
return
|
||||
|
||||
logger.debug("Connecting to AssemblyAI WebSocket")
|
||||
|
||||
ws_url = self._build_ws_url()
|
||||
headers = {
|
||||
"Authorization": self._api_key,
|
||||
"User-Agent": f"AssemblyAI/1.0 (integration=Pipecat/{pipecat_version()})",
|
||||
}
|
||||
self._websocket = await websocket_connect(
|
||||
ws_url,
|
||||
additional_headers=headers,
|
||||
)
|
||||
self._connected = True
|
||||
await self._call_event_handler("on_connected")
|
||||
logger.debug(f"{self} Connected to AssemblyAI WebSocket")
|
||||
except Exception as e:
|
||||
self._connected = False
|
||||
await self.push_error(error_msg=f"Unable to connect to AssemblyAI: {e}", exception=e)
|
||||
raise
|
||||
|
||||
async def _disconnect_websocket(self):
|
||||
"""Close the websocket connection to AssemblyAI."""
|
||||
try:
|
||||
if self._websocket:
|
||||
logger.debug("Disconnecting from AssemblyAI WebSocket")
|
||||
await self._websocket.close()
|
||||
except Exception as e:
|
||||
await self.push_error(error_msg=f"Error closing websocket: {e}", exception=e)
|
||||
finally:
|
||||
self._websocket = None
|
||||
self._connected = False
|
||||
self._receive_task = None
|
||||
await self._call_event_handler("on_disconnected")
|
||||
|
||||
async def _receive_task_handler(self):
|
||||
"""Handle incoming WebSocket messages."""
|
||||
try:
|
||||
while self._connected:
|
||||
try:
|
||||
message = await self._websocket.recv()
|
||||
data = json.loads(message)
|
||||
await self._handle_message(data)
|
||||
except websockets.exceptions.ConnectionClosedOK:
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"{self} exception: {e}")
|
||||
await self.push_error(ErrorFrame(error=f"{self} error: {e}"))
|
||||
break
|
||||
def _get_websocket(self):
|
||||
"""Get the current WebSocket connection.
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"{self} exception: {e}")
|
||||
await self.push_error(ErrorFrame(error=f"{self} error: {e}"))
|
||||
Returns:
|
||||
The WebSocket connection.
|
||||
|
||||
Raises:
|
||||
Exception: If WebSocket is not connected.
|
||||
"""
|
||||
if self._websocket:
|
||||
return self._websocket
|
||||
raise Exception("Websocket not connected")
|
||||
|
||||
async def _receive_messages(self):
|
||||
"""Receive and process websocket messages.
|
||||
|
||||
Continuously processes messages from the websocket connection.
|
||||
"""
|
||||
async for message in self._get_websocket():
|
||||
try:
|
||||
data = json.loads(message)
|
||||
await self._handle_message(data)
|
||||
except json.JSONDecodeError:
|
||||
logger.warning(f"Received non-JSON message: {message}")
|
||||
|
||||
def _parse_message(self, message: Dict[str, Any]) -> BaseMessage:
|
||||
"""Parse a raw message into the appropriate message type."""
|
||||
@@ -297,8 +333,7 @@ class AssemblyAISTTService(STTService):
|
||||
elif isinstance(parsed_message, TerminationMessage):
|
||||
await self._handle_termination(parsed_message)
|
||||
except Exception as e:
|
||||
logger.error(f"{self} exception: {e}")
|
||||
await self.push_error(ErrorFrame(error=f"{self} error: {e}"))
|
||||
await self.push_error(error_msg=f"Unknown error occurred: {e}", exception=e)
|
||||
|
||||
async def _handle_termination(self, message: TerminationMessage):
|
||||
"""Handle termination message."""
|
||||
|
||||
@@ -56,6 +56,17 @@ def language_to_async_language(language: Language) -> Optional[str]:
|
||||
Language.ES: "es",
|
||||
Language.DE: "de",
|
||||
Language.IT: "it",
|
||||
Language.PT: "pt",
|
||||
Language.NL: "nl",
|
||||
Language.AR: "ar",
|
||||
Language.RU: "ru",
|
||||
Language.RO: "ro",
|
||||
Language.JA: "ja",
|
||||
Language.HE: "he",
|
||||
Language.HY: "hy",
|
||||
Language.TR: "tr",
|
||||
Language.HI: "hi",
|
||||
Language.ZH: "zh",
|
||||
}
|
||||
|
||||
return resolve_language(language, LANGUAGE_MAP, use_base_code=True)
|
||||
@@ -74,7 +85,7 @@ class AsyncAITTSService(InterruptibleTTSService):
|
||||
language: Language to use for synthesis.
|
||||
"""
|
||||
|
||||
language: Optional[Language] = Language.EN
|
||||
language: Optional[Language] = None
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -83,7 +94,7 @@ class AsyncAITTSService(InterruptibleTTSService):
|
||||
voice_id: str,
|
||||
version: str = "v1",
|
||||
url: str = "wss://api.async.ai/text_to_speech/websocket/ws",
|
||||
model: str = "asyncflow_v2.0",
|
||||
model: str = "asyncflow_multilingual_v1.0",
|
||||
sample_rate: Optional[int] = None,
|
||||
encoding: str = "pcm_s16le",
|
||||
container: str = "raw",
|
||||
@@ -99,7 +110,7 @@ class AsyncAITTSService(InterruptibleTTSService):
|
||||
https://docs.async.ai/list-voices-16699698e0
|
||||
version: Async API version.
|
||||
url: WebSocket URL for Async TTS API.
|
||||
model: TTS model to use (e.g., "asyncflow_v2.0").
|
||||
model: TTS model to use (e.g., "asyncflow_multilingual_v1.0").
|
||||
sample_rate: Audio sample rate.
|
||||
encoding: Audio encoding format.
|
||||
container: Audio container format.
|
||||
@@ -128,7 +139,7 @@ class AsyncAITTSService(InterruptibleTTSService):
|
||||
},
|
||||
"language": self.language_to_service_language(params.language)
|
||||
if params.language
|
||||
else "en",
|
||||
else None,
|
||||
}
|
||||
|
||||
self.set_model_name(model)
|
||||
@@ -146,15 +157,6 @@ class AsyncAITTSService(InterruptibleTTSService):
|
||||
"""
|
||||
return True
|
||||
|
||||
@property
|
||||
def includes_inter_frame_spaces(self) -> bool:
|
||||
"""Indicates that AsyncAI TTSTextFrames include necessary inter-frame spaces.
|
||||
|
||||
Returns:
|
||||
True, indicating that AsyncAI's text frames include necessary inter-frame spaces.
|
||||
"""
|
||||
return True
|
||||
|
||||
def language_to_service_language(self, language: Language) -> Optional[str]:
|
||||
"""Convert a Language enum to Async language format.
|
||||
|
||||
@@ -237,8 +239,7 @@ class AsyncAITTSService(InterruptibleTTSService):
|
||||
|
||||
await self._call_event_handler("on_connected")
|
||||
except Exception as e:
|
||||
logger.error(f"{self} exception: {e}")
|
||||
await self.push_error(ErrorFrame(error=f"{self} error: {e}"))
|
||||
await self.push_error(error_msg=f"Unknown error occurred: {e}", exception=e)
|
||||
self._websocket = None
|
||||
await self._call_event_handler("on_connection_error", f"{e}")
|
||||
|
||||
@@ -250,8 +251,7 @@ class AsyncAITTSService(InterruptibleTTSService):
|
||||
logger.debug("Disconnecting from Async")
|
||||
await self._websocket.close()
|
||||
except Exception as e:
|
||||
logger.error(f"{self} exception: {e}")
|
||||
await self.push_error(ErrorFrame(error=f"{self} error: {e}"))
|
||||
await self.push_error(error_msg=f"Unknown error occurred: {e}", exception=e)
|
||||
finally:
|
||||
self._websocket = None
|
||||
self._started = False
|
||||
@@ -296,12 +296,11 @@ class AsyncAITTSService(InterruptibleTTSService):
|
||||
)
|
||||
await self.push_frame(frame)
|
||||
elif msg.get("error_code"):
|
||||
logger.error(f"{self} error: {msg}")
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
await self.stop_all_metrics()
|
||||
await self.push_error(ErrorFrame(error=f"{self} error: {msg['message']}"))
|
||||
await self.push_error(error_msg=f"Error: {msg['message']}")
|
||||
else:
|
||||
logger.error(f"{self} error, unknown message type: {msg}")
|
||||
await self.push_error(error_msg=f"Unknown message type: {msg}")
|
||||
|
||||
async def _keepalive_task_handler(self):
|
||||
"""Send periodic keepalive messages to maintain WebSocket connection."""
|
||||
@@ -344,16 +343,14 @@ class AsyncAITTSService(InterruptibleTTSService):
|
||||
await self._get_websocket().send(msg)
|
||||
await self.start_tts_usage_metrics(text)
|
||||
except Exception as e:
|
||||
logger.error(f"{self} exception: {e}")
|
||||
yield ErrorFrame(error=f"{self} error: {e}")
|
||||
yield ErrorFrame(error=f"Unknown error occurred: {e}")
|
||||
yield TTSStoppedFrame()
|
||||
await self._disconnect()
|
||||
await self._connect()
|
||||
return
|
||||
yield None
|
||||
except Exception as e:
|
||||
logger.error(f"{self} exception: {e}")
|
||||
yield ErrorFrame(error=f"{self} error: {e}")
|
||||
yield ErrorFrame(error=f"Unknown error occurred: {e}")
|
||||
|
||||
|
||||
class AsyncAIHttpTTSService(TTSService):
|
||||
@@ -371,7 +368,7 @@ class AsyncAIHttpTTSService(TTSService):
|
||||
language: Language to use for synthesis.
|
||||
"""
|
||||
|
||||
language: Optional[Language] = Language.EN
|
||||
language: Optional[Language] = None
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -379,7 +376,7 @@ class AsyncAIHttpTTSService(TTSService):
|
||||
api_key: str,
|
||||
voice_id: str,
|
||||
aiohttp_session: aiohttp.ClientSession,
|
||||
model: str = "asyncflow_v2.0",
|
||||
model: str = "asyncflow_multilingual_v1.0",
|
||||
url: str = "https://api.async.ai",
|
||||
version: str = "v1",
|
||||
sample_rate: Optional[int] = None,
|
||||
@@ -394,7 +391,7 @@ class AsyncAIHttpTTSService(TTSService):
|
||||
api_key: Async API key.
|
||||
voice_id: ID of the voice to use for synthesis.
|
||||
aiohttp_session: An aiohttp session for making HTTP requests.
|
||||
model: TTS model to use (e.g., "asyncflow_v2.0").
|
||||
model: TTS model to use (e.g., "asyncflow_multilingual_v1.0").
|
||||
url: Base URL for Async API.
|
||||
version: API version string for Async API.
|
||||
sample_rate: Audio sample rate.
|
||||
@@ -418,7 +415,7 @@ class AsyncAIHttpTTSService(TTSService):
|
||||
},
|
||||
"language": self.language_to_service_language(params.language)
|
||||
if params.language
|
||||
else "en",
|
||||
else None,
|
||||
}
|
||||
self.set_voice(voice_id)
|
||||
self.set_model_name(model)
|
||||
@@ -433,15 +430,6 @@ class AsyncAIHttpTTSService(TTSService):
|
||||
"""
|
||||
return True
|
||||
|
||||
@property
|
||||
def includes_inter_frame_spaces(self) -> bool:
|
||||
"""Indicates that AsyncAI TTSTextFrames include necessary inter-frame spaces.
|
||||
|
||||
Returns:
|
||||
True, indicating that AsyncAI's text frames include necessary inter-frame spaces.
|
||||
"""
|
||||
return True
|
||||
|
||||
def language_to_service_language(self, language: Language) -> Optional[str]:
|
||||
"""Convert a Language enum to Async language format.
|
||||
|
||||
@@ -495,8 +483,7 @@ class AsyncAIHttpTTSService(TTSService):
|
||||
async with self._session.post(url, json=payload, headers=headers) as response:
|
||||
if response.status != 200:
|
||||
error_text = await response.text()
|
||||
logger.error(f"Async API error: {error_text}")
|
||||
await self.push_error(ErrorFrame(error=f"Async API error: {error_text}"))
|
||||
await self.push_error(error_msg=f"Async API error: {error_text}")
|
||||
raise Exception(f"Async API returned status {response.status}: {error_text}")
|
||||
|
||||
audio_data = await response.read()
|
||||
@@ -512,8 +499,7 @@ class AsyncAIHttpTTSService(TTSService):
|
||||
yield frame
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"{self} exception: {e}")
|
||||
await self.push_error(ErrorFrame(error=f"{self} error: {e}"))
|
||||
await self.push_error(error_msg=f"Unknown error occurred: {e}", exception=e)
|
||||
finally:
|
||||
await self.stop_ttfb_metrics()
|
||||
yield TTSStoppedFrame()
|
||||
|
||||
@@ -8,8 +8,10 @@ import sys
|
||||
|
||||
from pipecat.services import DeprecatedModuleProxy
|
||||
|
||||
from .agent_core import *
|
||||
from .llm import *
|
||||
from .nova_sonic import *
|
||||
from .sagemaker import *
|
||||
from .stt import *
|
||||
from .tts import *
|
||||
|
||||
|
||||
258
src/pipecat/services/aws/agent_core.py
Normal file
258
src/pipecat/services/aws/agent_core.py
Normal file
@@ -0,0 +1,258 @@
|
||||
#
|
||||
# Copyright (c) 2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""AWS AgentCore Processor Module.
|
||||
|
||||
This module defines the AWSAgentCoreProcessor, which invokes agents hosted on
|
||||
Amazon Bedrock AgentCore Runtime and streams their responses as LLMTextFrames.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
from typing import Callable, Optional
|
||||
|
||||
import aioboto3
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
LLMContextFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMTextFrame,
|
||||
)
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext, LLMSpecificMessage
|
||||
from pipecat.processors.aggregators.openai_llm_context import (
|
||||
OpenAILLMContext,
|
||||
OpenAILLMContextFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
|
||||
|
||||
def default_context_to_payload_transformer(
|
||||
context: LLMContext | OpenAILLMContext,
|
||||
) -> Optional[str]:
|
||||
"""Default transformer to create AgentCore payload from LLM context.
|
||||
|
||||
Extracts the latest user or system message text and wraps it in {"prompt": "<text>"}.
|
||||
|
||||
Args:
|
||||
context: The LLM context containing conversation messages.
|
||||
|
||||
Returns:
|
||||
A JSON string payload for AgentCore, or None if no valid message found.
|
||||
"""
|
||||
messages = context.messages
|
||||
|
||||
if not messages:
|
||||
return None
|
||||
|
||||
last_message = messages[-1]
|
||||
if isinstance(last_message, LLMSpecificMessage) or last_message.get("role") not in (
|
||||
"user",
|
||||
"system",
|
||||
):
|
||||
return None
|
||||
|
||||
content = last_message.get("content")
|
||||
if not content:
|
||||
return None
|
||||
|
||||
if isinstance(content, str):
|
||||
prompt = content
|
||||
elif isinstance(content, list):
|
||||
prompt = " ".join([part.get("text", "") for part in content])
|
||||
else:
|
||||
return None
|
||||
|
||||
return json.dumps({"prompt": prompt})
|
||||
|
||||
|
||||
def default_response_to_output_transformer(response_line: str) -> Optional[str]:
|
||||
"""Default transformer to extract output text from AgentCore response.
|
||||
|
||||
Expects responses with {"response": "<text>"} format.
|
||||
|
||||
Args:
|
||||
response_line: The raw response line from AgentCore (without "data: " prefix).
|
||||
|
||||
Returns:
|
||||
The extracted output text, or None if no text found.
|
||||
"""
|
||||
response_json = json.loads(response_line)
|
||||
return response_json.get("response")
|
||||
|
||||
|
||||
class AWSAgentCoreProcessor(FrameProcessor):
|
||||
"""Processor that runs an Amazon Bedrock AgentCore agent.
|
||||
|
||||
Input:
|
||||
- LLMContextFrame: Supplies a context used to invoke the agent.
|
||||
|
||||
Output:
|
||||
- LLMTextFrame: The agent's text response(s).
|
||||
A single agent invocation may result in multiple text frames.
|
||||
|
||||
This processor transforms the input context to a payload for the AgentCore
|
||||
agent, and transforms the agent's response(s) into output text frame(s). Both
|
||||
mappings are configurable via transformers. Below is the default behavior.
|
||||
|
||||
Input transformer (context_to_payload_transformer):
|
||||
- Grabs the latest user or system message (if it's the latest message)
|
||||
- Extracts its text content
|
||||
- Constructs a payload that looks like {"prompt": "<text>"}
|
||||
|
||||
Output transformer (response_to_output_transformer):
|
||||
- Expects responses that look like {"response": "<text>"}
|
||||
- Extracts the text for use in the LLMTextFrame(s)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
agentArn: str,
|
||||
aws_access_key: Optional[str] = None,
|
||||
aws_secret_key: Optional[str] = None,
|
||||
aws_session_token: Optional[str] = None,
|
||||
aws_region: Optional[str] = None,
|
||||
context_to_payload_transformer: Optional[
|
||||
Callable[[LLMContext | OpenAILLMContext], Optional[str]]
|
||||
] = None,
|
||||
response_to_output_transformer: Optional[Callable[[str], Optional[str]]] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the AWS AgentCore processor.
|
||||
|
||||
Args:
|
||||
agentArn: The Amazon Web Services Resource Name (ARN) of the agent.
|
||||
aws_access_key: AWS access key ID. If None, uses default credentials.
|
||||
aws_secret_key: AWS secret access key. If None, uses default credentials.
|
||||
aws_session_token: AWS session token for temporary credentials.
|
||||
aws_region: AWS region.
|
||||
context_to_payload_transformer: Optional callable to transform
|
||||
LLMContext into AgentCore payload string. If None, uses
|
||||
default_context_to_payload_transformer.
|
||||
response_to_output_transformer: Optional callable to extract output text
|
||||
from AgentCore response. If None, uses
|
||||
default_response_to_output_transformer.
|
||||
**kwargs: Additional arguments passed to parent FrameProcessor.
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self._agentArn = agentArn
|
||||
self._aws_session = aioboto3.Session()
|
||||
|
||||
# Store AWS session parameters for creating client in async context
|
||||
self._aws_params = {
|
||||
"aws_access_key_id": aws_access_key or os.getenv("AWS_ACCESS_KEY_ID"),
|
||||
"aws_secret_access_key": aws_secret_key or os.getenv("AWS_SECRET_ACCESS_KEY"),
|
||||
"aws_session_token": aws_session_token or os.getenv("AWS_SESSION_TOKEN"),
|
||||
"region_name": aws_region or os.getenv("AWS_REGION", "us-east-1"),
|
||||
}
|
||||
|
||||
# Set transformers with defaults
|
||||
self._context_to_payload_transformer = (
|
||||
context_to_payload_transformer or default_context_to_payload_transformer
|
||||
)
|
||||
self._response_to_output_transformer = (
|
||||
response_to_output_transformer or default_response_to_output_transformer
|
||||
)
|
||||
|
||||
# State for managing output response bookends
|
||||
self._output_response_open = False
|
||||
self._last_text_frame_time: Optional[float] = None
|
||||
self._close_task: Optional[asyncio.Task] = None
|
||||
self._output_response_timeout = 1.0 # seconds
|
||||
|
||||
async def _close_output_response_after_timeout(self):
|
||||
"""Close the output response after timeout if no new text frames arrive."""
|
||||
await asyncio.sleep(self._output_response_timeout)
|
||||
if self._output_response_open:
|
||||
self._output_response_open = False
|
||||
await self.push_frame(LLMFullResponseEndFrame())
|
||||
|
||||
async def _push_text_frame(self, text: str):
|
||||
"""Push a text frame, managing output response bookends."""
|
||||
# Cancel any pending close task
|
||||
if self._close_task and not self._close_task.done():
|
||||
await self.cancel_task(self._close_task)
|
||||
|
||||
# Open output response if needed
|
||||
if not self._output_response_open:
|
||||
await self.push_frame(LLMFullResponseStartFrame())
|
||||
self._output_response_open = True
|
||||
|
||||
# Push the text frame
|
||||
await self.push_frame(LLMTextFrame(text))
|
||||
self._last_text_frame_time = asyncio.get_event_loop().time()
|
||||
|
||||
# Schedule closing the output response after timeout
|
||||
self._close_task = self.create_task(self._close_output_response_after_timeout())
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process incoming frames and handle LLM message frames.
|
||||
|
||||
Args:
|
||||
frame: The incoming frame to process.
|
||||
direction: The direction of frame flow in the pipeline.
|
||||
"""
|
||||
await super().process_frame(frame, direction)
|
||||
if isinstance(frame, (LLMContextFrame, OpenAILLMContextFrame)):
|
||||
# Create payload to invoke AgentCore agent
|
||||
payload = self._context_to_payload_transformer(frame.context)
|
||||
|
||||
if not payload:
|
||||
return
|
||||
|
||||
async with self._aws_session.client("bedrock-agentcore", **self._aws_params) as client:
|
||||
# Invoke the AgentCore agent
|
||||
response = await client.invoke_agent_runtime(
|
||||
agentRuntimeArn=self._agentArn, payload=payload.encode()
|
||||
)
|
||||
|
||||
# Determine if this is a streamed multi-part response, which
|
||||
# will affect our parsing
|
||||
is_multi_part_response = "text/event-stream" in response.get("contentType", "")
|
||||
|
||||
# Handle each response part (there may be one, for single
|
||||
# responses, or multiple, for streamed multi-part responses)
|
||||
async for part in response.get("response", []):
|
||||
part_string = part.decode("utf-8")
|
||||
|
||||
# In streamed multi-part responses, each part might have
|
||||
# one or more lines, each of which starts with "data: ".
|
||||
# Treat each line as a response.
|
||||
if is_multi_part_response:
|
||||
for line in part_string.split("\n"):
|
||||
# Get response text from this line
|
||||
if not line:
|
||||
continue
|
||||
if not line.startswith("data: "):
|
||||
logger.warning(f"Expected line to start with 'data: ', got: {line}")
|
||||
continue
|
||||
line = line[6:] # omit "data: "
|
||||
|
||||
# Transform response line to output text
|
||||
text = self._response_to_output_transformer(line)
|
||||
if text:
|
||||
await self._push_text_frame(text)
|
||||
|
||||
# In single-part responses, the whole part is one response
|
||||
# and there's no "data: " prefix
|
||||
else:
|
||||
# Transform response part string to output text
|
||||
text = self._response_to_output_transformer(part_string)
|
||||
if text:
|
||||
await self._push_text_frame(text)
|
||||
|
||||
# Final close if output response is still open after all parts processed
|
||||
if self._output_response_open:
|
||||
if self._close_task and not self._close_task.done():
|
||||
await self.cancel_task(self._close_task)
|
||||
self._output_response_open = False
|
||||
await self.push_frame(LLMFullResponseEndFrame())
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
@@ -734,7 +734,7 @@ class AWSBedrockLLMService(LLMService):
|
||||
aws_access_key: Optional[str] = None,
|
||||
aws_secret_key: Optional[str] = None,
|
||||
aws_session_token: Optional[str] = None,
|
||||
aws_region: str = "us-east-1",
|
||||
aws_region: Optional[str] = None,
|
||||
params: Optional[InputParams] = None,
|
||||
client_config: Optional[Config] = None,
|
||||
retry_timeout_secs: Optional[float] = 5.0,
|
||||
@@ -840,15 +840,13 @@ class AWSBedrockLLMService(LLMService):
|
||||
messages = context.messages
|
||||
system = getattr(context, "system", None) # [{"text": "system message"}]
|
||||
|
||||
# Determine if we're using Claude or Nova based on model ID
|
||||
model_id = self.model_name
|
||||
|
||||
# Prepare request parameters
|
||||
# Prepare request parameters using the same method as streaming
|
||||
inference_config = self._build_inference_config()
|
||||
|
||||
request_params = {
|
||||
"modelId": model_id,
|
||||
"modelId": self.model_name,
|
||||
"messages": messages,
|
||||
"additionalModelRequestFields": self._settings["additional_model_request_fields"],
|
||||
}
|
||||
|
||||
if inference_config:
|
||||
@@ -1078,9 +1076,7 @@ class AWSBedrockLLMService(LLMService):
|
||||
if "contentBlockDelta" in event:
|
||||
delta = event["contentBlockDelta"]["delta"]
|
||||
if "text" in delta:
|
||||
frame = LLMTextFrame(delta["text"])
|
||||
frame.includes_inter_frame_spaces = True
|
||||
await self.push_frame(frame)
|
||||
await self.push_frame(LLMTextFrame(delta["text"]))
|
||||
completion_tokens_estimate += self._estimate_tokens(delta["text"])
|
||||
elif "toolUse" in delta and "input" in delta["toolUse"]:
|
||||
# Handle partial JSON for tool use
|
||||
@@ -1138,7 +1134,7 @@ class AWSBedrockLLMService(LLMService):
|
||||
except (ReadTimeoutError, asyncio.TimeoutError):
|
||||
await self._call_event_handler("on_completion_timeout")
|
||||
except Exception as e:
|
||||
logger.exception(f"{self} exception: {e}")
|
||||
await self.push_error(error_msg=f"Unknown error occurred: {e}", exception=e)
|
||||
finally:
|
||||
await self.stop_processing_metrics()
|
||||
await self.push_frame(LLMFullResponseEndFrame())
|
||||
|
||||
@@ -27,6 +27,7 @@ from pydantic import BaseModel, Field
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.adapters.services.aws_nova_sonic_adapter import AWSNovaSonicLLMAdapter, Role
|
||||
from pipecat.frames.frames import (
|
||||
AggregationType,
|
||||
BotStoppedSpeakingFrame,
|
||||
CancelFrame,
|
||||
EndFrame,
|
||||
@@ -156,6 +157,12 @@ class Params(BaseModel):
|
||||
max_tokens: Maximum number of tokens to generate.
|
||||
top_p: Nucleus sampling parameter.
|
||||
temperature: Sampling temperature for text generation.
|
||||
endpointing_sensitivity: Controls how quickly Nova Sonic decides the
|
||||
user has stopped speaking. Can be "LOW", "MEDIUM", or "HIGH", with
|
||||
"HIGH" being the most sensitive (i.e., causing the model to respond
|
||||
most quickly).
|
||||
If not set, uses the model's default behavior.
|
||||
Only supported with Nova 2 Sonic (the default model).
|
||||
"""
|
||||
|
||||
# Audio input
|
||||
@@ -173,6 +180,9 @@ class Params(BaseModel):
|
||||
top_p: Optional[float] = Field(default=0.9)
|
||||
temperature: Optional[float] = Field(default=0.7)
|
||||
|
||||
# Turn-taking
|
||||
endpointing_sensitivity: Optional[str] = Field(default=None)
|
||||
|
||||
|
||||
class AWSNovaSonicLLMService(LLMService):
|
||||
"""AWS Nova Sonic speech-to-speech LLM service.
|
||||
@@ -191,8 +201,8 @@ class AWSNovaSonicLLMService(LLMService):
|
||||
access_key_id: str,
|
||||
session_token: Optional[str] = None,
|
||||
region: str,
|
||||
model: str = "amazon.nova-sonic-v1:0",
|
||||
voice_id: str = "matthew", # matthew, tiffany, amy
|
||||
model: str = "amazon.nova-2-sonic-v1:0",
|
||||
voice_id: str = "matthew",
|
||||
params: Optional[Params] = None,
|
||||
system_instruction: Optional[str] = None,
|
||||
tools: Optional[ToolsSchema] = None,
|
||||
@@ -206,8 +216,15 @@ class AWSNovaSonicLLMService(LLMService):
|
||||
access_key_id: AWS access key ID for authentication.
|
||||
session_token: AWS session token for authentication.
|
||||
region: AWS region where the service is hosted.
|
||||
model: Model identifier. Defaults to "amazon.nova-sonic-v1:0".
|
||||
voice_id: Voice ID for speech synthesis. Options: matthew, tiffany, amy.
|
||||
Supported regions:
|
||||
- Nova 2 Sonic (the default model): "us-east-1", "us-west-2", "ap-northeast-1"
|
||||
- Nova Sonic (the older model): "us-east-1", "ap-northeast-1"
|
||||
model: Model identifier. Defaults to "amazon.nova-2-sonic-v1:0".
|
||||
voice_id: Voice ID for speech synthesis.
|
||||
Note that some voices are designed for use with a specific language.
|
||||
Options:
|
||||
- Nova 2 Sonic (the default model): see https://docs.aws.amazon.com/nova/latest/nova2-userguide/sonic-language-support.html
|
||||
- Nova Sonic (the older model): see https://docs.aws.amazon.com/nova/latest/userguide/available-voices.html.
|
||||
params: Model parameters for audio configuration and inference.
|
||||
system_instruction: System-level instruction for the model.
|
||||
tools: Available tools/functions for the model to use.
|
||||
@@ -231,6 +248,17 @@ class AWSNovaSonicLLMService(LLMService):
|
||||
self._system_instruction = system_instruction
|
||||
self._tools = tools
|
||||
|
||||
# Validate endpointing_sensitivity parameter
|
||||
if (
|
||||
self._params.endpointing_sensitivity
|
||||
and not self._is_endpointing_sensitivity_supported()
|
||||
):
|
||||
logger.warning(
|
||||
f"endpointing_sensitivity is not supported for model '{model}' and will be ignored. "
|
||||
"This parameter is only supported starting with Nova 2 Sonic (amazon.nova-2-sonic-v1:0)."
|
||||
)
|
||||
self._params.endpointing_sensitivity = None
|
||||
|
||||
if not send_transcription_frames:
|
||||
import warnings
|
||||
|
||||
@@ -452,13 +480,13 @@ class AWSNovaSonicLLMService(LLMService):
|
||||
self._ready_to_send_context = True
|
||||
await self._finish_connecting_if_context_available()
|
||||
except Exception as e:
|
||||
logger.error(f"{self} initialization error: {e}")
|
||||
await self.push_error(error_msg=f"Initialization error: {e}", exception=e)
|
||||
await self._disconnect()
|
||||
|
||||
async def _process_completed_function_calls(self, send_new_results: bool):
|
||||
# Check for set of completed function calls in the context
|
||||
for message in self._context.get_messages():
|
||||
if message.get("role") and message.get("content") != "IN_PROGRESS":
|
||||
if message.get("role") and message.get("content") not in ["IN_PROGRESS", "CANCELLED"]:
|
||||
tool_call_id = message.get("tool_call_id")
|
||||
if tool_call_id and tool_call_id not in self._completed_tool_calls:
|
||||
# Found a newly-completed function call - send the result to the service
|
||||
@@ -576,7 +604,7 @@ class AWSNovaSonicLLMService(LLMService):
|
||||
|
||||
logger.info("Finished disconnecting")
|
||||
except Exception as e:
|
||||
logger.error(f"{self} error disconnecting: {e}")
|
||||
await self.push_error(error_msg=f"Error disconnecting: {e}", exception=e)
|
||||
|
||||
def _create_client(self) -> BedrockRuntimeClient:
|
||||
config = Config(
|
||||
@@ -590,11 +618,33 @@ class AWSNovaSonicLLMService(LLMService):
|
||||
)
|
||||
return BedrockRuntimeClient(config=config)
|
||||
|
||||
def _is_first_generation_sonic_model(self) -> bool:
|
||||
# Nova Sonic (the older model) is identified by "amazon.nova-sonic-v1:0"
|
||||
return self._model == "amazon.nova-sonic-v1:0"
|
||||
|
||||
def _is_endpointing_sensitivity_supported(self) -> bool:
|
||||
# endpointing_sensitivity is only supported with Nova 2 Sonic (and,
|
||||
# presumably, future models)
|
||||
return not self._is_first_generation_sonic_model()
|
||||
|
||||
def _is_assistant_response_trigger_needed(self) -> bool:
|
||||
# Assistant response trigger audio is only needed with the older model
|
||||
return self._is_first_generation_sonic_model()
|
||||
|
||||
#
|
||||
# LLM communication: input events (pipecat -> LLM)
|
||||
#
|
||||
|
||||
async def _send_session_start_event(self):
|
||||
turn_detection_config = (
|
||||
f""",
|
||||
"turnDetectionConfiguration": {{
|
||||
"endpointingSensitivity": "{self._params.endpointing_sensitivity}"
|
||||
}}"""
|
||||
if self._params.endpointing_sensitivity
|
||||
else ""
|
||||
)
|
||||
|
||||
session_start = f"""
|
||||
{{
|
||||
"event": {{
|
||||
@@ -603,7 +653,7 @@ class AWSNovaSonicLLMService(LLMService):
|
||||
"maxTokens": {self._params.max_tokens},
|
||||
"topP": {self._params.top_p},
|
||||
"temperature": {self._params.temperature}
|
||||
}}
|
||||
}}{turn_detection_config}
|
||||
}}
|
||||
}}
|
||||
}}
|
||||
@@ -884,7 +934,7 @@ class AWSNovaSonicLLMService(LLMService):
|
||||
# Errors are kind of expected while disconnecting, so just
|
||||
# ignore them and do nothing
|
||||
return
|
||||
logger.error(f"{self} error processing responses: {e}")
|
||||
await self.push_error(error_msg=f"Error processing responses: {e}", exception=e)
|
||||
if self._wants_connection:
|
||||
await self.reset_conversation()
|
||||
|
||||
@@ -1027,7 +1077,7 @@ class AWSNovaSonicLLMService(LLMService):
|
||||
logger.debug(f"Assistant response text added: {text}")
|
||||
|
||||
# Report the text of the assistant response.
|
||||
frame = TTSTextFrame(text)
|
||||
frame = TTSTextFrame(text, aggregated_by=AggregationType.SENTENCE)
|
||||
frame.includes_inter_frame_spaces = True
|
||||
await self.push_frame(frame)
|
||||
|
||||
@@ -1062,7 +1112,9 @@ class AWSNovaSonicLLMService(LLMService):
|
||||
# TTSTextFrame would be ignored otherwise (the interruption frame
|
||||
# would have cleared the assistant aggregator state).
|
||||
await self.push_frame(LLMFullResponseStartFrame())
|
||||
frame = TTSTextFrame(self._assistant_text_buffer)
|
||||
frame = TTSTextFrame(
|
||||
self._assistant_text_buffer, aggregated_by=AggregationType.SENTENCE
|
||||
)
|
||||
frame.includes_inter_frame_spaces = True
|
||||
await self.push_frame(frame)
|
||||
self._may_need_repush_assistant_text = False
|
||||
@@ -1186,7 +1238,8 @@ class AWSNovaSonicLLMService(LLMService):
|
||||
)
|
||||
|
||||
#
|
||||
# assistant response trigger (HACK)
|
||||
# assistant response trigger
|
||||
# HACK: only needed for the older Nova Sonic (as opposed to Nova 2 Sonic) model
|
||||
#
|
||||
|
||||
# Class variable
|
||||
@@ -1200,12 +1253,17 @@ class AWSNovaSonicLLMService(LLMService):
|
||||
|
||||
Sends a pre-recorded "ready" audio trigger to prompt the assistant
|
||||
to start speaking. This is useful for controlling conversation flow.
|
||||
|
||||
Returns:
|
||||
False if already triggering a response, True otherwise.
|
||||
"""
|
||||
if not self._is_assistant_response_trigger_needed():
|
||||
logger.warning(
|
||||
f"Assistant response trigger not needed for model '{self._model}'; skipping. "
|
||||
"An LLMRunFrame() should be sufficient to prompt the assistant to respond, "
|
||||
"assuming the context ends in a user message."
|
||||
)
|
||||
return
|
||||
|
||||
if self._triggering_assistant_response:
|
||||
return False
|
||||
return
|
||||
|
||||
self._triggering_assistant_response = True
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user